diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,92547 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999621656388332, + "eval_steps": 500, + "global_step": 13215, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 7.56687223336234e-05, + "grad_norm": 272.8875732421875, + "learning_rate": 2.7231467473524962e-08, + "loss": 2.6021, + "step": 1 + }, + { + "epoch": 0.0001513374446672468, + "grad_norm": 513.3167114257812, + "learning_rate": 5.4462934947049924e-08, + "loss": 2.8401, + "step": 2 + }, + { + "epoch": 0.0002270061670008702, + "grad_norm": 390.2225036621094, + "learning_rate": 8.169440242057489e-08, + "loss": 2.7988, + "step": 3 + }, + { + "epoch": 0.0003026748893344936, + "grad_norm": 272.265625, + "learning_rate": 1.0892586989409985e-07, + "loss": 2.912, + "step": 4 + }, + { + "epoch": 0.000378343611668117, + "grad_norm": 306.4111633300781, + "learning_rate": 1.3615733736762482e-07, + "loss": 2.8548, + "step": 5 + }, + { + "epoch": 0.0004540123340017404, + "grad_norm": 251.80809020996094, + "learning_rate": 1.6338880484114979e-07, + "loss": 2.885, + "step": 6 + }, + { + "epoch": 0.0005296810563353637, + "grad_norm": 884.6312866210938, + "learning_rate": 1.9062027231467473e-07, + "loss": 2.9442, + "step": 7 + }, + { + "epoch": 0.0006053497786689872, + "grad_norm": 391.5308837890625, + "learning_rate": 2.178517397881997e-07, + "loss": 3.0565, + "step": 8 + }, + { + "epoch": 0.0006810185010026105, + "grad_norm": 295.6712646484375, + "learning_rate": 2.4508320726172467e-07, + "loss": 2.9165, + "step": 9 + }, + { + "epoch": 0.000756687223336234, + "grad_norm": 733.3089599609375, + "learning_rate": 2.7231467473524963e-07, + "loss": 2.7356, + "step": 10 + }, + { + "epoch": 0.0008323559456698573, + "grad_norm": 363.34002685546875, + "learning_rate": 2.995461422087746e-07, + "loss": 2.6017, + "step": 11 + }, + { + "epoch": 0.0009080246680034808, + "grad_norm": 256.9432373046875, + "learning_rate": 3.2677760968229957e-07, + "loss": 2.4895, + "step": 12 + }, + { + "epoch": 0.0009836933903371041, + "grad_norm": 314.6168212890625, + "learning_rate": 3.5400907715582454e-07, + "loss": 2.8149, + "step": 13 + }, + { + "epoch": 0.0010593621126707275, + "grad_norm": 266.22314453125, + "learning_rate": 3.8124054462934946e-07, + "loss": 2.6049, + "step": 14 + }, + { + "epoch": 0.001135030835004351, + "grad_norm": 263.47576904296875, + "learning_rate": 4.084720121028744e-07, + "loss": 2.5644, + "step": 15 + }, + { + "epoch": 0.0012106995573379744, + "grad_norm": 223.0679473876953, + "learning_rate": 4.357034795763994e-07, + "loss": 2.6094, + "step": 16 + }, + { + "epoch": 0.0012863682796715977, + "grad_norm": 248.41636657714844, + "learning_rate": 4.629349470499244e-07, + "loss": 2.4842, + "step": 17 + }, + { + "epoch": 0.001362037002005221, + "grad_norm": 826.5869750976562, + "learning_rate": 4.901664145234493e-07, + "loss": 2.6023, + "step": 18 + }, + { + "epoch": 0.0014377057243388446, + "grad_norm": 265.9122009277344, + "learning_rate": 5.173978819969742e-07, + "loss": 2.4341, + "step": 19 + }, + { + "epoch": 0.001513374446672468, + "grad_norm": 190.68650817871094, + "learning_rate": 5.446293494704993e-07, + "loss": 2.3036, + "step": 20 + }, + { + "epoch": 0.0015890431690060913, + "grad_norm": 184.1608123779297, + "learning_rate": 5.718608169440242e-07, + "loss": 2.3596, + "step": 21 + }, + { + "epoch": 
0.0016647118913397146, + "grad_norm": 425.2568054199219, + "learning_rate": 5.990922844175492e-07, + "loss": 2.4074, + "step": 22 + }, + { + "epoch": 0.0017403806136733382, + "grad_norm": 203.991455078125, + "learning_rate": 6.263237518910741e-07, + "loss": 2.2616, + "step": 23 + }, + { + "epoch": 0.0018160493360069615, + "grad_norm": 135.3365020751953, + "learning_rate": 6.535552193645991e-07, + "loss": 2.1286, + "step": 24 + }, + { + "epoch": 0.0018917180583405849, + "grad_norm": 136.6835174560547, + "learning_rate": 6.807866868381241e-07, + "loss": 2.404, + "step": 25 + }, + { + "epoch": 0.0019673867806742082, + "grad_norm": 105.73369598388672, + "learning_rate": 7.080181543116491e-07, + "loss": 2.2063, + "step": 26 + }, + { + "epoch": 0.002043055503007832, + "grad_norm": 86.31427001953125, + "learning_rate": 7.352496217851739e-07, + "loss": 1.9829, + "step": 27 + }, + { + "epoch": 0.002118724225341455, + "grad_norm": 124.91061401367188, + "learning_rate": 7.624810892586989e-07, + "loss": 1.7351, + "step": 28 + }, + { + "epoch": 0.0021943929476750785, + "grad_norm": 48.06635284423828, + "learning_rate": 7.897125567322239e-07, + "loss": 2.0282, + "step": 29 + }, + { + "epoch": 0.002270061670008702, + "grad_norm": 85.16302490234375, + "learning_rate": 8.169440242057488e-07, + "loss": 1.6746, + "step": 30 + }, + { + "epoch": 0.002345730392342325, + "grad_norm": 54.819854736328125, + "learning_rate": 8.441754916792739e-07, + "loss": 1.9421, + "step": 31 + }, + { + "epoch": 0.0024213991146759487, + "grad_norm": 44.45909881591797, + "learning_rate": 8.714069591527988e-07, + "loss": 1.6696, + "step": 32 + }, + { + "epoch": 0.0024970678370095723, + "grad_norm": 46.482994079589844, + "learning_rate": 8.986384266263238e-07, + "loss": 1.7952, + "step": 33 + }, + { + "epoch": 0.0025727365593431954, + "grad_norm": 117.33851623535156, + "learning_rate": 9.258698940998488e-07, + "loss": 1.6118, + "step": 34 + }, + { + "epoch": 0.002648405281676819, + "grad_norm": 40.68963623046875, + "learning_rate": 9.531013615733736e-07, + "loss": 2.1438, + "step": 35 + }, + { + "epoch": 0.002724074004010442, + "grad_norm": 50.864559173583984, + "learning_rate": 9.803328290468987e-07, + "loss": 1.9518, + "step": 36 + }, + { + "epoch": 0.0027997427263440657, + "grad_norm": 55.14411544799805, + "learning_rate": 1.0075642965204236e-06, + "loss": 1.8224, + "step": 37 + }, + { + "epoch": 0.002875411448677689, + "grad_norm": 112.16168975830078, + "learning_rate": 1.0347957639939485e-06, + "loss": 1.5419, + "step": 38 + }, + { + "epoch": 0.0029510801710113123, + "grad_norm": 37.99906539916992, + "learning_rate": 1.0620272314674736e-06, + "loss": 1.5378, + "step": 39 + }, + { + "epoch": 0.003026748893344936, + "grad_norm": 85.35050964355469, + "learning_rate": 1.0892586989409985e-06, + "loss": 1.6139, + "step": 40 + }, + { + "epoch": 0.0031024176156785595, + "grad_norm": 50.46384048461914, + "learning_rate": 1.1164901664145235e-06, + "loss": 1.6364, + "step": 41 + }, + { + "epoch": 0.0031780863380121826, + "grad_norm": 74.24533081054688, + "learning_rate": 1.1437216338880484e-06, + "loss": 1.8635, + "step": 42 + }, + { + "epoch": 0.003253755060345806, + "grad_norm": 75.34041595458984, + "learning_rate": 1.1709531013615733e-06, + "loss": 1.6906, + "step": 43 + }, + { + "epoch": 0.0033294237826794293, + "grad_norm": 63.062156677246094, + "learning_rate": 1.1981845688350984e-06, + "loss": 1.5042, + "step": 44 + }, + { + "epoch": 0.003405092505013053, + "grad_norm": 80.37957763671875, + "learning_rate": 1.2254160363086233e-06, 
+ "loss": 1.5012, + "step": 45 + }, + { + "epoch": 0.0034807612273466764, + "grad_norm": 35.123077392578125, + "learning_rate": 1.2526475037821482e-06, + "loss": 1.3791, + "step": 46 + }, + { + "epoch": 0.0035564299496802995, + "grad_norm": 64.57160949707031, + "learning_rate": 1.2798789712556734e-06, + "loss": 1.5318, + "step": 47 + }, + { + "epoch": 0.003632098672013923, + "grad_norm": 44.77900314331055, + "learning_rate": 1.3071104387291983e-06, + "loss": 1.4773, + "step": 48 + }, + { + "epoch": 0.0037077673943475466, + "grad_norm": 29.615036010742188, + "learning_rate": 1.3343419062027232e-06, + "loss": 1.3712, + "step": 49 + }, + { + "epoch": 0.0037834361166811698, + "grad_norm": 45.1904411315918, + "learning_rate": 1.3615733736762481e-06, + "loss": 1.539, + "step": 50 + }, + { + "epoch": 0.0038591048390147933, + "grad_norm": 26.724008560180664, + "learning_rate": 1.3888048411497732e-06, + "loss": 1.2884, + "step": 51 + }, + { + "epoch": 0.0039347735613484165, + "grad_norm": 26.483842849731445, + "learning_rate": 1.4160363086232982e-06, + "loss": 1.2467, + "step": 52 + }, + { + "epoch": 0.00401044228368204, + "grad_norm": 25.459138870239258, + "learning_rate": 1.443267776096823e-06, + "loss": 1.376, + "step": 53 + }, + { + "epoch": 0.004086111006015664, + "grad_norm": 612.6622314453125, + "learning_rate": 1.4704992435703478e-06, + "loss": 1.3352, + "step": 54 + }, + { + "epoch": 0.004161779728349287, + "grad_norm": 12.120221138000488, + "learning_rate": 1.497730711043873e-06, + "loss": 1.1906, + "step": 55 + }, + { + "epoch": 0.00423744845068291, + "grad_norm": 49.24264144897461, + "learning_rate": 1.5249621785173978e-06, + "loss": 1.23, + "step": 56 + }, + { + "epoch": 0.004313117173016533, + "grad_norm": 22.598411560058594, + "learning_rate": 1.5521936459909227e-06, + "loss": 1.2773, + "step": 57 + }, + { + "epoch": 0.004388785895350157, + "grad_norm": 22.006301879882812, + "learning_rate": 1.5794251134644479e-06, + "loss": 1.381, + "step": 58 + }, + { + "epoch": 0.0044644546176837805, + "grad_norm": 20.625085830688477, + "learning_rate": 1.6066565809379728e-06, + "loss": 1.3473, + "step": 59 + }, + { + "epoch": 0.004540123340017404, + "grad_norm": 29.52547264099121, + "learning_rate": 1.6338880484114977e-06, + "loss": 1.3312, + "step": 60 + }, + { + "epoch": 0.004615792062351028, + "grad_norm": 13.293269157409668, + "learning_rate": 1.6611195158850228e-06, + "loss": 1.0988, + "step": 61 + }, + { + "epoch": 0.00469146078468465, + "grad_norm": 9.618423461914062, + "learning_rate": 1.6883509833585477e-06, + "loss": 1.1596, + "step": 62 + }, + { + "epoch": 0.004767129507018274, + "grad_norm": 9.537832260131836, + "learning_rate": 1.7155824508320727e-06, + "loss": 1.1767, + "step": 63 + }, + { + "epoch": 0.0048427982293518974, + "grad_norm": 17.054410934448242, + "learning_rate": 1.7428139183055976e-06, + "loss": 1.2305, + "step": 64 + }, + { + "epoch": 0.004918466951685521, + "grad_norm": 23.415481567382812, + "learning_rate": 1.7700453857791227e-06, + "loss": 1.1254, + "step": 65 + }, + { + "epoch": 0.0049941356740191446, + "grad_norm": 6.73915958404541, + "learning_rate": 1.7972768532526476e-06, + "loss": 1.049, + "step": 66 + }, + { + "epoch": 0.005069804396352767, + "grad_norm": 21.562658309936523, + "learning_rate": 1.8245083207261725e-06, + "loss": 1.1231, + "step": 67 + }, + { + "epoch": 0.005145473118686391, + "grad_norm": 16.345674514770508, + "learning_rate": 1.8517397881996977e-06, + "loss": 1.0867, + "step": 68 + }, + { + "epoch": 0.005221141841020014, + "grad_norm": 
10.502206802368164, + "learning_rate": 1.8789712556732226e-06, + "loss": 1.3118, + "step": 69 + }, + { + "epoch": 0.005296810563353638, + "grad_norm": 8.155364036560059, + "learning_rate": 1.9062027231467473e-06, + "loss": 1.2176, + "step": 70 + }, + { + "epoch": 0.0053724792856872615, + "grad_norm": 18.021318435668945, + "learning_rate": 1.933434190620272e-06, + "loss": 1.1159, + "step": 71 + }, + { + "epoch": 0.005448148008020884, + "grad_norm": 10.645997047424316, + "learning_rate": 1.9606656580937973e-06, + "loss": 1.3196, + "step": 72 + }, + { + "epoch": 0.005523816730354508, + "grad_norm": 12.845873832702637, + "learning_rate": 1.987897125567322e-06, + "loss": 1.1641, + "step": 73 + }, + { + "epoch": 0.005599485452688131, + "grad_norm": 7.4295148849487305, + "learning_rate": 2.015128593040847e-06, + "loss": 1.2317, + "step": 74 + }, + { + "epoch": 0.005675154175021755, + "grad_norm": 12.633703231811523, + "learning_rate": 2.0423600605143723e-06, + "loss": 1.207, + "step": 75 + }, + { + "epoch": 0.005750822897355378, + "grad_norm": 12.621756553649902, + "learning_rate": 2.069591527987897e-06, + "loss": 1.2415, + "step": 76 + }, + { + "epoch": 0.005826491619689002, + "grad_norm": 7.364112377166748, + "learning_rate": 2.096822995461422e-06, + "loss": 1.2141, + "step": 77 + }, + { + "epoch": 0.005902160342022625, + "grad_norm": 7.511165142059326, + "learning_rate": 2.1240544629349472e-06, + "loss": 1.0121, + "step": 78 + }, + { + "epoch": 0.005977829064356248, + "grad_norm": 14.060636520385742, + "learning_rate": 2.151285930408472e-06, + "loss": 1.1539, + "step": 79 + }, + { + "epoch": 0.006053497786689872, + "grad_norm": 49.88886642456055, + "learning_rate": 2.178517397881997e-06, + "loss": 1.0104, + "step": 80 + }, + { + "epoch": 0.006129166509023495, + "grad_norm": 6.6888017654418945, + "learning_rate": 2.205748865355522e-06, + "loss": 0.9701, + "step": 81 + }, + { + "epoch": 0.006204835231357119, + "grad_norm": 7.409695148468018, + "learning_rate": 2.232980332829047e-06, + "loss": 1.0943, + "step": 82 + }, + { + "epoch": 0.006280503953690742, + "grad_norm": 9.27059268951416, + "learning_rate": 2.260211800302572e-06, + "loss": 1.1012, + "step": 83 + }, + { + "epoch": 0.006356172676024365, + "grad_norm": 8.880078315734863, + "learning_rate": 2.2874432677760967e-06, + "loss": 1.0485, + "step": 84 + }, + { + "epoch": 0.006431841398357989, + "grad_norm": 8.912999153137207, + "learning_rate": 2.314674735249622e-06, + "loss": 1.0787, + "step": 85 + }, + { + "epoch": 0.006507510120691612, + "grad_norm": 6.5734028816223145, + "learning_rate": 2.3419062027231466e-06, + "loss": 1.1154, + "step": 86 + }, + { + "epoch": 0.006583178843025236, + "grad_norm": 6.327483654022217, + "learning_rate": 2.369137670196672e-06, + "loss": 1.1694, + "step": 87 + }, + { + "epoch": 0.0066588475653588585, + "grad_norm": 12.942300796508789, + "learning_rate": 2.396369137670197e-06, + "loss": 0.9813, + "step": 88 + }, + { + "epoch": 0.006734516287692482, + "grad_norm": 10.303013801574707, + "learning_rate": 2.423600605143722e-06, + "loss": 1.0833, + "step": 89 + }, + { + "epoch": 0.006810185010026106, + "grad_norm": 7.25071907043457, + "learning_rate": 2.4508320726172467e-06, + "loss": 1.1512, + "step": 90 + }, + { + "epoch": 0.006885853732359729, + "grad_norm": 4.276927471160889, + "learning_rate": 2.4780635400907718e-06, + "loss": 1.1131, + "step": 91 + }, + { + "epoch": 0.006961522454693353, + "grad_norm": 11.979231834411621, + "learning_rate": 2.5052950075642965e-06, + "loss": 1.1562, + "step": 92 + }, + { + 
"epoch": 0.0070371911770269755, + "grad_norm": 6.277972221374512, + "learning_rate": 2.532526475037821e-06, + "loss": 1.1771, + "step": 93 + }, + { + "epoch": 0.007112859899360599, + "grad_norm": 7.063577175140381, + "learning_rate": 2.5597579425113467e-06, + "loss": 0.9926, + "step": 94 + }, + { + "epoch": 0.007188528621694223, + "grad_norm": 8.788790702819824, + "learning_rate": 2.5869894099848714e-06, + "loss": 0.9004, + "step": 95 + }, + { + "epoch": 0.007264197344027846, + "grad_norm": 7.295443058013916, + "learning_rate": 2.6142208774583966e-06, + "loss": 1.1524, + "step": 96 + }, + { + "epoch": 0.00733986606636147, + "grad_norm": 9.583985328674316, + "learning_rate": 2.6414523449319213e-06, + "loss": 1.1887, + "step": 97 + }, + { + "epoch": 0.007415534788695093, + "grad_norm": 7.807141304016113, + "learning_rate": 2.6686838124054464e-06, + "loss": 1.0794, + "step": 98 + }, + { + "epoch": 0.007491203511028716, + "grad_norm": 9.609314918518066, + "learning_rate": 2.695915279878971e-06, + "loss": 1.0667, + "step": 99 + }, + { + "epoch": 0.0075668722333623395, + "grad_norm": 7.190148830413818, + "learning_rate": 2.7231467473524962e-06, + "loss": 1.1177, + "step": 100 + }, + { + "epoch": 0.007642540955695963, + "grad_norm": 5.9306206703186035, + "learning_rate": 2.750378214826021e-06, + "loss": 1.0946, + "step": 101 + }, + { + "epoch": 0.007718209678029587, + "grad_norm": 6.257357597351074, + "learning_rate": 2.7776096822995465e-06, + "loss": 1.1852, + "step": 102 + }, + { + "epoch": 0.00779387840036321, + "grad_norm": 4.176384449005127, + "learning_rate": 2.804841149773071e-06, + "loss": 1.1085, + "step": 103 + }, + { + "epoch": 0.007869547122696833, + "grad_norm": 5.795867919921875, + "learning_rate": 2.8320726172465963e-06, + "loss": 1.064, + "step": 104 + }, + { + "epoch": 0.007945215845030457, + "grad_norm": 87.45132446289062, + "learning_rate": 2.859304084720121e-06, + "loss": 1.0562, + "step": 105 + }, + { + "epoch": 0.00802088456736408, + "grad_norm": 5.800858020782471, + "learning_rate": 2.886535552193646e-06, + "loss": 1.213, + "step": 106 + }, + { + "epoch": 0.008096553289697703, + "grad_norm": 6.449529647827148, + "learning_rate": 2.913767019667171e-06, + "loss": 1.2756, + "step": 107 + }, + { + "epoch": 0.008172222012031327, + "grad_norm": 15.208488464355469, + "learning_rate": 2.9409984871406956e-06, + "loss": 1.1593, + "step": 108 + }, + { + "epoch": 0.00824789073436495, + "grad_norm": 6.07266902923584, + "learning_rate": 2.968229954614221e-06, + "loss": 0.9689, + "step": 109 + }, + { + "epoch": 0.008323559456698574, + "grad_norm": 5.045470714569092, + "learning_rate": 2.995461422087746e-06, + "loss": 1.088, + "step": 110 + }, + { + "epoch": 0.008399228179032197, + "grad_norm": 8.590188980102539, + "learning_rate": 3.022692889561271e-06, + "loss": 1.0692, + "step": 111 + }, + { + "epoch": 0.00847489690136582, + "grad_norm": 9.020132064819336, + "learning_rate": 3.0499243570347956e-06, + "loss": 1.0444, + "step": 112 + }, + { + "epoch": 0.008550565623699444, + "grad_norm": 5.601812362670898, + "learning_rate": 3.0771558245083208e-06, + "loss": 1.0266, + "step": 113 + }, + { + "epoch": 0.008626234346033067, + "grad_norm": 3.23660945892334, + "learning_rate": 3.1043872919818455e-06, + "loss": 1.0381, + "step": 114 + }, + { + "epoch": 0.008701903068366691, + "grad_norm": 4.900957107543945, + "learning_rate": 3.131618759455371e-06, + "loss": 1.1345, + "step": 115 + }, + { + "epoch": 0.008777571790700314, + "grad_norm": 7.415119171142578, + "learning_rate": 
3.1588502269288957e-06, + "loss": 0.9433, + "step": 116 + }, + { + "epoch": 0.008853240513033937, + "grad_norm": 4.038257122039795, + "learning_rate": 3.186081694402421e-06, + "loss": 1.1984, + "step": 117 + }, + { + "epoch": 0.008928909235367561, + "grad_norm": 4.441867828369141, + "learning_rate": 3.2133131618759456e-06, + "loss": 1.0773, + "step": 118 + }, + { + "epoch": 0.009004577957701184, + "grad_norm": 5.363611698150635, + "learning_rate": 3.2405446293494707e-06, + "loss": 1.1884, + "step": 119 + }, + { + "epoch": 0.009080246680034808, + "grad_norm": 6.429077625274658, + "learning_rate": 3.2677760968229954e-06, + "loss": 1.0864, + "step": 120 + }, + { + "epoch": 0.00915591540236843, + "grad_norm": 4.815485000610352, + "learning_rate": 3.2950075642965205e-06, + "loss": 1.0554, + "step": 121 + }, + { + "epoch": 0.009231584124702055, + "grad_norm": 11.878281593322754, + "learning_rate": 3.3222390317700457e-06, + "loss": 1.2129, + "step": 122 + }, + { + "epoch": 0.009307252847035678, + "grad_norm": 6.188513278961182, + "learning_rate": 3.3494704992435704e-06, + "loss": 1.0672, + "step": 123 + }, + { + "epoch": 0.0093829215693693, + "grad_norm": 4.279908657073975, + "learning_rate": 3.3767019667170955e-06, + "loss": 1.1063, + "step": 124 + }, + { + "epoch": 0.009458590291702925, + "grad_norm": 8.566632270812988, + "learning_rate": 3.40393343419062e-06, + "loss": 0.9051, + "step": 125 + }, + { + "epoch": 0.009534259014036548, + "grad_norm": 10.101946830749512, + "learning_rate": 3.4311649016641453e-06, + "loss": 1.1801, + "step": 126 + }, + { + "epoch": 0.009609927736370172, + "grad_norm": 3.696890115737915, + "learning_rate": 3.45839636913767e-06, + "loss": 1.1499, + "step": 127 + }, + { + "epoch": 0.009685596458703795, + "grad_norm": 4.319789409637451, + "learning_rate": 3.485627836611195e-06, + "loss": 1.1783, + "step": 128 + }, + { + "epoch": 0.009761265181037418, + "grad_norm": 3.6111180782318115, + "learning_rate": 3.51285930408472e-06, + "loss": 1.2035, + "step": 129 + }, + { + "epoch": 0.009836933903371042, + "grad_norm": 6.946579933166504, + "learning_rate": 3.5400907715582454e-06, + "loss": 1.429, + "step": 130 + }, + { + "epoch": 0.009912602625704665, + "grad_norm": 4.380500316619873, + "learning_rate": 3.56732223903177e-06, + "loss": 0.9982, + "step": 131 + }, + { + "epoch": 0.009988271348038289, + "grad_norm": 110.80493927001953, + "learning_rate": 3.5945537065052952e-06, + "loss": 1.053, + "step": 132 + }, + { + "epoch": 0.010063940070371912, + "grad_norm": 4.770391941070557, + "learning_rate": 3.62178517397882e-06, + "loss": 0.9609, + "step": 133 + }, + { + "epoch": 0.010139608792705535, + "grad_norm": 4.621954441070557, + "learning_rate": 3.649016641452345e-06, + "loss": 1.0428, + "step": 134 + }, + { + "epoch": 0.010215277515039159, + "grad_norm": 4.873030185699463, + "learning_rate": 3.6762481089258698e-06, + "loss": 1.0355, + "step": 135 + }, + { + "epoch": 0.010290946237372782, + "grad_norm": 5.315187931060791, + "learning_rate": 3.7034795763993953e-06, + "loss": 1.2877, + "step": 136 + }, + { + "epoch": 0.010366614959706406, + "grad_norm": 3.4662725925445557, + "learning_rate": 3.73071104387292e-06, + "loss": 0.9696, + "step": 137 + }, + { + "epoch": 0.010442283682040029, + "grad_norm": 6.4335784912109375, + "learning_rate": 3.757942511346445e-06, + "loss": 1.0649, + "step": 138 + }, + { + "epoch": 0.010517952404373651, + "grad_norm": 5.575058937072754, + "learning_rate": 3.78517397881997e-06, + "loss": 1.2124, + "step": 139 + }, + { + "epoch": 0.010593621126707276, 
+ "grad_norm": 4.741557598114014, + "learning_rate": 3.8124054462934946e-06, + "loss": 1.0165, + "step": 140 + }, + { + "epoch": 0.010669289849040899, + "grad_norm": 6.094121932983398, + "learning_rate": 3.83963691376702e-06, + "loss": 1.1381, + "step": 141 + }, + { + "epoch": 0.010744958571374523, + "grad_norm": 3.2143642902374268, + "learning_rate": 3.866868381240544e-06, + "loss": 1.1538, + "step": 142 + }, + { + "epoch": 0.010820627293708146, + "grad_norm": 8.476461410522461, + "learning_rate": 3.89409984871407e-06, + "loss": 0.9853, + "step": 143 + }, + { + "epoch": 0.010896296016041768, + "grad_norm": 5.929498672485352, + "learning_rate": 3.921331316187595e-06, + "loss": 1.2708, + "step": 144 + }, + { + "epoch": 0.010971964738375393, + "grad_norm": 3.594865560531616, + "learning_rate": 3.948562783661119e-06, + "loss": 1.077, + "step": 145 + }, + { + "epoch": 0.011047633460709015, + "grad_norm": 5.143126964569092, + "learning_rate": 3.975794251134644e-06, + "loss": 0.8743, + "step": 146 + }, + { + "epoch": 0.01112330218304264, + "grad_norm": 3.9312331676483154, + "learning_rate": 4.00302571860817e-06, + "loss": 1.0474, + "step": 147 + }, + { + "epoch": 0.011198970905376263, + "grad_norm": 8.739069938659668, + "learning_rate": 4.030257186081694e-06, + "loss": 1.1, + "step": 148 + }, + { + "epoch": 0.011274639627709885, + "grad_norm": 4.785227298736572, + "learning_rate": 4.05748865355522e-06, + "loss": 1.0795, + "step": 149 + }, + { + "epoch": 0.01135030835004351, + "grad_norm": 4.471089839935303, + "learning_rate": 4.0847201210287446e-06, + "loss": 1.0992, + "step": 150 + }, + { + "epoch": 0.011425977072377132, + "grad_norm": 9.019781112670898, + "learning_rate": 4.111951588502269e-06, + "loss": 1.0866, + "step": 151 + }, + { + "epoch": 0.011501645794710757, + "grad_norm": 5.071846961975098, + "learning_rate": 4.139183055975794e-06, + "loss": 1.1729, + "step": 152 + }, + { + "epoch": 0.01157731451704438, + "grad_norm": 4.376003742218018, + "learning_rate": 4.1664145234493195e-06, + "loss": 1.203, + "step": 153 + }, + { + "epoch": 0.011652983239378004, + "grad_norm": 6.377297878265381, + "learning_rate": 4.193645990922844e-06, + "loss": 1.1541, + "step": 154 + }, + { + "epoch": 0.011728651961711627, + "grad_norm": 7.824789047241211, + "learning_rate": 4.220877458396369e-06, + "loss": 1.0479, + "step": 155 + }, + { + "epoch": 0.01180432068404525, + "grad_norm": 4.779434680938721, + "learning_rate": 4.2481089258698945e-06, + "loss": 1.0529, + "step": 156 + }, + { + "epoch": 0.011879989406378874, + "grad_norm": 8.425029754638672, + "learning_rate": 4.275340393343419e-06, + "loss": 1.0992, + "step": 157 + }, + { + "epoch": 0.011955658128712496, + "grad_norm": 5.336421966552734, + "learning_rate": 4.302571860816944e-06, + "loss": 1.0226, + "step": 158 + }, + { + "epoch": 0.012031326851046121, + "grad_norm": 7.749419212341309, + "learning_rate": 4.329803328290469e-06, + "loss": 1.0299, + "step": 159 + }, + { + "epoch": 0.012106995573379744, + "grad_norm": 8.775020599365234, + "learning_rate": 4.357034795763994e-06, + "loss": 1.2269, + "step": 160 + }, + { + "epoch": 0.012182664295713366, + "grad_norm": 6.738064289093018, + "learning_rate": 4.384266263237519e-06, + "loss": 1.0477, + "step": 161 + }, + { + "epoch": 0.01225833301804699, + "grad_norm": 4.490780353546143, + "learning_rate": 4.411497730711044e-06, + "loss": 1.0986, + "step": 162 + }, + { + "epoch": 0.012334001740380613, + "grad_norm": 4.881013870239258, + "learning_rate": 4.438729198184569e-06, + "loss": 1.0979, + "step": 163 + }, 
+ { + "epoch": 0.012409670462714238, + "grad_norm": 3.8033907413482666, + "learning_rate": 4.465960665658094e-06, + "loss": 1.2036, + "step": 164 + }, + { + "epoch": 0.01248533918504786, + "grad_norm": 4.627386093139648, + "learning_rate": 4.4931921331316185e-06, + "loss": 1.0643, + "step": 165 + }, + { + "epoch": 0.012561007907381483, + "grad_norm": 4.1467108726501465, + "learning_rate": 4.520423600605144e-06, + "loss": 1.0594, + "step": 166 + }, + { + "epoch": 0.012636676629715108, + "grad_norm": 3.9435312747955322, + "learning_rate": 4.54765506807867e-06, + "loss": 1.0522, + "step": 167 + }, + { + "epoch": 0.01271234535204873, + "grad_norm": 7.734802722930908, + "learning_rate": 4.5748865355521935e-06, + "loss": 1.1185, + "step": 168 + }, + { + "epoch": 0.012788014074382355, + "grad_norm": 11.209372520446777, + "learning_rate": 4.602118003025719e-06, + "loss": 1.0703, + "step": 169 + }, + { + "epoch": 0.012863682796715977, + "grad_norm": 4.650092124938965, + "learning_rate": 4.629349470499244e-06, + "loss": 0.9494, + "step": 170 + }, + { + "epoch": 0.0129393515190496, + "grad_norm": 3.6348683834075928, + "learning_rate": 4.6565809379727684e-06, + "loss": 0.9492, + "step": 171 + }, + { + "epoch": 0.013015020241383225, + "grad_norm": 4.223328590393066, + "learning_rate": 4.683812405446293e-06, + "loss": 1.0867, + "step": 172 + }, + { + "epoch": 0.013090688963716847, + "grad_norm": 3.262395143508911, + "learning_rate": 4.711043872919819e-06, + "loss": 1.0776, + "step": 173 + }, + { + "epoch": 0.013166357686050472, + "grad_norm": 8.45308780670166, + "learning_rate": 4.738275340393344e-06, + "loss": 0.9743, + "step": 174 + }, + { + "epoch": 0.013242026408384094, + "grad_norm": 6.289074420928955, + "learning_rate": 4.765506807866868e-06, + "loss": 1.0345, + "step": 175 + }, + { + "epoch": 0.013317695130717717, + "grad_norm": 3.5306406021118164, + "learning_rate": 4.792738275340394e-06, + "loss": 0.9317, + "step": 176 + }, + { + "epoch": 0.013393363853051342, + "grad_norm": 2.8451457023620605, + "learning_rate": 4.819969742813918e-06, + "loss": 1.1339, + "step": 177 + }, + { + "epoch": 0.013469032575384964, + "grad_norm": 4.440673351287842, + "learning_rate": 4.847201210287444e-06, + "loss": 1.1597, + "step": 178 + }, + { + "epoch": 0.013544701297718589, + "grad_norm": 4.818862438201904, + "learning_rate": 4.874432677760968e-06, + "loss": 1.2254, + "step": 179 + }, + { + "epoch": 0.013620370020052211, + "grad_norm": 28.1450138092041, + "learning_rate": 4.901664145234493e-06, + "loss": 1.0227, + "step": 180 + }, + { + "epoch": 0.013696038742385834, + "grad_norm": 5.457899570465088, + "learning_rate": 4.928895612708019e-06, + "loss": 0.9296, + "step": 181 + }, + { + "epoch": 0.013771707464719458, + "grad_norm": 7.308279514312744, + "learning_rate": 4.9561270801815436e-06, + "loss": 1.0794, + "step": 182 + }, + { + "epoch": 0.013847376187053081, + "grad_norm": 4.073877811431885, + "learning_rate": 4.983358547655068e-06, + "loss": 1.0149, + "step": 183 + }, + { + "epoch": 0.013923044909386706, + "grad_norm": 4.441009044647217, + "learning_rate": 5.010590015128593e-06, + "loss": 1.1169, + "step": 184 + }, + { + "epoch": 0.013998713631720328, + "grad_norm": 3.17683744430542, + "learning_rate": 5.0378214826021185e-06, + "loss": 0.9667, + "step": 185 + }, + { + "epoch": 0.014074382354053951, + "grad_norm": 5.486913204193115, + "learning_rate": 5.065052950075642e-06, + "loss": 0.9504, + "step": 186 + }, + { + "epoch": 0.014150051076387575, + "grad_norm": 3.611471176147461, + "learning_rate": 
5.092284417549168e-06, + "loss": 1.0478, + "step": 187 + }, + { + "epoch": 0.014225719798721198, + "grad_norm": 4.8300981521606445, + "learning_rate": 5.1195158850226935e-06, + "loss": 1.0817, + "step": 188 + }, + { + "epoch": 0.014301388521054823, + "grad_norm": 13.371825218200684, + "learning_rate": 5.146747352496218e-06, + "loss": 1.046, + "step": 189 + }, + { + "epoch": 0.014377057243388445, + "grad_norm": 4.5194621086120605, + "learning_rate": 5.173978819969743e-06, + "loss": 0.9503, + "step": 190 + }, + { + "epoch": 0.01445272596572207, + "grad_norm": 4.531895637512207, + "learning_rate": 5.201210287443268e-06, + "loss": 0.8728, + "step": 191 + }, + { + "epoch": 0.014528394688055692, + "grad_norm": 4.822807788848877, + "learning_rate": 5.228441754916793e-06, + "loss": 0.9806, + "step": 192 + }, + { + "epoch": 0.014604063410389315, + "grad_norm": 2.7862422466278076, + "learning_rate": 5.255673222390318e-06, + "loss": 1.0006, + "step": 193 + }, + { + "epoch": 0.01467973213272294, + "grad_norm": 4.66079044342041, + "learning_rate": 5.2829046898638426e-06, + "loss": 1.0805, + "step": 194 + }, + { + "epoch": 0.014755400855056562, + "grad_norm": 3.729022264480591, + "learning_rate": 5.310136157337367e-06, + "loss": 1.1511, + "step": 195 + }, + { + "epoch": 0.014831069577390187, + "grad_norm": 2.492928981781006, + "learning_rate": 5.337367624810893e-06, + "loss": 1.0107, + "step": 196 + }, + { + "epoch": 0.01490673829972381, + "grad_norm": 5.297464370727539, + "learning_rate": 5.364599092284418e-06, + "loss": 1.0515, + "step": 197 + }, + { + "epoch": 0.014982407022057432, + "grad_norm": 5.455732345581055, + "learning_rate": 5.391830559757942e-06, + "loss": 0.9973, + "step": 198 + }, + { + "epoch": 0.015058075744391056, + "grad_norm": 10.293705940246582, + "learning_rate": 5.419062027231468e-06, + "loss": 1.0379, + "step": 199 + }, + { + "epoch": 0.015133744466724679, + "grad_norm": 6.046285152435303, + "learning_rate": 5.4462934947049925e-06, + "loss": 1.0908, + "step": 200 + }, + { + "epoch": 0.015209413189058303, + "grad_norm": 19.921154022216797, + "learning_rate": 5.473524962178517e-06, + "loss": 0.9654, + "step": 201 + }, + { + "epoch": 0.015285081911391926, + "grad_norm": 3.6806235313415527, + "learning_rate": 5.500756429652042e-06, + "loss": 1.1749, + "step": 202 + }, + { + "epoch": 0.015360750633725549, + "grad_norm": 17.536718368530273, + "learning_rate": 5.5279878971255674e-06, + "loss": 0.9876, + "step": 203 + }, + { + "epoch": 0.015436419356059173, + "grad_norm": 6.348948955535889, + "learning_rate": 5.555219364599093e-06, + "loss": 1.1173, + "step": 204 + }, + { + "epoch": 0.015512088078392796, + "grad_norm": 3.97938871383667, + "learning_rate": 5.582450832072617e-06, + "loss": 0.9842, + "step": 205 + }, + { + "epoch": 0.01558775680072642, + "grad_norm": 22.651533126831055, + "learning_rate": 5.609682299546142e-06, + "loss": 1.2029, + "step": 206 + }, + { + "epoch": 0.015663425523060045, + "grad_norm": 2.902076005935669, + "learning_rate": 5.636913767019667e-06, + "loss": 0.9287, + "step": 207 + }, + { + "epoch": 0.015739094245393666, + "grad_norm": 4.252765655517578, + "learning_rate": 5.664145234493193e-06, + "loss": 1.1368, + "step": 208 + }, + { + "epoch": 0.01581476296772729, + "grad_norm": 7.302497386932373, + "learning_rate": 5.6913767019667165e-06, + "loss": 0.909, + "step": 209 + }, + { + "epoch": 0.015890431690060915, + "grad_norm": 3.428435802459717, + "learning_rate": 5.718608169440242e-06, + "loss": 0.8431, + "step": 210 + }, + { + "epoch": 0.015966100412394536, + 
"grad_norm": 26.798280715942383, + "learning_rate": 5.745839636913768e-06, + "loss": 1.055, + "step": 211 + }, + { + "epoch": 0.01604176913472816, + "grad_norm": 2.7646429538726807, + "learning_rate": 5.773071104387292e-06, + "loss": 0.9447, + "step": 212 + }, + { + "epoch": 0.016117437857061784, + "grad_norm": 41.349822998046875, + "learning_rate": 5.800302571860817e-06, + "loss": 0.9701, + "step": 213 + }, + { + "epoch": 0.016193106579395405, + "grad_norm": 3.406937599182129, + "learning_rate": 5.827534039334342e-06, + "loss": 0.8292, + "step": 214 + }, + { + "epoch": 0.01626877530172903, + "grad_norm": 4.102505207061768, + "learning_rate": 5.854765506807867e-06, + "loss": 1.0225, + "step": 215 + }, + { + "epoch": 0.016344444024062654, + "grad_norm": 3.140641689300537, + "learning_rate": 5.881996974281391e-06, + "loss": 1.0587, + "step": 216 + }, + { + "epoch": 0.01642011274639628, + "grad_norm": 3.22949481010437, + "learning_rate": 5.909228441754917e-06, + "loss": 0.9173, + "step": 217 + }, + { + "epoch": 0.0164957814687299, + "grad_norm": 3.503279209136963, + "learning_rate": 5.936459909228442e-06, + "loss": 1.0667, + "step": 218 + }, + { + "epoch": 0.016571450191063524, + "grad_norm": 3.5009310245513916, + "learning_rate": 5.963691376701967e-06, + "loss": 0.9895, + "step": 219 + }, + { + "epoch": 0.01664711891339715, + "grad_norm": 2.58160400390625, + "learning_rate": 5.990922844175492e-06, + "loss": 0.995, + "step": 220 + }, + { + "epoch": 0.01672278763573077, + "grad_norm": 3.4536960124969482, + "learning_rate": 6.018154311649016e-06, + "loss": 0.965, + "step": 221 + }, + { + "epoch": 0.016798456358064394, + "grad_norm": 4.148844242095947, + "learning_rate": 6.045385779122542e-06, + "loss": 1.1884, + "step": 222 + }, + { + "epoch": 0.01687412508039802, + "grad_norm": 3.2756500244140625, + "learning_rate": 6.0726172465960674e-06, + "loss": 1.0833, + "step": 223 + }, + { + "epoch": 0.01694979380273164, + "grad_norm": 11.904067993164062, + "learning_rate": 6.099848714069591e-06, + "loss": 0.9768, + "step": 224 + }, + { + "epoch": 0.017025462525065264, + "grad_norm": 3.4661612510681152, + "learning_rate": 6.127080181543117e-06, + "loss": 1.0659, + "step": 225 + }, + { + "epoch": 0.017101131247398888, + "grad_norm": 4.6939167976379395, + "learning_rate": 6.1543116490166416e-06, + "loss": 1.0258, + "step": 226 + }, + { + "epoch": 0.017176799969732513, + "grad_norm": 2.9636013507843018, + "learning_rate": 6.181543116490167e-06, + "loss": 1.0982, + "step": 227 + }, + { + "epoch": 0.017252468692066134, + "grad_norm": 4.7203826904296875, + "learning_rate": 6.208774583963691e-06, + "loss": 1.1271, + "step": 228 + }, + { + "epoch": 0.017328137414399758, + "grad_norm": 16.543560028076172, + "learning_rate": 6.2360060514372165e-06, + "loss": 1.0417, + "step": 229 + }, + { + "epoch": 0.017403806136733382, + "grad_norm": 3.9379353523254395, + "learning_rate": 6.263237518910742e-06, + "loss": 1.0619, + "step": 230 + }, + { + "epoch": 0.017479474859067003, + "grad_norm": 3.562490701675415, + "learning_rate": 6.290468986384266e-06, + "loss": 0.8995, + "step": 231 + }, + { + "epoch": 0.017555143581400628, + "grad_norm": 3.5367138385772705, + "learning_rate": 6.3177004538577915e-06, + "loss": 1.0685, + "step": 232 + }, + { + "epoch": 0.017630812303734252, + "grad_norm": 4.708889961242676, + "learning_rate": 6.344931921331316e-06, + "loss": 0.9722, + "step": 233 + }, + { + "epoch": 0.017706481026067873, + "grad_norm": 3.517542600631714, + "learning_rate": 6.372163388804842e-06, + "loss": 1.0182, + 
"step": 234 + }, + { + "epoch": 0.017782149748401498, + "grad_norm": 5.070290565490723, + "learning_rate": 6.399394856278366e-06, + "loss": 0.9933, + "step": 235 + }, + { + "epoch": 0.017857818470735122, + "grad_norm": 3.898589849472046, + "learning_rate": 6.426626323751891e-06, + "loss": 0.8593, + "step": 236 + }, + { + "epoch": 0.017933487193068746, + "grad_norm": 3.1231884956359863, + "learning_rate": 6.453857791225417e-06, + "loss": 1.2586, + "step": 237 + }, + { + "epoch": 0.018009155915402367, + "grad_norm": 4.742455005645752, + "learning_rate": 6.481089258698941e-06, + "loss": 1.0591, + "step": 238 + }, + { + "epoch": 0.018084824637735992, + "grad_norm": 5.456655502319336, + "learning_rate": 6.508320726172466e-06, + "loss": 0.7356, + "step": 239 + }, + { + "epoch": 0.018160493360069616, + "grad_norm": 3.905632495880127, + "learning_rate": 6.535552193645991e-06, + "loss": 0.881, + "step": 240 + }, + { + "epoch": 0.018236162082403237, + "grad_norm": 2.6957733631134033, + "learning_rate": 6.562783661119516e-06, + "loss": 1.0049, + "step": 241 + }, + { + "epoch": 0.01831183080473686, + "grad_norm": 3.1716785430908203, + "learning_rate": 6.590015128593041e-06, + "loss": 0.981, + "step": 242 + }, + { + "epoch": 0.018387499527070486, + "grad_norm": 3.713045597076416, + "learning_rate": 6.617246596066566e-06, + "loss": 0.9511, + "step": 243 + }, + { + "epoch": 0.01846316824940411, + "grad_norm": 6.078882694244385, + "learning_rate": 6.644478063540091e-06, + "loss": 0.8243, + "step": 244 + }, + { + "epoch": 0.01853883697173773, + "grad_norm": 3.7608482837677, + "learning_rate": 6.671709531013616e-06, + "loss": 0.8829, + "step": 245 + }, + { + "epoch": 0.018614505694071356, + "grad_norm": 6.1873321533203125, + "learning_rate": 6.698940998487141e-06, + "loss": 1.0227, + "step": 246 + }, + { + "epoch": 0.01869017441640498, + "grad_norm": 3.6704089641571045, + "learning_rate": 6.726172465960665e-06, + "loss": 0.9248, + "step": 247 + }, + { + "epoch": 0.0187658431387386, + "grad_norm": 3.1460700035095215, + "learning_rate": 6.753403933434191e-06, + "loss": 1.0313, + "step": 248 + }, + { + "epoch": 0.018841511861072226, + "grad_norm": 3.855139970779419, + "learning_rate": 6.780635400907716e-06, + "loss": 0.9423, + "step": 249 + }, + { + "epoch": 0.01891718058340585, + "grad_norm": 3.8225889205932617, + "learning_rate": 6.80786686838124e-06, + "loss": 0.9772, + "step": 250 + }, + { + "epoch": 0.01899284930573947, + "grad_norm": 2.7261159420013428, + "learning_rate": 6.835098335854766e-06, + "loss": 0.9998, + "step": 251 + }, + { + "epoch": 0.019068518028073096, + "grad_norm": 2.887666702270508, + "learning_rate": 6.862329803328291e-06, + "loss": 1.052, + "step": 252 + }, + { + "epoch": 0.01914418675040672, + "grad_norm": 3.3819353580474854, + "learning_rate": 6.889561270801816e-06, + "loss": 0.9431, + "step": 253 + }, + { + "epoch": 0.019219855472740344, + "grad_norm": 3.0159807205200195, + "learning_rate": 6.91679273827534e-06, + "loss": 0.9702, + "step": 254 + }, + { + "epoch": 0.019295524195073965, + "grad_norm": 3.413630962371826, + "learning_rate": 6.944024205748866e-06, + "loss": 0.8845, + "step": 255 + }, + { + "epoch": 0.01937119291740759, + "grad_norm": 5.078720569610596, + "learning_rate": 6.97125567322239e-06, + "loss": 0.9984, + "step": 256 + }, + { + "epoch": 0.019446861639741214, + "grad_norm": 3.1097350120544434, + "learning_rate": 6.998487140695916e-06, + "loss": 1.0355, + "step": 257 + }, + { + "epoch": 0.019522530362074835, + "grad_norm": 4.542701721191406, + "learning_rate": 
7.02571860816944e-06, + "loss": 1.1622, + "step": 258 + }, + { + "epoch": 0.01959819908440846, + "grad_norm": 10.559003829956055, + "learning_rate": 7.052950075642965e-06, + "loss": 1.2439, + "step": 259 + }, + { + "epoch": 0.019673867806742084, + "grad_norm": 5.9329609870910645, + "learning_rate": 7.080181543116491e-06, + "loss": 1.0442, + "step": 260 + }, + { + "epoch": 0.019749536529075705, + "grad_norm": 3.9243202209472656, + "learning_rate": 7.1074130105900155e-06, + "loss": 0.9679, + "step": 261 + }, + { + "epoch": 0.01982520525140933, + "grad_norm": 4.9199910163879395, + "learning_rate": 7.13464447806354e-06, + "loss": 1.1109, + "step": 262 + }, + { + "epoch": 0.019900873973742954, + "grad_norm": 2.8583781719207764, + "learning_rate": 7.161875945537065e-06, + "loss": 1.0152, + "step": 263 + }, + { + "epoch": 0.019976542696076578, + "grad_norm": 3.566678524017334, + "learning_rate": 7.1891074130105905e-06, + "loss": 1.0698, + "step": 264 + }, + { + "epoch": 0.0200522114184102, + "grad_norm": 4.623547554016113, + "learning_rate": 7.216338880484114e-06, + "loss": 1.0657, + "step": 265 + }, + { + "epoch": 0.020127880140743824, + "grad_norm": 3.9932198524475098, + "learning_rate": 7.24357034795764e-06, + "loss": 0.9034, + "step": 266 + }, + { + "epoch": 0.020203548863077448, + "grad_norm": 2.8435165882110596, + "learning_rate": 7.2708018154311654e-06, + "loss": 0.8764, + "step": 267 + }, + { + "epoch": 0.02027921758541107, + "grad_norm": 3.262338161468506, + "learning_rate": 7.29803328290469e-06, + "loss": 1.1094, + "step": 268 + }, + { + "epoch": 0.020354886307744693, + "grad_norm": 8.468903541564941, + "learning_rate": 7.325264750378215e-06, + "loss": 0.9228, + "step": 269 + }, + { + "epoch": 0.020430555030078318, + "grad_norm": 3.0359139442443848, + "learning_rate": 7.3524962178517395e-06, + "loss": 0.8587, + "step": 270 + }, + { + "epoch": 0.02050622375241194, + "grad_norm": 8.242533683776855, + "learning_rate": 7.379727685325265e-06, + "loss": 1.0187, + "step": 271 + }, + { + "epoch": 0.020581892474745563, + "grad_norm": 2.938859224319458, + "learning_rate": 7.406959152798791e-06, + "loss": 1.0385, + "step": 272 + }, + { + "epoch": 0.020657561197079188, + "grad_norm": 3.197413921356201, + "learning_rate": 7.4341906202723145e-06, + "loss": 0.9181, + "step": 273 + }, + { + "epoch": 0.020733229919412812, + "grad_norm": 5.897491931915283, + "learning_rate": 7.46142208774584e-06, + "loss": 1.0107, + "step": 274 + }, + { + "epoch": 0.020808898641746433, + "grad_norm": 3.680340528488159, + "learning_rate": 7.488653555219365e-06, + "loss": 1.0152, + "step": 275 + }, + { + "epoch": 0.020884567364080057, + "grad_norm": 6.313503265380859, + "learning_rate": 7.51588502269289e-06, + "loss": 1.0487, + "step": 276 + }, + { + "epoch": 0.020960236086413682, + "grad_norm": 3.8063981533050537, + "learning_rate": 7.543116490166414e-06, + "loss": 1.0217, + "step": 277 + }, + { + "epoch": 0.021035904808747303, + "grad_norm": 9.276975631713867, + "learning_rate": 7.57034795763994e-06, + "loss": 0.9587, + "step": 278 + }, + { + "epoch": 0.021111573531080927, + "grad_norm": 4.729788303375244, + "learning_rate": 7.597579425113465e-06, + "loss": 0.8946, + "step": 279 + }, + { + "epoch": 0.02118724225341455, + "grad_norm": 3.3282487392425537, + "learning_rate": 7.624810892586989e-06, + "loss": 1.1086, + "step": 280 + }, + { + "epoch": 0.021262910975748176, + "grad_norm": 2.672250509262085, + "learning_rate": 7.652042360060515e-06, + "loss": 0.8411, + "step": 281 + }, + { + "epoch": 0.021338579698081797, + 
"grad_norm": 4.054312705993652, + "learning_rate": 7.67927382753404e-06, + "loss": 1.0374, + "step": 282 + }, + { + "epoch": 0.02141424842041542, + "grad_norm": 4.272651195526123, + "learning_rate": 7.706505295007564e-06, + "loss": 1.039, + "step": 283 + }, + { + "epoch": 0.021489917142749046, + "grad_norm": 3.3986735343933105, + "learning_rate": 7.733736762481089e-06, + "loss": 0.9995, + "step": 284 + }, + { + "epoch": 0.021565585865082667, + "grad_norm": 4.5488481521606445, + "learning_rate": 7.760968229954613e-06, + "loss": 1.1762, + "step": 285 + }, + { + "epoch": 0.02164125458741629, + "grad_norm": 4.396289348602295, + "learning_rate": 7.78819969742814e-06, + "loss": 0.9458, + "step": 286 + }, + { + "epoch": 0.021716923309749916, + "grad_norm": 4.582161903381348, + "learning_rate": 7.815431164901665e-06, + "loss": 1.0327, + "step": 287 + }, + { + "epoch": 0.021792592032083537, + "grad_norm": 4.647852420806885, + "learning_rate": 7.84266263237519e-06, + "loss": 1.0392, + "step": 288 + }, + { + "epoch": 0.02186826075441716, + "grad_norm": 5.218382358551025, + "learning_rate": 7.869894099848714e-06, + "loss": 0.9462, + "step": 289 + }, + { + "epoch": 0.021943929476750786, + "grad_norm": 5.8986029624938965, + "learning_rate": 7.897125567322239e-06, + "loss": 0.975, + "step": 290 + }, + { + "epoch": 0.02201959819908441, + "grad_norm": 3.1991119384765625, + "learning_rate": 7.924357034795765e-06, + "loss": 0.8724, + "step": 291 + }, + { + "epoch": 0.02209526692141803, + "grad_norm": 3.476820230484009, + "learning_rate": 7.951588502269288e-06, + "loss": 0.9873, + "step": 292 + }, + { + "epoch": 0.022170935643751655, + "grad_norm": 3.6004443168640137, + "learning_rate": 7.978819969742815e-06, + "loss": 0.8335, + "step": 293 + }, + { + "epoch": 0.02224660436608528, + "grad_norm": 2.7738335132598877, + "learning_rate": 8.00605143721634e-06, + "loss": 1.0594, + "step": 294 + }, + { + "epoch": 0.0223222730884189, + "grad_norm": 4.626110076904297, + "learning_rate": 8.033282904689864e-06, + "loss": 1.0716, + "step": 295 + }, + { + "epoch": 0.022397941810752525, + "grad_norm": 4.319602966308594, + "learning_rate": 8.060514372163389e-06, + "loss": 0.9551, + "step": 296 + }, + { + "epoch": 0.02247361053308615, + "grad_norm": 4.208806991577148, + "learning_rate": 8.087745839636913e-06, + "loss": 0.8445, + "step": 297 + }, + { + "epoch": 0.02254927925541977, + "grad_norm": 4.937840938568115, + "learning_rate": 8.11497730711044e-06, + "loss": 1.0403, + "step": 298 + }, + { + "epoch": 0.022624947977753395, + "grad_norm": 6.741675853729248, + "learning_rate": 8.142208774583963e-06, + "loss": 0.9795, + "step": 299 + }, + { + "epoch": 0.02270061670008702, + "grad_norm": 4.731930255889893, + "learning_rate": 8.169440242057489e-06, + "loss": 0.9626, + "step": 300 + }, + { + "epoch": 0.022776285422420644, + "grad_norm": 3.032463550567627, + "learning_rate": 8.196671709531014e-06, + "loss": 0.8131, + "step": 301 + }, + { + "epoch": 0.022851954144754265, + "grad_norm": 3.7094199657440186, + "learning_rate": 8.223903177004539e-06, + "loss": 0.9966, + "step": 302 + }, + { + "epoch": 0.02292762286708789, + "grad_norm": 4.705551624298096, + "learning_rate": 8.251134644478063e-06, + "loss": 1.0382, + "step": 303 + }, + { + "epoch": 0.023003291589421514, + "grad_norm": 4.8940510749816895, + "learning_rate": 8.278366111951588e-06, + "loss": 0.9043, + "step": 304 + }, + { + "epoch": 0.023078960311755135, + "grad_norm": 5.167042255401611, + "learning_rate": 8.305597579425114e-06, + "loss": 0.7667, + "step": 305 + }, + { 
+ "epoch": 0.02315462903408876, + "grad_norm": 5.086777687072754, + "learning_rate": 8.332829046898639e-06, + "loss": 0.8992, + "step": 306 + }, + { + "epoch": 0.023230297756422384, + "grad_norm": 12.379859924316406, + "learning_rate": 8.360060514372164e-06, + "loss": 1.0104, + "step": 307 + }, + { + "epoch": 0.023305966478756008, + "grad_norm": 5.195709228515625, + "learning_rate": 8.387291981845688e-06, + "loss": 0.9138, + "step": 308 + }, + { + "epoch": 0.02338163520108963, + "grad_norm": 4.756507396697998, + "learning_rate": 8.414523449319213e-06, + "loss": 0.9112, + "step": 309 + }, + { + "epoch": 0.023457303923423253, + "grad_norm": 4.002053737640381, + "learning_rate": 8.441754916792738e-06, + "loss": 0.8504, + "step": 310 + }, + { + "epoch": 0.023532972645756878, + "grad_norm": 4.4022064208984375, + "learning_rate": 8.468986384266263e-06, + "loss": 0.9997, + "step": 311 + }, + { + "epoch": 0.0236086413680905, + "grad_norm": 6.589466571807861, + "learning_rate": 8.496217851739789e-06, + "loss": 0.9104, + "step": 312 + }, + { + "epoch": 0.023684310090424123, + "grad_norm": 4.6508989334106445, + "learning_rate": 8.523449319213314e-06, + "loss": 0.9674, + "step": 313 + }, + { + "epoch": 0.023759978812757748, + "grad_norm": 4.066075325012207, + "learning_rate": 8.550680786686838e-06, + "loss": 0.8831, + "step": 314 + }, + { + "epoch": 0.02383564753509137, + "grad_norm": 8.613035202026367, + "learning_rate": 8.577912254160363e-06, + "loss": 0.8986, + "step": 315 + }, + { + "epoch": 0.023911316257424993, + "grad_norm": 3.7301228046417236, + "learning_rate": 8.605143721633888e-06, + "loss": 0.7915, + "step": 316 + }, + { + "epoch": 0.023986984979758617, + "grad_norm": 8.060126304626465, + "learning_rate": 8.632375189107414e-06, + "loss": 0.9269, + "step": 317 + }, + { + "epoch": 0.024062653702092242, + "grad_norm": 4.43103551864624, + "learning_rate": 8.659606656580937e-06, + "loss": 0.9495, + "step": 318 + }, + { + "epoch": 0.024138322424425863, + "grad_norm": 3.900829553604126, + "learning_rate": 8.686838124054464e-06, + "loss": 0.8808, + "step": 319 + }, + { + "epoch": 0.024213991146759487, + "grad_norm": 4.360715866088867, + "learning_rate": 8.714069591527988e-06, + "loss": 1.0735, + "step": 320 + }, + { + "epoch": 0.02428965986909311, + "grad_norm": 5.8079633712768555, + "learning_rate": 8.741301059001513e-06, + "loss": 1.0089, + "step": 321 + }, + { + "epoch": 0.024365328591426733, + "grad_norm": 31.655710220336914, + "learning_rate": 8.768532526475038e-06, + "loss": 1.0929, + "step": 322 + }, + { + "epoch": 0.024440997313760357, + "grad_norm": 5.170853137969971, + "learning_rate": 8.795763993948562e-06, + "loss": 1.066, + "step": 323 + }, + { + "epoch": 0.02451666603609398, + "grad_norm": 3.7530641555786133, + "learning_rate": 8.822995461422089e-06, + "loss": 0.8663, + "step": 324 + }, + { + "epoch": 0.024592334758427602, + "grad_norm": 4.36927604675293, + "learning_rate": 8.850226928895612e-06, + "loss": 0.8544, + "step": 325 + }, + { + "epoch": 0.024668003480761227, + "grad_norm": 3.9863076210021973, + "learning_rate": 8.877458396369138e-06, + "loss": 0.935, + "step": 326 + }, + { + "epoch": 0.02474367220309485, + "grad_norm": 3.438127040863037, + "learning_rate": 8.904689863842663e-06, + "loss": 0.9287, + "step": 327 + }, + { + "epoch": 0.024819340925428476, + "grad_norm": 5.561075210571289, + "learning_rate": 8.931921331316188e-06, + "loss": 1.0388, + "step": 328 + }, + { + "epoch": 0.024895009647762097, + "grad_norm": 4.477010726928711, + "learning_rate": 8.959152798789712e-06, 
+ "loss": 0.9875, + "step": 329 + }, + { + "epoch": 0.02497067837009572, + "grad_norm": 4.917139053344727, + "learning_rate": 8.986384266263237e-06, + "loss": 0.8978, + "step": 330 + }, + { + "epoch": 0.025046347092429345, + "grad_norm": 4.206781387329102, + "learning_rate": 9.013615733736763e-06, + "loss": 0.96, + "step": 331 + }, + { + "epoch": 0.025122015814762966, + "grad_norm": 2.9883878231048584, + "learning_rate": 9.040847201210288e-06, + "loss": 0.857, + "step": 332 + }, + { + "epoch": 0.02519768453709659, + "grad_norm": 9.420825958251953, + "learning_rate": 9.068078668683813e-06, + "loss": 1.0236, + "step": 333 + }, + { + "epoch": 0.025273353259430215, + "grad_norm": 4.354033470153809, + "learning_rate": 9.09531013615734e-06, + "loss": 0.9295, + "step": 334 + }, + { + "epoch": 0.025349021981763836, + "grad_norm": 4.949868679046631, + "learning_rate": 9.122541603630862e-06, + "loss": 0.7853, + "step": 335 + }, + { + "epoch": 0.02542469070409746, + "grad_norm": 4.656820297241211, + "learning_rate": 9.149773071104387e-06, + "loss": 1.096, + "step": 336 + }, + { + "epoch": 0.025500359426431085, + "grad_norm": 4.696194171905518, + "learning_rate": 9.177004538577912e-06, + "loss": 0.8964, + "step": 337 + }, + { + "epoch": 0.02557602814876471, + "grad_norm": 4.845102310180664, + "learning_rate": 9.204236006051438e-06, + "loss": 1.1736, + "step": 338 + }, + { + "epoch": 0.02565169687109833, + "grad_norm": 6.742333889007568, + "learning_rate": 9.231467473524963e-06, + "loss": 0.9915, + "step": 339 + }, + { + "epoch": 0.025727365593431955, + "grad_norm": 4.6622538566589355, + "learning_rate": 9.258698940998487e-06, + "loss": 1.0591, + "step": 340 + }, + { + "epoch": 0.02580303431576558, + "grad_norm": 4.871918201446533, + "learning_rate": 9.285930408472014e-06, + "loss": 1.0679, + "step": 341 + }, + { + "epoch": 0.0258787030380992, + "grad_norm": 3.380262851715088, + "learning_rate": 9.313161875945537e-06, + "loss": 0.7519, + "step": 342 + }, + { + "epoch": 0.025954371760432825, + "grad_norm": 4.895992755889893, + "learning_rate": 9.340393343419062e-06, + "loss": 0.9666, + "step": 343 + }, + { + "epoch": 0.02603004048276645, + "grad_norm": 5.649062633514404, + "learning_rate": 9.367624810892586e-06, + "loss": 0.9477, + "step": 344 + }, + { + "epoch": 0.026105709205100074, + "grad_norm": 5.298853397369385, + "learning_rate": 9.394856278366113e-06, + "loss": 0.8321, + "step": 345 + }, + { + "epoch": 0.026181377927433695, + "grad_norm": 15.001054763793945, + "learning_rate": 9.422087745839637e-06, + "loss": 0.8249, + "step": 346 + }, + { + "epoch": 0.02625704664976732, + "grad_norm": 7.537627220153809, + "learning_rate": 9.449319213313162e-06, + "loss": 0.8955, + "step": 347 + }, + { + "epoch": 0.026332715372100943, + "grad_norm": 6.6606245040893555, + "learning_rate": 9.476550680786688e-06, + "loss": 0.9237, + "step": 348 + }, + { + "epoch": 0.026408384094434564, + "grad_norm": 7.1370673179626465, + "learning_rate": 9.503782148260213e-06, + "loss": 1.0608, + "step": 349 + }, + { + "epoch": 0.02648405281676819, + "grad_norm": 4.019873142242432, + "learning_rate": 9.531013615733736e-06, + "loss": 0.9431, + "step": 350 + }, + { + "epoch": 0.026559721539101813, + "grad_norm": 3.8298895359039307, + "learning_rate": 9.558245083207261e-06, + "loss": 0.9514, + "step": 351 + }, + { + "epoch": 0.026635390261435434, + "grad_norm": 3.851069688796997, + "learning_rate": 9.585476550680787e-06, + "loss": 1.044, + "step": 352 + }, + { + "epoch": 0.02671105898376906, + "grad_norm": 5.827500343322754, + 
"learning_rate": 9.612708018154312e-06, + "loss": 0.8259, + "step": 353 + }, + { + "epoch": 0.026786727706102683, + "grad_norm": 4.617655277252197, + "learning_rate": 9.639939485627837e-06, + "loss": 0.8898, + "step": 354 + }, + { + "epoch": 0.026862396428436307, + "grad_norm": 7.916740417480469, + "learning_rate": 9.667170953101363e-06, + "loss": 0.8235, + "step": 355 + }, + { + "epoch": 0.02693806515076993, + "grad_norm": 4.243466377258301, + "learning_rate": 9.694402420574888e-06, + "loss": 0.9921, + "step": 356 + }, + { + "epoch": 0.027013733873103553, + "grad_norm": 6.203061580657959, + "learning_rate": 9.72163388804841e-06, + "loss": 1.0662, + "step": 357 + }, + { + "epoch": 0.027089402595437177, + "grad_norm": 4.784158229827881, + "learning_rate": 9.748865355521936e-06, + "loss": 0.8327, + "step": 358 + }, + { + "epoch": 0.027165071317770798, + "grad_norm": 4.381805896759033, + "learning_rate": 9.776096822995462e-06, + "loss": 1.0241, + "step": 359 + }, + { + "epoch": 0.027240740040104423, + "grad_norm": 4.59453821182251, + "learning_rate": 9.803328290468987e-06, + "loss": 1.1991, + "step": 360 + }, + { + "epoch": 0.027316408762438047, + "grad_norm": 4.59682035446167, + "learning_rate": 9.830559757942511e-06, + "loss": 0.9785, + "step": 361 + }, + { + "epoch": 0.027392077484771668, + "grad_norm": 3.2296361923217773, + "learning_rate": 9.857791225416038e-06, + "loss": 0.8848, + "step": 362 + }, + { + "epoch": 0.027467746207105292, + "grad_norm": 4.408949375152588, + "learning_rate": 9.885022692889562e-06, + "loss": 1.1622, + "step": 363 + }, + { + "epoch": 0.027543414929438917, + "grad_norm": 4.724997520446777, + "learning_rate": 9.912254160363087e-06, + "loss": 0.7884, + "step": 364 + }, + { + "epoch": 0.02761908365177254, + "grad_norm": 3.5149667263031006, + "learning_rate": 9.93948562783661e-06, + "loss": 0.9649, + "step": 365 + }, + { + "epoch": 0.027694752374106162, + "grad_norm": 3.3947033882141113, + "learning_rate": 9.966717095310137e-06, + "loss": 0.8381, + "step": 366 + }, + { + "epoch": 0.027770421096439787, + "grad_norm": 7.352261066436768, + "learning_rate": 9.993948562783661e-06, + "loss": 0.8653, + "step": 367 + }, + { + "epoch": 0.02784608981877341, + "grad_norm": 5.134012699127197, + "learning_rate": 1.0021180030257186e-05, + "loss": 0.8702, + "step": 368 + }, + { + "epoch": 0.027921758541107032, + "grad_norm": 4.905878067016602, + "learning_rate": 1.0048411497730712e-05, + "loss": 0.9004, + "step": 369 + }, + { + "epoch": 0.027997427263440657, + "grad_norm": 6.044192790985107, + "learning_rate": 1.0075642965204237e-05, + "loss": 1.0201, + "step": 370 + }, + { + "epoch": 0.02807309598577428, + "grad_norm": 4.332431316375732, + "learning_rate": 1.0102874432677762e-05, + "loss": 0.9866, + "step": 371 + }, + { + "epoch": 0.028148764708107902, + "grad_norm": 5.870851516723633, + "learning_rate": 1.0130105900151285e-05, + "loss": 0.8737, + "step": 372 + }, + { + "epoch": 0.028224433430441526, + "grad_norm": 4.059363842010498, + "learning_rate": 1.0157337367624811e-05, + "loss": 0.851, + "step": 373 + }, + { + "epoch": 0.02830010215277515, + "grad_norm": 5.465144634246826, + "learning_rate": 1.0184568835098336e-05, + "loss": 1.0343, + "step": 374 + }, + { + "epoch": 0.028375770875108775, + "grad_norm": 4.673175811767578, + "learning_rate": 1.021180030257186e-05, + "loss": 0.9237, + "step": 375 + }, + { + "epoch": 0.028451439597442396, + "grad_norm": 3.5958478450775146, + "learning_rate": 1.0239031770045387e-05, + "loss": 0.9103, + "step": 376 + }, + { + "epoch": 
0.02852710831977602, + "grad_norm": 9.658095359802246, + "learning_rate": 1.0266263237518912e-05, + "loss": 1.0272, + "step": 377 + }, + { + "epoch": 0.028602777042109645, + "grad_norm": 4.175169944763184, + "learning_rate": 1.0293494704992436e-05, + "loss": 0.8041, + "step": 378 + }, + { + "epoch": 0.028678445764443266, + "grad_norm": 3.949751853942871, + "learning_rate": 1.0320726172465961e-05, + "loss": 0.8977, + "step": 379 + }, + { + "epoch": 0.02875411448677689, + "grad_norm": 4.572116374969482, + "learning_rate": 1.0347957639939486e-05, + "loss": 0.8524, + "step": 380 + }, + { + "epoch": 0.028829783209110515, + "grad_norm": 4.106285095214844, + "learning_rate": 1.037518910741301e-05, + "loss": 1.0366, + "step": 381 + }, + { + "epoch": 0.02890545193144414, + "grad_norm": 3.8881635665893555, + "learning_rate": 1.0402420574886535e-05, + "loss": 0.8127, + "step": 382 + }, + { + "epoch": 0.02898112065377776, + "grad_norm": 6.574056625366211, + "learning_rate": 1.0429652042360062e-05, + "loss": 0.805, + "step": 383 + }, + { + "epoch": 0.029056789376111385, + "grad_norm": 4.1317057609558105, + "learning_rate": 1.0456883509833586e-05, + "loss": 1.1139, + "step": 384 + }, + { + "epoch": 0.02913245809844501, + "grad_norm": 6.96987771987915, + "learning_rate": 1.0484114977307111e-05, + "loss": 1.0626, + "step": 385 + }, + { + "epoch": 0.02920812682077863, + "grad_norm": 5.030163764953613, + "learning_rate": 1.0511346444780636e-05, + "loss": 0.7772, + "step": 386 + }, + { + "epoch": 0.029283795543112254, + "grad_norm": 8.177231788635254, + "learning_rate": 1.053857791225416e-05, + "loss": 0.9079, + "step": 387 + }, + { + "epoch": 0.02935946426544588, + "grad_norm": 4.530209541320801, + "learning_rate": 1.0565809379727685e-05, + "loss": 0.8447, + "step": 388 + }, + { + "epoch": 0.0294351329877795, + "grad_norm": 12.534080505371094, + "learning_rate": 1.059304084720121e-05, + "loss": 0.9899, + "step": 389 + }, + { + "epoch": 0.029510801710113124, + "grad_norm": 4.852429389953613, + "learning_rate": 1.0620272314674735e-05, + "loss": 1.0398, + "step": 390 + }, + { + "epoch": 0.02958647043244675, + "grad_norm": 4.2400054931640625, + "learning_rate": 1.0647503782148261e-05, + "loss": 0.8936, + "step": 391 + }, + { + "epoch": 0.029662139154780373, + "grad_norm": 5.989685535430908, + "learning_rate": 1.0674735249621786e-05, + "loss": 0.8694, + "step": 392 + }, + { + "epoch": 0.029737807877113994, + "grad_norm": 4.045843124389648, + "learning_rate": 1.070196671709531e-05, + "loss": 0.9632, + "step": 393 + }, + { + "epoch": 0.02981347659944762, + "grad_norm": 4.707093238830566, + "learning_rate": 1.0729198184568837e-05, + "loss": 0.8046, + "step": 394 + }, + { + "epoch": 0.029889145321781243, + "grad_norm": 5.141930103302002, + "learning_rate": 1.075642965204236e-05, + "loss": 0.9606, + "step": 395 + }, + { + "epoch": 0.029964814044114864, + "grad_norm": 9.92322826385498, + "learning_rate": 1.0783661119515884e-05, + "loss": 0.9988, + "step": 396 + }, + { + "epoch": 0.03004048276644849, + "grad_norm": 5.097169399261475, + "learning_rate": 1.0810892586989409e-05, + "loss": 0.8988, + "step": 397 + }, + { + "epoch": 0.030116151488782113, + "grad_norm": 4.876684665679932, + "learning_rate": 1.0838124054462936e-05, + "loss": 0.874, + "step": 398 + }, + { + "epoch": 0.030191820211115734, + "grad_norm": 4.846562385559082, + "learning_rate": 1.086535552193646e-05, + "loss": 0.8658, + "step": 399 + }, + { + "epoch": 0.030267488933449358, + "grad_norm": 5.538702011108398, + "learning_rate": 1.0892586989409985e-05, 
+ "loss": 1.0144, + "step": 400 + }, + { + "epoch": 0.030343157655782983, + "grad_norm": 4.698038578033447, + "learning_rate": 1.0919818456883511e-05, + "loss": 0.9173, + "step": 401 + }, + { + "epoch": 0.030418826378116607, + "grad_norm": 6.059201717376709, + "learning_rate": 1.0947049924357034e-05, + "loss": 0.7875, + "step": 402 + }, + { + "epoch": 0.030494495100450228, + "grad_norm": 6.118393421173096, + "learning_rate": 1.0974281391830559e-05, + "loss": 0.9391, + "step": 403 + }, + { + "epoch": 0.030570163822783852, + "grad_norm": 4.088007926940918, + "learning_rate": 1.1001512859304084e-05, + "loss": 0.7948, + "step": 404 + }, + { + "epoch": 0.030645832545117477, + "grad_norm": 4.4451799392700195, + "learning_rate": 1.102874432677761e-05, + "loss": 0.9371, + "step": 405 + }, + { + "epoch": 0.030721501267451098, + "grad_norm": 4.529284477233887, + "learning_rate": 1.1055975794251135e-05, + "loss": 0.9817, + "step": 406 + }, + { + "epoch": 0.030797169989784722, + "grad_norm": 7.541872978210449, + "learning_rate": 1.108320726172466e-05, + "loss": 0.8889, + "step": 407 + }, + { + "epoch": 0.030872838712118347, + "grad_norm": 3.850817918777466, + "learning_rate": 1.1110438729198186e-05, + "loss": 0.8753, + "step": 408 + }, + { + "epoch": 0.030948507434451968, + "grad_norm": 3.5445756912231445, + "learning_rate": 1.113767019667171e-05, + "loss": 0.8393, + "step": 409 + }, + { + "epoch": 0.031024176156785592, + "grad_norm": 5.169709205627441, + "learning_rate": 1.1164901664145234e-05, + "loss": 0.9982, + "step": 410 + }, + { + "epoch": 0.031099844879119216, + "grad_norm": 3.5694003105163574, + "learning_rate": 1.1192133131618758e-05, + "loss": 0.7693, + "step": 411 + }, + { + "epoch": 0.03117551360145284, + "grad_norm": 5.7016921043396, + "learning_rate": 1.1219364599092285e-05, + "loss": 0.8983, + "step": 412 + }, + { + "epoch": 0.031251182323786465, + "grad_norm": 5.174305438995361, + "learning_rate": 1.124659606656581e-05, + "loss": 0.9126, + "step": 413 + }, + { + "epoch": 0.03132685104612009, + "grad_norm": 4.78248929977417, + "learning_rate": 1.1273827534039334e-05, + "loss": 1.0237, + "step": 414 + }, + { + "epoch": 0.03140251976845371, + "grad_norm": 4.276739120483398, + "learning_rate": 1.130105900151286e-05, + "loss": 0.953, + "step": 415 + }, + { + "epoch": 0.03147818849078733, + "grad_norm": 5.136653900146484, + "learning_rate": 1.1328290468986385e-05, + "loss": 0.8138, + "step": 416 + }, + { + "epoch": 0.031553857213120956, + "grad_norm": 3.566028356552124, + "learning_rate": 1.1355521936459908e-05, + "loss": 1.0327, + "step": 417 + }, + { + "epoch": 0.03162952593545458, + "grad_norm": 3.272423267364502, + "learning_rate": 1.1382753403933433e-05, + "loss": 1.0262, + "step": 418 + }, + { + "epoch": 0.031705194657788205, + "grad_norm": 4.595939636230469, + "learning_rate": 1.140998487140696e-05, + "loss": 0.8515, + "step": 419 + }, + { + "epoch": 0.03178086338012183, + "grad_norm": 3.546163320541382, + "learning_rate": 1.1437216338880484e-05, + "loss": 0.7262, + "step": 420 + }, + { + "epoch": 0.03185653210245545, + "grad_norm": 4.943700313568115, + "learning_rate": 1.1464447806354009e-05, + "loss": 0.9381, + "step": 421 + }, + { + "epoch": 0.03193220082478907, + "grad_norm": 5.774724960327148, + "learning_rate": 1.1491679273827535e-05, + "loss": 0.9081, + "step": 422 + }, + { + "epoch": 0.032007869547122696, + "grad_norm": 4.097910404205322, + "learning_rate": 1.151891074130106e-05, + "loss": 0.8876, + "step": 423 + }, + { + "epoch": 0.03208353826945632, + "grad_norm": 
4.992226600646973, + "learning_rate": 1.1546142208774585e-05, + "loss": 0.8628, + "step": 424 + }, + { + "epoch": 0.032159206991789945, + "grad_norm": 4.852366924285889, + "learning_rate": 1.1573373676248108e-05, + "loss": 0.9476, + "step": 425 + }, + { + "epoch": 0.03223487571412357, + "grad_norm": 5.32084321975708, + "learning_rate": 1.1600605143721634e-05, + "loss": 0.8845, + "step": 426 + }, + { + "epoch": 0.03231054443645719, + "grad_norm": 5.613223552703857, + "learning_rate": 1.1627836611195159e-05, + "loss": 0.756, + "step": 427 + }, + { + "epoch": 0.03238621315879081, + "grad_norm": 4.22434139251709, + "learning_rate": 1.1655068078668683e-05, + "loss": 0.9494, + "step": 428 + }, + { + "epoch": 0.032461881881124435, + "grad_norm": 4.021113395690918, + "learning_rate": 1.168229954614221e-05, + "loss": 0.7561, + "step": 429 + }, + { + "epoch": 0.03253755060345806, + "grad_norm": 4.726623058319092, + "learning_rate": 1.1709531013615735e-05, + "loss": 0.9944, + "step": 430 + }, + { + "epoch": 0.032613219325791684, + "grad_norm": 5.192655563354492, + "learning_rate": 1.173676248108926e-05, + "loss": 0.9211, + "step": 431 + }, + { + "epoch": 0.03268888804812531, + "grad_norm": 3.9181580543518066, + "learning_rate": 1.1763993948562782e-05, + "loss": 0.9154, + "step": 432 + }, + { + "epoch": 0.03276455677045893, + "grad_norm": 6.171100616455078, + "learning_rate": 1.1791225416036309e-05, + "loss": 0.9509, + "step": 433 + }, + { + "epoch": 0.03284022549279256, + "grad_norm": 3.8820559978485107, + "learning_rate": 1.1818456883509833e-05, + "loss": 1.0268, + "step": 434 + }, + { + "epoch": 0.032915894215126175, + "grad_norm": 4.744935512542725, + "learning_rate": 1.1845688350983358e-05, + "loss": 0.9251, + "step": 435 + }, + { + "epoch": 0.0329915629374598, + "grad_norm": 3.204756736755371, + "learning_rate": 1.1872919818456884e-05, + "loss": 0.8481, + "step": 436 + }, + { + "epoch": 0.033067231659793424, + "grad_norm": 4.440789699554443, + "learning_rate": 1.190015128593041e-05, + "loss": 1.072, + "step": 437 + }, + { + "epoch": 0.03314290038212705, + "grad_norm": 4.594890594482422, + "learning_rate": 1.1927382753403934e-05, + "loss": 0.8416, + "step": 438 + }, + { + "epoch": 0.03321856910446067, + "grad_norm": 8.748790740966797, + "learning_rate": 1.195461422087746e-05, + "loss": 0.7003, + "step": 439 + }, + { + "epoch": 0.0332942378267943, + "grad_norm": 6.574450969696045, + "learning_rate": 1.1981845688350983e-05, + "loss": 0.9899, + "step": 440 + }, + { + "epoch": 0.033369906549127915, + "grad_norm": 3.3959763050079346, + "learning_rate": 1.2009077155824508e-05, + "loss": 0.8483, + "step": 441 + }, + { + "epoch": 0.03344557527146154, + "grad_norm": 9.578702926635742, + "learning_rate": 1.2036308623298033e-05, + "loss": 0.6854, + "step": 442 + }, + { + "epoch": 0.03352124399379516, + "grad_norm": 10.351158142089844, + "learning_rate": 1.2063540090771559e-05, + "loss": 0.9046, + "step": 443 + }, + { + "epoch": 0.03359691271612879, + "grad_norm": 3.339411497116089, + "learning_rate": 1.2090771558245084e-05, + "loss": 0.8759, + "step": 444 + }, + { + "epoch": 0.03367258143846241, + "grad_norm": 3.9380717277526855, + "learning_rate": 1.2118003025718608e-05, + "loss": 0.869, + "step": 445 + }, + { + "epoch": 0.03374825016079604, + "grad_norm": 3.6196255683898926, + "learning_rate": 1.2145234493192135e-05, + "loss": 0.853, + "step": 446 + }, + { + "epoch": 0.03382391888312966, + "grad_norm": 3.6324236392974854, + "learning_rate": 1.2172465960665658e-05, + "loss": 0.9892, + "step": 447 + }, + { 
+ "epoch": 0.03389958760546328, + "grad_norm": 4.3121185302734375, + "learning_rate": 1.2199697428139183e-05, + "loss": 0.8917, + "step": 448 + }, + { + "epoch": 0.0339752563277969, + "grad_norm": 6.199253559112549, + "learning_rate": 1.2226928895612707e-05, + "loss": 0.8886, + "step": 449 + }, + { + "epoch": 0.03405092505013053, + "grad_norm": 5.536099433898926, + "learning_rate": 1.2254160363086234e-05, + "loss": 0.8908, + "step": 450 + }, + { + "epoch": 0.03412659377246415, + "grad_norm": 4.678923606872559, + "learning_rate": 1.2281391830559758e-05, + "loss": 1.0336, + "step": 451 + }, + { + "epoch": 0.034202262494797776, + "grad_norm": 5.405990123748779, + "learning_rate": 1.2308623298033283e-05, + "loss": 0.8555, + "step": 452 + }, + { + "epoch": 0.0342779312171314, + "grad_norm": 5.1637749671936035, + "learning_rate": 1.233585476550681e-05, + "loss": 0.8824, + "step": 453 + }, + { + "epoch": 0.034353599939465025, + "grad_norm": 8.398664474487305, + "learning_rate": 1.2363086232980334e-05, + "loss": 0.993, + "step": 454 + }, + { + "epoch": 0.03442926866179864, + "grad_norm": 4.705966472625732, + "learning_rate": 1.2390317700453857e-05, + "loss": 0.9114, + "step": 455 + }, + { + "epoch": 0.03450493738413227, + "grad_norm": 4.132481575012207, + "learning_rate": 1.2417549167927382e-05, + "loss": 1.108, + "step": 456 + }, + { + "epoch": 0.03458060610646589, + "grad_norm": 6.405203342437744, + "learning_rate": 1.2444780635400908e-05, + "loss": 1.0141, + "step": 457 + }, + { + "epoch": 0.034656274828799516, + "grad_norm": 4.2582597732543945, + "learning_rate": 1.2472012102874433e-05, + "loss": 0.8596, + "step": 458 + }, + { + "epoch": 0.03473194355113314, + "grad_norm": 4.002652168273926, + "learning_rate": 1.2499243570347958e-05, + "loss": 0.9137, + "step": 459 + }, + { + "epoch": 0.034807612273466765, + "grad_norm": 4.454577445983887, + "learning_rate": 1.2526475037821484e-05, + "loss": 0.8705, + "step": 460 + }, + { + "epoch": 0.03488328099580039, + "grad_norm": 4.909870147705078, + "learning_rate": 1.2553706505295009e-05, + "loss": 1.1714, + "step": 461 + }, + { + "epoch": 0.03495894971813401, + "grad_norm": 7.202528953552246, + "learning_rate": 1.2580937972768532e-05, + "loss": 0.9013, + "step": 462 + }, + { + "epoch": 0.03503461844046763, + "grad_norm": 4.110122203826904, + "learning_rate": 1.2608169440242057e-05, + "loss": 0.8812, + "step": 463 + }, + { + "epoch": 0.035110287162801256, + "grad_norm": 3.5475730895996094, + "learning_rate": 1.2635400907715583e-05, + "loss": 0.9704, + "step": 464 + }, + { + "epoch": 0.03518595588513488, + "grad_norm": 3.4889214038848877, + "learning_rate": 1.2662632375189108e-05, + "loss": 0.9656, + "step": 465 + }, + { + "epoch": 0.035261624607468504, + "grad_norm": 3.9124395847320557, + "learning_rate": 1.2689863842662632e-05, + "loss": 0.7515, + "step": 466 + }, + { + "epoch": 0.03533729332980213, + "grad_norm": 6.498013496398926, + "learning_rate": 1.2717095310136159e-05, + "loss": 0.8926, + "step": 467 + }, + { + "epoch": 0.035412962052135746, + "grad_norm": 3.9321653842926025, + "learning_rate": 1.2744326777609683e-05, + "loss": 0.7346, + "step": 468 + }, + { + "epoch": 0.03548863077446937, + "grad_norm": 5.16299295425415, + "learning_rate": 1.2771558245083208e-05, + "loss": 0.8197, + "step": 469 + }, + { + "epoch": 0.035564299496802995, + "grad_norm": 4.675112247467041, + "learning_rate": 1.2798789712556731e-05, + "loss": 0.8971, + "step": 470 + }, + { + "epoch": 0.03563996821913662, + "grad_norm": 2.9948925971984863, + "learning_rate": 
1.2826021180030258e-05, + "loss": 0.8531, + "step": 471 + }, + { + "epoch": 0.035715636941470244, + "grad_norm": 4.1595354080200195, + "learning_rate": 1.2853252647503782e-05, + "loss": 0.8913, + "step": 472 + }, + { + "epoch": 0.03579130566380387, + "grad_norm": 2.9230337142944336, + "learning_rate": 1.2880484114977307e-05, + "loss": 1.1192, + "step": 473 + }, + { + "epoch": 0.03586697438613749, + "grad_norm": 6.0981831550598145, + "learning_rate": 1.2907715582450833e-05, + "loss": 0.8437, + "step": 474 + }, + { + "epoch": 0.03594264310847111, + "grad_norm": 6.705804824829102, + "learning_rate": 1.2934947049924358e-05, + "loss": 0.9358, + "step": 475 + }, + { + "epoch": 0.036018311830804735, + "grad_norm": 3.546778440475464, + "learning_rate": 1.2962178517397883e-05, + "loss": 1.0628, + "step": 476 + }, + { + "epoch": 0.03609398055313836, + "grad_norm": 3.6508278846740723, + "learning_rate": 1.2989409984871406e-05, + "loss": 0.8347, + "step": 477 + }, + { + "epoch": 0.036169649275471984, + "grad_norm": 5.718278884887695, + "learning_rate": 1.3016641452344932e-05, + "loss": 0.9513, + "step": 478 + }, + { + "epoch": 0.03624531799780561, + "grad_norm": 9.246580123901367, + "learning_rate": 1.3043872919818457e-05, + "loss": 0.9638, + "step": 479 + }, + { + "epoch": 0.03632098672013923, + "grad_norm": 5.03000545501709, + "learning_rate": 1.3071104387291982e-05, + "loss": 0.9532, + "step": 480 + }, + { + "epoch": 0.03639665544247286, + "grad_norm": 4.656915187835693, + "learning_rate": 1.3098335854765508e-05, + "loss": 0.9497, + "step": 481 + }, + { + "epoch": 0.036472324164806474, + "grad_norm": 4.1055731773376465, + "learning_rate": 1.3125567322239033e-05, + "loss": 0.782, + "step": 482 + }, + { + "epoch": 0.0365479928871401, + "grad_norm": 3.7302215099334717, + "learning_rate": 1.3152798789712557e-05, + "loss": 0.8432, + "step": 483 + }, + { + "epoch": 0.03662366160947372, + "grad_norm": 4.635787487030029, + "learning_rate": 1.3180030257186082e-05, + "loss": 0.857, + "step": 484 + }, + { + "epoch": 0.03669933033180735, + "grad_norm": 4.681149482727051, + "learning_rate": 1.3207261724659607e-05, + "loss": 0.7984, + "step": 485 + }, + { + "epoch": 0.03677499905414097, + "grad_norm": 6.412924289703369, + "learning_rate": 1.3234493192133132e-05, + "loss": 0.8818, + "step": 486 + }, + { + "epoch": 0.0368506677764746, + "grad_norm": 3.8682901859283447, + "learning_rate": 1.3261724659606656e-05, + "loss": 0.7964, + "step": 487 + }, + { + "epoch": 0.03692633649880822, + "grad_norm": 5.080738544464111, + "learning_rate": 1.3288956127080183e-05, + "loss": 0.8436, + "step": 488 + }, + { + "epoch": 0.03700200522114184, + "grad_norm": 3.049335241317749, + "learning_rate": 1.3316187594553707e-05, + "loss": 0.9249, + "step": 489 + }, + { + "epoch": 0.03707767394347546, + "grad_norm": 4.670293807983398, + "learning_rate": 1.3343419062027232e-05, + "loss": 0.729, + "step": 490 + }, + { + "epoch": 0.03715334266580909, + "grad_norm": 4.936186790466309, + "learning_rate": 1.3370650529500757e-05, + "loss": 0.8145, + "step": 491 + }, + { + "epoch": 0.03722901138814271, + "grad_norm": 2.468773365020752, + "learning_rate": 1.3397881996974281e-05, + "loss": 1.0231, + "step": 492 + }, + { + "epoch": 0.037304680110476336, + "grad_norm": 3.9304311275482178, + "learning_rate": 1.3425113464447806e-05, + "loss": 0.9577, + "step": 493 + }, + { + "epoch": 0.03738034883280996, + "grad_norm": 3.941254138946533, + "learning_rate": 1.345234493192133e-05, + "loss": 0.8663, + "step": 494 + }, + { + "epoch": 0.03745601755514358, + 
"grad_norm": 3.897300958633423, + "learning_rate": 1.3479576399394857e-05, + "loss": 0.9065, + "step": 495 + }, + { + "epoch": 0.0375316862774772, + "grad_norm": 5.854770183563232, + "learning_rate": 1.3506807866868382e-05, + "loss": 0.8533, + "step": 496 + }, + { + "epoch": 0.03760735499981083, + "grad_norm": 5.508477210998535, + "learning_rate": 1.3534039334341907e-05, + "loss": 0.7829, + "step": 497 + }, + { + "epoch": 0.03768302372214445, + "grad_norm": 3.432650566101074, + "learning_rate": 1.3561270801815431e-05, + "loss": 0.9835, + "step": 498 + }, + { + "epoch": 0.037758692444478076, + "grad_norm": 4.13020133972168, + "learning_rate": 1.3588502269288958e-05, + "loss": 0.9426, + "step": 499 + }, + { + "epoch": 0.0378343611668117, + "grad_norm": 3.107402801513672, + "learning_rate": 1.361573373676248e-05, + "loss": 0.7618, + "step": 500 + }, + { + "epoch": 0.037910029889145325, + "grad_norm": 6.790006637573242, + "learning_rate": 1.3642965204236005e-05, + "loss": 1.0121, + "step": 501 + }, + { + "epoch": 0.03798569861147894, + "grad_norm": 4.519580841064453, + "learning_rate": 1.3670196671709532e-05, + "loss": 0.887, + "step": 502 + }, + { + "epoch": 0.03806136733381257, + "grad_norm": 4.3927903175354, + "learning_rate": 1.3697428139183057e-05, + "loss": 0.848, + "step": 503 + }, + { + "epoch": 0.03813703605614619, + "grad_norm": 4.329632759094238, + "learning_rate": 1.3724659606656581e-05, + "loss": 0.8182, + "step": 504 + }, + { + "epoch": 0.038212704778479815, + "grad_norm": 5.273471355438232, + "learning_rate": 1.3751891074130106e-05, + "loss": 0.9141, + "step": 505 + }, + { + "epoch": 0.03828837350081344, + "grad_norm": 3.8324403762817383, + "learning_rate": 1.3779122541603632e-05, + "loss": 0.9766, + "step": 506 + }, + { + "epoch": 0.038364042223147064, + "grad_norm": 3.876749038696289, + "learning_rate": 1.3806354009077157e-05, + "loss": 0.8484, + "step": 507 + }, + { + "epoch": 0.03843971094548069, + "grad_norm": 4.648043155670166, + "learning_rate": 1.383358547655068e-05, + "loss": 0.9291, + "step": 508 + }, + { + "epoch": 0.038515379667814306, + "grad_norm": 4.072823524475098, + "learning_rate": 1.3860816944024205e-05, + "loss": 1.0211, + "step": 509 + }, + { + "epoch": 0.03859104839014793, + "grad_norm": 7.409148216247559, + "learning_rate": 1.3888048411497731e-05, + "loss": 0.9665, + "step": 510 + }, + { + "epoch": 0.038666717112481555, + "grad_norm": 5.668654441833496, + "learning_rate": 1.3915279878971256e-05, + "loss": 0.8603, + "step": 511 + }, + { + "epoch": 0.03874238583481518, + "grad_norm": 4.457876205444336, + "learning_rate": 1.394251134644478e-05, + "loss": 0.8055, + "step": 512 + }, + { + "epoch": 0.038818054557148804, + "grad_norm": 4.217092514038086, + "learning_rate": 1.3969742813918307e-05, + "loss": 0.893, + "step": 513 + }, + { + "epoch": 0.03889372327948243, + "grad_norm": 4.033523082733154, + "learning_rate": 1.3996974281391832e-05, + "loss": 0.8748, + "step": 514 + }, + { + "epoch": 0.03896939200181605, + "grad_norm": 3.2417023181915283, + "learning_rate": 1.4024205748865355e-05, + "loss": 0.7512, + "step": 515 + }, + { + "epoch": 0.03904506072414967, + "grad_norm": 4.064194679260254, + "learning_rate": 1.405143721633888e-05, + "loss": 0.991, + "step": 516 + }, + { + "epoch": 0.039120729446483295, + "grad_norm": 5.263235569000244, + "learning_rate": 1.4078668683812406e-05, + "loss": 0.8356, + "step": 517 + }, + { + "epoch": 0.03919639816881692, + "grad_norm": 3.2027482986450195, + "learning_rate": 1.410590015128593e-05, + "loss": 1.0551, + "step": 518 
+ }, + { + "epoch": 0.039272066891150544, + "grad_norm": 6.763327121734619, + "learning_rate": 1.4133131618759455e-05, + "loss": 0.8429, + "step": 519 + }, + { + "epoch": 0.03934773561348417, + "grad_norm": 4.308533668518066, + "learning_rate": 1.4160363086232982e-05, + "loss": 0.8884, + "step": 520 + }, + { + "epoch": 0.03942340433581779, + "grad_norm": 4.909972667694092, + "learning_rate": 1.4187594553706506e-05, + "loss": 0.7806, + "step": 521 + }, + { + "epoch": 0.03949907305815141, + "grad_norm": 3.7141098976135254, + "learning_rate": 1.4214826021180031e-05, + "loss": 0.7759, + "step": 522 + }, + { + "epoch": 0.039574741780485034, + "grad_norm": 4.333841800689697, + "learning_rate": 1.4242057488653554e-05, + "loss": 0.9594, + "step": 523 + }, + { + "epoch": 0.03965041050281866, + "grad_norm": 7.9005866050720215, + "learning_rate": 1.426928895612708e-05, + "loss": 0.9144, + "step": 524 + }, + { + "epoch": 0.03972607922515228, + "grad_norm": 4.86323881149292, + "learning_rate": 1.4296520423600605e-05, + "loss": 0.85, + "step": 525 + }, + { + "epoch": 0.03980174794748591, + "grad_norm": 2.843881130218506, + "learning_rate": 1.432375189107413e-05, + "loss": 0.9132, + "step": 526 + }, + { + "epoch": 0.03987741666981953, + "grad_norm": 3.5814990997314453, + "learning_rate": 1.4350983358547656e-05, + "loss": 0.8243, + "step": 527 + }, + { + "epoch": 0.039953085392153156, + "grad_norm": 3.7590556144714355, + "learning_rate": 1.4378214826021181e-05, + "loss": 0.8779, + "step": 528 + }, + { + "epoch": 0.040028754114486774, + "grad_norm": 4.117438316345215, + "learning_rate": 1.4405446293494706e-05, + "loss": 0.9465, + "step": 529 + }, + { + "epoch": 0.0401044228368204, + "grad_norm": 6.806588649749756, + "learning_rate": 1.4432677760968229e-05, + "loss": 0.8587, + "step": 530 + }, + { + "epoch": 0.04018009155915402, + "grad_norm": 3.3301045894622803, + "learning_rate": 1.4459909228441755e-05, + "loss": 0.8867, + "step": 531 + }, + { + "epoch": 0.04025576028148765, + "grad_norm": 3.395404577255249, + "learning_rate": 1.448714069591528e-05, + "loss": 0.9337, + "step": 532 + }, + { + "epoch": 0.04033142900382127, + "grad_norm": 6.818991184234619, + "learning_rate": 1.4514372163388804e-05, + "loss": 1.0161, + "step": 533 + }, + { + "epoch": 0.040407097726154896, + "grad_norm": 3.7646358013153076, + "learning_rate": 1.4541603630862331e-05, + "loss": 0.9049, + "step": 534 + }, + { + "epoch": 0.04048276644848852, + "grad_norm": 3.209998369216919, + "learning_rate": 1.4568835098335856e-05, + "loss": 0.9199, + "step": 535 + }, + { + "epoch": 0.04055843517082214, + "grad_norm": 4.078510761260986, + "learning_rate": 1.459606656580938e-05, + "loss": 0.9277, + "step": 536 + }, + { + "epoch": 0.04063410389315576, + "grad_norm": 4.5334153175354, + "learning_rate": 1.4623298033282907e-05, + "loss": 0.9826, + "step": 537 + }, + { + "epoch": 0.04070977261548939, + "grad_norm": 4.209270000457764, + "learning_rate": 1.465052950075643e-05, + "loss": 0.8722, + "step": 538 + }, + { + "epoch": 0.04078544133782301, + "grad_norm": 4.012211799621582, + "learning_rate": 1.4677760968229954e-05, + "loss": 0.7578, + "step": 539 + }, + { + "epoch": 0.040861110060156636, + "grad_norm": 3.805192232131958, + "learning_rate": 1.4704992435703479e-05, + "loss": 0.7765, + "step": 540 + }, + { + "epoch": 0.04093677878249026, + "grad_norm": 6.301825046539307, + "learning_rate": 1.4732223903177005e-05, + "loss": 0.8551, + "step": 541 + }, + { + "epoch": 0.04101244750482388, + "grad_norm": 3.2638895511627197, + "learning_rate": 
1.475945537065053e-05, + "loss": 0.81, + "step": 542 + }, + { + "epoch": 0.0410881162271575, + "grad_norm": 4.314562797546387, + "learning_rate": 1.4786686838124055e-05, + "loss": 1.0944, + "step": 543 + }, + { + "epoch": 0.041163784949491126, + "grad_norm": 3.090569496154785, + "learning_rate": 1.4813918305597581e-05, + "loss": 1.0042, + "step": 544 + }, + { + "epoch": 0.04123945367182475, + "grad_norm": 3.7688913345336914, + "learning_rate": 1.4841149773071104e-05, + "loss": 0.9544, + "step": 545 + }, + { + "epoch": 0.041315122394158375, + "grad_norm": 4.588676929473877, + "learning_rate": 1.4868381240544629e-05, + "loss": 0.7449, + "step": 546 + }, + { + "epoch": 0.041390791116492, + "grad_norm": 6.916925430297852, + "learning_rate": 1.4895612708018154e-05, + "loss": 0.9825, + "step": 547 + }, + { + "epoch": 0.041466459838825624, + "grad_norm": 3.3256642818450928, + "learning_rate": 1.492284417549168e-05, + "loss": 0.9232, + "step": 548 + }, + { + "epoch": 0.04154212856115924, + "grad_norm": 5.033417224884033, + "learning_rate": 1.4950075642965205e-05, + "loss": 0.9566, + "step": 549 + }, + { + "epoch": 0.041617797283492866, + "grad_norm": 3.85809063911438, + "learning_rate": 1.497730711043873e-05, + "loss": 0.993, + "step": 550 + }, + { + "epoch": 0.04169346600582649, + "grad_norm": 5.949283599853516, + "learning_rate": 1.5004538577912256e-05, + "loss": 0.9769, + "step": 551 + }, + { + "epoch": 0.041769134728160115, + "grad_norm": 4.104134559631348, + "learning_rate": 1.503177004538578e-05, + "loss": 0.8464, + "step": 552 + }, + { + "epoch": 0.04184480345049374, + "grad_norm": 5.211521148681641, + "learning_rate": 1.5059001512859304e-05, + "loss": 0.8422, + "step": 553 + }, + { + "epoch": 0.041920472172827364, + "grad_norm": 3.4157001972198486, + "learning_rate": 1.5086232980332828e-05, + "loss": 0.8472, + "step": 554 + }, + { + "epoch": 0.04199614089516099, + "grad_norm": 3.895693778991699, + "learning_rate": 1.5113464447806355e-05, + "loss": 0.7614, + "step": 555 + }, + { + "epoch": 0.042071809617494606, + "grad_norm": 4.627487659454346, + "learning_rate": 1.514069591527988e-05, + "loss": 0.9064, + "step": 556 + }, + { + "epoch": 0.04214747833982823, + "grad_norm": 3.6824750900268555, + "learning_rate": 1.5167927382753404e-05, + "loss": 0.8688, + "step": 557 + }, + { + "epoch": 0.042223147062161855, + "grad_norm": 3.035003185272217, + "learning_rate": 1.519515885022693e-05, + "loss": 0.9319, + "step": 558 + }, + { + "epoch": 0.04229881578449548, + "grad_norm": 3.1040902137756348, + "learning_rate": 1.5222390317700455e-05, + "loss": 0.8281, + "step": 559 + }, + { + "epoch": 0.0423744845068291, + "grad_norm": 3.9689624309539795, + "learning_rate": 1.5249621785173978e-05, + "loss": 0.8638, + "step": 560 + }, + { + "epoch": 0.04245015322916273, + "grad_norm": 4.4839701652526855, + "learning_rate": 1.5276853252647503e-05, + "loss": 0.7902, + "step": 561 + }, + { + "epoch": 0.04252582195149635, + "grad_norm": 5.034473419189453, + "learning_rate": 1.530408472012103e-05, + "loss": 0.8222, + "step": 562 + }, + { + "epoch": 0.04260149067382997, + "grad_norm": 3.7882745265960693, + "learning_rate": 1.5331316187594552e-05, + "loss": 0.8985, + "step": 563 + }, + { + "epoch": 0.042677159396163594, + "grad_norm": 4.125184059143066, + "learning_rate": 1.535854765506808e-05, + "loss": 0.9292, + "step": 564 + }, + { + "epoch": 0.04275282811849722, + "grad_norm": 4.87890625, + "learning_rate": 1.5385779122541605e-05, + "loss": 0.7701, + "step": 565 + }, + { + "epoch": 0.04282849684083084, + 
"grad_norm": 4.0733513832092285, + "learning_rate": 1.5413010590015128e-05, + "loss": 0.9535, + "step": 566 + }, + { + "epoch": 0.04290416556316447, + "grad_norm": 5.096096515655518, + "learning_rate": 1.5440242057488655e-05, + "loss": 0.7682, + "step": 567 + }, + { + "epoch": 0.04297983428549809, + "grad_norm": 3.4975779056549072, + "learning_rate": 1.5467473524962178e-05, + "loss": 0.9259, + "step": 568 + }, + { + "epoch": 0.04305550300783171, + "grad_norm": 4.021080493927002, + "learning_rate": 1.5494704992435704e-05, + "loss": 0.7498, + "step": 569 + }, + { + "epoch": 0.043131171730165334, + "grad_norm": 4.798002243041992, + "learning_rate": 1.5521936459909227e-05, + "loss": 0.9342, + "step": 570 + }, + { + "epoch": 0.04320684045249896, + "grad_norm": 3.3058738708496094, + "learning_rate": 1.5549167927382753e-05, + "loss": 0.8086, + "step": 571 + }, + { + "epoch": 0.04328250917483258, + "grad_norm": 4.445735931396484, + "learning_rate": 1.557639939485628e-05, + "loss": 0.8993, + "step": 572 + }, + { + "epoch": 0.04335817789716621, + "grad_norm": 3.855530023574829, + "learning_rate": 1.5603630862329803e-05, + "loss": 0.8861, + "step": 573 + }, + { + "epoch": 0.04343384661949983, + "grad_norm": 3.9461214542388916, + "learning_rate": 1.563086232980333e-05, + "loss": 0.8456, + "step": 574 + }, + { + "epoch": 0.043509515341833456, + "grad_norm": 2.9989559650421143, + "learning_rate": 1.5658093797276852e-05, + "loss": 0.9882, + "step": 575 + }, + { + "epoch": 0.043585184064167073, + "grad_norm": 6.7230916023254395, + "learning_rate": 1.568532526475038e-05, + "loss": 0.8234, + "step": 576 + }, + { + "epoch": 0.0436608527865007, + "grad_norm": 4.629927635192871, + "learning_rate": 1.57125567322239e-05, + "loss": 0.8634, + "step": 577 + }, + { + "epoch": 0.04373652150883432, + "grad_norm": 6.619353294372559, + "learning_rate": 1.5739788199697428e-05, + "loss": 0.8626, + "step": 578 + }, + { + "epoch": 0.04381219023116795, + "grad_norm": 2.4462685585021973, + "learning_rate": 1.5767019667170954e-05, + "loss": 0.7996, + "step": 579 + }, + { + "epoch": 0.04388785895350157, + "grad_norm": 3.107055902481079, + "learning_rate": 1.5794251134644477e-05, + "loss": 1.1027, + "step": 580 + }, + { + "epoch": 0.043963527675835196, + "grad_norm": 3.176931858062744, + "learning_rate": 1.5821482602118004e-05, + "loss": 0.8589, + "step": 581 + }, + { + "epoch": 0.04403919639816882, + "grad_norm": 6.571891784667969, + "learning_rate": 1.584871406959153e-05, + "loss": 0.8045, + "step": 582 + }, + { + "epoch": 0.04411486512050244, + "grad_norm": 4.314690589904785, + "learning_rate": 1.5875945537065053e-05, + "loss": 0.8888, + "step": 583 + }, + { + "epoch": 0.04419053384283606, + "grad_norm": 3.6380622386932373, + "learning_rate": 1.5903177004538576e-05, + "loss": 0.7593, + "step": 584 + }, + { + "epoch": 0.044266202565169686, + "grad_norm": 5.430633544921875, + "learning_rate": 1.5930408472012103e-05, + "loss": 0.884, + "step": 585 + }, + { + "epoch": 0.04434187128750331, + "grad_norm": 3.1226465702056885, + "learning_rate": 1.595763993948563e-05, + "loss": 0.9011, + "step": 586 + }, + { + "epoch": 0.044417540009836935, + "grad_norm": 3.8268587589263916, + "learning_rate": 1.5984871406959152e-05, + "loss": 0.8305, + "step": 587 + }, + { + "epoch": 0.04449320873217056, + "grad_norm": 5.864771842956543, + "learning_rate": 1.601210287443268e-05, + "loss": 0.7973, + "step": 588 + }, + { + "epoch": 0.044568877454504184, + "grad_norm": 4.280256748199463, + "learning_rate": 1.6039334341906205e-05, + "loss": 0.7368, + 
"step": 589 + }, + { + "epoch": 0.0446445461768378, + "grad_norm": 4.382325649261475, + "learning_rate": 1.6066565809379728e-05, + "loss": 0.8988, + "step": 590 + }, + { + "epoch": 0.044720214899171426, + "grad_norm": 4.108618259429932, + "learning_rate": 1.609379727685325e-05, + "loss": 0.8688, + "step": 591 + }, + { + "epoch": 0.04479588362150505, + "grad_norm": 3.6567695140838623, + "learning_rate": 1.6121028744326777e-05, + "loss": 0.8829, + "step": 592 + }, + { + "epoch": 0.044871552343838675, + "grad_norm": 3.6836469173431396, + "learning_rate": 1.6148260211800304e-05, + "loss": 1.0172, + "step": 593 + }, + { + "epoch": 0.0449472210661723, + "grad_norm": 3.5387299060821533, + "learning_rate": 1.6175491679273827e-05, + "loss": 0.7189, + "step": 594 + }, + { + "epoch": 0.045022889788505924, + "grad_norm": 13.61294174194336, + "learning_rate": 1.6202723146747353e-05, + "loss": 0.7605, + "step": 595 + }, + { + "epoch": 0.04509855851083954, + "grad_norm": 4.561513900756836, + "learning_rate": 1.622995461422088e-05, + "loss": 0.7954, + "step": 596 + }, + { + "epoch": 0.045174227233173166, + "grad_norm": 4.984888553619385, + "learning_rate": 1.6257186081694402e-05, + "loss": 1.0385, + "step": 597 + }, + { + "epoch": 0.04524989595550679, + "grad_norm": 3.820335865020752, + "learning_rate": 1.6284417549167925e-05, + "loss": 0.8676, + "step": 598 + }, + { + "epoch": 0.045325564677840414, + "grad_norm": 3.2544524669647217, + "learning_rate": 1.6311649016641452e-05, + "loss": 0.9514, + "step": 599 + }, + { + "epoch": 0.04540123340017404, + "grad_norm": 5.345118999481201, + "learning_rate": 1.6338880484114978e-05, + "loss": 0.7714, + "step": 600 + }, + { + "epoch": 0.04547690212250766, + "grad_norm": 3.907956123352051, + "learning_rate": 1.63661119515885e-05, + "loss": 0.8939, + "step": 601 + }, + { + "epoch": 0.04555257084484129, + "grad_norm": 6.510712146759033, + "learning_rate": 1.6393343419062028e-05, + "loss": 0.8035, + "step": 602 + }, + { + "epoch": 0.045628239567174905, + "grad_norm": 3.6979787349700928, + "learning_rate": 1.6420574886535554e-05, + "loss": 0.8553, + "step": 603 + }, + { + "epoch": 0.04570390828950853, + "grad_norm": 4.597548007965088, + "learning_rate": 1.6447806354009077e-05, + "loss": 0.803, + "step": 604 + }, + { + "epoch": 0.045779577011842154, + "grad_norm": 4.338045120239258, + "learning_rate": 1.64750378214826e-05, + "loss": 0.878, + "step": 605 + }, + { + "epoch": 0.04585524573417578, + "grad_norm": 4.208822727203369, + "learning_rate": 1.6502269288956126e-05, + "loss": 0.807, + "step": 606 + }, + { + "epoch": 0.0459309144565094, + "grad_norm": 3.9648923873901367, + "learning_rate": 1.6529500756429653e-05, + "loss": 0.8748, + "step": 607 + }, + { + "epoch": 0.04600658317884303, + "grad_norm": 2.76554536819458, + "learning_rate": 1.6556732223903176e-05, + "loss": 0.8559, + "step": 608 + }, + { + "epoch": 0.04608225190117665, + "grad_norm": 5.518862724304199, + "learning_rate": 1.6583963691376702e-05, + "loss": 0.8451, + "step": 609 + }, + { + "epoch": 0.04615792062351027, + "grad_norm": 4.203677177429199, + "learning_rate": 1.661119515885023e-05, + "loss": 0.9215, + "step": 610 + }, + { + "epoch": 0.046233589345843894, + "grad_norm": 3.4287822246551514, + "learning_rate": 1.6638426626323752e-05, + "loss": 0.8132, + "step": 611 + }, + { + "epoch": 0.04630925806817752, + "grad_norm": 4.197726726531982, + "learning_rate": 1.6665658093797278e-05, + "loss": 0.8095, + "step": 612 + }, + { + "epoch": 0.04638492679051114, + "grad_norm": 4.408070087432861, + 
"learning_rate": 1.66928895612708e-05, + "loss": 1.0774, + "step": 613 + }, + { + "epoch": 0.04646059551284477, + "grad_norm": 3.8713626861572266, + "learning_rate": 1.6720121028744328e-05, + "loss": 0.7434, + "step": 614 + }, + { + "epoch": 0.04653626423517839, + "grad_norm": 3.408956527709961, + "learning_rate": 1.674735249621785e-05, + "loss": 0.8511, + "step": 615 + }, + { + "epoch": 0.046611932957512016, + "grad_norm": 2.916395902633667, + "learning_rate": 1.6774583963691377e-05, + "loss": 1.0129, + "step": 616 + }, + { + "epoch": 0.04668760167984563, + "grad_norm": 3.563767671585083, + "learning_rate": 1.6801815431164903e-05, + "loss": 0.927, + "step": 617 + }, + { + "epoch": 0.04676327040217926, + "grad_norm": 12.874147415161133, + "learning_rate": 1.6829046898638426e-05, + "loss": 0.7429, + "step": 618 + }, + { + "epoch": 0.04683893912451288, + "grad_norm": 4.559039115905762, + "learning_rate": 1.6856278366111953e-05, + "loss": 0.9711, + "step": 619 + }, + { + "epoch": 0.04691460784684651, + "grad_norm": 5.084630489349365, + "learning_rate": 1.6883509833585476e-05, + "loss": 0.87, + "step": 620 + }, + { + "epoch": 0.04699027656918013, + "grad_norm": 4.294825553894043, + "learning_rate": 1.6910741301059002e-05, + "loss": 1.1267, + "step": 621 + }, + { + "epoch": 0.047065945291513755, + "grad_norm": 14.884833335876465, + "learning_rate": 1.6937972768532525e-05, + "loss": 0.9987, + "step": 622 + }, + { + "epoch": 0.04714161401384737, + "grad_norm": 3.429875373840332, + "learning_rate": 1.696520423600605e-05, + "loss": 0.8943, + "step": 623 + }, + { + "epoch": 0.047217282736181, + "grad_norm": 4.022202014923096, + "learning_rate": 1.6992435703479578e-05, + "loss": 0.9646, + "step": 624 + }, + { + "epoch": 0.04729295145851462, + "grad_norm": 3.040421962738037, + "learning_rate": 1.70196671709531e-05, + "loss": 0.9784, + "step": 625 + }, + { + "epoch": 0.047368620180848246, + "grad_norm": 4.135276794433594, + "learning_rate": 1.7046898638426627e-05, + "loss": 0.793, + "step": 626 + }, + { + "epoch": 0.04744428890318187, + "grad_norm": 3.7351131439208984, + "learning_rate": 1.7074130105900154e-05, + "loss": 0.7567, + "step": 627 + }, + { + "epoch": 0.047519957625515495, + "grad_norm": 3.320626974105835, + "learning_rate": 1.7101361573373677e-05, + "loss": 0.6704, + "step": 628 + }, + { + "epoch": 0.04759562634784912, + "grad_norm": 4.5212178230285645, + "learning_rate": 1.71285930408472e-05, + "loss": 0.8622, + "step": 629 + }, + { + "epoch": 0.04767129507018274, + "grad_norm": 4.007808208465576, + "learning_rate": 1.7155824508320726e-05, + "loss": 0.8503, + "step": 630 + }, + { + "epoch": 0.04774696379251636, + "grad_norm": 4.011386394500732, + "learning_rate": 1.7183055975794253e-05, + "loss": 0.9573, + "step": 631 + }, + { + "epoch": 0.047822632514849986, + "grad_norm": 4.2028937339782715, + "learning_rate": 1.7210287443267776e-05, + "loss": 0.8327, + "step": 632 + }, + { + "epoch": 0.04789830123718361, + "grad_norm": 3.389353036880493, + "learning_rate": 1.7237518910741302e-05, + "loss": 0.8626, + "step": 633 + }, + { + "epoch": 0.047973969959517235, + "grad_norm": 3.483424663543701, + "learning_rate": 1.726475037821483e-05, + "loss": 1.0075, + "step": 634 + }, + { + "epoch": 0.04804963868185086, + "grad_norm": 2.878598213195801, + "learning_rate": 1.729198184568835e-05, + "loss": 0.8132, + "step": 635 + }, + { + "epoch": 0.048125307404184484, + "grad_norm": 4.429380893707275, + "learning_rate": 1.7319213313161874e-05, + "loss": 0.8764, + "step": 636 + }, + { + "epoch": 
0.0482009761265181, + "grad_norm": 3.748349189758301, + "learning_rate": 1.73464447806354e-05, + "loss": 0.7818, + "step": 637 + }, + { + "epoch": 0.048276644848851726, + "grad_norm": 2.982710838317871, + "learning_rate": 1.7373676248108927e-05, + "loss": 0.8131, + "step": 638 + }, + { + "epoch": 0.04835231357118535, + "grad_norm": 3.4813013076782227, + "learning_rate": 1.740090771558245e-05, + "loss": 1.0064, + "step": 639 + }, + { + "epoch": 0.048427982293518974, + "grad_norm": 4.015783309936523, + "learning_rate": 1.7428139183055977e-05, + "loss": 0.775, + "step": 640 + }, + { + "epoch": 0.0485036510158526, + "grad_norm": 5.063205242156982, + "learning_rate": 1.7455370650529503e-05, + "loss": 0.8112, + "step": 641 + }, + { + "epoch": 0.04857931973818622, + "grad_norm": 3.2861835956573486, + "learning_rate": 1.7482602118003026e-05, + "loss": 0.8949, + "step": 642 + }, + { + "epoch": 0.04865498846051984, + "grad_norm": 4.188798904418945, + "learning_rate": 1.750983358547655e-05, + "loss": 0.7644, + "step": 643 + }, + { + "epoch": 0.048730657182853465, + "grad_norm": 2.6496074199676514, + "learning_rate": 1.7537065052950075e-05, + "loss": 0.7834, + "step": 644 + }, + { + "epoch": 0.04880632590518709, + "grad_norm": 3.977748155593872, + "learning_rate": 1.7564296520423602e-05, + "loss": 0.7691, + "step": 645 + }, + { + "epoch": 0.048881994627520714, + "grad_norm": 4.396695613861084, + "learning_rate": 1.7591527987897125e-05, + "loss": 0.7951, + "step": 646 + }, + { + "epoch": 0.04895766334985434, + "grad_norm": 3.3221042156219482, + "learning_rate": 1.761875945537065e-05, + "loss": 0.69, + "step": 647 + }, + { + "epoch": 0.04903333207218796, + "grad_norm": 4.295675754547119, + "learning_rate": 1.7645990922844178e-05, + "loss": 0.6565, + "step": 648 + }, + { + "epoch": 0.04910900079452159, + "grad_norm": 3.0245003700256348, + "learning_rate": 1.76732223903177e-05, + "loss": 0.9869, + "step": 649 + }, + { + "epoch": 0.049184669516855205, + "grad_norm": 3.450180768966675, + "learning_rate": 1.7700453857791224e-05, + "loss": 0.894, + "step": 650 + }, + { + "epoch": 0.04926033823918883, + "grad_norm": 3.598787546157837, + "learning_rate": 1.772768532526475e-05, + "loss": 0.8114, + "step": 651 + }, + { + "epoch": 0.049336006961522454, + "grad_norm": 3.394605875015259, + "learning_rate": 1.7754916792738276e-05, + "loss": 0.913, + "step": 652 + }, + { + "epoch": 0.04941167568385608, + "grad_norm": 3.7939605712890625, + "learning_rate": 1.77821482602118e-05, + "loss": 0.9026, + "step": 653 + }, + { + "epoch": 0.0494873444061897, + "grad_norm": 3.1907098293304443, + "learning_rate": 1.7809379727685326e-05, + "loss": 0.8377, + "step": 654 + }, + { + "epoch": 0.04956301312852333, + "grad_norm": 3.942924976348877, + "learning_rate": 1.7836611195158852e-05, + "loss": 0.8688, + "step": 655 + }, + { + "epoch": 0.04963868185085695, + "grad_norm": 3.0986690521240234, + "learning_rate": 1.7863842662632375e-05, + "loss": 0.8261, + "step": 656 + }, + { + "epoch": 0.04971435057319057, + "grad_norm": 4.000396728515625, + "learning_rate": 1.78910741301059e-05, + "loss": 0.8151, + "step": 657 + }, + { + "epoch": 0.04979001929552419, + "grad_norm": 4.246333122253418, + "learning_rate": 1.7918305597579425e-05, + "loss": 0.7004, + "step": 658 + }, + { + "epoch": 0.04986568801785782, + "grad_norm": 3.094942092895508, + "learning_rate": 1.794553706505295e-05, + "loss": 0.8833, + "step": 659 + }, + { + "epoch": 0.04994135674019144, + "grad_norm": 2.5228271484375, + "learning_rate": 1.7972768532526474e-05, + "loss": 
0.786, + "step": 660 + }, + { + "epoch": 0.05001702546252507, + "grad_norm": 3.9577856063842773, + "learning_rate": 1.8e-05, + "loss": 0.9358, + "step": 661 + }, + { + "epoch": 0.05009269418485869, + "grad_norm": 2.605454444885254, + "learning_rate": 1.7999999718195446e-05, + "loss": 0.7314, + "step": 662 + }, + { + "epoch": 0.050168362907192315, + "grad_norm": 4.23893928527832, + "learning_rate": 1.79999988727818e-05, + "loss": 0.7816, + "step": 663 + }, + { + "epoch": 0.05024403162952593, + "grad_norm": 3.738476037979126, + "learning_rate": 1.7999997463759113e-05, + "loss": 0.7877, + "step": 664 + }, + { + "epoch": 0.05031970035185956, + "grad_norm": 3.7416157722473145, + "learning_rate": 1.7999995491127477e-05, + "loss": 0.9857, + "step": 665 + }, + { + "epoch": 0.05039536907419318, + "grad_norm": 2.7435741424560547, + "learning_rate": 1.7999992954887013e-05, + "loss": 0.683, + "step": 666 + }, + { + "epoch": 0.050471037796526806, + "grad_norm": 2.920893430709839, + "learning_rate": 1.7999989855037883e-05, + "loss": 0.9957, + "step": 667 + }, + { + "epoch": 0.05054670651886043, + "grad_norm": 3.039703607559204, + "learning_rate": 1.7999986191580278e-05, + "loss": 0.7383, + "step": 668 + }, + { + "epoch": 0.050622375241194055, + "grad_norm": 3.4982380867004395, + "learning_rate": 1.7999981964514427e-05, + "loss": 0.9463, + "step": 669 + }, + { + "epoch": 0.05069804396352767, + "grad_norm": 6.600189208984375, + "learning_rate": 1.7999977173840594e-05, + "loss": 0.9587, + "step": 670 + }, + { + "epoch": 0.0507737126858613, + "grad_norm": 4.686428070068359, + "learning_rate": 1.7999971819559082e-05, + "loss": 0.8895, + "step": 671 + }, + { + "epoch": 0.05084938140819492, + "grad_norm": 3.403703451156616, + "learning_rate": 1.799996590167023e-05, + "loss": 0.9366, + "step": 672 + }, + { + "epoch": 0.050925050130528546, + "grad_norm": 3.352269411087036, + "learning_rate": 1.7999959420174395e-05, + "loss": 0.9894, + "step": 673 + }, + { + "epoch": 0.05100071885286217, + "grad_norm": 3.1470065116882324, + "learning_rate": 1.7999952375072e-05, + "loss": 0.8608, + "step": 674 + }, + { + "epoch": 0.051076387575195795, + "grad_norm": 5.2624897956848145, + "learning_rate": 1.7999944766363475e-05, + "loss": 0.8582, + "step": 675 + }, + { + "epoch": 0.05115205629752942, + "grad_norm": 4.0696187019348145, + "learning_rate": 1.7999936594049297e-05, + "loss": 0.8385, + "step": 676 + }, + { + "epoch": 0.05122772501986304, + "grad_norm": 4.101423740386963, + "learning_rate": 1.7999927858129984e-05, + "loss": 0.8864, + "step": 677 + }, + { + "epoch": 0.05130339374219666, + "grad_norm": 4.04774284362793, + "learning_rate": 1.7999918558606075e-05, + "loss": 0.8205, + "step": 678 + }, + { + "epoch": 0.051379062464530285, + "grad_norm": 4.320160388946533, + "learning_rate": 1.7999908695478162e-05, + "loss": 1.0081, + "step": 679 + }, + { + "epoch": 0.05145473118686391, + "grad_norm": 4.122174263000488, + "learning_rate": 1.7999898268746852e-05, + "loss": 0.6311, + "step": 680 + }, + { + "epoch": 0.051530399909197534, + "grad_norm": 4.779628276824951, + "learning_rate": 1.7999887278412806e-05, + "loss": 0.9552, + "step": 681 + }, + { + "epoch": 0.05160606863153116, + "grad_norm": 3.8603785037994385, + "learning_rate": 1.7999875724476707e-05, + "loss": 0.889, + "step": 682 + }, + { + "epoch": 0.05168173735386478, + "grad_norm": 2.222905158996582, + "learning_rate": 1.7999863606939286e-05, + "loss": 1.0504, + "step": 683 + }, + { + "epoch": 0.0517574060761984, + "grad_norm": 2.802685260772705, + "learning_rate": 
1.7999850925801292e-05, + "loss": 0.9105, + "step": 684 + }, + { + "epoch": 0.051833074798532025, + "grad_norm": 3.3969781398773193, + "learning_rate": 1.7999837681063527e-05, + "loss": 0.7931, + "step": 685 + }, + { + "epoch": 0.05190874352086565, + "grad_norm": 3.549208641052246, + "learning_rate": 1.7999823872726814e-05, + "loss": 0.9147, + "step": 686 + }, + { + "epoch": 0.051984412243199274, + "grad_norm": 2.738788366317749, + "learning_rate": 1.7999809500792023e-05, + "loss": 0.6898, + "step": 687 + }, + { + "epoch": 0.0520600809655329, + "grad_norm": 5.0861711502075195, + "learning_rate": 1.799979456526005e-05, + "loss": 0.9764, + "step": 688 + }, + { + "epoch": 0.05213574968786652, + "grad_norm": 3.7050702571868896, + "learning_rate": 1.799977906613184e-05, + "loss": 0.8309, + "step": 689 + }, + { + "epoch": 0.05221141841020015, + "grad_norm": 2.8487472534179688, + "learning_rate": 1.7999763003408348e-05, + "loss": 0.798, + "step": 690 + }, + { + "epoch": 0.052287087132533765, + "grad_norm": 3.320040464401245, + "learning_rate": 1.7999746377090593e-05, + "loss": 0.9132, + "step": 691 + }, + { + "epoch": 0.05236275585486739, + "grad_norm": 2.664503574371338, + "learning_rate": 1.7999729187179606e-05, + "loss": 0.8706, + "step": 692 + }, + { + "epoch": 0.052438424577201014, + "grad_norm": 5.23117208480835, + "learning_rate": 1.7999711433676474e-05, + "loss": 0.7535, + "step": 693 + }, + { + "epoch": 0.05251409329953464, + "grad_norm": 4.686688423156738, + "learning_rate": 1.7999693116582302e-05, + "loss": 0.9761, + "step": 694 + }, + { + "epoch": 0.05258976202186826, + "grad_norm": 2.983670234680176, + "learning_rate": 1.7999674235898237e-05, + "loss": 0.8898, + "step": 695 + }, + { + "epoch": 0.05266543074420189, + "grad_norm": 3.057015895843506, + "learning_rate": 1.7999654791625463e-05, + "loss": 0.7925, + "step": 696 + }, + { + "epoch": 0.052741099466535504, + "grad_norm": 3.9914684295654297, + "learning_rate": 1.79996347837652e-05, + "loss": 0.7957, + "step": 697 + }, + { + "epoch": 0.05281676818886913, + "grad_norm": 3.987391233444214, + "learning_rate": 1.7999614212318696e-05, + "loss": 0.9454, + "step": 698 + }, + { + "epoch": 0.05289243691120275, + "grad_norm": 4.3634138107299805, + "learning_rate": 1.7999593077287244e-05, + "loss": 1.1532, + "step": 699 + }, + { + "epoch": 0.05296810563353638, + "grad_norm": 3.055154800415039, + "learning_rate": 1.799957137867216e-05, + "loss": 0.8241, + "step": 700 + }, + { + "epoch": 0.05304377435587, + "grad_norm": 3.825345277786255, + "learning_rate": 1.7999549116474813e-05, + "loss": 0.8979, + "step": 701 + }, + { + "epoch": 0.053119443078203626, + "grad_norm": 4.292139530181885, + "learning_rate": 1.7999526290696592e-05, + "loss": 0.9344, + "step": 702 + }, + { + "epoch": 0.05319511180053725, + "grad_norm": 4.645684719085693, + "learning_rate": 1.7999502901338925e-05, + "loss": 0.9731, + "step": 703 + }, + { + "epoch": 0.05327078052287087, + "grad_norm": 2.999361753463745, + "learning_rate": 1.7999478948403278e-05, + "loss": 0.9273, + "step": 704 + }, + { + "epoch": 0.05334644924520449, + "grad_norm": 2.876819610595703, + "learning_rate": 1.7999454431891153e-05, + "loss": 0.8832, + "step": 705 + }, + { + "epoch": 0.05342211796753812, + "grad_norm": 3.5443317890167236, + "learning_rate": 1.7999429351804084e-05, + "loss": 0.792, + "step": 706 + }, + { + "epoch": 0.05349778668987174, + "grad_norm": 2.4923086166381836, + "learning_rate": 1.799940370814364e-05, + "loss": 0.8596, + "step": 707 + }, + { + "epoch": 0.053573455412205366, + 
"grad_norm": 3.5171520709991455, + "learning_rate": 1.799937750091143e-05, + "loss": 0.8311, + "step": 708 + }, + { + "epoch": 0.05364912413453899, + "grad_norm": 3.3235208988189697, + "learning_rate": 1.799935073010909e-05, + "loss": 0.7817, + "step": 709 + }, + { + "epoch": 0.053724792856872615, + "grad_norm": 3.130582809448242, + "learning_rate": 1.79993233957383e-05, + "loss": 0.8418, + "step": 710 + }, + { + "epoch": 0.05380046157920623, + "grad_norm": 2.8573694229125977, + "learning_rate": 1.7999295497800774e-05, + "loss": 0.7832, + "step": 711 + }, + { + "epoch": 0.05387613030153986, + "grad_norm": 3.514740467071533, + "learning_rate": 1.7999267036298257e-05, + "loss": 0.8998, + "step": 712 + }, + { + "epoch": 0.05395179902387348, + "grad_norm": 4.873480796813965, + "learning_rate": 1.799923801123253e-05, + "loss": 0.9173, + "step": 713 + }, + { + "epoch": 0.054027467746207106, + "grad_norm": 3.092484951019287, + "learning_rate": 1.7999208422605412e-05, + "loss": 0.922, + "step": 714 + }, + { + "epoch": 0.05410313646854073, + "grad_norm": 3.5271174907684326, + "learning_rate": 1.7999178270418757e-05, + "loss": 0.8321, + "step": 715 + }, + { + "epoch": 0.054178805190874355, + "grad_norm": 3.2208545207977295, + "learning_rate": 1.799914755467445e-05, + "loss": 0.9818, + "step": 716 + }, + { + "epoch": 0.05425447391320797, + "grad_norm": 3.0340662002563477, + "learning_rate": 1.7999116275374415e-05, + "loss": 0.7099, + "step": 717 + }, + { + "epoch": 0.054330142635541596, + "grad_norm": 3.023000717163086, + "learning_rate": 1.799908443252061e-05, + "loss": 0.835, + "step": 718 + }, + { + "epoch": 0.05440581135787522, + "grad_norm": 4.08595609664917, + "learning_rate": 1.799905202611504e-05, + "loss": 0.8734, + "step": 719 + }, + { + "epoch": 0.054481480080208845, + "grad_norm": 3.454214572906494, + "learning_rate": 1.799901905615972e-05, + "loss": 1.0036, + "step": 720 + }, + { + "epoch": 0.05455714880254247, + "grad_norm": 3.4939661026000977, + "learning_rate": 1.799898552265672e-05, + "loss": 0.7938, + "step": 721 + }, + { + "epoch": 0.054632817524876094, + "grad_norm": 4.215449333190918, + "learning_rate": 1.799895142560814e-05, + "loss": 0.9008, + "step": 722 + }, + { + "epoch": 0.05470848624720972, + "grad_norm": 3.8113982677459717, + "learning_rate": 1.799891676501612e-05, + "loss": 0.8452, + "step": 723 + }, + { + "epoch": 0.054784154969543336, + "grad_norm": 3.6723668575286865, + "learning_rate": 1.7998881540882822e-05, + "loss": 0.8416, + "step": 724 + }, + { + "epoch": 0.05485982369187696, + "grad_norm": 3.342585325241089, + "learning_rate": 1.7998845753210456e-05, + "loss": 0.9927, + "step": 725 + }, + { + "epoch": 0.054935492414210585, + "grad_norm": 3.9180972576141357, + "learning_rate": 1.7998809402001267e-05, + "loss": 0.9583, + "step": 726 + }, + { + "epoch": 0.05501116113654421, + "grad_norm": 3.55850887298584, + "learning_rate": 1.7998772487257524e-05, + "loss": 0.8274, + "step": 727 + }, + { + "epoch": 0.055086829858877834, + "grad_norm": 3.635193347930908, + "learning_rate": 1.799873500898154e-05, + "loss": 0.851, + "step": 728 + }, + { + "epoch": 0.05516249858121146, + "grad_norm": 2.7853517532348633, + "learning_rate": 1.799869696717567e-05, + "loss": 0.778, + "step": 729 + }, + { + "epoch": 0.05523816730354508, + "grad_norm": 3.3416101932525635, + "learning_rate": 1.799865836184229e-05, + "loss": 0.8563, + "step": 730 + }, + { + "epoch": 0.0553138360258787, + "grad_norm": 3.1914992332458496, + "learning_rate": 1.7998619192983812e-05, + "loss": 0.7991, + "step": 
731 + }, + { + "epoch": 0.055389504748212325, + "grad_norm": 3.1963469982147217, + "learning_rate": 1.79985794606027e-05, + "loss": 0.9286, + "step": 732 + }, + { + "epoch": 0.05546517347054595, + "grad_norm": 3.363598346710205, + "learning_rate": 1.7998539164701437e-05, + "loss": 0.8608, + "step": 733 + }, + { + "epoch": 0.05554084219287957, + "grad_norm": 3.9688327312469482, + "learning_rate": 1.7998498305282548e-05, + "loss": 0.947, + "step": 734 + }, + { + "epoch": 0.0556165109152132, + "grad_norm": 3.629190683364868, + "learning_rate": 1.7998456882348587e-05, + "loss": 0.7682, + "step": 735 + }, + { + "epoch": 0.05569217963754682, + "grad_norm": 2.6202425956726074, + "learning_rate": 1.7998414895902153e-05, + "loss": 0.8611, + "step": 736 + }, + { + "epoch": 0.05576784835988045, + "grad_norm": 3.04758882522583, + "learning_rate": 1.7998372345945874e-05, + "loss": 1.0072, + "step": 737 + }, + { + "epoch": 0.055843517082214064, + "grad_norm": 3.110172748565674, + "learning_rate": 1.7998329232482415e-05, + "loss": 0.7794, + "step": 738 + }, + { + "epoch": 0.05591918580454769, + "grad_norm": 3.5827243328094482, + "learning_rate": 1.7998285555514472e-05, + "loss": 0.8902, + "step": 739 + }, + { + "epoch": 0.05599485452688131, + "grad_norm": 3.689215898513794, + "learning_rate": 1.799824131504479e-05, + "loss": 0.9457, + "step": 740 + }, + { + "epoch": 0.05607052324921494, + "grad_norm": 3.847498893737793, + "learning_rate": 1.799819651107613e-05, + "loss": 0.9951, + "step": 741 + }, + { + "epoch": 0.05614619197154856, + "grad_norm": 3.818758249282837, + "learning_rate": 1.7998151143611298e-05, + "loss": 0.8568, + "step": 742 + }, + { + "epoch": 0.056221860693882186, + "grad_norm": 4.948990821838379, + "learning_rate": 1.799810521265314e-05, + "loss": 0.7821, + "step": 743 + }, + { + "epoch": 0.056297529416215804, + "grad_norm": 2.994140625, + "learning_rate": 1.799805871820453e-05, + "loss": 0.8261, + "step": 744 + }, + { + "epoch": 0.05637319813854943, + "grad_norm": 3.428760528564453, + "learning_rate": 1.799801166026838e-05, + "loss": 0.9666, + "step": 745 + }, + { + "epoch": 0.05644886686088305, + "grad_norm": 3.410270929336548, + "learning_rate": 1.7997964038847636e-05, + "loss": 0.7529, + "step": 746 + }, + { + "epoch": 0.05652453558321668, + "grad_norm": 2.595470428466797, + "learning_rate": 1.7997915853945282e-05, + "loss": 0.9564, + "step": 747 + }, + { + "epoch": 0.0566002043055503, + "grad_norm": 2.552440881729126, + "learning_rate": 1.7997867105564336e-05, + "loss": 0.925, + "step": 748 + }, + { + "epoch": 0.056675873027883926, + "grad_norm": 3.9681804180145264, + "learning_rate": 1.7997817793707845e-05, + "loss": 1.0332, + "step": 749 + }, + { + "epoch": 0.05675154175021755, + "grad_norm": 2.687912940979004, + "learning_rate": 1.7997767918378904e-05, + "loss": 0.8711, + "step": 750 + }, + { + "epoch": 0.05682721047255117, + "grad_norm": 3.232062578201294, + "learning_rate": 1.799771747958063e-05, + "loss": 0.6525, + "step": 751 + }, + { + "epoch": 0.05690287919488479, + "grad_norm": 3.3690457344055176, + "learning_rate": 1.7997666477316194e-05, + "loss": 0.9147, + "step": 752 + }, + { + "epoch": 0.05697854791721842, + "grad_norm": 3.5086419582366943, + "learning_rate": 1.7997614911588774e-05, + "loss": 0.9292, + "step": 753 + }, + { + "epoch": 0.05705421663955204, + "grad_norm": 2.7476987838745117, + "learning_rate": 1.7997562782401604e-05, + "loss": 0.7515, + "step": 754 + }, + { + "epoch": 0.057129885361885666, + "grad_norm": 2.3388469219207764, + "learning_rate": 
1.7997510089757956e-05, + "loss": 1.0614, + "step": 755 + }, + { + "epoch": 0.05720555408421929, + "grad_norm": 3.508303165435791, + "learning_rate": 1.7997456833661124e-05, + "loss": 0.7057, + "step": 756 + }, + { + "epoch": 0.057281222806552914, + "grad_norm": 4.021640300750732, + "learning_rate": 1.7997403014114445e-05, + "loss": 1.0216, + "step": 757 + }, + { + "epoch": 0.05735689152888653, + "grad_norm": 5.258941173553467, + "learning_rate": 1.7997348631121287e-05, + "loss": 0.8469, + "step": 758 + }, + { + "epoch": 0.057432560251220156, + "grad_norm": 3.1040396690368652, + "learning_rate": 1.7997293684685055e-05, + "loss": 0.8839, + "step": 759 + }, + { + "epoch": 0.05750822897355378, + "grad_norm": 3.224198341369629, + "learning_rate": 1.7997238174809194e-05, + "loss": 0.8264, + "step": 760 + }, + { + "epoch": 0.057583897695887405, + "grad_norm": 3.097722291946411, + "learning_rate": 1.7997182101497175e-05, + "loss": 0.7879, + "step": 761 + }, + { + "epoch": 0.05765956641822103, + "grad_norm": 3.591596841812134, + "learning_rate": 1.7997125464752517e-05, + "loss": 0.8322, + "step": 762 + }, + { + "epoch": 0.057735235140554654, + "grad_norm": 3.280409336090088, + "learning_rate": 1.7997068264578757e-05, + "loss": 0.8275, + "step": 763 + }, + { + "epoch": 0.05781090386288828, + "grad_norm": 3.701860189437866, + "learning_rate": 1.7997010500979488e-05, + "loss": 0.8116, + "step": 764 + }, + { + "epoch": 0.057886572585221896, + "grad_norm": 3.2338805198669434, + "learning_rate": 1.7996952173958317e-05, + "loss": 0.8088, + "step": 765 + }, + { + "epoch": 0.05796224130755552, + "grad_norm": 3.3278093338012695, + "learning_rate": 1.79968932835189e-05, + "loss": 0.7139, + "step": 766 + }, + { + "epoch": 0.058037910029889145, + "grad_norm": 2.84871768951416, + "learning_rate": 1.799683382966493e-05, + "loss": 0.8951, + "step": 767 + }, + { + "epoch": 0.05811357875222277, + "grad_norm": 3.250761032104492, + "learning_rate": 1.7996773812400124e-05, + "loss": 0.834, + "step": 768 + }, + { + "epoch": 0.058189247474556394, + "grad_norm": 3.869211435317993, + "learning_rate": 1.7996713231728244e-05, + "loss": 0.9022, + "step": 769 + }, + { + "epoch": 0.05826491619689002, + "grad_norm": 3.068364143371582, + "learning_rate": 1.7996652087653082e-05, + "loss": 0.882, + "step": 770 + }, + { + "epoch": 0.058340584919223636, + "grad_norm": 3.9008500576019287, + "learning_rate": 1.7996590380178466e-05, + "loss": 0.956, + "step": 771 + }, + { + "epoch": 0.05841625364155726, + "grad_norm": 5.665666580200195, + "learning_rate": 1.7996528109308266e-05, + "loss": 0.8128, + "step": 772 + }, + { + "epoch": 0.058491922363890884, + "grad_norm": 3.024960517883301, + "learning_rate": 1.7996465275046374e-05, + "loss": 0.9174, + "step": 773 + }, + { + "epoch": 0.05856759108622451, + "grad_norm": 3.01311993598938, + "learning_rate": 1.7996401877396733e-05, + "loss": 0.8168, + "step": 774 + }, + { + "epoch": 0.05864325980855813, + "grad_norm": 3.073803186416626, + "learning_rate": 1.7996337916363302e-05, + "loss": 0.7588, + "step": 775 + }, + { + "epoch": 0.05871892853089176, + "grad_norm": 3.6292426586151123, + "learning_rate": 1.7996273391950095e-05, + "loss": 1.1097, + "step": 776 + }, + { + "epoch": 0.05879459725322538, + "grad_norm": 3.8415868282318115, + "learning_rate": 1.7996208304161153e-05, + "loss": 0.9531, + "step": 777 + }, + { + "epoch": 0.058870265975559, + "grad_norm": 2.900418996810913, + "learning_rate": 1.799614265300055e-05, + "loss": 0.8801, + "step": 778 + }, + { + "epoch": 0.058945934697892624, + 
"grad_norm": 3.2337486743927, + "learning_rate": 1.7996076438472395e-05, + "loss": 0.92, + "step": 779 + }, + { + "epoch": 0.05902160342022625, + "grad_norm": 2.9472317695617676, + "learning_rate": 1.7996009660580836e-05, + "loss": 0.8633, + "step": 780 + }, + { + "epoch": 0.05909727214255987, + "grad_norm": 2.4730706214904785, + "learning_rate": 1.7995942319330056e-05, + "loss": 0.8554, + "step": 781 + }, + { + "epoch": 0.0591729408648935, + "grad_norm": 5.070908546447754, + "learning_rate": 1.7995874414724272e-05, + "loss": 0.7889, + "step": 782 + }, + { + "epoch": 0.05924860958722712, + "grad_norm": 3.5135512351989746, + "learning_rate": 1.7995805946767734e-05, + "loss": 0.802, + "step": 783 + }, + { + "epoch": 0.059324278309560746, + "grad_norm": 3.356902599334717, + "learning_rate": 1.7995736915464735e-05, + "loss": 0.8238, + "step": 784 + }, + { + "epoch": 0.059399947031894364, + "grad_norm": 3.2595343589782715, + "learning_rate": 1.7995667320819595e-05, + "loss": 0.8915, + "step": 785 + }, + { + "epoch": 0.05947561575422799, + "grad_norm": 2.725177526473999, + "learning_rate": 1.799559716283667e-05, + "loss": 0.7803, + "step": 786 + }, + { + "epoch": 0.05955128447656161, + "grad_norm": 3.298215389251709, + "learning_rate": 1.7995526441520354e-05, + "loss": 0.9538, + "step": 787 + }, + { + "epoch": 0.05962695319889524, + "grad_norm": 4.367799758911133, + "learning_rate": 1.7995455156875077e-05, + "loss": 0.9063, + "step": 788 + }, + { + "epoch": 0.05970262192122886, + "grad_norm": 2.9157984256744385, + "learning_rate": 1.7995383308905307e-05, + "loss": 0.8681, + "step": 789 + }, + { + "epoch": 0.059778290643562486, + "grad_norm": 3.340041399002075, + "learning_rate": 1.7995310897615537e-05, + "loss": 0.9215, + "step": 790 + }, + { + "epoch": 0.05985395936589611, + "grad_norm": 3.1033027172088623, + "learning_rate": 1.7995237923010306e-05, + "loss": 0.8081, + "step": 791 + }, + { + "epoch": 0.05992962808822973, + "grad_norm": 3.03116774559021, + "learning_rate": 1.799516438509418e-05, + "loss": 0.9414, + "step": 792 + }, + { + "epoch": 0.06000529681056335, + "grad_norm": 3.3425679206848145, + "learning_rate": 1.7995090283871765e-05, + "loss": 0.8291, + "step": 793 + }, + { + "epoch": 0.06008096553289698, + "grad_norm": 2.8552069664001465, + "learning_rate": 1.7995015619347707e-05, + "loss": 0.9352, + "step": 794 + }, + { + "epoch": 0.0601566342552306, + "grad_norm": 3.53585147857666, + "learning_rate": 1.7994940391526674e-05, + "loss": 0.8699, + "step": 795 + }, + { + "epoch": 0.060232302977564225, + "grad_norm": 2.6548848152160645, + "learning_rate": 1.7994864600413383e-05, + "loss": 0.806, + "step": 796 + }, + { + "epoch": 0.06030797169989785, + "grad_norm": 2.734811782836914, + "learning_rate": 1.7994788246012578e-05, + "loss": 0.73, + "step": 797 + }, + { + "epoch": 0.06038364042223147, + "grad_norm": 3.6536951065063477, + "learning_rate": 1.7994711328329038e-05, + "loss": 0.7225, + "step": 798 + }, + { + "epoch": 0.06045930914456509, + "grad_norm": 2.3973493576049805, + "learning_rate": 1.7994633847367582e-05, + "loss": 0.661, + "step": 799 + }, + { + "epoch": 0.060534977866898716, + "grad_norm": 4.086350917816162, + "learning_rate": 1.7994555803133065e-05, + "loss": 0.8949, + "step": 800 + }, + { + "epoch": 0.06061064658923234, + "grad_norm": 2.1674516201019287, + "learning_rate": 1.799447719563037e-05, + "loss": 1.0709, + "step": 801 + }, + { + "epoch": 0.060686315311565965, + "grad_norm": 3.184936761856079, + "learning_rate": 1.799439802486442e-05, + "loss": 0.7324, + "step": 
802 + }, + { + "epoch": 0.06076198403389959, + "grad_norm": 2.968808889389038, + "learning_rate": 1.7994318290840178e-05, + "loss": 0.84, + "step": 803 + }, + { + "epoch": 0.060837652756233214, + "grad_norm": 3.6430764198303223, + "learning_rate": 1.799423799356263e-05, + "loss": 0.8638, + "step": 804 + }, + { + "epoch": 0.06091332147856683, + "grad_norm": 2.8016927242279053, + "learning_rate": 1.799415713303681e-05, + "loss": 0.7604, + "step": 805 + }, + { + "epoch": 0.060988990200900456, + "grad_norm": 7.259315013885498, + "learning_rate": 1.799407570926778e-05, + "loss": 0.9089, + "step": 806 + }, + { + "epoch": 0.06106465892323408, + "grad_norm": 4.342022895812988, + "learning_rate": 1.7993993722260635e-05, + "loss": 0.7734, + "step": 807 + }, + { + "epoch": 0.061140327645567705, + "grad_norm": 4.369460582733154, + "learning_rate": 1.7993911172020517e-05, + "loss": 0.8225, + "step": 808 + }, + { + "epoch": 0.06121599636790133, + "grad_norm": 3.1216466426849365, + "learning_rate": 1.7993828058552593e-05, + "loss": 1.0397, + "step": 809 + }, + { + "epoch": 0.061291665090234954, + "grad_norm": 3.13508677482605, + "learning_rate": 1.799374438186206e-05, + "loss": 0.9094, + "step": 810 + }, + { + "epoch": 0.06136733381256858, + "grad_norm": 11.32715892791748, + "learning_rate": 1.799366014195417e-05, + "loss": 0.9524, + "step": 811 + }, + { + "epoch": 0.061443002534902195, + "grad_norm": 2.600041389465332, + "learning_rate": 1.799357533883419e-05, + "loss": 0.7271, + "step": 812 + }, + { + "epoch": 0.06151867125723582, + "grad_norm": 3.431683301925659, + "learning_rate": 1.7993489972507434e-05, + "loss": 0.8767, + "step": 813 + }, + { + "epoch": 0.061594339979569444, + "grad_norm": 3.009431838989258, + "learning_rate": 1.799340404297925e-05, + "loss": 0.8367, + "step": 814 + }, + { + "epoch": 0.06167000870190307, + "grad_norm": 3.2158117294311523, + "learning_rate": 1.7993317550255014e-05, + "loss": 0.8516, + "step": 815 + }, + { + "epoch": 0.06174567742423669, + "grad_norm": 3.753148317337036, + "learning_rate": 1.7993230494340145e-05, + "loss": 0.8619, + "step": 816 + }, + { + "epoch": 0.06182134614657032, + "grad_norm": 3.217808485031128, + "learning_rate": 1.7993142875240097e-05, + "loss": 0.7954, + "step": 817 + }, + { + "epoch": 0.061897014868903935, + "grad_norm": 6.548532009124756, + "learning_rate": 1.7993054692960354e-05, + "loss": 0.8667, + "step": 818 + }, + { + "epoch": 0.06197268359123756, + "grad_norm": 3.1087334156036377, + "learning_rate": 1.7992965947506437e-05, + "loss": 0.9301, + "step": 819 + }, + { + "epoch": 0.062048352313571184, + "grad_norm": 4.297720432281494, + "learning_rate": 1.7992876638883907e-05, + "loss": 0.753, + "step": 820 + }, + { + "epoch": 0.06212402103590481, + "grad_norm": 3.4362447261810303, + "learning_rate": 1.7992786767098353e-05, + "loss": 0.7636, + "step": 821 + }, + { + "epoch": 0.06219968975823843, + "grad_norm": 2.7632527351379395, + "learning_rate": 1.799269633215541e-05, + "loss": 0.8575, + "step": 822 + }, + { + "epoch": 0.06227535848057206, + "grad_norm": 3.267557144165039, + "learning_rate": 1.7992605334060736e-05, + "loss": 0.7376, + "step": 823 + }, + { + "epoch": 0.06235102720290568, + "grad_norm": 3.381315231323242, + "learning_rate": 1.7992513772820027e-05, + "loss": 0.9032, + "step": 824 + }, + { + "epoch": 0.0624266959252393, + "grad_norm": 3.6174585819244385, + "learning_rate": 1.7992421648439024e-05, + "loss": 0.7052, + "step": 825 + }, + { + "epoch": 0.06250236464757293, + "grad_norm": 3.082953929901123, + "learning_rate": 
1.799232896092349e-05, + "loss": 0.9377, + "step": 826 + }, + { + "epoch": 0.06257803336990655, + "grad_norm": 5.397732734680176, + "learning_rate": 1.7992235710279233e-05, + "loss": 0.8913, + "step": 827 + }, + { + "epoch": 0.06265370209224018, + "grad_norm": 3.0445351600646973, + "learning_rate": 1.799214189651209e-05, + "loss": 0.9004, + "step": 828 + }, + { + "epoch": 0.06272937081457379, + "grad_norm": 3.1507112979888916, + "learning_rate": 1.799204751962794e-05, + "loss": 1.0471, + "step": 829 + }, + { + "epoch": 0.06280503953690741, + "grad_norm": 4.134524822235107, + "learning_rate": 1.7991952579632688e-05, + "loss": 0.8125, + "step": 830 + }, + { + "epoch": 0.06288070825924104, + "grad_norm": 2.9399423599243164, + "learning_rate": 1.799185707653228e-05, + "loss": 0.7965, + "step": 831 + }, + { + "epoch": 0.06295637698157466, + "grad_norm": 4.048933506011963, + "learning_rate": 1.7991761010332704e-05, + "loss": 0.8824, + "step": 832 + }, + { + "epoch": 0.06303204570390829, + "grad_norm": 2.8442611694335938, + "learning_rate": 1.7991664381039968e-05, + "loss": 0.6825, + "step": 833 + }, + { + "epoch": 0.06310771442624191, + "grad_norm": 3.6340487003326416, + "learning_rate": 1.7991567188660125e-05, + "loss": 0.8944, + "step": 834 + }, + { + "epoch": 0.06318338314857554, + "grad_norm": 3.6592376232147217, + "learning_rate": 1.7991469433199264e-05, + "loss": 0.8148, + "step": 835 + }, + { + "epoch": 0.06325905187090916, + "grad_norm": 3.6150155067443848, + "learning_rate": 1.7991371114663503e-05, + "loss": 0.6471, + "step": 836 + }, + { + "epoch": 0.06333472059324279, + "grad_norm": 3.5820491313934326, + "learning_rate": 1.7991272233059003e-05, + "loss": 0.9492, + "step": 837 + }, + { + "epoch": 0.06341038931557641, + "grad_norm": 3.483809471130371, + "learning_rate": 1.7991172788391953e-05, + "loss": 0.7662, + "step": 838 + }, + { + "epoch": 0.06348605803791003, + "grad_norm": 3.0306272506713867, + "learning_rate": 1.7991072780668585e-05, + "loss": 0.9009, + "step": 839 + }, + { + "epoch": 0.06356172676024366, + "grad_norm": 3.402259588241577, + "learning_rate": 1.7990972209895155e-05, + "loss": 0.7558, + "step": 840 + }, + { + "epoch": 0.06363739548257728, + "grad_norm": 2.673753023147583, + "learning_rate": 1.7990871076077967e-05, + "loss": 0.7811, + "step": 841 + }, + { + "epoch": 0.0637130642049109, + "grad_norm": 4.777785778045654, + "learning_rate": 1.799076937922335e-05, + "loss": 0.8718, + "step": 842 + }, + { + "epoch": 0.06378873292724452, + "grad_norm": 3.1430823802948, + "learning_rate": 1.799066711933768e-05, + "loss": 0.7224, + "step": 843 + }, + { + "epoch": 0.06386440164957814, + "grad_norm": 3.542694568634033, + "learning_rate": 1.799056429642735e-05, + "loss": 0.7901, + "step": 844 + }, + { + "epoch": 0.06394007037191177, + "grad_norm": 3.038499116897583, + "learning_rate": 1.7990460910498806e-05, + "loss": 0.79, + "step": 845 + }, + { + "epoch": 0.06401573909424539, + "grad_norm": 3.5024659633636475, + "learning_rate": 1.7990356961558523e-05, + "loss": 0.9269, + "step": 846 + }, + { + "epoch": 0.06409140781657902, + "grad_norm": 4.1338067054748535, + "learning_rate": 1.7990252449613008e-05, + "loss": 0.9418, + "step": 847 + }, + { + "epoch": 0.06416707653891264, + "grad_norm": 3.374940872192383, + "learning_rate": 1.7990147374668806e-05, + "loss": 0.9184, + "step": 848 + }, + { + "epoch": 0.06424274526124626, + "grad_norm": 3.2009170055389404, + "learning_rate": 1.7990041736732497e-05, + "loss": 0.7091, + "step": 849 + }, + { + "epoch": 0.06431841398357989, + 
"grad_norm": 4.257187366485596, + "learning_rate": 1.79899355358107e-05, + "loss": 0.958, + "step": 850 + }, + { + "epoch": 0.06439408270591351, + "grad_norm": 3.449984550476074, + "learning_rate": 1.798982877191006e-05, + "loss": 0.7824, + "step": 851 + }, + { + "epoch": 0.06446975142824714, + "grad_norm": 4.163568019866943, + "learning_rate": 1.798972144503727e-05, + "loss": 0.8174, + "step": 852 + }, + { + "epoch": 0.06454542015058076, + "grad_norm": 4.116754531860352, + "learning_rate": 1.7989613555199045e-05, + "loss": 0.69, + "step": 853 + }, + { + "epoch": 0.06462108887291439, + "grad_norm": 4.340511322021484, + "learning_rate": 1.798950510240214e-05, + "loss": 0.8673, + "step": 854 + }, + { + "epoch": 0.06469675759524801, + "grad_norm": 4.204843521118164, + "learning_rate": 1.798939608665335e-05, + "loss": 0.7685, + "step": 855 + }, + { + "epoch": 0.06477242631758162, + "grad_norm": 5.48193359375, + "learning_rate": 1.7989286507959505e-05, + "loss": 0.8221, + "step": 856 + }, + { + "epoch": 0.06484809503991525, + "grad_norm": 3.1936569213867188, + "learning_rate": 1.7989176366327463e-05, + "loss": 0.8692, + "step": 857 + }, + { + "epoch": 0.06492376376224887, + "grad_norm": 3.160611867904663, + "learning_rate": 1.7989065661764122e-05, + "loss": 0.7909, + "step": 858 + }, + { + "epoch": 0.0649994324845825, + "grad_norm": 2.972747564315796, + "learning_rate": 1.7988954394276416e-05, + "loss": 0.9906, + "step": 859 + }, + { + "epoch": 0.06507510120691612, + "grad_norm": 3.2030298709869385, + "learning_rate": 1.798884256387131e-05, + "loss": 0.8621, + "step": 860 + }, + { + "epoch": 0.06515076992924974, + "grad_norm": 3.12058162689209, + "learning_rate": 1.7988730170555808e-05, + "loss": 0.9119, + "step": 861 + }, + { + "epoch": 0.06522643865158337, + "grad_norm": 3.048793077468872, + "learning_rate": 1.7988617214336953e-05, + "loss": 0.8322, + "step": 862 + }, + { + "epoch": 0.06530210737391699, + "grad_norm": 3.0921437740325928, + "learning_rate": 1.7988503695221814e-05, + "loss": 0.8441, + "step": 863 + }, + { + "epoch": 0.06537777609625062, + "grad_norm": 2.826828718185425, + "learning_rate": 1.7988389613217504e-05, + "loss": 0.9022, + "step": 864 + }, + { + "epoch": 0.06545344481858424, + "grad_norm": 2.6267917156219482, + "learning_rate": 1.798827496833116e-05, + "loss": 0.8584, + "step": 865 + }, + { + "epoch": 0.06552911354091787, + "grad_norm": 2.9729437828063965, + "learning_rate": 1.7988159760569968e-05, + "loss": 0.9119, + "step": 866 + }, + { + "epoch": 0.06560478226325149, + "grad_norm": 2.4244964122772217, + "learning_rate": 1.798804398994114e-05, + "loss": 0.7932, + "step": 867 + }, + { + "epoch": 0.06568045098558511, + "grad_norm": 4.761054515838623, + "learning_rate": 1.7987927656451928e-05, + "loss": 0.9412, + "step": 868 + }, + { + "epoch": 0.06575611970791873, + "grad_norm": 2.717557191848755, + "learning_rate": 1.7987810760109615e-05, + "loss": 0.7506, + "step": 869 + }, + { + "epoch": 0.06583178843025235, + "grad_norm": 3.001830577850342, + "learning_rate": 1.798769330092152e-05, + "loss": 0.8709, + "step": 870 + }, + { + "epoch": 0.06590745715258597, + "grad_norm": 3.638742208480835, + "learning_rate": 1.7987575278895005e-05, + "loss": 0.7777, + "step": 871 + }, + { + "epoch": 0.0659831258749196, + "grad_norm": 2.8013672828674316, + "learning_rate": 1.798745669403745e-05, + "loss": 0.7571, + "step": 872 + }, + { + "epoch": 0.06605879459725322, + "grad_norm": 2.839331865310669, + "learning_rate": 1.7987337546356293e-05, + "loss": 0.7515, + "step": 873 + }, + { + 
"epoch": 0.06613446331958685, + "grad_norm": 3.6502885818481445, + "learning_rate": 1.798721783585899e-05, + "loss": 0.7892, + "step": 874 + }, + { + "epoch": 0.06621013204192047, + "grad_norm": 2.118971347808838, + "learning_rate": 1.7987097562553037e-05, + "loss": 0.9736, + "step": 875 + }, + { + "epoch": 0.0662858007642541, + "grad_norm": 2.8734261989593506, + "learning_rate": 1.7986976726445966e-05, + "loss": 0.783, + "step": 876 + }, + { + "epoch": 0.06636146948658772, + "grad_norm": 2.609933376312256, + "learning_rate": 1.7986855327545346e-05, + "loss": 0.7125, + "step": 877 + }, + { + "epoch": 0.06643713820892135, + "grad_norm": 3.158010721206665, + "learning_rate": 1.798673336585878e-05, + "loss": 0.7234, + "step": 878 + }, + { + "epoch": 0.06651280693125497, + "grad_norm": 3.257824182510376, + "learning_rate": 1.7986610841393902e-05, + "loss": 0.9167, + "step": 879 + }, + { + "epoch": 0.0665884756535886, + "grad_norm": 2.97019100189209, + "learning_rate": 1.7986487754158386e-05, + "loss": 0.7206, + "step": 880 + }, + { + "epoch": 0.06666414437592222, + "grad_norm": 2.5462703704833984, + "learning_rate": 1.7986364104159942e-05, + "loss": 0.8476, + "step": 881 + }, + { + "epoch": 0.06673981309825583, + "grad_norm": 3.024618625640869, + "learning_rate": 1.7986239891406314e-05, + "loss": 0.929, + "step": 882 + }, + { + "epoch": 0.06681548182058945, + "grad_norm": 4.000933647155762, + "learning_rate": 1.7986115115905276e-05, + "loss": 0.8126, + "step": 883 + }, + { + "epoch": 0.06689115054292308, + "grad_norm": 2.9372494220733643, + "learning_rate": 1.798598977766465e-05, + "loss": 0.8261, + "step": 884 + }, + { + "epoch": 0.0669668192652567, + "grad_norm": 2.813204526901245, + "learning_rate": 1.7985863876692276e-05, + "loss": 0.8327, + "step": 885 + }, + { + "epoch": 0.06704248798759033, + "grad_norm": 3.385720729827881, + "learning_rate": 1.798573741299604e-05, + "loss": 0.7472, + "step": 886 + }, + { + "epoch": 0.06711815670992395, + "grad_norm": 2.617894172668457, + "learning_rate": 1.798561038658387e-05, + "loss": 0.781, + "step": 887 + }, + { + "epoch": 0.06719382543225758, + "grad_norm": 3.153611660003662, + "learning_rate": 1.798548279746371e-05, + "loss": 0.7154, + "step": 888 + }, + { + "epoch": 0.0672694941545912, + "grad_norm": 2.9759254455566406, + "learning_rate": 1.7985354645643556e-05, + "loss": 0.7758, + "step": 889 + }, + { + "epoch": 0.06734516287692482, + "grad_norm": 3.233285665512085, + "learning_rate": 1.798522593113143e-05, + "loss": 0.8326, + "step": 890 + }, + { + "epoch": 0.06742083159925845, + "grad_norm": 3.2557930946350098, + "learning_rate": 1.7985096653935396e-05, + "loss": 0.7994, + "step": 891 + }, + { + "epoch": 0.06749650032159207, + "grad_norm": 3.4396860599517822, + "learning_rate": 1.7984966814063547e-05, + "loss": 0.8146, + "step": 892 + }, + { + "epoch": 0.0675721690439257, + "grad_norm": 2.9307057857513428, + "learning_rate": 1.7984836411524018e-05, + "loss": 0.8404, + "step": 893 + }, + { + "epoch": 0.06764783776625932, + "grad_norm": 3.1052684783935547, + "learning_rate": 1.798470544632497e-05, + "loss": 0.9605, + "step": 894 + }, + { + "epoch": 0.06772350648859295, + "grad_norm": 3.313931465148926, + "learning_rate": 1.798457391847461e-05, + "loss": 0.7114, + "step": 895 + }, + { + "epoch": 0.06779917521092656, + "grad_norm": 3.335641860961914, + "learning_rate": 1.7984441827981166e-05, + "loss": 0.9155, + "step": 896 + }, + { + "epoch": 0.06787484393326018, + "grad_norm": 2.163098096847534, + "learning_rate": 1.7984309174852918e-05, + 
"loss": 0.807, + "step": 897 + }, + { + "epoch": 0.0679505126555938, + "grad_norm": 3.4636337757110596, + "learning_rate": 1.7984175959098172e-05, + "loss": 0.7748, + "step": 898 + }, + { + "epoch": 0.06802618137792743, + "grad_norm": 2.3630826473236084, + "learning_rate": 1.798404218072527e-05, + "loss": 0.7619, + "step": 899 + }, + { + "epoch": 0.06810185010026105, + "grad_norm": 2.6266446113586426, + "learning_rate": 1.7983907839742587e-05, + "loss": 0.9418, + "step": 900 + }, + { + "epoch": 0.06817751882259468, + "grad_norm": 2.760838747024536, + "learning_rate": 1.798377293615854e-05, + "loss": 0.836, + "step": 901 + }, + { + "epoch": 0.0682531875449283, + "grad_norm": 3.108145236968994, + "learning_rate": 1.798363746998157e-05, + "loss": 0.7683, + "step": 902 + }, + { + "epoch": 0.06832885626726193, + "grad_norm": 2.807042360305786, + "learning_rate": 1.7983501441220168e-05, + "loss": 0.9376, + "step": 903 + }, + { + "epoch": 0.06840452498959555, + "grad_norm": 3.531285047531128, + "learning_rate": 1.798336484988285e-05, + "loss": 0.7328, + "step": 904 + }, + { + "epoch": 0.06848019371192918, + "grad_norm": 3.469963788986206, + "learning_rate": 1.7983227695978168e-05, + "loss": 0.7034, + "step": 905 + }, + { + "epoch": 0.0685558624342628, + "grad_norm": 3.210841417312622, + "learning_rate": 1.798308997951471e-05, + "loss": 0.6951, + "step": 906 + }, + { + "epoch": 0.06863153115659643, + "grad_norm": 2.795273542404175, + "learning_rate": 1.798295170050111e-05, + "loss": 0.7853, + "step": 907 + }, + { + "epoch": 0.06870719987893005, + "grad_norm": 4.241882801055908, + "learning_rate": 1.7982812858946015e-05, + "loss": 0.8056, + "step": 908 + }, + { + "epoch": 0.06878286860126366, + "grad_norm": 2.5910651683807373, + "learning_rate": 1.7982673454858125e-05, + "loss": 0.6758, + "step": 909 + }, + { + "epoch": 0.06885853732359729, + "grad_norm": 3.2898170948028564, + "learning_rate": 1.798253348824617e-05, + "loss": 0.8563, + "step": 910 + }, + { + "epoch": 0.06893420604593091, + "grad_norm": 3.170915126800537, + "learning_rate": 1.7982392959118914e-05, + "loss": 0.9903, + "step": 911 + }, + { + "epoch": 0.06900987476826453, + "grad_norm": 2.6784350872039795, + "learning_rate": 1.7982251867485162e-05, + "loss": 0.82, + "step": 912 + }, + { + "epoch": 0.06908554349059816, + "grad_norm": 2.870120048522949, + "learning_rate": 1.798211021335374e-05, + "loss": 0.7205, + "step": 913 + }, + { + "epoch": 0.06916121221293178, + "grad_norm": 3.627228260040283, + "learning_rate": 1.798196799673353e-05, + "loss": 0.7787, + "step": 914 + }, + { + "epoch": 0.06923688093526541, + "grad_norm": 3.563584089279175, + "learning_rate": 1.7981825217633433e-05, + "loss": 0.9949, + "step": 915 + }, + { + "epoch": 0.06931254965759903, + "grad_norm": 3.695765495300293, + "learning_rate": 1.7981681876062388e-05, + "loss": 0.673, + "step": 916 + }, + { + "epoch": 0.06938821837993266, + "grad_norm": 3.3603649139404297, + "learning_rate": 1.798153797202937e-05, + "loss": 0.9694, + "step": 917 + }, + { + "epoch": 0.06946388710226628, + "grad_norm": 3.820831537246704, + "learning_rate": 1.7981393505543403e-05, + "loss": 0.9224, + "step": 918 + }, + { + "epoch": 0.0695395558245999, + "grad_norm": 3.589085102081299, + "learning_rate": 1.798124847661352e-05, + "loss": 0.8123, + "step": 919 + }, + { + "epoch": 0.06961522454693353, + "grad_norm": 3.0185937881469727, + "learning_rate": 1.798110288524881e-05, + "loss": 0.8082, + "step": 920 + }, + { + "epoch": 0.06969089326926715, + "grad_norm": 3.6897995471954346, + 
"learning_rate": 1.7980956731458387e-05, + "loss": 0.9175, + "step": 921 + }, + { + "epoch": 0.06976656199160078, + "grad_norm": 3.113912582397461, + "learning_rate": 1.7980810015251407e-05, + "loss": 0.888, + "step": 922 + }, + { + "epoch": 0.06984223071393439, + "grad_norm": 2.264333486557007, + "learning_rate": 1.7980662736637054e-05, + "loss": 0.6739, + "step": 923 + }, + { + "epoch": 0.06991789943626801, + "grad_norm": 1.843481421470642, + "learning_rate": 1.7980514895624558e-05, + "loss": 1.0251, + "step": 924 + }, + { + "epoch": 0.06999356815860164, + "grad_norm": 3.6482927799224854, + "learning_rate": 1.798036649222317e-05, + "loss": 0.7277, + "step": 925 + }, + { + "epoch": 0.07006923688093526, + "grad_norm": 11.99400520324707, + "learning_rate": 1.7980217526442186e-05, + "loss": 0.9066, + "step": 926 + }, + { + "epoch": 0.07014490560326889, + "grad_norm": 2.5644752979278564, + "learning_rate": 1.7980067998290935e-05, + "loss": 0.887, + "step": 927 + }, + { + "epoch": 0.07022057432560251, + "grad_norm": 3.71718692779541, + "learning_rate": 1.797991790777878e-05, + "loss": 0.8685, + "step": 928 + }, + { + "epoch": 0.07029624304793614, + "grad_norm": 2.8822622299194336, + "learning_rate": 1.797976725491512e-05, + "loss": 0.8336, + "step": 929 + }, + { + "epoch": 0.07037191177026976, + "grad_norm": 2.9357829093933105, + "learning_rate": 1.7979616039709396e-05, + "loss": 0.8856, + "step": 930 + }, + { + "epoch": 0.07044758049260338, + "grad_norm": 2.640735387802124, + "learning_rate": 1.7979464262171067e-05, + "loss": 0.7398, + "step": 931 + }, + { + "epoch": 0.07052324921493701, + "grad_norm": 3.1476693153381348, + "learning_rate": 1.7979311922309645e-05, + "loss": 0.9748, + "step": 932 + }, + { + "epoch": 0.07059891793727063, + "grad_norm": 2.6864423751831055, + "learning_rate": 1.7979159020134668e-05, + "loss": 0.7716, + "step": 933 + }, + { + "epoch": 0.07067458665960426, + "grad_norm": 2.750220537185669, + "learning_rate": 1.797900555565571e-05, + "loss": 0.8742, + "step": 934 + }, + { + "epoch": 0.07075025538193788, + "grad_norm": 2.5933568477630615, + "learning_rate": 1.7978851528882382e-05, + "loss": 0.833, + "step": 935 + }, + { + "epoch": 0.07082592410427149, + "grad_norm": 2.8534131050109863, + "learning_rate": 1.7978696939824333e-05, + "loss": 0.8054, + "step": 936 + }, + { + "epoch": 0.07090159282660512, + "grad_norm": 3.7665860652923584, + "learning_rate": 1.7978541788491237e-05, + "loss": 0.9409, + "step": 937 + }, + { + "epoch": 0.07097726154893874, + "grad_norm": 2.939113140106201, + "learning_rate": 1.7978386074892816e-05, + "loss": 0.8041, + "step": 938 + }, + { + "epoch": 0.07105293027127237, + "grad_norm": 3.101107597351074, + "learning_rate": 1.7978229799038816e-05, + "loss": 0.8247, + "step": 939 + }, + { + "epoch": 0.07112859899360599, + "grad_norm": 2.7688238620758057, + "learning_rate": 1.7978072960939034e-05, + "loss": 0.8326, + "step": 940 + }, + { + "epoch": 0.07120426771593961, + "grad_norm": 3.1420252323150635, + "learning_rate": 1.797791556060328e-05, + "loss": 0.8231, + "step": 941 + }, + { + "epoch": 0.07127993643827324, + "grad_norm": 2.776109218597412, + "learning_rate": 1.7977757598041417e-05, + "loss": 0.7977, + "step": 942 + }, + { + "epoch": 0.07135560516060686, + "grad_norm": 4.262285232543945, + "learning_rate": 1.7977599073263335e-05, + "loss": 0.7962, + "step": 943 + }, + { + "epoch": 0.07143127388294049, + "grad_norm": 3.1178438663482666, + "learning_rate": 1.7977439986278962e-05, + "loss": 0.8491, + "step": 944 + }, + { + "epoch": 
0.07150694260527411, + "grad_norm": 3.3614895343780518, + "learning_rate": 1.797728033709826e-05, + "loss": 0.9089, + "step": 945 + }, + { + "epoch": 0.07158261132760774, + "grad_norm": 2.6752171516418457, + "learning_rate": 1.797712012573123e-05, + "loss": 0.9587, + "step": 946 + }, + { + "epoch": 0.07165828004994136, + "grad_norm": 3.405928373336792, + "learning_rate": 1.79769593521879e-05, + "loss": 0.8991, + "step": 947 + }, + { + "epoch": 0.07173394877227499, + "grad_norm": 2.2228682041168213, + "learning_rate": 1.7976798016478336e-05, + "loss": 1.106, + "step": 948 + }, + { + "epoch": 0.07180961749460861, + "grad_norm": 2.7371156215667725, + "learning_rate": 1.797663611861265e-05, + "loss": 0.8074, + "step": 949 + }, + { + "epoch": 0.07188528621694222, + "grad_norm": 3.274010181427002, + "learning_rate": 1.7976473658600977e-05, + "loss": 0.8784, + "step": 950 + }, + { + "epoch": 0.07196095493927585, + "grad_norm": 3.2630934715270996, + "learning_rate": 1.797631063645349e-05, + "loss": 0.929, + "step": 951 + }, + { + "epoch": 0.07203662366160947, + "grad_norm": 3.075411796569824, + "learning_rate": 1.7976147052180395e-05, + "loss": 0.7251, + "step": 952 + }, + { + "epoch": 0.0721122923839431, + "grad_norm": 2.965583324432373, + "learning_rate": 1.797598290579194e-05, + "loss": 0.7216, + "step": 953 + }, + { + "epoch": 0.07218796110627672, + "grad_norm": 2.7841546535491943, + "learning_rate": 1.797581819729841e-05, + "loss": 0.8072, + "step": 954 + }, + { + "epoch": 0.07226362982861034, + "grad_norm": 3.408371686935425, + "learning_rate": 1.7975652926710108e-05, + "loss": 0.7652, + "step": 955 + }, + { + "epoch": 0.07233929855094397, + "grad_norm": 3.180001974105835, + "learning_rate": 1.7975487094037386e-05, + "loss": 0.9272, + "step": 956 + }, + { + "epoch": 0.07241496727327759, + "grad_norm": 3.346219301223755, + "learning_rate": 1.7975320699290637e-05, + "loss": 0.9778, + "step": 957 + }, + { + "epoch": 0.07249063599561122, + "grad_norm": 2.9968905448913574, + "learning_rate": 1.7975153742480274e-05, + "loss": 0.8965, + "step": 958 + }, + { + "epoch": 0.07256630471794484, + "grad_norm": 3.1787264347076416, + "learning_rate": 1.7974986223616754e-05, + "loss": 0.7344, + "step": 959 + }, + { + "epoch": 0.07264197344027847, + "grad_norm": 3.266357898712158, + "learning_rate": 1.797481814271057e-05, + "loss": 0.8381, + "step": 960 + }, + { + "epoch": 0.07271764216261209, + "grad_norm": 3.33705472946167, + "learning_rate": 1.7974649499772244e-05, + "loss": 0.745, + "step": 961 + }, + { + "epoch": 0.07279331088494571, + "grad_norm": 3.2236170768737793, + "learning_rate": 1.797448029481234e-05, + "loss": 0.7465, + "step": 962 + }, + { + "epoch": 0.07286897960727932, + "grad_norm": 3.4352869987487793, + "learning_rate": 1.797431052784145e-05, + "loss": 1.004, + "step": 963 + }, + { + "epoch": 0.07294464832961295, + "grad_norm": 3.1209468841552734, + "learning_rate": 1.797414019887021e-05, + "loss": 0.9475, + "step": 964 + }, + { + "epoch": 0.07302031705194657, + "grad_norm": 7.6214823722839355, + "learning_rate": 1.7973969307909286e-05, + "loss": 0.8257, + "step": 965 + }, + { + "epoch": 0.0730959857742802, + "grad_norm": 3.500762939453125, + "learning_rate": 1.797379785496938e-05, + "loss": 0.8723, + "step": 966 + }, + { + "epoch": 0.07317165449661382, + "grad_norm": 3.0872161388397217, + "learning_rate": 1.7973625840061224e-05, + "loss": 0.8551, + "step": 967 + }, + { + "epoch": 0.07324732321894745, + "grad_norm": 3.6307787895202637, + "learning_rate": 1.7973453263195595e-05, + "loss": 
0.8331, + "step": 968 + }, + { + "epoch": 0.07332299194128107, + "grad_norm": 3.153038501739502, + "learning_rate": 1.79732801243833e-05, + "loss": 0.8229, + "step": 969 + }, + { + "epoch": 0.0733986606636147, + "grad_norm": 1.7069755792617798, + "learning_rate": 1.797310642363518e-05, + "loss": 0.9515, + "step": 970 + }, + { + "epoch": 0.07347432938594832, + "grad_norm": 2.6972126960754395, + "learning_rate": 1.797293216096211e-05, + "loss": 0.7397, + "step": 971 + }, + { + "epoch": 0.07354999810828194, + "grad_norm": 2.9356179237365723, + "learning_rate": 1.7972757336375012e-05, + "loss": 0.8123, + "step": 972 + }, + { + "epoch": 0.07362566683061557, + "grad_norm": 2.5552573204040527, + "learning_rate": 1.7972581949884823e-05, + "loss": 0.8397, + "step": 973 + }, + { + "epoch": 0.0737013355529492, + "grad_norm": 2.462688684463501, + "learning_rate": 1.7972406001502535e-05, + "loss": 0.8085, + "step": 974 + }, + { + "epoch": 0.07377700427528282, + "grad_norm": 2.716464042663574, + "learning_rate": 1.797222949123916e-05, + "loss": 1.0184, + "step": 975 + }, + { + "epoch": 0.07385267299761644, + "grad_norm": 2.534637451171875, + "learning_rate": 1.797205241910576e-05, + "loss": 0.769, + "step": 976 + }, + { + "epoch": 0.07392834171995005, + "grad_norm": 2.6971538066864014, + "learning_rate": 1.797187478511341e-05, + "loss": 0.8612, + "step": 977 + }, + { + "epoch": 0.07400401044228368, + "grad_norm": 2.319307565689087, + "learning_rate": 1.797169658927325e-05, + "loss": 0.6711, + "step": 978 + }, + { + "epoch": 0.0740796791646173, + "grad_norm": 3.083146333694458, + "learning_rate": 1.7971517831596428e-05, + "loss": 0.9988, + "step": 979 + }, + { + "epoch": 0.07415534788695093, + "grad_norm": 3.323866367340088, + "learning_rate": 1.7971338512094144e-05, + "loss": 0.83, + "step": 980 + }, + { + "epoch": 0.07423101660928455, + "grad_norm": 2.6332504749298096, + "learning_rate": 1.7971158630777623e-05, + "loss": 0.8075, + "step": 981 + }, + { + "epoch": 0.07430668533161817, + "grad_norm": 3.7535693645477295, + "learning_rate": 1.797097818765813e-05, + "loss": 0.7579, + "step": 982 + }, + { + "epoch": 0.0743823540539518, + "grad_norm": 3.424109697341919, + "learning_rate": 1.797079718274697e-05, + "loss": 0.9592, + "step": 983 + }, + { + "epoch": 0.07445802277628542, + "grad_norm": 2.7965245246887207, + "learning_rate": 1.797061561605548e-05, + "loss": 0.8751, + "step": 984 + }, + { + "epoch": 0.07453369149861905, + "grad_norm": 2.6444272994995117, + "learning_rate": 1.7970433487595018e-05, + "loss": 0.8987, + "step": 985 + }, + { + "epoch": 0.07460936022095267, + "grad_norm": 2.68102765083313, + "learning_rate": 1.7970250797377002e-05, + "loss": 0.8993, + "step": 986 + }, + { + "epoch": 0.0746850289432863, + "grad_norm": 2.6379127502441406, + "learning_rate": 1.7970067545412865e-05, + "loss": 0.7778, + "step": 987 + }, + { + "epoch": 0.07476069766561992, + "grad_norm": 4.525475025177002, + "learning_rate": 1.796988373171409e-05, + "loss": 0.8909, + "step": 988 + }, + { + "epoch": 0.07483636638795355, + "grad_norm": 2.7560689449310303, + "learning_rate": 1.7969699356292177e-05, + "loss": 0.8144, + "step": 989 + }, + { + "epoch": 0.07491203511028716, + "grad_norm": 2.7288384437561035, + "learning_rate": 1.7969514419158682e-05, + "loss": 0.802, + "step": 990 + }, + { + "epoch": 0.07498770383262078, + "grad_norm": 3.1117804050445557, + "learning_rate": 1.7969328920325184e-05, + "loss": 0.7979, + "step": 991 + }, + { + "epoch": 0.0750633725549544, + "grad_norm": 3.190317392349243, + 
"learning_rate": 1.79691428598033e-05, + "loss": 0.773, + "step": 992 + }, + { + "epoch": 0.07513904127728803, + "grad_norm": 4.079197883605957, + "learning_rate": 1.7968956237604678e-05, + "loss": 0.6914, + "step": 993 + }, + { + "epoch": 0.07521470999962165, + "grad_norm": 2.5737321376800537, + "learning_rate": 1.796876905374101e-05, + "loss": 0.752, + "step": 994 + }, + { + "epoch": 0.07529037872195528, + "grad_norm": 3.0443410873413086, + "learning_rate": 1.796858130822401e-05, + "loss": 0.8317, + "step": 995 + }, + { + "epoch": 0.0753660474442889, + "grad_norm": 3.2446975708007812, + "learning_rate": 1.7968393001065445e-05, + "loss": 0.7763, + "step": 996 + }, + { + "epoch": 0.07544171616662253, + "grad_norm": 3.4776625633239746, + "learning_rate": 1.79682041322771e-05, + "loss": 0.6775, + "step": 997 + }, + { + "epoch": 0.07551738488895615, + "grad_norm": 2.343702554702759, + "learning_rate": 1.796801470187081e-05, + "loss": 0.9111, + "step": 998 + }, + { + "epoch": 0.07559305361128978, + "grad_norm": 2.4391534328460693, + "learning_rate": 1.7967824709858428e-05, + "loss": 1.1345, + "step": 999 + }, + { + "epoch": 0.0756687223336234, + "grad_norm": 2.1746954917907715, + "learning_rate": 1.796763415625186e-05, + "loss": 0.6839, + "step": 1000 + }, + { + "epoch": 0.07574439105595702, + "grad_norm": 2.8759877681732178, + "learning_rate": 1.7967443041063037e-05, + "loss": 0.793, + "step": 1001 + }, + { + "epoch": 0.07582005977829065, + "grad_norm": 2.7766287326812744, + "learning_rate": 1.7967251364303927e-05, + "loss": 0.9124, + "step": 1002 + }, + { + "epoch": 0.07589572850062427, + "grad_norm": 3.971047878265381, + "learning_rate": 1.796705912598653e-05, + "loss": 0.9387, + "step": 1003 + }, + { + "epoch": 0.07597139722295788, + "grad_norm": 3.4732584953308105, + "learning_rate": 1.796686632612289e-05, + "loss": 0.7853, + "step": 1004 + }, + { + "epoch": 0.07604706594529151, + "grad_norm": 2.530043125152588, + "learning_rate": 1.7966672964725074e-05, + "loss": 0.7116, + "step": 1005 + }, + { + "epoch": 0.07612273466762513, + "grad_norm": 3.753622531890869, + "learning_rate": 1.79664790418052e-05, + "loss": 0.7789, + "step": 1006 + }, + { + "epoch": 0.07619840338995876, + "grad_norm": 2.8898422718048096, + "learning_rate": 1.7966284557375405e-05, + "loss": 0.8084, + "step": 1007 + }, + { + "epoch": 0.07627407211229238, + "grad_norm": 3.0014569759368896, + "learning_rate": 1.7966089511447872e-05, + "loss": 0.9103, + "step": 1008 + }, + { + "epoch": 0.076349740834626, + "grad_norm": 3.0454745292663574, + "learning_rate": 1.7965893904034813e-05, + "loss": 0.841, + "step": 1009 + }, + { + "epoch": 0.07642540955695963, + "grad_norm": 6.323338031768799, + "learning_rate": 1.7965697735148482e-05, + "loss": 0.7438, + "step": 1010 + }, + { + "epoch": 0.07650107827929326, + "grad_norm": 2.459744930267334, + "learning_rate": 1.7965501004801158e-05, + "loss": 0.7822, + "step": 1011 + }, + { + "epoch": 0.07657674700162688, + "grad_norm": 2.981001138687134, + "learning_rate": 1.796530371300516e-05, + "loss": 0.9066, + "step": 1012 + }, + { + "epoch": 0.0766524157239605, + "grad_norm": 2.747135639190674, + "learning_rate": 1.7965105859772847e-05, + "loss": 0.6591, + "step": 1013 + }, + { + "epoch": 0.07672808444629413, + "grad_norm": 2.3893380165100098, + "learning_rate": 1.796490744511661e-05, + "loss": 0.7462, + "step": 1014 + }, + { + "epoch": 0.07680375316862775, + "grad_norm": 3.2017297744750977, + "learning_rate": 1.796470846904887e-05, + "loss": 0.9729, + "step": 1015 + }, + { + "epoch": 
0.07687942189096138, + "grad_norm": 3.246903896331787, + "learning_rate": 1.7964508931582095e-05, + "loss": 0.6984, + "step": 1016 + }, + { + "epoch": 0.07695509061329499, + "grad_norm": 2.975456953048706, + "learning_rate": 1.7964308832728775e-05, + "loss": 0.8159, + "step": 1017 + }, + { + "epoch": 0.07703075933562861, + "grad_norm": 2.729341506958008, + "learning_rate": 1.796410817250144e-05, + "loss": 0.8509, + "step": 1018 + }, + { + "epoch": 0.07710642805796224, + "grad_norm": 3.228543758392334, + "learning_rate": 1.7963906950912657e-05, + "loss": 0.9637, + "step": 1019 + }, + { + "epoch": 0.07718209678029586, + "grad_norm": 2.6817281246185303, + "learning_rate": 1.7963705167975032e-05, + "loss": 0.9073, + "step": 1020 + }, + { + "epoch": 0.07725776550262949, + "grad_norm": 2.673149585723877, + "learning_rate": 1.7963502823701195e-05, + "loss": 0.7813, + "step": 1021 + }, + { + "epoch": 0.07733343422496311, + "grad_norm": 2.8436264991760254, + "learning_rate": 1.7963299918103818e-05, + "loss": 0.7875, + "step": 1022 + }, + { + "epoch": 0.07740910294729673, + "grad_norm": 3.4467597007751465, + "learning_rate": 1.796309645119561e-05, + "loss": 0.7861, + "step": 1023 + }, + { + "epoch": 0.07748477166963036, + "grad_norm": 4.416311740875244, + "learning_rate": 1.7962892422989313e-05, + "loss": 0.8662, + "step": 1024 + }, + { + "epoch": 0.07756044039196398, + "grad_norm": 2.7716546058654785, + "learning_rate": 1.79626878334977e-05, + "loss": 0.6516, + "step": 1025 + }, + { + "epoch": 0.07763610911429761, + "grad_norm": 2.379066228866577, + "learning_rate": 1.796248268273359e-05, + "loss": 0.7032, + "step": 1026 + }, + { + "epoch": 0.07771177783663123, + "grad_norm": 3.2015442848205566, + "learning_rate": 1.7962276970709827e-05, + "loss": 0.8266, + "step": 1027 + }, + { + "epoch": 0.07778744655896486, + "grad_norm": 2.301879405975342, + "learning_rate": 1.796207069743929e-05, + "loss": 0.8037, + "step": 1028 + }, + { + "epoch": 0.07786311528129848, + "grad_norm": 2.7185168266296387, + "learning_rate": 1.7961863862934897e-05, + "loss": 0.8516, + "step": 1029 + }, + { + "epoch": 0.0779387840036321, + "grad_norm": 3.952467679977417, + "learning_rate": 1.796165646720961e-05, + "loss": 0.8924, + "step": 1030 + }, + { + "epoch": 0.07801445272596572, + "grad_norm": 2.7374305725097656, + "learning_rate": 1.79614485102764e-05, + "loss": 0.8953, + "step": 1031 + }, + { + "epoch": 0.07809012144829934, + "grad_norm": 3.123100996017456, + "learning_rate": 1.7961239992148306e-05, + "loss": 0.9221, + "step": 1032 + }, + { + "epoch": 0.07816579017063296, + "grad_norm": 2.811434507369995, + "learning_rate": 1.7961030912838376e-05, + "loss": 0.7309, + "step": 1033 + }, + { + "epoch": 0.07824145889296659, + "grad_norm": 3.855139970779419, + "learning_rate": 1.796082127235971e-05, + "loss": 0.7561, + "step": 1034 + }, + { + "epoch": 0.07831712761530021, + "grad_norm": 3.214775562286377, + "learning_rate": 1.796061107072543e-05, + "loss": 0.965, + "step": 1035 + }, + { + "epoch": 0.07839279633763384, + "grad_norm": 2.650777578353882, + "learning_rate": 1.7960400307948706e-05, + "loss": 0.9342, + "step": 1036 + }, + { + "epoch": 0.07846846505996746, + "grad_norm": 2.863734722137451, + "learning_rate": 1.796018898404273e-05, + "loss": 0.8635, + "step": 1037 + }, + { + "epoch": 0.07854413378230109, + "grad_norm": 5.087371349334717, + "learning_rate": 1.795997709902074e-05, + "loss": 0.6543, + "step": 1038 + }, + { + "epoch": 0.07861980250463471, + "grad_norm": 2.6036596298217773, + "learning_rate": 
1.7959764652896006e-05, + "loss": 0.8956, + "step": 1039 + }, + { + "epoch": 0.07869547122696834, + "grad_norm": 2.6661086082458496, + "learning_rate": 1.7959551645681827e-05, + "loss": 0.9456, + "step": 1040 + }, + { + "epoch": 0.07877113994930196, + "grad_norm": 2.790140390396118, + "learning_rate": 1.7959338077391547e-05, + "loss": 0.8146, + "step": 1041 + }, + { + "epoch": 0.07884680867163558, + "grad_norm": 3.7499725818634033, + "learning_rate": 1.795912394803854e-05, + "loss": 0.7346, + "step": 1042 + }, + { + "epoch": 0.07892247739396921, + "grad_norm": 2.917370080947876, + "learning_rate": 1.7958909257636214e-05, + "loss": 0.8348, + "step": 1043 + }, + { + "epoch": 0.07899814611630282, + "grad_norm": 2.5935680866241455, + "learning_rate": 1.795869400619801e-05, + "loss": 0.6081, + "step": 1044 + }, + { + "epoch": 0.07907381483863644, + "grad_norm": 2.097604990005493, + "learning_rate": 1.7958478193737412e-05, + "loss": 0.7379, + "step": 1045 + }, + { + "epoch": 0.07914948356097007, + "grad_norm": 2.9579460620880127, + "learning_rate": 1.7958261820267936e-05, + "loss": 0.9227, + "step": 1046 + }, + { + "epoch": 0.0792251522833037, + "grad_norm": 4.755364418029785, + "learning_rate": 1.7958044885803133e-05, + "loss": 0.8909, + "step": 1047 + }, + { + "epoch": 0.07930082100563732, + "grad_norm": 2.9622743129730225, + "learning_rate": 1.7957827390356577e-05, + "loss": 0.6475, + "step": 1048 + }, + { + "epoch": 0.07937648972797094, + "grad_norm": 2.9405174255371094, + "learning_rate": 1.7957609333941906e-05, + "loss": 0.8427, + "step": 1049 + }, + { + "epoch": 0.07945215845030457, + "grad_norm": 2.8209495544433594, + "learning_rate": 1.795739071657276e-05, + "loss": 0.7966, + "step": 1050 + }, + { + "epoch": 0.07952782717263819, + "grad_norm": 2.4763989448547363, + "learning_rate": 1.795717153826284e-05, + "loss": 0.6992, + "step": 1051 + }, + { + "epoch": 0.07960349589497182, + "grad_norm": 2.8910422325134277, + "learning_rate": 1.7956951799025865e-05, + "loss": 0.8601, + "step": 1052 + }, + { + "epoch": 0.07967916461730544, + "grad_norm": 2.9164462089538574, + "learning_rate": 1.7956731498875598e-05, + "loss": 0.8017, + "step": 1053 + }, + { + "epoch": 0.07975483333963906, + "grad_norm": 2.7864677906036377, + "learning_rate": 1.7956510637825835e-05, + "loss": 0.8465, + "step": 1054 + }, + { + "epoch": 0.07983050206197269, + "grad_norm": 3.374191999435425, + "learning_rate": 1.7956289215890405e-05, + "loss": 0.8502, + "step": 1055 + }, + { + "epoch": 0.07990617078430631, + "grad_norm": 4.065507411956787, + "learning_rate": 1.795606723308318e-05, + "loss": 0.8219, + "step": 1056 + }, + { + "epoch": 0.07998183950663992, + "grad_norm": 3.0881083011627197, + "learning_rate": 1.7955844689418055e-05, + "loss": 0.8383, + "step": 1057 + }, + { + "epoch": 0.08005750822897355, + "grad_norm": 2.8912618160247803, + "learning_rate": 1.7955621584908968e-05, + "loss": 1.0209, + "step": 1058 + }, + { + "epoch": 0.08013317695130717, + "grad_norm": 2.972893714904785, + "learning_rate": 1.7955397919569894e-05, + "loss": 0.7862, + "step": 1059 + }, + { + "epoch": 0.0802088456736408, + "grad_norm": 3.151890277862549, + "learning_rate": 1.7955173693414835e-05, + "loss": 0.9524, + "step": 1060 + }, + { + "epoch": 0.08028451439597442, + "grad_norm": 3.1022751331329346, + "learning_rate": 1.7954948906457836e-05, + "loss": 0.9726, + "step": 1061 + }, + { + "epoch": 0.08036018311830805, + "grad_norm": 3.058262825012207, + "learning_rate": 1.7954723558712973e-05, + "loss": 0.8667, + "step": 1062 + }, + { + 
"epoch": 0.08043585184064167, + "grad_norm": 3.0045084953308105, + "learning_rate": 1.7954497650194356e-05, + "loss": 0.7895, + "step": 1063 + }, + { + "epoch": 0.0805115205629753, + "grad_norm": 2.1319167613983154, + "learning_rate": 1.7954271180916137e-05, + "loss": 0.6551, + "step": 1064 + }, + { + "epoch": 0.08058718928530892, + "grad_norm": 4.05554723739624, + "learning_rate": 1.795404415089249e-05, + "loss": 1.0026, + "step": 1065 + }, + { + "epoch": 0.08066285800764254, + "grad_norm": 2.6859283447265625, + "learning_rate": 1.795381656013764e-05, + "loss": 0.7522, + "step": 1066 + }, + { + "epoch": 0.08073852672997617, + "grad_norm": 2.984954833984375, + "learning_rate": 1.795358840866584e-05, + "loss": 0.9283, + "step": 1067 + }, + { + "epoch": 0.08081419545230979, + "grad_norm": 5.049993991851807, + "learning_rate": 1.7953359696491368e-05, + "loss": 0.883, + "step": 1068 + }, + { + "epoch": 0.08088986417464342, + "grad_norm": 3.801880359649658, + "learning_rate": 1.7953130423628558e-05, + "loss": 0.8939, + "step": 1069 + }, + { + "epoch": 0.08096553289697704, + "grad_norm": 4.104866981506348, + "learning_rate": 1.795290059009176e-05, + "loss": 0.725, + "step": 1070 + }, + { + "epoch": 0.08104120161931065, + "grad_norm": 3.0002243518829346, + "learning_rate": 1.7952670195895373e-05, + "loss": 0.9259, + "step": 1071 + }, + { + "epoch": 0.08111687034164428, + "grad_norm": 4.523970127105713, + "learning_rate": 1.7952439241053818e-05, + "loss": 0.8686, + "step": 1072 + }, + { + "epoch": 0.0811925390639779, + "grad_norm": 4.293980598449707, + "learning_rate": 1.7952207725581565e-05, + "loss": 0.9891, + "step": 1073 + }, + { + "epoch": 0.08126820778631152, + "grad_norm": 6.949241638183594, + "learning_rate": 1.7951975649493112e-05, + "loss": 0.7205, + "step": 1074 + }, + { + "epoch": 0.08134387650864515, + "grad_norm": 2.5196433067321777, + "learning_rate": 1.795174301280298e-05, + "loss": 0.7382, + "step": 1075 + }, + { + "epoch": 0.08141954523097877, + "grad_norm": 3.6562304496765137, + "learning_rate": 1.7951509815525758e-05, + "loss": 0.7558, + "step": 1076 + }, + { + "epoch": 0.0814952139533124, + "grad_norm": 2.6832971572875977, + "learning_rate": 1.7951276057676035e-05, + "loss": 0.7999, + "step": 1077 + }, + { + "epoch": 0.08157088267564602, + "grad_norm": 2.6821744441986084, + "learning_rate": 1.795104173926845e-05, + "loss": 0.7422, + "step": 1078 + }, + { + "epoch": 0.08164655139797965, + "grad_norm": 3.0677764415740967, + "learning_rate": 1.795080686031768e-05, + "loss": 0.9085, + "step": 1079 + }, + { + "epoch": 0.08172222012031327, + "grad_norm": 2.698085069656372, + "learning_rate": 1.7950571420838438e-05, + "loss": 0.8755, + "step": 1080 + }, + { + "epoch": 0.0817978888426469, + "grad_norm": 2.651939630508423, + "learning_rate": 1.7950335420845463e-05, + "loss": 0.7144, + "step": 1081 + }, + { + "epoch": 0.08187355756498052, + "grad_norm": 3.2069571018218994, + "learning_rate": 1.7950098860353534e-05, + "loss": 0.834, + "step": 1082 + }, + { + "epoch": 0.08194922628731414, + "grad_norm": 2.9685559272766113, + "learning_rate": 1.7949861739377464e-05, + "loss": 0.9, + "step": 1083 + }, + { + "epoch": 0.08202489500964776, + "grad_norm": 3.860081434249878, + "learning_rate": 1.7949624057932108e-05, + "loss": 0.8808, + "step": 1084 + }, + { + "epoch": 0.08210056373198138, + "grad_norm": 3.6514697074890137, + "learning_rate": 1.7949385816032348e-05, + "loss": 0.8848, + "step": 1085 + }, + { + "epoch": 0.082176232454315, + "grad_norm": 2.7582156658172607, + "learning_rate": 
1.79491470136931e-05, + "loss": 0.8554, + "step": 1086 + }, + { + "epoch": 0.08225190117664863, + "grad_norm": 3.2017011642456055, + "learning_rate": 1.7948907650929322e-05, + "loss": 0.9358, + "step": 1087 + }, + { + "epoch": 0.08232756989898225, + "grad_norm": 2.749764919281006, + "learning_rate": 1.7948667727756e-05, + "loss": 0.7399, + "step": 1088 + }, + { + "epoch": 0.08240323862131588, + "grad_norm": 3.237679958343506, + "learning_rate": 1.7948427244188163e-05, + "loss": 1.0332, + "step": 1089 + }, + { + "epoch": 0.0824789073436495, + "grad_norm": 3.3725643157958984, + "learning_rate": 1.794818620024087e-05, + "loss": 0.8071, + "step": 1090 + }, + { + "epoch": 0.08255457606598313, + "grad_norm": 3.030790090560913, + "learning_rate": 1.7947944595929215e-05, + "loss": 0.973, + "step": 1091 + }, + { + "epoch": 0.08263024478831675, + "grad_norm": 3.141195058822632, + "learning_rate": 1.794770243126833e-05, + "loss": 0.8654, + "step": 1092 + }, + { + "epoch": 0.08270591351065038, + "grad_norm": 2.7971293926239014, + "learning_rate": 1.7947459706273376e-05, + "loss": 0.9006, + "step": 1093 + }, + { + "epoch": 0.082781582232984, + "grad_norm": 2.6309781074523926, + "learning_rate": 1.7947216420959556e-05, + "loss": 0.8958, + "step": 1094 + }, + { + "epoch": 0.08285725095531762, + "grad_norm": 2.7773690223693848, + "learning_rate": 1.7946972575342104e-05, + "loss": 0.7116, + "step": 1095 + }, + { + "epoch": 0.08293291967765125, + "grad_norm": 3.7063984870910645, + "learning_rate": 1.7946728169436292e-05, + "loss": 0.8178, + "step": 1096 + }, + { + "epoch": 0.08300858839998487, + "grad_norm": 2.698293685913086, + "learning_rate": 1.7946483203257426e-05, + "loss": 0.7834, + "step": 1097 + }, + { + "epoch": 0.08308425712231848, + "grad_norm": 3.520792245864868, + "learning_rate": 1.7946237676820842e-05, + "loss": 0.9203, + "step": 1098 + }, + { + "epoch": 0.08315992584465211, + "grad_norm": 5.286828517913818, + "learning_rate": 1.794599159014192e-05, + "loss": 0.9913, + "step": 1099 + }, + { + "epoch": 0.08323559456698573, + "grad_norm": 2.7940480709075928, + "learning_rate": 1.7945744943236073e-05, + "loss": 0.8903, + "step": 1100 + }, + { + "epoch": 0.08331126328931936, + "grad_norm": 3.531196355819702, + "learning_rate": 1.794549773611874e-05, + "loss": 0.9732, + "step": 1101 + }, + { + "epoch": 0.08338693201165298, + "grad_norm": 2.8770856857299805, + "learning_rate": 1.7945249968805412e-05, + "loss": 0.7176, + "step": 1102 + }, + { + "epoch": 0.0834626007339866, + "grad_norm": 2.6823272705078125, + "learning_rate": 1.794500164131159e-05, + "loss": 0.7169, + "step": 1103 + }, + { + "epoch": 0.08353826945632023, + "grad_norm": 14.177001953125, + "learning_rate": 1.794475275365284e-05, + "loss": 0.7207, + "step": 1104 + }, + { + "epoch": 0.08361393817865385, + "grad_norm": 3.2320713996887207, + "learning_rate": 1.7944503305844738e-05, + "loss": 0.799, + "step": 1105 + }, + { + "epoch": 0.08368960690098748, + "grad_norm": 3.453160524368286, + "learning_rate": 1.794425329790291e-05, + "loss": 0.6669, + "step": 1106 + }, + { + "epoch": 0.0837652756233211, + "grad_norm": 2.6871984004974365, + "learning_rate": 1.794400272984301e-05, + "loss": 0.8554, + "step": 1107 + }, + { + "epoch": 0.08384094434565473, + "grad_norm": 3.1583449840545654, + "learning_rate": 1.7943751601680732e-05, + "loss": 0.8331, + "step": 1108 + }, + { + "epoch": 0.08391661306798835, + "grad_norm": 2.431509256362915, + "learning_rate": 1.79434999134318e-05, + "loss": 0.9261, + "step": 1109 + }, + { + "epoch": 
0.08399228179032198, + "grad_norm": 5.224951267242432, + "learning_rate": 1.7943247665111978e-05, + "loss": 0.8823, + "step": 1110 + }, + { + "epoch": 0.08406795051265559, + "grad_norm": 3.4264092445373535, + "learning_rate": 1.7942994856737063e-05, + "loss": 0.7422, + "step": 1111 + }, + { + "epoch": 0.08414361923498921, + "grad_norm": 3.036907196044922, + "learning_rate": 1.7942741488322882e-05, + "loss": 0.8846, + "step": 1112 + }, + { + "epoch": 0.08421928795732284, + "grad_norm": 2.8463785648345947, + "learning_rate": 1.7942487559885306e-05, + "loss": 0.8043, + "step": 1113 + }, + { + "epoch": 0.08429495667965646, + "grad_norm": 3.0611727237701416, + "learning_rate": 1.7942233071440235e-05, + "loss": 0.7897, + "step": 1114 + }, + { + "epoch": 0.08437062540199008, + "grad_norm": 3.0145342350006104, + "learning_rate": 1.7941978023003604e-05, + "loss": 0.8423, + "step": 1115 + }, + { + "epoch": 0.08444629412432371, + "grad_norm": 2.9739110469818115, + "learning_rate": 1.794172241459139e-05, + "loss": 0.9366, + "step": 1116 + }, + { + "epoch": 0.08452196284665733, + "grad_norm": 3.091020345687866, + "learning_rate": 1.7941466246219597e-05, + "loss": 0.7762, + "step": 1117 + }, + { + "epoch": 0.08459763156899096, + "grad_norm": 2.4491589069366455, + "learning_rate": 1.7941209517904267e-05, + "loss": 0.791, + "step": 1118 + }, + { + "epoch": 0.08467330029132458, + "grad_norm": 3.116558313369751, + "learning_rate": 1.794095222966148e-05, + "loss": 0.8275, + "step": 1119 + }, + { + "epoch": 0.0847489690136582, + "grad_norm": 3.4584758281707764, + "learning_rate": 1.7940694381507345e-05, + "loss": 0.9236, + "step": 1120 + }, + { + "epoch": 0.08482463773599183, + "grad_norm": 3.4700706005096436, + "learning_rate": 1.794043597345801e-05, + "loss": 0.9343, + "step": 1121 + }, + { + "epoch": 0.08490030645832546, + "grad_norm": 2.679157257080078, + "learning_rate": 1.7940177005529653e-05, + "loss": 0.908, + "step": 1122 + }, + { + "epoch": 0.08497597518065908, + "grad_norm": 4.534050941467285, + "learning_rate": 1.7939917477738502e-05, + "loss": 0.761, + "step": 1123 + }, + { + "epoch": 0.0850516439029927, + "grad_norm": 3.0127124786376953, + "learning_rate": 1.79396573901008e-05, + "loss": 0.7555, + "step": 1124 + }, + { + "epoch": 0.08512731262532632, + "grad_norm": 2.5386579036712646, + "learning_rate": 1.793939674263284e-05, + "loss": 0.8402, + "step": 1125 + }, + { + "epoch": 0.08520298134765994, + "grad_norm": 3.1187853813171387, + "learning_rate": 1.793913553535094e-05, + "loss": 0.7755, + "step": 1126 + }, + { + "epoch": 0.08527865006999356, + "grad_norm": 3.3859164714813232, + "learning_rate": 1.793887376827146e-05, + "loss": 0.7611, + "step": 1127 + }, + { + "epoch": 0.08535431879232719, + "grad_norm": 2.509925127029419, + "learning_rate": 1.7938611441410795e-05, + "loss": 0.7936, + "step": 1128 + }, + { + "epoch": 0.08542998751466081, + "grad_norm": 2.7846434116363525, + "learning_rate": 1.793834855478537e-05, + "loss": 0.6917, + "step": 1129 + }, + { + "epoch": 0.08550565623699444, + "grad_norm": 3.471247673034668, + "learning_rate": 1.7938085108411648e-05, + "loss": 0.8178, + "step": 1130 + }, + { + "epoch": 0.08558132495932806, + "grad_norm": 2.7567524909973145, + "learning_rate": 1.7937821102306127e-05, + "loss": 0.8146, + "step": 1131 + }, + { + "epoch": 0.08565699368166169, + "grad_norm": 3.3898468017578125, + "learning_rate": 1.793755653648534e-05, + "loss": 0.7553, + "step": 1132 + }, + { + "epoch": 0.08573266240399531, + "grad_norm": 7.726632595062256, + "learning_rate": 
1.7937291410965855e-05, + "loss": 0.803, + "step": 1133 + }, + { + "epoch": 0.08580833112632894, + "grad_norm": 3.307058811187744, + "learning_rate": 1.7937025725764273e-05, + "loss": 0.9423, + "step": 1134 + }, + { + "epoch": 0.08588399984866256, + "grad_norm": 3.0027151107788086, + "learning_rate": 1.793675948089724e-05, + "loss": 0.8663, + "step": 1135 + }, + { + "epoch": 0.08595966857099618, + "grad_norm": 2.042656898498535, + "learning_rate": 1.793649267638142e-05, + "loss": 1.0062, + "step": 1136 + }, + { + "epoch": 0.08603533729332981, + "grad_norm": 4.205798625946045, + "learning_rate": 1.7936225312233523e-05, + "loss": 0.7688, + "step": 1137 + }, + { + "epoch": 0.08611100601566342, + "grad_norm": 2.7063050270080566, + "learning_rate": 1.7935957388470297e-05, + "loss": 0.7372, + "step": 1138 + }, + { + "epoch": 0.08618667473799704, + "grad_norm": 2.97853422164917, + "learning_rate": 1.7935688905108513e-05, + "loss": 0.8734, + "step": 1139 + }, + { + "epoch": 0.08626234346033067, + "grad_norm": 2.819689989089966, + "learning_rate": 1.793541986216499e-05, + "loss": 0.8523, + "step": 1140 + }, + { + "epoch": 0.08633801218266429, + "grad_norm": 3.4364449977874756, + "learning_rate": 1.7935150259656575e-05, + "loss": 0.9452, + "step": 1141 + }, + { + "epoch": 0.08641368090499792, + "grad_norm": 3.2552027702331543, + "learning_rate": 1.7934880097600153e-05, + "loss": 0.955, + "step": 1142 + }, + { + "epoch": 0.08648934962733154, + "grad_norm": 2.6701014041900635, + "learning_rate": 1.7934609376012637e-05, + "loss": 0.909, + "step": 1143 + }, + { + "epoch": 0.08656501834966517, + "grad_norm": 3.5823423862457275, + "learning_rate": 1.7934338094910986e-05, + "loss": 0.7366, + "step": 1144 + }, + { + "epoch": 0.08664068707199879, + "grad_norm": 4.914628505706787, + "learning_rate": 1.7934066254312185e-05, + "loss": 0.7957, + "step": 1145 + }, + { + "epoch": 0.08671635579433241, + "grad_norm": 3.194420576095581, + "learning_rate": 1.7933793854233258e-05, + "loss": 0.8123, + "step": 1146 + }, + { + "epoch": 0.08679202451666604, + "grad_norm": 3.094132900238037, + "learning_rate": 1.7933520894691268e-05, + "loss": 0.9446, + "step": 1147 + }, + { + "epoch": 0.08686769323899966, + "grad_norm": 3.0485928058624268, + "learning_rate": 1.7933247375703302e-05, + "loss": 0.8139, + "step": 1148 + }, + { + "epoch": 0.08694336196133329, + "grad_norm": 2.725642681121826, + "learning_rate": 1.7932973297286493e-05, + "loss": 0.8204, + "step": 1149 + }, + { + "epoch": 0.08701903068366691, + "grad_norm": 2.9291388988494873, + "learning_rate": 1.7932698659458002e-05, + "loss": 0.7845, + "step": 1150 + }, + { + "epoch": 0.08709469940600054, + "grad_norm": 3.9883525371551514, + "learning_rate": 1.793242346223503e-05, + "loss": 0.9038, + "step": 1151 + }, + { + "epoch": 0.08717036812833415, + "grad_norm": 2.8218777179718018, + "learning_rate": 1.7932147705634813e-05, + "loss": 0.9964, + "step": 1152 + }, + { + "epoch": 0.08724603685066777, + "grad_norm": 3.1680803298950195, + "learning_rate": 1.7931871389674615e-05, + "loss": 0.8961, + "step": 1153 + }, + { + "epoch": 0.0873217055730014, + "grad_norm": 3.147364616394043, + "learning_rate": 1.7931594514371738e-05, + "loss": 0.8748, + "step": 1154 + }, + { + "epoch": 0.08739737429533502, + "grad_norm": 2.9208531379699707, + "learning_rate": 1.7931317079743526e-05, + "loss": 0.8665, + "step": 1155 + }, + { + "epoch": 0.08747304301766864, + "grad_norm": 3.3694779872894287, + "learning_rate": 1.793103908580735e-05, + "loss": 0.955, + "step": 1156 + }, + { + "epoch": 
0.08754871174000227, + "grad_norm": 2.678873300552368, + "learning_rate": 1.793076053258062e-05, + "loss": 0.7752, + "step": 1157 + }, + { + "epoch": 0.0876243804623359, + "grad_norm": 2.5789995193481445, + "learning_rate": 1.793048142008078e-05, + "loss": 0.8021, + "step": 1158 + }, + { + "epoch": 0.08770004918466952, + "grad_norm": 2.937720537185669, + "learning_rate": 1.793020174832531e-05, + "loss": 0.7064, + "step": 1159 + }, + { + "epoch": 0.08777571790700314, + "grad_norm": 3.5758986473083496, + "learning_rate": 1.7929921517331725e-05, + "loss": 0.7418, + "step": 1160 + }, + { + "epoch": 0.08785138662933677, + "grad_norm": 3.9661405086517334, + "learning_rate": 1.792964072711757e-05, + "loss": 0.9024, + "step": 1161 + }, + { + "epoch": 0.08792705535167039, + "grad_norm": 2.9074490070343018, + "learning_rate": 1.792935937770043e-05, + "loss": 1.0127, + "step": 1162 + }, + { + "epoch": 0.08800272407400402, + "grad_norm": 3.0278260707855225, + "learning_rate": 1.7929077469097923e-05, + "loss": 0.8291, + "step": 1163 + }, + { + "epoch": 0.08807839279633764, + "grad_norm": 2.941364049911499, + "learning_rate": 1.792879500132771e-05, + "loss": 0.7886, + "step": 1164 + }, + { + "epoch": 0.08815406151867125, + "grad_norm": 3.2918078899383545, + "learning_rate": 1.7928511974407468e-05, + "loss": 0.8103, + "step": 1165 + }, + { + "epoch": 0.08822973024100488, + "grad_norm": 3.120277166366577, + "learning_rate": 1.7928228388354932e-05, + "loss": 0.8215, + "step": 1166 + }, + { + "epoch": 0.0883053989633385, + "grad_norm": 3.4401562213897705, + "learning_rate": 1.7927944243187857e-05, + "loss": 0.8355, + "step": 1167 + }, + { + "epoch": 0.08838106768567212, + "grad_norm": 3.9332945346832275, + "learning_rate": 1.7927659538924037e-05, + "loss": 0.966, + "step": 1168 + }, + { + "epoch": 0.08845673640800575, + "grad_norm": 3.2182464599609375, + "learning_rate": 1.7927374275581298e-05, + "loss": 0.8184, + "step": 1169 + }, + { + "epoch": 0.08853240513033937, + "grad_norm": 4.130304336547852, + "learning_rate": 1.792708845317751e-05, + "loss": 0.8465, + "step": 1170 + }, + { + "epoch": 0.088608073852673, + "grad_norm": 3.9958245754241943, + "learning_rate": 1.792680207173057e-05, + "loss": 0.7846, + "step": 1171 + }, + { + "epoch": 0.08868374257500662, + "grad_norm": 3.126950740814209, + "learning_rate": 1.792651513125841e-05, + "loss": 0.9448, + "step": 1172 + }, + { + "epoch": 0.08875941129734025, + "grad_norm": 2.5483791828155518, + "learning_rate": 1.7926227631779e-05, + "loss": 0.818, + "step": 1173 + }, + { + "epoch": 0.08883508001967387, + "grad_norm": 3.175549268722534, + "learning_rate": 1.7925939573310348e-05, + "loss": 0.8815, + "step": 1174 + }, + { + "epoch": 0.0889107487420075, + "grad_norm": 3.0271804332733154, + "learning_rate": 1.7925650955870484e-05, + "loss": 0.8213, + "step": 1175 + }, + { + "epoch": 0.08898641746434112, + "grad_norm": 4.855120658874512, + "learning_rate": 1.7925361779477492e-05, + "loss": 0.7978, + "step": 1176 + }, + { + "epoch": 0.08906208618667474, + "grad_norm": 2.6035373210906982, + "learning_rate": 1.792507204414948e-05, + "loss": 0.7594, + "step": 1177 + }, + { + "epoch": 0.08913775490900837, + "grad_norm": 2.7867379188537598, + "learning_rate": 1.7924781749904583e-05, + "loss": 0.9074, + "step": 1178 + }, + { + "epoch": 0.08921342363134198, + "grad_norm": 2.4809141159057617, + "learning_rate": 1.792449089676099e-05, + "loss": 0.7016, + "step": 1179 + }, + { + "epoch": 0.0892890923536756, + "grad_norm": 3.8243072032928467, + "learning_rate": 
1.7924199484736912e-05, + "loss": 0.9437, + "step": 1180 + }, + { + "epoch": 0.08936476107600923, + "grad_norm": 2.4355239868164062, + "learning_rate": 1.7923907513850598e-05, + "loss": 0.7985, + "step": 1181 + }, + { + "epoch": 0.08944042979834285, + "grad_norm": 3.5210628509521484, + "learning_rate": 1.792361498412033e-05, + "loss": 0.9034, + "step": 1182 + }, + { + "epoch": 0.08951609852067648, + "grad_norm": 3.205606460571289, + "learning_rate": 1.7923321895564434e-05, + "loss": 0.836, + "step": 1183 + }, + { + "epoch": 0.0895917672430101, + "grad_norm": 2.4469003677368164, + "learning_rate": 1.7923028248201254e-05, + "loss": 0.8001, + "step": 1184 + }, + { + "epoch": 0.08966743596534373, + "grad_norm": 3.101621150970459, + "learning_rate": 1.792273404204919e-05, + "loss": 0.9058, + "step": 1185 + }, + { + "epoch": 0.08974310468767735, + "grad_norm": 2.8717565536499023, + "learning_rate": 1.7922439277126656e-05, + "loss": 0.8039, + "step": 1186 + }, + { + "epoch": 0.08981877341001097, + "grad_norm": 2.7401671409606934, + "learning_rate": 1.7922143953452117e-05, + "loss": 0.8037, + "step": 1187 + }, + { + "epoch": 0.0898944421323446, + "grad_norm": 3.1042513847351074, + "learning_rate": 1.7921848071044065e-05, + "loss": 0.8021, + "step": 1188 + }, + { + "epoch": 0.08997011085467822, + "grad_norm": 3.410513162612915, + "learning_rate": 1.7921551629921033e-05, + "loss": 0.7713, + "step": 1189 + }, + { + "epoch": 0.09004577957701185, + "grad_norm": 2.913480758666992, + "learning_rate": 1.792125463010158e-05, + "loss": 0.6799, + "step": 1190 + }, + { + "epoch": 0.09012144829934547, + "grad_norm": 3.294175863265991, + "learning_rate": 1.792095707160431e-05, + "loss": 0.9056, + "step": 1191 + }, + { + "epoch": 0.09019711702167908, + "grad_norm": 3.1891300678253174, + "learning_rate": 1.792065895444785e-05, + "loss": 0.8954, + "step": 1192 + }, + { + "epoch": 0.0902727857440127, + "grad_norm": 2.7054443359375, + "learning_rate": 1.7920360278650874e-05, + "loss": 0.8146, + "step": 1193 + }, + { + "epoch": 0.09034845446634633, + "grad_norm": 2.5543649196624756, + "learning_rate": 1.7920061044232086e-05, + "loss": 0.7524, + "step": 1194 + }, + { + "epoch": 0.09042412318867996, + "grad_norm": 2.7355525493621826, + "learning_rate": 1.7919761251210227e-05, + "loss": 0.8009, + "step": 1195 + }, + { + "epoch": 0.09049979191101358, + "grad_norm": 2.5173544883728027, + "learning_rate": 1.7919460899604065e-05, + "loss": 0.8508, + "step": 1196 + }, + { + "epoch": 0.0905754606333472, + "grad_norm": 4.627716064453125, + "learning_rate": 1.7919159989432417e-05, + "loss": 0.7769, + "step": 1197 + }, + { + "epoch": 0.09065112935568083, + "grad_norm": 3.5971720218658447, + "learning_rate": 1.7918858520714118e-05, + "loss": 0.7218, + "step": 1198 + }, + { + "epoch": 0.09072679807801445, + "grad_norm": 3.731093168258667, + "learning_rate": 1.791855649346805e-05, + "loss": 0.9134, + "step": 1199 + }, + { + "epoch": 0.09080246680034808, + "grad_norm": 3.367734670639038, + "learning_rate": 1.7918253907713133e-05, + "loss": 0.7829, + "step": 1200 + }, + { + "epoch": 0.0908781355226817, + "grad_norm": 2.7320973873138428, + "learning_rate": 1.791795076346831e-05, + "loss": 0.8174, + "step": 1201 + }, + { + "epoch": 0.09095380424501533, + "grad_norm": 2.977246046066284, + "learning_rate": 1.7917647060752562e-05, + "loss": 0.9902, + "step": 1202 + }, + { + "epoch": 0.09102947296734895, + "grad_norm": 2.8450841903686523, + "learning_rate": 1.7917342799584916e-05, + "loss": 0.7602, + "step": 1203 + }, + { + "epoch": 
0.09110514168968258, + "grad_norm": 2.729694128036499, + "learning_rate": 1.791703797998442e-05, + "loss": 0.7567, + "step": 1204 + }, + { + "epoch": 0.0911808104120162, + "grad_norm": 2.8043293952941895, + "learning_rate": 1.7916732601970166e-05, + "loss": 0.8673, + "step": 1205 + }, + { + "epoch": 0.09125647913434981, + "grad_norm": 2.3596503734588623, + "learning_rate": 1.7916426665561276e-05, + "loss": 0.6582, + "step": 1206 + }, + { + "epoch": 0.09133214785668343, + "grad_norm": 3.7093918323516846, + "learning_rate": 1.791612017077691e-05, + "loss": 0.8625, + "step": 1207 + }, + { + "epoch": 0.09140781657901706, + "grad_norm": 3.144490957260132, + "learning_rate": 1.791581311763626e-05, + "loss": 0.7741, + "step": 1208 + }, + { + "epoch": 0.09148348530135068, + "grad_norm": 2.924307346343994, + "learning_rate": 1.7915505506158553e-05, + "loss": 0.7959, + "step": 1209 + }, + { + "epoch": 0.09155915402368431, + "grad_norm": 3.3152287006378174, + "learning_rate": 1.7915197336363054e-05, + "loss": 0.928, + "step": 1210 + }, + { + "epoch": 0.09163482274601793, + "grad_norm": 3.4837348461151123, + "learning_rate": 1.7914888608269068e-05, + "loss": 0.8039, + "step": 1211 + }, + { + "epoch": 0.09171049146835156, + "grad_norm": 3.198897361755371, + "learning_rate": 1.791457932189592e-05, + "loss": 0.8471, + "step": 1212 + }, + { + "epoch": 0.09178616019068518, + "grad_norm": 2.604722023010254, + "learning_rate": 1.791426947726298e-05, + "loss": 0.7698, + "step": 1213 + }, + { + "epoch": 0.0918618289130188, + "grad_norm": 5.606075286865234, + "learning_rate": 1.7913959074389655e-05, + "loss": 0.7748, + "step": 1214 + }, + { + "epoch": 0.09193749763535243, + "grad_norm": 3.001539707183838, + "learning_rate": 1.7913648113295382e-05, + "loss": 0.8456, + "step": 1215 + }, + { + "epoch": 0.09201316635768605, + "grad_norm": 2.6370272636413574, + "learning_rate": 1.7913336593999634e-05, + "loss": 0.8696, + "step": 1216 + }, + { + "epoch": 0.09208883508001968, + "grad_norm": 3.1904518604278564, + "learning_rate": 1.791302451652192e-05, + "loss": 0.7573, + "step": 1217 + }, + { + "epoch": 0.0921645038023533, + "grad_norm": 4.082500457763672, + "learning_rate": 1.791271188088178e-05, + "loss": 0.6788, + "step": 1218 + }, + { + "epoch": 0.09224017252468691, + "grad_norm": 3.6102075576782227, + "learning_rate": 1.7912398687098794e-05, + "loss": 0.867, + "step": 1219 + }, + { + "epoch": 0.09231584124702054, + "grad_norm": 2.7773969173431396, + "learning_rate": 1.7912084935192577e-05, + "loss": 0.7679, + "step": 1220 + }, + { + "epoch": 0.09239150996935416, + "grad_norm": 2.8499839305877686, + "learning_rate": 1.791177062518278e-05, + "loss": 0.819, + "step": 1221 + }, + { + "epoch": 0.09246717869168779, + "grad_norm": 2.839503765106201, + "learning_rate": 1.7911455757089076e-05, + "loss": 0.8847, + "step": 1222 + }, + { + "epoch": 0.09254284741402141, + "grad_norm": 3.260828733444214, + "learning_rate": 1.7911140330931193e-05, + "loss": 0.8336, + "step": 1223 + }, + { + "epoch": 0.09261851613635504, + "grad_norm": 2.5245110988616943, + "learning_rate": 1.7910824346728882e-05, + "loss": 1.0465, + "step": 1224 + }, + { + "epoch": 0.09269418485868866, + "grad_norm": 3.2489395141601562, + "learning_rate": 1.7910507804501925e-05, + "loss": 0.7982, + "step": 1225 + }, + { + "epoch": 0.09276985358102229, + "grad_norm": 2.756540060043335, + "learning_rate": 1.7910190704270155e-05, + "loss": 0.818, + "step": 1226 + }, + { + "epoch": 0.09284552230335591, + "grad_norm": 2.708287477493286, + "learning_rate": 
1.7909873046053417e-05, + "loss": 0.7818, + "step": 1227 + }, + { + "epoch": 0.09292119102568953, + "grad_norm": 3.1949870586395264, + "learning_rate": 1.7909554829871615e-05, + "loss": 0.7934, + "step": 1228 + }, + { + "epoch": 0.09299685974802316, + "grad_norm": 3.080357074737549, + "learning_rate": 1.7909236055744675e-05, + "loss": 0.7607, + "step": 1229 + }, + { + "epoch": 0.09307252847035678, + "grad_norm": 3.3811452388763428, + "learning_rate": 1.790891672369255e-05, + "loss": 0.9556, + "step": 1230 + }, + { + "epoch": 0.09314819719269041, + "grad_norm": 2.7672924995422363, + "learning_rate": 1.790859683373525e-05, + "loss": 0.7895, + "step": 1231 + }, + { + "epoch": 0.09322386591502403, + "grad_norm": 2.8778274059295654, + "learning_rate": 1.7908276385892802e-05, + "loss": 0.7614, + "step": 1232 + }, + { + "epoch": 0.09329953463735764, + "grad_norm": 3.135817289352417, + "learning_rate": 1.7907955380185276e-05, + "loss": 0.7486, + "step": 1233 + }, + { + "epoch": 0.09337520335969127, + "grad_norm": 2.9601290225982666, + "learning_rate": 1.790763381663277e-05, + "loss": 0.7852, + "step": 1234 + }, + { + "epoch": 0.09345087208202489, + "grad_norm": 2.9267449378967285, + "learning_rate": 1.790731169525542e-05, + "loss": 0.9109, + "step": 1235 + }, + { + "epoch": 0.09352654080435852, + "grad_norm": 2.473426342010498, + "learning_rate": 1.7906989016073405e-05, + "loss": 0.8989, + "step": 1236 + }, + { + "epoch": 0.09360220952669214, + "grad_norm": 3.5370841026306152, + "learning_rate": 1.790666577910693e-05, + "loss": 0.7684, + "step": 1237 + }, + { + "epoch": 0.09367787824902576, + "grad_norm": 2.9519424438476562, + "learning_rate": 1.7906341984376237e-05, + "loss": 0.8645, + "step": 1238 + }, + { + "epoch": 0.09375354697135939, + "grad_norm": 3.7128260135650635, + "learning_rate": 1.79060176319016e-05, + "loss": 0.903, + "step": 1239 + }, + { + "epoch": 0.09382921569369301, + "grad_norm": 2.769270420074463, + "learning_rate": 1.7905692721703332e-05, + "loss": 0.7397, + "step": 1240 + }, + { + "epoch": 0.09390488441602664, + "grad_norm": 2.524073600769043, + "learning_rate": 1.7905367253801784e-05, + "loss": 0.6321, + "step": 1241 + }, + { + "epoch": 0.09398055313836026, + "grad_norm": 2.9674630165100098, + "learning_rate": 1.7905041228217335e-05, + "loss": 0.8209, + "step": 1242 + }, + { + "epoch": 0.09405622186069389, + "grad_norm": 2.350693941116333, + "learning_rate": 1.79047146449704e-05, + "loss": 0.8143, + "step": 1243 + }, + { + "epoch": 0.09413189058302751, + "grad_norm": 2.93047833442688, + "learning_rate": 1.7904387504081435e-05, + "loss": 0.8576, + "step": 1244 + }, + { + "epoch": 0.09420755930536114, + "grad_norm": 2.9106175899505615, + "learning_rate": 1.7904059805570923e-05, + "loss": 0.7653, + "step": 1245 + }, + { + "epoch": 0.09428322802769475, + "grad_norm": 2.6790316104888916, + "learning_rate": 1.7903731549459388e-05, + "loss": 0.8273, + "step": 1246 + }, + { + "epoch": 0.09435889675002837, + "grad_norm": 2.455004930496216, + "learning_rate": 1.7903402735767385e-05, + "loss": 0.7973, + "step": 1247 + }, + { + "epoch": 0.094434565472362, + "grad_norm": 3.4603185653686523, + "learning_rate": 1.7903073364515504e-05, + "loss": 0.7254, + "step": 1248 + }, + { + "epoch": 0.09451023419469562, + "grad_norm": 3.0400404930114746, + "learning_rate": 1.790274343572437e-05, + "loss": 0.8121, + "step": 1249 + }, + { + "epoch": 0.09458590291702924, + "grad_norm": 2.6459450721740723, + "learning_rate": 1.7902412949414652e-05, + "loss": 0.871, + "step": 1250 + }, + { + "epoch": 
0.09466157163936287, + "grad_norm": 3.258230686187744, + "learning_rate": 1.790208190560704e-05, + "loss": 0.8936, + "step": 1251 + }, + { + "epoch": 0.09473724036169649, + "grad_norm": 3.0523667335510254, + "learning_rate": 1.7901750304322267e-05, + "loss": 0.8872, + "step": 1252 + }, + { + "epoch": 0.09481290908403012, + "grad_norm": 3.323202610015869, + "learning_rate": 1.79014181455811e-05, + "loss": 0.9111, + "step": 1253 + }, + { + "epoch": 0.09488857780636374, + "grad_norm": 2.76963210105896, + "learning_rate": 1.7901085429404335e-05, + "loss": 0.8893, + "step": 1254 + }, + { + "epoch": 0.09496424652869737, + "grad_norm": 2.809248685836792, + "learning_rate": 1.790075215581281e-05, + "loss": 0.8977, + "step": 1255 + }, + { + "epoch": 0.09503991525103099, + "grad_norm": 3.7128524780273438, + "learning_rate": 1.79004183248274e-05, + "loss": 0.846, + "step": 1256 + }, + { + "epoch": 0.09511558397336461, + "grad_norm": 2.7078073024749756, + "learning_rate": 1.7900083936469003e-05, + "loss": 0.8137, + "step": 1257 + }, + { + "epoch": 0.09519125269569824, + "grad_norm": 3.0570812225341797, + "learning_rate": 1.7899748990758564e-05, + "loss": 0.9453, + "step": 1258 + }, + { + "epoch": 0.09526692141803185, + "grad_norm": 3.7988598346710205, + "learning_rate": 1.789941348771706e-05, + "loss": 0.6545, + "step": 1259 + }, + { + "epoch": 0.09534259014036547, + "grad_norm": 2.1266133785247803, + "learning_rate": 1.7899077427365496e-05, + "loss": 0.7383, + "step": 1260 + }, + { + "epoch": 0.0954182588626991, + "grad_norm": 2.4982497692108154, + "learning_rate": 1.7898740809724925e-05, + "loss": 0.6417, + "step": 1261 + }, + { + "epoch": 0.09549392758503272, + "grad_norm": 3.788729190826416, + "learning_rate": 1.789840363481642e-05, + "loss": 0.7754, + "step": 1262 + }, + { + "epoch": 0.09556959630736635, + "grad_norm": 2.876657009124756, + "learning_rate": 1.78980659026611e-05, + "loss": 0.8848, + "step": 1263 + }, + { + "epoch": 0.09564526502969997, + "grad_norm": 2.669171094894409, + "learning_rate": 1.789772761328011e-05, + "loss": 0.7351, + "step": 1264 + }, + { + "epoch": 0.0957209337520336, + "grad_norm": 2.671163558959961, + "learning_rate": 1.7897388766694643e-05, + "loss": 0.9625, + "step": 1265 + }, + { + "epoch": 0.09579660247436722, + "grad_norm": 3.019002676010132, + "learning_rate": 1.789704936292591e-05, + "loss": 0.6518, + "step": 1266 + }, + { + "epoch": 0.09587227119670085, + "grad_norm": 2.8676607608795166, + "learning_rate": 1.789670940199517e-05, + "loss": 0.8793, + "step": 1267 + }, + { + "epoch": 0.09594793991903447, + "grad_norm": 2.384054660797119, + "learning_rate": 1.789636888392371e-05, + "loss": 0.8888, + "step": 1268 + }, + { + "epoch": 0.0960236086413681, + "grad_norm": 4.045263290405273, + "learning_rate": 1.789602780873286e-05, + "loss": 0.8233, + "step": 1269 + }, + { + "epoch": 0.09609927736370172, + "grad_norm": 2.7333438396453857, + "learning_rate": 1.7895686176443973e-05, + "loss": 0.8029, + "step": 1270 + }, + { + "epoch": 0.09617494608603534, + "grad_norm": 2.8120603561401367, + "learning_rate": 1.7895343987078446e-05, + "loss": 0.9133, + "step": 1271 + }, + { + "epoch": 0.09625061480836897, + "grad_norm": 2.5713160037994385, + "learning_rate": 1.789500124065771e-05, + "loss": 0.8908, + "step": 1272 + }, + { + "epoch": 0.09632628353070258, + "grad_norm": 3.9488155841827393, + "learning_rate": 1.7894657937203222e-05, + "loss": 0.8737, + "step": 1273 + }, + { + "epoch": 0.0964019522530362, + "grad_norm": 4.038736820220947, + "learning_rate": 
1.7894314076736486e-05, + "loss": 0.9211, + "step": 1274 + }, + { + "epoch": 0.09647762097536983, + "grad_norm": 2.971062660217285, + "learning_rate": 1.789396965927904e-05, + "loss": 0.8733, + "step": 1275 + }, + { + "epoch": 0.09655328969770345, + "grad_norm": 2.516186237335205, + "learning_rate": 1.789362468485244e-05, + "loss": 0.7466, + "step": 1276 + }, + { + "epoch": 0.09662895842003708, + "grad_norm": 3.4188225269317627, + "learning_rate": 1.78932791534783e-05, + "loss": 0.9705, + "step": 1277 + }, + { + "epoch": 0.0967046271423707, + "grad_norm": 2.534454822540283, + "learning_rate": 1.7892933065178257e-05, + "loss": 0.8904, + "step": 1278 + }, + { + "epoch": 0.09678029586470432, + "grad_norm": 2.6995832920074463, + "learning_rate": 1.789258641997398e-05, + "loss": 0.6556, + "step": 1279 + }, + { + "epoch": 0.09685596458703795, + "grad_norm": 3.069735050201416, + "learning_rate": 1.789223921788718e-05, + "loss": 0.9564, + "step": 1280 + }, + { + "epoch": 0.09693163330937157, + "grad_norm": 4.048869609832764, + "learning_rate": 1.7891891458939597e-05, + "loss": 0.846, + "step": 1281 + }, + { + "epoch": 0.0970073020317052, + "grad_norm": 2.709456443786621, + "learning_rate": 1.7891543143153014e-05, + "loss": 0.7941, + "step": 1282 + }, + { + "epoch": 0.09708297075403882, + "grad_norm": 3.2407002449035645, + "learning_rate": 1.7891194270549238e-05, + "loss": 0.7715, + "step": 1283 + }, + { + "epoch": 0.09715863947637245, + "grad_norm": 4.350454807281494, + "learning_rate": 1.7890844841150122e-05, + "loss": 0.8406, + "step": 1284 + }, + { + "epoch": 0.09723430819870607, + "grad_norm": 3.1144919395446777, + "learning_rate": 1.789049485497754e-05, + "loss": 0.9677, + "step": 1285 + }, + { + "epoch": 0.09730997692103968, + "grad_norm": 5.615466594696045, + "learning_rate": 1.7890144312053423e-05, + "loss": 0.8997, + "step": 1286 + }, + { + "epoch": 0.0973856456433733, + "grad_norm": 3.7363674640655518, + "learning_rate": 1.788979321239971e-05, + "loss": 0.8649, + "step": 1287 + }, + { + "epoch": 0.09746131436570693, + "grad_norm": 2.72586989402771, + "learning_rate": 1.7889441556038394e-05, + "loss": 0.8034, + "step": 1288 + }, + { + "epoch": 0.09753698308804055, + "grad_norm": 3.393902540206909, + "learning_rate": 1.7889089342991495e-05, + "loss": 1.0079, + "step": 1289 + }, + { + "epoch": 0.09761265181037418, + "grad_norm": 2.7981932163238525, + "learning_rate": 1.7888736573281073e-05, + "loss": 0.7422, + "step": 1290 + }, + { + "epoch": 0.0976883205327078, + "grad_norm": 3.0770058631896973, + "learning_rate": 1.7888383246929213e-05, + "loss": 0.8174, + "step": 1291 + }, + { + "epoch": 0.09776398925504143, + "grad_norm": 3.3533740043640137, + "learning_rate": 1.7888029363958048e-05, + "loss": 0.753, + "step": 1292 + }, + { + "epoch": 0.09783965797737505, + "grad_norm": 3.7199461460113525, + "learning_rate": 1.788767492438974e-05, + "loss": 0.7974, + "step": 1293 + }, + { + "epoch": 0.09791532669970868, + "grad_norm": 3.1728827953338623, + "learning_rate": 1.788731992824648e-05, + "loss": 0.6211, + "step": 1294 + }, + { + "epoch": 0.0979909954220423, + "grad_norm": 3.2378153800964355, + "learning_rate": 1.7886964375550497e-05, + "loss": 0.8974, + "step": 1295 + }, + { + "epoch": 0.09806666414437593, + "grad_norm": 2.729747772216797, + "learning_rate": 1.7886608266324063e-05, + "loss": 0.8367, + "step": 1296 + }, + { + "epoch": 0.09814233286670955, + "grad_norm": 3.429668664932251, + "learning_rate": 1.7886251600589478e-05, + "loss": 0.8581, + "step": 1297 + }, + { + "epoch": 
0.09821800158904317, + "grad_norm": 2.603367567062378, + "learning_rate": 1.7885894378369077e-05, + "loss": 0.789, + "step": 1298 + }, + { + "epoch": 0.0982936703113768, + "grad_norm": 3.1779685020446777, + "learning_rate": 1.7885536599685227e-05, + "loss": 0.8841, + "step": 1299 + }, + { + "epoch": 0.09836933903371041, + "grad_norm": 2.518272638320923, + "learning_rate": 1.7885178264560335e-05, + "loss": 0.7745, + "step": 1300 + }, + { + "epoch": 0.09844500775604403, + "grad_norm": 2.8336172103881836, + "learning_rate": 1.7884819373016844e-05, + "loss": 0.7445, + "step": 1301 + }, + { + "epoch": 0.09852067647837766, + "grad_norm": 3.497591972351074, + "learning_rate": 1.7884459925077227e-05, + "loss": 0.7151, + "step": 1302 + }, + { + "epoch": 0.09859634520071128, + "grad_norm": 1.6441137790679932, + "learning_rate": 1.7884099920763995e-05, + "loss": 0.9654, + "step": 1303 + }, + { + "epoch": 0.09867201392304491, + "grad_norm": 2.5127291679382324, + "learning_rate": 1.788373936009969e-05, + "loss": 0.625, + "step": 1304 + }, + { + "epoch": 0.09874768264537853, + "grad_norm": 2.6683413982391357, + "learning_rate": 1.788337824310689e-05, + "loss": 0.9659, + "step": 1305 + }, + { + "epoch": 0.09882335136771216, + "grad_norm": 2.3852410316467285, + "learning_rate": 1.7883016569808213e-05, + "loss": 0.8527, + "step": 1306 + }, + { + "epoch": 0.09889902009004578, + "grad_norm": 3.4505207538604736, + "learning_rate": 1.788265434022631e-05, + "loss": 0.797, + "step": 1307 + }, + { + "epoch": 0.0989746888123794, + "grad_norm": 3.126436471939087, + "learning_rate": 1.7882291554383862e-05, + "loss": 0.8493, + "step": 1308 + }, + { + "epoch": 0.09905035753471303, + "grad_norm": 2.6196725368499756, + "learning_rate": 1.7881928212303586e-05, + "loss": 0.8203, + "step": 1309 + }, + { + "epoch": 0.09912602625704665, + "grad_norm": 3.3748865127563477, + "learning_rate": 1.788156431400824e-05, + "loss": 0.9412, + "step": 1310 + }, + { + "epoch": 0.09920169497938028, + "grad_norm": 3.1734228134155273, + "learning_rate": 1.788119985952061e-05, + "loss": 0.8705, + "step": 1311 + }, + { + "epoch": 0.0992773637017139, + "grad_norm": 2.6753692626953125, + "learning_rate": 1.7880834848863517e-05, + "loss": 0.7506, + "step": 1312 + }, + { + "epoch": 0.09935303242404751, + "grad_norm": 2.6498630046844482, + "learning_rate": 1.788046928205982e-05, + "loss": 0.7981, + "step": 1313 + }, + { + "epoch": 0.09942870114638114, + "grad_norm": 3.271476984024048, + "learning_rate": 1.788010315913242e-05, + "loss": 0.8675, + "step": 1314 + }, + { + "epoch": 0.09950436986871476, + "grad_norm": 2.3523943424224854, + "learning_rate": 1.7879736480104234e-05, + "loss": 0.8247, + "step": 1315 + }, + { + "epoch": 0.09958003859104839, + "grad_norm": 2.718277931213379, + "learning_rate": 1.787936924499823e-05, + "loss": 0.8838, + "step": 1316 + }, + { + "epoch": 0.09965570731338201, + "grad_norm": 2.883328676223755, + "learning_rate": 1.7879001453837406e-05, + "loss": 0.8652, + "step": 1317 + }, + { + "epoch": 0.09973137603571564, + "grad_norm": 4.095880508422852, + "learning_rate": 1.787863310664479e-05, + "loss": 0.8405, + "step": 1318 + }, + { + "epoch": 0.09980704475804926, + "grad_norm": 3.1553285121917725, + "learning_rate": 1.7878264203443453e-05, + "loss": 0.8698, + "step": 1319 + }, + { + "epoch": 0.09988271348038288, + "grad_norm": 3.0702428817749023, + "learning_rate": 1.7877894744256494e-05, + "loss": 0.8055, + "step": 1320 + }, + { + "epoch": 0.09995838220271651, + "grad_norm": 3.2465262413024902, + "learning_rate": 
1.7877524729107054e-05, + "loss": 0.8476, + "step": 1321 + }, + { + "epoch": 0.10003405092505013, + "grad_norm": 2.630358934402466, + "learning_rate": 1.7877154158018306e-05, + "loss": 0.7447, + "step": 1322 + }, + { + "epoch": 0.10010971964738376, + "grad_norm": 3.00877046585083, + "learning_rate": 1.7876783031013445e-05, + "loss": 0.7267, + "step": 1323 + }, + { + "epoch": 0.10018538836971738, + "grad_norm": 3.247757911682129, + "learning_rate": 1.7876411348115726e-05, + "loss": 0.9357, + "step": 1324 + }, + { + "epoch": 0.100261057092051, + "grad_norm": 2.484332323074341, + "learning_rate": 1.7876039109348413e-05, + "loss": 0.8055, + "step": 1325 + }, + { + "epoch": 0.10033672581438463, + "grad_norm": 2.731320858001709, + "learning_rate": 1.7875666314734823e-05, + "loss": 0.7875, + "step": 1326 + }, + { + "epoch": 0.10041239453671824, + "grad_norm": 2.6516387462615967, + "learning_rate": 1.7875292964298306e-05, + "loss": 0.8813, + "step": 1327 + }, + { + "epoch": 0.10048806325905187, + "grad_norm": 3.460632085800171, + "learning_rate": 1.7874919058062234e-05, + "loss": 0.8705, + "step": 1328 + }, + { + "epoch": 0.10056373198138549, + "grad_norm": 3.08870792388916, + "learning_rate": 1.7874544596050024e-05, + "loss": 0.9075, + "step": 1329 + }, + { + "epoch": 0.10063940070371911, + "grad_norm": 2.5822556018829346, + "learning_rate": 1.787416957828513e-05, + "loss": 0.7953, + "step": 1330 + }, + { + "epoch": 0.10071506942605274, + "grad_norm": 2.9101879596710205, + "learning_rate": 1.7873794004791034e-05, + "loss": 0.7639, + "step": 1331 + }, + { + "epoch": 0.10079073814838636, + "grad_norm": 3.263343095779419, + "learning_rate": 1.7873417875591257e-05, + "loss": 0.789, + "step": 1332 + }, + { + "epoch": 0.10086640687071999, + "grad_norm": 3.0501604080200195, + "learning_rate": 1.7873041190709348e-05, + "loss": 0.8689, + "step": 1333 + }, + { + "epoch": 0.10094207559305361, + "grad_norm": 10.238188743591309, + "learning_rate": 1.7872663950168907e-05, + "loss": 0.8529, + "step": 1334 + }, + { + "epoch": 0.10101774431538724, + "grad_norm": 2.7818658351898193, + "learning_rate": 1.7872286153993548e-05, + "loss": 0.883, + "step": 1335 + }, + { + "epoch": 0.10109341303772086, + "grad_norm": 3.551182985305786, + "learning_rate": 1.7871907802206934e-05, + "loss": 0.7372, + "step": 1336 + }, + { + "epoch": 0.10116908176005449, + "grad_norm": 3.187056541442871, + "learning_rate": 1.7871528894832758e-05, + "loss": 0.8921, + "step": 1337 + }, + { + "epoch": 0.10124475048238811, + "grad_norm": 3.0303001403808594, + "learning_rate": 1.7871149431894747e-05, + "loss": 0.9956, + "step": 1338 + }, + { + "epoch": 0.10132041920472173, + "grad_norm": 2.6252822875976562, + "learning_rate": 1.787076941341667e-05, + "loss": 0.6815, + "step": 1339 + }, + { + "epoch": 0.10139608792705535, + "grad_norm": 2.8354198932647705, + "learning_rate": 1.787038883942232e-05, + "loss": 0.8485, + "step": 1340 + }, + { + "epoch": 0.10147175664938897, + "grad_norm": 2.798238754272461, + "learning_rate": 1.787000770993553e-05, + "loss": 0.8172, + "step": 1341 + }, + { + "epoch": 0.1015474253717226, + "grad_norm": 3.534174680709839, + "learning_rate": 1.7869626024980167e-05, + "loss": 0.7315, + "step": 1342 + }, + { + "epoch": 0.10162309409405622, + "grad_norm": 2.5657639503479004, + "learning_rate": 1.7869243784580133e-05, + "loss": 0.6961, + "step": 1343 + }, + { + "epoch": 0.10169876281638984, + "grad_norm": 3.29061222076416, + "learning_rate": 1.7868860988759372e-05, + "loss": 0.7886, + "step": 1344 + }, + { + "epoch": 
0.10177443153872347, + "grad_norm": 2.4909753799438477, + "learning_rate": 1.7868477637541845e-05, + "loss": 0.8066, + "step": 1345 + }, + { + "epoch": 0.10185010026105709, + "grad_norm": 3.2499606609344482, + "learning_rate": 1.7868093730951568e-05, + "loss": 0.8099, + "step": 1346 + }, + { + "epoch": 0.10192576898339072, + "grad_norm": 2.9574480056762695, + "learning_rate": 1.7867709269012575e-05, + "loss": 0.7153, + "step": 1347 + }, + { + "epoch": 0.10200143770572434, + "grad_norm": 2.3955531120300293, + "learning_rate": 1.786732425174895e-05, + "loss": 1.0746, + "step": 1348 + }, + { + "epoch": 0.10207710642805796, + "grad_norm": 2.7731571197509766, + "learning_rate": 1.7866938679184797e-05, + "loss": 0.8378, + "step": 1349 + }, + { + "epoch": 0.10215277515039159, + "grad_norm": 2.637190818786621, + "learning_rate": 1.7866552551344267e-05, + "loss": 0.9882, + "step": 1350 + }, + { + "epoch": 0.10222844387272521, + "grad_norm": 3.307260036468506, + "learning_rate": 1.7866165868251535e-05, + "loss": 0.8507, + "step": 1351 + }, + { + "epoch": 0.10230411259505884, + "grad_norm": 3.339919090270996, + "learning_rate": 1.786577862993082e-05, + "loss": 0.8656, + "step": 1352 + }, + { + "epoch": 0.10237978131739246, + "grad_norm": 3.161778211593628, + "learning_rate": 1.7865390836406373e-05, + "loss": 0.8829, + "step": 1353 + }, + { + "epoch": 0.10245545003972607, + "grad_norm": 7.813992500305176, + "learning_rate": 1.786500248770248e-05, + "loss": 0.7528, + "step": 1354 + }, + { + "epoch": 0.1025311187620597, + "grad_norm": 3.0543980598449707, + "learning_rate": 1.7864613583843453e-05, + "loss": 0.7212, + "step": 1355 + }, + { + "epoch": 0.10260678748439332, + "grad_norm": 5.032805442810059, + "learning_rate": 1.7864224124853656e-05, + "loss": 0.772, + "step": 1356 + }, + { + "epoch": 0.10268245620672695, + "grad_norm": 2.9874513149261475, + "learning_rate": 1.7863834110757476e-05, + "loss": 0.7245, + "step": 1357 + }, + { + "epoch": 0.10275812492906057, + "grad_norm": 3.358482837677002, + "learning_rate": 1.786344354157933e-05, + "loss": 0.9143, + "step": 1358 + }, + { + "epoch": 0.1028337936513942, + "grad_norm": 2.869976758956909, + "learning_rate": 1.7863052417343684e-05, + "loss": 0.7532, + "step": 1359 + }, + { + "epoch": 0.10290946237372782, + "grad_norm": 3.7541520595550537, + "learning_rate": 1.7862660738075028e-05, + "loss": 0.8099, + "step": 1360 + }, + { + "epoch": 0.10298513109606144, + "grad_norm": 2.752075433731079, + "learning_rate": 1.7862268503797893e-05, + "loss": 0.7319, + "step": 1361 + }, + { + "epoch": 0.10306079981839507, + "grad_norm": 3.171396017074585, + "learning_rate": 1.786187571453684e-05, + "loss": 0.7254, + "step": 1362 + }, + { + "epoch": 0.10313646854072869, + "grad_norm": 2.9250547885894775, + "learning_rate": 1.7861482370316464e-05, + "loss": 0.7121, + "step": 1363 + }, + { + "epoch": 0.10321213726306232, + "grad_norm": 3.4453530311584473, + "learning_rate": 1.78610884711614e-05, + "loss": 0.8577, + "step": 1364 + }, + { + "epoch": 0.10328780598539594, + "grad_norm": 3.4727487564086914, + "learning_rate": 1.7860694017096323e-05, + "loss": 0.7081, + "step": 1365 + }, + { + "epoch": 0.10336347470772957, + "grad_norm": 2.7783381938934326, + "learning_rate": 1.7860299008145922e-05, + "loss": 0.8319, + "step": 1366 + }, + { + "epoch": 0.10343914343006318, + "grad_norm": 2.9331109523773193, + "learning_rate": 1.785990344433494e-05, + "loss": 0.772, + "step": 1367 + }, + { + "epoch": 0.1035148121523968, + "grad_norm": 2.4165093898773193, + "learning_rate": 
1.7859507325688146e-05, + "loss": 0.8665, + "step": 1368 + }, + { + "epoch": 0.10359048087473043, + "grad_norm": 3.596440553665161, + "learning_rate": 1.7859110652230352e-05, + "loss": 0.8207, + "step": 1369 + }, + { + "epoch": 0.10366614959706405, + "grad_norm": 3.142336368560791, + "learning_rate": 1.7858713423986392e-05, + "loss": 0.8451, + "step": 1370 + }, + { + "epoch": 0.10374181831939767, + "grad_norm": 3.1580209732055664, + "learning_rate": 1.7858315640981147e-05, + "loss": 0.8636, + "step": 1371 + }, + { + "epoch": 0.1038174870417313, + "grad_norm": 2.814429998397827, + "learning_rate": 1.7857917303239527e-05, + "loss": 0.8564, + "step": 1372 + }, + { + "epoch": 0.10389315576406492, + "grad_norm": 2.6497771739959717, + "learning_rate": 1.7857518410786472e-05, + "loss": 0.7462, + "step": 1373 + }, + { + "epoch": 0.10396882448639855, + "grad_norm": 2.9846158027648926, + "learning_rate": 1.7857118963646963e-05, + "loss": 0.706, + "step": 1374 + }, + { + "epoch": 0.10404449320873217, + "grad_norm": 3.386066436767578, + "learning_rate": 1.785671896184602e-05, + "loss": 0.7739, + "step": 1375 + }, + { + "epoch": 0.1041201619310658, + "grad_norm": 4.174465179443359, + "learning_rate": 1.7856318405408694e-05, + "loss": 0.8237, + "step": 1376 + }, + { + "epoch": 0.10419583065339942, + "grad_norm": 3.151991844177246, + "learning_rate": 1.785591729436006e-05, + "loss": 0.8827, + "step": 1377 + }, + { + "epoch": 0.10427149937573305, + "grad_norm": 3.0265588760375977, + "learning_rate": 1.785551562872524e-05, + "loss": 0.7439, + "step": 1378 + }, + { + "epoch": 0.10434716809806667, + "grad_norm": 2.958704710006714, + "learning_rate": 1.7855113408529395e-05, + "loss": 0.7563, + "step": 1379 + }, + { + "epoch": 0.1044228368204003, + "grad_norm": 2.7380552291870117, + "learning_rate": 1.7854710633797703e-05, + "loss": 0.6439, + "step": 1380 + }, + { + "epoch": 0.1044985055427339, + "grad_norm": 2.067690134048462, + "learning_rate": 1.785430730455539e-05, + "loss": 0.8905, + "step": 1381 + }, + { + "epoch": 0.10457417426506753, + "grad_norm": 2.9177439212799072, + "learning_rate": 1.785390342082772e-05, + "loss": 0.9292, + "step": 1382 + }, + { + "epoch": 0.10464984298740115, + "grad_norm": 3.5004894733428955, + "learning_rate": 1.7853498982639977e-05, + "loss": 0.7962, + "step": 1383 + }, + { + "epoch": 0.10472551170973478, + "grad_norm": 2.974620819091797, + "learning_rate": 1.7853093990017494e-05, + "loss": 0.8304, + "step": 1384 + }, + { + "epoch": 0.1048011804320684, + "grad_norm": 3.4205729961395264, + "learning_rate": 1.785268844298563e-05, + "loss": 0.8792, + "step": 1385 + }, + { + "epoch": 0.10487684915440203, + "grad_norm": 3.7999086380004883, + "learning_rate": 1.7852282341569782e-05, + "loss": 0.9034, + "step": 1386 + }, + { + "epoch": 0.10495251787673565, + "grad_norm": 3.3099162578582764, + "learning_rate": 1.7851875685795383e-05, + "loss": 0.9594, + "step": 1387 + }, + { + "epoch": 0.10502818659906928, + "grad_norm": 3.3331620693206787, + "learning_rate": 1.78514684756879e-05, + "loss": 0.9426, + "step": 1388 + }, + { + "epoch": 0.1051038553214029, + "grad_norm": 2.9932851791381836, + "learning_rate": 1.7851060711272827e-05, + "loss": 0.7063, + "step": 1389 + }, + { + "epoch": 0.10517952404373652, + "grad_norm": 3.7370762825012207, + "learning_rate": 1.7850652392575712e-05, + "loss": 0.9669, + "step": 1390 + }, + { + "epoch": 0.10525519276607015, + "grad_norm": 3.106455087661743, + "learning_rate": 1.785024351962211e-05, + "loss": 0.7777, + "step": 1391 + }, + { + "epoch": 
0.10533086148840377, + "grad_norm": 3.7146058082580566, + "learning_rate": 1.784983409243764e-05, + "loss": 0.9436, + "step": 1392 + }, + { + "epoch": 0.1054065302107374, + "grad_norm": 2.6527533531188965, + "learning_rate": 1.784942411104793e-05, + "loss": 0.7476, + "step": 1393 + }, + { + "epoch": 0.10548219893307101, + "grad_norm": 2.8467373847961426, + "learning_rate": 1.7849013575478664e-05, + "loss": 0.6969, + "step": 1394 + }, + { + "epoch": 0.10555786765540463, + "grad_norm": 3.833505153656006, + "learning_rate": 1.7848602485755542e-05, + "loss": 0.8751, + "step": 1395 + }, + { + "epoch": 0.10563353637773826, + "grad_norm": 3.4765021800994873, + "learning_rate": 1.7848190841904314e-05, + "loss": 0.7033, + "step": 1396 + }, + { + "epoch": 0.10570920510007188, + "grad_norm": 2.6900827884674072, + "learning_rate": 1.784777864395076e-05, + "loss": 0.7621, + "step": 1397 + }, + { + "epoch": 0.1057848738224055, + "grad_norm": 4.700630187988281, + "learning_rate": 1.7847365891920688e-05, + "loss": 0.9304, + "step": 1398 + }, + { + "epoch": 0.10586054254473913, + "grad_norm": 3.0544216632843018, + "learning_rate": 1.7846952585839946e-05, + "loss": 0.8242, + "step": 1399 + }, + { + "epoch": 0.10593621126707276, + "grad_norm": 2.659796714782715, + "learning_rate": 1.784653872573442e-05, + "loss": 0.8753, + "step": 1400 + }, + { + "epoch": 0.10601187998940638, + "grad_norm": 2.934483289718628, + "learning_rate": 1.784612431163003e-05, + "loss": 0.7843, + "step": 1401 + }, + { + "epoch": 0.10608754871174, + "grad_norm": 2.5334622859954834, + "learning_rate": 1.784570934355272e-05, + "loss": 0.8974, + "step": 1402 + }, + { + "epoch": 0.10616321743407363, + "grad_norm": 1.8454688787460327, + "learning_rate": 1.784529382152848e-05, + "loss": 0.999, + "step": 1403 + }, + { + "epoch": 0.10623888615640725, + "grad_norm": 3.1255669593811035, + "learning_rate": 1.7844877745583333e-05, + "loss": 0.8522, + "step": 1404 + }, + { + "epoch": 0.10631455487874088, + "grad_norm": 2.9718644618988037, + "learning_rate": 1.7844461115743334e-05, + "loss": 0.8519, + "step": 1405 + }, + { + "epoch": 0.1063902236010745, + "grad_norm": 2.7671189308166504, + "learning_rate": 1.7844043932034572e-05, + "loss": 0.9112, + "step": 1406 + }, + { + "epoch": 0.10646589232340813, + "grad_norm": 3.579397678375244, + "learning_rate": 1.7843626194483174e-05, + "loss": 0.8709, + "step": 1407 + }, + { + "epoch": 0.10654156104574174, + "grad_norm": 2.603875160217285, + "learning_rate": 1.78432079031153e-05, + "loss": 0.7613, + "step": 1408 + }, + { + "epoch": 0.10661722976807536, + "grad_norm": 2.835737466812134, + "learning_rate": 1.7842789057957146e-05, + "loss": 0.821, + "step": 1409 + }, + { + "epoch": 0.10669289849040899, + "grad_norm": 2.738487958908081, + "learning_rate": 1.784236965903494e-05, + "loss": 0.7394, + "step": 1410 + }, + { + "epoch": 0.10676856721274261, + "grad_norm": 2.9330735206604004, + "learning_rate": 1.7841949706374944e-05, + "loss": 0.9316, + "step": 1411 + }, + { + "epoch": 0.10684423593507623, + "grad_norm": 3.281982898712158, + "learning_rate": 1.784152920000346e-05, + "loss": 0.7128, + "step": 1412 + }, + { + "epoch": 0.10691990465740986, + "grad_norm": 2.7696800231933594, + "learning_rate": 1.7841108139946824e-05, + "loss": 0.8896, + "step": 1413 + }, + { + "epoch": 0.10699557337974348, + "grad_norm": 6.206698894500732, + "learning_rate": 1.7840686526231394e-05, + "loss": 0.8619, + "step": 1414 + }, + { + "epoch": 0.10707124210207711, + "grad_norm": 2.468066453933716, + "learning_rate": 
1.7840264358883585e-05, + "loss": 0.5857, + "step": 1415 + }, + { + "epoch": 0.10714691082441073, + "grad_norm": 3.581237316131592, + "learning_rate": 1.7839841637929827e-05, + "loss": 0.8797, + "step": 1416 + }, + { + "epoch": 0.10722257954674436, + "grad_norm": 2.6938111782073975, + "learning_rate": 1.7839418363396596e-05, + "loss": 0.6231, + "step": 1417 + }, + { + "epoch": 0.10729824826907798, + "grad_norm": 3.101189613342285, + "learning_rate": 1.7838994535310393e-05, + "loss": 0.9891, + "step": 1418 + }, + { + "epoch": 0.1073739169914116, + "grad_norm": 3.6879820823669434, + "learning_rate": 1.7838570153697767e-05, + "loss": 0.7532, + "step": 1419 + }, + { + "epoch": 0.10744958571374523, + "grad_norm": 3.3243961334228516, + "learning_rate": 1.783814521858529e-05, + "loss": 0.7972, + "step": 1420 + }, + { + "epoch": 0.10752525443607884, + "grad_norm": 3.053708076477051, + "learning_rate": 1.783771972999957e-05, + "loss": 0.8763, + "step": 1421 + }, + { + "epoch": 0.10760092315841246, + "grad_norm": 3.4024465084075928, + "learning_rate": 1.783729368796726e-05, + "loss": 0.8386, + "step": 1422 + }, + { + "epoch": 0.10767659188074609, + "grad_norm": 3.4709556102752686, + "learning_rate": 1.7836867092515034e-05, + "loss": 0.8942, + "step": 1423 + }, + { + "epoch": 0.10775226060307971, + "grad_norm": 2.8416600227355957, + "learning_rate": 1.783643994366961e-05, + "loss": 0.9081, + "step": 1424 + }, + { + "epoch": 0.10782792932541334, + "grad_norm": 2.391524076461792, + "learning_rate": 1.7836012241457736e-05, + "loss": 0.6704, + "step": 1425 + }, + { + "epoch": 0.10790359804774696, + "grad_norm": 2.441361665725708, + "learning_rate": 1.7835583985906197e-05, + "loss": 0.6307, + "step": 1426 + }, + { + "epoch": 0.10797926677008059, + "grad_norm": 2.4388327598571777, + "learning_rate": 1.7835155177041807e-05, + "loss": 0.7966, + "step": 1427 + }, + { + "epoch": 0.10805493549241421, + "grad_norm": 2.3517658710479736, + "learning_rate": 1.7834725814891427e-05, + "loss": 0.7591, + "step": 1428 + }, + { + "epoch": 0.10813060421474784, + "grad_norm": 2.327765941619873, + "learning_rate": 1.783429589948194e-05, + "loss": 0.8393, + "step": 1429 + }, + { + "epoch": 0.10820627293708146, + "grad_norm": 2.1608386039733887, + "learning_rate": 1.7833865430840273e-05, + "loss": 0.6654, + "step": 1430 + }, + { + "epoch": 0.10828194165941508, + "grad_norm": 2.7661025524139404, + "learning_rate": 1.783343440899338e-05, + "loss": 0.6717, + "step": 1431 + }, + { + "epoch": 0.10835761038174871, + "grad_norm": 3.8667588233947754, + "learning_rate": 1.783300283396825e-05, + "loss": 0.8499, + "step": 1432 + }, + { + "epoch": 0.10843327910408233, + "grad_norm": 2.459967851638794, + "learning_rate": 1.7832570705791915e-05, + "loss": 0.9147, + "step": 1433 + }, + { + "epoch": 0.10850894782641594, + "grad_norm": 2.2012739181518555, + "learning_rate": 1.7832138024491435e-05, + "loss": 0.787, + "step": 1434 + }, + { + "epoch": 0.10858461654874957, + "grad_norm": 2.780473232269287, + "learning_rate": 1.7831704790093903e-05, + "loss": 0.9463, + "step": 1435 + }, + { + "epoch": 0.10866028527108319, + "grad_norm": 2.5738842487335205, + "learning_rate": 1.7831271002626457e-05, + "loss": 0.7366, + "step": 1436 + }, + { + "epoch": 0.10873595399341682, + "grad_norm": 2.993759870529175, + "learning_rate": 1.7830836662116253e-05, + "loss": 0.7384, + "step": 1437 + }, + { + "epoch": 0.10881162271575044, + "grad_norm": 2.661965847015381, + "learning_rate": 1.7830401768590494e-05, + "loss": 0.7393, + "step": 1438 + }, + { + "epoch": 
0.10888729143808407, + "grad_norm": 2.5119550228118896, + "learning_rate": 1.782996632207642e-05, + "loss": 0.8387, + "step": 1439 + }, + { + "epoch": 0.10896296016041769, + "grad_norm": 2.4007089138031006, + "learning_rate": 1.7829530322601288e-05, + "loss": 0.8684, + "step": 1440 + }, + { + "epoch": 0.10903862888275132, + "grad_norm": 2.434774398803711, + "learning_rate": 1.7829093770192415e-05, + "loss": 0.746, + "step": 1441 + }, + { + "epoch": 0.10911429760508494, + "grad_norm": 3.004561185836792, + "learning_rate": 1.782865666487713e-05, + "loss": 0.7922, + "step": 1442 + }, + { + "epoch": 0.10918996632741856, + "grad_norm": 2.905332565307617, + "learning_rate": 1.7828219006682814e-05, + "loss": 0.7966, + "step": 1443 + }, + { + "epoch": 0.10926563504975219, + "grad_norm": 2.940967559814453, + "learning_rate": 1.7827780795636866e-05, + "loss": 0.8431, + "step": 1444 + }, + { + "epoch": 0.10934130377208581, + "grad_norm": 2.504350185394287, + "learning_rate": 1.782734203176673e-05, + "loss": 0.938, + "step": 1445 + }, + { + "epoch": 0.10941697249441944, + "grad_norm": 2.725872278213501, + "learning_rate": 1.782690271509989e-05, + "loss": 0.9283, + "step": 1446 + }, + { + "epoch": 0.10949264121675306, + "grad_norm": 2.9516894817352295, + "learning_rate": 1.7826462845663853e-05, + "loss": 0.8293, + "step": 1447 + }, + { + "epoch": 0.10956830993908667, + "grad_norm": 3.0764172077178955, + "learning_rate": 1.782602242348616e-05, + "loss": 0.9247, + "step": 1448 + }, + { + "epoch": 0.1096439786614203, + "grad_norm": 3.033979892730713, + "learning_rate": 1.7825581448594394e-05, + "loss": 0.7406, + "step": 1449 + }, + { + "epoch": 0.10971964738375392, + "grad_norm": 2.9168546199798584, + "learning_rate": 1.782513992101618e-05, + "loss": 0.8797, + "step": 1450 + }, + { + "epoch": 0.10979531610608755, + "grad_norm": 2.965071201324463, + "learning_rate": 1.782469784077915e-05, + "loss": 0.7152, + "step": 1451 + }, + { + "epoch": 0.10987098482842117, + "grad_norm": 2.7454051971435547, + "learning_rate": 1.7824255207911008e-05, + "loss": 0.8399, + "step": 1452 + }, + { + "epoch": 0.1099466535507548, + "grad_norm": 3.45354962348938, + "learning_rate": 1.782381202243946e-05, + "loss": 0.8285, + "step": 1453 + }, + { + "epoch": 0.11002232227308842, + "grad_norm": 2.291821002960205, + "learning_rate": 1.7823368284392266e-05, + "loss": 0.8612, + "step": 1454 + }, + { + "epoch": 0.11009799099542204, + "grad_norm": 2.7993972301483154, + "learning_rate": 1.782292399379721e-05, + "loss": 0.7609, + "step": 1455 + }, + { + "epoch": 0.11017365971775567, + "grad_norm": 2.7965731620788574, + "learning_rate": 1.7822479150682113e-05, + "loss": 0.8857, + "step": 1456 + }, + { + "epoch": 0.11024932844008929, + "grad_norm": 2.9071121215820312, + "learning_rate": 1.782203375507484e-05, + "loss": 0.6945, + "step": 1457 + }, + { + "epoch": 0.11032499716242292, + "grad_norm": 6.042922496795654, + "learning_rate": 1.7821587807003278e-05, + "loss": 0.7653, + "step": 1458 + }, + { + "epoch": 0.11040066588475654, + "grad_norm": 2.6927385330200195, + "learning_rate": 1.782114130649536e-05, + "loss": 0.9095, + "step": 1459 + }, + { + "epoch": 0.11047633460709017, + "grad_norm": 2.8487069606781006, + "learning_rate": 1.7820694253579036e-05, + "loss": 0.8508, + "step": 1460 + }, + { + "epoch": 0.11055200332942378, + "grad_norm": 3.1788697242736816, + "learning_rate": 1.782024664828231e-05, + "loss": 0.7621, + "step": 1461 + }, + { + "epoch": 0.1106276720517574, + "grad_norm": 3.724763870239258, + "learning_rate": 
1.7819798490633212e-05, + "loss": 0.6952, + "step": 1462 + }, + { + "epoch": 0.11070334077409102, + "grad_norm": 2.963629961013794, + "learning_rate": 1.7819349780659806e-05, + "loss": 0.7546, + "step": 1463 + }, + { + "epoch": 0.11077900949642465, + "grad_norm": 2.979599952697754, + "learning_rate": 1.781890051839019e-05, + "loss": 0.8406, + "step": 1464 + }, + { + "epoch": 0.11085467821875827, + "grad_norm": 2.474740505218506, + "learning_rate": 1.78184507038525e-05, + "loss": 0.7787, + "step": 1465 + }, + { + "epoch": 0.1109303469410919, + "grad_norm": 2.5070388317108154, + "learning_rate": 1.7818000337074906e-05, + "loss": 0.7781, + "step": 1466 + }, + { + "epoch": 0.11100601566342552, + "grad_norm": 2.9093334674835205, + "learning_rate": 1.7817549418085607e-05, + "loss": 0.7751, + "step": 1467 + }, + { + "epoch": 0.11108168438575915, + "grad_norm": 2.6724863052368164, + "learning_rate": 1.7817097946912847e-05, + "loss": 0.8846, + "step": 1468 + }, + { + "epoch": 0.11115735310809277, + "grad_norm": 2.9973912239074707, + "learning_rate": 1.7816645923584896e-05, + "loss": 0.701, + "step": 1469 + }, + { + "epoch": 0.1112330218304264, + "grad_norm": 2.5031442642211914, + "learning_rate": 1.781619334813006e-05, + "loss": 0.7866, + "step": 1470 + }, + { + "epoch": 0.11130869055276002, + "grad_norm": 2.907050609588623, + "learning_rate": 1.781574022057668e-05, + "loss": 0.7756, + "step": 1471 + }, + { + "epoch": 0.11138435927509364, + "grad_norm": 2.397503137588501, + "learning_rate": 1.7815286540953133e-05, + "loss": 0.7306, + "step": 1472 + }, + { + "epoch": 0.11146002799742727, + "grad_norm": 2.7645323276519775, + "learning_rate": 1.7814832309287835e-05, + "loss": 0.81, + "step": 1473 + }, + { + "epoch": 0.1115356967197609, + "grad_norm": 2.5474066734313965, + "learning_rate": 1.7814377525609223e-05, + "loss": 1.0083, + "step": 1474 + }, + { + "epoch": 0.1116113654420945, + "grad_norm": 3.7379724979400635, + "learning_rate": 1.7813922189945782e-05, + "loss": 0.8414, + "step": 1475 + }, + { + "epoch": 0.11168703416442813, + "grad_norm": 2.205005645751953, + "learning_rate": 1.7813466302326027e-05, + "loss": 0.8559, + "step": 1476 + }, + { + "epoch": 0.11176270288676175, + "grad_norm": 2.9247653484344482, + "learning_rate": 1.7813009862778505e-05, + "loss": 0.7688, + "step": 1477 + }, + { + "epoch": 0.11183837160909538, + "grad_norm": 2.9259767532348633, + "learning_rate": 1.7812552871331803e-05, + "loss": 0.8447, + "step": 1478 + }, + { + "epoch": 0.111914040331429, + "grad_norm": 2.8542733192443848, + "learning_rate": 1.7812095328014533e-05, + "loss": 0.7469, + "step": 1479 + }, + { + "epoch": 0.11198970905376263, + "grad_norm": 2.260713577270508, + "learning_rate": 1.7811637232855356e-05, + "loss": 0.6106, + "step": 1480 + }, + { + "epoch": 0.11206537777609625, + "grad_norm": 2.205512046813965, + "learning_rate": 1.7811178585882952e-05, + "loss": 0.8235, + "step": 1481 + }, + { + "epoch": 0.11214104649842987, + "grad_norm": 2.5569989681243896, + "learning_rate": 1.781071938712605e-05, + "loss": 0.8225, + "step": 1482 + }, + { + "epoch": 0.1122167152207635, + "grad_norm": 2.4361040592193604, + "learning_rate": 1.7810259636613398e-05, + "loss": 0.8132, + "step": 1483 + }, + { + "epoch": 0.11229238394309712, + "grad_norm": 3.278949737548828, + "learning_rate": 1.7809799334373792e-05, + "loss": 0.909, + "step": 1484 + }, + { + "epoch": 0.11236805266543075, + "grad_norm": 3.451547145843506, + "learning_rate": 1.780933848043606e-05, + "loss": 0.8415, + "step": 1485 + }, + { + "epoch": 
0.11244372138776437, + "grad_norm": 2.9717342853546143, + "learning_rate": 1.7808877074829058e-05, + "loss": 0.8156, + "step": 1486 + }, + { + "epoch": 0.112519390110098, + "grad_norm": 2.911635398864746, + "learning_rate": 1.7808415117581683e-05, + "loss": 0.892, + "step": 1487 + }, + { + "epoch": 0.11259505883243161, + "grad_norm": 2.9125287532806396, + "learning_rate": 1.7807952608722862e-05, + "loss": 0.8326, + "step": 1488 + }, + { + "epoch": 0.11267072755476523, + "grad_norm": 2.8065741062164307, + "learning_rate": 1.7807489548281562e-05, + "loss": 0.888, + "step": 1489 + }, + { + "epoch": 0.11274639627709886, + "grad_norm": 2.326284408569336, + "learning_rate": 1.780702593628678e-05, + "loss": 0.6024, + "step": 1490 + }, + { + "epoch": 0.11282206499943248, + "grad_norm": 2.677926778793335, + "learning_rate": 1.7806561772767548e-05, + "loss": 0.7457, + "step": 1491 + }, + { + "epoch": 0.1128977337217661, + "grad_norm": 2.430309534072876, + "learning_rate": 1.7806097057752933e-05, + "loss": 0.7384, + "step": 1492 + }, + { + "epoch": 0.11297340244409973, + "grad_norm": 2.58219838142395, + "learning_rate": 1.780563179127204e-05, + "loss": 0.8198, + "step": 1493 + }, + { + "epoch": 0.11304907116643335, + "grad_norm": 2.838965892791748, + "learning_rate": 1.7805165973354e-05, + "loss": 0.9538, + "step": 1494 + }, + { + "epoch": 0.11312473988876698, + "grad_norm": 2.168802499771118, + "learning_rate": 1.780469960402799e-05, + "loss": 0.9857, + "step": 1495 + }, + { + "epoch": 0.1132004086111006, + "grad_norm": 3.0226144790649414, + "learning_rate": 1.7804232683323212e-05, + "loss": 0.8795, + "step": 1496 + }, + { + "epoch": 0.11327607733343423, + "grad_norm": 2.2193140983581543, + "learning_rate": 1.7803765211268907e-05, + "loss": 0.8259, + "step": 1497 + }, + { + "epoch": 0.11335174605576785, + "grad_norm": 2.614348888397217, + "learning_rate": 1.7803297187894352e-05, + "loss": 0.7653, + "step": 1498 + }, + { + "epoch": 0.11342741477810148, + "grad_norm": 2.413205862045288, + "learning_rate": 1.780282861322885e-05, + "loss": 0.8608, + "step": 1499 + }, + { + "epoch": 0.1135030835004351, + "grad_norm": 2.941840648651123, + "learning_rate": 1.780235948730175e-05, + "loss": 0.7904, + "step": 1500 + }, + { + "epoch": 0.11357875222276873, + "grad_norm": 2.9199554920196533, + "learning_rate": 1.780188981014243e-05, + "loss": 0.8542, + "step": 1501 + }, + { + "epoch": 0.11365442094510234, + "grad_norm": 2.8440537452697754, + "learning_rate": 1.7801419581780295e-05, + "loss": 0.766, + "step": 1502 + }, + { + "epoch": 0.11373008966743596, + "grad_norm": 2.193862199783325, + "learning_rate": 1.7800948802244805e-05, + "loss": 0.8432, + "step": 1503 + }, + { + "epoch": 0.11380575838976958, + "grad_norm": 2.660568952560425, + "learning_rate": 1.7800477471565435e-05, + "loss": 0.8334, + "step": 1504 + }, + { + "epoch": 0.11388142711210321, + "grad_norm": 2.565652847290039, + "learning_rate": 1.78000055897717e-05, + "loss": 0.7056, + "step": 1505 + }, + { + "epoch": 0.11395709583443683, + "grad_norm": 2.698594808578491, + "learning_rate": 1.7799533156893153e-05, + "loss": 0.8236, + "step": 1506 + }, + { + "epoch": 0.11403276455677046, + "grad_norm": 2.662174940109253, + "learning_rate": 1.779906017295938e-05, + "loss": 0.7694, + "step": 1507 + }, + { + "epoch": 0.11410843327910408, + "grad_norm": 2.9940743446350098, + "learning_rate": 1.7798586637999993e-05, + "loss": 1.1411, + "step": 1508 + }, + { + "epoch": 0.1141841020014377, + "grad_norm": 2.8996222019195557, + "learning_rate": 
1.7798112552044658e-05, + "loss": 0.715, + "step": 1509 + }, + { + "epoch": 0.11425977072377133, + "grad_norm": 2.917023181915283, + "learning_rate": 1.7797637915123058e-05, + "loss": 0.5476, + "step": 1510 + }, + { + "epoch": 0.11433543944610496, + "grad_norm": 2.769496440887451, + "learning_rate": 1.7797162727264917e-05, + "loss": 0.8295, + "step": 1511 + }, + { + "epoch": 0.11441110816843858, + "grad_norm": 2.2324085235595703, + "learning_rate": 1.779668698849999e-05, + "loss": 0.7346, + "step": 1512 + }, + { + "epoch": 0.1144867768907722, + "grad_norm": 3.295725107192993, + "learning_rate": 1.7796210698858077e-05, + "loss": 0.7722, + "step": 1513 + }, + { + "epoch": 0.11456244561310583, + "grad_norm": 2.2366225719451904, + "learning_rate": 1.7795733858368992e-05, + "loss": 0.7013, + "step": 1514 + }, + { + "epoch": 0.11463811433543944, + "grad_norm": 3.1166555881500244, + "learning_rate": 1.7795256467062612e-05, + "loss": 0.8173, + "step": 1515 + }, + { + "epoch": 0.11471378305777306, + "grad_norm": 2.2865703105926514, + "learning_rate": 1.779477852496882e-05, + "loss": 0.745, + "step": 1516 + }, + { + "epoch": 0.11478945178010669, + "grad_norm": 2.9082911014556885, + "learning_rate": 1.779430003211755e-05, + "loss": 0.9206, + "step": 1517 + }, + { + "epoch": 0.11486512050244031, + "grad_norm": 2.4655425548553467, + "learning_rate": 1.779382098853877e-05, + "loss": 0.7449, + "step": 1518 + }, + { + "epoch": 0.11494078922477394, + "grad_norm": 3.6643152236938477, + "learning_rate": 1.7793341394262476e-05, + "loss": 0.8769, + "step": 1519 + }, + { + "epoch": 0.11501645794710756, + "grad_norm": 2.4818716049194336, + "learning_rate": 1.7792861249318704e-05, + "loss": 0.7565, + "step": 1520 + }, + { + "epoch": 0.11509212666944119, + "grad_norm": 2.0801405906677246, + "learning_rate": 1.779238055373752e-05, + "loss": 0.5753, + "step": 1521 + }, + { + "epoch": 0.11516779539177481, + "grad_norm": 3.2640860080718994, + "learning_rate": 1.779189930754903e-05, + "loss": 0.6951, + "step": 1522 + }, + { + "epoch": 0.11524346411410843, + "grad_norm": 3.1288063526153564, + "learning_rate": 1.7791417510783368e-05, + "loss": 0.8465, + "step": 1523 + }, + { + "epoch": 0.11531913283644206, + "grad_norm": 3.1117849349975586, + "learning_rate": 1.7790935163470706e-05, + "loss": 0.685, + "step": 1524 + }, + { + "epoch": 0.11539480155877568, + "grad_norm": 2.816326379776001, + "learning_rate": 1.779045226564125e-05, + "loss": 0.7518, + "step": 1525 + }, + { + "epoch": 0.11547047028110931, + "grad_norm": 3.014407157897949, + "learning_rate": 1.7789968817325242e-05, + "loss": 0.8803, + "step": 1526 + }, + { + "epoch": 0.11554613900344293, + "grad_norm": 3.064116954803467, + "learning_rate": 1.7789484818552954e-05, + "loss": 0.7059, + "step": 1527 + }, + { + "epoch": 0.11562180772577656, + "grad_norm": 2.1914854049682617, + "learning_rate": 1.77890002693547e-05, + "loss": 0.7042, + "step": 1528 + }, + { + "epoch": 0.11569747644811017, + "grad_norm": 3.057530403137207, + "learning_rate": 1.7788515169760824e-05, + "loss": 0.876, + "step": 1529 + }, + { + "epoch": 0.11577314517044379, + "grad_norm": 2.713554859161377, + "learning_rate": 1.7788029519801703e-05, + "loss": 0.8374, + "step": 1530 + }, + { + "epoch": 0.11584881389277742, + "grad_norm": 2.849468231201172, + "learning_rate": 1.7787543319507743e-05, + "loss": 0.924, + "step": 1531 + }, + { + "epoch": 0.11592448261511104, + "grad_norm": 3.1437432765960693, + "learning_rate": 1.7787056568909405e-05, + "loss": 0.8471, + "step": 1532 + }, + { + "epoch": 
0.11600015133744467, + "grad_norm": 2.3561949729919434, + "learning_rate": 1.778656926803716e-05, + "loss": 0.8902, + "step": 1533 + }, + { + "epoch": 0.11607582005977829, + "grad_norm": 1.9011698961257935, + "learning_rate": 1.778608141692153e-05, + "loss": 0.8698, + "step": 1534 + }, + { + "epoch": 0.11615148878211191, + "grad_norm": 3.898846387863159, + "learning_rate": 1.7785593015593066e-05, + "loss": 0.7568, + "step": 1535 + }, + { + "epoch": 0.11622715750444554, + "grad_norm": 3.060079574584961, + "learning_rate": 1.7785104064082347e-05, + "loss": 0.863, + "step": 1536 + }, + { + "epoch": 0.11630282622677916, + "grad_norm": 3.5187714099884033, + "learning_rate": 1.7784614562419998e-05, + "loss": 0.8006, + "step": 1537 + }, + { + "epoch": 0.11637849494911279, + "grad_norm": 2.9314115047454834, + "learning_rate": 1.7784124510636672e-05, + "loss": 0.9548, + "step": 1538 + }, + { + "epoch": 0.11645416367144641, + "grad_norm": 2.3972244262695312, + "learning_rate": 1.7783633908763062e-05, + "loss": 0.6688, + "step": 1539 + }, + { + "epoch": 0.11652983239378004, + "grad_norm": 2.985501766204834, + "learning_rate": 1.7783142756829882e-05, + "loss": 0.7211, + "step": 1540 + }, + { + "epoch": 0.11660550111611366, + "grad_norm": 2.532233476638794, + "learning_rate": 1.7782651054867895e-05, + "loss": 0.8695, + "step": 1541 + }, + { + "epoch": 0.11668116983844727, + "grad_norm": 3.1398353576660156, + "learning_rate": 1.7782158802907893e-05, + "loss": 0.796, + "step": 1542 + }, + { + "epoch": 0.1167568385607809, + "grad_norm": 3.156766414642334, + "learning_rate": 1.7781666000980705e-05, + "loss": 0.8581, + "step": 1543 + }, + { + "epoch": 0.11683250728311452, + "grad_norm": 3.025268793106079, + "learning_rate": 1.7781172649117186e-05, + "loss": 0.7749, + "step": 1544 + }, + { + "epoch": 0.11690817600544814, + "grad_norm": 3.071802854537964, + "learning_rate": 1.7780678747348236e-05, + "loss": 0.7598, + "step": 1545 + }, + { + "epoch": 0.11698384472778177, + "grad_norm": 4.598686218261719, + "learning_rate": 1.7780184295704778e-05, + "loss": 0.8049, + "step": 1546 + }, + { + "epoch": 0.1170595134501154, + "grad_norm": 3.0059025287628174, + "learning_rate": 1.7779689294217784e-05, + "loss": 0.7546, + "step": 1547 + }, + { + "epoch": 0.11713518217244902, + "grad_norm": 2.655482292175293, + "learning_rate": 1.777919374291825e-05, + "loss": 0.941, + "step": 1548 + }, + { + "epoch": 0.11721085089478264, + "grad_norm": 2.7230942249298096, + "learning_rate": 1.7778697641837208e-05, + "loss": 0.8749, + "step": 1549 + }, + { + "epoch": 0.11728651961711627, + "grad_norm": 3.8638458251953125, + "learning_rate": 1.7778200991005724e-05, + "loss": 0.7645, + "step": 1550 + }, + { + "epoch": 0.11736218833944989, + "grad_norm": 2.8020644187927246, + "learning_rate": 1.7777703790454906e-05, + "loss": 0.6915, + "step": 1551 + }, + { + "epoch": 0.11743785706178352, + "grad_norm": 4.486051559448242, + "learning_rate": 1.777720604021588e-05, + "loss": 0.7654, + "step": 1552 + }, + { + "epoch": 0.11751352578411714, + "grad_norm": 3.036688804626465, + "learning_rate": 1.7776707740319828e-05, + "loss": 0.9693, + "step": 1553 + }, + { + "epoch": 0.11758919450645076, + "grad_norm": 2.724858045578003, + "learning_rate": 1.7776208890797947e-05, + "loss": 0.6755, + "step": 1554 + }, + { + "epoch": 0.11766486322878439, + "grad_norm": 3.0751144886016846, + "learning_rate": 1.7775709491681482e-05, + "loss": 0.9963, + "step": 1555 + }, + { + "epoch": 0.117740531951118, + "grad_norm": 2.686180591583252, + "learning_rate": 
1.7775209543001703e-05, + "loss": 0.8259, + "step": 1556 + }, + { + "epoch": 0.11781620067345162, + "grad_norm": 2.430630683898926, + "learning_rate": 1.777470904478992e-05, + "loss": 0.8329, + "step": 1557 + }, + { + "epoch": 0.11789186939578525, + "grad_norm": 2.6584362983703613, + "learning_rate": 1.7774207997077477e-05, + "loss": 0.8525, + "step": 1558 + }, + { + "epoch": 0.11796753811811887, + "grad_norm": 2.8905189037323, + "learning_rate": 1.777370639989575e-05, + "loss": 0.8348, + "step": 1559 + }, + { + "epoch": 0.1180432068404525, + "grad_norm": 2.841679334640503, + "learning_rate": 1.777320425327615e-05, + "loss": 0.8827, + "step": 1560 + }, + { + "epoch": 0.11811887556278612, + "grad_norm": 2.7715628147125244, + "learning_rate": 1.777270155725012e-05, + "loss": 0.8298, + "step": 1561 + }, + { + "epoch": 0.11819454428511975, + "grad_norm": 3.1917660236358643, + "learning_rate": 1.777219831184915e-05, + "loss": 0.8082, + "step": 1562 + }, + { + "epoch": 0.11827021300745337, + "grad_norm": 3.6017658710479736, + "learning_rate": 1.7771694517104746e-05, + "loss": 0.7245, + "step": 1563 + }, + { + "epoch": 0.118345881729787, + "grad_norm": 3.7225780487060547, + "learning_rate": 1.777119017304846e-05, + "loss": 0.7443, + "step": 1564 + }, + { + "epoch": 0.11842155045212062, + "grad_norm": 3.468682289123535, + "learning_rate": 1.7770685279711877e-05, + "loss": 0.7181, + "step": 1565 + }, + { + "epoch": 0.11849721917445424, + "grad_norm": 3.647542715072632, + "learning_rate": 1.7770179837126613e-05, + "loss": 0.7155, + "step": 1566 + }, + { + "epoch": 0.11857288789678787, + "grad_norm": 3.232402801513672, + "learning_rate": 1.7769673845324322e-05, + "loss": 0.7418, + "step": 1567 + }, + { + "epoch": 0.11864855661912149, + "grad_norm": 2.8265175819396973, + "learning_rate": 1.776916730433669e-05, + "loss": 0.7867, + "step": 1568 + }, + { + "epoch": 0.1187242253414551, + "grad_norm": 2.74609637260437, + "learning_rate": 1.7768660214195437e-05, + "loss": 0.7622, + "step": 1569 + }, + { + "epoch": 0.11879989406378873, + "grad_norm": 2.700554609298706, + "learning_rate": 1.7768152574932323e-05, + "loss": 0.9818, + "step": 1570 + }, + { + "epoch": 0.11887556278612235, + "grad_norm": 2.617316722869873, + "learning_rate": 1.776764438657913e-05, + "loss": 0.8502, + "step": 1571 + }, + { + "epoch": 0.11895123150845598, + "grad_norm": 2.603131055831909, + "learning_rate": 1.7767135649167694e-05, + "loss": 0.7249, + "step": 1572 + }, + { + "epoch": 0.1190269002307896, + "grad_norm": 2.8606648445129395, + "learning_rate": 1.7766626362729864e-05, + "loss": 0.7766, + "step": 1573 + }, + { + "epoch": 0.11910256895312323, + "grad_norm": 3.8220481872558594, + "learning_rate": 1.776611652729754e-05, + "loss": 0.7473, + "step": 1574 + }, + { + "epoch": 0.11917823767545685, + "grad_norm": 2.6276204586029053, + "learning_rate": 1.7765606142902642e-05, + "loss": 0.8983, + "step": 1575 + }, + { + "epoch": 0.11925390639779047, + "grad_norm": 1.8426728248596191, + "learning_rate": 1.7765095209577137e-05, + "loss": 1.0027, + "step": 1576 + }, + { + "epoch": 0.1193295751201241, + "grad_norm": 3.0207700729370117, + "learning_rate": 1.776458372735302e-05, + "loss": 0.7059, + "step": 1577 + }, + { + "epoch": 0.11940524384245772, + "grad_norm": 2.4392471313476562, + "learning_rate": 1.776407169626232e-05, + "loss": 0.6909, + "step": 1578 + }, + { + "epoch": 0.11948091256479135, + "grad_norm": 3.044403076171875, + "learning_rate": 1.7763559116337107e-05, + "loss": 0.7622, + "step": 1579 + }, + { + "epoch": 
0.11955658128712497, + "grad_norm": 2.991702079772949, + "learning_rate": 1.776304598760948e-05, + "loss": 0.7587, + "step": 1580 + }, + { + "epoch": 0.1196322500094586, + "grad_norm": 2.4249448776245117, + "learning_rate": 1.7762532310111565e-05, + "loss": 0.6753, + "step": 1581 + }, + { + "epoch": 0.11970791873179222, + "grad_norm": 2.8478915691375732, + "learning_rate": 1.7762018083875536e-05, + "loss": 0.6938, + "step": 1582 + }, + { + "epoch": 0.11978358745412583, + "grad_norm": 2.7412543296813965, + "learning_rate": 1.7761503308933594e-05, + "loss": 0.6663, + "step": 1583 + }, + { + "epoch": 0.11985925617645946, + "grad_norm": 2.393305540084839, + "learning_rate": 1.776098798531798e-05, + "loss": 0.9015, + "step": 1584 + }, + { + "epoch": 0.11993492489879308, + "grad_norm": 3.5225398540496826, + "learning_rate": 1.776047211306096e-05, + "loss": 0.7928, + "step": 1585 + }, + { + "epoch": 0.1200105936211267, + "grad_norm": 2.1863973140716553, + "learning_rate": 1.7759955692194843e-05, + "loss": 0.8247, + "step": 1586 + }, + { + "epoch": 0.12008626234346033, + "grad_norm": 2.749263048171997, + "learning_rate": 1.7759438722751962e-05, + "loss": 0.7481, + "step": 1587 + }, + { + "epoch": 0.12016193106579395, + "grad_norm": 2.8146281242370605, + "learning_rate": 1.7758921204764704e-05, + "loss": 0.8196, + "step": 1588 + }, + { + "epoch": 0.12023759978812758, + "grad_norm": 2.485360622406006, + "learning_rate": 1.7758403138265465e-05, + "loss": 0.7325, + "step": 1589 + }, + { + "epoch": 0.1203132685104612, + "grad_norm": 2.7749016284942627, + "learning_rate": 1.7757884523286697e-05, + "loss": 0.9098, + "step": 1590 + }, + { + "epoch": 0.12038893723279483, + "grad_norm": 2.742647647857666, + "learning_rate": 1.775736535986087e-05, + "loss": 0.8401, + "step": 1591 + }, + { + "epoch": 0.12046460595512845, + "grad_norm": 2.826692581176758, + "learning_rate": 1.7756845648020502e-05, + "loss": 0.8449, + "step": 1592 + }, + { + "epoch": 0.12054027467746208, + "grad_norm": 3.2634994983673096, + "learning_rate": 1.7756325387798138e-05, + "loss": 0.7922, + "step": 1593 + }, + { + "epoch": 0.1206159433997957, + "grad_norm": 2.3694705963134766, + "learning_rate": 1.7755804579226352e-05, + "loss": 0.8471, + "step": 1594 + }, + { + "epoch": 0.12069161212212932, + "grad_norm": 2.5997414588928223, + "learning_rate": 1.775528322233777e-05, + "loss": 0.7086, + "step": 1595 + }, + { + "epoch": 0.12076728084446293, + "grad_norm": 2.655085802078247, + "learning_rate": 1.775476131716503e-05, + "loss": 0.746, + "step": 1596 + }, + { + "epoch": 0.12084294956679656, + "grad_norm": 2.6947407722473145, + "learning_rate": 1.7754238863740822e-05, + "loss": 0.8031, + "step": 1597 + }, + { + "epoch": 0.12091861828913018, + "grad_norm": 3.005265951156616, + "learning_rate": 1.775371586209786e-05, + "loss": 0.7431, + "step": 1598 + }, + { + "epoch": 0.12099428701146381, + "grad_norm": 2.7612268924713135, + "learning_rate": 1.7753192312268897e-05, + "loss": 0.8103, + "step": 1599 + }, + { + "epoch": 0.12106995573379743, + "grad_norm": 2.702098846435547, + "learning_rate": 1.775266821428672e-05, + "loss": 0.7749, + "step": 1600 + }, + { + "epoch": 0.12114562445613106, + "grad_norm": 3.0759353637695312, + "learning_rate": 1.7752143568184155e-05, + "loss": 0.7911, + "step": 1601 + }, + { + "epoch": 0.12122129317846468, + "grad_norm": 2.7693514823913574, + "learning_rate": 1.7751618373994046e-05, + "loss": 0.7506, + "step": 1602 + }, + { + "epoch": 0.1212969619007983, + "grad_norm": 2.7553870677948, + "learning_rate": 
1.775109263174929e-05, + "loss": 0.6945, + "step": 1603 + }, + { + "epoch": 0.12137263062313193, + "grad_norm": 3.5849199295043945, + "learning_rate": 1.7750566341482813e-05, + "loss": 0.9296, + "step": 1604 + }, + { + "epoch": 0.12144829934546555, + "grad_norm": 2.122450828552246, + "learning_rate": 1.7750039503227564e-05, + "loss": 0.8155, + "step": 1605 + }, + { + "epoch": 0.12152396806779918, + "grad_norm": 2.7267143726348877, + "learning_rate": 1.774951211701654e-05, + "loss": 0.8056, + "step": 1606 + }, + { + "epoch": 0.1215996367901328, + "grad_norm": 2.3027350902557373, + "learning_rate": 1.774898418288277e-05, + "loss": 0.923, + "step": 1607 + }, + { + "epoch": 0.12167530551246643, + "grad_norm": 2.8576319217681885, + "learning_rate": 1.774845570085931e-05, + "loss": 0.7195, + "step": 1608 + }, + { + "epoch": 0.12175097423480005, + "grad_norm": 2.155660390853882, + "learning_rate": 1.7747926670979264e-05, + "loss": 0.6233, + "step": 1609 + }, + { + "epoch": 0.12182664295713366, + "grad_norm": 6.00687837600708, + "learning_rate": 1.774739709327575e-05, + "loss": 0.8258, + "step": 1610 + }, + { + "epoch": 0.12190231167946729, + "grad_norm": 2.7313666343688965, + "learning_rate": 1.774686696778194e-05, + "loss": 0.7564, + "step": 1611 + }, + { + "epoch": 0.12197798040180091, + "grad_norm": 2.975884199142456, + "learning_rate": 1.774633629453103e-05, + "loss": 0.8001, + "step": 1612 + }, + { + "epoch": 0.12205364912413454, + "grad_norm": 2.4056174755096436, + "learning_rate": 1.7745805073556252e-05, + "loss": 0.6628, + "step": 1613 + }, + { + "epoch": 0.12212931784646816, + "grad_norm": 2.666964292526245, + "learning_rate": 1.7745273304890872e-05, + "loss": 0.7826, + "step": 1614 + }, + { + "epoch": 0.12220498656880179, + "grad_norm": 2.7258975505828857, + "learning_rate": 1.7744740988568195e-05, + "loss": 0.6598, + "step": 1615 + }, + { + "epoch": 0.12228065529113541, + "grad_norm": 3.982149600982666, + "learning_rate": 1.774420812462155e-05, + "loss": 0.8036, + "step": 1616 + }, + { + "epoch": 0.12235632401346903, + "grad_norm": 2.672240734100342, + "learning_rate": 1.7743674713084312e-05, + "loss": 0.7409, + "step": 1617 + }, + { + "epoch": 0.12243199273580266, + "grad_norm": 2.278903007507324, + "learning_rate": 1.774314075398988e-05, + "loss": 0.7339, + "step": 1618 + }, + { + "epoch": 0.12250766145813628, + "grad_norm": 3.2077767848968506, + "learning_rate": 1.7742606247371698e-05, + "loss": 0.859, + "step": 1619 + }, + { + "epoch": 0.12258333018046991, + "grad_norm": 2.292569398880005, + "learning_rate": 1.7742071193263233e-05, + "loss": 0.73, + "step": 1620 + }, + { + "epoch": 0.12265899890280353, + "grad_norm": 2.9495203495025635, + "learning_rate": 1.7741535591697998e-05, + "loss": 0.8434, + "step": 1621 + }, + { + "epoch": 0.12273466762513716, + "grad_norm": 2.9479892253875732, + "learning_rate": 1.7740999442709528e-05, + "loss": 0.7948, + "step": 1622 + }, + { + "epoch": 0.12281033634747077, + "grad_norm": 2.773385763168335, + "learning_rate": 1.7740462746331402e-05, + "loss": 0.8904, + "step": 1623 + }, + { + "epoch": 0.12288600506980439, + "grad_norm": 3.8508312702178955, + "learning_rate": 1.773992550259723e-05, + "loss": 0.7778, + "step": 1624 + }, + { + "epoch": 0.12296167379213802, + "grad_norm": 3.088562488555908, + "learning_rate": 1.7739387711540655e-05, + "loss": 0.7213, + "step": 1625 + }, + { + "epoch": 0.12303734251447164, + "grad_norm": 2.9575798511505127, + "learning_rate": 1.7738849373195352e-05, + "loss": 0.8504, + "step": 1626 + }, + { + "epoch": 
0.12311301123680526, + "grad_norm": 4.4509596824646, + "learning_rate": 1.7738310487595038e-05, + "loss": 0.8436, + "step": 1627 + }, + { + "epoch": 0.12318867995913889, + "grad_norm": 3.1381912231445312, + "learning_rate": 1.773777105477346e-05, + "loss": 0.7903, + "step": 1628 + }, + { + "epoch": 0.12326434868147251, + "grad_norm": 2.663259267807007, + "learning_rate": 1.773723107476439e-05, + "loss": 0.8887, + "step": 1629 + }, + { + "epoch": 0.12334001740380614, + "grad_norm": 2.6209001541137695, + "learning_rate": 1.773669054760166e-05, + "loss": 0.8118, + "step": 1630 + }, + { + "epoch": 0.12341568612613976, + "grad_norm": 2.6906800270080566, + "learning_rate": 1.7736149473319102e-05, + "loss": 0.8902, + "step": 1631 + }, + { + "epoch": 0.12349135484847339, + "grad_norm": 2.746408700942993, + "learning_rate": 1.7735607851950613e-05, + "loss": 0.8419, + "step": 1632 + }, + { + "epoch": 0.12356702357080701, + "grad_norm": 4.190911293029785, + "learning_rate": 1.7735065683530103e-05, + "loss": 0.8135, + "step": 1633 + }, + { + "epoch": 0.12364269229314064, + "grad_norm": 4.614360332489014, + "learning_rate": 1.7734522968091528e-05, + "loss": 0.8491, + "step": 1634 + }, + { + "epoch": 0.12371836101547426, + "grad_norm": 3.484330892562866, + "learning_rate": 1.7733979705668877e-05, + "loss": 0.8769, + "step": 1635 + }, + { + "epoch": 0.12379402973780787, + "grad_norm": 2.535391092300415, + "learning_rate": 1.7733435896296164e-05, + "loss": 0.8932, + "step": 1636 + }, + { + "epoch": 0.1238696984601415, + "grad_norm": 2.774635076522827, + "learning_rate": 1.773289154000745e-05, + "loss": 1.014, + "step": 1637 + }, + { + "epoch": 0.12394536718247512, + "grad_norm": 2.7426695823669434, + "learning_rate": 1.773234663683682e-05, + "loss": 0.6978, + "step": 1638 + }, + { + "epoch": 0.12402103590480874, + "grad_norm": 2.296440362930298, + "learning_rate": 1.77318011868184e-05, + "loss": 0.8189, + "step": 1639 + }, + { + "epoch": 0.12409670462714237, + "grad_norm": 2.881760835647583, + "learning_rate": 1.773125518998635e-05, + "loss": 0.8082, + "step": 1640 + }, + { + "epoch": 0.12417237334947599, + "grad_norm": 2.310656785964966, + "learning_rate": 1.773070864637486e-05, + "loss": 0.8178, + "step": 1641 + }, + { + "epoch": 0.12424804207180962, + "grad_norm": 3.410867691040039, + "learning_rate": 1.7730161556018154e-05, + "loss": 0.8674, + "step": 1642 + }, + { + "epoch": 0.12432371079414324, + "grad_norm": 2.032334327697754, + "learning_rate": 1.7729613918950496e-05, + "loss": 0.8109, + "step": 1643 + }, + { + "epoch": 0.12439937951647687, + "grad_norm": 2.293539047241211, + "learning_rate": 1.7729065735206177e-05, + "loss": 0.8798, + "step": 1644 + }, + { + "epoch": 0.12447504823881049, + "grad_norm": 2.2878317832946777, + "learning_rate": 1.7728517004819527e-05, + "loss": 0.7187, + "step": 1645 + }, + { + "epoch": 0.12455071696114411, + "grad_norm": 2.5122296810150146, + "learning_rate": 1.772796772782492e-05, + "loss": 0.8013, + "step": 1646 + }, + { + "epoch": 0.12462638568347774, + "grad_norm": 2.389878511428833, + "learning_rate": 1.7727417904256734e-05, + "loss": 0.8499, + "step": 1647 + }, + { + "epoch": 0.12470205440581136, + "grad_norm": 2.666120767593384, + "learning_rate": 1.7726867534149413e-05, + "loss": 0.723, + "step": 1648 + }, + { + "epoch": 0.12477772312814499, + "grad_norm": 3.555640697479248, + "learning_rate": 1.7726316617537424e-05, + "loss": 0.8265, + "step": 1649 + }, + { + "epoch": 0.1248533918504786, + "grad_norm": 2.2089498043060303, + "learning_rate": 
1.7725765154455262e-05, + "loss": 0.8063, + "step": 1650 + }, + { + "epoch": 0.12492906057281222, + "grad_norm": 4.1747918128967285, + "learning_rate": 1.7725213144937464e-05, + "loss": 0.9545, + "step": 1651 + }, + { + "epoch": 0.12500472929514586, + "grad_norm": 2.3813788890838623, + "learning_rate": 1.7724660589018597e-05, + "loss": 0.8837, + "step": 1652 + }, + { + "epoch": 0.12508039801747947, + "grad_norm": 3.1693472862243652, + "learning_rate": 1.7724107486733268e-05, + "loss": 0.8958, + "step": 1653 + }, + { + "epoch": 0.1251560667398131, + "grad_norm": 2.4706733226776123, + "learning_rate": 1.772355383811611e-05, + "loss": 0.8077, + "step": 1654 + }, + { + "epoch": 0.12523173546214672, + "grad_norm": 2.8894009590148926, + "learning_rate": 1.7722999643201794e-05, + "loss": 0.7148, + "step": 1655 + }, + { + "epoch": 0.12530740418448036, + "grad_norm": 2.7157387733459473, + "learning_rate": 1.7722444902025025e-05, + "loss": 0.72, + "step": 1656 + }, + { + "epoch": 0.12538307290681397, + "grad_norm": 4.078158855438232, + "learning_rate": 1.7721889614620548e-05, + "loss": 0.7737, + "step": 1657 + }, + { + "epoch": 0.12545874162914758, + "grad_norm": 2.9151313304901123, + "learning_rate": 1.772133378102313e-05, + "loss": 0.8545, + "step": 1658 + }, + { + "epoch": 0.12553441035148122, + "grad_norm": 5.656601905822754, + "learning_rate": 1.7720777401267586e-05, + "loss": 0.8819, + "step": 1659 + }, + { + "epoch": 0.12561007907381483, + "grad_norm": 2.7481420040130615, + "learning_rate": 1.7720220475388756e-05, + "loss": 0.7213, + "step": 1660 + }, + { + "epoch": 0.12568574779614847, + "grad_norm": 3.2132253646850586, + "learning_rate": 1.771966300342151e-05, + "loss": 0.8183, + "step": 1661 + }, + { + "epoch": 0.12576141651848208, + "grad_norm": 2.4579145908355713, + "learning_rate": 1.771910498540077e-05, + "loss": 0.8897, + "step": 1662 + }, + { + "epoch": 0.12583708524081572, + "grad_norm": 2.608400344848633, + "learning_rate": 1.7718546421361465e-05, + "loss": 0.8401, + "step": 1663 + }, + { + "epoch": 0.12591275396314933, + "grad_norm": 2.6007118225097656, + "learning_rate": 1.771798731133859e-05, + "loss": 0.906, + "step": 1664 + }, + { + "epoch": 0.12598842268548296, + "grad_norm": 2.439805746078491, + "learning_rate": 1.7717427655367153e-05, + "loss": 0.9405, + "step": 1665 + }, + { + "epoch": 0.12606409140781658, + "grad_norm": 2.689601421356201, + "learning_rate": 1.7716867453482198e-05, + "loss": 0.8398, + "step": 1666 + }, + { + "epoch": 0.1261397601301502, + "grad_norm": 2.9567224979400635, + "learning_rate": 1.7716306705718814e-05, + "loss": 0.8278, + "step": 1667 + }, + { + "epoch": 0.12621542885248382, + "grad_norm": 3.0706992149353027, + "learning_rate": 1.7715745412112107e-05, + "loss": 0.7353, + "step": 1668 + }, + { + "epoch": 0.12629109757481746, + "grad_norm": 2.521843910217285, + "learning_rate": 1.7715183572697234e-05, + "loss": 0.91, + "step": 1669 + }, + { + "epoch": 0.12636676629715107, + "grad_norm": 2.7420334815979004, + "learning_rate": 1.771462118750938e-05, + "loss": 0.7666, + "step": 1670 + }, + { + "epoch": 0.12644243501948468, + "grad_norm": 2.520315647125244, + "learning_rate": 1.7714058256583758e-05, + "loss": 0.8706, + "step": 1671 + }, + { + "epoch": 0.12651810374181832, + "grad_norm": 2.752073287963867, + "learning_rate": 1.7713494779955625e-05, + "loss": 0.9659, + "step": 1672 + }, + { + "epoch": 0.12659377246415193, + "grad_norm": 2.854764223098755, + "learning_rate": 1.771293075766026e-05, + "loss": 0.8267, + "step": 1673 + }, + { + "epoch": 
0.12666944118648557, + "grad_norm": 2.201077938079834, + "learning_rate": 1.7712366189732995e-05, + "loss": 0.7964, + "step": 1674 + }, + { + "epoch": 0.12674510990881918, + "grad_norm": 3.6953394412994385, + "learning_rate": 1.7711801076209182e-05, + "loss": 0.7227, + "step": 1675 + }, + { + "epoch": 0.12682077863115282, + "grad_norm": 2.4317433834075928, + "learning_rate": 1.7711235417124207e-05, + "loss": 0.8278, + "step": 1676 + }, + { + "epoch": 0.12689644735348643, + "grad_norm": 3.292365550994873, + "learning_rate": 1.771066921251349e-05, + "loss": 0.8908, + "step": 1677 + }, + { + "epoch": 0.12697211607582007, + "grad_norm": 2.3354146480560303, + "learning_rate": 1.7710102462412498e-05, + "loss": 0.7178, + "step": 1678 + }, + { + "epoch": 0.12704778479815368, + "grad_norm": 2.138141393661499, + "learning_rate": 1.7709535166856718e-05, + "loss": 0.7817, + "step": 1679 + }, + { + "epoch": 0.12712345352048732, + "grad_norm": 2.597642183303833, + "learning_rate": 1.7708967325881675e-05, + "loss": 0.7315, + "step": 1680 + }, + { + "epoch": 0.12719912224282093, + "grad_norm": 2.2730774879455566, + "learning_rate": 1.7708398939522927e-05, + "loss": 0.7304, + "step": 1681 + }, + { + "epoch": 0.12727479096515457, + "grad_norm": 2.7624247074127197, + "learning_rate": 1.7707830007816073e-05, + "loss": 0.8055, + "step": 1682 + }, + { + "epoch": 0.12735045968748818, + "grad_norm": 2.6193227767944336, + "learning_rate": 1.770726053079674e-05, + "loss": 0.8424, + "step": 1683 + }, + { + "epoch": 0.1274261284098218, + "grad_norm": 2.64697265625, + "learning_rate": 1.770669050850059e-05, + "loss": 0.8105, + "step": 1684 + }, + { + "epoch": 0.12750179713215543, + "grad_norm": 2.303603172302246, + "learning_rate": 1.770611994096332e-05, + "loss": 0.8021, + "step": 1685 + }, + { + "epoch": 0.12757746585448904, + "grad_norm": 2.714512348175049, + "learning_rate": 1.7705548828220657e-05, + "loss": 0.7102, + "step": 1686 + }, + { + "epoch": 0.12765313457682267, + "grad_norm": 3.6520884037017822, + "learning_rate": 1.7704977170308372e-05, + "loss": 0.7449, + "step": 1687 + }, + { + "epoch": 0.12772880329915628, + "grad_norm": 3.0547521114349365, + "learning_rate": 1.770440496726226e-05, + "loss": 0.8081, + "step": 1688 + }, + { + "epoch": 0.12780447202148992, + "grad_norm": 2.003868579864502, + "learning_rate": 1.770383221911816e-05, + "loss": 0.9698, + "step": 1689 + }, + { + "epoch": 0.12788014074382353, + "grad_norm": 2.442770481109619, + "learning_rate": 1.7703258925911927e-05, + "loss": 0.6974, + "step": 1690 + }, + { + "epoch": 0.12795580946615717, + "grad_norm": 4.01525354385376, + "learning_rate": 1.7702685087679475e-05, + "loss": 0.8011, + "step": 1691 + }, + { + "epoch": 0.12803147818849078, + "grad_norm": 2.3456971645355225, + "learning_rate": 1.7702110704456735e-05, + "loss": 0.7804, + "step": 1692 + }, + { + "epoch": 0.12810714691082442, + "grad_norm": 3.353431224822998, + "learning_rate": 1.7701535776279678e-05, + "loss": 0.648, + "step": 1693 + }, + { + "epoch": 0.12818281563315803, + "grad_norm": 3.051726818084717, + "learning_rate": 1.7700960303184303e-05, + "loss": 0.6802, + "step": 1694 + }, + { + "epoch": 0.12825848435549167, + "grad_norm": 2.835416316986084, + "learning_rate": 1.7700384285206653e-05, + "loss": 0.734, + "step": 1695 + }, + { + "epoch": 0.12833415307782528, + "grad_norm": 2.5643491744995117, + "learning_rate": 1.7699807722382798e-05, + "loss": 0.6028, + "step": 1696 + }, + { + "epoch": 0.1284098218001589, + "grad_norm": 2.56653094291687, + "learning_rate": 
1.7699230614748846e-05, + "loss": 0.7887, + "step": 1697 + }, + { + "epoch": 0.12848549052249253, + "grad_norm": 3.0402891635894775, + "learning_rate": 1.7698652962340934e-05, + "loss": 0.7655, + "step": 1698 + }, + { + "epoch": 0.12856115924482614, + "grad_norm": 2.826277017593384, + "learning_rate": 1.769807476519524e-05, + "loss": 0.9516, + "step": 1699 + }, + { + "epoch": 0.12863682796715978, + "grad_norm": 2.662139415740967, + "learning_rate": 1.7697496023347972e-05, + "loss": 0.6862, + "step": 1700 + }, + { + "epoch": 0.1287124966894934, + "grad_norm": 2.36557936668396, + "learning_rate": 1.769691673683537e-05, + "loss": 0.6471, + "step": 1701 + }, + { + "epoch": 0.12878816541182703, + "grad_norm": 2.5000391006469727, + "learning_rate": 1.7696336905693713e-05, + "loss": 0.7876, + "step": 1702 + }, + { + "epoch": 0.12886383413416064, + "grad_norm": 2.8562798500061035, + "learning_rate": 1.7695756529959313e-05, + "loss": 0.7194, + "step": 1703 + }, + { + "epoch": 0.12893950285649428, + "grad_norm": 2.4133670330047607, + "learning_rate": 1.7695175609668516e-05, + "loss": 0.816, + "step": 1704 + }, + { + "epoch": 0.1290151715788279, + "grad_norm": 2.8379642963409424, + "learning_rate": 1.7694594144857696e-05, + "loss": 0.8176, + "step": 1705 + }, + { + "epoch": 0.12909084030116152, + "grad_norm": 2.2420616149902344, + "learning_rate": 1.769401213556327e-05, + "loss": 0.7886, + "step": 1706 + }, + { + "epoch": 0.12916650902349514, + "grad_norm": 2.5862045288085938, + "learning_rate": 1.769342958182168e-05, + "loss": 0.723, + "step": 1707 + }, + { + "epoch": 0.12924217774582877, + "grad_norm": 2.9615068435668945, + "learning_rate": 1.7692846483669416e-05, + "loss": 0.8738, + "step": 1708 + }, + { + "epoch": 0.12931784646816238, + "grad_norm": 2.824129581451416, + "learning_rate": 1.7692262841142988e-05, + "loss": 0.7763, + "step": 1709 + }, + { + "epoch": 0.12939351519049602, + "grad_norm": 2.8640074729919434, + "learning_rate": 1.7691678654278947e-05, + "loss": 0.7885, + "step": 1710 + }, + { + "epoch": 0.12946918391282963, + "grad_norm": 3.950695753097534, + "learning_rate": 1.7691093923113875e-05, + "loss": 0.8261, + "step": 1711 + }, + { + "epoch": 0.12954485263516324, + "grad_norm": 2.261723041534424, + "learning_rate": 1.769050864768439e-05, + "loss": 0.806, + "step": 1712 + }, + { + "epoch": 0.12962052135749688, + "grad_norm": 2.690190076828003, + "learning_rate": 1.768992282802715e-05, + "loss": 0.7319, + "step": 1713 + }, + { + "epoch": 0.1296961900798305, + "grad_norm": 3.1310863494873047, + "learning_rate": 1.768933646417883e-05, + "loss": 0.6613, + "step": 1714 + }, + { + "epoch": 0.12977185880216413, + "grad_norm": 2.7063982486724854, + "learning_rate": 1.768874955617616e-05, + "loss": 0.8432, + "step": 1715 + }, + { + "epoch": 0.12984752752449774, + "grad_norm": 2.001281499862671, + "learning_rate": 1.768816210405589e-05, + "loss": 0.6304, + "step": 1716 + }, + { + "epoch": 0.12992319624683138, + "grad_norm": 2.623138189315796, + "learning_rate": 1.7687574107854808e-05, + "loss": 0.8827, + "step": 1717 + }, + { + "epoch": 0.129998864969165, + "grad_norm": 3.3270182609558105, + "learning_rate": 1.7686985567609735e-05, + "loss": 0.6884, + "step": 1718 + }, + { + "epoch": 0.13007453369149863, + "grad_norm": 3.246429204940796, + "learning_rate": 1.7686396483357528e-05, + "loss": 0.8164, + "step": 1719 + }, + { + "epoch": 0.13015020241383224, + "grad_norm": 2.8202688694000244, + "learning_rate": 1.7685806855135077e-05, + "loss": 0.7462, + "step": 1720 + }, + { + "epoch": 
0.13022587113616588, + "grad_norm": 1.7720720767974854, + "learning_rate": 1.768521668297931e-05, + "loss": 0.9133, + "step": 1721 + }, + { + "epoch": 0.1303015398584995, + "grad_norm": 2.7071969509124756, + "learning_rate": 1.768462596692718e-05, + "loss": 0.857, + "step": 1722 + }, + { + "epoch": 0.13037720858083313, + "grad_norm": 2.4455504417419434, + "learning_rate": 1.7684034707015686e-05, + "loss": 0.7251, + "step": 1723 + }, + { + "epoch": 0.13045287730316674, + "grad_norm": 2.843379020690918, + "learning_rate": 1.768344290328185e-05, + "loss": 0.9646, + "step": 1724 + }, + { + "epoch": 0.13052854602550035, + "grad_norm": 3.5226151943206787, + "learning_rate": 1.7682850555762735e-05, + "loss": 0.7941, + "step": 1725 + }, + { + "epoch": 0.13060421474783399, + "grad_norm": 2.8279130458831787, + "learning_rate": 1.768225766449543e-05, + "loss": 0.7988, + "step": 1726 + }, + { + "epoch": 0.1306798834701676, + "grad_norm": 3.3851380348205566, + "learning_rate": 1.7681664229517074e-05, + "loss": 0.7569, + "step": 1727 + }, + { + "epoch": 0.13075555219250123, + "grad_norm": 2.9672915935516357, + "learning_rate": 1.7681070250864817e-05, + "loss": 0.8753, + "step": 1728 + }, + { + "epoch": 0.13083122091483484, + "grad_norm": 2.609323740005493, + "learning_rate": 1.768047572857587e-05, + "loss": 0.7996, + "step": 1729 + }, + { + "epoch": 0.13090688963716848, + "grad_norm": 2.4190008640289307, + "learning_rate": 1.7679880662687453e-05, + "loss": 0.7647, + "step": 1730 + }, + { + "epoch": 0.1309825583595021, + "grad_norm": 3.3898887634277344, + "learning_rate": 1.7679285053236838e-05, + "loss": 0.7903, + "step": 1731 + }, + { + "epoch": 0.13105822708183573, + "grad_norm": 2.466832160949707, + "learning_rate": 1.767868890026132e-05, + "loss": 0.6835, + "step": 1732 + }, + { + "epoch": 0.13113389580416934, + "grad_norm": 3.120836019515991, + "learning_rate": 1.767809220379823e-05, + "loss": 0.7485, + "step": 1733 + }, + { + "epoch": 0.13120956452650298, + "grad_norm": 2.3976681232452393, + "learning_rate": 1.7677494963884935e-05, + "loss": 0.7093, + "step": 1734 + }, + { + "epoch": 0.1312852332488366, + "grad_norm": 3.236889600753784, + "learning_rate": 1.7676897180558844e-05, + "loss": 0.7506, + "step": 1735 + }, + { + "epoch": 0.13136090197117023, + "grad_norm": 3.0290706157684326, + "learning_rate": 1.7676298853857387e-05, + "loss": 0.7999, + "step": 1736 + }, + { + "epoch": 0.13143657069350384, + "grad_norm": 2.8244903087615967, + "learning_rate": 1.767569998381803e-05, + "loss": 0.6637, + "step": 1737 + }, + { + "epoch": 0.13151223941583745, + "grad_norm": 2.6339778900146484, + "learning_rate": 1.7675100570478282e-05, + "loss": 0.81, + "step": 1738 + }, + { + "epoch": 0.1315879081381711, + "grad_norm": 3.118966817855835, + "learning_rate": 1.7674500613875678e-05, + "loss": 0.8709, + "step": 1739 + }, + { + "epoch": 0.1316635768605047, + "grad_norm": 3.025437593460083, + "learning_rate": 1.767390011404779e-05, + "loss": 0.8147, + "step": 1740 + }, + { + "epoch": 0.13173924558283834, + "grad_norm": 3.028900623321533, + "learning_rate": 1.767329907103222e-05, + "loss": 0.8358, + "step": 1741 + }, + { + "epoch": 0.13181491430517195, + "grad_norm": 2.833244800567627, + "learning_rate": 1.767269748486661e-05, + "loss": 0.7973, + "step": 1742 + }, + { + "epoch": 0.1318905830275056, + "grad_norm": 2.8717784881591797, + "learning_rate": 1.7672095355588632e-05, + "loss": 0.8679, + "step": 1743 + }, + { + "epoch": 0.1319662517498392, + "grad_norm": 2.449739456176758, + "learning_rate": 
1.7671492683235993e-05, + "loss": 0.9128, + "step": 1744 + }, + { + "epoch": 0.13204192047217284, + "grad_norm": 4.743024826049805, + "learning_rate": 1.7670889467846435e-05, + "loss": 0.884, + "step": 1745 + }, + { + "epoch": 0.13211758919450645, + "grad_norm": 2.698247194290161, + "learning_rate": 1.7670285709457732e-05, + "loss": 0.7641, + "step": 1746 + }, + { + "epoch": 0.13219325791684008, + "grad_norm": 2.852203607559204, + "learning_rate": 1.76696814081077e-05, + "loss": 0.7957, + "step": 1747 + }, + { + "epoch": 0.1322689266391737, + "grad_norm": 2.8371152877807617, + "learning_rate": 1.7669076563834174e-05, + "loss": 0.8086, + "step": 1748 + }, + { + "epoch": 0.13234459536150733, + "grad_norm": 2.7854745388031006, + "learning_rate": 1.7668471176675033e-05, + "loss": 0.9564, + "step": 1749 + }, + { + "epoch": 0.13242026408384094, + "grad_norm": 4.4708051681518555, + "learning_rate": 1.7667865246668193e-05, + "loss": 0.7811, + "step": 1750 + }, + { + "epoch": 0.13249593280617455, + "grad_norm": 3.00451922416687, + "learning_rate": 1.7667258773851596e-05, + "loss": 0.9362, + "step": 1751 + }, + { + "epoch": 0.1325716015285082, + "grad_norm": 2.8037302494049072, + "learning_rate": 1.7666651758263218e-05, + "loss": 0.8856, + "step": 1752 + }, + { + "epoch": 0.1326472702508418, + "grad_norm": 2.7416181564331055, + "learning_rate": 1.7666044199941077e-05, + "loss": 0.7625, + "step": 1753 + }, + { + "epoch": 0.13272293897317544, + "grad_norm": 3.8341903686523438, + "learning_rate": 1.766543609892322e-05, + "loss": 0.7551, + "step": 1754 + }, + { + "epoch": 0.13279860769550905, + "grad_norm": 2.7719779014587402, + "learning_rate": 1.7664827455247725e-05, + "loss": 0.6949, + "step": 1755 + }, + { + "epoch": 0.1328742764178427, + "grad_norm": 11.289454460144043, + "learning_rate": 1.766421826895271e-05, + "loss": 0.7725, + "step": 1756 + }, + { + "epoch": 0.1329499451401763, + "grad_norm": 4.495529651641846, + "learning_rate": 1.7663608540076325e-05, + "loss": 0.935, + "step": 1757 + }, + { + "epoch": 0.13302561386250994, + "grad_norm": 3.4977807998657227, + "learning_rate": 1.766299826865675e-05, + "loss": 0.7704, + "step": 1758 + }, + { + "epoch": 0.13310128258484355, + "grad_norm": 3.455537796020508, + "learning_rate": 1.7662387454732206e-05, + "loss": 0.7805, + "step": 1759 + }, + { + "epoch": 0.1331769513071772, + "grad_norm": 2.7036778926849365, + "learning_rate": 1.766177609834094e-05, + "loss": 0.7823, + "step": 1760 + }, + { + "epoch": 0.1332526200295108, + "grad_norm": 2.983051061630249, + "learning_rate": 1.7661164199521238e-05, + "loss": 0.8118, + "step": 1761 + }, + { + "epoch": 0.13332828875184444, + "grad_norm": 2.9861648082733154, + "learning_rate": 1.7660551758311424e-05, + "loss": 0.755, + "step": 1762 + }, + { + "epoch": 0.13340395747417805, + "grad_norm": 3.221959352493286, + "learning_rate": 1.7659938774749843e-05, + "loss": 0.8239, + "step": 1763 + }, + { + "epoch": 0.13347962619651166, + "grad_norm": 2.7048370838165283, + "learning_rate": 1.765932524887489e-05, + "loss": 0.7978, + "step": 1764 + }, + { + "epoch": 0.1335552949188453, + "grad_norm": 3.0908560752868652, + "learning_rate": 1.765871118072498e-05, + "loss": 0.951, + "step": 1765 + }, + { + "epoch": 0.1336309636411789, + "grad_norm": 2.1981539726257324, + "learning_rate": 1.765809657033857e-05, + "loss": 0.6185, + "step": 1766 + }, + { + "epoch": 0.13370663236351255, + "grad_norm": 2.284193277359009, + "learning_rate": 1.765748141775415e-05, + "loss": 1.0244, + "step": 1767 + }, + { + "epoch": 
0.13378230108584616, + "grad_norm": 3.8819425106048584, + "learning_rate": 1.7656865723010242e-05, + "loss": 0.7626, + "step": 1768 + }, + { + "epoch": 0.1338579698081798, + "grad_norm": 2.6897873878479004, + "learning_rate": 1.7656249486145405e-05, + "loss": 0.666, + "step": 1769 + }, + { + "epoch": 0.1339336385305134, + "grad_norm": 3.00016713142395, + "learning_rate": 1.7655632707198225e-05, + "loss": 0.8359, + "step": 1770 + }, + { + "epoch": 0.13400930725284704, + "grad_norm": 2.38024640083313, + "learning_rate": 1.7655015386207326e-05, + "loss": 0.7616, + "step": 1771 + }, + { + "epoch": 0.13408497597518065, + "grad_norm": 2.9403862953186035, + "learning_rate": 1.7654397523211374e-05, + "loss": 0.804, + "step": 1772 + }, + { + "epoch": 0.1341606446975143, + "grad_norm": 3.072727680206299, + "learning_rate": 1.7653779118249055e-05, + "loss": 0.7256, + "step": 1773 + }, + { + "epoch": 0.1342363134198479, + "grad_norm": 3.713294267654419, + "learning_rate": 1.76531601713591e-05, + "loss": 0.842, + "step": 1774 + }, + { + "epoch": 0.13431198214218154, + "grad_norm": 2.764293909072876, + "learning_rate": 1.7652540682580267e-05, + "loss": 0.7639, + "step": 1775 + }, + { + "epoch": 0.13438765086451515, + "grad_norm": 2.774491310119629, + "learning_rate": 1.765192065195135e-05, + "loss": 0.8338, + "step": 1776 + }, + { + "epoch": 0.1344633195868488, + "grad_norm": 2.9168760776519775, + "learning_rate": 1.765130007951118e-05, + "loss": 0.7613, + "step": 1777 + }, + { + "epoch": 0.1345389883091824, + "grad_norm": 2.8239054679870605, + "learning_rate": 1.7650678965298615e-05, + "loss": 0.908, + "step": 1778 + }, + { + "epoch": 0.134614657031516, + "grad_norm": 2.766092538833618, + "learning_rate": 1.7650057309352554e-05, + "loss": 0.8853, + "step": 1779 + }, + { + "epoch": 0.13469032575384965, + "grad_norm": 2.6427292823791504, + "learning_rate": 1.7649435111711926e-05, + "loss": 0.7613, + "step": 1780 + }, + { + "epoch": 0.13476599447618326, + "grad_norm": 3.0190093517303467, + "learning_rate": 1.7648812372415697e-05, + "loss": 0.7216, + "step": 1781 + }, + { + "epoch": 0.1348416631985169, + "grad_norm": 2.665872573852539, + "learning_rate": 1.7648189091502863e-05, + "loss": 0.9648, + "step": 1782 + }, + { + "epoch": 0.1349173319208505, + "grad_norm": 2.2282044887542725, + "learning_rate": 1.7647565269012458e-05, + "loss": 0.8866, + "step": 1783 + }, + { + "epoch": 0.13499300064318415, + "grad_norm": 2.46911883354187, + "learning_rate": 1.7646940904983545e-05, + "loss": 0.7876, + "step": 1784 + }, + { + "epoch": 0.13506866936551776, + "grad_norm": 2.569694757461548, + "learning_rate": 1.7646315999455224e-05, + "loss": 0.5898, + "step": 1785 + }, + { + "epoch": 0.1351443380878514, + "grad_norm": 2.6884071826934814, + "learning_rate": 1.764569055246663e-05, + "loss": 0.8368, + "step": 1786 + }, + { + "epoch": 0.135220006810185, + "grad_norm": 2.3804450035095215, + "learning_rate": 1.764506456405693e-05, + "loss": 0.8086, + "step": 1787 + }, + { + "epoch": 0.13529567553251864, + "grad_norm": 2.71864914894104, + "learning_rate": 1.7644438034265326e-05, + "loss": 0.6779, + "step": 1788 + }, + { + "epoch": 0.13537134425485225, + "grad_norm": 2.8667616844177246, + "learning_rate": 1.7643810963131053e-05, + "loss": 0.9573, + "step": 1789 + }, + { + "epoch": 0.1354470129771859, + "grad_norm": 2.3643083572387695, + "learning_rate": 1.764318335069338e-05, + "loss": 0.9273, + "step": 1790 + }, + { + "epoch": 0.1355226816995195, + "grad_norm": 2.5259487628936768, + "learning_rate": 1.764255519699161e-05, + 
"loss": 0.6811, + "step": 1791 + }, + { + "epoch": 0.13559835042185311, + "grad_norm": 3.6173150539398193, + "learning_rate": 1.764192650206508e-05, + "loss": 0.9071, + "step": 1792 + }, + { + "epoch": 0.13567401914418675, + "grad_norm": 3.5628244876861572, + "learning_rate": 1.7641297265953158e-05, + "loss": 0.7724, + "step": 1793 + }, + { + "epoch": 0.13574968786652036, + "grad_norm": 2.5445380210876465, + "learning_rate": 1.7640667488695258e-05, + "loss": 0.8766, + "step": 1794 + }, + { + "epoch": 0.135825356588854, + "grad_norm": 2.36080002784729, + "learning_rate": 1.764003717033081e-05, + "loss": 0.7998, + "step": 1795 + }, + { + "epoch": 0.1359010253111876, + "grad_norm": 2.7486581802368164, + "learning_rate": 1.763940631089929e-05, + "loss": 0.7066, + "step": 1796 + }, + { + "epoch": 0.13597669403352125, + "grad_norm": 3.2052993774414062, + "learning_rate": 1.7638774910440197e-05, + "loss": 0.7667, + "step": 1797 + }, + { + "epoch": 0.13605236275585486, + "grad_norm": 2.3594470024108887, + "learning_rate": 1.7638142968993086e-05, + "loss": 0.8154, + "step": 1798 + }, + { + "epoch": 0.1361280314781885, + "grad_norm": 3.531343936920166, + "learning_rate": 1.7637510486597517e-05, + "loss": 0.7966, + "step": 1799 + }, + { + "epoch": 0.1362037002005221, + "grad_norm": 2.1436774730682373, + "learning_rate": 1.7636877463293108e-05, + "loss": 0.9163, + "step": 1800 + }, + { + "epoch": 0.13627936892285575, + "grad_norm": 2.2388010025024414, + "learning_rate": 1.76362438991195e-05, + "loss": 0.8786, + "step": 1801 + }, + { + "epoch": 0.13635503764518936, + "grad_norm": 2.846320390701294, + "learning_rate": 1.7635609794116362e-05, + "loss": 0.8172, + "step": 1802 + }, + { + "epoch": 0.136430706367523, + "grad_norm": 2.405848264694214, + "learning_rate": 1.7634975148323405e-05, + "loss": 0.9217, + "step": 1803 + }, + { + "epoch": 0.1365063750898566, + "grad_norm": 2.645883321762085, + "learning_rate": 1.763433996178038e-05, + "loss": 0.8546, + "step": 1804 + }, + { + "epoch": 0.13658204381219022, + "grad_norm": 2.8361809253692627, + "learning_rate": 1.763370423452706e-05, + "loss": 0.8501, + "step": 1805 + }, + { + "epoch": 0.13665771253452386, + "grad_norm": 2.7826128005981445, + "learning_rate": 1.7633067966603254e-05, + "loss": 0.9248, + "step": 1806 + }, + { + "epoch": 0.13673338125685747, + "grad_norm": 2.6363344192504883, + "learning_rate": 1.7632431158048808e-05, + "loss": 0.8338, + "step": 1807 + }, + { + "epoch": 0.1368090499791911, + "grad_norm": 3.479905366897583, + "learning_rate": 1.7631793808903604e-05, + "loss": 0.8466, + "step": 1808 + }, + { + "epoch": 0.13688471870152472, + "grad_norm": 3.4462170600891113, + "learning_rate": 1.7631155919207556e-05, + "loss": 0.9066, + "step": 1809 + }, + { + "epoch": 0.13696038742385835, + "grad_norm": 3.2740397453308105, + "learning_rate": 1.76305174890006e-05, + "loss": 0.7991, + "step": 1810 + }, + { + "epoch": 0.13703605614619196, + "grad_norm": 2.6029398441314697, + "learning_rate": 1.7629878518322732e-05, + "loss": 0.7337, + "step": 1811 + }, + { + "epoch": 0.1371117248685256, + "grad_norm": 2.4479711055755615, + "learning_rate": 1.7629239007213957e-05, + "loss": 0.7885, + "step": 1812 + }, + { + "epoch": 0.1371873935908592, + "grad_norm": 2.370789051055908, + "learning_rate": 1.7628598955714322e-05, + "loss": 0.8362, + "step": 1813 + }, + { + "epoch": 0.13726306231319285, + "grad_norm": 2.8538105487823486, + "learning_rate": 1.7627958363863914e-05, + "loss": 0.6775, + "step": 1814 + }, + { + "epoch": 0.13733873103552646, + "grad_norm": 
2.297853469848633, + "learning_rate": 1.7627317231702847e-05, + "loss": 0.597, + "step": 1815 + }, + { + "epoch": 0.1374143997578601, + "grad_norm": 3.4059054851531982, + "learning_rate": 1.762667555927127e-05, + "loss": 0.6599, + "step": 1816 + }, + { + "epoch": 0.1374900684801937, + "grad_norm": 4.349469184875488, + "learning_rate": 1.762603334660937e-05, + "loss": 0.715, + "step": 1817 + }, + { + "epoch": 0.13756573720252732, + "grad_norm": 2.5945630073547363, + "learning_rate": 1.762539059375736e-05, + "loss": 0.7752, + "step": 1818 + }, + { + "epoch": 0.13764140592486096, + "grad_norm": 2.1834769248962402, + "learning_rate": 1.7624747300755493e-05, + "loss": 0.7783, + "step": 1819 + }, + { + "epoch": 0.13771707464719457, + "grad_norm": 2.6315038204193115, + "learning_rate": 1.7624103467644055e-05, + "loss": 0.7731, + "step": 1820 + }, + { + "epoch": 0.1377927433695282, + "grad_norm": 2.684382915496826, + "learning_rate": 1.7623459094463363e-05, + "loss": 0.6977, + "step": 1821 + }, + { + "epoch": 0.13786841209186182, + "grad_norm": 2.139249086380005, + "learning_rate": 1.762281418125377e-05, + "loss": 0.6902, + "step": 1822 + }, + { + "epoch": 0.13794408081419546, + "grad_norm": 5.037784099578857, + "learning_rate": 1.7622168728055665e-05, + "loss": 0.823, + "step": 1823 + }, + { + "epoch": 0.13801974953652907, + "grad_norm": 3.9115712642669678, + "learning_rate": 1.762152273490947e-05, + "loss": 0.7472, + "step": 1824 + }, + { + "epoch": 0.1380954182588627, + "grad_norm": 2.827516555786133, + "learning_rate": 1.7620876201855633e-05, + "loss": 0.8842, + "step": 1825 + }, + { + "epoch": 0.13817108698119632, + "grad_norm": 2.3763670921325684, + "learning_rate": 1.7620229128934644e-05, + "loss": 0.7502, + "step": 1826 + }, + { + "epoch": 0.13824675570352996, + "grad_norm": 2.8924078941345215, + "learning_rate": 1.7619581516187026e-05, + "loss": 0.8482, + "step": 1827 + }, + { + "epoch": 0.13832242442586357, + "grad_norm": 2.8543429374694824, + "learning_rate": 1.7618933363653333e-05, + "loss": 0.7102, + "step": 1828 + }, + { + "epoch": 0.1383980931481972, + "grad_norm": 2.7342612743377686, + "learning_rate": 1.7618284671374157e-05, + "loss": 0.7424, + "step": 1829 + }, + { + "epoch": 0.13847376187053081, + "grad_norm": 3.0927932262420654, + "learning_rate": 1.7617635439390123e-05, + "loss": 0.8179, + "step": 1830 + }, + { + "epoch": 0.13854943059286445, + "grad_norm": 2.353637933731079, + "learning_rate": 1.761698566774188e-05, + "loss": 0.7647, + "step": 1831 + }, + { + "epoch": 0.13862509931519806, + "grad_norm": 2.7687482833862305, + "learning_rate": 1.7616335356470128e-05, + "loss": 0.9106, + "step": 1832 + }, + { + "epoch": 0.13870076803753167, + "grad_norm": 4.123755931854248, + "learning_rate": 1.7615684505615587e-05, + "loss": 0.9028, + "step": 1833 + }, + { + "epoch": 0.1387764367598653, + "grad_norm": 2.1787283420562744, + "learning_rate": 1.7615033115219012e-05, + "loss": 0.6567, + "step": 1834 + }, + { + "epoch": 0.13885210548219892, + "grad_norm": 2.4294352531433105, + "learning_rate": 1.76143811853212e-05, + "loss": 0.8595, + "step": 1835 + }, + { + "epoch": 0.13892777420453256, + "grad_norm": 2.3959708213806152, + "learning_rate": 1.7613728715962978e-05, + "loss": 0.7286, + "step": 1836 + }, + { + "epoch": 0.13900344292686617, + "grad_norm": 1.9494025707244873, + "learning_rate": 1.7613075707185203e-05, + "loss": 0.721, + "step": 1837 + }, + { + "epoch": 0.1390791116491998, + "grad_norm": 1.5588613748550415, + "learning_rate": 1.7612422159028767e-05, + "loss": 0.9273, + 
"step": 1838 + }, + { + "epoch": 0.13915478037153342, + "grad_norm": 3.5548200607299805, + "learning_rate": 1.7611768071534604e-05, + "loss": 0.7202, + "step": 1839 + }, + { + "epoch": 0.13923044909386706, + "grad_norm": 2.5746283531188965, + "learning_rate": 1.7611113444743665e-05, + "loss": 0.708, + "step": 1840 + }, + { + "epoch": 0.13930611781620067, + "grad_norm": 2.695033073425293, + "learning_rate": 1.7610458278696955e-05, + "loss": 0.8412, + "step": 1841 + }, + { + "epoch": 0.1393817865385343, + "grad_norm": 2.7120509147644043, + "learning_rate": 1.7609802573435495e-05, + "loss": 0.8491, + "step": 1842 + }, + { + "epoch": 0.13945745526086792, + "grad_norm": 2.5624072551727295, + "learning_rate": 1.7609146329000353e-05, + "loss": 0.8119, + "step": 1843 + }, + { + "epoch": 0.13953312398320156, + "grad_norm": 2.7809038162231445, + "learning_rate": 1.760848954543262e-05, + "loss": 0.7659, + "step": 1844 + }, + { + "epoch": 0.13960879270553517, + "grad_norm": 2.43100905418396, + "learning_rate": 1.760783222277343e-05, + "loss": 0.6677, + "step": 1845 + }, + { + "epoch": 0.13968446142786878, + "grad_norm": 2.619065999984741, + "learning_rate": 1.7607174361063944e-05, + "loss": 0.9192, + "step": 1846 + }, + { + "epoch": 0.13976013015020242, + "grad_norm": 2.997462511062622, + "learning_rate": 1.7606515960345362e-05, + "loss": 0.7037, + "step": 1847 + }, + { + "epoch": 0.13983579887253603, + "grad_norm": 2.8004891872406006, + "learning_rate": 1.7605857020658913e-05, + "loss": 0.6762, + "step": 1848 + }, + { + "epoch": 0.13991146759486967, + "grad_norm": 2.8649933338165283, + "learning_rate": 1.760519754204586e-05, + "loss": 0.6628, + "step": 1849 + }, + { + "epoch": 0.13998713631720328, + "grad_norm": 2.591527223587036, + "learning_rate": 1.760453752454751e-05, + "loss": 0.8484, + "step": 1850 + }, + { + "epoch": 0.14006280503953691, + "grad_norm": 2.9254584312438965, + "learning_rate": 1.7603876968205185e-05, + "loss": 0.9029, + "step": 1851 + }, + { + "epoch": 0.14013847376187052, + "grad_norm": 2.7631466388702393, + "learning_rate": 1.7603215873060256e-05, + "loss": 0.8673, + "step": 1852 + }, + { + "epoch": 0.14021414248420416, + "grad_norm": 2.2092506885528564, + "learning_rate": 1.7602554239154126e-05, + "loss": 0.7803, + "step": 1853 + }, + { + "epoch": 0.14028981120653777, + "grad_norm": 5.1182026863098145, + "learning_rate": 1.7601892066528224e-05, + "loss": 0.7412, + "step": 1854 + }, + { + "epoch": 0.1403654799288714, + "grad_norm": 2.7302863597869873, + "learning_rate": 1.7601229355224018e-05, + "loss": 0.7575, + "step": 1855 + }, + { + "epoch": 0.14044114865120502, + "grad_norm": 2.1949663162231445, + "learning_rate": 1.7600566105283013e-05, + "loss": 0.5413, + "step": 1856 + }, + { + "epoch": 0.14051681737353866, + "grad_norm": 2.7179486751556396, + "learning_rate": 1.7599902316746737e-05, + "loss": 0.7399, + "step": 1857 + }, + { + "epoch": 0.14059248609587227, + "grad_norm": 2.4774746894836426, + "learning_rate": 1.7599237989656765e-05, + "loss": 0.7259, + "step": 1858 + }, + { + "epoch": 0.14066815481820588, + "grad_norm": 2.5634522438049316, + "learning_rate": 1.7598573124054694e-05, + "loss": 0.7805, + "step": 1859 + }, + { + "epoch": 0.14074382354053952, + "grad_norm": 2.349278450012207, + "learning_rate": 1.7597907719982165e-05, + "loss": 0.8274, + "step": 1860 + }, + { + "epoch": 0.14081949226287313, + "grad_norm": 2.56119441986084, + "learning_rate": 1.7597241777480846e-05, + "loss": 1.0099, + "step": 1861 + }, + { + "epoch": 0.14089516098520677, + "grad_norm": 
2.2470288276672363, + "learning_rate": 1.759657529659244e-05, + "loss": 0.7354, + "step": 1862 + }, + { + "epoch": 0.14097082970754038, + "grad_norm": 2.7131481170654297, + "learning_rate": 1.7595908277358683e-05, + "loss": 0.8058, + "step": 1863 + }, + { + "epoch": 0.14104649842987402, + "grad_norm": 2.9659440517425537, + "learning_rate": 1.7595240719821348e-05, + "loss": 0.6039, + "step": 1864 + }, + { + "epoch": 0.14112216715220763, + "grad_norm": 2.83231782913208, + "learning_rate": 1.7594572624022236e-05, + "loss": 1.0244, + "step": 1865 + }, + { + "epoch": 0.14119783587454127, + "grad_norm": 2.453878402709961, + "learning_rate": 1.7593903990003194e-05, + "loss": 0.9862, + "step": 1866 + }, + { + "epoch": 0.14127350459687488, + "grad_norm": 2.553098678588867, + "learning_rate": 1.7593234817806085e-05, + "loss": 0.8229, + "step": 1867 + }, + { + "epoch": 0.14134917331920852, + "grad_norm": 2.8726489543914795, + "learning_rate": 1.7592565107472817e-05, + "loss": 0.8612, + "step": 1868 + }, + { + "epoch": 0.14142484204154213, + "grad_norm": 2.7454302310943604, + "learning_rate": 1.759189485904533e-05, + "loss": 0.912, + "step": 1869 + }, + { + "epoch": 0.14150051076387576, + "grad_norm": 2.6439144611358643, + "learning_rate": 1.7591224072565598e-05, + "loss": 0.6824, + "step": 1870 + }, + { + "epoch": 0.14157617948620937, + "grad_norm": 3.527799367904663, + "learning_rate": 1.7590552748075626e-05, + "loss": 0.9554, + "step": 1871 + }, + { + "epoch": 0.14165184820854299, + "grad_norm": 3.4755446910858154, + "learning_rate": 1.7589880885617457e-05, + "loss": 0.7975, + "step": 1872 + }, + { + "epoch": 0.14172751693087662, + "grad_norm": 2.0089948177337646, + "learning_rate": 1.7589208485233164e-05, + "loss": 0.7643, + "step": 1873 + }, + { + "epoch": 0.14180318565321023, + "grad_norm": 2.9593629837036133, + "learning_rate": 1.7588535546964853e-05, + "loss": 0.8146, + "step": 1874 + }, + { + "epoch": 0.14187885437554387, + "grad_norm": 2.3689467906951904, + "learning_rate": 1.758786207085467e-05, + "loss": 0.9881, + "step": 1875 + }, + { + "epoch": 0.14195452309787748, + "grad_norm": 2.658514976501465, + "learning_rate": 1.758718805694479e-05, + "loss": 0.5528, + "step": 1876 + }, + { + "epoch": 0.14203019182021112, + "grad_norm": 2.965433359146118, + "learning_rate": 1.7586513505277414e-05, + "loss": 0.6984, + "step": 1877 + }, + { + "epoch": 0.14210586054254473, + "grad_norm": 3.3098106384277344, + "learning_rate": 1.758583841589479e-05, + "loss": 0.9566, + "step": 1878 + }, + { + "epoch": 0.14218152926487837, + "grad_norm": 2.4268958568573, + "learning_rate": 1.7585162788839197e-05, + "loss": 0.8622, + "step": 1879 + }, + { + "epoch": 0.14225719798721198, + "grad_norm": 3.331698417663574, + "learning_rate": 1.7584486624152943e-05, + "loss": 0.8862, + "step": 1880 + }, + { + "epoch": 0.14233286670954562, + "grad_norm": 2.746612787246704, + "learning_rate": 1.758380992187837e-05, + "loss": 0.8097, + "step": 1881 + }, + { + "epoch": 0.14240853543187923, + "grad_norm": 2.4068593978881836, + "learning_rate": 1.7583132682057857e-05, + "loss": 0.8202, + "step": 1882 + }, + { + "epoch": 0.14248420415421287, + "grad_norm": 2.4909160137176514, + "learning_rate": 1.7582454904733815e-05, + "loss": 0.7272, + "step": 1883 + }, + { + "epoch": 0.14255987287654648, + "grad_norm": 2.3360512256622314, + "learning_rate": 1.7581776589948686e-05, + "loss": 0.7754, + "step": 1884 + }, + { + "epoch": 0.14263554159888012, + "grad_norm": 2.4334325790405273, + "learning_rate": 1.758109773774495e-05, + "loss": 
0.7806, + "step": 1885 + }, + { + "epoch": 0.14271121032121373, + "grad_norm": 3.528743028640747, + "learning_rate": 1.758041834816512e-05, + "loss": 0.7452, + "step": 1886 + }, + { + "epoch": 0.14278687904354734, + "grad_norm": 2.863650321960449, + "learning_rate": 1.757973842125174e-05, + "loss": 0.908, + "step": 1887 + }, + { + "epoch": 0.14286254776588098, + "grad_norm": 2.7504138946533203, + "learning_rate": 1.757905795704739e-05, + "loss": 0.8345, + "step": 1888 + }, + { + "epoch": 0.1429382164882146, + "grad_norm": 2.4951789379119873, + "learning_rate": 1.7578376955594682e-05, + "loss": 0.7721, + "step": 1889 + }, + { + "epoch": 0.14301388521054823, + "grad_norm": 2.6636242866516113, + "learning_rate": 1.7577695416936263e-05, + "loss": 0.9099, + "step": 1890 + }, + { + "epoch": 0.14308955393288184, + "grad_norm": 2.8234310150146484, + "learning_rate": 1.7577013341114815e-05, + "loss": 1.0285, + "step": 1891 + }, + { + "epoch": 0.14316522265521547, + "grad_norm": 3.015465259552002, + "learning_rate": 1.7576330728173047e-05, + "loss": 0.6027, + "step": 1892 + }, + { + "epoch": 0.14324089137754908, + "grad_norm": 2.617048740386963, + "learning_rate": 1.7575647578153716e-05, + "loss": 0.9196, + "step": 1893 + }, + { + "epoch": 0.14331656009988272, + "grad_norm": 3.0418808460235596, + "learning_rate": 1.757496389109959e-05, + "loss": 0.68, + "step": 1894 + }, + { + "epoch": 0.14339222882221633, + "grad_norm": 2.8369641304016113, + "learning_rate": 1.7574279667053494e-05, + "loss": 0.8018, + "step": 1895 + }, + { + "epoch": 0.14346789754454997, + "grad_norm": 2.903010129928589, + "learning_rate": 1.7573594906058273e-05, + "loss": 0.7899, + "step": 1896 + }, + { + "epoch": 0.14354356626688358, + "grad_norm": 3.224677324295044, + "learning_rate": 1.7572909608156805e-05, + "loss": 0.8495, + "step": 1897 + }, + { + "epoch": 0.14361923498921722, + "grad_norm": 3.112607955932617, + "learning_rate": 1.7572223773392012e-05, + "loss": 0.7727, + "step": 1898 + }, + { + "epoch": 0.14369490371155083, + "grad_norm": 2.8533096313476562, + "learning_rate": 1.757153740180684e-05, + "loss": 0.7837, + "step": 1899 + }, + { + "epoch": 0.14377057243388444, + "grad_norm": 7.023125648498535, + "learning_rate": 1.7570850493444273e-05, + "loss": 0.7611, + "step": 1900 + }, + { + "epoch": 0.14384624115621808, + "grad_norm": 3.051293134689331, + "learning_rate": 1.7570163048347325e-05, + "loss": 0.8792, + "step": 1901 + }, + { + "epoch": 0.1439219098785517, + "grad_norm": 2.489708185195923, + "learning_rate": 1.7569475066559046e-05, + "loss": 0.7576, + "step": 1902 + }, + { + "epoch": 0.14399757860088533, + "grad_norm": 3.0184719562530518, + "learning_rate": 1.7568786548122527e-05, + "loss": 0.704, + "step": 1903 + }, + { + "epoch": 0.14407324732321894, + "grad_norm": 2.3036086559295654, + "learning_rate": 1.7568097493080874e-05, + "loss": 0.622, + "step": 1904 + }, + { + "epoch": 0.14414891604555258, + "grad_norm": 2.6877403259277344, + "learning_rate": 1.7567407901477243e-05, + "loss": 0.6003, + "step": 1905 + }, + { + "epoch": 0.1442245847678862, + "grad_norm": 2.290517568588257, + "learning_rate": 1.7566717773354822e-05, + "loss": 0.7039, + "step": 1906 + }, + { + "epoch": 0.14430025349021983, + "grad_norm": 3.5468292236328125, + "learning_rate": 1.7566027108756826e-05, + "loss": 0.9343, + "step": 1907 + }, + { + "epoch": 0.14437592221255344, + "grad_norm": 2.674797296524048, + "learning_rate": 1.7565335907726505e-05, + "loss": 0.7771, + "step": 1908 + }, + { + "epoch": 0.14445159093488708, + "grad_norm": 
2.3369529247283936, + "learning_rate": 1.7564644170307146e-05, + "loss": 0.7907, + "step": 1909 + }, + { + "epoch": 0.14452725965722069, + "grad_norm": 2.8003242015838623, + "learning_rate": 1.756395189654207e-05, + "loss": 0.9282, + "step": 1910 + }, + { + "epoch": 0.14460292837955432, + "grad_norm": 3.076770305633545, + "learning_rate": 1.7563259086474627e-05, + "loss": 0.8343, + "step": 1911 + }, + { + "epoch": 0.14467859710188793, + "grad_norm": 2.4851486682891846, + "learning_rate": 1.7562565740148202e-05, + "loss": 0.813, + "step": 1912 + }, + { + "epoch": 0.14475426582422155, + "grad_norm": 2.4978880882263184, + "learning_rate": 1.756187185760621e-05, + "loss": 0.7405, + "step": 1913 + }, + { + "epoch": 0.14482993454655518, + "grad_norm": 2.255244255065918, + "learning_rate": 1.7561177438892118e-05, + "loss": 0.6685, + "step": 1914 + }, + { + "epoch": 0.1449056032688888, + "grad_norm": 2.5343151092529297, + "learning_rate": 1.7560482484049402e-05, + "loss": 0.8281, + "step": 1915 + }, + { + "epoch": 0.14498127199122243, + "grad_norm": 2.053952693939209, + "learning_rate": 1.7559786993121583e-05, + "loss": 0.6369, + "step": 1916 + }, + { + "epoch": 0.14505694071355604, + "grad_norm": 2.417632818222046, + "learning_rate": 1.755909096615222e-05, + "loss": 0.8845, + "step": 1917 + }, + { + "epoch": 0.14513260943588968, + "grad_norm": 2.182724714279175, + "learning_rate": 1.7558394403184892e-05, + "loss": 0.7699, + "step": 1918 + }, + { + "epoch": 0.1452082781582233, + "grad_norm": 2.5295472145080566, + "learning_rate": 1.755769730426323e-05, + "loss": 0.8116, + "step": 1919 + }, + { + "epoch": 0.14528394688055693, + "grad_norm": 2.3989028930664062, + "learning_rate": 1.7556999669430882e-05, + "loss": 0.7237, + "step": 1920 + }, + { + "epoch": 0.14535961560289054, + "grad_norm": 2.519035816192627, + "learning_rate": 1.755630149873154e-05, + "loss": 0.9275, + "step": 1921 + }, + { + "epoch": 0.14543528432522418, + "grad_norm": 2.8320114612579346, + "learning_rate": 1.755560279220892e-05, + "loss": 0.8483, + "step": 1922 + }, + { + "epoch": 0.1455109530475578, + "grad_norm": 2.433471202850342, + "learning_rate": 1.755490354990678e-05, + "loss": 0.7238, + "step": 1923 + }, + { + "epoch": 0.14558662176989143, + "grad_norm": 2.112417697906494, + "learning_rate": 1.7554203771868918e-05, + "loss": 0.8088, + "step": 1924 + }, + { + "epoch": 0.14566229049222504, + "grad_norm": 3.250727891921997, + "learning_rate": 1.755350345813914e-05, + "loss": 0.7521, + "step": 1925 + }, + { + "epoch": 0.14573795921455865, + "grad_norm": 2.6410152912139893, + "learning_rate": 1.7552802608761317e-05, + "loss": 0.8002, + "step": 1926 + }, + { + "epoch": 0.1458136279368923, + "grad_norm": 2.4262301921844482, + "learning_rate": 1.7552101223779325e-05, + "loss": 0.7778, + "step": 1927 + }, + { + "epoch": 0.1458892966592259, + "grad_norm": 2.871870517730713, + "learning_rate": 1.7551399303237097e-05, + "loss": 0.8634, + "step": 1928 + }, + { + "epoch": 0.14596496538155954, + "grad_norm": 2.4270362854003906, + "learning_rate": 1.7550696847178586e-05, + "loss": 0.8465, + "step": 1929 + }, + { + "epoch": 0.14604063410389315, + "grad_norm": 2.422614097595215, + "learning_rate": 1.7549993855647778e-05, + "loss": 0.7685, + "step": 1930 + }, + { + "epoch": 0.14611630282622678, + "grad_norm": 2.4596493244171143, + "learning_rate": 1.7549290328688707e-05, + "loss": 0.8147, + "step": 1931 + }, + { + "epoch": 0.1461919715485604, + "grad_norm": 2.994337558746338, + "learning_rate": 1.754858626634542e-05, + "loss": 0.7431, + 
"step": 1932 + }, + { + "epoch": 0.14626764027089403, + "grad_norm": 3.324469804763794, + "learning_rate": 1.754788166866201e-05, + "loss": 0.9868, + "step": 1933 + }, + { + "epoch": 0.14634330899322764, + "grad_norm": 2.912229061126709, + "learning_rate": 1.7547176535682607e-05, + "loss": 0.8108, + "step": 1934 + }, + { + "epoch": 0.14641897771556128, + "grad_norm": 3.092162847518921, + "learning_rate": 1.754647086745136e-05, + "loss": 0.7094, + "step": 1935 + }, + { + "epoch": 0.1464946464378949, + "grad_norm": 2.593982219696045, + "learning_rate": 1.754576466401247e-05, + "loss": 0.7324, + "step": 1936 + }, + { + "epoch": 0.14657031516022853, + "grad_norm": 2.84027361869812, + "learning_rate": 1.7545057925410154e-05, + "loss": 0.707, + "step": 1937 + }, + { + "epoch": 0.14664598388256214, + "grad_norm": 2.8210887908935547, + "learning_rate": 1.754435065168867e-05, + "loss": 0.8568, + "step": 1938 + }, + { + "epoch": 0.14672165260489575, + "grad_norm": 2.999987840652466, + "learning_rate": 1.754364284289232e-05, + "loss": 0.768, + "step": 1939 + }, + { + "epoch": 0.1467973213272294, + "grad_norm": 3.40769362449646, + "learning_rate": 1.7542934499065413e-05, + "loss": 0.896, + "step": 1940 + }, + { + "epoch": 0.146872990049563, + "grad_norm": 2.4607155323028564, + "learning_rate": 1.7542225620252318e-05, + "loss": 0.7203, + "step": 1941 + }, + { + "epoch": 0.14694865877189664, + "grad_norm": 2.867579460144043, + "learning_rate": 1.754151620649743e-05, + "loss": 0.7944, + "step": 1942 + }, + { + "epoch": 0.14702432749423025, + "grad_norm": 2.73762583732605, + "learning_rate": 1.7540806257845167e-05, + "loss": 0.8689, + "step": 1943 + }, + { + "epoch": 0.1470999962165639, + "grad_norm": 2.8091366291046143, + "learning_rate": 1.7540095774339995e-05, + "loss": 0.85, + "step": 1944 + }, + { + "epoch": 0.1471756649388975, + "grad_norm": 2.514357089996338, + "learning_rate": 1.75393847560264e-05, + "loss": 0.9253, + "step": 1945 + }, + { + "epoch": 0.14725133366123114, + "grad_norm": 2.452388286590576, + "learning_rate": 1.7538673202948913e-05, + "loss": 0.5905, + "step": 1946 + }, + { + "epoch": 0.14732700238356475, + "grad_norm": 2.8314881324768066, + "learning_rate": 1.7537961115152093e-05, + "loss": 0.7482, + "step": 1947 + }, + { + "epoch": 0.1474026711058984, + "grad_norm": 3.156912088394165, + "learning_rate": 1.7537248492680532e-05, + "loss": 0.7742, + "step": 1948 + }, + { + "epoch": 0.147478339828232, + "grad_norm": 2.562253713607788, + "learning_rate": 1.7536535335578858e-05, + "loss": 0.8122, + "step": 1949 + }, + { + "epoch": 0.14755400855056564, + "grad_norm": 2.4131200313568115, + "learning_rate": 1.7535821643891732e-05, + "loss": 0.8995, + "step": 1950 + }, + { + "epoch": 0.14762967727289925, + "grad_norm": 2.465721368789673, + "learning_rate": 1.7535107417663845e-05, + "loss": 0.7528, + "step": 1951 + }, + { + "epoch": 0.14770534599523288, + "grad_norm": 2.287510633468628, + "learning_rate": 1.7534392656939927e-05, + "loss": 0.7947, + "step": 1952 + }, + { + "epoch": 0.1477810147175665, + "grad_norm": 2.6098928451538086, + "learning_rate": 1.7533677361764738e-05, + "loss": 0.8785, + "step": 1953 + }, + { + "epoch": 0.1478566834399001, + "grad_norm": 2.5681772232055664, + "learning_rate": 1.7532961532183065e-05, + "loss": 0.7377, + "step": 1954 + }, + { + "epoch": 0.14793235216223374, + "grad_norm": 2.2554850578308105, + "learning_rate": 1.753224516823975e-05, + "loss": 0.7856, + "step": 1955 + }, + { + "epoch": 0.14800802088456735, + "grad_norm": 2.409834146499634, + 
"learning_rate": 1.7531528269979642e-05, + "loss": 0.7473, + "step": 1956 + }, + { + "epoch": 0.148083689606901, + "grad_norm": 4.173964023590088, + "learning_rate": 1.753081083744764e-05, + "loss": 0.9151, + "step": 1957 + }, + { + "epoch": 0.1481593583292346, + "grad_norm": 3.092050790786743, + "learning_rate": 1.753009287068867e-05, + "loss": 0.8214, + "step": 1958 + }, + { + "epoch": 0.14823502705156824, + "grad_norm": 2.639125108718872, + "learning_rate": 1.7529374369747697e-05, + "loss": 0.6853, + "step": 1959 + }, + { + "epoch": 0.14831069577390185, + "grad_norm": 2.0184338092803955, + "learning_rate": 1.7528655334669715e-05, + "loss": 0.7324, + "step": 1960 + }, + { + "epoch": 0.1483863644962355, + "grad_norm": 2.531182289123535, + "learning_rate": 1.7527935765499746e-05, + "loss": 0.8407, + "step": 1961 + }, + { + "epoch": 0.1484620332185691, + "grad_norm": 2.4710707664489746, + "learning_rate": 1.7527215662282862e-05, + "loss": 0.7486, + "step": 1962 + }, + { + "epoch": 0.14853770194090274, + "grad_norm": 2.496640920639038, + "learning_rate": 1.7526495025064147e-05, + "loss": 0.7771, + "step": 1963 + }, + { + "epoch": 0.14861337066323635, + "grad_norm": 2.6928977966308594, + "learning_rate": 1.752577385388874e-05, + "loss": 0.7551, + "step": 1964 + }, + { + "epoch": 0.14868903938557, + "grad_norm": 3.124969720840454, + "learning_rate": 1.75250521488018e-05, + "loss": 0.8272, + "step": 1965 + }, + { + "epoch": 0.1487647081079036, + "grad_norm": 2.917346477508545, + "learning_rate": 1.7524329909848514e-05, + "loss": 0.8293, + "step": 1966 + }, + { + "epoch": 0.1488403768302372, + "grad_norm": 4.432890892028809, + "learning_rate": 1.7523607137074124e-05, + "loss": 0.6315, + "step": 1967 + }, + { + "epoch": 0.14891604555257085, + "grad_norm": 2.767793893814087, + "learning_rate": 1.7522883830523887e-05, + "loss": 0.8295, + "step": 1968 + }, + { + "epoch": 0.14899171427490446, + "grad_norm": 2.901761531829834, + "learning_rate": 1.7522159990243096e-05, + "loss": 0.8094, + "step": 1969 + }, + { + "epoch": 0.1490673829972381, + "grad_norm": 3.11002254486084, + "learning_rate": 1.7521435616277083e-05, + "loss": 0.801, + "step": 1970 + }, + { + "epoch": 0.1491430517195717, + "grad_norm": 2.691927433013916, + "learning_rate": 1.7520710708671207e-05, + "loss": 0.8218, + "step": 1971 + }, + { + "epoch": 0.14921872044190534, + "grad_norm": 2.540382146835327, + "learning_rate": 1.751998526747087e-05, + "loss": 0.8464, + "step": 1972 + }, + { + "epoch": 0.14929438916423896, + "grad_norm": 2.418942451477051, + "learning_rate": 1.75192592927215e-05, + "loss": 0.7234, + "step": 1973 + }, + { + "epoch": 0.1493700578865726, + "grad_norm": 3.039180278778076, + "learning_rate": 1.7518532784468555e-05, + "loss": 0.7202, + "step": 1974 + }, + { + "epoch": 0.1494457266089062, + "grad_norm": 3.523127555847168, + "learning_rate": 1.7517805742757537e-05, + "loss": 0.83, + "step": 1975 + }, + { + "epoch": 0.14952139533123984, + "grad_norm": 2.5864298343658447, + "learning_rate": 1.751707816763397e-05, + "loss": 0.9138, + "step": 1976 + }, + { + "epoch": 0.14959706405357345, + "grad_norm": 2.7911880016326904, + "learning_rate": 1.7516350059143425e-05, + "loss": 0.7544, + "step": 1977 + }, + { + "epoch": 0.1496727327759071, + "grad_norm": 3.0859878063201904, + "learning_rate": 1.7515621417331493e-05, + "loss": 0.77, + "step": 1978 + }, + { + "epoch": 0.1497484014982407, + "grad_norm": 2.647671699523926, + "learning_rate": 1.7514892242243805e-05, + "loss": 0.842, + "step": 1979 + }, + { + "epoch": 
0.1498240702205743, + "grad_norm": 2.811793804168701, + "learning_rate": 1.7514162533926024e-05, + "loss": 0.8385, + "step": 1980 + }, + { + "epoch": 0.14989973894290795, + "grad_norm": 2.701957941055298, + "learning_rate": 1.7513432292423846e-05, + "loss": 0.726, + "step": 1981 + }, + { + "epoch": 0.14997540766524156, + "grad_norm": 2.3943700790405273, + "learning_rate": 1.7512701517783006e-05, + "loss": 0.8246, + "step": 1982 + }, + { + "epoch": 0.1500510763875752, + "grad_norm": 2.8560993671417236, + "learning_rate": 1.751197021004926e-05, + "loss": 0.8316, + "step": 1983 + }, + { + "epoch": 0.1501267451099088, + "grad_norm": 2.4664223194122314, + "learning_rate": 1.7511238369268408e-05, + "loss": 0.8168, + "step": 1984 + }, + { + "epoch": 0.15020241383224245, + "grad_norm": 2.2594258785247803, + "learning_rate": 1.7510505995486278e-05, + "loss": 0.6974, + "step": 1985 + }, + { + "epoch": 0.15027808255457606, + "grad_norm": 2.1898341178894043, + "learning_rate": 1.7509773088748744e-05, + "loss": 0.7319, + "step": 1986 + }, + { + "epoch": 0.1503537512769097, + "grad_norm": 2.4452483654022217, + "learning_rate": 1.7509039649101688e-05, + "loss": 0.9508, + "step": 1987 + }, + { + "epoch": 0.1504294199992433, + "grad_norm": 2.685222864151001, + "learning_rate": 1.750830567659105e-05, + "loss": 0.6802, + "step": 1988 + }, + { + "epoch": 0.15050508872157695, + "grad_norm": 2.3486924171447754, + "learning_rate": 1.7507571171262793e-05, + "loss": 0.745, + "step": 1989 + }, + { + "epoch": 0.15058075744391056, + "grad_norm": 2.269319534301758, + "learning_rate": 1.7506836133162912e-05, + "loss": 0.621, + "step": 1990 + }, + { + "epoch": 0.1506564261662442, + "grad_norm": 2.322559118270874, + "learning_rate": 1.7506100562337433e-05, + "loss": 0.706, + "step": 1991 + }, + { + "epoch": 0.1507320948885778, + "grad_norm": 2.2499871253967285, + "learning_rate": 1.7505364458832433e-05, + "loss": 0.8762, + "step": 1992 + }, + { + "epoch": 0.15080776361091142, + "grad_norm": 2.313094139099121, + "learning_rate": 1.7504627822693997e-05, + "loss": 0.8429, + "step": 1993 + }, + { + "epoch": 0.15088343233324505, + "grad_norm": 2.01436710357666, + "learning_rate": 1.750389065396826e-05, + "loss": 0.7233, + "step": 1994 + }, + { + "epoch": 0.15095910105557866, + "grad_norm": 3.0272583961486816, + "learning_rate": 1.7503152952701382e-05, + "loss": 0.7201, + "step": 1995 + }, + { + "epoch": 0.1510347697779123, + "grad_norm": 2.1522340774536133, + "learning_rate": 1.7502414718939565e-05, + "loss": 0.6485, + "step": 1996 + }, + { + "epoch": 0.1511104385002459, + "grad_norm": 2.177858829498291, + "learning_rate": 1.750167595272904e-05, + "loss": 0.7589, + "step": 1997 + }, + { + "epoch": 0.15118610722257955, + "grad_norm": 2.149120569229126, + "learning_rate": 1.750093665411607e-05, + "loss": 0.6136, + "step": 1998 + }, + { + "epoch": 0.15126177594491316, + "grad_norm": 2.5754570960998535, + "learning_rate": 1.7500196823146948e-05, + "loss": 0.6516, + "step": 1999 + }, + { + "epoch": 0.1513374446672468, + "grad_norm": 2.3540103435516357, + "learning_rate": 1.749945645986801e-05, + "loss": 0.7556, + "step": 2000 + }, + { + "epoch": 0.1514131133895804, + "grad_norm": 2.42387318611145, + "learning_rate": 1.7498715564325618e-05, + "loss": 0.7252, + "step": 2001 + }, + { + "epoch": 0.15148878211191405, + "grad_norm": 2.679372549057007, + "learning_rate": 1.749797413656617e-05, + "loss": 0.7455, + "step": 2002 + }, + { + "epoch": 0.15156445083424766, + "grad_norm": 2.259680986404419, + "learning_rate": 
1.7497232176636094e-05, + "loss": 0.7875, + "step": 2003 + }, + { + "epoch": 0.1516401195565813, + "grad_norm": 2.2387616634368896, + "learning_rate": 1.7496489684581854e-05, + "loss": 0.7815, + "step": 2004 + }, + { + "epoch": 0.1517157882789149, + "grad_norm": 3.6523263454437256, + "learning_rate": 1.7495746660449954e-05, + "loss": 0.626, + "step": 2005 + }, + { + "epoch": 0.15179145700124855, + "grad_norm": 2.6934144496917725, + "learning_rate": 1.7495003104286916e-05, + "loss": 0.7533, + "step": 2006 + }, + { + "epoch": 0.15186712572358216, + "grad_norm": 2.329314947128296, + "learning_rate": 1.749425901613931e-05, + "loss": 0.7858, + "step": 2007 + }, + { + "epoch": 0.15194279444591577, + "grad_norm": 3.2107553482055664, + "learning_rate": 1.7493514396053727e-05, + "loss": 0.7217, + "step": 2008 + }, + { + "epoch": 0.1520184631682494, + "grad_norm": 2.6318206787109375, + "learning_rate": 1.7492769244076804e-05, + "loss": 0.7701, + "step": 2009 + }, + { + "epoch": 0.15209413189058302, + "grad_norm": 2.870945453643799, + "learning_rate": 1.7492023560255202e-05, + "loss": 0.8018, + "step": 2010 + }, + { + "epoch": 0.15216980061291666, + "grad_norm": 2.799070119857788, + "learning_rate": 1.7491277344635616e-05, + "loss": 0.7906, + "step": 2011 + }, + { + "epoch": 0.15224546933525027, + "grad_norm": 3.5722224712371826, + "learning_rate": 1.7490530597264778e-05, + "loss": 0.7233, + "step": 2012 + }, + { + "epoch": 0.1523211380575839, + "grad_norm": 2.482611894607544, + "learning_rate": 1.7489783318189455e-05, + "loss": 0.7542, + "step": 2013 + }, + { + "epoch": 0.15239680677991752, + "grad_norm": 2.59255051612854, + "learning_rate": 1.748903550745644e-05, + "loss": 0.7671, + "step": 2014 + }, + { + "epoch": 0.15247247550225115, + "grad_norm": 3.2076199054718018, + "learning_rate": 1.7488287165112564e-05, + "loss": 0.8936, + "step": 2015 + }, + { + "epoch": 0.15254814422458476, + "grad_norm": 3.4623420238494873, + "learning_rate": 1.748753829120469e-05, + "loss": 0.7874, + "step": 2016 + }, + { + "epoch": 0.1526238129469184, + "grad_norm": 2.2293150424957275, + "learning_rate": 1.748678888577972e-05, + "loss": 0.8013, + "step": 2017 + }, + { + "epoch": 0.152699481669252, + "grad_norm": 2.1837499141693115, + "learning_rate": 1.748603894888458e-05, + "loss": 0.8155, + "step": 2018 + }, + { + "epoch": 0.15277515039158565, + "grad_norm": 2.574540138244629, + "learning_rate": 1.748528848056623e-05, + "loss": 0.6882, + "step": 2019 + }, + { + "epoch": 0.15285081911391926, + "grad_norm": 2.3444480895996094, + "learning_rate": 1.7484537480871676e-05, + "loss": 0.9241, + "step": 2020 + }, + { + "epoch": 0.15292648783625287, + "grad_norm": 2.1523497104644775, + "learning_rate": 1.7483785949847937e-05, + "loss": 0.7816, + "step": 2021 + }, + { + "epoch": 0.1530021565585865, + "grad_norm": 2.120872735977173, + "learning_rate": 1.7483033887542087e-05, + "loss": 0.7581, + "step": 2022 + }, + { + "epoch": 0.15307782528092012, + "grad_norm": 2.379638910293579, + "learning_rate": 1.7482281294001218e-05, + "loss": 0.7142, + "step": 2023 + }, + { + "epoch": 0.15315349400325376, + "grad_norm": 1.9186440706253052, + "learning_rate": 1.7481528169272455e-05, + "loss": 0.6981, + "step": 2024 + }, + { + "epoch": 0.15322916272558737, + "grad_norm": 2.0575826168060303, + "learning_rate": 1.7480774513402966e-05, + "loss": 0.6741, + "step": 2025 + }, + { + "epoch": 0.153304831447921, + "grad_norm": 2.492513656616211, + "learning_rate": 1.7480020326439945e-05, + "loss": 0.845, + "step": 2026 + }, + { + "epoch": 
0.15338050017025462, + "grad_norm": 2.5875332355499268, + "learning_rate": 1.7479265608430632e-05, + "loss": 0.7312, + "step": 2027 + }, + { + "epoch": 0.15345616889258826, + "grad_norm": 3.0655195713043213, + "learning_rate": 1.7478510359422273e-05, + "loss": 0.8206, + "step": 2028 + }, + { + "epoch": 0.15353183761492187, + "grad_norm": 2.516611337661743, + "learning_rate": 1.7477754579462173e-05, + "loss": 0.6667, + "step": 2029 + }, + { + "epoch": 0.1536075063372555, + "grad_norm": 2.1421144008636475, + "learning_rate": 1.7476998268597665e-05, + "loss": 0.7155, + "step": 2030 + }, + { + "epoch": 0.15368317505958912, + "grad_norm": 2.9190337657928467, + "learning_rate": 1.7476241426876104e-05, + "loss": 0.756, + "step": 2031 + }, + { + "epoch": 0.15375884378192275, + "grad_norm": 3.1374433040618896, + "learning_rate": 1.747548405434489e-05, + "loss": 0.7129, + "step": 2032 + }, + { + "epoch": 0.15383451250425637, + "grad_norm": 3.1990249156951904, + "learning_rate": 1.747472615105145e-05, + "loss": 0.7409, + "step": 2033 + }, + { + "epoch": 0.15391018122658998, + "grad_norm": 2.4455344676971436, + "learning_rate": 1.7473967717043255e-05, + "loss": 0.6613, + "step": 2034 + }, + { + "epoch": 0.15398584994892361, + "grad_norm": 2.024739980697632, + "learning_rate": 1.747320875236779e-05, + "loss": 0.8867, + "step": 2035 + }, + { + "epoch": 0.15406151867125722, + "grad_norm": 2.6114304065704346, + "learning_rate": 1.747244925707258e-05, + "loss": 0.8703, + "step": 2036 + }, + { + "epoch": 0.15413718739359086, + "grad_norm": 3.779912233352661, + "learning_rate": 1.7471689231205206e-05, + "loss": 0.8262, + "step": 2037 + }, + { + "epoch": 0.15421285611592447, + "grad_norm": 3.2516801357269287, + "learning_rate": 1.7470928674813242e-05, + "loss": 0.7587, + "step": 2038 + }, + { + "epoch": 0.1542885248382581, + "grad_norm": 2.9008138179779053, + "learning_rate": 1.7470167587944333e-05, + "loss": 0.7588, + "step": 2039 + }, + { + "epoch": 0.15436419356059172, + "grad_norm": 2.617128610610962, + "learning_rate": 1.7469405970646126e-05, + "loss": 0.6199, + "step": 2040 + }, + { + "epoch": 0.15443986228292536, + "grad_norm": 3.0137505531311035, + "learning_rate": 1.746864382296633e-05, + "loss": 0.7195, + "step": 2041 + }, + { + "epoch": 0.15451553100525897, + "grad_norm": 2.683501720428467, + "learning_rate": 1.7467881144952664e-05, + "loss": 0.8571, + "step": 2042 + }, + { + "epoch": 0.1545911997275926, + "grad_norm": 2.612112283706665, + "learning_rate": 1.7467117936652896e-05, + "loss": 0.8931, + "step": 2043 + }, + { + "epoch": 0.15466686844992622, + "grad_norm": 3.511695384979248, + "learning_rate": 1.7466354198114813e-05, + "loss": 0.7837, + "step": 2044 + }, + { + "epoch": 0.15474253717225986, + "grad_norm": 2.829535961151123, + "learning_rate": 1.7465589929386248e-05, + "loss": 0.8148, + "step": 2045 + }, + { + "epoch": 0.15481820589459347, + "grad_norm": 2.3426201343536377, + "learning_rate": 1.746482513051506e-05, + "loss": 0.6724, + "step": 2046 + }, + { + "epoch": 0.15489387461692708, + "grad_norm": 2.5401344299316406, + "learning_rate": 1.7464059801549144e-05, + "loss": 0.9651, + "step": 2047 + }, + { + "epoch": 0.15496954333926072, + "grad_norm": 2.8630175590515137, + "learning_rate": 1.7463293942536427e-05, + "loss": 0.8498, + "step": 2048 + }, + { + "epoch": 0.15504521206159433, + "grad_norm": 2.4896228313446045, + "learning_rate": 1.746252755352487e-05, + "loss": 0.862, + "step": 2049 + }, + { + "epoch": 0.15512088078392797, + "grad_norm": 2.259605646133423, + "learning_rate": 
1.7461760634562468e-05, + "loss": 0.633, + "step": 2050 + }, + { + "epoch": 0.15519654950626158, + "grad_norm": 2.4651870727539062, + "learning_rate": 1.7460993185697244e-05, + "loss": 0.7007, + "step": 2051 + }, + { + "epoch": 0.15527221822859522, + "grad_norm": 2.3934268951416016, + "learning_rate": 1.7460225206977262e-05, + "loss": 0.9508, + "step": 2052 + }, + { + "epoch": 0.15534788695092883, + "grad_norm": 2.429025650024414, + "learning_rate": 1.7459456698450613e-05, + "loss": 0.6615, + "step": 2053 + }, + { + "epoch": 0.15542355567326246, + "grad_norm": 2.19804310798645, + "learning_rate": 1.7458687660165425e-05, + "loss": 0.8376, + "step": 2054 + }, + { + "epoch": 0.15549922439559608, + "grad_norm": 2.211962938308716, + "learning_rate": 1.7457918092169857e-05, + "loss": 0.8152, + "step": 2055 + }, + { + "epoch": 0.1555748931179297, + "grad_norm": 2.254776954650879, + "learning_rate": 1.74571479945121e-05, + "loss": 0.8516, + "step": 2056 + }, + { + "epoch": 0.15565056184026332, + "grad_norm": 2.47976016998291, + "learning_rate": 1.7456377367240385e-05, + "loss": 0.8315, + "step": 2057 + }, + { + "epoch": 0.15572623056259696, + "grad_norm": 2.8059258460998535, + "learning_rate": 1.7455606210402966e-05, + "loss": 0.7777, + "step": 2058 + }, + { + "epoch": 0.15580189928493057, + "grad_norm": 2.620880603790283, + "learning_rate": 1.7454834524048138e-05, + "loss": 0.6418, + "step": 2059 + }, + { + "epoch": 0.1558775680072642, + "grad_norm": 2.414295196533203, + "learning_rate": 1.7454062308224226e-05, + "loss": 0.7401, + "step": 2060 + }, + { + "epoch": 0.15595323672959782, + "grad_norm": 2.7423794269561768, + "learning_rate": 1.7453289562979585e-05, + "loss": 0.8576, + "step": 2061 + }, + { + "epoch": 0.15602890545193143, + "grad_norm": 3.214839458465576, + "learning_rate": 1.7452516288362612e-05, + "loss": 0.8235, + "step": 2062 + }, + { + "epoch": 0.15610457417426507, + "grad_norm": 2.462529182434082, + "learning_rate": 1.7451742484421733e-05, + "loss": 0.8605, + "step": 2063 + }, + { + "epoch": 0.15618024289659868, + "grad_norm": 2.2474400997161865, + "learning_rate": 1.7450968151205402e-05, + "loss": 0.7083, + "step": 2064 + }, + { + "epoch": 0.15625591161893232, + "grad_norm": 2.528843879699707, + "learning_rate": 1.7450193288762116e-05, + "loss": 0.8239, + "step": 2065 + }, + { + "epoch": 0.15633158034126593, + "grad_norm": 2.497174024581909, + "learning_rate": 1.7449417897140387e-05, + "loss": 0.7607, + "step": 2066 + }, + { + "epoch": 0.15640724906359957, + "grad_norm": 3.976351261138916, + "learning_rate": 1.7448641976388783e-05, + "loss": 0.7265, + "step": 2067 + }, + { + "epoch": 0.15648291778593318, + "grad_norm": 2.027620792388916, + "learning_rate": 1.7447865526555894e-05, + "loss": 0.8558, + "step": 2068 + }, + { + "epoch": 0.15655858650826682, + "grad_norm": 2.824955701828003, + "learning_rate": 1.7447088547690343e-05, + "loss": 0.9394, + "step": 2069 + }, + { + "epoch": 0.15663425523060043, + "grad_norm": 2.474083185195923, + "learning_rate": 1.7446311039840784e-05, + "loss": 0.8471, + "step": 2070 + }, + { + "epoch": 0.15670992395293407, + "grad_norm": 2.226369619369507, + "learning_rate": 1.744553300305591e-05, + "loss": 0.7453, + "step": 2071 + }, + { + "epoch": 0.15678559267526768, + "grad_norm": 2.525721788406372, + "learning_rate": 1.7444754437384443e-05, + "loss": 1.0301, + "step": 2072 + }, + { + "epoch": 0.15686126139760131, + "grad_norm": 2.348961591720581, + "learning_rate": 1.7443975342875138e-05, + "loss": 0.7909, + "step": 2073 + }, + { + "epoch": 
0.15693693011993493, + "grad_norm": 2.763505697250366, + "learning_rate": 1.7443195719576785e-05, + "loss": 0.8576, + "step": 2074 + }, + { + "epoch": 0.15701259884226854, + "grad_norm": 2.2634928226470947, + "learning_rate": 1.7442415567538213e-05, + "loss": 0.9044, + "step": 2075 + }, + { + "epoch": 0.15708826756460217, + "grad_norm": 2.370476722717285, + "learning_rate": 1.7441634886808265e-05, + "loss": 0.8432, + "step": 2076 + }, + { + "epoch": 0.15716393628693578, + "grad_norm": 2.5229246616363525, + "learning_rate": 1.7440853677435842e-05, + "loss": 0.7714, + "step": 2077 + }, + { + "epoch": 0.15723960500926942, + "grad_norm": 2.90761137008667, + "learning_rate": 1.744007193946986e-05, + "loss": 0.6313, + "step": 2078 + }, + { + "epoch": 0.15731527373160303, + "grad_norm": 2.4619617462158203, + "learning_rate": 1.7439289672959275e-05, + "loss": 0.7495, + "step": 2079 + }, + { + "epoch": 0.15739094245393667, + "grad_norm": 3.04870867729187, + "learning_rate": 1.743850687795307e-05, + "loss": 0.8112, + "step": 2080 + }, + { + "epoch": 0.15746661117627028, + "grad_norm": 2.4026286602020264, + "learning_rate": 1.7437723554500277e-05, + "loss": 0.772, + "step": 2081 + }, + { + "epoch": 0.15754227989860392, + "grad_norm": 2.5476691722869873, + "learning_rate": 1.743693970264994e-05, + "loss": 0.8682, + "step": 2082 + }, + { + "epoch": 0.15761794862093753, + "grad_norm": 2.528425931930542, + "learning_rate": 1.7436155322451153e-05, + "loss": 0.9005, + "step": 2083 + }, + { + "epoch": 0.15769361734327117, + "grad_norm": 2.272146224975586, + "learning_rate": 1.743537041395303e-05, + "loss": 0.6577, + "step": 2084 + }, + { + "epoch": 0.15776928606560478, + "grad_norm": 2.186119794845581, + "learning_rate": 1.743458497720473e-05, + "loss": 0.7003, + "step": 2085 + }, + { + "epoch": 0.15784495478793842, + "grad_norm": 2.385634660720825, + "learning_rate": 1.743379901225544e-05, + "loss": 0.8375, + "step": 2086 + }, + { + "epoch": 0.15792062351027203, + "grad_norm": 3.0107641220092773, + "learning_rate": 1.7433012519154378e-05, + "loss": 0.8261, + "step": 2087 + }, + { + "epoch": 0.15799629223260564, + "grad_norm": 2.3825418949127197, + "learning_rate": 1.7432225497950792e-05, + "loss": 0.729, + "step": 2088 + }, + { + "epoch": 0.15807196095493928, + "grad_norm": 2.1834664344787598, + "learning_rate": 1.7431437948693975e-05, + "loss": 0.6568, + "step": 2089 + }, + { + "epoch": 0.1581476296772729, + "grad_norm": 2.4563395977020264, + "learning_rate": 1.7430649871433245e-05, + "loss": 0.7753, + "step": 2090 + }, + { + "epoch": 0.15822329839960653, + "grad_norm": 2.7324671745300293, + "learning_rate": 1.742986126621795e-05, + "loss": 0.7523, + "step": 2091 + }, + { + "epoch": 0.15829896712194014, + "grad_norm": 2.3517651557922363, + "learning_rate": 1.7429072133097478e-05, + "loss": 0.7389, + "step": 2092 + }, + { + "epoch": 0.15837463584427378, + "grad_norm": 2.4391419887542725, + "learning_rate": 1.7428282472121245e-05, + "loss": 0.6748, + "step": 2093 + }, + { + "epoch": 0.1584503045666074, + "grad_norm": 3.3053195476531982, + "learning_rate": 1.7427492283338704e-05, + "loss": 0.7699, + "step": 2094 + }, + { + "epoch": 0.15852597328894102, + "grad_norm": 2.2691550254821777, + "learning_rate": 1.7426701566799337e-05, + "loss": 0.8406, + "step": 2095 + }, + { + "epoch": 0.15860164201127464, + "grad_norm": 2.280519723892212, + "learning_rate": 1.7425910322552666e-05, + "loss": 0.8129, + "step": 2096 + }, + { + "epoch": 0.15867731073360827, + "grad_norm": 2.6433169841766357, + "learning_rate": 
1.7425118550648234e-05, + "loss": 0.7612, + "step": 2097 + }, + { + "epoch": 0.15875297945594188, + "grad_norm": 2.356234550476074, + "learning_rate": 1.742432625113563e-05, + "loss": 0.7845, + "step": 2098 + }, + { + "epoch": 0.15882864817827552, + "grad_norm": 2.7373485565185547, + "learning_rate": 1.742353342406447e-05, + "loss": 0.6441, + "step": 2099 + }, + { + "epoch": 0.15890431690060913, + "grad_norm": 2.2492804527282715, + "learning_rate": 1.7422740069484397e-05, + "loss": 0.6834, + "step": 2100 + }, + { + "epoch": 0.15897998562294274, + "grad_norm": 2.219045639038086, + "learning_rate": 1.7421946187445104e-05, + "loss": 0.7691, + "step": 2101 + }, + { + "epoch": 0.15905565434527638, + "grad_norm": 2.9211206436157227, + "learning_rate": 1.7421151777996297e-05, + "loss": 0.629, + "step": 2102 + }, + { + "epoch": 0.15913132306761, + "grad_norm": 3.131239414215088, + "learning_rate": 1.7420356841187732e-05, + "loss": 0.7825, + "step": 2103 + }, + { + "epoch": 0.15920699178994363, + "grad_norm": 2.3928134441375732, + "learning_rate": 1.7419561377069183e-05, + "loss": 0.8064, + "step": 2104 + }, + { + "epoch": 0.15928266051227724, + "grad_norm": 2.74513840675354, + "learning_rate": 1.741876538569047e-05, + "loss": 0.8204, + "step": 2105 + }, + { + "epoch": 0.15935832923461088, + "grad_norm": 2.3963100910186768, + "learning_rate": 1.741796886710144e-05, + "loss": 1.0466, + "step": 2106 + }, + { + "epoch": 0.1594339979569445, + "grad_norm": 2.196810007095337, + "learning_rate": 1.7417171821351973e-05, + "loss": 0.6747, + "step": 2107 + }, + { + "epoch": 0.15950966667927813, + "grad_norm": 2.6434783935546875, + "learning_rate": 1.741637424849198e-05, + "loss": 0.8187, + "step": 2108 + }, + { + "epoch": 0.15958533540161174, + "grad_norm": 2.148426055908203, + "learning_rate": 1.741557614857141e-05, + "loss": 0.7376, + "step": 2109 + }, + { + "epoch": 0.15966100412394538, + "grad_norm": 3.2016146183013916, + "learning_rate": 1.741477752164024e-05, + "loss": 0.7088, + "step": 2110 + }, + { + "epoch": 0.159736672846279, + "grad_norm": 2.8447532653808594, + "learning_rate": 1.7413978367748488e-05, + "loss": 0.8271, + "step": 2111 + }, + { + "epoch": 0.15981234156861263, + "grad_norm": 2.411562919616699, + "learning_rate": 1.7413178686946198e-05, + "loss": 0.801, + "step": 2112 + }, + { + "epoch": 0.15988801029094624, + "grad_norm": 2.4940292835235596, + "learning_rate": 1.7412378479283445e-05, + "loss": 0.838, + "step": 2113 + }, + { + "epoch": 0.15996367901327985, + "grad_norm": 2.510559320449829, + "learning_rate": 1.7411577744810343e-05, + "loss": 0.7729, + "step": 2114 + }, + { + "epoch": 0.16003934773561349, + "grad_norm": 2.928279399871826, + "learning_rate": 1.7410776483577036e-05, + "loss": 0.9162, + "step": 2115 + }, + { + "epoch": 0.1601150164579471, + "grad_norm": 2.311375617980957, + "learning_rate": 1.7409974695633702e-05, + "loss": 0.7684, + "step": 2116 + }, + { + "epoch": 0.16019068518028073, + "grad_norm": 2.5286309719085693, + "learning_rate": 1.740917238103055e-05, + "loss": 0.7551, + "step": 2117 + }, + { + "epoch": 0.16026635390261434, + "grad_norm": 2.7157998085021973, + "learning_rate": 1.740836953981783e-05, + "loss": 0.9096, + "step": 2118 + }, + { + "epoch": 0.16034202262494798, + "grad_norm": 2.3941192626953125, + "learning_rate": 1.7407566172045808e-05, + "loss": 0.7154, + "step": 2119 + }, + { + "epoch": 0.1604176913472816, + "grad_norm": 3.259812355041504, + "learning_rate": 1.74067622777648e-05, + "loss": 0.7483, + "step": 2120 + }, + { + "epoch": 
0.16049336006961523, + "grad_norm": 2.8680622577667236, + "learning_rate": 1.740595785702515e-05, + "loss": 0.7112, + "step": 2121 + }, + { + "epoch": 0.16056902879194884, + "grad_norm": 2.4445548057556152, + "learning_rate": 1.7405152909877228e-05, + "loss": 0.7903, + "step": 2122 + }, + { + "epoch": 0.16064469751428248, + "grad_norm": 2.763958215713501, + "learning_rate": 1.7404347436371446e-05, + "loss": 0.7796, + "step": 2123 + }, + { + "epoch": 0.1607203662366161, + "grad_norm": 2.0752618312835693, + "learning_rate": 1.7403541436558246e-05, + "loss": 0.7199, + "step": 2124 + }, + { + "epoch": 0.16079603495894973, + "grad_norm": 2.1711764335632324, + "learning_rate": 1.74027349104881e-05, + "loss": 0.6946, + "step": 2125 + }, + { + "epoch": 0.16087170368128334, + "grad_norm": 2.151015043258667, + "learning_rate": 1.7401927858211516e-05, + "loss": 0.7603, + "step": 2126 + }, + { + "epoch": 0.16094737240361698, + "grad_norm": 4.292178153991699, + "learning_rate": 1.7401120279779035e-05, + "loss": 0.7286, + "step": 2127 + }, + { + "epoch": 0.1610230411259506, + "grad_norm": 4.497610569000244, + "learning_rate": 1.7400312175241226e-05, + "loss": 0.8232, + "step": 2128 + }, + { + "epoch": 0.1610987098482842, + "grad_norm": 2.505603551864624, + "learning_rate": 1.73995035446487e-05, + "loss": 0.7859, + "step": 2129 + }, + { + "epoch": 0.16117437857061784, + "grad_norm": 2.1637277603149414, + "learning_rate": 1.73986943880521e-05, + "loss": 0.7861, + "step": 2130 + }, + { + "epoch": 0.16125004729295145, + "grad_norm": 3.3411808013916016, + "learning_rate": 1.7397884705502088e-05, + "loss": 0.8564, + "step": 2131 + }, + { + "epoch": 0.1613257160152851, + "grad_norm": 2.2842612266540527, + "learning_rate": 1.7397074497049378e-05, + "loss": 0.799, + "step": 2132 + }, + { + "epoch": 0.1614013847376187, + "grad_norm": 3.669283628463745, + "learning_rate": 1.73962637627447e-05, + "loss": 0.7311, + "step": 2133 + }, + { + "epoch": 0.16147705345995234, + "grad_norm": 2.489490032196045, + "learning_rate": 1.7395452502638826e-05, + "loss": 0.8819, + "step": 2134 + }, + { + "epoch": 0.16155272218228595, + "grad_norm": 2.164292097091675, + "learning_rate": 1.7394640716782564e-05, + "loss": 0.8138, + "step": 2135 + }, + { + "epoch": 0.16162839090461958, + "grad_norm": 2.452188014984131, + "learning_rate": 1.739382840522675e-05, + "loss": 0.7811, + "step": 2136 + }, + { + "epoch": 0.1617040596269532, + "grad_norm": 2.700749397277832, + "learning_rate": 1.739301556802225e-05, + "loss": 0.7128, + "step": 2137 + }, + { + "epoch": 0.16177972834928683, + "grad_norm": 3.382140874862671, + "learning_rate": 1.7392202205219974e-05, + "loss": 0.9743, + "step": 2138 + }, + { + "epoch": 0.16185539707162044, + "grad_norm": 2.4139597415924072, + "learning_rate": 1.739138831687085e-05, + "loss": 0.7896, + "step": 2139 + }, + { + "epoch": 0.16193106579395408, + "grad_norm": 2.2004284858703613, + "learning_rate": 1.7390573903025845e-05, + "loss": 0.7469, + "step": 2140 + }, + { + "epoch": 0.1620067345162877, + "grad_norm": 2.8019471168518066, + "learning_rate": 1.7389758963735967e-05, + "loss": 0.7453, + "step": 2141 + }, + { + "epoch": 0.1620824032386213, + "grad_norm": 2.7168564796447754, + "learning_rate": 1.7388943499052246e-05, + "loss": 0.8727, + "step": 2142 + }, + { + "epoch": 0.16215807196095494, + "grad_norm": 2.5624351501464844, + "learning_rate": 1.7388127509025748e-05, + "loss": 0.7883, + "step": 2143 + }, + { + "epoch": 0.16223374068328855, + "grad_norm": 2.199237585067749, + "learning_rate": 
1.738731099370758e-05, + "loss": 0.8322, + "step": 2144 + }, + { + "epoch": 0.1623094094056222, + "grad_norm": 2.3310837745666504, + "learning_rate": 1.7386493953148867e-05, + "loss": 0.7478, + "step": 2145 + }, + { + "epoch": 0.1623850781279558, + "grad_norm": 2.901883363723755, + "learning_rate": 1.7385676387400777e-05, + "loss": 0.7515, + "step": 2146 + }, + { + "epoch": 0.16246074685028944, + "grad_norm": 2.0343589782714844, + "learning_rate": 1.7384858296514507e-05, + "loss": 0.6157, + "step": 2147 + }, + { + "epoch": 0.16253641557262305, + "grad_norm": 2.7100648880004883, + "learning_rate": 1.7384039680541295e-05, + "loss": 0.8054, + "step": 2148 + }, + { + "epoch": 0.1626120842949567, + "grad_norm": 2.2112245559692383, + "learning_rate": 1.7383220539532396e-05, + "loss": 0.7847, + "step": 2149 + }, + { + "epoch": 0.1626877530172903, + "grad_norm": 2.090649127960205, + "learning_rate": 1.7382400873539117e-05, + "loss": 0.7328, + "step": 2150 + }, + { + "epoch": 0.16276342173962394, + "grad_norm": 3.082857847213745, + "learning_rate": 1.738158068261278e-05, + "loss": 0.6708, + "step": 2151 + }, + { + "epoch": 0.16283909046195755, + "grad_norm": 2.9798154830932617, + "learning_rate": 1.7380759966804754e-05, + "loss": 0.9559, + "step": 2152 + }, + { + "epoch": 0.16291475918429119, + "grad_norm": 2.2334978580474854, + "learning_rate": 1.7379938726166428e-05, + "loss": 0.6963, + "step": 2153 + }, + { + "epoch": 0.1629904279066248, + "grad_norm": 3.03721022605896, + "learning_rate": 1.737911696074924e-05, + "loss": 0.7875, + "step": 2154 + }, + { + "epoch": 0.1630660966289584, + "grad_norm": 2.496274471282959, + "learning_rate": 1.7378294670604644e-05, + "loss": 0.8951, + "step": 2155 + }, + { + "epoch": 0.16314176535129205, + "grad_norm": 2.7649779319763184, + "learning_rate": 1.7377471855784138e-05, + "loss": 0.8018, + "step": 2156 + }, + { + "epoch": 0.16321743407362566, + "grad_norm": 5.206967353820801, + "learning_rate": 1.7376648516339247e-05, + "loss": 0.7761, + "step": 2157 + }, + { + "epoch": 0.1632931027959593, + "grad_norm": 2.439495086669922, + "learning_rate": 1.7375824652321533e-05, + "loss": 0.6983, + "step": 2158 + }, + { + "epoch": 0.1633687715182929, + "grad_norm": 2.5603365898132324, + "learning_rate": 1.737500026378259e-05, + "loss": 0.7642, + "step": 2159 + }, + { + "epoch": 0.16344444024062654, + "grad_norm": 2.957632303237915, + "learning_rate": 1.7374175350774042e-05, + "loss": 0.829, + "step": 2160 + }, + { + "epoch": 0.16352010896296015, + "grad_norm": 2.3568167686462402, + "learning_rate": 1.7373349913347546e-05, + "loss": 0.8891, + "step": 2161 + }, + { + "epoch": 0.1635957776852938, + "grad_norm": 2.483896017074585, + "learning_rate": 1.7372523951554797e-05, + "loss": 0.6859, + "step": 2162 + }, + { + "epoch": 0.1636714464076274, + "grad_norm": 2.430391788482666, + "learning_rate": 1.737169746544752e-05, + "loss": 0.8047, + "step": 2163 + }, + { + "epoch": 0.16374711512996104, + "grad_norm": 2.568268060684204, + "learning_rate": 1.7370870455077468e-05, + "loss": 0.6092, + "step": 2164 + }, + { + "epoch": 0.16382278385229465, + "grad_norm": 2.6781516075134277, + "learning_rate": 1.7370042920496433e-05, + "loss": 0.7879, + "step": 2165 + }, + { + "epoch": 0.1638984525746283, + "grad_norm": 2.247915506362915, + "learning_rate": 1.7369214861756238e-05, + "loss": 0.7788, + "step": 2166 + }, + { + "epoch": 0.1639741212969619, + "grad_norm": 2.176671028137207, + "learning_rate": 1.7368386278908742e-05, + "loss": 0.8544, + "step": 2167 + }, + { + "epoch": 
0.1640497900192955, + "grad_norm": 2.515249013900757, + "learning_rate": 1.7367557172005827e-05, + "loss": 0.7041, + "step": 2168 + }, + { + "epoch": 0.16412545874162915, + "grad_norm": 2.1374387741088867, + "learning_rate": 1.736672754109942e-05, + "loss": 0.8569, + "step": 2169 + }, + { + "epoch": 0.16420112746396276, + "grad_norm": 2.5364511013031006, + "learning_rate": 1.7365897386241472e-05, + "loss": 0.7735, + "step": 2170 + }, + { + "epoch": 0.1642767961862964, + "grad_norm": 2.0026803016662598, + "learning_rate": 1.7365066707483972e-05, + "loss": 0.8604, + "step": 2171 + }, + { + "epoch": 0.16435246490863, + "grad_norm": 20.266813278198242, + "learning_rate": 1.736423550487894e-05, + "loss": 0.871, + "step": 2172 + }, + { + "epoch": 0.16442813363096365, + "grad_norm": 1.7573026418685913, + "learning_rate": 1.736340377847843e-05, + "loss": 0.6222, + "step": 2173 + }, + { + "epoch": 0.16450380235329726, + "grad_norm": 2.631108522415161, + "learning_rate": 1.736257152833452e-05, + "loss": 0.7496, + "step": 2174 + }, + { + "epoch": 0.1645794710756309, + "grad_norm": 2.149601459503174, + "learning_rate": 1.7361738754499332e-05, + "loss": 0.7281, + "step": 2175 + }, + { + "epoch": 0.1646551397979645, + "grad_norm": 2.6889591217041016, + "learning_rate": 1.736090545702502e-05, + "loss": 0.7323, + "step": 2176 + }, + { + "epoch": 0.16473080852029814, + "grad_norm": 2.2632665634155273, + "learning_rate": 1.736007163596377e-05, + "loss": 0.8867, + "step": 2177 + }, + { + "epoch": 0.16480647724263175, + "grad_norm": 2.988801956176758, + "learning_rate": 1.735923729136779e-05, + "loss": 0.8742, + "step": 2178 + }, + { + "epoch": 0.1648821459649654, + "grad_norm": 2.2661144733428955, + "learning_rate": 1.7358402423289332e-05, + "loss": 0.6946, + "step": 2179 + }, + { + "epoch": 0.164957814687299, + "grad_norm": 2.401752233505249, + "learning_rate": 1.735756703178068e-05, + "loss": 0.8024, + "step": 2180 + }, + { + "epoch": 0.16503348340963264, + "grad_norm": 2.7266244888305664, + "learning_rate": 1.7356731116894153e-05, + "loss": 0.7484, + "step": 2181 + }, + { + "epoch": 0.16510915213196625, + "grad_norm": 3.7801167964935303, + "learning_rate": 1.7355894678682094e-05, + "loss": 0.7794, + "step": 2182 + }, + { + "epoch": 0.16518482085429986, + "grad_norm": 2.931405544281006, + "learning_rate": 1.7355057717196883e-05, + "loss": 0.6981, + "step": 2183 + }, + { + "epoch": 0.1652604895766335, + "grad_norm": 3.314436435699463, + "learning_rate": 1.7354220232490932e-05, + "loss": 0.8774, + "step": 2184 + }, + { + "epoch": 0.1653361582989671, + "grad_norm": 2.7740743160247803, + "learning_rate": 1.735338222461669e-05, + "loss": 0.6553, + "step": 2185 + }, + { + "epoch": 0.16541182702130075, + "grad_norm": 2.309906482696533, + "learning_rate": 1.735254369362664e-05, + "loss": 0.7038, + "step": 2186 + }, + { + "epoch": 0.16548749574363436, + "grad_norm": 2.7351341247558594, + "learning_rate": 1.7351704639573284e-05, + "loss": 0.7777, + "step": 2187 + }, + { + "epoch": 0.165563164465968, + "grad_norm": 3.7266592979431152, + "learning_rate": 1.735086506250917e-05, + "loss": 0.7223, + "step": 2188 + }, + { + "epoch": 0.1656388331883016, + "grad_norm": 7.991847038269043, + "learning_rate": 1.7350024962486876e-05, + "loss": 0.6462, + "step": 2189 + }, + { + "epoch": 0.16571450191063525, + "grad_norm": 2.2966339588165283, + "learning_rate": 1.7349184339559015e-05, + "loss": 0.8108, + "step": 2190 + }, + { + "epoch": 0.16579017063296886, + "grad_norm": 2.601431131362915, + "learning_rate": 
1.7348343193778223e-05, + "loss": 0.7492, + "step": 2191 + }, + { + "epoch": 0.1658658393553025, + "grad_norm": 3.003119945526123, + "learning_rate": 1.7347501525197177e-05, + "loss": 0.7404, + "step": 2192 + }, + { + "epoch": 0.1659415080776361, + "grad_norm": 2.2687859535217285, + "learning_rate": 1.734665933386859e-05, + "loss": 0.7979, + "step": 2193 + }, + { + "epoch": 0.16601717679996975, + "grad_norm": 2.463181257247925, + "learning_rate": 1.73458166198452e-05, + "loss": 0.8188, + "step": 2194 + }, + { + "epoch": 0.16609284552230336, + "grad_norm": 2.8143796920776367, + "learning_rate": 1.7344973383179776e-05, + "loss": 0.8257, + "step": 2195 + }, + { + "epoch": 0.16616851424463697, + "grad_norm": 2.4394776821136475, + "learning_rate": 1.7344129623925128e-05, + "loss": 0.7174, + "step": 2196 + }, + { + "epoch": 0.1662441829669706, + "grad_norm": 2.9498252868652344, + "learning_rate": 1.7343285342134096e-05, + "loss": 0.7468, + "step": 2197 + }, + { + "epoch": 0.16631985168930422, + "grad_norm": 2.7809300422668457, + "learning_rate": 1.734244053785955e-05, + "loss": 0.917, + "step": 2198 + }, + { + "epoch": 0.16639552041163785, + "grad_norm": 2.0343682765960693, + "learning_rate": 1.7341595211154397e-05, + "loss": 0.8867, + "step": 2199 + }, + { + "epoch": 0.16647118913397146, + "grad_norm": 2.406003713607788, + "learning_rate": 1.7340749362071567e-05, + "loss": 0.7902, + "step": 2200 + }, + { + "epoch": 0.1665468578563051, + "grad_norm": 3.3123080730438232, + "learning_rate": 1.733990299066404e-05, + "loss": 0.8509, + "step": 2201 + }, + { + "epoch": 0.1666225265786387, + "grad_norm": 2.0788116455078125, + "learning_rate": 1.733905609698481e-05, + "loss": 0.6832, + "step": 2202 + }, + { + "epoch": 0.16669819530097235, + "grad_norm": 2.732825517654419, + "learning_rate": 1.7338208681086916e-05, + "loss": 0.789, + "step": 2203 + }, + { + "epoch": 0.16677386402330596, + "grad_norm": 2.7512941360473633, + "learning_rate": 1.7337360743023425e-05, + "loss": 0.9523, + "step": 2204 + }, + { + "epoch": 0.1668495327456396, + "grad_norm": 2.181548833847046, + "learning_rate": 1.733651228284744e-05, + "loss": 0.7516, + "step": 2205 + }, + { + "epoch": 0.1669252014679732, + "grad_norm": 2.5979111194610596, + "learning_rate": 1.733566330061209e-05, + "loss": 0.861, + "step": 2206 + }, + { + "epoch": 0.16700087019030685, + "grad_norm": 2.890141248703003, + "learning_rate": 1.7334813796370546e-05, + "loss": 0.8529, + "step": 2207 + }, + { + "epoch": 0.16707653891264046, + "grad_norm": 2.580782175064087, + "learning_rate": 1.7333963770176002e-05, + "loss": 0.8297, + "step": 2208 + }, + { + "epoch": 0.16715220763497407, + "grad_norm": 3.2536733150482178, + "learning_rate": 1.7333113222081692e-05, + "loss": 0.723, + "step": 2209 + }, + { + "epoch": 0.1672278763573077, + "grad_norm": 2.6812384128570557, + "learning_rate": 1.733226215214088e-05, + "loss": 0.84, + "step": 2210 + }, + { + "epoch": 0.16730354507964132, + "grad_norm": 4.129171371459961, + "learning_rate": 1.733141056040686e-05, + "loss": 0.8535, + "step": 2211 + }, + { + "epoch": 0.16737921380197496, + "grad_norm": 2.4980597496032715, + "learning_rate": 1.7330558446932965e-05, + "loss": 0.8225, + "step": 2212 + }, + { + "epoch": 0.16745488252430857, + "grad_norm": 2.615471839904785, + "learning_rate": 1.7329705811772556e-05, + "loss": 0.7097, + "step": 2213 + }, + { + "epoch": 0.1675305512466422, + "grad_norm": 2.604362964630127, + "learning_rate": 1.7328852654979026e-05, + "loss": 0.8121, + "step": 2214 + }, + { + "epoch": 
0.16760621996897582, + "grad_norm": 2.1432902812957764, + "learning_rate": 1.732799897660581e-05, + "loss": 0.5405, + "step": 2215 + }, + { + "epoch": 0.16768188869130946, + "grad_norm": 2.9844110012054443, + "learning_rate": 1.7327144776706355e-05, + "loss": 0.8734, + "step": 2216 + }, + { + "epoch": 0.16775755741364307, + "grad_norm": 2.605469226837158, + "learning_rate": 1.7326290055334162e-05, + "loss": 0.8101, + "step": 2217 + }, + { + "epoch": 0.1678332261359767, + "grad_norm": 3.859015464782715, + "learning_rate": 1.7325434812542757e-05, + "loss": 0.7934, + "step": 2218 + }, + { + "epoch": 0.16790889485831031, + "grad_norm": 2.154299020767212, + "learning_rate": 1.7324579048385696e-05, + "loss": 0.7312, + "step": 2219 + }, + { + "epoch": 0.16798456358064395, + "grad_norm": 3.647308111190796, + "learning_rate": 1.732372276291657e-05, + "loss": 0.7668, + "step": 2220 + }, + { + "epoch": 0.16806023230297756, + "grad_norm": 2.0847365856170654, + "learning_rate": 1.7322865956189003e-05, + "loss": 0.7016, + "step": 2221 + }, + { + "epoch": 0.16813590102531117, + "grad_norm": 2.722703695297241, + "learning_rate": 1.732200862825665e-05, + "loss": 0.7803, + "step": 2222 + }, + { + "epoch": 0.1682115697476448, + "grad_norm": 2.674581527709961, + "learning_rate": 1.7321150779173197e-05, + "loss": 0.854, + "step": 2223 + }, + { + "epoch": 0.16828723846997842, + "grad_norm": 3.1795260906219482, + "learning_rate": 1.732029240899237e-05, + "loss": 0.7935, + "step": 2224 + }, + { + "epoch": 0.16836290719231206, + "grad_norm": 2.396897792816162, + "learning_rate": 1.7319433517767923e-05, + "loss": 0.7769, + "step": 2225 + }, + { + "epoch": 0.16843857591464567, + "grad_norm": 2.776615619659424, + "learning_rate": 1.731857410555364e-05, + "loss": 0.6984, + "step": 2226 + }, + { + "epoch": 0.1685142446369793, + "grad_norm": 2.690028429031372, + "learning_rate": 1.731771417240334e-05, + "loss": 0.7615, + "step": 2227 + }, + { + "epoch": 0.16858991335931292, + "grad_norm": 2.915459156036377, + "learning_rate": 1.731685371837088e-05, + "loss": 0.9331, + "step": 2228 + }, + { + "epoch": 0.16866558208164656, + "grad_norm": 2.515017509460449, + "learning_rate": 1.7315992743510135e-05, + "loss": 0.7996, + "step": 2229 + }, + { + "epoch": 0.16874125080398017, + "grad_norm": 2.5439369678497314, + "learning_rate": 1.7315131247875028e-05, + "loss": 0.873, + "step": 2230 + }, + { + "epoch": 0.1688169195263138, + "grad_norm": 3.400592803955078, + "learning_rate": 1.7314269231519512e-05, + "loss": 0.8382, + "step": 2231 + }, + { + "epoch": 0.16889258824864742, + "grad_norm": 3.8131964206695557, + "learning_rate": 1.7313406694497562e-05, + "loss": 0.786, + "step": 2232 + }, + { + "epoch": 0.16896825697098106, + "grad_norm": 2.194751501083374, + "learning_rate": 1.7312543636863197e-05, + "loss": 0.7376, + "step": 2233 + }, + { + "epoch": 0.16904392569331467, + "grad_norm": 2.373616933822632, + "learning_rate": 1.731168005867046e-05, + "loss": 0.6248, + "step": 2234 + }, + { + "epoch": 0.1691195944156483, + "grad_norm": 2.5149641036987305, + "learning_rate": 1.731081595997344e-05, + "loss": 0.8259, + "step": 2235 + }, + { + "epoch": 0.16919526313798192, + "grad_norm": 2.6134889125823975, + "learning_rate": 1.730995134082624e-05, + "loss": 0.804, + "step": 2236 + }, + { + "epoch": 0.16927093186031553, + "grad_norm": 2.909189462661743, + "learning_rate": 1.730908620128301e-05, + "loss": 0.8914, + "step": 2237 + }, + { + "epoch": 0.16934660058264916, + "grad_norm": 2.5435116291046143, + "learning_rate": 
1.7308220541397926e-05, + "loss": 0.8368, + "step": 2238 + }, + { + "epoch": 0.16942226930498278, + "grad_norm": 2.7107224464416504, + "learning_rate": 1.7307354361225204e-05, + "loss": 0.9474, + "step": 2239 + }, + { + "epoch": 0.1694979380273164, + "grad_norm": 2.7715609073638916, + "learning_rate": 1.730648766081908e-05, + "loss": 0.7627, + "step": 2240 + }, + { + "epoch": 0.16957360674965002, + "grad_norm": 2.654773473739624, + "learning_rate": 1.730562044023383e-05, + "loss": 0.7729, + "step": 2241 + }, + { + "epoch": 0.16964927547198366, + "grad_norm": 2.386650800704956, + "learning_rate": 1.730475269952377e-05, + "loss": 0.6608, + "step": 2242 + }, + { + "epoch": 0.16972494419431727, + "grad_norm": 2.307753086090088, + "learning_rate": 1.730388443874323e-05, + "loss": 0.7689, + "step": 2243 + }, + { + "epoch": 0.1698006129166509, + "grad_norm": 2.196772336959839, + "learning_rate": 1.7303015657946592e-05, + "loss": 0.7859, + "step": 2244 + }, + { + "epoch": 0.16987628163898452, + "grad_norm": 2.593203544616699, + "learning_rate": 1.730214635718826e-05, + "loss": 0.7733, + "step": 2245 + }, + { + "epoch": 0.16995195036131816, + "grad_norm": 2.494314193725586, + "learning_rate": 1.7301276536522664e-05, + "loss": 0.7156, + "step": 2246 + }, + { + "epoch": 0.17002761908365177, + "grad_norm": 2.721299648284912, + "learning_rate": 1.7300406196004286e-05, + "loss": 0.738, + "step": 2247 + }, + { + "epoch": 0.1701032878059854, + "grad_norm": 2.1940648555755615, + "learning_rate": 1.7299535335687622e-05, + "loss": 0.7942, + "step": 2248 + }, + { + "epoch": 0.17017895652831902, + "grad_norm": 2.7148277759552, + "learning_rate": 1.7298663955627216e-05, + "loss": 0.8078, + "step": 2249 + }, + { + "epoch": 0.17025462525065263, + "grad_norm": 3.0322153568267822, + "learning_rate": 1.729779205587763e-05, + "loss": 0.9257, + "step": 2250 + }, + { + "epoch": 0.17033029397298627, + "grad_norm": 2.168626546859741, + "learning_rate": 1.7296919636493464e-05, + "loss": 0.8326, + "step": 2251 + }, + { + "epoch": 0.17040596269531988, + "grad_norm": 2.8294248580932617, + "learning_rate": 1.729604669752936e-05, + "loss": 0.8557, + "step": 2252 + }, + { + "epoch": 0.17048163141765352, + "grad_norm": 2.4835293292999268, + "learning_rate": 1.7295173239039975e-05, + "loss": 0.7724, + "step": 2253 + }, + { + "epoch": 0.17055730013998713, + "grad_norm": 3.3324198722839355, + "learning_rate": 1.7294299261080015e-05, + "loss": 0.7424, + "step": 2254 + }, + { + "epoch": 0.17063296886232077, + "grad_norm": 2.104118824005127, + "learning_rate": 1.7293424763704206e-05, + "loss": 0.7898, + "step": 2255 + }, + { + "epoch": 0.17070863758465438, + "grad_norm": 2.7089343070983887, + "learning_rate": 1.7292549746967316e-05, + "loss": 0.9443, + "step": 2256 + }, + { + "epoch": 0.17078430630698802, + "grad_norm": 3.223379611968994, + "learning_rate": 1.7291674210924138e-05, + "loss": 0.7764, + "step": 2257 + }, + { + "epoch": 0.17085997502932163, + "grad_norm": 2.709465980529785, + "learning_rate": 1.7290798155629502e-05, + "loss": 0.7964, + "step": 2258 + }, + { + "epoch": 0.17093564375165526, + "grad_norm": 2.5600428581237793, + "learning_rate": 1.7289921581138273e-05, + "loss": 0.7413, + "step": 2259 + }, + { + "epoch": 0.17101131247398887, + "grad_norm": 1.963610053062439, + "learning_rate": 1.7289044487505337e-05, + "loss": 0.6844, + "step": 2260 + }, + { + "epoch": 0.1710869811963225, + "grad_norm": 2.8103370666503906, + "learning_rate": 1.728816687478563e-05, + "loss": 0.6223, + "step": 2261 + }, + { + "epoch": 
0.17116264991865612, + "grad_norm": 2.9981131553649902, + "learning_rate": 1.7287288743034103e-05, + "loss": 0.7519, + "step": 2262 + }, + { + "epoch": 0.17123831864098973, + "grad_norm": 2.655627965927124, + "learning_rate": 1.728641009230575e-05, + "loss": 0.7625, + "step": 2263 + }, + { + "epoch": 0.17131398736332337, + "grad_norm": 2.775040626525879, + "learning_rate": 1.72855309226556e-05, + "loss": 0.7466, + "step": 2264 + }, + { + "epoch": 0.17138965608565698, + "grad_norm": 3.4195356369018555, + "learning_rate": 1.72846512341387e-05, + "loss": 0.6637, + "step": 2265 + }, + { + "epoch": 0.17146532480799062, + "grad_norm": 3.398912191390991, + "learning_rate": 1.7283771026810144e-05, + "loss": 0.7456, + "step": 2266 + }, + { + "epoch": 0.17154099353032423, + "grad_norm": 2.160043478012085, + "learning_rate": 1.7282890300725054e-05, + "loss": 0.6909, + "step": 2267 + }, + { + "epoch": 0.17161666225265787, + "grad_norm": 1.9356389045715332, + "learning_rate": 1.7282009055938587e-05, + "loss": 0.7153, + "step": 2268 + }, + { + "epoch": 0.17169233097499148, + "grad_norm": 3.335268259048462, + "learning_rate": 1.728112729250592e-05, + "loss": 0.6971, + "step": 2269 + }, + { + "epoch": 0.17176799969732512, + "grad_norm": 2.4439728260040283, + "learning_rate": 1.728024501048228e-05, + "loss": 0.9026, + "step": 2270 + }, + { + "epoch": 0.17184366841965873, + "grad_norm": 3.08240008354187, + "learning_rate": 1.7279362209922922e-05, + "loss": 0.7948, + "step": 2271 + }, + { + "epoch": 0.17191933714199237, + "grad_norm": 2.764417886734009, + "learning_rate": 1.7278478890883115e-05, + "loss": 0.8714, + "step": 2272 + }, + { + "epoch": 0.17199500586432598, + "grad_norm": 2.609437942504883, + "learning_rate": 1.727759505341819e-05, + "loss": 0.7332, + "step": 2273 + }, + { + "epoch": 0.17207067458665962, + "grad_norm": 2.7951455116271973, + "learning_rate": 1.7276710697583485e-05, + "loss": 0.7675, + "step": 2274 + }, + { + "epoch": 0.17214634330899323, + "grad_norm": 2.2547028064727783, + "learning_rate": 1.7275825823434386e-05, + "loss": 0.7684, + "step": 2275 + }, + { + "epoch": 0.17222201203132684, + "grad_norm": 2.3958706855773926, + "learning_rate": 1.727494043102631e-05, + "loss": 0.7689, + "step": 2276 + }, + { + "epoch": 0.17229768075366048, + "grad_norm": 2.6258463859558105, + "learning_rate": 1.7274054520414697e-05, + "loss": 0.6702, + "step": 2277 + }, + { + "epoch": 0.1723733494759941, + "grad_norm": 2.128289222717285, + "learning_rate": 1.7273168091655028e-05, + "loss": 0.8102, + "step": 2278 + }, + { + "epoch": 0.17244901819832772, + "grad_norm": 2.5921168327331543, + "learning_rate": 1.727228114480282e-05, + "loss": 0.7875, + "step": 2279 + }, + { + "epoch": 0.17252468692066134, + "grad_norm": 2.994091033935547, + "learning_rate": 1.7271393679913604e-05, + "loss": 0.8095, + "step": 2280 + }, + { + "epoch": 0.17260035564299497, + "grad_norm": 2.6522233486175537, + "learning_rate": 1.7270505697042966e-05, + "loss": 0.8525, + "step": 2281 + }, + { + "epoch": 0.17267602436532858, + "grad_norm": 1.99582040309906, + "learning_rate": 1.7269617196246514e-05, + "loss": 0.7875, + "step": 2282 + }, + { + "epoch": 0.17275169308766222, + "grad_norm": 2.60740065574646, + "learning_rate": 1.726872817757988e-05, + "loss": 0.8859, + "step": 2283 + }, + { + "epoch": 0.17282736180999583, + "grad_norm": 1.9426453113555908, + "learning_rate": 1.7267838641098748e-05, + "loss": 0.9347, + "step": 2284 + }, + { + "epoch": 0.17290303053232947, + "grad_norm": 2.676067352294922, + "learning_rate": 
1.7266948586858816e-05, + "loss": 0.674, + "step": 2285 + }, + { + "epoch": 0.17297869925466308, + "grad_norm": 2.255591869354248, + "learning_rate": 1.7266058014915826e-05, + "loss": 0.7917, + "step": 2286 + }, + { + "epoch": 0.17305436797699672, + "grad_norm": 2.7783968448638916, + "learning_rate": 1.7265166925325547e-05, + "loss": 0.8044, + "step": 2287 + }, + { + "epoch": 0.17313003669933033, + "grad_norm": 2.7550559043884277, + "learning_rate": 1.7264275318143784e-05, + "loss": 0.6446, + "step": 2288 + }, + { + "epoch": 0.17320570542166394, + "grad_norm": 3.500746488571167, + "learning_rate": 1.726338319342637e-05, + "loss": 0.8033, + "step": 2289 + }, + { + "epoch": 0.17328137414399758, + "grad_norm": 2.1858432292938232, + "learning_rate": 1.7262490551229173e-05, + "loss": 0.647, + "step": 2290 + }, + { + "epoch": 0.1733570428663312, + "grad_norm": 2.3065178394317627, + "learning_rate": 1.726159739160809e-05, + "loss": 0.9231, + "step": 2291 + }, + { + "epoch": 0.17343271158866483, + "grad_norm": 2.7201311588287354, + "learning_rate": 1.7260703714619062e-05, + "loss": 0.8506, + "step": 2292 + }, + { + "epoch": 0.17350838031099844, + "grad_norm": 2.0879547595977783, + "learning_rate": 1.725980952031805e-05, + "loss": 0.8429, + "step": 2293 + }, + { + "epoch": 0.17358404903333208, + "grad_norm": 6.11818265914917, + "learning_rate": 1.7258914808761048e-05, + "loss": 0.6266, + "step": 2294 + }, + { + "epoch": 0.1736597177556657, + "grad_norm": 2.2385120391845703, + "learning_rate": 1.7258019580004084e-05, + "loss": 0.8526, + "step": 2295 + }, + { + "epoch": 0.17373538647799933, + "grad_norm": 3.8654024600982666, + "learning_rate": 1.725712383410323e-05, + "loss": 0.7531, + "step": 2296 + }, + { + "epoch": 0.17381105520033294, + "grad_norm": 2.5260634422302246, + "learning_rate": 1.7256227571114577e-05, + "loss": 0.7502, + "step": 2297 + }, + { + "epoch": 0.17388672392266658, + "grad_norm": 2.305957794189453, + "learning_rate": 1.7255330791094244e-05, + "loss": 0.7199, + "step": 2298 + }, + { + "epoch": 0.17396239264500019, + "grad_norm": 2.708401918411255, + "learning_rate": 1.7254433494098393e-05, + "loss": 0.762, + "step": 2299 + }, + { + "epoch": 0.17403806136733382, + "grad_norm": 3.0765719413757324, + "learning_rate": 1.7253535680183228e-05, + "loss": 0.6405, + "step": 2300 + }, + { + "epoch": 0.17411373008966743, + "grad_norm": 2.6883769035339355, + "learning_rate": 1.7252637349404956e-05, + "loss": 0.7477, + "step": 2301 + }, + { + "epoch": 0.17418939881200107, + "grad_norm": 5.647037982940674, + "learning_rate": 1.725173850181984e-05, + "loss": 0.833, + "step": 2302 + }, + { + "epoch": 0.17426506753433468, + "grad_norm": 2.7593023777008057, + "learning_rate": 1.725083913748418e-05, + "loss": 0.8364, + "step": 2303 + }, + { + "epoch": 0.1743407362566683, + "grad_norm": 2.6544225215911865, + "learning_rate": 1.7249939256454277e-05, + "loss": 0.7959, + "step": 2304 + }, + { + "epoch": 0.17441640497900193, + "grad_norm": 2.8478567600250244, + "learning_rate": 1.7249038858786496e-05, + "loss": 0.8185, + "step": 2305 + }, + { + "epoch": 0.17449207370133554, + "grad_norm": 2.937596082687378, + "learning_rate": 1.7248137944537224e-05, + "loss": 0.7666, + "step": 2306 + }, + { + "epoch": 0.17456774242366918, + "grad_norm": 2.3636603355407715, + "learning_rate": 1.7247236513762876e-05, + "loss": 0.8019, + "step": 2307 + }, + { + "epoch": 0.1746434111460028, + "grad_norm": 2.4322621822357178, + "learning_rate": 1.72463345665199e-05, + "loss": 0.7484, + "step": 2308 + }, + { + "epoch": 
0.17471907986833643, + "grad_norm": 2.590067148208618, + "learning_rate": 1.7245432102864782e-05, + "loss": 0.6762, + "step": 2309 + }, + { + "epoch": 0.17479474859067004, + "grad_norm": 2.2933037281036377, + "learning_rate": 1.7244529122854035e-05, + "loss": 0.7488, + "step": 2310 + }, + { + "epoch": 0.17487041731300368, + "grad_norm": 2.3996517658233643, + "learning_rate": 1.724362562654421e-05, + "loss": 0.7258, + "step": 2311 + }, + { + "epoch": 0.1749460860353373, + "grad_norm": 2.4362945556640625, + "learning_rate": 1.7242721613991887e-05, + "loss": 0.743, + "step": 2312 + }, + { + "epoch": 0.17502175475767093, + "grad_norm": 2.572498321533203, + "learning_rate": 1.7241817085253678e-05, + "loss": 0.7258, + "step": 2313 + }, + { + "epoch": 0.17509742348000454, + "grad_norm": 2.252002477645874, + "learning_rate": 1.724091204038622e-05, + "loss": 0.752, + "step": 2314 + }, + { + "epoch": 0.17517309220233818, + "grad_norm": 2.6183085441589355, + "learning_rate": 1.7240006479446202e-05, + "loss": 0.5796, + "step": 2315 + }, + { + "epoch": 0.1752487609246718, + "grad_norm": 3.0919864177703857, + "learning_rate": 1.723910040249032e-05, + "loss": 0.841, + "step": 2316 + }, + { + "epoch": 0.1753244296470054, + "grad_norm": 2.5486013889312744, + "learning_rate": 1.7238193809575325e-05, + "loss": 0.8376, + "step": 2317 + }, + { + "epoch": 0.17540009836933904, + "grad_norm": 2.988577127456665, + "learning_rate": 1.723728670075799e-05, + "loss": 0.8065, + "step": 2318 + }, + { + "epoch": 0.17547576709167265, + "grad_norm": 2.734192371368408, + "learning_rate": 1.7236379076095118e-05, + "loss": 0.7786, + "step": 2319 + }, + { + "epoch": 0.17555143581400628, + "grad_norm": 2.8744254112243652, + "learning_rate": 1.723547093564355e-05, + "loss": 0.8307, + "step": 2320 + }, + { + "epoch": 0.1756271045363399, + "grad_norm": 2.407675266265869, + "learning_rate": 1.7234562279460156e-05, + "loss": 0.7314, + "step": 2321 + }, + { + "epoch": 0.17570277325867353, + "grad_norm": 2.588304281234741, + "learning_rate": 1.7233653107601833e-05, + "loss": 0.7172, + "step": 2322 + }, + { + "epoch": 0.17577844198100714, + "grad_norm": 2.3886618614196777, + "learning_rate": 1.7232743420125526e-05, + "loss": 0.9478, + "step": 2323 + }, + { + "epoch": 0.17585411070334078, + "grad_norm": 3.020280599594116, + "learning_rate": 1.7231833217088195e-05, + "loss": 0.8497, + "step": 2324 + }, + { + "epoch": 0.1759297794256744, + "grad_norm": 2.5860559940338135, + "learning_rate": 1.7230922498546847e-05, + "loss": 0.8636, + "step": 2325 + }, + { + "epoch": 0.17600544814800803, + "grad_norm": 2.4977405071258545, + "learning_rate": 1.7230011264558506e-05, + "loss": 0.8239, + "step": 2326 + }, + { + "epoch": 0.17608111687034164, + "grad_norm": 2.03897762298584, + "learning_rate": 1.7229099515180243e-05, + "loss": 0.6944, + "step": 2327 + }, + { + "epoch": 0.17615678559267528, + "grad_norm": 3.2319719791412354, + "learning_rate": 1.7228187250469154e-05, + "loss": 0.9229, + "step": 2328 + }, + { + "epoch": 0.1762324543150089, + "grad_norm": 2.1355583667755127, + "learning_rate": 1.7227274470482363e-05, + "loss": 0.9233, + "step": 2329 + }, + { + "epoch": 0.1763081230373425, + "grad_norm": 2.936140298843384, + "learning_rate": 1.7226361175277034e-05, + "loss": 0.6154, + "step": 2330 + }, + { + "epoch": 0.17638379175967614, + "grad_norm": 2.1924805641174316, + "learning_rate": 1.7225447364910364e-05, + "loss": 0.9198, + "step": 2331 + }, + { + "epoch": 0.17645946048200975, + "grad_norm": 2.7870981693267822, + "learning_rate": 
1.7224533039439573e-05, + "loss": 0.7164, + "step": 2332 + }, + { + "epoch": 0.1765351292043434, + "grad_norm": 2.4783294200897217, + "learning_rate": 1.722361819892192e-05, + "loss": 0.6998, + "step": 2333 + }, + { + "epoch": 0.176610797926677, + "grad_norm": 2.5335793495178223, + "learning_rate": 1.7222702843414703e-05, + "loss": 0.7745, + "step": 2334 + }, + { + "epoch": 0.17668646664901064, + "grad_norm": 2.6729066371917725, + "learning_rate": 1.7221786972975234e-05, + "loss": 0.8394, + "step": 2335 + }, + { + "epoch": 0.17676213537134425, + "grad_norm": 2.4603934288024902, + "learning_rate": 1.7220870587660872e-05, + "loss": 0.8235, + "step": 2336 + }, + { + "epoch": 0.17683780409367789, + "grad_norm": 2.5660200119018555, + "learning_rate": 1.7219953687529006e-05, + "loss": 0.8368, + "step": 2337 + }, + { + "epoch": 0.1769134728160115, + "grad_norm": 2.5912487506866455, + "learning_rate": 1.7219036272637054e-05, + "loss": 0.7614, + "step": 2338 + }, + { + "epoch": 0.17698914153834513, + "grad_norm": 2.4109630584716797, + "learning_rate": 1.7218118343042468e-05, + "loss": 0.693, + "step": 2339 + }, + { + "epoch": 0.17706481026067875, + "grad_norm": 2.0487425327301025, + "learning_rate": 1.7217199898802726e-05, + "loss": 0.9291, + "step": 2340 + }, + { + "epoch": 0.17714047898301238, + "grad_norm": 2.833705425262451, + "learning_rate": 1.721628093997535e-05, + "loss": 0.8157, + "step": 2341 + }, + { + "epoch": 0.177216147705346, + "grad_norm": 2.298569679260254, + "learning_rate": 1.7215361466617892e-05, + "loss": 0.8041, + "step": 2342 + }, + { + "epoch": 0.1772918164276796, + "grad_norm": 2.4448776245117188, + "learning_rate": 1.7214441478787923e-05, + "loss": 0.8001, + "step": 2343 + }, + { + "epoch": 0.17736748515001324, + "grad_norm": 2.5493338108062744, + "learning_rate": 1.7213520976543057e-05, + "loss": 0.9375, + "step": 2344 + }, + { + "epoch": 0.17744315387234685, + "grad_norm": 2.8180043697357178, + "learning_rate": 1.7212599959940947e-05, + "loss": 0.8333, + "step": 2345 + }, + { + "epoch": 0.1775188225946805, + "grad_norm": 2.575085163116455, + "learning_rate": 1.7211678429039264e-05, + "loss": 0.7597, + "step": 2346 + }, + { + "epoch": 0.1775944913170141, + "grad_norm": 2.7505435943603516, + "learning_rate": 1.721075638389572e-05, + "loss": 0.7751, + "step": 2347 + }, + { + "epoch": 0.17767016003934774, + "grad_norm": 2.285794973373413, + "learning_rate": 1.7209833824568047e-05, + "loss": 0.7253, + "step": 2348 + }, + { + "epoch": 0.17774582876168135, + "grad_norm": 3.101331949234009, + "learning_rate": 1.7208910751114033e-05, + "loss": 0.7653, + "step": 2349 + }, + { + "epoch": 0.177821497484015, + "grad_norm": 2.0107734203338623, + "learning_rate": 1.7207987163591474e-05, + "loss": 0.7867, + "step": 2350 + }, + { + "epoch": 0.1778971662063486, + "grad_norm": 2.603510618209839, + "learning_rate": 1.720706306205821e-05, + "loss": 0.8109, + "step": 2351 + }, + { + "epoch": 0.17797283492868224, + "grad_norm": 4.241885185241699, + "learning_rate": 1.7206138446572113e-05, + "loss": 0.7568, + "step": 2352 + }, + { + "epoch": 0.17804850365101585, + "grad_norm": 2.324835777282715, + "learning_rate": 1.720521331719109e-05, + "loss": 0.6711, + "step": 2353 + }, + { + "epoch": 0.1781241723733495, + "grad_norm": 3.291964292526245, + "learning_rate": 1.7204287673973062e-05, + "loss": 0.7461, + "step": 2354 + }, + { + "epoch": 0.1781998410956831, + "grad_norm": 4.23928689956665, + "learning_rate": 1.7203361516976007e-05, + "loss": 0.6751, + "step": 2355 + }, + { + "epoch": 
0.17827550981801674, + "grad_norm": 2.199976682662964, + "learning_rate": 1.7202434846257922e-05, + "loss": 0.8, + "step": 2356 + }, + { + "epoch": 0.17835117854035035, + "grad_norm": 1.6747773885726929, + "learning_rate": 1.7201507661876838e-05, + "loss": 1.0083, + "step": 2357 + }, + { + "epoch": 0.17842684726268396, + "grad_norm": 2.7017219066619873, + "learning_rate": 1.7200579963890814e-05, + "loss": 0.8006, + "step": 2358 + }, + { + "epoch": 0.1785025159850176, + "grad_norm": 2.3280436992645264, + "learning_rate": 1.719965175235795e-05, + "loss": 0.8232, + "step": 2359 + }, + { + "epoch": 0.1785781847073512, + "grad_norm": 2.4222941398620605, + "learning_rate": 1.7198723027336374e-05, + "loss": 0.6266, + "step": 2360 + }, + { + "epoch": 0.17865385342968484, + "grad_norm": 2.6295063495635986, + "learning_rate": 1.7197793788884245e-05, + "loss": 0.7572, + "step": 2361 + }, + { + "epoch": 0.17872952215201846, + "grad_norm": 2.855011224746704, + "learning_rate": 1.7196864037059748e-05, + "loss": 0.8607, + "step": 2362 + }, + { + "epoch": 0.1788051908743521, + "grad_norm": 2.3858821392059326, + "learning_rate": 1.7195933771921118e-05, + "loss": 0.6728, + "step": 2363 + }, + { + "epoch": 0.1788808595966857, + "grad_norm": 3.4347083568573, + "learning_rate": 1.7195002993526604e-05, + "loss": 0.7073, + "step": 2364 + }, + { + "epoch": 0.17895652831901934, + "grad_norm": 2.68788480758667, + "learning_rate": 1.71940717019345e-05, + "loss": 0.853, + "step": 2365 + }, + { + "epoch": 0.17903219704135295, + "grad_norm": 2.82952880859375, + "learning_rate": 1.7193139897203122e-05, + "loss": 0.8541, + "step": 2366 + }, + { + "epoch": 0.1791078657636866, + "grad_norm": 2.4485409259796143, + "learning_rate": 1.7192207579390824e-05, + "loss": 0.6887, + "step": 2367 + }, + { + "epoch": 0.1791835344860202, + "grad_norm": 2.3793416023254395, + "learning_rate": 1.7191274748555987e-05, + "loss": 0.9804, + "step": 2368 + }, + { + "epoch": 0.17925920320835384, + "grad_norm": 2.226893663406372, + "learning_rate": 1.7190341404757034e-05, + "loss": 0.877, + "step": 2369 + }, + { + "epoch": 0.17933487193068745, + "grad_norm": 2.134824752807617, + "learning_rate": 1.7189407548052412e-05, + "loss": 0.8302, + "step": 2370 + }, + { + "epoch": 0.17941054065302106, + "grad_norm": 2.6807243824005127, + "learning_rate": 1.7188473178500602e-05, + "loss": 0.6898, + "step": 2371 + }, + { + "epoch": 0.1794862093753547, + "grad_norm": 2.6654117107391357, + "learning_rate": 1.7187538296160115e-05, + "loss": 0.6984, + "step": 2372 + }, + { + "epoch": 0.1795618780976883, + "grad_norm": 2.5605947971343994, + "learning_rate": 1.71866029010895e-05, + "loss": 0.8535, + "step": 2373 + }, + { + "epoch": 0.17963754682002195, + "grad_norm": 3.3925888538360596, + "learning_rate": 1.718566699334733e-05, + "loss": 0.8313, + "step": 2374 + }, + { + "epoch": 0.17971321554235556, + "grad_norm": 2.7094411849975586, + "learning_rate": 1.7184730572992222e-05, + "loss": 0.6807, + "step": 2375 + }, + { + "epoch": 0.1797888842646892, + "grad_norm": 3.0134902000427246, + "learning_rate": 1.718379364008281e-05, + "loss": 0.7635, + "step": 2376 + }, + { + "epoch": 0.1798645529870228, + "grad_norm": 2.536494255065918, + "learning_rate": 1.718285619467777e-05, + "loss": 0.9329, + "step": 2377 + }, + { + "epoch": 0.17994022170935645, + "grad_norm": 2.6002416610717773, + "learning_rate": 1.718191823683581e-05, + "loss": 0.9118, + "step": 2378 + }, + { + "epoch": 0.18001589043169006, + "grad_norm": 2.81915283203125, + "learning_rate": 
1.7180979766615663e-05, + "loss": 0.7235, + "step": 2379 + }, + { + "epoch": 0.1800915591540237, + "grad_norm": 2.888113260269165, + "learning_rate": 1.7180040784076106e-05, + "loss": 0.6503, + "step": 2380 + }, + { + "epoch": 0.1801672278763573, + "grad_norm": 2.9773685932159424, + "learning_rate": 1.7179101289275937e-05, + "loss": 0.704, + "step": 2381 + }, + { + "epoch": 0.18024289659869094, + "grad_norm": 2.8631527423858643, + "learning_rate": 1.717816128227399e-05, + "loss": 0.8687, + "step": 2382 + }, + { + "epoch": 0.18031856532102455, + "grad_norm": 2.364201068878174, + "learning_rate": 1.7177220763129133e-05, + "loss": 0.6779, + "step": 2383 + }, + { + "epoch": 0.18039423404335816, + "grad_norm": 2.255162239074707, + "learning_rate": 1.7176279731900264e-05, + "loss": 0.6428, + "step": 2384 + }, + { + "epoch": 0.1804699027656918, + "grad_norm": 3.1965222358703613, + "learning_rate": 1.717533818864631e-05, + "loss": 0.7524, + "step": 2385 + }, + { + "epoch": 0.1805455714880254, + "grad_norm": 2.435013771057129, + "learning_rate": 1.717439613342624e-05, + "loss": 0.8193, + "step": 2386 + }, + { + "epoch": 0.18062124021035905, + "grad_norm": 3.8150510787963867, + "learning_rate": 1.7173453566299044e-05, + "loss": 0.7033, + "step": 2387 + }, + { + "epoch": 0.18069690893269266, + "grad_norm": 2.313161849975586, + "learning_rate": 1.717251048732375e-05, + "loss": 0.8132, + "step": 2388 + }, + { + "epoch": 0.1807725776550263, + "grad_norm": 3.1583354473114014, + "learning_rate": 1.717156689655941e-05, + "loss": 0.7569, + "step": 2389 + }, + { + "epoch": 0.1808482463773599, + "grad_norm": 2.503884792327881, + "learning_rate": 1.717062279406513e-05, + "loss": 0.6835, + "step": 2390 + }, + { + "epoch": 0.18092391509969355, + "grad_norm": 2.8149566650390625, + "learning_rate": 1.716967817990002e-05, + "loss": 0.8832, + "step": 2391 + }, + { + "epoch": 0.18099958382202716, + "grad_norm": 2.804032802581787, + "learning_rate": 1.7168733054123238e-05, + "loss": 0.9157, + "step": 2392 + }, + { + "epoch": 0.1810752525443608, + "grad_norm": 2.569125175476074, + "learning_rate": 1.7167787416793973e-05, + "loss": 0.9452, + "step": 2393 + }, + { + "epoch": 0.1811509212666944, + "grad_norm": 2.8881638050079346, + "learning_rate": 1.7166841267971438e-05, + "loss": 0.8094, + "step": 2394 + }, + { + "epoch": 0.18122658998902805, + "grad_norm": 2.7067344188690186, + "learning_rate": 1.716589460771489e-05, + "loss": 0.8893, + "step": 2395 + }, + { + "epoch": 0.18130225871136166, + "grad_norm": 3.1878957748413086, + "learning_rate": 1.716494743608361e-05, + "loss": 0.8778, + "step": 2396 + }, + { + "epoch": 0.18137792743369527, + "grad_norm": 2.745701313018799, + "learning_rate": 1.7163999753136913e-05, + "loss": 0.9082, + "step": 2397 + }, + { + "epoch": 0.1814535961560289, + "grad_norm": 2.593395471572876, + "learning_rate": 1.7163051558934146e-05, + "loss": 0.8964, + "step": 2398 + }, + { + "epoch": 0.18152926487836252, + "grad_norm": 2.715348243713379, + "learning_rate": 1.716210285353469e-05, + "loss": 0.6536, + "step": 2399 + }, + { + "epoch": 0.18160493360069616, + "grad_norm": 2.5695583820343018, + "learning_rate": 1.716115363699795e-05, + "loss": 0.7906, + "step": 2400 + }, + { + "epoch": 0.18168060232302977, + "grad_norm": 5.8394455909729, + "learning_rate": 1.7160203909383375e-05, + "loss": 0.7837, + "step": 2401 + }, + { + "epoch": 0.1817562710453634, + "grad_norm": 2.660222291946411, + "learning_rate": 1.715925367075044e-05, + "loss": 0.6843, + "step": 2402 + }, + { + "epoch": 
0.18183193976769702, + "grad_norm": 3.027378559112549, + "learning_rate": 1.7158302921158647e-05, + "loss": 0.8906, + "step": 2403 + }, + { + "epoch": 0.18190760849003065, + "grad_norm": 2.216245174407959, + "learning_rate": 1.715735166066754e-05, + "loss": 0.9702, + "step": 2404 + }, + { + "epoch": 0.18198327721236426, + "grad_norm": 2.4401350021362305, + "learning_rate": 1.7156399889336684e-05, + "loss": 0.7855, + "step": 2405 + }, + { + "epoch": 0.1820589459346979, + "grad_norm": 2.4370734691619873, + "learning_rate": 1.715544760722569e-05, + "loss": 0.784, + "step": 2406 + }, + { + "epoch": 0.1821346146570315, + "grad_norm": 5.49038553237915, + "learning_rate": 1.7154494814394186e-05, + "loss": 0.7539, + "step": 2407 + }, + { + "epoch": 0.18221028337936515, + "grad_norm": 2.7418787479400635, + "learning_rate": 1.7153541510901844e-05, + "loss": 0.8306, + "step": 2408 + }, + { + "epoch": 0.18228595210169876, + "grad_norm": 2.403918981552124, + "learning_rate": 1.7152587696808358e-05, + "loss": 0.6963, + "step": 2409 + }, + { + "epoch": 0.1823616208240324, + "grad_norm": 3.1243717670440674, + "learning_rate": 1.7151633372173467e-05, + "loss": 0.8133, + "step": 2410 + }, + { + "epoch": 0.182437289546366, + "grad_norm": 3.145928382873535, + "learning_rate": 1.7150678537056925e-05, + "loss": 0.7925, + "step": 2411 + }, + { + "epoch": 0.18251295826869962, + "grad_norm": 3.3592019081115723, + "learning_rate": 1.7149723191518533e-05, + "loss": 0.8252, + "step": 2412 + }, + { + "epoch": 0.18258862699103326, + "grad_norm": 2.1932833194732666, + "learning_rate": 1.714876733561811e-05, + "loss": 0.8144, + "step": 2413 + }, + { + "epoch": 0.18266429571336687, + "grad_norm": 2.8968517780303955, + "learning_rate": 1.7147810969415526e-05, + "loss": 0.7965, + "step": 2414 + }, + { + "epoch": 0.1827399644357005, + "grad_norm": 2.438443660736084, + "learning_rate": 1.7146854092970663e-05, + "loss": 0.7568, + "step": 2415 + }, + { + "epoch": 0.18281563315803412, + "grad_norm": 2.609872341156006, + "learning_rate": 1.7145896706343445e-05, + "loss": 0.6807, + "step": 2416 + }, + { + "epoch": 0.18289130188036776, + "grad_norm": 2.623194932937622, + "learning_rate": 1.714493880959383e-05, + "loss": 0.7449, + "step": 2417 + }, + { + "epoch": 0.18296697060270137, + "grad_norm": 2.9819412231445312, + "learning_rate": 1.7143980402781804e-05, + "loss": 0.9333, + "step": 2418 + }, + { + "epoch": 0.183042639325035, + "grad_norm": 2.4386146068573, + "learning_rate": 1.7143021485967382e-05, + "loss": 0.7959, + "step": 2419 + }, + { + "epoch": 0.18311830804736862, + "grad_norm": 2.293463706970215, + "learning_rate": 1.7142062059210618e-05, + "loss": 0.8353, + "step": 2420 + }, + { + "epoch": 0.18319397676970225, + "grad_norm": 3.0216493606567383, + "learning_rate": 1.7141102122571593e-05, + "loss": 0.8713, + "step": 2421 + }, + { + "epoch": 0.18326964549203587, + "grad_norm": 2.45841908454895, + "learning_rate": 1.7140141676110424e-05, + "loss": 0.8078, + "step": 2422 + }, + { + "epoch": 0.1833453142143695, + "grad_norm": 2.933749198913574, + "learning_rate": 1.713918071988725e-05, + "loss": 0.8919, + "step": 2423 + }, + { + "epoch": 0.18342098293670311, + "grad_norm": 2.992274761199951, + "learning_rate": 1.713821925396226e-05, + "loss": 0.7021, + "step": 2424 + }, + { + "epoch": 0.18349665165903672, + "grad_norm": 2.653069257736206, + "learning_rate": 1.7137257278395655e-05, + "loss": 0.8012, + "step": 2425 + }, + { + "epoch": 0.18357232038137036, + "grad_norm": 2.428311586380005, + "learning_rate": 
1.7136294793247677e-05, + "loss": 0.8426, + "step": 2426 + }, + { + "epoch": 0.18364798910370397, + "grad_norm": 3.16813063621521, + "learning_rate": 1.7135331798578607e-05, + "loss": 0.8006, + "step": 2427 + }, + { + "epoch": 0.1837236578260376, + "grad_norm": 3.0173721313476562, + "learning_rate": 1.7134368294448746e-05, + "loss": 0.7249, + "step": 2428 + }, + { + "epoch": 0.18379932654837122, + "grad_norm": 3.074843406677246, + "learning_rate": 1.7133404280918435e-05, + "loss": 0.8502, + "step": 2429 + }, + { + "epoch": 0.18387499527070486, + "grad_norm": 2.8122646808624268, + "learning_rate": 1.713243975804804e-05, + "loss": 0.8029, + "step": 2430 + }, + { + "epoch": 0.18395066399303847, + "grad_norm": 2.632542610168457, + "learning_rate": 1.7131474725897958e-05, + "loss": 0.9138, + "step": 2431 + }, + { + "epoch": 0.1840263327153721, + "grad_norm": 2.9249496459960938, + "learning_rate": 1.7130509184528634e-05, + "loss": 0.8061, + "step": 2432 + }, + { + "epoch": 0.18410200143770572, + "grad_norm": 2.6574416160583496, + "learning_rate": 1.7129543134000528e-05, + "loss": 0.846, + "step": 2433 + }, + { + "epoch": 0.18417767016003936, + "grad_norm": 2.1470108032226562, + "learning_rate": 1.7128576574374138e-05, + "loss": 0.7422, + "step": 2434 + }, + { + "epoch": 0.18425333888237297, + "grad_norm": 3.626044750213623, + "learning_rate": 1.712760950570999e-05, + "loss": 0.8358, + "step": 2435 + }, + { + "epoch": 0.1843290076047066, + "grad_norm": 2.8826382160186768, + "learning_rate": 1.7126641928068642e-05, + "loss": 0.6992, + "step": 2436 + }, + { + "epoch": 0.18440467632704022, + "grad_norm": 2.540454149246216, + "learning_rate": 1.7125673841510696e-05, + "loss": 0.7279, + "step": 2437 + }, + { + "epoch": 0.18448034504937383, + "grad_norm": 4.273547172546387, + "learning_rate": 1.7124705246096776e-05, + "loss": 0.9469, + "step": 2438 + }, + { + "epoch": 0.18455601377170747, + "grad_norm": 2.69283390045166, + "learning_rate": 1.712373614188753e-05, + "loss": 0.7379, + "step": 2439 + }, + { + "epoch": 0.18463168249404108, + "grad_norm": 2.7413628101348877, + "learning_rate": 1.712276652894365e-05, + "loss": 0.8033, + "step": 2440 + }, + { + "epoch": 0.18470735121637472, + "grad_norm": 2.5811452865600586, + "learning_rate": 1.7121796407325864e-05, + "loss": 0.7025, + "step": 2441 + }, + { + "epoch": 0.18478301993870833, + "grad_norm": 2.8517563343048096, + "learning_rate": 1.7120825777094916e-05, + "loss": 0.8967, + "step": 2442 + }, + { + "epoch": 0.18485868866104196, + "grad_norm": 2.390089988708496, + "learning_rate": 1.7119854638311587e-05, + "loss": 0.7239, + "step": 2443 + }, + { + "epoch": 0.18493435738337557, + "grad_norm": 3.3105874061584473, + "learning_rate": 1.71188829910367e-05, + "loss": 0.8914, + "step": 2444 + }, + { + "epoch": 0.1850100261057092, + "grad_norm": 3.032083511352539, + "learning_rate": 1.7117910835331104e-05, + "loss": 0.85, + "step": 2445 + }, + { + "epoch": 0.18508569482804282, + "grad_norm": 2.4324803352355957, + "learning_rate": 1.7116938171255672e-05, + "loss": 0.5128, + "step": 2446 + }, + { + "epoch": 0.18516136355037646, + "grad_norm": 2.444301128387451, + "learning_rate": 1.711596499887132e-05, + "loss": 0.8662, + "step": 2447 + }, + { + "epoch": 0.18523703227271007, + "grad_norm": 2.0396249294281006, + "learning_rate": 1.711499131823899e-05, + "loss": 0.6182, + "step": 2448 + }, + { + "epoch": 0.1853127009950437, + "grad_norm": 2.8817059993743896, + "learning_rate": 1.7114017129419654e-05, + "loss": 0.8314, + "step": 2449 + }, + { + "epoch": 
0.18538836971737732, + "grad_norm": 2.181652545928955, + "learning_rate": 1.7113042432474323e-05, + "loss": 0.7013, + "step": 2450 + }, + { + "epoch": 0.18546403843971093, + "grad_norm": 2.367856740951538, + "learning_rate": 1.7112067227464035e-05, + "loss": 0.818, + "step": 2451 + }, + { + "epoch": 0.18553970716204457, + "grad_norm": 2.3162214756011963, + "learning_rate": 1.7111091514449857e-05, + "loss": 0.8122, + "step": 2452 + }, + { + "epoch": 0.18561537588437818, + "grad_norm": 2.549567937850952, + "learning_rate": 1.7110115293492893e-05, + "loss": 0.7482, + "step": 2453 + }, + { + "epoch": 0.18569104460671182, + "grad_norm": 1.9302699565887451, + "learning_rate": 1.7109138564654283e-05, + "loss": 0.8955, + "step": 2454 + }, + { + "epoch": 0.18576671332904543, + "grad_norm": 2.4720423221588135, + "learning_rate": 1.7108161327995182e-05, + "loss": 0.7715, + "step": 2455 + }, + { + "epoch": 0.18584238205137907, + "grad_norm": 2.2266173362731934, + "learning_rate": 1.7107183583576798e-05, + "loss": 0.7588, + "step": 2456 + }, + { + "epoch": 0.18591805077371268, + "grad_norm": 2.241393804550171, + "learning_rate": 1.7106205331460356e-05, + "loss": 0.6046, + "step": 2457 + }, + { + "epoch": 0.18599371949604632, + "grad_norm": 2.3776702880859375, + "learning_rate": 1.7105226571707115e-05, + "loss": 0.7866, + "step": 2458 + }, + { + "epoch": 0.18606938821837993, + "grad_norm": 3.0117900371551514, + "learning_rate": 1.7104247304378372e-05, + "loss": 0.68, + "step": 2459 + }, + { + "epoch": 0.18614505694071357, + "grad_norm": 2.2148845195770264, + "learning_rate": 1.7103267529535453e-05, + "loss": 0.8208, + "step": 2460 + }, + { + "epoch": 0.18622072566304718, + "grad_norm": 2.5542850494384766, + "learning_rate": 1.710228724723971e-05, + "loss": 0.7452, + "step": 2461 + }, + { + "epoch": 0.18629639438538081, + "grad_norm": 2.4348342418670654, + "learning_rate": 1.7101306457552532e-05, + "loss": 0.8503, + "step": 2462 + }, + { + "epoch": 0.18637206310771443, + "grad_norm": 2.9579315185546875, + "learning_rate": 1.7100325160535344e-05, + "loss": 0.9084, + "step": 2463 + }, + { + "epoch": 0.18644773183004806, + "grad_norm": 2.543611764907837, + "learning_rate": 1.7099343356249594e-05, + "loss": 0.7714, + "step": 2464 + }, + { + "epoch": 0.18652340055238167, + "grad_norm": 2.5883982181549072, + "learning_rate": 1.7098361044756762e-05, + "loss": 0.6883, + "step": 2465 + }, + { + "epoch": 0.18659906927471528, + "grad_norm": 2.596781015396118, + "learning_rate": 1.7097378226118372e-05, + "loss": 0.7947, + "step": 2466 + }, + { + "epoch": 0.18667473799704892, + "grad_norm": 1.9685479402542114, + "learning_rate": 1.709639490039597e-05, + "loss": 0.8463, + "step": 2467 + }, + { + "epoch": 0.18675040671938253, + "grad_norm": 2.6280531883239746, + "learning_rate": 1.7095411067651128e-05, + "loss": 0.7112, + "step": 2468 + }, + { + "epoch": 0.18682607544171617, + "grad_norm": 4.373874187469482, + "learning_rate": 1.7094426727945463e-05, + "loss": 0.7965, + "step": 2469 + }, + { + "epoch": 0.18690174416404978, + "grad_norm": 2.3263533115386963, + "learning_rate": 1.7093441881340614e-05, + "loss": 0.7271, + "step": 2470 + }, + { + "epoch": 0.18697741288638342, + "grad_norm": 2.311795711517334, + "learning_rate": 1.709245652789826e-05, + "loss": 0.7724, + "step": 2471 + }, + { + "epoch": 0.18705308160871703, + "grad_norm": 2.1716461181640625, + "learning_rate": 1.7091470667680102e-05, + "loss": 0.7175, + "step": 2472 + }, + { + "epoch": 0.18712875033105067, + "grad_norm": 2.450676202774048, + 
"learning_rate": 1.7090484300747882e-05, + "loss": 0.7955, + "step": 2473 + }, + { + "epoch": 0.18720441905338428, + "grad_norm": 2.3514206409454346, + "learning_rate": 1.7089497427163362e-05, + "loss": 0.6674, + "step": 2474 + }, + { + "epoch": 0.18728008777571792, + "grad_norm": 2.2294673919677734, + "learning_rate": 1.7088510046988355e-05, + "loss": 0.6138, + "step": 2475 + }, + { + "epoch": 0.18735575649805153, + "grad_norm": 2.8601155281066895, + "learning_rate": 1.7087522160284684e-05, + "loss": 0.7145, + "step": 2476 + }, + { + "epoch": 0.18743142522038517, + "grad_norm": 2.8566646575927734, + "learning_rate": 1.7086533767114216e-05, + "loss": 0.8687, + "step": 2477 + }, + { + "epoch": 0.18750709394271878, + "grad_norm": 2.680727005004883, + "learning_rate": 1.7085544867538854e-05, + "loss": 0.6916, + "step": 2478 + }, + { + "epoch": 0.1875827626650524, + "grad_norm": 2.9633843898773193, + "learning_rate": 1.7084555461620514e-05, + "loss": 0.8821, + "step": 2479 + }, + { + "epoch": 0.18765843138738603, + "grad_norm": 2.1105926036834717, + "learning_rate": 1.7083565549421166e-05, + "loss": 0.713, + "step": 2480 + }, + { + "epoch": 0.18773410010971964, + "grad_norm": 2.6153006553649902, + "learning_rate": 1.7082575131002796e-05, + "loss": 0.6687, + "step": 2481 + }, + { + "epoch": 0.18780976883205328, + "grad_norm": 2.08457088470459, + "learning_rate": 1.708158420642743e-05, + "loss": 0.7836, + "step": 2482 + }, + { + "epoch": 0.18788543755438689, + "grad_norm": 2.58174467086792, + "learning_rate": 1.7080592775757122e-05, + "loss": 0.8457, + "step": 2483 + }, + { + "epoch": 0.18796110627672052, + "grad_norm": 2.193037271499634, + "learning_rate": 1.707960083905396e-05, + "loss": 0.7799, + "step": 2484 + }, + { + "epoch": 0.18803677499905413, + "grad_norm": 2.2527616024017334, + "learning_rate": 1.707860839638006e-05, + "loss": 0.6341, + "step": 2485 + }, + { + "epoch": 0.18811244372138777, + "grad_norm": 2.2028379440307617, + "learning_rate": 1.707761544779757e-05, + "loss": 0.6469, + "step": 2486 + }, + { + "epoch": 0.18818811244372138, + "grad_norm": 2.2932393550872803, + "learning_rate": 1.7076621993368676e-05, + "loss": 0.6554, + "step": 2487 + }, + { + "epoch": 0.18826378116605502, + "grad_norm": 2.409485101699829, + "learning_rate": 1.7075628033155593e-05, + "loss": 0.905, + "step": 2488 + }, + { + "epoch": 0.18833944988838863, + "grad_norm": 3.5237672328948975, + "learning_rate": 1.707463356722056e-05, + "loss": 0.7915, + "step": 2489 + }, + { + "epoch": 0.18841511861072227, + "grad_norm": 2.7192323207855225, + "learning_rate": 1.7073638595625856e-05, + "loss": 0.8663, + "step": 2490 + }, + { + "epoch": 0.18849078733305588, + "grad_norm": 2.6464853286743164, + "learning_rate": 1.707264311843379e-05, + "loss": 0.9029, + "step": 2491 + }, + { + "epoch": 0.1885664560553895, + "grad_norm": 2.4088399410247803, + "learning_rate": 1.7071647135706702e-05, + "loss": 0.7941, + "step": 2492 + }, + { + "epoch": 0.18864212477772313, + "grad_norm": 2.637516975402832, + "learning_rate": 1.7070650647506966e-05, + "loss": 0.679, + "step": 2493 + }, + { + "epoch": 0.18871779350005674, + "grad_norm": 2.1144580841064453, + "learning_rate": 1.7069653653896982e-05, + "loss": 0.7291, + "step": 2494 + }, + { + "epoch": 0.18879346222239038, + "grad_norm": 2.473254680633545, + "learning_rate": 1.7068656154939183e-05, + "loss": 0.7055, + "step": 2495 + }, + { + "epoch": 0.188869130944724, + "grad_norm": 2.5873825550079346, + "learning_rate": 1.7067658150696043e-05, + "loss": 0.8274, + "step": 2496 + }, + 
{ + "epoch": 0.18894479966705763, + "grad_norm": 2.4804913997650146, + "learning_rate": 1.706665964123005e-05, + "loss": 0.7887, + "step": 2497 + }, + { + "epoch": 0.18902046838939124, + "grad_norm": 3.1803600788116455, + "learning_rate": 1.7065660626603745e-05, + "loss": 0.9983, + "step": 2498 + }, + { + "epoch": 0.18909613711172488, + "grad_norm": 3.4321467876434326, + "learning_rate": 1.706466110687968e-05, + "loss": 0.8888, + "step": 2499 + }, + { + "epoch": 0.1891718058340585, + "grad_norm": 2.6586387157440186, + "learning_rate": 1.706366108212045e-05, + "loss": 0.9563, + "step": 2500 + }, + { + "epoch": 0.18924747455639213, + "grad_norm": 2.2730400562286377, + "learning_rate": 1.7062660552388687e-05, + "loss": 0.722, + "step": 2501 + }, + { + "epoch": 0.18932314327872574, + "grad_norm": 2.669356107711792, + "learning_rate": 1.706165951774704e-05, + "loss": 0.7355, + "step": 2502 + }, + { + "epoch": 0.18939881200105937, + "grad_norm": 3.200263738632202, + "learning_rate": 1.70606579782582e-05, + "loss": 0.6391, + "step": 2503 + }, + { + "epoch": 0.18947448072339299, + "grad_norm": 2.0862410068511963, + "learning_rate": 1.7059655933984886e-05, + "loss": 0.7393, + "step": 2504 + }, + { + "epoch": 0.1895501494457266, + "grad_norm": 2.3829562664031982, + "learning_rate": 1.7058653384989852e-05, + "loss": 0.7792, + "step": 2505 + }, + { + "epoch": 0.18962581816806023, + "grad_norm": 2.7192342281341553, + "learning_rate": 1.7057650331335875e-05, + "loss": 0.8724, + "step": 2506 + }, + { + "epoch": 0.18970148689039384, + "grad_norm": 2.462100028991699, + "learning_rate": 1.7056646773085773e-05, + "loss": 0.8377, + "step": 2507 + }, + { + "epoch": 0.18977715561272748, + "grad_norm": 2.152848243713379, + "learning_rate": 1.705564271030239e-05, + "loss": 0.8334, + "step": 2508 + }, + { + "epoch": 0.1898528243350611, + "grad_norm": 2.1347358226776123, + "learning_rate": 1.705463814304861e-05, + "loss": 0.7019, + "step": 2509 + }, + { + "epoch": 0.18992849305739473, + "grad_norm": 2.732978343963623, + "learning_rate": 1.7053633071387336e-05, + "loss": 0.8409, + "step": 2510 + }, + { + "epoch": 0.19000416177972834, + "grad_norm": 2.211718797683716, + "learning_rate": 1.7052627495381507e-05, + "loss": 0.7797, + "step": 2511 + }, + { + "epoch": 0.19007983050206198, + "grad_norm": 2.507467031478882, + "learning_rate": 1.7051621415094105e-05, + "loss": 0.7145, + "step": 2512 + }, + { + "epoch": 0.1901554992243956, + "grad_norm": 2.3044538497924805, + "learning_rate": 1.7050614830588122e-05, + "loss": 0.8306, + "step": 2513 + }, + { + "epoch": 0.19023116794672923, + "grad_norm": 2.1056079864501953, + "learning_rate": 1.7049607741926603e-05, + "loss": 0.9237, + "step": 2514 + }, + { + "epoch": 0.19030683666906284, + "grad_norm": 2.6704158782958984, + "learning_rate": 1.704860014917261e-05, + "loss": 0.8201, + "step": 2515 + }, + { + "epoch": 0.19038250539139648, + "grad_norm": 3.0640957355499268, + "learning_rate": 1.7047592052389243e-05, + "loss": 0.8734, + "step": 2516 + }, + { + "epoch": 0.1904581741137301, + "grad_norm": 2.7852325439453125, + "learning_rate": 1.7046583451639635e-05, + "loss": 0.8072, + "step": 2517 + }, + { + "epoch": 0.1905338428360637, + "grad_norm": 3.1452224254608154, + "learning_rate": 1.7045574346986942e-05, + "loss": 0.6817, + "step": 2518 + }, + { + "epoch": 0.19060951155839734, + "grad_norm": 2.4162135124206543, + "learning_rate": 1.7044564738494367e-05, + "loss": 0.8041, + "step": 2519 + }, + { + "epoch": 0.19068518028073095, + "grad_norm": 2.7600903511047363, + 
"learning_rate": 1.704355462622512e-05, + "loss": 0.8216, + "step": 2520 + }, + { + "epoch": 0.1907608490030646, + "grad_norm": 2.507215976715088, + "learning_rate": 1.7042544010242473e-05, + "loss": 0.8253, + "step": 2521 + }, + { + "epoch": 0.1908365177253982, + "grad_norm": 2.914543628692627, + "learning_rate": 1.7041532890609703e-05, + "loss": 0.9177, + "step": 2522 + }, + { + "epoch": 0.19091218644773184, + "grad_norm": 3.545713186264038, + "learning_rate": 1.704052126739014e-05, + "loss": 0.6333, + "step": 2523 + }, + { + "epoch": 0.19098785517006545, + "grad_norm": 2.720325469970703, + "learning_rate": 1.7039509140647124e-05, + "loss": 0.7723, + "step": 2524 + }, + { + "epoch": 0.19106352389239908, + "grad_norm": 2.6830990314483643, + "learning_rate": 1.703849651044404e-05, + "loss": 0.8263, + "step": 2525 + }, + { + "epoch": 0.1911391926147327, + "grad_norm": 2.6255667209625244, + "learning_rate": 1.703748337684431e-05, + "loss": 0.7983, + "step": 2526 + }, + { + "epoch": 0.19121486133706633, + "grad_norm": 3.0577828884124756, + "learning_rate": 1.7036469739911374e-05, + "loss": 0.5705, + "step": 2527 + }, + { + "epoch": 0.19129053005939994, + "grad_norm": 2.4407639503479004, + "learning_rate": 1.703545559970871e-05, + "loss": 0.7921, + "step": 2528 + }, + { + "epoch": 0.19136619878173358, + "grad_norm": 3.169311046600342, + "learning_rate": 1.7034440956299825e-05, + "loss": 0.6525, + "step": 2529 + }, + { + "epoch": 0.1914418675040672, + "grad_norm": 2.345228433609009, + "learning_rate": 1.703342580974826e-05, + "loss": 0.6853, + "step": 2530 + }, + { + "epoch": 0.19151753622640083, + "grad_norm": 2.094965696334839, + "learning_rate": 1.703241016011759e-05, + "loss": 0.6976, + "step": 2531 + }, + { + "epoch": 0.19159320494873444, + "grad_norm": 2.3443291187286377, + "learning_rate": 1.7031394007471415e-05, + "loss": 0.6738, + "step": 2532 + }, + { + "epoch": 0.19166887367106805, + "grad_norm": 2.716376304626465, + "learning_rate": 1.703037735187337e-05, + "loss": 0.8052, + "step": 2533 + }, + { + "epoch": 0.1917445423934017, + "grad_norm": 2.7396631240844727, + "learning_rate": 1.7029360193387123e-05, + "loss": 0.6116, + "step": 2534 + }, + { + "epoch": 0.1918202111157353, + "grad_norm": 2.562323570251465, + "learning_rate": 1.702834253207637e-05, + "loss": 0.7081, + "step": 2535 + }, + { + "epoch": 0.19189587983806894, + "grad_norm": 2.4629499912261963, + "learning_rate": 1.702732436800484e-05, + "loss": 0.8838, + "step": 2536 + }, + { + "epoch": 0.19197154856040255, + "grad_norm": 2.0351247787475586, + "learning_rate": 1.7026305701236294e-05, + "loss": 0.9576, + "step": 2537 + }, + { + "epoch": 0.1920472172827362, + "grad_norm": 3.575366973876953, + "learning_rate": 1.7025286531834525e-05, + "loss": 0.6759, + "step": 2538 + }, + { + "epoch": 0.1921228860050698, + "grad_norm": 2.341073513031006, + "learning_rate": 1.7024266859863358e-05, + "loss": 0.7948, + "step": 2539 + }, + { + "epoch": 0.19219855472740344, + "grad_norm": 2.288145065307617, + "learning_rate": 1.7023246685386646e-05, + "loss": 0.6851, + "step": 2540 + }, + { + "epoch": 0.19227422344973705, + "grad_norm": 2.1108577251434326, + "learning_rate": 1.7022226008468275e-05, + "loss": 0.783, + "step": 2541 + }, + { + "epoch": 0.19234989217207069, + "grad_norm": 2.4641237258911133, + "learning_rate": 1.7021204829172166e-05, + "loss": 0.833, + "step": 2542 + }, + { + "epoch": 0.1924255608944043, + "grad_norm": 3.356717109680176, + "learning_rate": 1.7020183147562267e-05, + "loss": 0.7958, + "step": 2543 + }, + { + 
"epoch": 0.19250122961673793, + "grad_norm": 2.6340181827545166, + "learning_rate": 1.7019160963702556e-05, + "loss": 0.7378, + "step": 2544 + }, + { + "epoch": 0.19257689833907154, + "grad_norm": 2.6588118076324463, + "learning_rate": 1.701813827765705e-05, + "loss": 0.9538, + "step": 2545 + }, + { + "epoch": 0.19265256706140516, + "grad_norm": 2.6437458992004395, + "learning_rate": 1.7017115089489794e-05, + "loss": 0.739, + "step": 2546 + }, + { + "epoch": 0.1927282357837388, + "grad_norm": 2.546844005584717, + "learning_rate": 1.7016091399264856e-05, + "loss": 0.8209, + "step": 2547 + }, + { + "epoch": 0.1928039045060724, + "grad_norm": 2.78609299659729, + "learning_rate": 1.701506720704635e-05, + "loss": 0.9092, + "step": 2548 + }, + { + "epoch": 0.19287957322840604, + "grad_norm": 2.1498682498931885, + "learning_rate": 1.7014042512898414e-05, + "loss": 0.8103, + "step": 2549 + }, + { + "epoch": 0.19295524195073965, + "grad_norm": 2.6014087200164795, + "learning_rate": 1.701301731688521e-05, + "loss": 1.07, + "step": 2550 + }, + { + "epoch": 0.1930309106730733, + "grad_norm": 2.4372475147247314, + "learning_rate": 1.7011991619070948e-05, + "loss": 0.785, + "step": 2551 + }, + { + "epoch": 0.1931065793954069, + "grad_norm": 2.3356244564056396, + "learning_rate": 1.7010965419519858e-05, + "loss": 0.689, + "step": 2552 + }, + { + "epoch": 0.19318224811774054, + "grad_norm": 2.393585443496704, + "learning_rate": 1.70099387182962e-05, + "loss": 0.7593, + "step": 2553 + }, + { + "epoch": 0.19325791684007415, + "grad_norm": 2.5867748260498047, + "learning_rate": 1.700891151546427e-05, + "loss": 0.7933, + "step": 2554 + }, + { + "epoch": 0.1933335855624078, + "grad_norm": 2.6387293338775635, + "learning_rate": 1.7007883811088403e-05, + "loss": 0.7924, + "step": 2555 + }, + { + "epoch": 0.1934092542847414, + "grad_norm": 1.7899304628372192, + "learning_rate": 1.7006855605232947e-05, + "loss": 0.8936, + "step": 2556 + }, + { + "epoch": 0.19348492300707504, + "grad_norm": 3.2812225818634033, + "learning_rate": 1.7005826897962294e-05, + "loss": 0.7595, + "step": 2557 + }, + { + "epoch": 0.19356059172940865, + "grad_norm": 3.634249448776245, + "learning_rate": 1.7004797689340873e-05, + "loss": 0.8057, + "step": 2558 + }, + { + "epoch": 0.19363626045174226, + "grad_norm": 2.197214365005493, + "learning_rate": 1.7003767979433126e-05, + "loss": 0.7255, + "step": 2559 + }, + { + "epoch": 0.1937119291740759, + "grad_norm": 2.3928468227386475, + "learning_rate": 1.7002737768303542e-05, + "loss": 1.0792, + "step": 2560 + }, + { + "epoch": 0.1937875978964095, + "grad_norm": 2.169796943664551, + "learning_rate": 1.7001707056016633e-05, + "loss": 0.8442, + "step": 2561 + }, + { + "epoch": 0.19386326661874315, + "grad_norm": 2.0690131187438965, + "learning_rate": 1.7000675842636948e-05, + "loss": 0.7823, + "step": 2562 + }, + { + "epoch": 0.19393893534107676, + "grad_norm": 2.256768226623535, + "learning_rate": 1.6999644128229065e-05, + "loss": 0.7462, + "step": 2563 + }, + { + "epoch": 0.1940146040634104, + "grad_norm": 2.095914602279663, + "learning_rate": 1.6998611912857592e-05, + "loss": 0.8174, + "step": 2564 + }, + { + "epoch": 0.194090272785744, + "grad_norm": 3.0056588649749756, + "learning_rate": 1.6997579196587173e-05, + "loss": 0.8705, + "step": 2565 + }, + { + "epoch": 0.19416594150807764, + "grad_norm": 2.5607128143310547, + "learning_rate": 1.6996545979482475e-05, + "loss": 0.7777, + "step": 2566 + }, + { + "epoch": 0.19424161023041125, + "grad_norm": 2.9612107276916504, + "learning_rate": 
1.6995512261608202e-05, + "loss": 0.6993, + "step": 2567 + }, + { + "epoch": 0.1943172789527449, + "grad_norm": 2.610933780670166, + "learning_rate": 1.6994478043029095e-05, + "loss": 0.7044, + "step": 2568 + }, + { + "epoch": 0.1943929476750785, + "grad_norm": 2.2328102588653564, + "learning_rate": 1.699344332380991e-05, + "loss": 0.6661, + "step": 2569 + }, + { + "epoch": 0.19446861639741214, + "grad_norm": 2.128195285797119, + "learning_rate": 1.6992408104015458e-05, + "loss": 0.735, + "step": 2570 + }, + { + "epoch": 0.19454428511974575, + "grad_norm": 3.4304070472717285, + "learning_rate": 1.6991372383710555e-05, + "loss": 0.9446, + "step": 2571 + }, + { + "epoch": 0.19461995384207936, + "grad_norm": 2.2780416011810303, + "learning_rate": 1.6990336162960066e-05, + "loss": 0.8719, + "step": 2572 + }, + { + "epoch": 0.194695622564413, + "grad_norm": 2.545768976211548, + "learning_rate": 1.6989299441828883e-05, + "loss": 0.7445, + "step": 2573 + }, + { + "epoch": 0.1947712912867466, + "grad_norm": 2.4428822994232178, + "learning_rate": 1.698826222038193e-05, + "loss": 0.6788, + "step": 2574 + }, + { + "epoch": 0.19484696000908025, + "grad_norm": 2.0839898586273193, + "learning_rate": 1.6987224498684157e-05, + "loss": 0.7712, + "step": 2575 + }, + { + "epoch": 0.19492262873141386, + "grad_norm": 2.200305938720703, + "learning_rate": 1.6986186276800554e-05, + "loss": 0.6872, + "step": 2576 + }, + { + "epoch": 0.1949982974537475, + "grad_norm": 2.497018575668335, + "learning_rate": 1.6985147554796134e-05, + "loss": 0.7094, + "step": 2577 + }, + { + "epoch": 0.1950739661760811, + "grad_norm": 2.3949403762817383, + "learning_rate": 1.698410833273595e-05, + "loss": 0.778, + "step": 2578 + }, + { + "epoch": 0.19514963489841475, + "grad_norm": 3.1228713989257812, + "learning_rate": 1.698306861068508e-05, + "loss": 0.8602, + "step": 2579 + }, + { + "epoch": 0.19522530362074836, + "grad_norm": 2.2190120220184326, + "learning_rate": 1.6982028388708625e-05, + "loss": 0.8146, + "step": 2580 + }, + { + "epoch": 0.195300972343082, + "grad_norm": 1.9382598400115967, + "learning_rate": 1.698098766687174e-05, + "loss": 0.636, + "step": 2581 + }, + { + "epoch": 0.1953766410654156, + "grad_norm": 2.97119402885437, + "learning_rate": 1.6979946445239595e-05, + "loss": 0.732, + "step": 2582 + }, + { + "epoch": 0.19545230978774925, + "grad_norm": 2.2818760871887207, + "learning_rate": 1.6978904723877394e-05, + "loss": 0.7439, + "step": 2583 + }, + { + "epoch": 0.19552797851008286, + "grad_norm": 3.0975162982940674, + "learning_rate": 1.697786250285037e-05, + "loss": 0.8555, + "step": 2584 + }, + { + "epoch": 0.1956036472324165, + "grad_norm": 2.345454454421997, + "learning_rate": 1.6976819782223792e-05, + "loss": 0.7178, + "step": 2585 + }, + { + "epoch": 0.1956793159547501, + "grad_norm": 2.2025437355041504, + "learning_rate": 1.697577656206296e-05, + "loss": 0.8992, + "step": 2586 + }, + { + "epoch": 0.19575498467708372, + "grad_norm": 2.2825613021850586, + "learning_rate": 1.6974732842433202e-05, + "loss": 0.667, + "step": 2587 + }, + { + "epoch": 0.19583065339941735, + "grad_norm": 2.3459088802337646, + "learning_rate": 1.697368862339988e-05, + "loss": 0.5786, + "step": 2588 + }, + { + "epoch": 0.19590632212175096, + "grad_norm": 2.5378456115722656, + "learning_rate": 1.697264390502839e-05, + "loss": 0.7354, + "step": 2589 + }, + { + "epoch": 0.1959819908440846, + "grad_norm": 2.5877671241760254, + "learning_rate": 1.697159868738415e-05, + "loss": 0.7067, + "step": 2590 + }, + { + "epoch": 
0.1960576595664182, + "grad_norm": 2.3794543743133545, + "learning_rate": 1.6970552970532616e-05, + "loss": 0.8205, + "step": 2591 + }, + { + "epoch": 0.19613332828875185, + "grad_norm": 2.144336462020874, + "learning_rate": 1.6969506754539278e-05, + "loss": 0.882, + "step": 2592 + }, + { + "epoch": 0.19620899701108546, + "grad_norm": 2.0169589519500732, + "learning_rate": 1.6968460039469644e-05, + "loss": 0.8049, + "step": 2593 + }, + { + "epoch": 0.1962846657334191, + "grad_norm": 1.9170702695846558, + "learning_rate": 1.6967412825389272e-05, + "loss": 0.6913, + "step": 2594 + }, + { + "epoch": 0.1963603344557527, + "grad_norm": 2.5606637001037598, + "learning_rate": 1.6966365112363743e-05, + "loss": 0.7495, + "step": 2595 + }, + { + "epoch": 0.19643600317808635, + "grad_norm": 2.407437801361084, + "learning_rate": 1.696531690045866e-05, + "loss": 0.8785, + "step": 2596 + }, + { + "epoch": 0.19651167190041996, + "grad_norm": 2.435490369796753, + "learning_rate": 1.696426818973967e-05, + "loss": 0.7802, + "step": 2597 + }, + { + "epoch": 0.1965873406227536, + "grad_norm": 2.0081589221954346, + "learning_rate": 1.696321898027245e-05, + "loss": 0.6976, + "step": 2598 + }, + { + "epoch": 0.1966630093450872, + "grad_norm": 2.957495927810669, + "learning_rate": 1.6962169272122697e-05, + "loss": 0.7115, + "step": 2599 + }, + { + "epoch": 0.19673867806742082, + "grad_norm": 2.271768808364868, + "learning_rate": 1.6961119065356155e-05, + "loss": 0.8056, + "step": 2600 + }, + { + "epoch": 0.19681434678975446, + "grad_norm": 1.9800879955291748, + "learning_rate": 1.6960068360038584e-05, + "loss": 0.6092, + "step": 2601 + }, + { + "epoch": 0.19689001551208807, + "grad_norm": 2.3020243644714355, + "learning_rate": 1.695901715623579e-05, + "loss": 0.8472, + "step": 2602 + }, + { + "epoch": 0.1969656842344217, + "grad_norm": 2.3017523288726807, + "learning_rate": 1.6957965454013597e-05, + "loss": 0.8187, + "step": 2603 + }, + { + "epoch": 0.19704135295675532, + "grad_norm": 2.2522785663604736, + "learning_rate": 1.6956913253437868e-05, + "loss": 0.7279, + "step": 2604 + }, + { + "epoch": 0.19711702167908896, + "grad_norm": 2.6222681999206543, + "learning_rate": 1.6955860554574495e-05, + "loss": 0.8215, + "step": 2605 + }, + { + "epoch": 0.19719269040142257, + "grad_norm": 2.9026076793670654, + "learning_rate": 1.6954807357489407e-05, + "loss": 0.6979, + "step": 2606 + }, + { + "epoch": 0.1972683591237562, + "grad_norm": 3.418788194656372, + "learning_rate": 1.6953753662248547e-05, + "loss": 0.7305, + "step": 2607 + }, + { + "epoch": 0.19734402784608981, + "grad_norm": 2.4104363918304443, + "learning_rate": 1.695269946891791e-05, + "loss": 0.7632, + "step": 2608 + }, + { + "epoch": 0.19741969656842345, + "grad_norm": 2.85041880607605, + "learning_rate": 1.695164477756351e-05, + "loss": 0.7216, + "step": 2609 + }, + { + "epoch": 0.19749536529075706, + "grad_norm": 2.079584836959839, + "learning_rate": 1.695058958825139e-05, + "loss": 0.7418, + "step": 2610 + }, + { + "epoch": 0.1975710340130907, + "grad_norm": 2.3523175716400146, + "learning_rate": 1.6949533901047643e-05, + "loss": 0.7151, + "step": 2611 + }, + { + "epoch": 0.1976467027354243, + "grad_norm": 2.4954283237457275, + "learning_rate": 1.6948477716018366e-05, + "loss": 0.8416, + "step": 2612 + }, + { + "epoch": 0.19772237145775792, + "grad_norm": 2.920403480529785, + "learning_rate": 1.6947421033229706e-05, + "loss": 0.7814, + "step": 2613 + }, + { + "epoch": 0.19779804018009156, + "grad_norm": 2.6211936473846436, + "learning_rate": 
1.6946363852747838e-05, + "loss": 0.7247, + "step": 2614 + }, + { + "epoch": 0.19787370890242517, + "grad_norm": 3.040184259414673, + "learning_rate": 1.694530617463896e-05, + "loss": 0.8646, + "step": 2615 + }, + { + "epoch": 0.1979493776247588, + "grad_norm": 2.736969232559204, + "learning_rate": 1.6944247998969318e-05, + "loss": 0.6909, + "step": 2616 + }, + { + "epoch": 0.19802504634709242, + "grad_norm": 2.3474671840667725, + "learning_rate": 1.694318932580517e-05, + "loss": 0.7216, + "step": 2617 + }, + { + "epoch": 0.19810071506942606, + "grad_norm": 2.3557677268981934, + "learning_rate": 1.6942130155212808e-05, + "loss": 0.7854, + "step": 2618 + }, + { + "epoch": 0.19817638379175967, + "grad_norm": 3.080761432647705, + "learning_rate": 1.6941070487258573e-05, + "loss": 0.7452, + "step": 2619 + }, + { + "epoch": 0.1982520525140933, + "grad_norm": 2.3704681396484375, + "learning_rate": 1.694001032200882e-05, + "loss": 0.6745, + "step": 2620 + }, + { + "epoch": 0.19832772123642692, + "grad_norm": 2.427135467529297, + "learning_rate": 1.6938949659529935e-05, + "loss": 0.8975, + "step": 2621 + }, + { + "epoch": 0.19840338995876056, + "grad_norm": 2.0975005626678467, + "learning_rate": 1.693788849988835e-05, + "loss": 0.619, + "step": 2622 + }, + { + "epoch": 0.19847905868109417, + "grad_norm": 2.5328569412231445, + "learning_rate": 1.6936826843150512e-05, + "loss": 0.7884, + "step": 2623 + }, + { + "epoch": 0.1985547274034278, + "grad_norm": 5.670637130737305, + "learning_rate": 1.6935764689382904e-05, + "loss": 0.8431, + "step": 2624 + }, + { + "epoch": 0.19863039612576142, + "grad_norm": 2.570315361022949, + "learning_rate": 1.6934702038652046e-05, + "loss": 0.8699, + "step": 2625 + }, + { + "epoch": 0.19870606484809503, + "grad_norm": 2.185997247695923, + "learning_rate": 1.693363889102448e-05, + "loss": 0.8449, + "step": 2626 + }, + { + "epoch": 0.19878173357042866, + "grad_norm": 2.4839537143707275, + "learning_rate": 1.6932575246566788e-05, + "loss": 0.8201, + "step": 2627 + }, + { + "epoch": 0.19885740229276228, + "grad_norm": 2.1971192359924316, + "learning_rate": 1.6931511105345575e-05, + "loss": 0.7496, + "step": 2628 + }, + { + "epoch": 0.1989330710150959, + "grad_norm": 2.9695796966552734, + "learning_rate": 1.6930446467427484e-05, + "loss": 0.7093, + "step": 2629 + }, + { + "epoch": 0.19900873973742952, + "grad_norm": 2.3504600524902344, + "learning_rate": 1.6929381332879187e-05, + "loss": 0.7825, + "step": 2630 + }, + { + "epoch": 0.19908440845976316, + "grad_norm": 2.4642410278320312, + "learning_rate": 1.6928315701767382e-05, + "loss": 0.8154, + "step": 2631 + }, + { + "epoch": 0.19916007718209677, + "grad_norm": 2.417527675628662, + "learning_rate": 1.6927249574158803e-05, + "loss": 0.7947, + "step": 2632 + }, + { + "epoch": 0.1992357459044304, + "grad_norm": 2.4972589015960693, + "learning_rate": 1.692618295012022e-05, + "loss": 0.714, + "step": 2633 + }, + { + "epoch": 0.19931141462676402, + "grad_norm": 2.6074717044830322, + "learning_rate": 1.6925115829718424e-05, + "loss": 0.7362, + "step": 2634 + }, + { + "epoch": 0.19938708334909766, + "grad_norm": 2.2612643241882324, + "learning_rate": 1.692404821302024e-05, + "loss": 0.8082, + "step": 2635 + }, + { + "epoch": 0.19946275207143127, + "grad_norm": 3.058591842651367, + "learning_rate": 1.6922980100092524e-05, + "loss": 0.729, + "step": 2636 + }, + { + "epoch": 0.1995384207937649, + "grad_norm": 2.8211417198181152, + "learning_rate": 1.6921911491002175e-05, + "loss": 0.6836, + "step": 2637 + }, + { + "epoch": 
0.19961408951609852, + "grad_norm": 2.3648903369903564, + "learning_rate": 1.69208423858161e-05, + "loss": 0.7884, + "step": 2638 + }, + { + "epoch": 0.19968975823843216, + "grad_norm": 2.64884877204895, + "learning_rate": 1.691977278460126e-05, + "loss": 0.8841, + "step": 2639 + }, + { + "epoch": 0.19976542696076577, + "grad_norm": 2.9867165088653564, + "learning_rate": 1.6918702687424628e-05, + "loss": 0.7115, + "step": 2640 + }, + { + "epoch": 0.19984109568309938, + "grad_norm": 2.427945137023926, + "learning_rate": 1.6917632094353226e-05, + "loss": 1.0163, + "step": 2641 + }, + { + "epoch": 0.19991676440543302, + "grad_norm": 2.252856969833374, + "learning_rate": 1.6916561005454093e-05, + "loss": 0.8115, + "step": 2642 + }, + { + "epoch": 0.19999243312776663, + "grad_norm": 2.524324893951416, + "learning_rate": 1.6915489420794304e-05, + "loss": 0.6516, + "step": 2643 + }, + { + "epoch": 0.20006810185010027, + "grad_norm": 2.3426573276519775, + "learning_rate": 1.691441734044096e-05, + "loss": 0.7816, + "step": 2644 + }, + { + "epoch": 0.20014377057243388, + "grad_norm": 2.6278955936431885, + "learning_rate": 1.691334476446121e-05, + "loss": 0.6821, + "step": 2645 + }, + { + "epoch": 0.20021943929476752, + "grad_norm": 2.5441291332244873, + "learning_rate": 1.6912271692922216e-05, + "loss": 0.8005, + "step": 2646 + }, + { + "epoch": 0.20029510801710113, + "grad_norm": 3.0294198989868164, + "learning_rate": 1.691119812589118e-05, + "loss": 0.7918, + "step": 2647 + }, + { + "epoch": 0.20037077673943476, + "grad_norm": 2.6301164627075195, + "learning_rate": 1.6910124063435322e-05, + "loss": 0.7959, + "step": 2648 + }, + { + "epoch": 0.20044644546176837, + "grad_norm": 6.36815071105957, + "learning_rate": 1.6909049505621912e-05, + "loss": 0.8501, + "step": 2649 + }, + { + "epoch": 0.200522114184102, + "grad_norm": 11.723066329956055, + "learning_rate": 1.6907974452518245e-05, + "loss": 0.6644, + "step": 2650 + }, + { + "epoch": 0.20059778290643562, + "grad_norm": 30.52318572998047, + "learning_rate": 1.690689890419164e-05, + "loss": 0.8288, + "step": 2651 + }, + { + "epoch": 0.20067345162876926, + "grad_norm": 2.54634165763855, + "learning_rate": 1.6905822860709446e-05, + "loss": 0.807, + "step": 2652 + }, + { + "epoch": 0.20074912035110287, + "grad_norm": 2.408019542694092, + "learning_rate": 1.690474632213906e-05, + "loss": 0.9354, + "step": 2653 + }, + { + "epoch": 0.20082478907343648, + "grad_norm": 3.3913652896881104, + "learning_rate": 1.690366928854789e-05, + "loss": 0.8307, + "step": 2654 + }, + { + "epoch": 0.20090045779577012, + "grad_norm": 3.0079503059387207, + "learning_rate": 1.6902591760003387e-05, + "loss": 0.6633, + "step": 2655 + }, + { + "epoch": 0.20097612651810373, + "grad_norm": 2.45389461517334, + "learning_rate": 1.6901513736573027e-05, + "loss": 0.7959, + "step": 2656 + }, + { + "epoch": 0.20105179524043737, + "grad_norm": 2.300036907196045, + "learning_rate": 1.690043521832432e-05, + "loss": 0.7693, + "step": 2657 + }, + { + "epoch": 0.20112746396277098, + "grad_norm": 2.469834566116333, + "learning_rate": 1.6899356205324807e-05, + "loss": 0.8985, + "step": 2658 + }, + { + "epoch": 0.20120313268510462, + "grad_norm": 2.4078941345214844, + "learning_rate": 1.6898276697642056e-05, + "loss": 0.8893, + "step": 2659 + }, + { + "epoch": 0.20127880140743823, + "grad_norm": 2.462860584259033, + "learning_rate": 1.6897196695343672e-05, + "loss": 0.8923, + "step": 2660 + }, + { + "epoch": 0.20135447012977187, + "grad_norm": 2.2678468227386475, + "learning_rate": 
1.6896116198497295e-05, + "loss": 0.8364, + "step": 2661 + }, + { + "epoch": 0.20143013885210548, + "grad_norm": 2.6497673988342285, + "learning_rate": 1.6895035207170577e-05, + "loss": 0.7843, + "step": 2662 + }, + { + "epoch": 0.20150580757443912, + "grad_norm": 2.344269037246704, + "learning_rate": 1.6893953721431218e-05, + "loss": 0.7275, + "step": 2663 + }, + { + "epoch": 0.20158147629677273, + "grad_norm": 2.5566043853759766, + "learning_rate": 1.689287174134695e-05, + "loss": 0.856, + "step": 2664 + }, + { + "epoch": 0.20165714501910637, + "grad_norm": 2.8160223960876465, + "learning_rate": 1.689178926698552e-05, + "loss": 0.7982, + "step": 2665 + }, + { + "epoch": 0.20173281374143998, + "grad_norm": 5.079640865325928, + "learning_rate": 1.6890706298414722e-05, + "loss": 0.8701, + "step": 2666 + }, + { + "epoch": 0.2018084824637736, + "grad_norm": 2.1939332485198975, + "learning_rate": 1.6889622835702372e-05, + "loss": 0.6718, + "step": 2667 + }, + { + "epoch": 0.20188415118610722, + "grad_norm": 2.220892906188965, + "learning_rate": 1.6888538878916328e-05, + "loss": 0.6956, + "step": 2668 + }, + { + "epoch": 0.20195981990844084, + "grad_norm": 2.5196173191070557, + "learning_rate": 1.688745442812446e-05, + "loss": 0.864, + "step": 2669 + }, + { + "epoch": 0.20203548863077447, + "grad_norm": 2.502357244491577, + "learning_rate": 1.6886369483394683e-05, + "loss": 0.7481, + "step": 2670 + }, + { + "epoch": 0.20211115735310808, + "grad_norm": 3.1497068405151367, + "learning_rate": 1.6885284044794946e-05, + "loss": 0.7413, + "step": 2671 + }, + { + "epoch": 0.20218682607544172, + "grad_norm": 2.358307361602783, + "learning_rate": 1.6884198112393216e-05, + "loss": 0.8536, + "step": 2672 + }, + { + "epoch": 0.20226249479777533, + "grad_norm": 3.1044929027557373, + "learning_rate": 1.68831116862575e-05, + "loss": 0.8081, + "step": 2673 + }, + { + "epoch": 0.20233816352010897, + "grad_norm": 2.201646327972412, + "learning_rate": 1.6882024766455832e-05, + "loss": 0.9349, + "step": 2674 + }, + { + "epoch": 0.20241383224244258, + "grad_norm": 2.6423752307891846, + "learning_rate": 1.6880937353056283e-05, + "loss": 0.7464, + "step": 2675 + }, + { + "epoch": 0.20248950096477622, + "grad_norm": 2.547576427459717, + "learning_rate": 1.6879849446126942e-05, + "loss": 0.6216, + "step": 2676 + }, + { + "epoch": 0.20256516968710983, + "grad_norm": 2.7774250507354736, + "learning_rate": 1.6878761045735946e-05, + "loss": 0.828, + "step": 2677 + }, + { + "epoch": 0.20264083840944347, + "grad_norm": 2.6742935180664062, + "learning_rate": 1.6877672151951446e-05, + "loss": 0.7657, + "step": 2678 + }, + { + "epoch": 0.20271650713177708, + "grad_norm": 2.081855535507202, + "learning_rate": 1.687658276484164e-05, + "loss": 0.7594, + "step": 2679 + }, + { + "epoch": 0.2027921758541107, + "grad_norm": 2.9770843982696533, + "learning_rate": 1.6875492884474744e-05, + "loss": 0.777, + "step": 2680 + }, + { + "epoch": 0.20286784457644433, + "grad_norm": 2.1680080890655518, + "learning_rate": 1.6874402510919013e-05, + "loss": 0.8131, + "step": 2681 + }, + { + "epoch": 0.20294351329877794, + "grad_norm": 2.5214853286743164, + "learning_rate": 1.6873311644242726e-05, + "loss": 0.8729, + "step": 2682 + }, + { + "epoch": 0.20301918202111158, + "grad_norm": 2.355656862258911, + "learning_rate": 1.68722202845142e-05, + "loss": 0.7149, + "step": 2683 + }, + { + "epoch": 0.2030948507434452, + "grad_norm": 2.722672939300537, + "learning_rate": 1.6871128431801776e-05, + "loss": 0.7906, + "step": 2684 + }, + { + "epoch": 
0.20317051946577883, + "grad_norm": 2.526291608810425, + "learning_rate": 1.6870036086173833e-05, + "loss": 0.9885, + "step": 2685 + }, + { + "epoch": 0.20324618818811244, + "grad_norm": 2.7537612915039062, + "learning_rate": 1.686894324769877e-05, + "loss": 0.7336, + "step": 2686 + }, + { + "epoch": 0.20332185691044607, + "grad_norm": 2.314716100692749, + "learning_rate": 1.686784991644504e-05, + "loss": 0.6739, + "step": 2687 + }, + { + "epoch": 0.20339752563277969, + "grad_norm": 2.298309087753296, + "learning_rate": 1.6866756092481092e-05, + "loss": 0.655, + "step": 2688 + }, + { + "epoch": 0.20347319435511332, + "grad_norm": 2.149913787841797, + "learning_rate": 1.6865661775875437e-05, + "loss": 0.688, + "step": 2689 + }, + { + "epoch": 0.20354886307744693, + "grad_norm": 2.236656904220581, + "learning_rate": 1.68645669666966e-05, + "loss": 0.8814, + "step": 2690 + }, + { + "epoch": 0.20362453179978057, + "grad_norm": 2.557054042816162, + "learning_rate": 1.686347166501314e-05, + "loss": 0.8819, + "step": 2691 + }, + { + "epoch": 0.20370020052211418, + "grad_norm": 7.46500301361084, + "learning_rate": 1.6862375870893653e-05, + "loss": 0.807, + "step": 2692 + }, + { + "epoch": 0.2037758692444478, + "grad_norm": 2.449782609939575, + "learning_rate": 1.686127958440676e-05, + "loss": 0.6193, + "step": 2693 + }, + { + "epoch": 0.20385153796678143, + "grad_norm": 2.736797571182251, + "learning_rate": 1.6860182805621112e-05, + "loss": 0.9145, + "step": 2694 + }, + { + "epoch": 0.20392720668911504, + "grad_norm": 2.61444354057312, + "learning_rate": 1.6859085534605395e-05, + "loss": 0.8425, + "step": 2695 + }, + { + "epoch": 0.20400287541144868, + "grad_norm": 2.4622318744659424, + "learning_rate": 1.6857987771428323e-05, + "loss": 0.9507, + "step": 2696 + }, + { + "epoch": 0.2040785441337823, + "grad_norm": 2.3272974491119385, + "learning_rate": 1.6856889516158637e-05, + "loss": 0.774, + "step": 2697 + }, + { + "epoch": 0.20415421285611593, + "grad_norm": 2.636510133743286, + "learning_rate": 1.685579076886512e-05, + "loss": 0.7583, + "step": 2698 + }, + { + "epoch": 0.20422988157844954, + "grad_norm": 5.449431896209717, + "learning_rate": 1.6854691529616578e-05, + "loss": 0.7954, + "step": 2699 + }, + { + "epoch": 0.20430555030078318, + "grad_norm": 2.3265273571014404, + "learning_rate": 1.6853591798481845e-05, + "loss": 0.6401, + "step": 2700 + }, + { + "epoch": 0.2043812190231168, + "grad_norm": 2.6790006160736084, + "learning_rate": 1.685249157552979e-05, + "loss": 0.9875, + "step": 2701 + }, + { + "epoch": 0.20445688774545043, + "grad_norm": 2.484577178955078, + "learning_rate": 1.6851390860829317e-05, + "loss": 0.7331, + "step": 2702 + }, + { + "epoch": 0.20453255646778404, + "grad_norm": 2.061288833618164, + "learning_rate": 1.6850289654449355e-05, + "loss": 0.7615, + "step": 2703 + }, + { + "epoch": 0.20460822519011768, + "grad_norm": 1.9732778072357178, + "learning_rate": 1.684918795645886e-05, + "loss": 0.7419, + "step": 2704 + }, + { + "epoch": 0.2046838939124513, + "grad_norm": 2.913220167160034, + "learning_rate": 1.684808576692683e-05, + "loss": 0.7499, + "step": 2705 + }, + { + "epoch": 0.20475956263478493, + "grad_norm": 1.9376118183135986, + "learning_rate": 1.6846983085922287e-05, + "loss": 0.8705, + "step": 2706 + }, + { + "epoch": 0.20483523135711854, + "grad_norm": 2.383751153945923, + "learning_rate": 1.684587991351428e-05, + "loss": 0.8206, + "step": 2707 + }, + { + "epoch": 0.20491090007945215, + "grad_norm": 2.5557644367218018, + "learning_rate": 
1.68447762497719e-05, + "loss": 0.802, + "step": 2708 + }, + { + "epoch": 0.20498656880178578, + "grad_norm": 2.486907958984375, + "learning_rate": 1.6843672094764253e-05, + "loss": 0.8066, + "step": 2709 + }, + { + "epoch": 0.2050622375241194, + "grad_norm": 2.465059757232666, + "learning_rate": 1.6842567448560494e-05, + "loss": 0.8275, + "step": 2710 + }, + { + "epoch": 0.20513790624645303, + "grad_norm": 2.361616611480713, + "learning_rate": 1.6841462311229796e-05, + "loss": 0.7221, + "step": 2711 + }, + { + "epoch": 0.20521357496878664, + "grad_norm": 3.4184422492980957, + "learning_rate": 1.6840356682841362e-05, + "loss": 0.6928, + "step": 2712 + }, + { + "epoch": 0.20528924369112028, + "grad_norm": 2.329210042953491, + "learning_rate": 1.6839250563464436e-05, + "loss": 0.7858, + "step": 2713 + }, + { + "epoch": 0.2053649124134539, + "grad_norm": 2.920400619506836, + "learning_rate": 1.6838143953168285e-05, + "loss": 0.7444, + "step": 2714 + }, + { + "epoch": 0.20544058113578753, + "grad_norm": 2.222848415374756, + "learning_rate": 1.6837036852022205e-05, + "loss": 0.7223, + "step": 2715 + }, + { + "epoch": 0.20551624985812114, + "grad_norm": 2.549514055252075, + "learning_rate": 1.683592926009553e-05, + "loss": 0.8735, + "step": 2716 + }, + { + "epoch": 0.20559191858045478, + "grad_norm": 3.1271274089813232, + "learning_rate": 1.6834821177457625e-05, + "loss": 0.7578, + "step": 2717 + }, + { + "epoch": 0.2056675873027884, + "grad_norm": 2.493976354598999, + "learning_rate": 1.683371260417787e-05, + "loss": 0.714, + "step": 2718 + }, + { + "epoch": 0.20574325602512203, + "grad_norm": 2.656724214553833, + "learning_rate": 1.6832603540325702e-05, + "loss": 0.9036, + "step": 2719 + }, + { + "epoch": 0.20581892474745564, + "grad_norm": 1.9792472124099731, + "learning_rate": 1.683149398597056e-05, + "loss": 0.8492, + "step": 2720 + }, + { + "epoch": 0.20589459346978925, + "grad_norm": 3.5724356174468994, + "learning_rate": 1.6830383941181938e-05, + "loss": 0.7013, + "step": 2721 + }, + { + "epoch": 0.2059702621921229, + "grad_norm": 2.62611985206604, + "learning_rate": 1.6829273406029347e-05, + "loss": 0.7121, + "step": 2722 + }, + { + "epoch": 0.2060459309144565, + "grad_norm": 2.143665075302124, + "learning_rate": 1.6828162380582334e-05, + "loss": 0.686, + "step": 2723 + }, + { + "epoch": 0.20612159963679014, + "grad_norm": 2.061655044555664, + "learning_rate": 1.682705086491047e-05, + "loss": 0.9111, + "step": 2724 + }, + { + "epoch": 0.20619726835912375, + "grad_norm": 2.1191210746765137, + "learning_rate": 1.6825938859083365e-05, + "loss": 0.7933, + "step": 2725 + }, + { + "epoch": 0.20627293708145739, + "grad_norm": 2.6450743675231934, + "learning_rate": 1.6824826363170658e-05, + "loss": 0.6982, + "step": 2726 + }, + { + "epoch": 0.206348605803791, + "grad_norm": 1.9432772397994995, + "learning_rate": 1.6823713377242015e-05, + "loss": 0.7923, + "step": 2727 + }, + { + "epoch": 0.20642427452612463, + "grad_norm": 2.247926712036133, + "learning_rate": 1.6822599901367134e-05, + "loss": 0.7163, + "step": 2728 + }, + { + "epoch": 0.20649994324845825, + "grad_norm": 2.1911559104919434, + "learning_rate": 1.6821485935615748e-05, + "loss": 0.7813, + "step": 2729 + }, + { + "epoch": 0.20657561197079188, + "grad_norm": 2.491403341293335, + "learning_rate": 1.6820371480057613e-05, + "loss": 0.8105, + "step": 2730 + }, + { + "epoch": 0.2066512806931255, + "grad_norm": 2.1198718547821045, + "learning_rate": 1.6819256534762525e-05, + "loss": 0.8581, + "step": 2731 + }, + { + "epoch": 
0.20672694941545913, + "grad_norm": 2.3443424701690674, + "learning_rate": 1.68181410998003e-05, + "loss": 0.791, + "step": 2732 + }, + { + "epoch": 0.20680261813779274, + "grad_norm": 1.8470584154129028, + "learning_rate": 1.6817025175240793e-05, + "loss": 0.8894, + "step": 2733 + }, + { + "epoch": 0.20687828686012635, + "grad_norm": 2.2806928157806396, + "learning_rate": 1.6815908761153887e-05, + "loss": 0.6281, + "step": 2734 + }, + { + "epoch": 0.20695395558246, + "grad_norm": 2.3737926483154297, + "learning_rate": 1.681479185760949e-05, + "loss": 0.6816, + "step": 2735 + }, + { + "epoch": 0.2070296243047936, + "grad_norm": 2.267542600631714, + "learning_rate": 1.681367446467756e-05, + "loss": 0.7355, + "step": 2736 + }, + { + "epoch": 0.20710529302712724, + "grad_norm": 2.259472608566284, + "learning_rate": 1.6812556582428052e-05, + "loss": 0.7937, + "step": 2737 + }, + { + "epoch": 0.20718096174946085, + "grad_norm": 2.7698655128479004, + "learning_rate": 1.6811438210930987e-05, + "loss": 0.8421, + "step": 2738 + }, + { + "epoch": 0.2072566304717945, + "grad_norm": 2.1473047733306885, + "learning_rate": 1.6810319350256397e-05, + "loss": 0.6328, + "step": 2739 + }, + { + "epoch": 0.2073322991941281, + "grad_norm": 2.5372629165649414, + "learning_rate": 1.6809200000474347e-05, + "loss": 0.7475, + "step": 2740 + }, + { + "epoch": 0.20740796791646174, + "grad_norm": 2.0664265155792236, + "learning_rate": 1.6808080161654935e-05, + "loss": 0.6881, + "step": 2741 + }, + { + "epoch": 0.20748363663879535, + "grad_norm": 4.4088311195373535, + "learning_rate": 1.6806959833868288e-05, + "loss": 0.705, + "step": 2742 + }, + { + "epoch": 0.207559305361129, + "grad_norm": 3.005873918533325, + "learning_rate": 1.6805839017184565e-05, + "loss": 0.8028, + "step": 2743 + }, + { + "epoch": 0.2076349740834626, + "grad_norm": 2.4871227741241455, + "learning_rate": 1.6804717711673957e-05, + "loss": 0.8642, + "step": 2744 + }, + { + "epoch": 0.20771064280579624, + "grad_norm": 2.4536328315734863, + "learning_rate": 1.680359591740668e-05, + "loss": 0.6728, + "step": 2745 + }, + { + "epoch": 0.20778631152812985, + "grad_norm": 2.521181583404541, + "learning_rate": 1.680247363445299e-05, + "loss": 0.6283, + "step": 2746 + }, + { + "epoch": 0.20786198025046346, + "grad_norm": 2.403087615966797, + "learning_rate": 1.680135086288316e-05, + "loss": 0.7569, + "step": 2747 + }, + { + "epoch": 0.2079376489727971, + "grad_norm": 2.606722116470337, + "learning_rate": 1.6800227602767513e-05, + "loss": 0.8054, + "step": 2748 + }, + { + "epoch": 0.2080133176951307, + "grad_norm": 2.361828088760376, + "learning_rate": 1.679910385417638e-05, + "loss": 0.8643, + "step": 2749 + }, + { + "epoch": 0.20808898641746434, + "grad_norm": 3.0654191970825195, + "learning_rate": 1.679797961718014e-05, + "loss": 0.8303, + "step": 2750 + }, + { + "epoch": 0.20816465513979795, + "grad_norm": 2.4694888591766357, + "learning_rate": 1.6796854891849195e-05, + "loss": 0.9629, + "step": 2751 + }, + { + "epoch": 0.2082403238621316, + "grad_norm": 2.855731964111328, + "learning_rate": 1.6795729678253977e-05, + "loss": 0.9194, + "step": 2752 + }, + { + "epoch": 0.2083159925844652, + "grad_norm": 2.1465506553649902, + "learning_rate": 1.6794603976464953e-05, + "loss": 0.7555, + "step": 2753 + }, + { + "epoch": 0.20839166130679884, + "grad_norm": 2.5873541831970215, + "learning_rate": 1.6793477786552618e-05, + "loss": 0.7371, + "step": 2754 + }, + { + "epoch": 0.20846733002913245, + "grad_norm": 2.087071180343628, + "learning_rate": 
1.679235110858749e-05, + "loss": 0.6777, + "step": 2755 + }, + { + "epoch": 0.2085429987514661, + "grad_norm": 3.9847989082336426, + "learning_rate": 1.679122394264014e-05, + "loss": 0.8945, + "step": 2756 + }, + { + "epoch": 0.2086186674737997, + "grad_norm": 2.593203067779541, + "learning_rate": 1.6790096288781148e-05, + "loss": 0.6988, + "step": 2757 + }, + { + "epoch": 0.20869433619613334, + "grad_norm": 2.5116524696350098, + "learning_rate": 1.6788968147081126e-05, + "loss": 0.812, + "step": 2758 + }, + { + "epoch": 0.20877000491846695, + "grad_norm": 2.77005672454834, + "learning_rate": 1.6787839517610727e-05, + "loss": 0.877, + "step": 2759 + }, + { + "epoch": 0.2088456736408006, + "grad_norm": 2.331638813018799, + "learning_rate": 1.6786710400440627e-05, + "loss": 0.8041, + "step": 2760 + }, + { + "epoch": 0.2089213423631342, + "grad_norm": 2.0847373008728027, + "learning_rate": 1.678558079564154e-05, + "loss": 0.8226, + "step": 2761 + }, + { + "epoch": 0.2089970110854678, + "grad_norm": 2.118413209915161, + "learning_rate": 1.6784450703284197e-05, + "loss": 0.8281, + "step": 2762 + }, + { + "epoch": 0.20907267980780145, + "grad_norm": 2.4616172313690186, + "learning_rate": 1.6783320123439376e-05, + "loss": 0.7126, + "step": 2763 + }, + { + "epoch": 0.20914834853013506, + "grad_norm": 2.1653876304626465, + "learning_rate": 1.6782189056177875e-05, + "loss": 0.7439, + "step": 2764 + }, + { + "epoch": 0.2092240172524687, + "grad_norm": 2.1744189262390137, + "learning_rate": 1.6781057501570522e-05, + "loss": 0.8306, + "step": 2765 + }, + { + "epoch": 0.2092996859748023, + "grad_norm": 2.326197624206543, + "learning_rate": 1.6779925459688186e-05, + "loss": 0.7588, + "step": 2766 + }, + { + "epoch": 0.20937535469713595, + "grad_norm": 2.3091301918029785, + "learning_rate": 1.677879293060175e-05, + "loss": 0.7054, + "step": 2767 + }, + { + "epoch": 0.20945102341946956, + "grad_norm": 2.6318981647491455, + "learning_rate": 1.6777659914382144e-05, + "loss": 0.8123, + "step": 2768 + }, + { + "epoch": 0.2095266921418032, + "grad_norm": 2.382268190383911, + "learning_rate": 1.6776526411100315e-05, + "loss": 0.815, + "step": 2769 + }, + { + "epoch": 0.2096023608641368, + "grad_norm": 2.143889904022217, + "learning_rate": 1.6775392420827253e-05, + "loss": 0.7313, + "step": 2770 + }, + { + "epoch": 0.20967802958647044, + "grad_norm": 2.355656385421753, + "learning_rate": 1.6774257943633967e-05, + "loss": 0.7956, + "step": 2771 + }, + { + "epoch": 0.20975369830880405, + "grad_norm": 2.269749641418457, + "learning_rate": 1.6773122979591503e-05, + "loss": 0.7962, + "step": 2772 + }, + { + "epoch": 0.2098293670311377, + "grad_norm": 2.584016799926758, + "learning_rate": 1.6771987528770938e-05, + "loss": 0.8603, + "step": 2773 + }, + { + "epoch": 0.2099050357534713, + "grad_norm": 2.4904229640960693, + "learning_rate": 1.6770851591243378e-05, + "loss": 0.7932, + "step": 2774 + }, + { + "epoch": 0.2099807044758049, + "grad_norm": 3.181654453277588, + "learning_rate": 1.6769715167079953e-05, + "loss": 0.9061, + "step": 2775 + }, + { + "epoch": 0.21005637319813855, + "grad_norm": 2.1907131671905518, + "learning_rate": 1.6768578256351835e-05, + "loss": 0.7714, + "step": 2776 + }, + { + "epoch": 0.21013204192047216, + "grad_norm": 2.349133253097534, + "learning_rate": 1.6767440859130222e-05, + "loss": 0.8765, + "step": 2777 + }, + { + "epoch": 0.2102077106428058, + "grad_norm": 2.340827226638794, + "learning_rate": 1.6766302975486342e-05, + "loss": 0.8702, + "step": 2778 + }, + { + "epoch": 
0.2102833793651394, + "grad_norm": 2.352503776550293, + "learning_rate": 1.6765164605491445e-05, + "loss": 0.6379, + "step": 2779 + }, + { + "epoch": 0.21035904808747305, + "grad_norm": 2.20408034324646, + "learning_rate": 1.6764025749216826e-05, + "loss": 0.6901, + "step": 2780 + }, + { + "epoch": 0.21043471680980666, + "grad_norm": 2.612621784210205, + "learning_rate": 1.6762886406733803e-05, + "loss": 0.6788, + "step": 2781 + }, + { + "epoch": 0.2105103855321403, + "grad_norm": 2.921649932861328, + "learning_rate": 1.6761746578113727e-05, + "loss": 0.7544, + "step": 2782 + }, + { + "epoch": 0.2105860542544739, + "grad_norm": 3.306110382080078, + "learning_rate": 1.6760606263427975e-05, + "loss": 0.7943, + "step": 2783 + }, + { + "epoch": 0.21066172297680755, + "grad_norm": 2.2433836460113525, + "learning_rate": 1.675946546274796e-05, + "loss": 0.9816, + "step": 2784 + }, + { + "epoch": 0.21073739169914116, + "grad_norm": 2.480579137802124, + "learning_rate": 1.6758324176145117e-05, + "loss": 0.7943, + "step": 2785 + }, + { + "epoch": 0.2108130604214748, + "grad_norm": 3.2746899127960205, + "learning_rate": 1.675718240369092e-05, + "loss": 0.8868, + "step": 2786 + }, + { + "epoch": 0.2108887291438084, + "grad_norm": 2.4543137550354004, + "learning_rate": 1.675604014545687e-05, + "loss": 0.7192, + "step": 2787 + }, + { + "epoch": 0.21096439786614202, + "grad_norm": 2.593430519104004, + "learning_rate": 1.6754897401514504e-05, + "loss": 0.7476, + "step": 2788 + }, + { + "epoch": 0.21104006658847566, + "grad_norm": 2.6094298362731934, + "learning_rate": 1.675375417193538e-05, + "loss": 0.8261, + "step": 2789 + }, + { + "epoch": 0.21111573531080927, + "grad_norm": 2.957797050476074, + "learning_rate": 1.6752610456791093e-05, + "loss": 0.777, + "step": 2790 + }, + { + "epoch": 0.2111914040331429, + "grad_norm": 3.8428449630737305, + "learning_rate": 1.6751466256153257e-05, + "loss": 0.8653, + "step": 2791 + }, + { + "epoch": 0.21126707275547651, + "grad_norm": 2.952915668487549, + "learning_rate": 1.675032157009354e-05, + "loss": 0.7709, + "step": 2792 + }, + { + "epoch": 0.21134274147781015, + "grad_norm": 2.7285871505737305, + "learning_rate": 1.6749176398683616e-05, + "loss": 0.7093, + "step": 2793 + }, + { + "epoch": 0.21141841020014376, + "grad_norm": 16.232563018798828, + "learning_rate": 1.67480307419952e-05, + "loss": 0.8038, + "step": 2794 + }, + { + "epoch": 0.2114940789224774, + "grad_norm": 2.1137609481811523, + "learning_rate": 1.6746884600100038e-05, + "loss": 0.8155, + "step": 2795 + }, + { + "epoch": 0.211569747644811, + "grad_norm": 2.6953883171081543, + "learning_rate": 1.674573797306991e-05, + "loss": 0.736, + "step": 2796 + }, + { + "epoch": 0.21164541636714465, + "grad_norm": 2.4279799461364746, + "learning_rate": 1.6744590860976615e-05, + "loss": 0.7183, + "step": 2797 + }, + { + "epoch": 0.21172108508947826, + "grad_norm": 2.5715172290802, + "learning_rate": 1.6743443263891994e-05, + "loss": 0.8558, + "step": 2798 + }, + { + "epoch": 0.2117967538118119, + "grad_norm": 2.9246373176574707, + "learning_rate": 1.6742295181887908e-05, + "loss": 0.8916, + "step": 2799 + }, + { + "epoch": 0.2118724225341455, + "grad_norm": 2.751868486404419, + "learning_rate": 1.6741146615036255e-05, + "loss": 0.9153, + "step": 2800 + }, + { + "epoch": 0.21194809125647912, + "grad_norm": 2.7025206089019775, + "learning_rate": 1.6739997563408967e-05, + "loss": 0.6688, + "step": 2801 + }, + { + "epoch": 0.21202375997881276, + "grad_norm": 2.7657957077026367, + "learning_rate": 
1.6738848027077994e-05, + "loss": 0.9089, + "step": 2802 + }, + { + "epoch": 0.21209942870114637, + "grad_norm": 2.771183967590332, + "learning_rate": 1.6737698006115326e-05, + "loss": 0.8091, + "step": 2803 + }, + { + "epoch": 0.21217509742348, + "grad_norm": 2.281928300857544, + "learning_rate": 1.6736547500592985e-05, + "loss": 0.7638, + "step": 2804 + }, + { + "epoch": 0.21225076614581362, + "grad_norm": 2.5218539237976074, + "learning_rate": 1.673539651058302e-05, + "loss": 0.9364, + "step": 2805 + }, + { + "epoch": 0.21232643486814726, + "grad_norm": 2.1761555671691895, + "learning_rate": 1.6734245036157498e-05, + "loss": 0.7687, + "step": 2806 + }, + { + "epoch": 0.21240210359048087, + "grad_norm": 2.418473720550537, + "learning_rate": 1.6733093077388543e-05, + "loss": 0.676, + "step": 2807 + }, + { + "epoch": 0.2124777723128145, + "grad_norm": 1.9938814640045166, + "learning_rate": 1.673194063434828e-05, + "loss": 0.745, + "step": 2808 + }, + { + "epoch": 0.21255344103514812, + "grad_norm": 2.486959934234619, + "learning_rate": 1.6730787707108895e-05, + "loss": 0.8677, + "step": 2809 + }, + { + "epoch": 0.21262910975748175, + "grad_norm": 2.121563673019409, + "learning_rate": 1.6729634295742573e-05, + "loss": 0.7888, + "step": 2810 + }, + { + "epoch": 0.21270477847981537, + "grad_norm": 2.618818759918213, + "learning_rate": 1.6728480400321553e-05, + "loss": 0.6763, + "step": 2811 + }, + { + "epoch": 0.212780447202149, + "grad_norm": 2.66679310798645, + "learning_rate": 1.6727326020918095e-05, + "loss": 0.761, + "step": 2812 + }, + { + "epoch": 0.21285611592448261, + "grad_norm": 3.1060845851898193, + "learning_rate": 1.6726171157604486e-05, + "loss": 0.7265, + "step": 2813 + }, + { + "epoch": 0.21293178464681625, + "grad_norm": 2.5589113235473633, + "learning_rate": 1.672501581045305e-05, + "loss": 0.8843, + "step": 2814 + }, + { + "epoch": 0.21300745336914986, + "grad_norm": 2.34407901763916, + "learning_rate": 1.672385997953614e-05, + "loss": 0.8115, + "step": 2815 + }, + { + "epoch": 0.21308312209148347, + "grad_norm": 2.688868522644043, + "learning_rate": 1.6722703664926135e-05, + "loss": 0.6882, + "step": 2816 + }, + { + "epoch": 0.2131587908138171, + "grad_norm": 2.3896734714508057, + "learning_rate": 1.672154686669545e-05, + "loss": 0.7876, + "step": 2817 + }, + { + "epoch": 0.21323445953615072, + "grad_norm": 2.3544201850891113, + "learning_rate": 1.6720389584916525e-05, + "loss": 0.8239, + "step": 2818 + }, + { + "epoch": 0.21331012825848436, + "grad_norm": 2.2516539096832275, + "learning_rate": 1.671923181966183e-05, + "loss": 0.7131, + "step": 2819 + }, + { + "epoch": 0.21338579698081797, + "grad_norm": 3.938749074935913, + "learning_rate": 1.671807357100387e-05, + "loss": 0.7547, + "step": 2820 + }, + { + "epoch": 0.2134614657031516, + "grad_norm": 2.0515267848968506, + "learning_rate": 1.6716914839015185e-05, + "loss": 0.6756, + "step": 2821 + }, + { + "epoch": 0.21353713442548522, + "grad_norm": 2.133115768432617, + "learning_rate": 1.6715755623768334e-05, + "loss": 0.7228, + "step": 2822 + }, + { + "epoch": 0.21361280314781886, + "grad_norm": 2.3022468090057373, + "learning_rate": 1.6714595925335906e-05, + "loss": 0.8373, + "step": 2823 + }, + { + "epoch": 0.21368847187015247, + "grad_norm": 2.543943166732788, + "learning_rate": 1.671343574379053e-05, + "loss": 0.9197, + "step": 2824 + }, + { + "epoch": 0.2137641405924861, + "grad_norm": 2.1154918670654297, + "learning_rate": 1.6712275079204863e-05, + "loss": 0.9015, + "step": 2825 + }, + { + "epoch": 
0.21383980931481972, + "grad_norm": 2.0648181438446045, + "learning_rate": 1.671111393165158e-05, + "loss": 0.8252, + "step": 2826 + }, + { + "epoch": 0.21391547803715336, + "grad_norm": 2.3254873752593994, + "learning_rate": 1.6709952301203405e-05, + "loss": 0.7808, + "step": 2827 + }, + { + "epoch": 0.21399114675948697, + "grad_norm": 2.4721736907958984, + "learning_rate": 1.670879018793308e-05, + "loss": 0.7166, + "step": 2828 + }, + { + "epoch": 0.21406681548182058, + "grad_norm": 2.7000808715820312, + "learning_rate": 1.6707627591913382e-05, + "loss": 0.8353, + "step": 2829 + }, + { + "epoch": 0.21414248420415422, + "grad_norm": 2.1937551498413086, + "learning_rate": 1.6706464513217115e-05, + "loss": 0.874, + "step": 2830 + }, + { + "epoch": 0.21421815292648783, + "grad_norm": 4.846593379974365, + "learning_rate": 1.670530095191711e-05, + "loss": 0.9048, + "step": 2831 + }, + { + "epoch": 0.21429382164882146, + "grad_norm": 2.571143627166748, + "learning_rate": 1.6704136908086242e-05, + "loss": 0.871, + "step": 2832 + }, + { + "epoch": 0.21436949037115507, + "grad_norm": 2.5679643154144287, + "learning_rate": 1.67029723817974e-05, + "loss": 0.7586, + "step": 2833 + }, + { + "epoch": 0.2144451590934887, + "grad_norm": 2.327501058578491, + "learning_rate": 1.670180737312351e-05, + "loss": 0.7948, + "step": 2834 + }, + { + "epoch": 0.21452082781582232, + "grad_norm": 2.3081796169281006, + "learning_rate": 1.670064188213754e-05, + "loss": 0.8191, + "step": 2835 + }, + { + "epoch": 0.21459649653815596, + "grad_norm": 2.2802083492279053, + "learning_rate": 1.669947590891246e-05, + "loss": 0.7863, + "step": 2836 + }, + { + "epoch": 0.21467216526048957, + "grad_norm": 1.9905712604522705, + "learning_rate": 1.6698309453521298e-05, + "loss": 0.6816, + "step": 2837 + }, + { + "epoch": 0.2147478339828232, + "grad_norm": 2.6414380073547363, + "learning_rate": 1.66971425160371e-05, + "loss": 0.7644, + "step": 2838 + }, + { + "epoch": 0.21482350270515682, + "grad_norm": 2.325744390487671, + "learning_rate": 1.6695975096532946e-05, + "loss": 0.9109, + "step": 2839 + }, + { + "epoch": 0.21489917142749046, + "grad_norm": 2.3487441539764404, + "learning_rate": 1.6694807195081934e-05, + "loss": 0.7954, + "step": 2840 + }, + { + "epoch": 0.21497484014982407, + "grad_norm": 2.7436139583587646, + "learning_rate": 1.6693638811757206e-05, + "loss": 0.9525, + "step": 2841 + }, + { + "epoch": 0.21505050887215768, + "grad_norm": 2.626749038696289, + "learning_rate": 1.6692469946631935e-05, + "loss": 0.7477, + "step": 2842 + }, + { + "epoch": 0.21512617759449132, + "grad_norm": 2.0960118770599365, + "learning_rate": 1.6691300599779314e-05, + "loss": 0.828, + "step": 2843 + }, + { + "epoch": 0.21520184631682493, + "grad_norm": 2.273026704788208, + "learning_rate": 1.6690130771272576e-05, + "loss": 0.6678, + "step": 2844 + }, + { + "epoch": 0.21527751503915857, + "grad_norm": 1.8988648653030396, + "learning_rate": 1.6688960461184974e-05, + "loss": 0.6153, + "step": 2845 + }, + { + "epoch": 0.21535318376149218, + "grad_norm": 2.7627248764038086, + "learning_rate": 1.6687789669589797e-05, + "loss": 0.8108, + "step": 2846 + }, + { + "epoch": 0.21542885248382582, + "grad_norm": 7.99500846862793, + "learning_rate": 1.6686618396560365e-05, + "loss": 0.7978, + "step": 2847 + }, + { + "epoch": 0.21550452120615943, + "grad_norm": 2.0139832496643066, + "learning_rate": 1.668544664217003e-05, + "loss": 0.8848, + "step": 2848 + }, + { + "epoch": 0.21558018992849307, + "grad_norm": 2.0754570960998535, + "learning_rate": 
1.668427440649217e-05, + "loss": 0.8815, + "step": 2849 + }, + { + "epoch": 0.21565585865082668, + "grad_norm": 2.176095485687256, + "learning_rate": 1.668310168960019e-05, + "loss": 0.7272, + "step": 2850 + }, + { + "epoch": 0.21573152737316031, + "grad_norm": 2.36541748046875, + "learning_rate": 1.668192849156753e-05, + "loss": 0.6364, + "step": 2851 + }, + { + "epoch": 0.21580719609549393, + "grad_norm": 2.3747830390930176, + "learning_rate": 1.6680754812467666e-05, + "loss": 0.8652, + "step": 2852 + }, + { + "epoch": 0.21588286481782756, + "grad_norm": 2.5572152137756348, + "learning_rate": 1.667958065237409e-05, + "loss": 0.7099, + "step": 2853 + }, + { + "epoch": 0.21595853354016117, + "grad_norm": 2.4014647006988525, + "learning_rate": 1.6678406011360337e-05, + "loss": 0.9067, + "step": 2854 + }, + { + "epoch": 0.21603420226249478, + "grad_norm": 2.304295301437378, + "learning_rate": 1.6677230889499966e-05, + "loss": 0.6992, + "step": 2855 + }, + { + "epoch": 0.21610987098482842, + "grad_norm": 1.8173857927322388, + "learning_rate": 1.667605528686656e-05, + "loss": 1.0161, + "step": 2856 + }, + { + "epoch": 0.21618553970716203, + "grad_norm": 2.0855274200439453, + "learning_rate": 1.6674879203533748e-05, + "loss": 0.7699, + "step": 2857 + }, + { + "epoch": 0.21626120842949567, + "grad_norm": 1.9764469861984253, + "learning_rate": 1.6673702639575176e-05, + "loss": 0.8007, + "step": 2858 + }, + { + "epoch": 0.21633687715182928, + "grad_norm": 2.9846057891845703, + "learning_rate": 1.6672525595064527e-05, + "loss": 0.7783, + "step": 2859 + }, + { + "epoch": 0.21641254587416292, + "grad_norm": 2.2411768436431885, + "learning_rate": 1.667134807007551e-05, + "loss": 0.6131, + "step": 2860 + }, + { + "epoch": 0.21648821459649653, + "grad_norm": 2.099818229675293, + "learning_rate": 1.6670170064681858e-05, + "loss": 0.7569, + "step": 2861 + }, + { + "epoch": 0.21656388331883017, + "grad_norm": 2.3399593830108643, + "learning_rate": 1.6668991578957354e-05, + "loss": 0.7872, + "step": 2862 + }, + { + "epoch": 0.21663955204116378, + "grad_norm": 2.527578115463257, + "learning_rate": 1.666781261297579e-05, + "loss": 0.6493, + "step": 2863 + }, + { + "epoch": 0.21671522076349742, + "grad_norm": 2.33992862701416, + "learning_rate": 1.6666633166811004e-05, + "loss": 0.7972, + "step": 2864 + }, + { + "epoch": 0.21679088948583103, + "grad_norm": 2.1418380737304688, + "learning_rate": 1.666545324053685e-05, + "loss": 0.7548, + "step": 2865 + }, + { + "epoch": 0.21686655820816467, + "grad_norm": 2.6442832946777344, + "learning_rate": 1.6664272834227218e-05, + "loss": 0.8697, + "step": 2866 + }, + { + "epoch": 0.21694222693049828, + "grad_norm": 2.5152578353881836, + "learning_rate": 1.666309194795603e-05, + "loss": 0.9075, + "step": 2867 + }, + { + "epoch": 0.2170178956528319, + "grad_norm": 2.834080696105957, + "learning_rate": 1.6661910581797246e-05, + "loss": 0.7793, + "step": 2868 + }, + { + "epoch": 0.21709356437516553, + "grad_norm": 3.0419914722442627, + "learning_rate": 1.6660728735824834e-05, + "loss": 0.7984, + "step": 2869 + }, + { + "epoch": 0.21716923309749914, + "grad_norm": 2.7962558269500732, + "learning_rate": 1.6659546410112815e-05, + "loss": 0.9006, + "step": 2870 + }, + { + "epoch": 0.21724490181983278, + "grad_norm": 2.511221408843994, + "learning_rate": 1.6658363604735224e-05, + "loss": 0.8622, + "step": 2871 + }, + { + "epoch": 0.21732057054216639, + "grad_norm": 2.652181386947632, + "learning_rate": 1.6657180319766134e-05, + "loss": 0.866, + "step": 2872 + }, + { + "epoch": 
0.21739623926450002, + "grad_norm": 2.3145172595977783, + "learning_rate": 1.6655996555279645e-05, + "loss": 0.8523, + "step": 2873 + }, + { + "epoch": 0.21747190798683363, + "grad_norm": 2.5612776279449463, + "learning_rate": 1.665481231134989e-05, + "loss": 0.8403, + "step": 2874 + }, + { + "epoch": 0.21754757670916727, + "grad_norm": 1.8979072570800781, + "learning_rate": 1.665362758805103e-05, + "loss": 0.679, + "step": 2875 + }, + { + "epoch": 0.21762324543150088, + "grad_norm": 2.4776549339294434, + "learning_rate": 1.6652442385457255e-05, + "loss": 0.9507, + "step": 2876 + }, + { + "epoch": 0.21769891415383452, + "grad_norm": 3.0830917358398438, + "learning_rate": 1.6651256703642786e-05, + "loss": 0.7769, + "step": 2877 + }, + { + "epoch": 0.21777458287616813, + "grad_norm": 2.253445625305176, + "learning_rate": 1.6650070542681876e-05, + "loss": 0.7806, + "step": 2878 + }, + { + "epoch": 0.21785025159850177, + "grad_norm": 2.626587152481079, + "learning_rate": 1.6648883902648805e-05, + "loss": 0.784, + "step": 2879 + }, + { + "epoch": 0.21792592032083538, + "grad_norm": 2.3169028759002686, + "learning_rate": 1.6647696783617887e-05, + "loss": 0.594, + "step": 2880 + }, + { + "epoch": 0.21800158904316902, + "grad_norm": 2.367800712585449, + "learning_rate": 1.6646509185663458e-05, + "loss": 0.8395, + "step": 2881 + }, + { + "epoch": 0.21807725776550263, + "grad_norm": 2.1728861331939697, + "learning_rate": 1.6645321108859894e-05, + "loss": 0.8088, + "step": 2882 + }, + { + "epoch": 0.21815292648783624, + "grad_norm": 2.3152570724487305, + "learning_rate": 1.6644132553281592e-05, + "loss": 0.6268, + "step": 2883 + }, + { + "epoch": 0.21822859521016988, + "grad_norm": 4.68181037902832, + "learning_rate": 1.6642943519002983e-05, + "loss": 0.8115, + "step": 2884 + }, + { + "epoch": 0.2183042639325035, + "grad_norm": 2.9083902835845947, + "learning_rate": 1.6641754006098537e-05, + "loss": 0.9005, + "step": 2885 + }, + { + "epoch": 0.21837993265483713, + "grad_norm": 2.559480667114258, + "learning_rate": 1.6640564014642732e-05, + "loss": 0.7178, + "step": 2886 + }, + { + "epoch": 0.21845560137717074, + "grad_norm": 2.6691970825195312, + "learning_rate": 1.66393735447101e-05, + "loss": 0.6153, + "step": 2887 + }, + { + "epoch": 0.21853127009950438, + "grad_norm": 2.705824851989746, + "learning_rate": 1.663818259637519e-05, + "loss": 0.9264, + "step": 2888 + }, + { + "epoch": 0.218606938821838, + "grad_norm": 2.3913021087646484, + "learning_rate": 1.6636991169712577e-05, + "loss": 0.6872, + "step": 2889 + }, + { + "epoch": 0.21868260754417163, + "grad_norm": 2.4797730445861816, + "learning_rate": 1.6635799264796877e-05, + "loss": 0.8335, + "step": 2890 + }, + { + "epoch": 0.21875827626650524, + "grad_norm": 2.379905939102173, + "learning_rate": 1.663460688170273e-05, + "loss": 0.8438, + "step": 2891 + }, + { + "epoch": 0.21883394498883887, + "grad_norm": 2.6139721870422363, + "learning_rate": 1.6633414020504805e-05, + "loss": 0.7048, + "step": 2892 + }, + { + "epoch": 0.21890961371117248, + "grad_norm": 2.9872958660125732, + "learning_rate": 1.6632220681277806e-05, + "loss": 0.7684, + "step": 2893 + }, + { + "epoch": 0.21898528243350612, + "grad_norm": 1.7575455904006958, + "learning_rate": 1.6631026864096465e-05, + "loss": 0.9336, + "step": 2894 + }, + { + "epoch": 0.21906095115583973, + "grad_norm": 2.184025287628174, + "learning_rate": 1.6629832569035537e-05, + "loss": 0.7969, + "step": 2895 + }, + { + "epoch": 0.21913661987817334, + "grad_norm": 2.2860093116760254, + "learning_rate": 
1.6628637796169815e-05, + "loss": 0.7119, + "step": 2896 + }, + { + "epoch": 0.21921228860050698, + "grad_norm": 2.7359209060668945, + "learning_rate": 1.6627442545574122e-05, + "loss": 0.696, + "step": 2897 + }, + { + "epoch": 0.2192879573228406, + "grad_norm": 2.2823524475097656, + "learning_rate": 1.6626246817323307e-05, + "loss": 0.7375, + "step": 2898 + }, + { + "epoch": 0.21936362604517423, + "grad_norm": 2.211632490158081, + "learning_rate": 1.6625050611492246e-05, + "loss": 0.8292, + "step": 2899 + }, + { + "epoch": 0.21943929476750784, + "grad_norm": 2.5959115028381348, + "learning_rate": 1.6623853928155857e-05, + "loss": 0.9269, + "step": 2900 + }, + { + "epoch": 0.21951496348984148, + "grad_norm": 2.787301778793335, + "learning_rate": 1.6622656767389077e-05, + "loss": 0.8806, + "step": 2901 + }, + { + "epoch": 0.2195906322121751, + "grad_norm": 2.558061361312866, + "learning_rate": 1.6621459129266875e-05, + "loss": 0.8496, + "step": 2902 + }, + { + "epoch": 0.21966630093450873, + "grad_norm": 2.832548141479492, + "learning_rate": 1.662026101386425e-05, + "loss": 0.9092, + "step": 2903 + }, + { + "epoch": 0.21974196965684234, + "grad_norm": 2.5366275310516357, + "learning_rate": 1.6619062421256235e-05, + "loss": 0.7832, + "step": 2904 + }, + { + "epoch": 0.21981763837917598, + "grad_norm": 2.213609218597412, + "learning_rate": 1.6617863351517885e-05, + "loss": 0.9086, + "step": 2905 + }, + { + "epoch": 0.2198933071015096, + "grad_norm": 2.412593126296997, + "learning_rate": 1.6616663804724297e-05, + "loss": 0.8328, + "step": 2906 + }, + { + "epoch": 0.21996897582384323, + "grad_norm": 2.010378122329712, + "learning_rate": 1.6615463780950583e-05, + "loss": 0.8728, + "step": 2907 + }, + { + "epoch": 0.22004464454617684, + "grad_norm": 2.071139335632324, + "learning_rate": 1.66142632802719e-05, + "loss": 0.6447, + "step": 2908 + }, + { + "epoch": 0.22012031326851045, + "grad_norm": 2.2854831218719482, + "learning_rate": 1.6613062302763417e-05, + "loss": 0.7125, + "step": 2909 + }, + { + "epoch": 0.2201959819908441, + "grad_norm": 2.3209803104400635, + "learning_rate": 1.6611860848500354e-05, + "loss": 0.7449, + "step": 2910 + }, + { + "epoch": 0.2202716507131777, + "grad_norm": 2.6688926219940186, + "learning_rate": 1.6610658917557942e-05, + "loss": 0.7961, + "step": 2911 + }, + { + "epoch": 0.22034731943551134, + "grad_norm": 2.9156992435455322, + "learning_rate": 1.6609456510011454e-05, + "loss": 0.7054, + "step": 2912 + }, + { + "epoch": 0.22042298815784495, + "grad_norm": 2.726867437362671, + "learning_rate": 1.6608253625936185e-05, + "loss": 0.9102, + "step": 2913 + }, + { + "epoch": 0.22049865688017858, + "grad_norm": 2.408320903778076, + "learning_rate": 1.6607050265407473e-05, + "loss": 0.6769, + "step": 2914 + }, + { + "epoch": 0.2205743256025122, + "grad_norm": 2.9164679050445557, + "learning_rate": 1.660584642850066e-05, + "loss": 0.8634, + "step": 2915 + }, + { + "epoch": 0.22064999432484583, + "grad_norm": 2.25538969039917, + "learning_rate": 1.660464211529115e-05, + "loss": 0.6191, + "step": 2916 + }, + { + "epoch": 0.22072566304717944, + "grad_norm": 2.494601249694824, + "learning_rate": 1.660343732585435e-05, + "loss": 0.8541, + "step": 2917 + }, + { + "epoch": 0.22080133176951308, + "grad_norm": 2.4575066566467285, + "learning_rate": 1.6602232060265712e-05, + "loss": 0.7013, + "step": 2918 + }, + { + "epoch": 0.2208770004918467, + "grad_norm": 2.584141731262207, + "learning_rate": 1.660102631860072e-05, + "loss": 0.8288, + "step": 2919 + }, + { + "epoch": 
0.22095266921418033, + "grad_norm": 2.5871658325195312, + "learning_rate": 1.659982010093487e-05, + "loss": 0.8565, + "step": 2920 + }, + { + "epoch": 0.22102833793651394, + "grad_norm": 3.7710883617401123, + "learning_rate": 1.6598613407343707e-05, + "loss": 0.9081, + "step": 2921 + }, + { + "epoch": 0.22110400665884755, + "grad_norm": 2.9032387733459473, + "learning_rate": 1.659740623790279e-05, + "loss": 0.8016, + "step": 2922 + }, + { + "epoch": 0.2211796753811812, + "grad_norm": 2.4792866706848145, + "learning_rate": 1.6596198592687727e-05, + "loss": 0.6606, + "step": 2923 + }, + { + "epoch": 0.2212553441035148, + "grad_norm": 2.214895486831665, + "learning_rate": 1.6594990471774135e-05, + "loss": 0.7439, + "step": 2924 + }, + { + "epoch": 0.22133101282584844, + "grad_norm": 2.056378126144409, + "learning_rate": 1.659378187523768e-05, + "loss": 0.794, + "step": 2925 + }, + { + "epoch": 0.22140668154818205, + "grad_norm": 2.1232733726501465, + "learning_rate": 1.659257280315404e-05, + "loss": 0.7306, + "step": 2926 + }, + { + "epoch": 0.2214823502705157, + "grad_norm": 2.4625091552734375, + "learning_rate": 1.659136325559893e-05, + "loss": 0.8396, + "step": 2927 + }, + { + "epoch": 0.2215580189928493, + "grad_norm": 2.7897610664367676, + "learning_rate": 1.6590153232648106e-05, + "loss": 0.8533, + "step": 2928 + }, + { + "epoch": 0.22163368771518294, + "grad_norm": 2.2553727626800537, + "learning_rate": 1.6588942734377333e-05, + "loss": 0.6447, + "step": 2929 + }, + { + "epoch": 0.22170935643751655, + "grad_norm": 2.372699737548828, + "learning_rate": 1.658773176086242e-05, + "loss": 0.7301, + "step": 2930 + }, + { + "epoch": 0.22178502515985019, + "grad_norm": 2.075169086456299, + "learning_rate": 1.6586520312179203e-05, + "loss": 0.7797, + "step": 2931 + }, + { + "epoch": 0.2218606938821838, + "grad_norm": 1.9998425245285034, + "learning_rate": 1.658530838840355e-05, + "loss": 0.8402, + "step": 2932 + }, + { + "epoch": 0.22193636260451743, + "grad_norm": 2.4670629501342773, + "learning_rate": 1.658409598961135e-05, + "loss": 0.7589, + "step": 2933 + }, + { + "epoch": 0.22201203132685104, + "grad_norm": 1.9075826406478882, + "learning_rate": 1.6582883115878526e-05, + "loss": 0.7386, + "step": 2934 + }, + { + "epoch": 0.22208770004918468, + "grad_norm": 2.3455803394317627, + "learning_rate": 1.6581669767281037e-05, + "loss": 0.8606, + "step": 2935 + }, + { + "epoch": 0.2221633687715183, + "grad_norm": 3.289113998413086, + "learning_rate": 1.6580455943894866e-05, + "loss": 0.5393, + "step": 2936 + }, + { + "epoch": 0.2222390374938519, + "grad_norm": 1.9734731912612915, + "learning_rate": 1.6579241645796026e-05, + "loss": 0.8515, + "step": 2937 + }, + { + "epoch": 0.22231470621618554, + "grad_norm": 2.288149118423462, + "learning_rate": 1.6578026873060556e-05, + "loss": 0.7706, + "step": 2938 + }, + { + "epoch": 0.22239037493851915, + "grad_norm": 2.2239696979522705, + "learning_rate": 1.6576811625764537e-05, + "loss": 0.882, + "step": 2939 + }, + { + "epoch": 0.2224660436608528, + "grad_norm": 2.2956271171569824, + "learning_rate": 1.6575595903984065e-05, + "loss": 0.704, + "step": 2940 + }, + { + "epoch": 0.2225417123831864, + "grad_norm": 2.0809905529022217, + "learning_rate": 1.6574379707795277e-05, + "loss": 1.0292, + "step": 2941 + }, + { + "epoch": 0.22261738110552004, + "grad_norm": 2.3830835819244385, + "learning_rate": 1.6573163037274333e-05, + "loss": 0.6888, + "step": 2942 + }, + { + "epoch": 0.22269304982785365, + "grad_norm": 2.550462007522583, + "learning_rate": 
1.6571945892497423e-05, + "loss": 0.7566, + "step": 2943 + }, + { + "epoch": 0.2227687185501873, + "grad_norm": 2.1050031185150146, + "learning_rate": 1.6570728273540773e-05, + "loss": 0.7667, + "step": 2944 + }, + { + "epoch": 0.2228443872725209, + "grad_norm": 3.8642396926879883, + "learning_rate": 1.6569510180480632e-05, + "loss": 0.7821, + "step": 2945 + }, + { + "epoch": 0.22292005599485454, + "grad_norm": 2.2704646587371826, + "learning_rate": 1.656829161339328e-05, + "loss": 0.8253, + "step": 2946 + }, + { + "epoch": 0.22299572471718815, + "grad_norm": 4.2045111656188965, + "learning_rate": 1.6567072572355026e-05, + "loss": 0.7635, + "step": 2947 + }, + { + "epoch": 0.2230713934395218, + "grad_norm": 2.0793330669403076, + "learning_rate": 1.656585305744222e-05, + "loss": 0.8678, + "step": 2948 + }, + { + "epoch": 0.2231470621618554, + "grad_norm": 2.1388299465179443, + "learning_rate": 1.6564633068731215e-05, + "loss": 0.7312, + "step": 2949 + }, + { + "epoch": 0.223222730884189, + "grad_norm": 2.120715856552124, + "learning_rate": 1.6563412606298426e-05, + "loss": 0.7689, + "step": 2950 + }, + { + "epoch": 0.22329839960652265, + "grad_norm": 2.12400484085083, + "learning_rate": 1.6562191670220272e-05, + "loss": 0.6913, + "step": 2951 + }, + { + "epoch": 0.22337406832885626, + "grad_norm": 2.987435817718506, + "learning_rate": 1.656097026057322e-05, + "loss": 0.7776, + "step": 2952 + }, + { + "epoch": 0.2234497370511899, + "grad_norm": 2.098076105117798, + "learning_rate": 1.6559748377433756e-05, + "loss": 0.66, + "step": 2953 + }, + { + "epoch": 0.2235254057735235, + "grad_norm": 2.531951904296875, + "learning_rate": 1.6558526020878395e-05, + "loss": 0.8598, + "step": 2954 + }, + { + "epoch": 0.22360107449585714, + "grad_norm": 2.2897439002990723, + "learning_rate": 1.655730319098369e-05, + "loss": 0.8132, + "step": 2955 + }, + { + "epoch": 0.22367674321819075, + "grad_norm": 2.279578924179077, + "learning_rate": 1.6556079887826215e-05, + "loss": 0.7632, + "step": 2956 + }, + { + "epoch": 0.2237524119405244, + "grad_norm": 2.306779623031616, + "learning_rate": 1.6554856111482576e-05, + "loss": 0.8243, + "step": 2957 + }, + { + "epoch": 0.223828080662858, + "grad_norm": 2.461686849594116, + "learning_rate": 1.6553631862029413e-05, + "loss": 0.7213, + "step": 2958 + }, + { + "epoch": 0.22390374938519164, + "grad_norm": 2.0661559104919434, + "learning_rate": 1.6552407139543393e-05, + "loss": 0.8158, + "step": 2959 + }, + { + "epoch": 0.22397941810752525, + "grad_norm": 2.2545764446258545, + "learning_rate": 1.655118194410121e-05, + "loss": 0.7361, + "step": 2960 + }, + { + "epoch": 0.2240550868298589, + "grad_norm": 2.0152387619018555, + "learning_rate": 1.6549956275779588e-05, + "loss": 0.7121, + "step": 2961 + }, + { + "epoch": 0.2241307555521925, + "grad_norm": 2.965620756149292, + "learning_rate": 1.6548730134655286e-05, + "loss": 0.6965, + "step": 2962 + }, + { + "epoch": 0.2242064242745261, + "grad_norm": 2.6358821392059326, + "learning_rate": 1.6547503520805087e-05, + "loss": 0.7382, + "step": 2963 + }, + { + "epoch": 0.22428209299685975, + "grad_norm": 3.588127613067627, + "learning_rate": 1.6546276434305805e-05, + "loss": 0.868, + "step": 2964 + }, + { + "epoch": 0.22435776171919336, + "grad_norm": 2.1781110763549805, + "learning_rate": 1.654504887523429e-05, + "loss": 0.829, + "step": 2965 + }, + { + "epoch": 0.224433430441527, + "grad_norm": 2.047546863555908, + "learning_rate": 1.6543820843667405e-05, + "loss": 0.6987, + "step": 2966 + }, + { + "epoch": 0.2245090991638606, 
+ "grad_norm": 2.5507969856262207, + "learning_rate": 1.654259233968206e-05, + "loss": 0.6387, + "step": 2967 + }, + { + "epoch": 0.22458476788619425, + "grad_norm": 2.1714725494384766, + "learning_rate": 1.654136336335519e-05, + "loss": 0.8598, + "step": 2968 + }, + { + "epoch": 0.22466043660852786, + "grad_norm": 2.6356706619262695, + "learning_rate": 1.654013391476375e-05, + "loss": 0.8211, + "step": 2969 + }, + { + "epoch": 0.2247361053308615, + "grad_norm": 2.035926342010498, + "learning_rate": 1.653890399398474e-05, + "loss": 0.6791, + "step": 2970 + }, + { + "epoch": 0.2248117740531951, + "grad_norm": 2.380887508392334, + "learning_rate": 1.6537673601095178e-05, + "loss": 0.8578, + "step": 2971 + }, + { + "epoch": 0.22488744277552875, + "grad_norm": 2.41847562789917, + "learning_rate": 1.6536442736172114e-05, + "loss": 0.8885, + "step": 2972 + }, + { + "epoch": 0.22496311149786236, + "grad_norm": 2.5144875049591064, + "learning_rate": 1.653521139929263e-05, + "loss": 0.8791, + "step": 2973 + }, + { + "epoch": 0.225038780220196, + "grad_norm": 2.021480083465576, + "learning_rate": 1.6533979590533838e-05, + "loss": 0.8099, + "step": 2974 + }, + { + "epoch": 0.2251144489425296, + "grad_norm": 2.1968541145324707, + "learning_rate": 1.6532747309972876e-05, + "loss": 0.762, + "step": 2975 + }, + { + "epoch": 0.22519011766486322, + "grad_norm": 1.8280404806137085, + "learning_rate": 1.6531514557686913e-05, + "loss": 0.7402, + "step": 2976 + }, + { + "epoch": 0.22526578638719685, + "grad_norm": 2.3831984996795654, + "learning_rate": 1.6530281333753148e-05, + "loss": 0.7373, + "step": 2977 + }, + { + "epoch": 0.22534145510953046, + "grad_norm": 2.2614855766296387, + "learning_rate": 1.6529047638248808e-05, + "loss": 0.7542, + "step": 2978 + }, + { + "epoch": 0.2254171238318641, + "grad_norm": 2.100390911102295, + "learning_rate": 1.6527813471251158e-05, + "loss": 0.7678, + "step": 2979 + }, + { + "epoch": 0.2254927925541977, + "grad_norm": 2.33455491065979, + "learning_rate": 1.6526578832837476e-05, + "loss": 0.8819, + "step": 2980 + }, + { + "epoch": 0.22556846127653135, + "grad_norm": 1.782531976699829, + "learning_rate": 1.6525343723085085e-05, + "loss": 0.6764, + "step": 2981 + }, + { + "epoch": 0.22564412999886496, + "grad_norm": 2.2353334426879883, + "learning_rate": 1.652410814207133e-05, + "loss": 0.7287, + "step": 2982 + }, + { + "epoch": 0.2257197987211986, + "grad_norm": 3.260190486907959, + "learning_rate": 1.652287208987359e-05, + "loss": 0.8681, + "step": 2983 + }, + { + "epoch": 0.2257954674435322, + "grad_norm": 2.1882760524749756, + "learning_rate": 1.6521635566569266e-05, + "loss": 0.8207, + "step": 2984 + }, + { + "epoch": 0.22587113616586585, + "grad_norm": 2.4284403324127197, + "learning_rate": 1.6520398572235794e-05, + "loss": 0.8992, + "step": 2985 + }, + { + "epoch": 0.22594680488819946, + "grad_norm": 2.3258399963378906, + "learning_rate": 1.6519161106950638e-05, + "loss": 0.9157, + "step": 2986 + }, + { + "epoch": 0.2260224736105331, + "grad_norm": 2.984490156173706, + "learning_rate": 1.6517923170791298e-05, + "loss": 0.8868, + "step": 2987 + }, + { + "epoch": 0.2260981423328667, + "grad_norm": 2.800849437713623, + "learning_rate": 1.651668476383529e-05, + "loss": 0.7888, + "step": 2988 + }, + { + "epoch": 0.22617381105520035, + "grad_norm": 2.9878838062286377, + "learning_rate": 1.651544588616017e-05, + "loss": 0.7345, + "step": 2989 + }, + { + "epoch": 0.22624947977753396, + "grad_norm": 3.69124174118042, + "learning_rate": 1.651420653784352e-05, + "loss": 0.744, 
+ "step": 2990 + }, + { + "epoch": 0.22632514849986757, + "grad_norm": 2.3240010738372803, + "learning_rate": 1.6512966718962958e-05, + "loss": 0.7195, + "step": 2991 + }, + { + "epoch": 0.2264008172222012, + "grad_norm": 2.4493560791015625, + "learning_rate": 1.6511726429596115e-05, + "loss": 0.8408, + "step": 2992 + }, + { + "epoch": 0.22647648594453482, + "grad_norm": 2.2705140113830566, + "learning_rate": 1.6510485669820668e-05, + "loss": 0.601, + "step": 2993 + }, + { + "epoch": 0.22655215466686845, + "grad_norm": 2.269789695739746, + "learning_rate": 1.6509244439714317e-05, + "loss": 0.8425, + "step": 2994 + }, + { + "epoch": 0.22662782338920207, + "grad_norm": 2.8810999393463135, + "learning_rate": 1.6508002739354793e-05, + "loss": 0.7285, + "step": 2995 + }, + { + "epoch": 0.2267034921115357, + "grad_norm": 2.548349142074585, + "learning_rate": 1.650676056881985e-05, + "loss": 0.7676, + "step": 2996 + }, + { + "epoch": 0.22677916083386931, + "grad_norm": 3.094545841217041, + "learning_rate": 1.6505517928187282e-05, + "loss": 0.9013, + "step": 2997 + }, + { + "epoch": 0.22685482955620295, + "grad_norm": 2.7249162197113037, + "learning_rate": 1.6504274817534906e-05, + "loss": 0.6787, + "step": 2998 + }, + { + "epoch": 0.22693049827853656, + "grad_norm": 2.3722381591796875, + "learning_rate": 1.650303123694057e-05, + "loss": 0.6625, + "step": 2999 + }, + { + "epoch": 0.2270061670008702, + "grad_norm": 2.4005649089813232, + "learning_rate": 1.650178718648215e-05, + "loss": 0.7726, + "step": 3000 + }, + { + "epoch": 0.2270818357232038, + "grad_norm": 2.658586263656616, + "learning_rate": 1.6500542666237553e-05, + "loss": 0.7898, + "step": 3001 + }, + { + "epoch": 0.22715750444553745, + "grad_norm": 2.125000238418579, + "learning_rate": 1.649929767628471e-05, + "loss": 0.7455, + "step": 3002 + }, + { + "epoch": 0.22723317316787106, + "grad_norm": 2.5988640785217285, + "learning_rate": 1.6498052216701595e-05, + "loss": 0.7898, + "step": 3003 + }, + { + "epoch": 0.22730884189020467, + "grad_norm": 2.0731637477874756, + "learning_rate": 1.64968062875662e-05, + "loss": 0.6588, + "step": 3004 + }, + { + "epoch": 0.2273845106125383, + "grad_norm": 2.0830650329589844, + "learning_rate": 1.6495559888956544e-05, + "loss": 0.7186, + "step": 3005 + }, + { + "epoch": 0.22746017933487192, + "grad_norm": 4.79794979095459, + "learning_rate": 1.6494313020950687e-05, + "loss": 0.7021, + "step": 3006 + }, + { + "epoch": 0.22753584805720556, + "grad_norm": 3.0674867630004883, + "learning_rate": 1.6493065683626706e-05, + "loss": 0.639, + "step": 3007 + }, + { + "epoch": 0.22761151677953917, + "grad_norm": 2.5474140644073486, + "learning_rate": 1.6491817877062718e-05, + "loss": 0.7456, + "step": 3008 + }, + { + "epoch": 0.2276871855018728, + "grad_norm": 3.6565370559692383, + "learning_rate": 1.6490569601336864e-05, + "loss": 0.7771, + "step": 3009 + }, + { + "epoch": 0.22776285422420642, + "grad_norm": 2.16518497467041, + "learning_rate": 1.6489320856527312e-05, + "loss": 0.7738, + "step": 3010 + }, + { + "epoch": 0.22783852294654006, + "grad_norm": 2.3201420307159424, + "learning_rate": 1.648807164271227e-05, + "loss": 0.7261, + "step": 3011 + }, + { + "epoch": 0.22791419166887367, + "grad_norm": 2.659714698791504, + "learning_rate": 1.6486821959969954e-05, + "loss": 0.7438, + "step": 3012 + }, + { + "epoch": 0.2279898603912073, + "grad_norm": 2.2226853370666504, + "learning_rate": 1.6485571808378637e-05, + "loss": 0.6663, + "step": 3013 + }, + { + "epoch": 0.22806552911354092, + "grad_norm": 
2.8635661602020264, + "learning_rate": 1.64843211880166e-05, + "loss": 0.6857, + "step": 3014 + }, + { + "epoch": 0.22814119783587455, + "grad_norm": 2.4834866523742676, + "learning_rate": 1.6483070098962165e-05, + "loss": 0.7628, + "step": 3015 + }, + { + "epoch": 0.22821686655820816, + "grad_norm": 2.6139743328094482, + "learning_rate": 1.6481818541293675e-05, + "loss": 0.8111, + "step": 3016 + }, + { + "epoch": 0.22829253528054178, + "grad_norm": 2.68338942527771, + "learning_rate": 1.648056651508951e-05, + "loss": 0.6424, + "step": 3017 + }, + { + "epoch": 0.2283682040028754, + "grad_norm": 2.5253429412841797, + "learning_rate": 1.6479314020428078e-05, + "loss": 0.8043, + "step": 3018 + }, + { + "epoch": 0.22844387272520902, + "grad_norm": 3.081286907196045, + "learning_rate": 1.6478061057387804e-05, + "loss": 0.785, + "step": 3019 + }, + { + "epoch": 0.22851954144754266, + "grad_norm": 2.6442153453826904, + "learning_rate": 1.6476807626047164e-05, + "loss": 0.6959, + "step": 3020 + }, + { + "epoch": 0.22859521016987627, + "grad_norm": 2.3536460399627686, + "learning_rate": 1.6475553726484645e-05, + "loss": 0.8099, + "step": 3021 + }, + { + "epoch": 0.2286708788922099, + "grad_norm": 2.2801482677459717, + "learning_rate": 1.647429935877878e-05, + "loss": 0.8345, + "step": 3022 + }, + { + "epoch": 0.22874654761454352, + "grad_norm": 2.2949955463409424, + "learning_rate": 1.6473044523008106e-05, + "loss": 0.9149, + "step": 3023 + }, + { + "epoch": 0.22882221633687716, + "grad_norm": 2.6356618404388428, + "learning_rate": 1.6471789219251216e-05, + "loss": 0.7342, + "step": 3024 + }, + { + "epoch": 0.22889788505921077, + "grad_norm": 2.3132026195526123, + "learning_rate": 1.647053344758672e-05, + "loss": 0.6673, + "step": 3025 + }, + { + "epoch": 0.2289735537815444, + "grad_norm": 2.86631178855896, + "learning_rate": 1.6469277208093256e-05, + "loss": 0.8654, + "step": 3026 + }, + { + "epoch": 0.22904922250387802, + "grad_norm": 2.4042892456054688, + "learning_rate": 1.646802050084949e-05, + "loss": 0.7001, + "step": 3027 + }, + { + "epoch": 0.22912489122621166, + "grad_norm": 2.6637120246887207, + "learning_rate": 1.6466763325934133e-05, + "loss": 0.7731, + "step": 3028 + }, + { + "epoch": 0.22920055994854527, + "grad_norm": 3.4634320735931396, + "learning_rate": 1.64655056834259e-05, + "loss": 0.9874, + "step": 3029 + }, + { + "epoch": 0.22927622867087888, + "grad_norm": 2.559786558151245, + "learning_rate": 1.646424757340356e-05, + "loss": 0.788, + "step": 3030 + }, + { + "epoch": 0.22935189739321252, + "grad_norm": 2.1549887657165527, + "learning_rate": 1.646298899594589e-05, + "loss": 0.7187, + "step": 3031 + }, + { + "epoch": 0.22942756611554613, + "grad_norm": 2.1260504722595215, + "learning_rate": 1.6461729951131712e-05, + "loss": 0.7892, + "step": 3032 + }, + { + "epoch": 0.22950323483787977, + "grad_norm": 2.535245656967163, + "learning_rate": 1.6460470439039874e-05, + "loss": 0.7686, + "step": 3033 + }, + { + "epoch": 0.22957890356021338, + "grad_norm": 2.4702205657958984, + "learning_rate": 1.6459210459749244e-05, + "loss": 0.9208, + "step": 3034 + }, + { + "epoch": 0.22965457228254701, + "grad_norm": 2.4642333984375, + "learning_rate": 1.645795001333873e-05, + "loss": 0.745, + "step": 3035 + }, + { + "epoch": 0.22973024100488063, + "grad_norm": 2.5127851963043213, + "learning_rate": 1.6456689099887263e-05, + "loss": 0.9197, + "step": 3036 + }, + { + "epoch": 0.22980590972721426, + "grad_norm": 2.6181676387786865, + "learning_rate": 1.6455427719473806e-05, + "loss": 0.6855, + 
"step": 3037 + }, + { + "epoch": 0.22988157844954787, + "grad_norm": 2.3293540477752686, + "learning_rate": 1.6454165872177354e-05, + "loss": 0.6547, + "step": 3038 + }, + { + "epoch": 0.2299572471718815, + "grad_norm": 2.4066195487976074, + "learning_rate": 1.6452903558076925e-05, + "loss": 0.8718, + "step": 3039 + }, + { + "epoch": 0.23003291589421512, + "grad_norm": 2.3516995906829834, + "learning_rate": 1.6451640777251567e-05, + "loss": 0.7497, + "step": 3040 + }, + { + "epoch": 0.23010858461654876, + "grad_norm": 2.0895943641662598, + "learning_rate": 1.6450377529780363e-05, + "loss": 0.768, + "step": 3041 + }, + { + "epoch": 0.23018425333888237, + "grad_norm": 2.606595039367676, + "learning_rate": 1.6449113815742422e-05, + "loss": 0.7996, + "step": 3042 + }, + { + "epoch": 0.23025992206121598, + "grad_norm": 2.8909833431243896, + "learning_rate": 1.644784963521688e-05, + "loss": 0.8167, + "step": 3043 + }, + { + "epoch": 0.23033559078354962, + "grad_norm": 2.0377085208892822, + "learning_rate": 1.6446584988282907e-05, + "loss": 0.7504, + "step": 3044 + }, + { + "epoch": 0.23041125950588323, + "grad_norm": 2.66420841217041, + "learning_rate": 1.6445319875019694e-05, + "loss": 0.6552, + "step": 3045 + }, + { + "epoch": 0.23048692822821687, + "grad_norm": 2.240417718887329, + "learning_rate": 1.644405429550647e-05, + "loss": 0.7903, + "step": 3046 + }, + { + "epoch": 0.23056259695055048, + "grad_norm": 2.7434213161468506, + "learning_rate": 1.6442788249822486e-05, + "loss": 0.8382, + "step": 3047 + }, + { + "epoch": 0.23063826567288412, + "grad_norm": 2.1214497089385986, + "learning_rate": 1.6441521738047033e-05, + "loss": 0.8043, + "step": 3048 + }, + { + "epoch": 0.23071393439521773, + "grad_norm": 2.15208101272583, + "learning_rate": 1.6440254760259416e-05, + "loss": 0.9723, + "step": 3049 + }, + { + "epoch": 0.23078960311755137, + "grad_norm": 3.5610761642456055, + "learning_rate": 1.6438987316538985e-05, + "loss": 0.8386, + "step": 3050 + }, + { + "epoch": 0.23086527183988498, + "grad_norm": 2.2690629959106445, + "learning_rate": 1.643771940696511e-05, + "loss": 0.7666, + "step": 3051 + }, + { + "epoch": 0.23094094056221862, + "grad_norm": 2.7488620281219482, + "learning_rate": 1.6436451031617182e-05, + "loss": 0.738, + "step": 3052 + }, + { + "epoch": 0.23101660928455223, + "grad_norm": 2.229719877243042, + "learning_rate": 1.6435182190574643e-05, + "loss": 0.8551, + "step": 3053 + }, + { + "epoch": 0.23109227800688587, + "grad_norm": 2.4451205730438232, + "learning_rate": 1.6433912883916944e-05, + "loss": 0.688, + "step": 3054 + }, + { + "epoch": 0.23116794672921948, + "grad_norm": 2.5764944553375244, + "learning_rate": 1.6432643111723578e-05, + "loss": 0.7756, + "step": 3055 + }, + { + "epoch": 0.23124361545155311, + "grad_norm": 2.299663782119751, + "learning_rate": 1.6431372874074057e-05, + "loss": 0.7981, + "step": 3056 + }, + { + "epoch": 0.23131928417388672, + "grad_norm": 2.5289340019226074, + "learning_rate": 1.6430102171047935e-05, + "loss": 0.6726, + "step": 3057 + }, + { + "epoch": 0.23139495289622034, + "grad_norm": 3.0668952465057373, + "learning_rate": 1.6428831002724782e-05, + "loss": 0.8785, + "step": 3058 + }, + { + "epoch": 0.23147062161855397, + "grad_norm": 2.457798480987549, + "learning_rate": 1.6427559369184202e-05, + "loss": 0.7681, + "step": 3059 + }, + { + "epoch": 0.23154629034088758, + "grad_norm": 2.192192316055298, + "learning_rate": 1.6426287270505837e-05, + "loss": 0.7763, + "step": 3060 + }, + { + "epoch": 0.23162195906322122, + "grad_norm": 
2.4625298976898193, + "learning_rate": 1.6425014706769337e-05, + "loss": 0.6961, + "step": 3061 + }, + { + "epoch": 0.23169762778555483, + "grad_norm": 2.2496135234832764, + "learning_rate": 1.64237416780544e-05, + "loss": 0.8604, + "step": 3062 + }, + { + "epoch": 0.23177329650788847, + "grad_norm": 2.194805860519409, + "learning_rate": 1.642246818444075e-05, + "loss": 0.7537, + "step": 3063 + }, + { + "epoch": 0.23184896523022208, + "grad_norm": 2.501255512237549, + "learning_rate": 1.6421194226008138e-05, + "loss": 0.8666, + "step": 3064 + }, + { + "epoch": 0.23192463395255572, + "grad_norm": 2.5918145179748535, + "learning_rate": 1.6419919802836337e-05, + "loss": 0.6322, + "step": 3065 + }, + { + "epoch": 0.23200030267488933, + "grad_norm": 2.162069082260132, + "learning_rate": 1.641864491500516e-05, + "loss": 0.7903, + "step": 3066 + }, + { + "epoch": 0.23207597139722297, + "grad_norm": 2.95308518409729, + "learning_rate": 1.6417369562594444e-05, + "loss": 0.8792, + "step": 3067 + }, + { + "epoch": 0.23215164011955658, + "grad_norm": 2.692232131958008, + "learning_rate": 1.6416093745684054e-05, + "loss": 0.7282, + "step": 3068 + }, + { + "epoch": 0.23222730884189022, + "grad_norm": 2.6581435203552246, + "learning_rate": 1.6414817464353888e-05, + "loss": 0.8014, + "step": 3069 + }, + { + "epoch": 0.23230297756422383, + "grad_norm": 2.171360969543457, + "learning_rate": 1.6413540718683872e-05, + "loss": 0.7544, + "step": 3070 + }, + { + "epoch": 0.23237864628655744, + "grad_norm": 2.7274041175842285, + "learning_rate": 1.6412263508753952e-05, + "loss": 0.6874, + "step": 3071 + }, + { + "epoch": 0.23245431500889108, + "grad_norm": 2.1489665508270264, + "learning_rate": 1.6410985834644123e-05, + "loss": 0.7909, + "step": 3072 + }, + { + "epoch": 0.2325299837312247, + "grad_norm": 1.950114369392395, + "learning_rate": 1.6409707696434388e-05, + "loss": 0.7571, + "step": 3073 + }, + { + "epoch": 0.23260565245355833, + "grad_norm": 4.101839542388916, + "learning_rate": 1.640842909420479e-05, + "loss": 0.7565, + "step": 3074 + }, + { + "epoch": 0.23268132117589194, + "grad_norm": 2.2927932739257812, + "learning_rate": 1.6407150028035402e-05, + "loss": 0.6963, + "step": 3075 + }, + { + "epoch": 0.23275698989822557, + "grad_norm": 2.3485589027404785, + "learning_rate": 1.6405870498006326e-05, + "loss": 0.6547, + "step": 3076 + }, + { + "epoch": 0.23283265862055919, + "grad_norm": 2.404630422592163, + "learning_rate": 1.640459050419768e-05, + "loss": 0.8066, + "step": 3077 + }, + { + "epoch": 0.23290832734289282, + "grad_norm": 2.318694591522217, + "learning_rate": 1.640331004668963e-05, + "loss": 0.8715, + "step": 3078 + }, + { + "epoch": 0.23298399606522643, + "grad_norm": 2.4930219650268555, + "learning_rate": 1.6402029125562357e-05, + "loss": 0.7058, + "step": 3079 + }, + { + "epoch": 0.23305966478756007, + "grad_norm": 3.169928789138794, + "learning_rate": 1.640074774089608e-05, + "loss": 0.758, + "step": 3080 + }, + { + "epoch": 0.23313533350989368, + "grad_norm": 2.3577566146850586, + "learning_rate": 1.6399465892771045e-05, + "loss": 0.8166, + "step": 3081 + }, + { + "epoch": 0.23321100223222732, + "grad_norm": 2.1574015617370605, + "learning_rate": 1.6398183581267522e-05, + "loss": 0.8251, + "step": 3082 + }, + { + "epoch": 0.23328667095456093, + "grad_norm": 2.141531229019165, + "learning_rate": 1.639690080646581e-05, + "loss": 0.785, + "step": 3083 + }, + { + "epoch": 0.23336233967689454, + "grad_norm": 2.964078426361084, + "learning_rate": 1.639561756844625e-05, + "loss": 0.6618, + 
"step": 3084 + }, + { + "epoch": 0.23343800839922818, + "grad_norm": 2.588050603866577, + "learning_rate": 1.6394333867289198e-05, + "loss": 0.7254, + "step": 3085 + }, + { + "epoch": 0.2335136771215618, + "grad_norm": 3.1595702171325684, + "learning_rate": 1.639304970307504e-05, + "loss": 0.759, + "step": 3086 + }, + { + "epoch": 0.23358934584389543, + "grad_norm": 2.7546892166137695, + "learning_rate": 1.63917650758842e-05, + "loss": 0.8624, + "step": 3087 + }, + { + "epoch": 0.23366501456622904, + "grad_norm": 2.3731441497802734, + "learning_rate": 1.639047998579712e-05, + "loss": 0.7255, + "step": 3088 + }, + { + "epoch": 0.23374068328856268, + "grad_norm": 2.9596669673919678, + "learning_rate": 1.6389194432894283e-05, + "loss": 0.8804, + "step": 3089 + }, + { + "epoch": 0.2338163520108963, + "grad_norm": 2.525820732116699, + "learning_rate": 1.638790841725619e-05, + "loss": 0.8342, + "step": 3090 + }, + { + "epoch": 0.23389202073322993, + "grad_norm": 2.2738192081451416, + "learning_rate": 1.6386621938963375e-05, + "loss": 0.7513, + "step": 3091 + }, + { + "epoch": 0.23396768945556354, + "grad_norm": 1.9771041870117188, + "learning_rate": 1.6385334998096405e-05, + "loss": 0.8262, + "step": 3092 + }, + { + "epoch": 0.23404335817789718, + "grad_norm": 2.0869479179382324, + "learning_rate": 1.638404759473587e-05, + "loss": 0.7338, + "step": 3093 + }, + { + "epoch": 0.2341190269002308, + "grad_norm": 2.7630059719085693, + "learning_rate": 1.6382759728962392e-05, + "loss": 0.9286, + "step": 3094 + }, + { + "epoch": 0.23419469562256442, + "grad_norm": 2.188873291015625, + "learning_rate": 1.638147140085662e-05, + "loss": 0.7531, + "step": 3095 + }, + { + "epoch": 0.23427036434489804, + "grad_norm": 1.8856064081192017, + "learning_rate": 1.6380182610499234e-05, + "loss": 0.7139, + "step": 3096 + }, + { + "epoch": 0.23434603306723165, + "grad_norm": 2.62967848777771, + "learning_rate": 1.637889335797094e-05, + "loss": 0.8767, + "step": 3097 + }, + { + "epoch": 0.23442170178956528, + "grad_norm": 10.917149543762207, + "learning_rate": 1.6377603643352483e-05, + "loss": 0.8107, + "step": 3098 + }, + { + "epoch": 0.2344973705118989, + "grad_norm": 2.7370142936706543, + "learning_rate": 1.6376313466724624e-05, + "loss": 0.7963, + "step": 3099 + }, + { + "epoch": 0.23457303923423253, + "grad_norm": 1.870970606803894, + "learning_rate": 1.6375022828168153e-05, + "loss": 0.6093, + "step": 3100 + }, + { + "epoch": 0.23464870795656614, + "grad_norm": 2.654796600341797, + "learning_rate": 1.6373731727763902e-05, + "loss": 0.766, + "step": 3101 + }, + { + "epoch": 0.23472437667889978, + "grad_norm": 2.636352062225342, + "learning_rate": 1.6372440165592717e-05, + "loss": 0.7713, + "step": 3102 + }, + { + "epoch": 0.2348000454012334, + "grad_norm": 2.9713962078094482, + "learning_rate": 1.6371148141735488e-05, + "loss": 0.7387, + "step": 3103 + }, + { + "epoch": 0.23487571412356703, + "grad_norm": 2.823192596435547, + "learning_rate": 1.636985565627312e-05, + "loss": 0.7023, + "step": 3104 + }, + { + "epoch": 0.23495138284590064, + "grad_norm": 2.4954514503479004, + "learning_rate": 1.6368562709286553e-05, + "loss": 0.7699, + "step": 3105 + }, + { + "epoch": 0.23502705156823428, + "grad_norm": 2.6096956729888916, + "learning_rate": 1.6367269300856755e-05, + "loss": 0.6207, + "step": 3106 + }, + { + "epoch": 0.2351027202905679, + "grad_norm": 2.9008142948150635, + "learning_rate": 1.636597543106473e-05, + "loss": 0.8394, + "step": 3107 + }, + { + "epoch": 0.23517838901290153, + "grad_norm": 
1.9445158243179321, + "learning_rate": 1.636468109999149e-05, + "loss": 0.6827, + "step": 3108 + }, + { + "epoch": 0.23525405773523514, + "grad_norm": 2.0866832733154297, + "learning_rate": 1.6363386307718106e-05, + "loss": 0.8256, + "step": 3109 + }, + { + "epoch": 0.23532972645756878, + "grad_norm": 2.2782726287841797, + "learning_rate": 1.6362091054325657e-05, + "loss": 0.6447, + "step": 3110 + }, + { + "epoch": 0.2354053951799024, + "grad_norm": 2.6688876152038574, + "learning_rate": 1.636079533989525e-05, + "loss": 0.7119, + "step": 3111 + }, + { + "epoch": 0.235481063902236, + "grad_norm": 2.297727346420288, + "learning_rate": 1.6359499164508034e-05, + "loss": 0.9497, + "step": 3112 + }, + { + "epoch": 0.23555673262456964, + "grad_norm": 2.075216054916382, + "learning_rate": 1.6358202528245173e-05, + "loss": 0.8736, + "step": 3113 + }, + { + "epoch": 0.23563240134690325, + "grad_norm": 2.638568878173828, + "learning_rate": 1.6356905431187874e-05, + "loss": 0.7503, + "step": 3114 + }, + { + "epoch": 0.23570807006923689, + "grad_norm": 3.3016517162323, + "learning_rate": 1.635560787341736e-05, + "loss": 0.6998, + "step": 3115 + }, + { + "epoch": 0.2357837387915705, + "grad_norm": 2.7137296199798584, + "learning_rate": 1.635430985501489e-05, + "loss": 0.8485, + "step": 3116 + }, + { + "epoch": 0.23585940751390413, + "grad_norm": 2.4911835193634033, + "learning_rate": 1.6353011376061752e-05, + "loss": 0.7518, + "step": 3117 + }, + { + "epoch": 0.23593507623623775, + "grad_norm": 2.5743257999420166, + "learning_rate": 1.6351712436639254e-05, + "loss": 0.8575, + "step": 3118 + }, + { + "epoch": 0.23601074495857138, + "grad_norm": 1.9882807731628418, + "learning_rate": 1.635041303682875e-05, + "loss": 0.7866, + "step": 3119 + }, + { + "epoch": 0.236086413680905, + "grad_norm": 2.3012471199035645, + "learning_rate": 1.6349113176711606e-05, + "loss": 0.7392, + "step": 3120 + }, + { + "epoch": 0.23616208240323863, + "grad_norm": 2.3356645107269287, + "learning_rate": 1.6347812856369225e-05, + "loss": 0.7093, + "step": 3121 + }, + { + "epoch": 0.23623775112557224, + "grad_norm": 1.8433892726898193, + "learning_rate": 1.6346512075883035e-05, + "loss": 0.7198, + "step": 3122 + }, + { + "epoch": 0.23631341984790588, + "grad_norm": 2.4490842819213867, + "learning_rate": 1.6345210835334502e-05, + "loss": 0.8044, + "step": 3123 + }, + { + "epoch": 0.2363890885702395, + "grad_norm": 2.7389450073242188, + "learning_rate": 1.6343909134805106e-05, + "loss": 0.7449, + "step": 3124 + }, + { + "epoch": 0.2364647572925731, + "grad_norm": 2.162238836288452, + "learning_rate": 1.6342606974376367e-05, + "loss": 0.6568, + "step": 3125 + }, + { + "epoch": 0.23654042601490674, + "grad_norm": 2.0842134952545166, + "learning_rate": 1.634130435412983e-05, + "loss": 0.5613, + "step": 3126 + }, + { + "epoch": 0.23661609473724035, + "grad_norm": 2.217766284942627, + "learning_rate": 1.6340001274147074e-05, + "loss": 0.7913, + "step": 3127 + }, + { + "epoch": 0.236691763459574, + "grad_norm": 2.228721857070923, + "learning_rate": 1.6338697734509694e-05, + "loss": 0.8567, + "step": 3128 + }, + { + "epoch": 0.2367674321819076, + "grad_norm": 2.4052469730377197, + "learning_rate": 1.6337393735299325e-05, + "loss": 0.7, + "step": 3129 + }, + { + "epoch": 0.23684310090424124, + "grad_norm": 2.7400004863739014, + "learning_rate": 1.633608927659763e-05, + "loss": 0.7754, + "step": 3130 + }, + { + "epoch": 0.23691876962657485, + "grad_norm": 2.2973527908325195, + "learning_rate": 1.6334784358486296e-05, + "loss": 0.8086, + 
"step": 3131 + }, + { + "epoch": 0.2369944383489085, + "grad_norm": 2.4160079956054688, + "learning_rate": 1.6333478981047043e-05, + "loss": 0.7429, + "step": 3132 + }, + { + "epoch": 0.2370701070712421, + "grad_norm": 2.779785394668579, + "learning_rate": 1.6332173144361613e-05, + "loss": 0.6959, + "step": 3133 + }, + { + "epoch": 0.23714577579357574, + "grad_norm": 2.1350619792938232, + "learning_rate": 1.633086684851179e-05, + "loss": 0.8856, + "step": 3134 + }, + { + "epoch": 0.23722144451590935, + "grad_norm": 3.6366331577301025, + "learning_rate": 1.632956009357937e-05, + "loss": 0.6962, + "step": 3135 + }, + { + "epoch": 0.23729711323824298, + "grad_norm": 2.5807621479034424, + "learning_rate": 1.6328252879646195e-05, + "loss": 0.7641, + "step": 3136 + }, + { + "epoch": 0.2373727819605766, + "grad_norm": 2.8703842163085938, + "learning_rate": 1.632694520679412e-05, + "loss": 0.793, + "step": 3137 + }, + { + "epoch": 0.2374484506829102, + "grad_norm": 2.0771567821502686, + "learning_rate": 1.632563707510504e-05, + "loss": 0.7813, + "step": 3138 + }, + { + "epoch": 0.23752411940524384, + "grad_norm": 2.532623529434204, + "learning_rate": 1.6324328484660867e-05, + "loss": 0.722, + "step": 3139 + }, + { + "epoch": 0.23759978812757745, + "grad_norm": 1.9321879148483276, + "learning_rate": 1.632301943554356e-05, + "loss": 0.7015, + "step": 3140 + }, + { + "epoch": 0.2376754568499111, + "grad_norm": 2.117048978805542, + "learning_rate": 1.6321709927835087e-05, + "loss": 0.7204, + "step": 3141 + }, + { + "epoch": 0.2377511255722447, + "grad_norm": 2.478074550628662, + "learning_rate": 1.6320399961617458e-05, + "loss": 0.7196, + "step": 3142 + }, + { + "epoch": 0.23782679429457834, + "grad_norm": 2.4688057899475098, + "learning_rate": 1.6319089536972706e-05, + "loss": 0.7519, + "step": 3143 + }, + { + "epoch": 0.23790246301691195, + "grad_norm": 2.998124599456787, + "learning_rate": 1.6317778653982898e-05, + "loss": 0.8621, + "step": 3144 + }, + { + "epoch": 0.2379781317392456, + "grad_norm": 3.5607550144195557, + "learning_rate": 1.631646731273012e-05, + "loss": 0.748, + "step": 3145 + }, + { + "epoch": 0.2380538004615792, + "grad_norm": 2.3379786014556885, + "learning_rate": 1.631515551329649e-05, + "loss": 0.5764, + "step": 3146 + }, + { + "epoch": 0.23812946918391284, + "grad_norm": 2.382059097290039, + "learning_rate": 1.6313843255764167e-05, + "loss": 0.7674, + "step": 3147 + }, + { + "epoch": 0.23820513790624645, + "grad_norm": 2.690565586090088, + "learning_rate": 1.6312530540215322e-05, + "loss": 0.823, + "step": 3148 + }, + { + "epoch": 0.2382808066285801, + "grad_norm": 2.1526098251342773, + "learning_rate": 1.631121736673216e-05, + "loss": 0.8074, + "step": 3149 + }, + { + "epoch": 0.2383564753509137, + "grad_norm": 2.1651923656463623, + "learning_rate": 1.6309903735396925e-05, + "loss": 0.7172, + "step": 3150 + }, + { + "epoch": 0.2384321440732473, + "grad_norm": 2.0754897594451904, + "learning_rate": 1.6308589646291873e-05, + "loss": 0.7341, + "step": 3151 + }, + { + "epoch": 0.23850781279558095, + "grad_norm": 2.7566630840301514, + "learning_rate": 1.6307275099499297e-05, + "loss": 0.813, + "step": 3152 + }, + { + "epoch": 0.23858348151791456, + "grad_norm": 2.0758936405181885, + "learning_rate": 1.630596009510152e-05, + "loss": 0.7604, + "step": 3153 + }, + { + "epoch": 0.2386591502402482, + "grad_norm": 2.4266982078552246, + "learning_rate": 1.6304644633180893e-05, + "loss": 0.806, + "step": 3154 + }, + { + "epoch": 0.2387348189625818, + "grad_norm": 2.900785446166992, + 
"learning_rate": 1.630332871381979e-05, + "loss": 0.7293, + "step": 3155 + }, + { + "epoch": 0.23881048768491545, + "grad_norm": 2.0471670627593994, + "learning_rate": 1.6302012337100624e-05, + "loss": 0.7067, + "step": 3156 + }, + { + "epoch": 0.23888615640724906, + "grad_norm": 2.2892119884490967, + "learning_rate": 1.6300695503105825e-05, + "loss": 0.8938, + "step": 3157 + }, + { + "epoch": 0.2389618251295827, + "grad_norm": 1.9991192817687988, + "learning_rate": 1.629937821191786e-05, + "loss": 0.7893, + "step": 3158 + }, + { + "epoch": 0.2390374938519163, + "grad_norm": 2.1983959674835205, + "learning_rate": 1.6298060463619224e-05, + "loss": 0.6493, + "step": 3159 + }, + { + "epoch": 0.23911316257424994, + "grad_norm": 2.451901912689209, + "learning_rate": 1.629674225829244e-05, + "loss": 0.6745, + "step": 3160 + }, + { + "epoch": 0.23918883129658355, + "grad_norm": 2.5038340091705322, + "learning_rate": 1.6295423596020052e-05, + "loss": 0.8159, + "step": 3161 + }, + { + "epoch": 0.2392645000189172, + "grad_norm": 2.6183676719665527, + "learning_rate": 1.6294104476884643e-05, + "loss": 0.747, + "step": 3162 + }, + { + "epoch": 0.2393401687412508, + "grad_norm": 3.3365650177001953, + "learning_rate": 1.6292784900968818e-05, + "loss": 0.8315, + "step": 3163 + }, + { + "epoch": 0.23941583746358444, + "grad_norm": 2.1766538619995117, + "learning_rate": 1.6291464868355216e-05, + "loss": 0.7855, + "step": 3164 + }, + { + "epoch": 0.23949150618591805, + "grad_norm": 2.9773683547973633, + "learning_rate": 1.6290144379126498e-05, + "loss": 0.8054, + "step": 3165 + }, + { + "epoch": 0.23956717490825166, + "grad_norm": 2.252821445465088, + "learning_rate": 1.6288823433365365e-05, + "loss": 0.754, + "step": 3166 + }, + { + "epoch": 0.2396428436305853, + "grad_norm": 2.627490520477295, + "learning_rate": 1.628750203115453e-05, + "loss": 0.8792, + "step": 3167 + }, + { + "epoch": 0.2397185123529189, + "grad_norm": 2.516615867614746, + "learning_rate": 1.6286180172576748e-05, + "loss": 0.8076, + "step": 3168 + }, + { + "epoch": 0.23979418107525255, + "grad_norm": 3.4849398136138916, + "learning_rate": 1.6284857857714798e-05, + "loss": 0.7961, + "step": 3169 + }, + { + "epoch": 0.23986984979758616, + "grad_norm": 2.3548803329467773, + "learning_rate": 1.6283535086651487e-05, + "loss": 0.7389, + "step": 3170 + }, + { + "epoch": 0.2399455185199198, + "grad_norm": 2.4780502319335938, + "learning_rate": 1.6282211859469652e-05, + "loss": 0.7191, + "step": 3171 + }, + { + "epoch": 0.2400211872422534, + "grad_norm": 2.856328248977661, + "learning_rate": 1.6280888176252153e-05, + "loss": 0.8256, + "step": 3172 + }, + { + "epoch": 0.24009685596458705, + "grad_norm": 3.004737615585327, + "learning_rate": 1.627956403708189e-05, + "loss": 0.843, + "step": 3173 + }, + { + "epoch": 0.24017252468692066, + "grad_norm": 2.3179590702056885, + "learning_rate": 1.627823944204178e-05, + "loss": 0.7477, + "step": 3174 + }, + { + "epoch": 0.2402481934092543, + "grad_norm": 2.164140462875366, + "learning_rate": 1.627691439121478e-05, + "loss": 0.7412, + "step": 3175 + }, + { + "epoch": 0.2403238621315879, + "grad_norm": 2.5224568843841553, + "learning_rate": 1.6275588884683858e-05, + "loss": 0.6791, + "step": 3176 + }, + { + "epoch": 0.24039953085392154, + "grad_norm": 2.7676162719726562, + "learning_rate": 1.6274262922532033e-05, + "loss": 0.652, + "step": 3177 + }, + { + "epoch": 0.24047519957625516, + "grad_norm": 1.807481288909912, + "learning_rate": 1.6272936504842333e-05, + "loss": 0.93, + "step": 3178 + }, + { + 
"epoch": 0.24055086829858877, + "grad_norm": 2.958256483078003, + "learning_rate": 1.627160963169783e-05, + "loss": 0.7591, + "step": 3179 + }, + { + "epoch": 0.2406265370209224, + "grad_norm": 3.4339983463287354, + "learning_rate": 1.6270282303181606e-05, + "loss": 0.7686, + "step": 3180 + }, + { + "epoch": 0.24070220574325601, + "grad_norm": 8.310561180114746, + "learning_rate": 1.6268954519376792e-05, + "loss": 0.7554, + "step": 3181 + }, + { + "epoch": 0.24077787446558965, + "grad_norm": 1.9216821193695068, + "learning_rate": 1.6267626280366538e-05, + "loss": 0.6726, + "step": 3182 + }, + { + "epoch": 0.24085354318792326, + "grad_norm": 2.743393659591675, + "learning_rate": 1.626629758623402e-05, + "loss": 0.7734, + "step": 3183 + }, + { + "epoch": 0.2409292119102569, + "grad_norm": 2.5748066902160645, + "learning_rate": 1.6264968437062438e-05, + "loss": 0.8017, + "step": 3184 + }, + { + "epoch": 0.2410048806325905, + "grad_norm": 2.8789925575256348, + "learning_rate": 1.626363883293504e-05, + "loss": 0.7882, + "step": 3185 + }, + { + "epoch": 0.24108054935492415, + "grad_norm": 2.611142158508301, + "learning_rate": 1.6262308773935085e-05, + "loss": 0.8026, + "step": 3186 + }, + { + "epoch": 0.24115621807725776, + "grad_norm": 2.30765700340271, + "learning_rate": 1.6260978260145867e-05, + "loss": 0.8759, + "step": 3187 + }, + { + "epoch": 0.2412318867995914, + "grad_norm": 2.0693159103393555, + "learning_rate": 1.62596472916507e-05, + "loss": 0.7008, + "step": 3188 + }, + { + "epoch": 0.241307555521925, + "grad_norm": 2.8635051250457764, + "learning_rate": 1.6258315868532945e-05, + "loss": 0.7966, + "step": 3189 + }, + { + "epoch": 0.24138322424425865, + "grad_norm": 2.766191244125366, + "learning_rate": 1.625698399087597e-05, + "loss": 0.6112, + "step": 3190 + }, + { + "epoch": 0.24145889296659226, + "grad_norm": 2.440380811691284, + "learning_rate": 1.6255651658763185e-05, + "loss": 0.8754, + "step": 3191 + }, + { + "epoch": 0.24153456168892587, + "grad_norm": 2.209153175354004, + "learning_rate": 1.625431887227803e-05, + "loss": 0.7281, + "step": 3192 + }, + { + "epoch": 0.2416102304112595, + "grad_norm": 1.9143271446228027, + "learning_rate": 1.625298563150396e-05, + "loss": 0.8007, + "step": 3193 + }, + { + "epoch": 0.24168589913359312, + "grad_norm": 2.3368210792541504, + "learning_rate": 1.6251651936524473e-05, + "loss": 0.8231, + "step": 3194 + }, + { + "epoch": 0.24176156785592676, + "grad_norm": 2.0771644115448, + "learning_rate": 1.6250317787423087e-05, + "loss": 0.7381, + "step": 3195 + }, + { + "epoch": 0.24183723657826037, + "grad_norm": 4.1516289710998535, + "learning_rate": 1.624898318428335e-05, + "loss": 0.6193, + "step": 3196 + }, + { + "epoch": 0.241912905300594, + "grad_norm": 2.5363802909851074, + "learning_rate": 1.6247648127188842e-05, + "loss": 0.8325, + "step": 3197 + }, + { + "epoch": 0.24198857402292762, + "grad_norm": 2.0077271461486816, + "learning_rate": 1.6246312616223164e-05, + "loss": 0.83, + "step": 3198 + }, + { + "epoch": 0.24206424274526125, + "grad_norm": 2.6140248775482178, + "learning_rate": 1.6244976651469952e-05, + "loss": 0.8015, + "step": 3199 + }, + { + "epoch": 0.24213991146759486, + "grad_norm": 2.4978835582733154, + "learning_rate": 1.624364023301287e-05, + "loss": 0.7106, + "step": 3200 + }, + { + "epoch": 0.2422155801899285, + "grad_norm": 2.355581521987915, + "learning_rate": 1.624230336093561e-05, + "loss": 0.8455, + "step": 3201 + }, + { + "epoch": 0.2422912489122621, + "grad_norm": 3.009420871734619, + "learning_rate": 
1.6240966035321887e-05, + "loss": 0.6477, + "step": 3202 + }, + { + "epoch": 0.24236691763459575, + "grad_norm": 3.0516929626464844, + "learning_rate": 1.623962825625545e-05, + "loss": 0.7656, + "step": 3203 + }, + { + "epoch": 0.24244258635692936, + "grad_norm": 4.451990604400635, + "learning_rate": 1.6238290023820077e-05, + "loss": 0.8312, + "step": 3204 + }, + { + "epoch": 0.24251825507926297, + "grad_norm": 2.4959819316864014, + "learning_rate": 1.6236951338099567e-05, + "loss": 0.815, + "step": 3205 + }, + { + "epoch": 0.2425939238015966, + "grad_norm": 2.653895854949951, + "learning_rate": 1.6235612199177765e-05, + "loss": 0.6203, + "step": 3206 + }, + { + "epoch": 0.24266959252393022, + "grad_norm": 4.231168746948242, + "learning_rate": 1.6234272607138517e-05, + "loss": 0.716, + "step": 3207 + }, + { + "epoch": 0.24274526124626386, + "grad_norm": 2.9942550659179688, + "learning_rate": 1.6232932562065727e-05, + "loss": 0.783, + "step": 3208 + }, + { + "epoch": 0.24282092996859747, + "grad_norm": 2.903759241104126, + "learning_rate": 1.6231592064043298e-05, + "loss": 0.7758, + "step": 3209 + }, + { + "epoch": 0.2428965986909311, + "grad_norm": 2.2011756896972656, + "learning_rate": 1.6230251113155188e-05, + "loss": 0.9041, + "step": 3210 + }, + { + "epoch": 0.24297226741326472, + "grad_norm": 2.3656907081604004, + "learning_rate": 1.622890970948537e-05, + "loss": 0.8788, + "step": 3211 + }, + { + "epoch": 0.24304793613559836, + "grad_norm": 2.8599255084991455, + "learning_rate": 1.6227567853117842e-05, + "loss": 0.8441, + "step": 3212 + }, + { + "epoch": 0.24312360485793197, + "grad_norm": 2.9189417362213135, + "learning_rate": 1.6226225544136638e-05, + "loss": 0.7811, + "step": 3213 + }, + { + "epoch": 0.2431992735802656, + "grad_norm": 2.1617062091827393, + "learning_rate": 1.622488278262582e-05, + "loss": 0.6992, + "step": 3214 + }, + { + "epoch": 0.24327494230259922, + "grad_norm": 2.509108066558838, + "learning_rate": 1.6223539568669476e-05, + "loss": 0.7945, + "step": 3215 + }, + { + "epoch": 0.24335061102493286, + "grad_norm": 2.4163920879364014, + "learning_rate": 1.6222195902351715e-05, + "loss": 0.7454, + "step": 3216 + }, + { + "epoch": 0.24342627974726647, + "grad_norm": 1.949989914894104, + "learning_rate": 1.622085178375669e-05, + "loss": 0.7069, + "step": 3217 + }, + { + "epoch": 0.2435019484696001, + "grad_norm": 2.657010078430176, + "learning_rate": 1.6219507212968568e-05, + "loss": 0.6595, + "step": 3218 + }, + { + "epoch": 0.24357761719193372, + "grad_norm": 2.2271595001220703, + "learning_rate": 1.6218162190071557e-05, + "loss": 0.8176, + "step": 3219 + }, + { + "epoch": 0.24365328591426733, + "grad_norm": 2.0614137649536133, + "learning_rate": 1.6216816715149884e-05, + "loss": 0.8147, + "step": 3220 + }, + { + "epoch": 0.24372895463660096, + "grad_norm": 3.104268789291382, + "learning_rate": 1.6215470788287803e-05, + "loss": 0.7427, + "step": 3221 + }, + { + "epoch": 0.24380462335893457, + "grad_norm": 2.268542528152466, + "learning_rate": 1.6214124409569605e-05, + "loss": 0.78, + "step": 3222 + }, + { + "epoch": 0.2438802920812682, + "grad_norm": 2.576122283935547, + "learning_rate": 1.6212777579079606e-05, + "loss": 0.7915, + "step": 3223 + }, + { + "epoch": 0.24395596080360182, + "grad_norm": 1.9363725185394287, + "learning_rate": 1.6211430296902145e-05, + "loss": 0.7399, + "step": 3224 + }, + { + "epoch": 0.24403162952593546, + "grad_norm": 1.9388312101364136, + "learning_rate": 1.621008256312159e-05, + "loss": 0.7831, + "step": 3225 + }, + { + "epoch": 
0.24410729824826907, + "grad_norm": 2.938326835632324, + "learning_rate": 1.620873437782235e-05, + "loss": 0.744, + "step": 3226 + }, + { + "epoch": 0.2441829669706027, + "grad_norm": 2.0602774620056152, + "learning_rate": 1.6207385741088843e-05, + "loss": 0.7965, + "step": 3227 + }, + { + "epoch": 0.24425863569293632, + "grad_norm": 2.867607593536377, + "learning_rate": 1.620603665300553e-05, + "loss": 0.7248, + "step": 3228 + }, + { + "epoch": 0.24433430441526996, + "grad_norm": 2.3602488040924072, + "learning_rate": 1.6204687113656895e-05, + "loss": 0.6803, + "step": 3229 + }, + { + "epoch": 0.24440997313760357, + "grad_norm": 2.2412686347961426, + "learning_rate": 1.6203337123127456e-05, + "loss": 0.7207, + "step": 3230 + }, + { + "epoch": 0.2444856418599372, + "grad_norm": 2.8568761348724365, + "learning_rate": 1.620198668150174e-05, + "loss": 0.7997, + "step": 3231 + }, + { + "epoch": 0.24456131058227082, + "grad_norm": 2.4794857501983643, + "learning_rate": 1.620063578886433e-05, + "loss": 0.6912, + "step": 3232 + }, + { + "epoch": 0.24463697930460443, + "grad_norm": 2.476989507675171, + "learning_rate": 1.6199284445299815e-05, + "loss": 0.9426, + "step": 3233 + }, + { + "epoch": 0.24471264802693807, + "grad_norm": 2.4754297733306885, + "learning_rate": 1.619793265089282e-05, + "loss": 0.8036, + "step": 3234 + }, + { + "epoch": 0.24478831674927168, + "grad_norm": 2.367055654525757, + "learning_rate": 1.6196580405728005e-05, + "loss": 0.7739, + "step": 3235 + }, + { + "epoch": 0.24486398547160532, + "grad_norm": 2.6627862453460693, + "learning_rate": 1.6195227709890047e-05, + "loss": 0.7749, + "step": 3236 + }, + { + "epoch": 0.24493965419393893, + "grad_norm": 2.5135037899017334, + "learning_rate": 1.6193874563463657e-05, + "loss": 0.7816, + "step": 3237 + }, + { + "epoch": 0.24501532291627257, + "grad_norm": 3.7979977130889893, + "learning_rate": 1.6192520966533574e-05, + "loss": 0.7276, + "step": 3238 + }, + { + "epoch": 0.24509099163860618, + "grad_norm": 2.5826263427734375, + "learning_rate": 1.6191166919184564e-05, + "loss": 0.7003, + "step": 3239 + }, + { + "epoch": 0.24516666036093981, + "grad_norm": 2.326385259628296, + "learning_rate": 1.6189812421501424e-05, + "loss": 0.549, + "step": 3240 + }, + { + "epoch": 0.24524232908327342, + "grad_norm": 2.296499490737915, + "learning_rate": 1.6188457473568974e-05, + "loss": 0.8263, + "step": 3241 + }, + { + "epoch": 0.24531799780560706, + "grad_norm": 2.9421603679656982, + "learning_rate": 1.6187102075472067e-05, + "loss": 0.8154, + "step": 3242 + }, + { + "epoch": 0.24539366652794067, + "grad_norm": 2.3418660163879395, + "learning_rate": 1.6185746227295585e-05, + "loss": 0.8657, + "step": 3243 + }, + { + "epoch": 0.2454693352502743, + "grad_norm": 2.390544891357422, + "learning_rate": 1.618438992912443e-05, + "loss": 0.7327, + "step": 3244 + }, + { + "epoch": 0.24554500397260792, + "grad_norm": 2.8358356952667236, + "learning_rate": 1.6183033181043542e-05, + "loss": 0.9002, + "step": 3245 + }, + { + "epoch": 0.24562067269494153, + "grad_norm": 2.2084474563598633, + "learning_rate": 1.6181675983137884e-05, + "loss": 0.7483, + "step": 3246 + }, + { + "epoch": 0.24569634141727517, + "grad_norm": 2.7843167781829834, + "learning_rate": 1.6180318335492445e-05, + "loss": 0.7849, + "step": 3247 + }, + { + "epoch": 0.24577201013960878, + "grad_norm": 2.2138888835906982, + "learning_rate": 1.617896023819225e-05, + "loss": 0.7737, + "step": 3248 + }, + { + "epoch": 0.24584767886194242, + "grad_norm": 2.3047590255737305, + "learning_rate": 
1.6177601691322344e-05, + "loss": 0.6689, + "step": 3249 + }, + { + "epoch": 0.24592334758427603, + "grad_norm": 2.421382188796997, + "learning_rate": 1.6176242694967803e-05, + "loss": 0.697, + "step": 3250 + }, + { + "epoch": 0.24599901630660967, + "grad_norm": 2.865973472595215, + "learning_rate": 1.6174883249213736e-05, + "loss": 0.6845, + "step": 3251 + }, + { + "epoch": 0.24607468502894328, + "grad_norm": 2.016404628753662, + "learning_rate": 1.6173523354145275e-05, + "loss": 0.7247, + "step": 3252 + }, + { + "epoch": 0.24615035375127692, + "grad_norm": 2.101076602935791, + "learning_rate": 1.617216300984758e-05, + "loss": 0.9097, + "step": 3253 + }, + { + "epoch": 0.24622602247361053, + "grad_norm": 2.44075083732605, + "learning_rate": 1.6170802216405835e-05, + "loss": 0.7383, + "step": 3254 + }, + { + "epoch": 0.24630169119594417, + "grad_norm": 2.7942304611206055, + "learning_rate": 1.6169440973905266e-05, + "loss": 0.862, + "step": 3255 + }, + { + "epoch": 0.24637735991827778, + "grad_norm": 2.607255458831787, + "learning_rate": 1.6168079282431113e-05, + "loss": 0.8421, + "step": 3256 + }, + { + "epoch": 0.24645302864061142, + "grad_norm": 3.629255771636963, + "learning_rate": 1.6166717142068654e-05, + "loss": 0.7941, + "step": 3257 + }, + { + "epoch": 0.24652869736294503, + "grad_norm": 3.293483257293701, + "learning_rate": 1.6165354552903182e-05, + "loss": 0.9336, + "step": 3258 + }, + { + "epoch": 0.24660436608527864, + "grad_norm": 2.2481791973114014, + "learning_rate": 1.6163991515020035e-05, + "loss": 0.7163, + "step": 3259 + }, + { + "epoch": 0.24668003480761228, + "grad_norm": 5.270366668701172, + "learning_rate": 1.616262802850457e-05, + "loss": 0.8224, + "step": 3260 + }, + { + "epoch": 0.24675570352994589, + "grad_norm": 2.6033430099487305, + "learning_rate": 1.616126409344217e-05, + "loss": 0.8148, + "step": 3261 + }, + { + "epoch": 0.24683137225227952, + "grad_norm": 3.4595351219177246, + "learning_rate": 1.6159899709918247e-05, + "loss": 0.653, + "step": 3262 + }, + { + "epoch": 0.24690704097461313, + "grad_norm": 2.154088020324707, + "learning_rate": 1.615853487801825e-05, + "loss": 0.8671, + "step": 3263 + }, + { + "epoch": 0.24698270969694677, + "grad_norm": 2.9141697883605957, + "learning_rate": 1.615716959782764e-05, + "loss": 0.846, + "step": 3264 + }, + { + "epoch": 0.24705837841928038, + "grad_norm": 2.8304619789123535, + "learning_rate": 1.6155803869431927e-05, + "loss": 0.584, + "step": 3265 + }, + { + "epoch": 0.24713404714161402, + "grad_norm": 6.038189888000488, + "learning_rate": 1.615443769291663e-05, + "loss": 0.5921, + "step": 3266 + }, + { + "epoch": 0.24720971586394763, + "grad_norm": 2.2429237365722656, + "learning_rate": 1.6153071068367302e-05, + "loss": 0.7524, + "step": 3267 + }, + { + "epoch": 0.24728538458628127, + "grad_norm": 2.542436361312866, + "learning_rate": 1.6151703995869533e-05, + "loss": 0.6946, + "step": 3268 + }, + { + "epoch": 0.24736105330861488, + "grad_norm": 2.85803484916687, + "learning_rate": 1.6150336475508923e-05, + "loss": 0.9324, + "step": 3269 + }, + { + "epoch": 0.24743672203094852, + "grad_norm": 2.8291127681732178, + "learning_rate": 1.614896850737112e-05, + "loss": 0.933, + "step": 3270 + }, + { + "epoch": 0.24751239075328213, + "grad_norm": 3.0022454261779785, + "learning_rate": 1.6147600091541782e-05, + "loss": 0.5986, + "step": 3271 + }, + { + "epoch": 0.24758805947561574, + "grad_norm": 2.692591428756714, + "learning_rate": 1.614623122810661e-05, + "loss": 0.7886, + "step": 3272 + }, + { + "epoch": 
0.24766372819794938, + "grad_norm": 3.29293155670166, + "learning_rate": 1.6144861917151322e-05, + "loss": 0.8193, + "step": 3273 + }, + { + "epoch": 0.247739396920283, + "grad_norm": 2.4391543865203857, + "learning_rate": 1.614349215876168e-05, + "loss": 0.8077, + "step": 3274 + }, + { + "epoch": 0.24781506564261663, + "grad_norm": 2.349703788757324, + "learning_rate": 1.6142121953023447e-05, + "loss": 0.7945, + "step": 3275 + }, + { + "epoch": 0.24789073436495024, + "grad_norm": 2.3136823177337646, + "learning_rate": 1.6140751300022437e-05, + "loss": 0.7307, + "step": 3276 + }, + { + "epoch": 0.24796640308728388, + "grad_norm": 2.6560375690460205, + "learning_rate": 1.6139380199844487e-05, + "loss": 0.7428, + "step": 3277 + }, + { + "epoch": 0.2480420718096175, + "grad_norm": 2.5470519065856934, + "learning_rate": 1.6138008652575455e-05, + "loss": 0.7495, + "step": 3278 + }, + { + "epoch": 0.24811774053195113, + "grad_norm": 3.2811646461486816, + "learning_rate": 1.6136636658301236e-05, + "loss": 0.9781, + "step": 3279 + }, + { + "epoch": 0.24819340925428474, + "grad_norm": 2.395357608795166, + "learning_rate": 1.6135264217107744e-05, + "loss": 0.6281, + "step": 3280 + }, + { + "epoch": 0.24826907797661837, + "grad_norm": 2.6726183891296387, + "learning_rate": 1.6133891329080933e-05, + "loss": 0.7946, + "step": 3281 + }, + { + "epoch": 0.24834474669895198, + "grad_norm": 2.373309373855591, + "learning_rate": 1.6132517994306767e-05, + "loss": 0.7204, + "step": 3282 + }, + { + "epoch": 0.24842041542128562, + "grad_norm": 2.439506769180298, + "learning_rate": 1.6131144212871264e-05, + "loss": 0.7067, + "step": 3283 + }, + { + "epoch": 0.24849608414361923, + "grad_norm": 2.1670548915863037, + "learning_rate": 1.6129769984860435e-05, + "loss": 0.9527, + "step": 3284 + }, + { + "epoch": 0.24857175286595287, + "grad_norm": 2.238982677459717, + "learning_rate": 1.6128395310360356e-05, + "loss": 0.6461, + "step": 3285 + }, + { + "epoch": 0.24864742158828648, + "grad_norm": 3.4354398250579834, + "learning_rate": 1.6127020189457107e-05, + "loss": 0.7255, + "step": 3286 + }, + { + "epoch": 0.2487230903106201, + "grad_norm": 3.187068223953247, + "learning_rate": 1.6125644622236797e-05, + "loss": 0.9041, + "step": 3287 + }, + { + "epoch": 0.24879875903295373, + "grad_norm": 2.2371320724487305, + "learning_rate": 1.6124268608785578e-05, + "loss": 0.7082, + "step": 3288 + }, + { + "epoch": 0.24887442775528734, + "grad_norm": 1.9986299276351929, + "learning_rate": 1.6122892149189616e-05, + "loss": 0.6645, + "step": 3289 + }, + { + "epoch": 0.24895009647762098, + "grad_norm": 2.3427042961120605, + "learning_rate": 1.6121515243535107e-05, + "loss": 0.7438, + "step": 3290 + }, + { + "epoch": 0.2490257651999546, + "grad_norm": 11.90526294708252, + "learning_rate": 1.612013789190828e-05, + "loss": 0.7747, + "step": 3291 + }, + { + "epoch": 0.24910143392228823, + "grad_norm": 2.641674280166626, + "learning_rate": 1.611876009439539e-05, + "loss": 0.8812, + "step": 3292 + }, + { + "epoch": 0.24917710264462184, + "grad_norm": 2.691256046295166, + "learning_rate": 1.6117381851082717e-05, + "loss": 0.6981, + "step": 3293 + }, + { + "epoch": 0.24925277136695548, + "grad_norm": 2.4926040172576904, + "learning_rate": 1.6116003162056574e-05, + "loss": 0.7059, + "step": 3294 + }, + { + "epoch": 0.2493284400892891, + "grad_norm": 1.984239101409912, + "learning_rate": 1.6114624027403297e-05, + "loss": 0.7439, + "step": 3295 + }, + { + "epoch": 0.24940410881162273, + "grad_norm": 2.4202473163604736, + "learning_rate": 
1.611324444720925e-05, + "loss": 0.8114, + "step": 3296 + }, + { + "epoch": 0.24947977753395634, + "grad_norm": 2.3358452320098877, + "learning_rate": 1.611186442156083e-05, + "loss": 0.7779, + "step": 3297 + }, + { + "epoch": 0.24955544625628998, + "grad_norm": 2.353821277618408, + "learning_rate": 1.6110483950544454e-05, + "loss": 0.7116, + "step": 3298 + }, + { + "epoch": 0.24963111497862359, + "grad_norm": 2.44838547706604, + "learning_rate": 1.610910303424658e-05, + "loss": 0.7111, + "step": 3299 + }, + { + "epoch": 0.2497067837009572, + "grad_norm": 2.416057586669922, + "learning_rate": 1.6107721672753678e-05, + "loss": 0.7076, + "step": 3300 + }, + { + "epoch": 0.24978245242329083, + "grad_norm": 2.881209373474121, + "learning_rate": 1.6106339866152255e-05, + "loss": 0.9059, + "step": 3301 + }, + { + "epoch": 0.24985812114562445, + "grad_norm": 2.15285325050354, + "learning_rate": 1.6104957614528846e-05, + "loss": 0.6258, + "step": 3302 + }, + { + "epoch": 0.24993378986795808, + "grad_norm": 2.7021114826202393, + "learning_rate": 1.610357491797001e-05, + "loss": 0.8291, + "step": 3303 + }, + { + "epoch": 0.2500094585902917, + "grad_norm": 2.52158260345459, + "learning_rate": 1.6102191776562335e-05, + "loss": 0.7096, + "step": 3304 + }, + { + "epoch": 0.25008512731262533, + "grad_norm": 2.6078898906707764, + "learning_rate": 1.6100808190392446e-05, + "loss": 0.7502, + "step": 3305 + }, + { + "epoch": 0.25016079603495894, + "grad_norm": 2.7016923427581787, + "learning_rate": 1.6099424159546976e-05, + "loss": 0.6632, + "step": 3306 + }, + { + "epoch": 0.25023646475729255, + "grad_norm": 2.718710422515869, + "learning_rate": 1.6098039684112605e-05, + "loss": 0.6973, + "step": 3307 + }, + { + "epoch": 0.2503121334796262, + "grad_norm": 2.2537267208099365, + "learning_rate": 1.6096654764176027e-05, + "loss": 0.6324, + "step": 3308 + }, + { + "epoch": 0.25038780220195983, + "grad_norm": 2.0894060134887695, + "learning_rate": 1.609526939982398e-05, + "loss": 0.711, + "step": 3309 + }, + { + "epoch": 0.25046347092429344, + "grad_norm": 2.1615793704986572, + "learning_rate": 1.6093883591143212e-05, + "loss": 0.8218, + "step": 3310 + }, + { + "epoch": 0.25053913964662705, + "grad_norm": 2.357072114944458, + "learning_rate": 1.609249733822051e-05, + "loss": 0.7687, + "step": 3311 + }, + { + "epoch": 0.2506148083689607, + "grad_norm": 2.5962414741516113, + "learning_rate": 1.6091110641142683e-05, + "loss": 1.0468, + "step": 3312 + }, + { + "epoch": 0.25069047709129433, + "grad_norm": 2.7395946979522705, + "learning_rate": 1.608972349999657e-05, + "loss": 0.7615, + "step": 3313 + }, + { + "epoch": 0.25076614581362794, + "grad_norm": 3.2760064601898193, + "learning_rate": 1.6088335914869047e-05, + "loss": 0.7543, + "step": 3314 + }, + { + "epoch": 0.25084181453596155, + "grad_norm": 2.215672254562378, + "learning_rate": 1.6086947885846997e-05, + "loss": 0.8131, + "step": 3315 + }, + { + "epoch": 0.25091748325829516, + "grad_norm": 2.628455638885498, + "learning_rate": 1.6085559413017353e-05, + "loss": 0.8267, + "step": 3316 + }, + { + "epoch": 0.2509931519806288, + "grad_norm": 2.1428425312042236, + "learning_rate": 1.608417049646706e-05, + "loss": 0.6553, + "step": 3317 + }, + { + "epoch": 0.25106882070296244, + "grad_norm": 2.397225856781006, + "learning_rate": 1.6082781136283094e-05, + "loss": 0.7837, + "step": 3318 + }, + { + "epoch": 0.25114448942529605, + "grad_norm": 1.8074523210525513, + "learning_rate": 1.6081391332552464e-05, + "loss": 0.6386, + "step": 3319 + }, + { + "epoch": 
0.25122015814762966, + "grad_norm": 2.304368257522583, + "learning_rate": 1.608000108536221e-05, + "loss": 0.7698, + "step": 3320 + }, + { + "epoch": 0.2512958268699633, + "grad_norm": 2.147972583770752, + "learning_rate": 1.6078610394799386e-05, + "loss": 0.77, + "step": 3321 + }, + { + "epoch": 0.25137149559229693, + "grad_norm": 2.355785846710205, + "learning_rate": 1.6077219260951082e-05, + "loss": 0.6399, + "step": 3322 + }, + { + "epoch": 0.25144716431463054, + "grad_norm": 2.293780565261841, + "learning_rate": 1.607582768390442e-05, + "loss": 0.7807, + "step": 3323 + }, + { + "epoch": 0.25152283303696416, + "grad_norm": 1.981724500656128, + "learning_rate": 1.6074435663746543e-05, + "loss": 0.6969, + "step": 3324 + }, + { + "epoch": 0.2515985017592978, + "grad_norm": 2.2810912132263184, + "learning_rate": 1.6073043200564623e-05, + "loss": 0.711, + "step": 3325 + }, + { + "epoch": 0.25167417048163143, + "grad_norm": 1.999266266822815, + "learning_rate": 1.607165029444586e-05, + "loss": 0.7184, + "step": 3326 + }, + { + "epoch": 0.25174983920396504, + "grad_norm": 2.708433151245117, + "learning_rate": 1.6070256945477485e-05, + "loss": 0.7204, + "step": 3327 + }, + { + "epoch": 0.25182550792629865, + "grad_norm": 2.3254494667053223, + "learning_rate": 1.606886315374675e-05, + "loss": 0.6824, + "step": 3328 + }, + { + "epoch": 0.25190117664863226, + "grad_norm": 2.2623493671417236, + "learning_rate": 1.606746891934094e-05, + "loss": 0.8442, + "step": 3329 + }, + { + "epoch": 0.25197684537096593, + "grad_norm": 2.412431240081787, + "learning_rate": 1.606607424234737e-05, + "loss": 0.8786, + "step": 3330 + }, + { + "epoch": 0.25205251409329954, + "grad_norm": 2.111781597137451, + "learning_rate": 1.6064679122853372e-05, + "loss": 0.836, + "step": 3331 + }, + { + "epoch": 0.25212818281563315, + "grad_norm": 2.9553704261779785, + "learning_rate": 1.6063283560946322e-05, + "loss": 0.8473, + "step": 3332 + }, + { + "epoch": 0.25220385153796676, + "grad_norm": 2.696552276611328, + "learning_rate": 1.6061887556713608e-05, + "loss": 0.7043, + "step": 3333 + }, + { + "epoch": 0.2522795202603004, + "grad_norm": 1.8179265260696411, + "learning_rate": 1.6060491110242655e-05, + "loss": 0.8593, + "step": 3334 + }, + { + "epoch": 0.25235518898263404, + "grad_norm": 2.586488962173462, + "learning_rate": 1.6059094221620913e-05, + "loss": 0.8374, + "step": 3335 + }, + { + "epoch": 0.25243085770496765, + "grad_norm": 2.9181783199310303, + "learning_rate": 1.6057696890935857e-05, + "loss": 0.8011, + "step": 3336 + }, + { + "epoch": 0.25250652642730126, + "grad_norm": 2.254702568054199, + "learning_rate": 1.6056299118274993e-05, + "loss": 0.6613, + "step": 3337 + }, + { + "epoch": 0.2525821951496349, + "grad_norm": 1.7335007190704346, + "learning_rate": 1.6054900903725856e-05, + "loss": 0.6857, + "step": 3338 + }, + { + "epoch": 0.25265786387196854, + "grad_norm": 2.5397019386291504, + "learning_rate": 1.605350224737601e-05, + "loss": 0.911, + "step": 3339 + }, + { + "epoch": 0.25273353259430215, + "grad_norm": 1.3949565887451172, + "learning_rate": 1.6052103149313037e-05, + "loss": 0.9984, + "step": 3340 + }, + { + "epoch": 0.25280920131663576, + "grad_norm": 1.854061245918274, + "learning_rate": 1.6050703609624554e-05, + "loss": 0.7489, + "step": 3341 + }, + { + "epoch": 0.25288487003896937, + "grad_norm": 2.37091064453125, + "learning_rate": 1.604930362839821e-05, + "loss": 0.7097, + "step": 3342 + }, + { + "epoch": 0.25296053876130303, + "grad_norm": 2.4179093837738037, + "learning_rate": 
1.604790320572167e-05, + "loss": 0.7679, + "step": 3343 + }, + { + "epoch": 0.25303620748363664, + "grad_norm": 2.1405601501464844, + "learning_rate": 1.6046502341682637e-05, + "loss": 0.6784, + "step": 3344 + }, + { + "epoch": 0.25311187620597025, + "grad_norm": 2.4001498222351074, + "learning_rate": 1.6045101036368833e-05, + "loss": 0.7994, + "step": 3345 + }, + { + "epoch": 0.25318754492830386, + "grad_norm": 2.001662492752075, + "learning_rate": 1.6043699289868018e-05, + "loss": 0.8095, + "step": 3346 + }, + { + "epoch": 0.25326321365063753, + "grad_norm": 2.297889471054077, + "learning_rate": 1.6042297102267972e-05, + "loss": 0.8036, + "step": 3347 + }, + { + "epoch": 0.25333888237297114, + "grad_norm": 2.313671588897705, + "learning_rate": 1.6040894473656502e-05, + "loss": 0.6839, + "step": 3348 + }, + { + "epoch": 0.25341455109530475, + "grad_norm": 2.0433685779571533, + "learning_rate": 1.603949140412145e-05, + "loss": 0.7668, + "step": 3349 + }, + { + "epoch": 0.25349021981763836, + "grad_norm": 2.314209461212158, + "learning_rate": 1.6038087893750673e-05, + "loss": 0.806, + "step": 3350 + }, + { + "epoch": 0.25356588853997203, + "grad_norm": 2.245436906814575, + "learning_rate": 1.6036683942632073e-05, + "loss": 0.7672, + "step": 3351 + }, + { + "epoch": 0.25364155726230564, + "grad_norm": 1.8543047904968262, + "learning_rate": 1.6035279550853564e-05, + "loss": 0.6501, + "step": 3352 + }, + { + "epoch": 0.25371722598463925, + "grad_norm": 2.085162878036499, + "learning_rate": 1.6033874718503092e-05, + "loss": 0.7534, + "step": 3353 + }, + { + "epoch": 0.25379289470697286, + "grad_norm": 2.3843894004821777, + "learning_rate": 1.6032469445668636e-05, + "loss": 0.8074, + "step": 3354 + }, + { + "epoch": 0.25386856342930647, + "grad_norm": 2.223787307739258, + "learning_rate": 1.6031063732438197e-05, + "loss": 0.7994, + "step": 3355 + }, + { + "epoch": 0.25394423215164014, + "grad_norm": 2.203183650970459, + "learning_rate": 1.6029657578899808e-05, + "loss": 0.7981, + "step": 3356 + }, + { + "epoch": 0.25401990087397375, + "grad_norm": 2.084005355834961, + "learning_rate": 1.6028250985141524e-05, + "loss": 0.8071, + "step": 3357 + }, + { + "epoch": 0.25409556959630736, + "grad_norm": 2.4879648685455322, + "learning_rate": 1.602684395125143e-05, + "loss": 0.7354, + "step": 3358 + }, + { + "epoch": 0.25417123831864097, + "grad_norm": 2.1480824947357178, + "learning_rate": 1.602543647731764e-05, + "loss": 0.775, + "step": 3359 + }, + { + "epoch": 0.25424690704097463, + "grad_norm": 2.283517360687256, + "learning_rate": 1.6024028563428296e-05, + "loss": 0.7326, + "step": 3360 + }, + { + "epoch": 0.25432257576330825, + "grad_norm": 2.376298666000366, + "learning_rate": 1.6022620209671567e-05, + "loss": 0.6533, + "step": 3361 + }, + { + "epoch": 0.25439824448564186, + "grad_norm": 2.1882760524749756, + "learning_rate": 1.6021211416135644e-05, + "loss": 0.8149, + "step": 3362 + }, + { + "epoch": 0.25447391320797547, + "grad_norm": 2.442096471786499, + "learning_rate": 1.601980218290875e-05, + "loss": 0.8602, + "step": 3363 + }, + { + "epoch": 0.25454958193030913, + "grad_norm": 2.5917186737060547, + "learning_rate": 1.6018392510079145e-05, + "loss": 0.7316, + "step": 3364 + }, + { + "epoch": 0.25462525065264274, + "grad_norm": 2.8332271575927734, + "learning_rate": 1.6016982397735098e-05, + "loss": 0.6501, + "step": 3365 + }, + { + "epoch": 0.25470091937497635, + "grad_norm": 2.322115182876587, + "learning_rate": 1.6015571845964914e-05, + "loss": 0.6404, + "step": 3366 + }, + { + "epoch": 
0.25477658809730996, + "grad_norm": 1.6798187494277954, + "learning_rate": 1.6014160854856933e-05, + "loss": 0.6577, + "step": 3367 + }, + { + "epoch": 0.2548522568196436, + "grad_norm": 2.1623387336730957, + "learning_rate": 1.601274942449951e-05, + "loss": 0.7403, + "step": 3368 + }, + { + "epoch": 0.25492792554197724, + "grad_norm": 2.2108073234558105, + "learning_rate": 1.6011337554981044e-05, + "loss": 0.626, + "step": 3369 + }, + { + "epoch": 0.25500359426431085, + "grad_norm": 2.5659923553466797, + "learning_rate": 1.6009925246389933e-05, + "loss": 0.7742, + "step": 3370 + }, + { + "epoch": 0.25507926298664446, + "grad_norm": 2.251542091369629, + "learning_rate": 1.6008512498814637e-05, + "loss": 0.8051, + "step": 3371 + }, + { + "epoch": 0.25515493170897807, + "grad_norm": 2.227972984313965, + "learning_rate": 1.6007099312343618e-05, + "loss": 0.7986, + "step": 3372 + }, + { + "epoch": 0.25523060043131174, + "grad_norm": 2.539790153503418, + "learning_rate": 1.6005685687065375e-05, + "loss": 0.7455, + "step": 3373 + }, + { + "epoch": 0.25530626915364535, + "grad_norm": 2.585012435913086, + "learning_rate": 1.6004271623068436e-05, + "loss": 0.8405, + "step": 3374 + }, + { + "epoch": 0.25538193787597896, + "grad_norm": 2.184983015060425, + "learning_rate": 1.6002857120441354e-05, + "loss": 0.6086, + "step": 3375 + }, + { + "epoch": 0.25545760659831257, + "grad_norm": 3.0355441570281982, + "learning_rate": 1.6001442179272708e-05, + "loss": 0.8099, + "step": 3376 + }, + { + "epoch": 0.25553327532064624, + "grad_norm": 2.2966806888580322, + "learning_rate": 1.600002679965111e-05, + "loss": 0.8367, + "step": 3377 + }, + { + "epoch": 0.25560894404297985, + "grad_norm": 2.3266594409942627, + "learning_rate": 1.599861098166519e-05, + "loss": 0.7798, + "step": 3378 + }, + { + "epoch": 0.25568461276531346, + "grad_norm": 2.5856709480285645, + "learning_rate": 1.5997194725403614e-05, + "loss": 0.7653, + "step": 3379 + }, + { + "epoch": 0.25576028148764707, + "grad_norm": 1.9472054243087769, + "learning_rate": 1.5995778030955073e-05, + "loss": 0.8388, + "step": 3380 + }, + { + "epoch": 0.2558359502099807, + "grad_norm": 2.6131577491760254, + "learning_rate": 1.599436089840829e-05, + "loss": 0.8128, + "step": 3381 + }, + { + "epoch": 0.25591161893231434, + "grad_norm": 2.5530786514282227, + "learning_rate": 1.5992943327851998e-05, + "loss": 0.7969, + "step": 3382 + }, + { + "epoch": 0.25598728765464795, + "grad_norm": 2.0992929935455322, + "learning_rate": 1.599152531937498e-05, + "loss": 0.7408, + "step": 3383 + }, + { + "epoch": 0.25606295637698157, + "grad_norm": 2.5544259548187256, + "learning_rate": 1.599010687306603e-05, + "loss": 0.8137, + "step": 3384 + }, + { + "epoch": 0.2561386250993152, + "grad_norm": 2.344470500946045, + "learning_rate": 1.5988687989013985e-05, + "loss": 0.6803, + "step": 3385 + }, + { + "epoch": 0.25621429382164884, + "grad_norm": 2.069218873977661, + "learning_rate": 1.5987268667307688e-05, + "loss": 0.8429, + "step": 3386 + }, + { + "epoch": 0.25628996254398245, + "grad_norm": 1.9888510704040527, + "learning_rate": 1.598584890803603e-05, + "loss": 0.8587, + "step": 3387 + }, + { + "epoch": 0.25636563126631606, + "grad_norm": 2.630401849746704, + "learning_rate": 1.5984428711287917e-05, + "loss": 0.8905, + "step": 3388 + }, + { + "epoch": 0.2564412999886497, + "grad_norm": 2.1368906497955322, + "learning_rate": 1.5983008077152292e-05, + "loss": 0.6999, + "step": 3389 + }, + { + "epoch": 0.25651696871098334, + "grad_norm": 3.064831495285034, + "learning_rate": 
1.598158700571811e-05, + "loss": 0.6628, + "step": 3390 + }, + { + "epoch": 0.25659263743331695, + "grad_norm": 2.474104881286621, + "learning_rate": 1.598016549707437e-05, + "loss": 0.8611, + "step": 3391 + }, + { + "epoch": 0.25666830615565056, + "grad_norm": 2.035888671875, + "learning_rate": 1.5978743551310094e-05, + "loss": 0.6389, + "step": 3392 + }, + { + "epoch": 0.25674397487798417, + "grad_norm": 2.2236475944519043, + "learning_rate": 1.597732116851432e-05, + "loss": 0.7512, + "step": 3393 + }, + { + "epoch": 0.2568196436003178, + "grad_norm": 2.101870536804199, + "learning_rate": 1.5975898348776128e-05, + "loss": 0.8177, + "step": 3394 + }, + { + "epoch": 0.25689531232265145, + "grad_norm": 2.129502058029175, + "learning_rate": 1.5974475092184618e-05, + "loss": 0.6882, + "step": 3395 + }, + { + "epoch": 0.25697098104498506, + "grad_norm": 3.0002481937408447, + "learning_rate": 1.5973051398828923e-05, + "loss": 0.7554, + "step": 3396 + }, + { + "epoch": 0.25704664976731867, + "grad_norm": 2.7170114517211914, + "learning_rate": 1.5971627268798193e-05, + "loss": 0.6824, + "step": 3397 + }, + { + "epoch": 0.2571223184896523, + "grad_norm": 2.4976844787597656, + "learning_rate": 1.5970202702181613e-05, + "loss": 0.8525, + "step": 3398 + }, + { + "epoch": 0.25719798721198595, + "grad_norm": 2.3807260990142822, + "learning_rate": 1.59687776990684e-05, + "loss": 0.6943, + "step": 3399 + }, + { + "epoch": 0.25727365593431956, + "grad_norm": 2.146085023880005, + "learning_rate": 1.5967352259547786e-05, + "loss": 0.7723, + "step": 3400 + }, + { + "epoch": 0.25734932465665317, + "grad_norm": 2.7178564071655273, + "learning_rate": 1.596592638370904e-05, + "loss": 0.6769, + "step": 3401 + }, + { + "epoch": 0.2574249933789868, + "grad_norm": 2.112178325653076, + "learning_rate": 1.5964500071641446e-05, + "loss": 0.8901, + "step": 3402 + }, + { + "epoch": 0.25750066210132044, + "grad_norm": 2.2620973587036133, + "learning_rate": 1.5963073323434336e-05, + "loss": 0.9132, + "step": 3403 + }, + { + "epoch": 0.25757633082365405, + "grad_norm": 2.1333353519439697, + "learning_rate": 1.5961646139177053e-05, + "loss": 0.8134, + "step": 3404 + }, + { + "epoch": 0.25765199954598766, + "grad_norm": 2.5050787925720215, + "learning_rate": 1.5960218518958977e-05, + "loss": 0.863, + "step": 3405 + }, + { + "epoch": 0.2577276682683213, + "grad_norm": 1.9675660133361816, + "learning_rate": 1.59587904628695e-05, + "loss": 0.8025, + "step": 3406 + }, + { + "epoch": 0.2578033369906549, + "grad_norm": 2.3362104892730713, + "learning_rate": 1.5957361970998056e-05, + "loss": 0.8922, + "step": 3407 + }, + { + "epoch": 0.25787900571298855, + "grad_norm": 2.6554508209228516, + "learning_rate": 1.5955933043434102e-05, + "loss": 0.6258, + "step": 3408 + }, + { + "epoch": 0.25795467443532216, + "grad_norm": 2.41428542137146, + "learning_rate": 1.5954503680267128e-05, + "loss": 0.8198, + "step": 3409 + }, + { + "epoch": 0.2580303431576558, + "grad_norm": 2.5038862228393555, + "learning_rate": 1.5953073881586637e-05, + "loss": 0.7589, + "step": 3410 + }, + { + "epoch": 0.2581060118799894, + "grad_norm": 2.199652671813965, + "learning_rate": 1.5951643647482172e-05, + "loss": 0.6257, + "step": 3411 + }, + { + "epoch": 0.25818168060232305, + "grad_norm": 2.1079485416412354, + "learning_rate": 1.5950212978043294e-05, + "loss": 0.6186, + "step": 3412 + }, + { + "epoch": 0.25825734932465666, + "grad_norm": 2.202430248260498, + "learning_rate": 1.5948781873359602e-05, + "loss": 0.7587, + "step": 3413 + }, + { + "epoch": 
0.25833301804699027, + "grad_norm": 2.260615110397339, + "learning_rate": 1.5947350333520713e-05, + "loss": 0.7012, + "step": 3414 + }, + { + "epoch": 0.2584086867693239, + "grad_norm": 2.422053337097168, + "learning_rate": 1.5945918358616276e-05, + "loss": 0.9323, + "step": 3415 + }, + { + "epoch": 0.25848435549165755, + "grad_norm": 2.8548550605773926, + "learning_rate": 1.5944485948735965e-05, + "loss": 0.722, + "step": 3416 + }, + { + "epoch": 0.25856002421399116, + "grad_norm": 2.1494009494781494, + "learning_rate": 1.5943053103969484e-05, + "loss": 0.8007, + "step": 3417 + }, + { + "epoch": 0.25863569293632477, + "grad_norm": 2.599536180496216, + "learning_rate": 1.594161982440656e-05, + "loss": 0.9085, + "step": 3418 + }, + { + "epoch": 0.2587113616586584, + "grad_norm": 2.498399257659912, + "learning_rate": 1.5940186110136952e-05, + "loss": 0.7815, + "step": 3419 + }, + { + "epoch": 0.25878703038099204, + "grad_norm": 2.293846845626831, + "learning_rate": 1.593875196125044e-05, + "loss": 0.6341, + "step": 3420 + }, + { + "epoch": 0.25886269910332566, + "grad_norm": 1.788888931274414, + "learning_rate": 1.593731737783684e-05, + "loss": 0.6127, + "step": 3421 + }, + { + "epoch": 0.25893836782565927, + "grad_norm": 2.166012763977051, + "learning_rate": 1.5935882359985986e-05, + "loss": 0.6978, + "step": 3422 + }, + { + "epoch": 0.2590140365479929, + "grad_norm": 2.089470148086548, + "learning_rate": 1.5934446907787748e-05, + "loss": 0.7217, + "step": 3423 + }, + { + "epoch": 0.2590897052703265, + "grad_norm": 1.9885095357894897, + "learning_rate": 1.5933011021332015e-05, + "loss": 0.6653, + "step": 3424 + }, + { + "epoch": 0.25916537399266015, + "grad_norm": 2.147557497024536, + "learning_rate": 1.5931574700708704e-05, + "loss": 0.6181, + "step": 3425 + }, + { + "epoch": 0.25924104271499376, + "grad_norm": 1.9154552221298218, + "learning_rate": 1.5930137946007768e-05, + "loss": 0.7011, + "step": 3426 + }, + { + "epoch": 0.2593167114373274, + "grad_norm": 1.8677332401275635, + "learning_rate": 1.592870075731918e-05, + "loss": 0.8459, + "step": 3427 + }, + { + "epoch": 0.259392380159661, + "grad_norm": 2.35475492477417, + "learning_rate": 1.592726313473294e-05, + "loss": 0.8357, + "step": 3428 + }, + { + "epoch": 0.25946804888199465, + "grad_norm": 2.0991413593292236, + "learning_rate": 1.592582507833908e-05, + "loss": 0.715, + "step": 3429 + }, + { + "epoch": 0.25954371760432826, + "grad_norm": 2.366481304168701, + "learning_rate": 1.592438658822765e-05, + "loss": 0.751, + "step": 3430 + }, + { + "epoch": 0.25961938632666187, + "grad_norm": 2.203183650970459, + "learning_rate": 1.5922947664488733e-05, + "loss": 0.863, + "step": 3431 + }, + { + "epoch": 0.2596950550489955, + "grad_norm": 2.090794324874878, + "learning_rate": 1.5921508307212445e-05, + "loss": 0.7527, + "step": 3432 + }, + { + "epoch": 0.25977072377132915, + "grad_norm": 2.188838005065918, + "learning_rate": 1.592006851648892e-05, + "loss": 0.8538, + "step": 3433 + }, + { + "epoch": 0.25984639249366276, + "grad_norm": 2.623730182647705, + "learning_rate": 1.5918628292408323e-05, + "loss": 1.0331, + "step": 3434 + }, + { + "epoch": 0.25992206121599637, + "grad_norm": 1.948943853378296, + "learning_rate": 1.591718763506084e-05, + "loss": 0.705, + "step": 3435 + }, + { + "epoch": 0.25999772993833, + "grad_norm": 2.0423173904418945, + "learning_rate": 1.59157465445367e-05, + "loss": 0.7737, + "step": 3436 + }, + { + "epoch": 0.2600733986606636, + "grad_norm": 3.1950523853302, + "learning_rate": 1.591430502092614e-05, + 
"loss": 0.7861, + "step": 3437 + }, + { + "epoch": 0.26014906738299726, + "grad_norm": 1.802661657333374, + "learning_rate": 1.5912863064319437e-05, + "loss": 0.6932, + "step": 3438 + }, + { + "epoch": 0.26022473610533087, + "grad_norm": 2.07924485206604, + "learning_rate": 1.591142067480689e-05, + "loss": 0.7227, + "step": 3439 + }, + { + "epoch": 0.2603004048276645, + "grad_norm": 2.3398637771606445, + "learning_rate": 1.5909977852478826e-05, + "loss": 0.8542, + "step": 3440 + }, + { + "epoch": 0.2603760735499981, + "grad_norm": 2.2240166664123535, + "learning_rate": 1.5908534597425597e-05, + "loss": 0.6994, + "step": 3441 + }, + { + "epoch": 0.26045174227233175, + "grad_norm": 2.661499261856079, + "learning_rate": 1.5907090909737592e-05, + "loss": 0.7015, + "step": 3442 + }, + { + "epoch": 0.26052741099466536, + "grad_norm": 2.6953768730163574, + "learning_rate": 1.590564678950521e-05, + "loss": 0.8241, + "step": 3443 + }, + { + "epoch": 0.260603079716999, + "grad_norm": 2.4903414249420166, + "learning_rate": 1.590420223681889e-05, + "loss": 0.648, + "step": 3444 + }, + { + "epoch": 0.2606787484393326, + "grad_norm": 1.9942926168441772, + "learning_rate": 1.5902757251769097e-05, + "loss": 0.7933, + "step": 3445 + }, + { + "epoch": 0.26075441716166625, + "grad_norm": 2.222245931625366, + "learning_rate": 1.590131183444632e-05, + "loss": 0.8263, + "step": 3446 + }, + { + "epoch": 0.26083008588399986, + "grad_norm": 2.2102739810943604, + "learning_rate": 1.589986598494107e-05, + "loss": 0.7452, + "step": 3447 + }, + { + "epoch": 0.2609057546063335, + "grad_norm": 2.0333497524261475, + "learning_rate": 1.5898419703343896e-05, + "loss": 0.7399, + "step": 3448 + }, + { + "epoch": 0.2609814233286671, + "grad_norm": 3.434465169906616, + "learning_rate": 1.5896972989745372e-05, + "loss": 0.5499, + "step": 3449 + }, + { + "epoch": 0.2610570920510007, + "grad_norm": 1.980613112449646, + "learning_rate": 1.589552584423609e-05, + "loss": 0.7486, + "step": 3450 + }, + { + "epoch": 0.26113276077333436, + "grad_norm": 2.2190256118774414, + "learning_rate": 1.5894078266906676e-05, + "loss": 0.7358, + "step": 3451 + }, + { + "epoch": 0.26120842949566797, + "grad_norm": 2.138643264770508, + "learning_rate": 1.5892630257847783e-05, + "loss": 0.7376, + "step": 3452 + }, + { + "epoch": 0.2612840982180016, + "grad_norm": 2.1247470378875732, + "learning_rate": 1.589118181715009e-05, + "loss": 0.5772, + "step": 3453 + }, + { + "epoch": 0.2613597669403352, + "grad_norm": 2.137392520904541, + "learning_rate": 1.58897329449043e-05, + "loss": 0.8956, + "step": 3454 + }, + { + "epoch": 0.26143543566266886, + "grad_norm": 2.029174327850342, + "learning_rate": 1.588828364120115e-05, + "loss": 0.7217, + "step": 3455 + }, + { + "epoch": 0.26151110438500247, + "grad_norm": 1.8149274587631226, + "learning_rate": 1.5886833906131404e-05, + "loss": 0.9841, + "step": 3456 + }, + { + "epoch": 0.2615867731073361, + "grad_norm": 2.2353672981262207, + "learning_rate": 1.588538373978584e-05, + "loss": 0.7479, + "step": 3457 + }, + { + "epoch": 0.2616624418296697, + "grad_norm": 2.4940500259399414, + "learning_rate": 1.5883933142255276e-05, + "loss": 0.6687, + "step": 3458 + }, + { + "epoch": 0.26173811055200336, + "grad_norm": 2.020583391189575, + "learning_rate": 1.5882482113630554e-05, + "loss": 0.7496, + "step": 3459 + }, + { + "epoch": 0.26181377927433697, + "grad_norm": 2.2853190898895264, + "learning_rate": 1.5881030654002542e-05, + "loss": 0.7181, + "step": 3460 + }, + { + "epoch": 0.2618894479966706, + "grad_norm": 
2.1658430099487305, + "learning_rate": 1.5879578763462135e-05, + "loss": 0.7435, + "step": 3461 + }, + { + "epoch": 0.2619651167190042, + "grad_norm": 2.1128828525543213, + "learning_rate": 1.5878126442100252e-05, + "loss": 0.7692, + "step": 3462 + }, + { + "epoch": 0.2620407854413378, + "grad_norm": 2.092738628387451, + "learning_rate": 1.5876673690007848e-05, + "loss": 0.6514, + "step": 3463 + }, + { + "epoch": 0.26211645416367146, + "grad_norm": 2.4423155784606934, + "learning_rate": 1.587522050727589e-05, + "loss": 0.8233, + "step": 3464 + }, + { + "epoch": 0.2621921228860051, + "grad_norm": 2.969520092010498, + "learning_rate": 1.5873766893995392e-05, + "loss": 0.7755, + "step": 3465 + }, + { + "epoch": 0.2622677916083387, + "grad_norm": 2.1669178009033203, + "learning_rate": 1.5872312850257378e-05, + "loss": 0.8578, + "step": 3466 + }, + { + "epoch": 0.2623434603306723, + "grad_norm": 2.1200549602508545, + "learning_rate": 1.5870858376152904e-05, + "loss": 0.8403, + "step": 3467 + }, + { + "epoch": 0.26241912905300596, + "grad_norm": 2.184720039367676, + "learning_rate": 1.5869403471773058e-05, + "loss": 0.6616, + "step": 3468 + }, + { + "epoch": 0.2624947977753396, + "grad_norm": 2.18776798248291, + "learning_rate": 1.5867948137208945e-05, + "loss": 0.8047, + "step": 3469 + }, + { + "epoch": 0.2625704664976732, + "grad_norm": 2.2961819171905518, + "learning_rate": 1.5866492372551707e-05, + "loss": 0.8281, + "step": 3470 + }, + { + "epoch": 0.2626461352200068, + "grad_norm": 2.440213918685913, + "learning_rate": 1.5865036177892508e-05, + "loss": 0.7852, + "step": 3471 + }, + { + "epoch": 0.26272180394234046, + "grad_norm": 2.684682846069336, + "learning_rate": 1.586357955332254e-05, + "loss": 0.7171, + "step": 3472 + }, + { + "epoch": 0.26279747266467407, + "grad_norm": 2.211758613586426, + "learning_rate": 1.5862122498933016e-05, + "loss": 0.8172, + "step": 3473 + }, + { + "epoch": 0.2628731413870077, + "grad_norm": 2.2629692554473877, + "learning_rate": 1.5860665014815192e-05, + "loss": 0.7832, + "step": 3474 + }, + { + "epoch": 0.2629488101093413, + "grad_norm": 2.0024826526641846, + "learning_rate": 1.5859207101060336e-05, + "loss": 0.8227, + "step": 3475 + }, + { + "epoch": 0.2630244788316749, + "grad_norm": 2.5234375, + "learning_rate": 1.585774875775974e-05, + "loss": 0.8759, + "step": 3476 + }, + { + "epoch": 0.26310014755400857, + "grad_norm": 2.879760503768921, + "learning_rate": 1.585628998500474e-05, + "loss": 0.7474, + "step": 3477 + }, + { + "epoch": 0.2631758162763422, + "grad_norm": 2.5393941402435303, + "learning_rate": 1.5854830782886686e-05, + "loss": 0.8035, + "step": 3478 + }, + { + "epoch": 0.2632514849986758, + "grad_norm": 2.9086737632751465, + "learning_rate": 1.5853371151496956e-05, + "loss": 0.7489, + "step": 3479 + }, + { + "epoch": 0.2633271537210094, + "grad_norm": 2.603519916534424, + "learning_rate": 1.5851911090926957e-05, + "loss": 0.8353, + "step": 3480 + }, + { + "epoch": 0.26340282244334307, + "grad_norm": 3.2521355152130127, + "learning_rate": 1.5850450601268123e-05, + "loss": 0.8392, + "step": 3481 + }, + { + "epoch": 0.2634784911656767, + "grad_norm": 1.8981279134750366, + "learning_rate": 1.5848989682611916e-05, + "loss": 0.7701, + "step": 3482 + }, + { + "epoch": 0.2635541598880103, + "grad_norm": 2.0759172439575195, + "learning_rate": 1.5847528335049825e-05, + "loss": 0.8041, + "step": 3483 + }, + { + "epoch": 0.2636298286103439, + "grad_norm": 2.702728748321533, + "learning_rate": 1.584606655867336e-05, + "loss": 0.8192, + "step": 3484 + }, 
+ { + "epoch": 0.26370549733267756, + "grad_norm": 2.305008888244629, + "learning_rate": 1.5844604353574065e-05, + "loss": 0.788, + "step": 3485 + }, + { + "epoch": 0.2637811660550112, + "grad_norm": 2.113942861557007, + "learning_rate": 1.5843141719843506e-05, + "loss": 0.8344, + "step": 3486 + }, + { + "epoch": 0.2638568347773448, + "grad_norm": 1.9294511079788208, + "learning_rate": 1.584167865757328e-05, + "loss": 0.6665, + "step": 3487 + }, + { + "epoch": 0.2639325034996784, + "grad_norm": 2.4283103942871094, + "learning_rate": 1.584021516685501e-05, + "loss": 0.7448, + "step": 3488 + }, + { + "epoch": 0.264008172222012, + "grad_norm": 2.7982795238494873, + "learning_rate": 1.583875124778034e-05, + "loss": 0.797, + "step": 3489 + }, + { + "epoch": 0.26408384094434567, + "grad_norm": 2.033411741256714, + "learning_rate": 1.5837286900440946e-05, + "loss": 0.6918, + "step": 3490 + }, + { + "epoch": 0.2641595096666793, + "grad_norm": 1.9575029611587524, + "learning_rate": 1.5835822124928536e-05, + "loss": 0.7613, + "step": 3491 + }, + { + "epoch": 0.2642351783890129, + "grad_norm": 2.232651472091675, + "learning_rate": 1.583435692133483e-05, + "loss": 0.7767, + "step": 3492 + }, + { + "epoch": 0.2643108471113465, + "grad_norm": 2.3022804260253906, + "learning_rate": 1.5832891289751595e-05, + "loss": 0.6333, + "step": 3493 + }, + { + "epoch": 0.26438651583368017, + "grad_norm": 2.3175253868103027, + "learning_rate": 1.58314252302706e-05, + "loss": 0.6724, + "step": 3494 + }, + { + "epoch": 0.2644621845560138, + "grad_norm": 2.0862104892730713, + "learning_rate": 1.5829958742983665e-05, + "loss": 0.7843, + "step": 3495 + }, + { + "epoch": 0.2645378532783474, + "grad_norm": 2.384624481201172, + "learning_rate": 1.5828491827982625e-05, + "loss": 0.6976, + "step": 3496 + }, + { + "epoch": 0.264613522000681, + "grad_norm": 2.224984645843506, + "learning_rate": 1.5827024485359337e-05, + "loss": 0.7435, + "step": 3497 + }, + { + "epoch": 0.26468919072301467, + "grad_norm": 2.280029535293579, + "learning_rate": 1.5825556715205696e-05, + "loss": 0.7737, + "step": 3498 + }, + { + "epoch": 0.2647648594453483, + "grad_norm": 2.348893642425537, + "learning_rate": 1.5824088517613618e-05, + "loss": 0.7458, + "step": 3499 + }, + { + "epoch": 0.2648405281676819, + "grad_norm": 2.458357334136963, + "learning_rate": 1.5822619892675042e-05, + "loss": 0.7923, + "step": 3500 + }, + { + "epoch": 0.2649161968900155, + "grad_norm": 2.259758710861206, + "learning_rate": 1.5821150840481944e-05, + "loss": 0.9079, + "step": 3501 + }, + { + "epoch": 0.2649918656123491, + "grad_norm": 2.3717095851898193, + "learning_rate": 1.5819681361126315e-05, + "loss": 0.9236, + "step": 3502 + }, + { + "epoch": 0.2650675343346828, + "grad_norm": 1.9060953855514526, + "learning_rate": 1.5818211454700185e-05, + "loss": 0.6778, + "step": 3503 + }, + { + "epoch": 0.2651432030570164, + "grad_norm": 2.2435457706451416, + "learning_rate": 1.5816741121295602e-05, + "loss": 0.7405, + "step": 3504 + }, + { + "epoch": 0.26521887177935, + "grad_norm": 1.7485630512237549, + "learning_rate": 1.5815270361004638e-05, + "loss": 0.6545, + "step": 3505 + }, + { + "epoch": 0.2652945405016836, + "grad_norm": 2.0698351860046387, + "learning_rate": 1.5813799173919403e-05, + "loss": 0.7109, + "step": 3506 + }, + { + "epoch": 0.2653702092240173, + "grad_norm": 2.1378233432769775, + "learning_rate": 1.5812327560132024e-05, + "loss": 0.639, + "step": 3507 + }, + { + "epoch": 0.2654458779463509, + "grad_norm": 2.570868492126465, + "learning_rate": 
1.581085551973466e-05, + "loss": 0.8756, + "step": 3508 + }, + { + "epoch": 0.2655215466686845, + "grad_norm": 2.893656015396118, + "learning_rate": 1.5809383052819496e-05, + "loss": 0.812, + "step": 3509 + }, + { + "epoch": 0.2655972153910181, + "grad_norm": 2.2518537044525146, + "learning_rate": 1.580791015947874e-05, + "loss": 0.8163, + "step": 3510 + }, + { + "epoch": 0.26567288411335177, + "grad_norm": 2.2821381092071533, + "learning_rate": 1.580643683980463e-05, + "loss": 0.6801, + "step": 3511 + }, + { + "epoch": 0.2657485528356854, + "grad_norm": 2.068220376968384, + "learning_rate": 1.580496309388943e-05, + "loss": 0.8859, + "step": 3512 + }, + { + "epoch": 0.265824221558019, + "grad_norm": 2.3530995845794678, + "learning_rate": 1.580348892182543e-05, + "loss": 0.8019, + "step": 3513 + }, + { + "epoch": 0.2658998902803526, + "grad_norm": 2.0066163539886475, + "learning_rate": 1.580201432370495e-05, + "loss": 0.7801, + "step": 3514 + }, + { + "epoch": 0.2659755590026862, + "grad_norm": 2.1849310398101807, + "learning_rate": 1.5800539299620333e-05, + "loss": 0.8071, + "step": 3515 + }, + { + "epoch": 0.2660512277250199, + "grad_norm": 2.290064573287964, + "learning_rate": 1.5799063849663948e-05, + "loss": 0.6413, + "step": 3516 + }, + { + "epoch": 0.2661268964473535, + "grad_norm": 2.0501484870910645, + "learning_rate": 1.5797587973928197e-05, + "loss": 0.6741, + "step": 3517 + }, + { + "epoch": 0.2662025651696871, + "grad_norm": 2.559082508087158, + "learning_rate": 1.57961116725055e-05, + "loss": 0.7717, + "step": 3518 + }, + { + "epoch": 0.2662782338920207, + "grad_norm": 2.377612829208374, + "learning_rate": 1.579463494548831e-05, + "loss": 0.7944, + "step": 3519 + }, + { + "epoch": 0.2663539026143544, + "grad_norm": 2.3611176013946533, + "learning_rate": 1.57931577929691e-05, + "loss": 0.7483, + "step": 3520 + }, + { + "epoch": 0.266429571336688, + "grad_norm": 1.9215701818466187, + "learning_rate": 1.5791680215040376e-05, + "loss": 0.6026, + "step": 3521 + }, + { + "epoch": 0.2665052400590216, + "grad_norm": 2.0787158012390137, + "learning_rate": 1.5790202211794675e-05, + "loss": 0.7157, + "step": 3522 + }, + { + "epoch": 0.2665809087813552, + "grad_norm": 2.295515298843384, + "learning_rate": 1.5788723783324546e-05, + "loss": 0.7047, + "step": 3523 + }, + { + "epoch": 0.2666565775036889, + "grad_norm": 1.9624741077423096, + "learning_rate": 1.5787244929722578e-05, + "loss": 0.7119, + "step": 3524 + }, + { + "epoch": 0.2667322462260225, + "grad_norm": 2.571765661239624, + "learning_rate": 1.5785765651081377e-05, + "loss": 0.7344, + "step": 3525 + }, + { + "epoch": 0.2668079149483561, + "grad_norm": 3.0177741050720215, + "learning_rate": 1.5784285947493585e-05, + "loss": 0.9012, + "step": 3526 + }, + { + "epoch": 0.2668835836706897, + "grad_norm": 2.370260238647461, + "learning_rate": 1.5782805819051865e-05, + "loss": 0.838, + "step": 3527 + }, + { + "epoch": 0.2669592523930233, + "grad_norm": 2.122828722000122, + "learning_rate": 1.5781325265848906e-05, + "loss": 0.8281, + "step": 3528 + }, + { + "epoch": 0.267034921115357, + "grad_norm": 1.9243050813674927, + "learning_rate": 1.5779844287977424e-05, + "loss": 0.7285, + "step": 3529 + }, + { + "epoch": 0.2671105898376906, + "grad_norm": 2.585332155227661, + "learning_rate": 1.577836288553016e-05, + "loss": 0.8387, + "step": 3530 + }, + { + "epoch": 0.2671862585600242, + "grad_norm": 2.6601555347442627, + "learning_rate": 1.5776881058599897e-05, + "loss": 0.7493, + "step": 3531 + }, + { + "epoch": 0.2672619272823578, + 
"grad_norm": 2.395071029663086, + "learning_rate": 1.577539880727942e-05, + "loss": 0.8335, + "step": 3532 + }, + { + "epoch": 0.2673375960046915, + "grad_norm": 1.9745339155197144, + "learning_rate": 1.5773916131661553e-05, + "loss": 0.6619, + "step": 3533 + }, + { + "epoch": 0.2674132647270251, + "grad_norm": 2.2702884674072266, + "learning_rate": 1.577243303183915e-05, + "loss": 0.7704, + "step": 3534 + }, + { + "epoch": 0.2674889334493587, + "grad_norm": 2.0734710693359375, + "learning_rate": 1.5770949507905085e-05, + "loss": 0.7202, + "step": 3535 + }, + { + "epoch": 0.2675646021716923, + "grad_norm": 1.9147382974624634, + "learning_rate": 1.576946555995226e-05, + "loss": 0.6244, + "step": 3536 + }, + { + "epoch": 0.267640270894026, + "grad_norm": 2.5039188861846924, + "learning_rate": 1.576798118807361e-05, + "loss": 0.6814, + "step": 3537 + }, + { + "epoch": 0.2677159396163596, + "grad_norm": 2.3731849193573, + "learning_rate": 1.5766496392362088e-05, + "loss": 0.7602, + "step": 3538 + }, + { + "epoch": 0.2677916083386932, + "grad_norm": 2.7606589794158936, + "learning_rate": 1.5765011172910676e-05, + "loss": 0.7816, + "step": 3539 + }, + { + "epoch": 0.2678672770610268, + "grad_norm": 2.252899408340454, + "learning_rate": 1.576352552981238e-05, + "loss": 0.8885, + "step": 3540 + }, + { + "epoch": 0.2679429457833605, + "grad_norm": 2.393841505050659, + "learning_rate": 1.5762039463160244e-05, + "loss": 0.7985, + "step": 3541 + }, + { + "epoch": 0.2680186145056941, + "grad_norm": 2.1034390926361084, + "learning_rate": 1.5760552973047324e-05, + "loss": 0.7088, + "step": 3542 + }, + { + "epoch": 0.2680942832280277, + "grad_norm": 2.355592727661133, + "learning_rate": 1.5759066059566708e-05, + "loss": 0.7645, + "step": 3543 + }, + { + "epoch": 0.2681699519503613, + "grad_norm": 2.2355756759643555, + "learning_rate": 1.575757872281152e-05, + "loss": 0.745, + "step": 3544 + }, + { + "epoch": 0.2682456206726949, + "grad_norm": 2.194815158843994, + "learning_rate": 1.5756090962874887e-05, + "loss": 0.5606, + "step": 3545 + }, + { + "epoch": 0.2683212893950286, + "grad_norm": 2.1128334999084473, + "learning_rate": 1.5754602779849992e-05, + "loss": 0.7998, + "step": 3546 + }, + { + "epoch": 0.2683969581173622, + "grad_norm": 2.3708901405334473, + "learning_rate": 1.5753114173830024e-05, + "loss": 0.8299, + "step": 3547 + }, + { + "epoch": 0.2684726268396958, + "grad_norm": 2.555021286010742, + "learning_rate": 1.5751625144908203e-05, + "loss": 0.7638, + "step": 3548 + }, + { + "epoch": 0.2685482955620294, + "grad_norm": 2.081984758377075, + "learning_rate": 1.5750135693177777e-05, + "loss": 0.7852, + "step": 3549 + }, + { + "epoch": 0.2686239642843631, + "grad_norm": 2.4795191287994385, + "learning_rate": 1.5748645818732025e-05, + "loss": 0.7854, + "step": 3550 + }, + { + "epoch": 0.2686996330066967, + "grad_norm": 2.0074732303619385, + "learning_rate": 1.574715552166424e-05, + "loss": 0.8557, + "step": 3551 + }, + { + "epoch": 0.2687753017290303, + "grad_norm": 2.3227956295013428, + "learning_rate": 1.5745664802067755e-05, + "loss": 0.6981, + "step": 3552 + }, + { + "epoch": 0.2688509704513639, + "grad_norm": 3.9149978160858154, + "learning_rate": 1.5744173660035923e-05, + "loss": 0.7036, + "step": 3553 + }, + { + "epoch": 0.2689266391736976, + "grad_norm": 2.3597822189331055, + "learning_rate": 1.574268209566212e-05, + "loss": 0.7453, + "step": 3554 + }, + { + "epoch": 0.2690023078960312, + "grad_norm": 3.120544672012329, + "learning_rate": 1.574119010903976e-05, + "loss": 0.7463, + "step": 
3555 + }, + { + "epoch": 0.2690779766183648, + "grad_norm": 2.2731995582580566, + "learning_rate": 1.573969770026227e-05, + "loss": 0.7326, + "step": 3556 + }, + { + "epoch": 0.2691536453406984, + "grad_norm": 3.586319923400879, + "learning_rate": 1.5738204869423107e-05, + "loss": 0.7289, + "step": 3557 + }, + { + "epoch": 0.269229314063032, + "grad_norm": 2.4169609546661377, + "learning_rate": 1.5736711616615765e-05, + "loss": 0.8624, + "step": 3558 + }, + { + "epoch": 0.2693049827853657, + "grad_norm": 2.7807676792144775, + "learning_rate": 1.5735217941933754e-05, + "loss": 0.7344, + "step": 3559 + }, + { + "epoch": 0.2693806515076993, + "grad_norm": 2.2362794876098633, + "learning_rate": 1.5733723845470606e-05, + "loss": 0.8274, + "step": 3560 + }, + { + "epoch": 0.2694563202300329, + "grad_norm": 2.450251817703247, + "learning_rate": 1.5732229327319895e-05, + "loss": 0.7416, + "step": 3561 + }, + { + "epoch": 0.2695319889523665, + "grad_norm": 2.79219126701355, + "learning_rate": 1.573073438757521e-05, + "loss": 0.6722, + "step": 3562 + }, + { + "epoch": 0.2696076576747002, + "grad_norm": 2.2558610439300537, + "learning_rate": 1.5729239026330167e-05, + "loss": 0.5821, + "step": 3563 + }, + { + "epoch": 0.2696833263970338, + "grad_norm": 2.438255786895752, + "learning_rate": 1.572774324367841e-05, + "loss": 0.8141, + "step": 3564 + }, + { + "epoch": 0.2697589951193674, + "grad_norm": 2.1649351119995117, + "learning_rate": 1.572624703971361e-05, + "loss": 0.6348, + "step": 3565 + }, + { + "epoch": 0.269834663841701, + "grad_norm": 2.2655069828033447, + "learning_rate": 1.5724750414529466e-05, + "loss": 0.6662, + "step": 3566 + }, + { + "epoch": 0.2699103325640347, + "grad_norm": 2.0114426612854004, + "learning_rate": 1.57232533682197e-05, + "loss": 0.7854, + "step": 3567 + }, + { + "epoch": 0.2699860012863683, + "grad_norm": 2.8764772415161133, + "learning_rate": 1.5721755900878062e-05, + "loss": 0.6343, + "step": 3568 + }, + { + "epoch": 0.2700616700087019, + "grad_norm": 2.2476704120635986, + "learning_rate": 1.5720258012598332e-05, + "loss": 0.9252, + "step": 3569 + }, + { + "epoch": 0.2701373387310355, + "grad_norm": 2.010709047317505, + "learning_rate": 1.5718759703474307e-05, + "loss": 0.639, + "step": 3570 + }, + { + "epoch": 0.2702130074533691, + "grad_norm": 2.497084617614746, + "learning_rate": 1.571726097359982e-05, + "loss": 0.7162, + "step": 3571 + }, + { + "epoch": 0.2702886761757028, + "grad_norm": 2.4472103118896484, + "learning_rate": 1.571576182306872e-05, + "loss": 0.796, + "step": 3572 + }, + { + "epoch": 0.2703643448980364, + "grad_norm": 2.1636784076690674, + "learning_rate": 1.5714262251974896e-05, + "loss": 0.7544, + "step": 3573 + }, + { + "epoch": 0.27044001362037, + "grad_norm": 2.9408183097839355, + "learning_rate": 1.5712762260412256e-05, + "loss": 0.9256, + "step": 3574 + }, + { + "epoch": 0.2705156823427036, + "grad_norm": 1.9260292053222656, + "learning_rate": 1.571126184847473e-05, + "loss": 0.893, + "step": 3575 + }, + { + "epoch": 0.2705913510650373, + "grad_norm": 3.143998861312866, + "learning_rate": 1.5709761016256277e-05, + "loss": 0.7053, + "step": 3576 + }, + { + "epoch": 0.2706670197873709, + "grad_norm": 2.45214581489563, + "learning_rate": 1.570825976385089e-05, + "loss": 0.7357, + "step": 3577 + }, + { + "epoch": 0.2707426885097045, + "grad_norm": 2.4344959259033203, + "learning_rate": 1.570675809135258e-05, + "loss": 0.7794, + "step": 3578 + }, + { + "epoch": 0.2708183572320381, + "grad_norm": 2.231781005859375, + "learning_rate": 
1.5705255998855384e-05, + "loss": 0.8033, + "step": 3579 + }, + { + "epoch": 0.2708940259543718, + "grad_norm": 2.1317062377929688, + "learning_rate": 1.570375348645337e-05, + "loss": 0.7642, + "step": 3580 + }, + { + "epoch": 0.2709696946767054, + "grad_norm": 2.1097607612609863, + "learning_rate": 1.570225055424063e-05, + "loss": 0.6544, + "step": 3581 + }, + { + "epoch": 0.271045363399039, + "grad_norm": 3.234607219696045, + "learning_rate": 1.5700747202311284e-05, + "loss": 0.8878, + "step": 3582 + }, + { + "epoch": 0.2711210321213726, + "grad_norm": 2.072981119155884, + "learning_rate": 1.5699243430759477e-05, + "loss": 0.692, + "step": 3583 + }, + { + "epoch": 0.27119670084370623, + "grad_norm": 2.0358352661132812, + "learning_rate": 1.5697739239679374e-05, + "loss": 0.7518, + "step": 3584 + }, + { + "epoch": 0.2712723695660399, + "grad_norm": 2.821546792984009, + "learning_rate": 1.569623462916518e-05, + "loss": 0.8253, + "step": 3585 + }, + { + "epoch": 0.2713480382883735, + "grad_norm": 2.268690824508667, + "learning_rate": 1.569472959931111e-05, + "loss": 0.7328, + "step": 3586 + }, + { + "epoch": 0.2714237070107071, + "grad_norm": 2.1014082431793213, + "learning_rate": 1.5693224150211427e-05, + "loss": 0.7633, + "step": 3587 + }, + { + "epoch": 0.2714993757330407, + "grad_norm": 2.4852254390716553, + "learning_rate": 1.5691718281960395e-05, + "loss": 0.6053, + "step": 3588 + }, + { + "epoch": 0.2715750444553744, + "grad_norm": 2.0812735557556152, + "learning_rate": 1.569021199465232e-05, + "loss": 0.6779, + "step": 3589 + }, + { + "epoch": 0.271650713177708, + "grad_norm": 2.4176812171936035, + "learning_rate": 1.5688705288381533e-05, + "loss": 0.6303, + "step": 3590 + }, + { + "epoch": 0.2717263819000416, + "grad_norm": 2.428131103515625, + "learning_rate": 1.5687198163242388e-05, + "loss": 0.6474, + "step": 3591 + }, + { + "epoch": 0.2718020506223752, + "grad_norm": 2.6284077167510986, + "learning_rate": 1.568569061932926e-05, + "loss": 0.6688, + "step": 3592 + }, + { + "epoch": 0.2718777193447089, + "grad_norm": 2.674342155456543, + "learning_rate": 1.5684182656736566e-05, + "loss": 0.7523, + "step": 3593 + }, + { + "epoch": 0.2719533880670425, + "grad_norm": 2.8079347610473633, + "learning_rate": 1.5682674275558734e-05, + "loss": 0.7762, + "step": 3594 + }, + { + "epoch": 0.2720290567893761, + "grad_norm": 2.347954273223877, + "learning_rate": 1.568116547589022e-05, + "loss": 0.7758, + "step": 3595 + }, + { + "epoch": 0.2721047255117097, + "grad_norm": 2.2305753231048584, + "learning_rate": 1.567965625782552e-05, + "loss": 0.8519, + "step": 3596 + }, + { + "epoch": 0.27218039423404333, + "grad_norm": 2.6532304286956787, + "learning_rate": 1.567814662145914e-05, + "loss": 0.6987, + "step": 3597 + }, + { + "epoch": 0.272256062956377, + "grad_norm": 3.169668197631836, + "learning_rate": 1.5676636566885616e-05, + "loss": 0.7087, + "step": 3598 + }, + { + "epoch": 0.2723317316787106, + "grad_norm": 2.8570854663848877, + "learning_rate": 1.5675126094199516e-05, + "loss": 0.7527, + "step": 3599 + }, + { + "epoch": 0.2724074004010442, + "grad_norm": 2.194880247116089, + "learning_rate": 1.567361520349543e-05, + "loss": 0.7146, + "step": 3600 + }, + { + "epoch": 0.27248306912337783, + "grad_norm": 2.3188719749450684, + "learning_rate": 1.5672103894867978e-05, + "loss": 0.8719, + "step": 3601 + }, + { + "epoch": 0.2725587378457115, + "grad_norm": 2.677793502807617, + "learning_rate": 1.5670592168411797e-05, + "loss": 0.7991, + "step": 3602 + }, + { + "epoch": 0.2726344065680451, + 
"grad_norm": 2.2928788661956787, + "learning_rate": 1.566908002422156e-05, + "loss": 0.767, + "step": 3603 + }, + { + "epoch": 0.2727100752903787, + "grad_norm": 2.1832549571990967, + "learning_rate": 1.566756746239196e-05, + "loss": 0.7049, + "step": 3604 + }, + { + "epoch": 0.27278574401271233, + "grad_norm": 1.460695743560791, + "learning_rate": 1.5666054483017722e-05, + "loss": 0.8869, + "step": 3605 + }, + { + "epoch": 0.272861412735046, + "grad_norm": 2.577636241912842, + "learning_rate": 1.566454108619359e-05, + "loss": 0.7205, + "step": 3606 + }, + { + "epoch": 0.2729370814573796, + "grad_norm": 1.833762764930725, + "learning_rate": 1.5663027272014337e-05, + "loss": 0.9315, + "step": 3607 + }, + { + "epoch": 0.2730127501797132, + "grad_norm": 1.9866582155227661, + "learning_rate": 1.566151304057477e-05, + "loss": 0.6925, + "step": 3608 + }, + { + "epoch": 0.2730884189020468, + "grad_norm": 2.173614501953125, + "learning_rate": 1.565999839196971e-05, + "loss": 0.8448, + "step": 3609 + }, + { + "epoch": 0.27316408762438044, + "grad_norm": 1.7537118196487427, + "learning_rate": 1.5658483326294008e-05, + "loss": 0.6487, + "step": 3610 + }, + { + "epoch": 0.2732397563467141, + "grad_norm": 1.7981582880020142, + "learning_rate": 1.5656967843642544e-05, + "loss": 0.8801, + "step": 3611 + }, + { + "epoch": 0.2733154250690477, + "grad_norm": 2.380213499069214, + "learning_rate": 1.5655451944110223e-05, + "loss": 0.7393, + "step": 3612 + }, + { + "epoch": 0.2733910937913813, + "grad_norm": 2.3052456378936768, + "learning_rate": 1.5653935627791976e-05, + "loss": 0.8234, + "step": 3613 + }, + { + "epoch": 0.27346676251371493, + "grad_norm": 2.1715638637542725, + "learning_rate": 1.5652418894782755e-05, + "loss": 0.6953, + "step": 3614 + }, + { + "epoch": 0.2735424312360486, + "grad_norm": 2.075761079788208, + "learning_rate": 1.565090174517755e-05, + "loss": 0.662, + "step": 3615 + }, + { + "epoch": 0.2736180999583822, + "grad_norm": 1.88273024559021, + "learning_rate": 1.5649384179071363e-05, + "loss": 0.6704, + "step": 3616 + }, + { + "epoch": 0.2736937686807158, + "grad_norm": 2.0616722106933594, + "learning_rate": 1.5647866196559234e-05, + "loss": 0.6941, + "step": 3617 + }, + { + "epoch": 0.27376943740304943, + "grad_norm": 2.087221145629883, + "learning_rate": 1.564634779773622e-05, + "loss": 0.866, + "step": 3618 + }, + { + "epoch": 0.2738451061253831, + "grad_norm": 3.6184213161468506, + "learning_rate": 1.5644828982697413e-05, + "loss": 0.6817, + "step": 3619 + }, + { + "epoch": 0.2739207748477167, + "grad_norm": 1.8233073949813843, + "learning_rate": 1.5643309751537922e-05, + "loss": 0.7882, + "step": 3620 + }, + { + "epoch": 0.2739964435700503, + "grad_norm": 2.140226364135742, + "learning_rate": 1.564179010435289e-05, + "loss": 0.7102, + "step": 3621 + }, + { + "epoch": 0.27407211229238393, + "grad_norm": 1.8913706541061401, + "learning_rate": 1.5640270041237475e-05, + "loss": 0.7444, + "step": 3622 + }, + { + "epoch": 0.27414778101471754, + "grad_norm": 2.4049935340881348, + "learning_rate": 1.5638749562286875e-05, + "loss": 0.7232, + "step": 3623 + }, + { + "epoch": 0.2742234497370512, + "grad_norm": 2.22857666015625, + "learning_rate": 1.5637228667596302e-05, + "loss": 0.7192, + "step": 3624 + }, + { + "epoch": 0.2742991184593848, + "grad_norm": 2.1087875366210938, + "learning_rate": 1.5635707357261007e-05, + "loss": 0.6453, + "step": 3625 + }, + { + "epoch": 0.2743747871817184, + "grad_norm": 3.1201884746551514, + "learning_rate": 1.563418563137625e-05, + "loss": 0.7145, + 
"step": 3626 + }, + { + "epoch": 0.27445045590405204, + "grad_norm": 2.400087356567383, + "learning_rate": 1.5632663490037334e-05, + "loss": 0.747, + "step": 3627 + }, + { + "epoch": 0.2745261246263857, + "grad_norm": 2.568697214126587, + "learning_rate": 1.563114093333958e-05, + "loss": 0.7742, + "step": 3628 + }, + { + "epoch": 0.2746017933487193, + "grad_norm": 2.265756607055664, + "learning_rate": 1.562961796137833e-05, + "loss": 0.8239, + "step": 3629 + }, + { + "epoch": 0.2746774620710529, + "grad_norm": 2.1188042163848877, + "learning_rate": 1.5628094574248962e-05, + "loss": 0.818, + "step": 3630 + }, + { + "epoch": 0.27475313079338654, + "grad_norm": 2.4410367012023926, + "learning_rate": 1.562657077204687e-05, + "loss": 0.8349, + "step": 3631 + }, + { + "epoch": 0.2748287995157202, + "grad_norm": 2.043889284133911, + "learning_rate": 1.562504655486749e-05, + "loss": 0.7269, + "step": 3632 + }, + { + "epoch": 0.2749044682380538, + "grad_norm": 2.041012763977051, + "learning_rate": 1.5623521922806263e-05, + "loss": 0.7347, + "step": 3633 + }, + { + "epoch": 0.2749801369603874, + "grad_norm": 2.0784006118774414, + "learning_rate": 1.5621996875958668e-05, + "loss": 0.5876, + "step": 3634 + }, + { + "epoch": 0.27505580568272103, + "grad_norm": 2.4200499057769775, + "learning_rate": 1.5620471414420212e-05, + "loss": 0.6634, + "step": 3635 + }, + { + "epoch": 0.27513147440505464, + "grad_norm": 2.1126153469085693, + "learning_rate": 1.5618945538286423e-05, + "loss": 0.801, + "step": 3636 + }, + { + "epoch": 0.2752071431273883, + "grad_norm": 2.4012205600738525, + "learning_rate": 1.561741924765286e-05, + "loss": 0.7216, + "step": 3637 + }, + { + "epoch": 0.2752828118497219, + "grad_norm": 2.6323201656341553, + "learning_rate": 1.5615892542615095e-05, + "loss": 0.7527, + "step": 3638 + }, + { + "epoch": 0.27535848057205553, + "grad_norm": 2.035029649734497, + "learning_rate": 1.5614365423268742e-05, + "loss": 0.9722, + "step": 3639 + }, + { + "epoch": 0.27543414929438914, + "grad_norm": 1.8150397539138794, + "learning_rate": 1.561283788970943e-05, + "loss": 0.6651, + "step": 3640 + }, + { + "epoch": 0.2755098180167228, + "grad_norm": 2.298706293106079, + "learning_rate": 1.5611309942032827e-05, + "loss": 0.6894, + "step": 3641 + }, + { + "epoch": 0.2755854867390564, + "grad_norm": 2.7103264331817627, + "learning_rate": 1.5609781580334607e-05, + "loss": 0.8144, + "step": 3642 + }, + { + "epoch": 0.27566115546139003, + "grad_norm": 1.96636962890625, + "learning_rate": 1.560825280471049e-05, + "loss": 0.7921, + "step": 3643 + }, + { + "epoch": 0.27573682418372364, + "grad_norm": 2.5422213077545166, + "learning_rate": 1.5606723615256205e-05, + "loss": 0.8534, + "step": 3644 + }, + { + "epoch": 0.2758124929060573, + "grad_norm": 2.5183253288269043, + "learning_rate": 1.560519401206752e-05, + "loss": 0.6715, + "step": 3645 + }, + { + "epoch": 0.2758881616283909, + "grad_norm": 2.7416634559631348, + "learning_rate": 1.5603663995240223e-05, + "loss": 0.7949, + "step": 3646 + }, + { + "epoch": 0.2759638303507245, + "grad_norm": 2.2581331729888916, + "learning_rate": 1.5602133564870126e-05, + "loss": 0.7579, + "step": 3647 + }, + { + "epoch": 0.27603949907305814, + "grad_norm": 1.9897353649139404, + "learning_rate": 1.5600602721053073e-05, + "loss": 0.6848, + "step": 3648 + }, + { + "epoch": 0.27611516779539175, + "grad_norm": 1.6935006380081177, + "learning_rate": 1.5599071463884927e-05, + "loss": 0.7117, + "step": 3649 + }, + { + "epoch": 0.2761908365177254, + "grad_norm": 2.8586652278900146, + 
"learning_rate": 1.5597539793461584e-05, + "loss": 0.9308, + "step": 3650 + }, + { + "epoch": 0.276266505240059, + "grad_norm": 1.9398143291473389, + "learning_rate": 1.5596007709878957e-05, + "loss": 0.7423, + "step": 3651 + }, + { + "epoch": 0.27634217396239263, + "grad_norm": 3.198117733001709, + "learning_rate": 1.5594475213232995e-05, + "loss": 0.8414, + "step": 3652 + }, + { + "epoch": 0.27641784268472624, + "grad_norm": 2.2561442852020264, + "learning_rate": 1.5592942303619667e-05, + "loss": 0.6747, + "step": 3653 + }, + { + "epoch": 0.2764935114070599, + "grad_norm": 1.7593903541564941, + "learning_rate": 1.5591408981134966e-05, + "loss": 0.7299, + "step": 3654 + }, + { + "epoch": 0.2765691801293935, + "grad_norm": 2.0782854557037354, + "learning_rate": 1.5589875245874918e-05, + "loss": 0.6619, + "step": 3655 + }, + { + "epoch": 0.27664484885172713, + "grad_norm": 3.2505691051483154, + "learning_rate": 1.5588341097935565e-05, + "loss": 0.7653, + "step": 3656 + }, + { + "epoch": 0.27672051757406074, + "grad_norm": 2.4041876792907715, + "learning_rate": 1.5586806537412987e-05, + "loss": 0.7123, + "step": 3657 + }, + { + "epoch": 0.2767961862963944, + "grad_norm": 2.018611431121826, + "learning_rate": 1.5585271564403276e-05, + "loss": 0.6393, + "step": 3658 + }, + { + "epoch": 0.276871855018728, + "grad_norm": 2.0278730392456055, + "learning_rate": 1.558373617900256e-05, + "loss": 0.7515, + "step": 3659 + }, + { + "epoch": 0.27694752374106163, + "grad_norm": 2.383406400680542, + "learning_rate": 1.558220038130699e-05, + "loss": 0.7467, + "step": 3660 + }, + { + "epoch": 0.27702319246339524, + "grad_norm": 2.429568290710449, + "learning_rate": 1.5580664171412743e-05, + "loss": 0.7286, + "step": 3661 + }, + { + "epoch": 0.2770988611857289, + "grad_norm": 2.291583776473999, + "learning_rate": 1.5579127549416024e-05, + "loss": 0.7208, + "step": 3662 + }, + { + "epoch": 0.2771745299080625, + "grad_norm": 2.344414710998535, + "learning_rate": 1.5577590515413054e-05, + "loss": 0.7562, + "step": 3663 + }, + { + "epoch": 0.2772501986303961, + "grad_norm": 2.6295807361602783, + "learning_rate": 1.5576053069500093e-05, + "loss": 0.7099, + "step": 3664 + }, + { + "epoch": 0.27732586735272974, + "grad_norm": 2.575855016708374, + "learning_rate": 1.557451521177342e-05, + "loss": 0.8234, + "step": 3665 + }, + { + "epoch": 0.27740153607506335, + "grad_norm": 2.7682507038116455, + "learning_rate": 1.557297694232934e-05, + "loss": 0.8003, + "step": 3666 + }, + { + "epoch": 0.277477204797397, + "grad_norm": 2.386277675628662, + "learning_rate": 1.5571438261264184e-05, + "loss": 0.8584, + "step": 3667 + }, + { + "epoch": 0.2775528735197306, + "grad_norm": 2.271766424179077, + "learning_rate": 1.5569899168674308e-05, + "loss": 0.7653, + "step": 3668 + }, + { + "epoch": 0.27762854224206424, + "grad_norm": 2.1660287380218506, + "learning_rate": 1.55683596646561e-05, + "loss": 0.845, + "step": 3669 + }, + { + "epoch": 0.27770421096439785, + "grad_norm": 2.2329893112182617, + "learning_rate": 1.5566819749305962e-05, + "loss": 0.7212, + "step": 3670 + }, + { + "epoch": 0.2777798796867315, + "grad_norm": 2.5628223419189453, + "learning_rate": 1.5565279422720335e-05, + "loss": 0.919, + "step": 3671 + }, + { + "epoch": 0.2778555484090651, + "grad_norm": 2.2546982765197754, + "learning_rate": 1.556373868499567e-05, + "loss": 0.6941, + "step": 3672 + }, + { + "epoch": 0.27793121713139873, + "grad_norm": 2.420389413833618, + "learning_rate": 1.556219753622846e-05, + "loss": 0.7521, + "step": 3673 + }, + { + "epoch": 
0.27800688585373234, + "grad_norm": 1.9865230321884155, + "learning_rate": 1.556065597651522e-05, + "loss": 0.775, + "step": 3674 + }, + { + "epoch": 0.278082554576066, + "grad_norm": 2.0972037315368652, + "learning_rate": 1.5559114005952483e-05, + "loss": 0.72, + "step": 3675 + }, + { + "epoch": 0.2781582232983996, + "grad_norm": 3.4112987518310547, + "learning_rate": 1.555757162463681e-05, + "loss": 0.7276, + "step": 3676 + }, + { + "epoch": 0.27823389202073323, + "grad_norm": 2.490372896194458, + "learning_rate": 1.5556028832664793e-05, + "loss": 0.7068, + "step": 3677 + }, + { + "epoch": 0.27830956074306684, + "grad_norm": 2.6258339881896973, + "learning_rate": 1.5554485630133045e-05, + "loss": 0.801, + "step": 3678 + }, + { + "epoch": 0.27838522946540045, + "grad_norm": 1.8640928268432617, + "learning_rate": 1.5552942017138204e-05, + "loss": 0.8175, + "step": 3679 + }, + { + "epoch": 0.2784608981877341, + "grad_norm": 3.355138063430786, + "learning_rate": 1.5551397993776943e-05, + "loss": 0.7382, + "step": 3680 + }, + { + "epoch": 0.27853656691006773, + "grad_norm": 2.3146259784698486, + "learning_rate": 1.554985356014595e-05, + "loss": 0.6254, + "step": 3681 + }, + { + "epoch": 0.27861223563240134, + "grad_norm": 1.9555891752243042, + "learning_rate": 1.5548308716341944e-05, + "loss": 0.6862, + "step": 3682 + }, + { + "epoch": 0.27868790435473495, + "grad_norm": 2.361454725265503, + "learning_rate": 1.554676346246166e-05, + "loss": 0.6829, + "step": 3683 + }, + { + "epoch": 0.2787635730770686, + "grad_norm": 2.4104437828063965, + "learning_rate": 1.5545217798601878e-05, + "loss": 0.6886, + "step": 3684 + }, + { + "epoch": 0.2788392417994022, + "grad_norm": 2.2208261489868164, + "learning_rate": 1.5543671724859387e-05, + "loss": 0.6815, + "step": 3685 + }, + { + "epoch": 0.27891491052173584, + "grad_norm": 2.3924684524536133, + "learning_rate": 1.5542125241331006e-05, + "loss": 0.8088, + "step": 3686 + }, + { + "epoch": 0.27899057924406945, + "grad_norm": 2.2955551147460938, + "learning_rate": 1.5540578348113585e-05, + "loss": 0.8344, + "step": 3687 + }, + { + "epoch": 0.2790662479664031, + "grad_norm": 2.611711025238037, + "learning_rate": 1.553903104530399e-05, + "loss": 0.7999, + "step": 3688 + }, + { + "epoch": 0.2791419166887367, + "grad_norm": 2.359159231185913, + "learning_rate": 1.5537483332999123e-05, + "loss": 0.7241, + "step": 3689 + }, + { + "epoch": 0.27921758541107033, + "grad_norm": 2.494488000869751, + "learning_rate": 1.5535935211295906e-05, + "loss": 0.7985, + "step": 3690 + }, + { + "epoch": 0.27929325413340395, + "grad_norm": 2.4356842041015625, + "learning_rate": 1.5534386680291286e-05, + "loss": 0.8035, + "step": 3691 + }, + { + "epoch": 0.27936892285573756, + "grad_norm": 2.5868098735809326, + "learning_rate": 1.5532837740082237e-05, + "loss": 0.9104, + "step": 3692 + }, + { + "epoch": 0.2794445915780712, + "grad_norm": 2.140434741973877, + "learning_rate": 1.5531288390765757e-05, + "loss": 0.7656, + "step": 3693 + }, + { + "epoch": 0.27952026030040483, + "grad_norm": 2.476284980773926, + "learning_rate": 1.5529738632438873e-05, + "loss": 0.7472, + "step": 3694 + }, + { + "epoch": 0.27959592902273844, + "grad_norm": 2.0590710639953613, + "learning_rate": 1.552818846519864e-05, + "loss": 0.7425, + "step": 3695 + }, + { + "epoch": 0.27967159774507205, + "grad_norm": 1.7913882732391357, + "learning_rate": 1.552663788914213e-05, + "loss": 0.7771, + "step": 3696 + }, + { + "epoch": 0.2797472664674057, + "grad_norm": 2.3980860710144043, + "learning_rate": 
1.552508690436644e-05, + "loss": 0.8346, + "step": 3697 + }, + { + "epoch": 0.27982293518973933, + "grad_norm": 2.179081678390503, + "learning_rate": 1.552353551096871e-05, + "loss": 0.7082, + "step": 3698 + }, + { + "epoch": 0.27989860391207294, + "grad_norm": 2.324483871459961, + "learning_rate": 1.5521983709046084e-05, + "loss": 0.6597, + "step": 3699 + }, + { + "epoch": 0.27997427263440655, + "grad_norm": 3.7304763793945312, + "learning_rate": 1.5520431498695743e-05, + "loss": 0.6375, + "step": 3700 + }, + { + "epoch": 0.2800499413567402, + "grad_norm": 2.4392082691192627, + "learning_rate": 1.5518878880014894e-05, + "loss": 0.6816, + "step": 3701 + }, + { + "epoch": 0.28012561007907383, + "grad_norm": 2.4349966049194336, + "learning_rate": 1.5517325853100762e-05, + "loss": 0.79, + "step": 3702 + }, + { + "epoch": 0.28020127880140744, + "grad_norm": 2.9991888999938965, + "learning_rate": 1.5515772418050605e-05, + "loss": 0.6722, + "step": 3703 + }, + { + "epoch": 0.28027694752374105, + "grad_norm": 2.6778488159179688, + "learning_rate": 1.5514218574961706e-05, + "loss": 0.6987, + "step": 3704 + }, + { + "epoch": 0.28035261624607466, + "grad_norm": 2.7935636043548584, + "learning_rate": 1.5512664323931372e-05, + "loss": 0.7712, + "step": 3705 + }, + { + "epoch": 0.2804282849684083, + "grad_norm": 2.4512863159179688, + "learning_rate": 1.5511109665056934e-05, + "loss": 0.7726, + "step": 3706 + }, + { + "epoch": 0.28050395369074194, + "grad_norm": 2.603178024291992, + "learning_rate": 1.5509554598435745e-05, + "loss": 0.6195, + "step": 3707 + }, + { + "epoch": 0.28057962241307555, + "grad_norm": 2.1387760639190674, + "learning_rate": 1.5507999124165196e-05, + "loss": 0.6634, + "step": 3708 + }, + { + "epoch": 0.28065529113540916, + "grad_norm": 2.0796477794647217, + "learning_rate": 1.550644324234269e-05, + "loss": 0.6855, + "step": 3709 + }, + { + "epoch": 0.2807309598577428, + "grad_norm": 2.175611734390259, + "learning_rate": 1.5504886953065666e-05, + "loss": 0.7012, + "step": 3710 + }, + { + "epoch": 0.28080662858007643, + "grad_norm": 2.2386960983276367, + "learning_rate": 1.550333025643158e-05, + "loss": 0.7295, + "step": 3711 + }, + { + "epoch": 0.28088229730241004, + "grad_norm": 4.274651527404785, + "learning_rate": 1.5501773152537922e-05, + "loss": 0.7778, + "step": 3712 + }, + { + "epoch": 0.28095796602474365, + "grad_norm": 2.236834764480591, + "learning_rate": 1.5500215641482197e-05, + "loss": 0.8035, + "step": 3713 + }, + { + "epoch": 0.2810336347470773, + "grad_norm": 1.7886241674423218, + "learning_rate": 1.5498657723361946e-05, + "loss": 0.9152, + "step": 3714 + }, + { + "epoch": 0.28110930346941093, + "grad_norm": 2.0251529216766357, + "learning_rate": 1.5497099398274727e-05, + "loss": 0.7473, + "step": 3715 + }, + { + "epoch": 0.28118497219174454, + "grad_norm": 2.066375970840454, + "learning_rate": 1.5495540666318133e-05, + "loss": 0.806, + "step": 3716 + }, + { + "epoch": 0.28126064091407815, + "grad_norm": 2.595116138458252, + "learning_rate": 1.5493981527589768e-05, + "loss": 0.9727, + "step": 3717 + }, + { + "epoch": 0.28133630963641176, + "grad_norm": 2.0963737964630127, + "learning_rate": 1.549242198218728e-05, + "loss": 0.7616, + "step": 3718 + }, + { + "epoch": 0.28141197835874543, + "grad_norm": 2.7268869876861572, + "learning_rate": 1.5490862030208326e-05, + "loss": 0.75, + "step": 3719 + }, + { + "epoch": 0.28148764708107904, + "grad_norm": 2.4957454204559326, + "learning_rate": 1.5489301671750602e-05, + "loss": 0.7749, + "step": 3720 + }, + { + "epoch": 
0.28156331580341265, + "grad_norm": 2.260963201522827, + "learning_rate": 1.5487740906911814e-05, + "loss": 0.7527, + "step": 3721 + }, + { + "epoch": 0.28163898452574626, + "grad_norm": 2.0017857551574707, + "learning_rate": 1.5486179735789708e-05, + "loss": 0.5984, + "step": 3722 + }, + { + "epoch": 0.2817146532480799, + "grad_norm": 1.8426270484924316, + "learning_rate": 1.548461815848205e-05, + "loss": 0.7738, + "step": 3723 + }, + { + "epoch": 0.28179032197041354, + "grad_norm": 2.3114354610443115, + "learning_rate": 1.5483056175086624e-05, + "loss": 0.6493, + "step": 3724 + }, + { + "epoch": 0.28186599069274715, + "grad_norm": 2.0738372802734375, + "learning_rate": 1.5481493785701255e-05, + "loss": 0.7372, + "step": 3725 + }, + { + "epoch": 0.28194165941508076, + "grad_norm": 2.051266670227051, + "learning_rate": 1.547993099042378e-05, + "loss": 0.6902, + "step": 3726 + }, + { + "epoch": 0.2820173281374144, + "grad_norm": 2.181405544281006, + "learning_rate": 1.547836778935207e-05, + "loss": 0.7094, + "step": 3727 + }, + { + "epoch": 0.28209299685974804, + "grad_norm": 2.6144001483917236, + "learning_rate": 1.5476804182584015e-05, + "loss": 0.7553, + "step": 3728 + }, + { + "epoch": 0.28216866558208165, + "grad_norm": 2.2755348682403564, + "learning_rate": 1.5475240170217532e-05, + "loss": 0.7335, + "step": 3729 + }, + { + "epoch": 0.28224433430441526, + "grad_norm": 2.2966911792755127, + "learning_rate": 1.547367575235057e-05, + "loss": 0.7893, + "step": 3730 + }, + { + "epoch": 0.28232000302674887, + "grad_norm": 2.246685266494751, + "learning_rate": 1.547211092908109e-05, + "loss": 0.7984, + "step": 3731 + }, + { + "epoch": 0.28239567174908253, + "grad_norm": 2.2465322017669678, + "learning_rate": 1.547054570050709e-05, + "loss": 0.8449, + "step": 3732 + }, + { + "epoch": 0.28247134047141614, + "grad_norm": 2.385525703430176, + "learning_rate": 1.546898006672659e-05, + "loss": 0.7544, + "step": 3733 + }, + { + "epoch": 0.28254700919374975, + "grad_norm": 2.5891146659851074, + "learning_rate": 1.5467414027837638e-05, + "loss": 0.7069, + "step": 3734 + }, + { + "epoch": 0.28262267791608336, + "grad_norm": 3.729670763015747, + "learning_rate": 1.54658475839383e-05, + "loss": 0.8181, + "step": 3735 + }, + { + "epoch": 0.28269834663841703, + "grad_norm": 2.792766571044922, + "learning_rate": 1.546428073512667e-05, + "loss": 0.7421, + "step": 3736 + }, + { + "epoch": 0.28277401536075064, + "grad_norm": 1.9378312826156616, + "learning_rate": 1.5462713481500875e-05, + "loss": 0.4965, + "step": 3737 + }, + { + "epoch": 0.28284968408308425, + "grad_norm": 2.495208978652954, + "learning_rate": 1.5461145823159063e-05, + "loss": 0.6608, + "step": 3738 + }, + { + "epoch": 0.28292535280541786, + "grad_norm": 2.5819809436798096, + "learning_rate": 1.5459577760199396e-05, + "loss": 0.6334, + "step": 3739 + }, + { + "epoch": 0.28300102152775153, + "grad_norm": 2.375948667526245, + "learning_rate": 1.5458009292720077e-05, + "loss": 0.7322, + "step": 3740 + }, + { + "epoch": 0.28307669025008514, + "grad_norm": 2.2175607681274414, + "learning_rate": 1.545644042081933e-05, + "loss": 0.9031, + "step": 3741 + }, + { + "epoch": 0.28315235897241875, + "grad_norm": 2.4103119373321533, + "learning_rate": 1.54548711445954e-05, + "loss": 0.9138, + "step": 3742 + }, + { + "epoch": 0.28322802769475236, + "grad_norm": 3.062034845352173, + "learning_rate": 1.5453301464146563e-05, + "loss": 0.8161, + "step": 3743 + }, + { + "epoch": 0.28330369641708597, + "grad_norm": 2.3038175106048584, + "learning_rate": 
1.5451731379571115e-05, + "loss": 0.7118, + "step": 3744 + }, + { + "epoch": 0.28337936513941964, + "grad_norm": 3.113598108291626, + "learning_rate": 1.545016089096738e-05, + "loss": 0.772, + "step": 3745 + }, + { + "epoch": 0.28345503386175325, + "grad_norm": 2.0970757007598877, + "learning_rate": 1.544858999843371e-05, + "loss": 0.8035, + "step": 3746 + }, + { + "epoch": 0.28353070258408686, + "grad_norm": 2.837749481201172, + "learning_rate": 1.5447018702068475e-05, + "loss": 0.8556, + "step": 3747 + }, + { + "epoch": 0.28360637130642047, + "grad_norm": 2.4374568462371826, + "learning_rate": 1.5445447001970078e-05, + "loss": 0.6574, + "step": 3748 + }, + { + "epoch": 0.28368204002875413, + "grad_norm": 2.6389224529266357, + "learning_rate": 1.544387489823694e-05, + "loss": 0.6553, + "step": 3749 + }, + { + "epoch": 0.28375770875108774, + "grad_norm": 2.4554734230041504, + "learning_rate": 1.5442302390967517e-05, + "loss": 0.7377, + "step": 3750 + }, + { + "epoch": 0.28383337747342136, + "grad_norm": 2.6802544593811035, + "learning_rate": 1.544072948026028e-05, + "loss": 0.6677, + "step": 3751 + }, + { + "epoch": 0.28390904619575497, + "grad_norm": 2.261831283569336, + "learning_rate": 1.543915616621373e-05, + "loss": 0.6648, + "step": 3752 + }, + { + "epoch": 0.28398471491808863, + "grad_norm": 2.759446144104004, + "learning_rate": 1.5437582448926395e-05, + "loss": 0.5712, + "step": 3753 + }, + { + "epoch": 0.28406038364042224, + "grad_norm": 1.8836723566055298, + "learning_rate": 1.5436008328496827e-05, + "loss": 0.7159, + "step": 3754 + }, + { + "epoch": 0.28413605236275585, + "grad_norm": 2.804567813873291, + "learning_rate": 1.54344338050236e-05, + "loss": 0.7679, + "step": 3755 + }, + { + "epoch": 0.28421172108508946, + "grad_norm": 2.44331431388855, + "learning_rate": 1.5432858878605315e-05, + "loss": 0.8358, + "step": 3756 + }, + { + "epoch": 0.2842873898074231, + "grad_norm": 1.8980436325073242, + "learning_rate": 1.54312835493406e-05, + "loss": 0.6766, + "step": 3757 + }, + { + "epoch": 0.28436305852975674, + "grad_norm": 3.5173916816711426, + "learning_rate": 1.542970781732811e-05, + "loss": 0.8005, + "step": 3758 + }, + { + "epoch": 0.28443872725209035, + "grad_norm": 2.506789207458496, + "learning_rate": 1.542813168266652e-05, + "loss": 0.7122, + "step": 3759 + }, + { + "epoch": 0.28451439597442396, + "grad_norm": 2.8228511810302734, + "learning_rate": 1.5426555145454533e-05, + "loss": 0.7872, + "step": 3760 + }, + { + "epoch": 0.28459006469675757, + "grad_norm": 2.5148704051971436, + "learning_rate": 1.5424978205790875e-05, + "loss": 0.8348, + "step": 3761 + }, + { + "epoch": 0.28466573341909124, + "grad_norm": 2.349149227142334, + "learning_rate": 1.54234008637743e-05, + "loss": 0.9365, + "step": 3762 + }, + { + "epoch": 0.28474140214142485, + "grad_norm": 2.6498892307281494, + "learning_rate": 1.542182311950359e-05, + "loss": 0.8949, + "step": 3763 + }, + { + "epoch": 0.28481707086375846, + "grad_norm": 2.6310479640960693, + "learning_rate": 1.5420244973077547e-05, + "loss": 0.793, + "step": 3764 + }, + { + "epoch": 0.28489273958609207, + "grad_norm": 2.2707366943359375, + "learning_rate": 1.5418666424595e-05, + "loss": 0.7804, + "step": 3765 + }, + { + "epoch": 0.28496840830842574, + "grad_norm": 2.279681444168091, + "learning_rate": 1.5417087474154796e-05, + "loss": 0.7311, + "step": 3766 + }, + { + "epoch": 0.28504407703075935, + "grad_norm": 2.6598429679870605, + "learning_rate": 1.541550812185582e-05, + "loss": 0.727, + "step": 3767 + }, + { + "epoch": 
0.28511974575309296, + "grad_norm": 2.0504753589630127, + "learning_rate": 1.5413928367796974e-05, + "loss": 0.7835, + "step": 3768 + }, + { + "epoch": 0.28519541447542657, + "grad_norm": 2.460989236831665, + "learning_rate": 1.541234821207719e-05, + "loss": 0.7642, + "step": 3769 + }, + { + "epoch": 0.28527108319776023, + "grad_norm": 2.5266435146331787, + "learning_rate": 1.5410767654795423e-05, + "loss": 0.6354, + "step": 3770 + }, + { + "epoch": 0.28534675192009384, + "grad_norm": 2.2240378856658936, + "learning_rate": 1.540918669605065e-05, + "loss": 0.8202, + "step": 3771 + }, + { + "epoch": 0.28542242064242745, + "grad_norm": 2.3329272270202637, + "learning_rate": 1.540760533594187e-05, + "loss": 0.6528, + "step": 3772 + }, + { + "epoch": 0.28549808936476107, + "grad_norm": 2.2796730995178223, + "learning_rate": 1.5406023574568124e-05, + "loss": 0.6997, + "step": 3773 + }, + { + "epoch": 0.2855737580870947, + "grad_norm": 2.3111414909362793, + "learning_rate": 1.5404441412028464e-05, + "loss": 0.7873, + "step": 3774 + }, + { + "epoch": 0.28564942680942834, + "grad_norm": 2.165189266204834, + "learning_rate": 1.5402858848421962e-05, + "loss": 0.7418, + "step": 3775 + }, + { + "epoch": 0.28572509553176195, + "grad_norm": 3.3214333057403564, + "learning_rate": 1.5401275883847736e-05, + "loss": 0.7305, + "step": 3776 + }, + { + "epoch": 0.28580076425409556, + "grad_norm": 2.159792900085449, + "learning_rate": 1.539969251840491e-05, + "loss": 0.6443, + "step": 3777 + }, + { + "epoch": 0.2858764329764292, + "grad_norm": 2.1362006664276123, + "learning_rate": 1.5398108752192636e-05, + "loss": 0.9493, + "step": 3778 + }, + { + "epoch": 0.28595210169876284, + "grad_norm": 2.815425157546997, + "learning_rate": 1.53965245853101e-05, + "loss": 0.7395, + "step": 3779 + }, + { + "epoch": 0.28602777042109645, + "grad_norm": 2.310188055038452, + "learning_rate": 1.53949400178565e-05, + "loss": 0.8445, + "step": 3780 + }, + { + "epoch": 0.28610343914343006, + "grad_norm": 2.610882043838501, + "learning_rate": 1.539335504993108e-05, + "loss": 0.6956, + "step": 3781 + }, + { + "epoch": 0.28617910786576367, + "grad_norm": 1.9927890300750732, + "learning_rate": 1.5391769681633084e-05, + "loss": 0.761, + "step": 3782 + }, + { + "epoch": 0.28625477658809734, + "grad_norm": 2.202202558517456, + "learning_rate": 1.53901839130618e-05, + "loss": 0.8485, + "step": 3783 + }, + { + "epoch": 0.28633044531043095, + "grad_norm": 3.0478105545043945, + "learning_rate": 1.5388597744316527e-05, + "loss": 0.7307, + "step": 3784 + }, + { + "epoch": 0.28640611403276456, + "grad_norm": 2.1386568546295166, + "learning_rate": 1.5387011175496604e-05, + "loss": 0.7544, + "step": 3785 + }, + { + "epoch": 0.28648178275509817, + "grad_norm": 2.2375717163085938, + "learning_rate": 1.538542420670138e-05, + "loss": 0.9585, + "step": 3786 + }, + { + "epoch": 0.2865574514774318, + "grad_norm": 2.476581573486328, + "learning_rate": 1.5383836838030242e-05, + "loss": 0.6515, + "step": 3787 + }, + { + "epoch": 0.28663312019976545, + "grad_norm": 2.101915121078491, + "learning_rate": 1.538224906958259e-05, + "loss": 0.7691, + "step": 3788 + }, + { + "epoch": 0.28670878892209906, + "grad_norm": 2.667830467224121, + "learning_rate": 1.538066090145786e-05, + "loss": 0.7572, + "step": 3789 + }, + { + "epoch": 0.28678445764443267, + "grad_norm": 2.5487112998962402, + "learning_rate": 1.5379072333755508e-05, + "loss": 0.5705, + "step": 3790 + }, + { + "epoch": 0.2868601263667663, + "grad_norm": 2.0503430366516113, + "learning_rate": 
1.5377483366575012e-05, + "loss": 0.8871, + "step": 3791 + }, + { + "epoch": 0.28693579508909994, + "grad_norm": 1.8013478517532349, + "learning_rate": 1.5375894000015883e-05, + "loss": 0.7667, + "step": 3792 + }, + { + "epoch": 0.28701146381143355, + "grad_norm": 11.85326862335205, + "learning_rate": 1.5374304234177648e-05, + "loss": 0.9177, + "step": 3793 + }, + { + "epoch": 0.28708713253376716, + "grad_norm": 2.1834912300109863, + "learning_rate": 1.5372714069159865e-05, + "loss": 0.7887, + "step": 3794 + }, + { + "epoch": 0.2871628012561008, + "grad_norm": 2.3547914028167725, + "learning_rate": 1.5371123505062116e-05, + "loss": 0.5846, + "step": 3795 + }, + { + "epoch": 0.28723846997843444, + "grad_norm": 2.845733404159546, + "learning_rate": 1.5369532541984003e-05, + "loss": 0.6585, + "step": 3796 + }, + { + "epoch": 0.28731413870076805, + "grad_norm": 2.8735568523406982, + "learning_rate": 1.5367941180025162e-05, + "loss": 0.7482, + "step": 3797 + }, + { + "epoch": 0.28738980742310166, + "grad_norm": 2.0185205936431885, + "learning_rate": 1.536634941928525e-05, + "loss": 0.7186, + "step": 3798 + }, + { + "epoch": 0.2874654761454353, + "grad_norm": 2.4860315322875977, + "learning_rate": 1.5364757259863943e-05, + "loss": 0.6788, + "step": 3799 + }, + { + "epoch": 0.2875411448677689, + "grad_norm": 1.9986555576324463, + "learning_rate": 1.5363164701860953e-05, + "loss": 0.7885, + "step": 3800 + }, + { + "epoch": 0.28761681359010255, + "grad_norm": 2.455195665359497, + "learning_rate": 1.536157174537601e-05, + "loss": 0.7411, + "step": 3801 + }, + { + "epoch": 0.28769248231243616, + "grad_norm": 2.079657793045044, + "learning_rate": 1.5359978390508865e-05, + "loss": 0.8515, + "step": 3802 + }, + { + "epoch": 0.28776815103476977, + "grad_norm": 2.4668712615966797, + "learning_rate": 1.5358384637359304e-05, + "loss": 0.6981, + "step": 3803 + }, + { + "epoch": 0.2878438197571034, + "grad_norm": 2.006376028060913, + "learning_rate": 1.535679048602713e-05, + "loss": 0.6889, + "step": 3804 + }, + { + "epoch": 0.28791948847943705, + "grad_norm": 2.464771032333374, + "learning_rate": 1.5355195936612178e-05, + "loss": 0.7923, + "step": 3805 + }, + { + "epoch": 0.28799515720177066, + "grad_norm": 2.3989076614379883, + "learning_rate": 1.53536009892143e-05, + "loss": 0.7724, + "step": 3806 + }, + { + "epoch": 0.28807082592410427, + "grad_norm": 2.19671893119812, + "learning_rate": 1.5352005643933378e-05, + "loss": 0.7459, + "step": 3807 + }, + { + "epoch": 0.2881464946464379, + "grad_norm": 2.1972532272338867, + "learning_rate": 1.5350409900869317e-05, + "loss": 0.6167, + "step": 3808 + }, + { + "epoch": 0.28822216336877154, + "grad_norm": 4.029229164123535, + "learning_rate": 1.534881376012205e-05, + "loss": 0.7394, + "step": 3809 + }, + { + "epoch": 0.28829783209110516, + "grad_norm": 2.5666446685791016, + "learning_rate": 1.534721722179153e-05, + "loss": 0.7416, + "step": 3810 + }, + { + "epoch": 0.28837350081343877, + "grad_norm": 2.153628349304199, + "learning_rate": 1.534562028597774e-05, + "loss": 0.6953, + "step": 3811 + }, + { + "epoch": 0.2884491695357724, + "grad_norm": 2.0070858001708984, + "learning_rate": 1.5344022952780678e-05, + "loss": 0.7555, + "step": 3812 + }, + { + "epoch": 0.288524838258106, + "grad_norm": 2.8213300704956055, + "learning_rate": 1.5342425222300384e-05, + "loss": 0.7651, + "step": 3813 + }, + { + "epoch": 0.28860050698043965, + "grad_norm": 2.5318684577941895, + "learning_rate": 1.534082709463691e-05, + "loss": 0.8429, + "step": 3814 + }, + { + "epoch": 
0.28867617570277326, + "grad_norm": 2.2411584854125977, + "learning_rate": 1.533922856989033e-05, + "loss": 1.041, + "step": 3815 + }, + { + "epoch": 0.2887518444251069, + "grad_norm": 2.1817727088928223, + "learning_rate": 1.5337629648160754e-05, + "loss": 0.6029, + "step": 3816 + }, + { + "epoch": 0.2888275131474405, + "grad_norm": 1.9107509851455688, + "learning_rate": 1.5336030329548315e-05, + "loss": 0.7631, + "step": 3817 + }, + { + "epoch": 0.28890318186977415, + "grad_norm": 2.7551729679107666, + "learning_rate": 1.533443061415316e-05, + "loss": 0.6992, + "step": 3818 + }, + { + "epoch": 0.28897885059210776, + "grad_norm": 3.3587355613708496, + "learning_rate": 1.533283050207547e-05, + "loss": 0.5839, + "step": 3819 + }, + { + "epoch": 0.28905451931444137, + "grad_norm": 2.6251888275146484, + "learning_rate": 1.533122999341546e-05, + "loss": 0.8546, + "step": 3820 + }, + { + "epoch": 0.289130188036775, + "grad_norm": 3.0454955101013184, + "learning_rate": 1.532962908827334e-05, + "loss": 0.8615, + "step": 3821 + }, + { + "epoch": 0.28920585675910865, + "grad_norm": 2.355125904083252, + "learning_rate": 1.532802778674938e-05, + "loss": 0.7968, + "step": 3822 + }, + { + "epoch": 0.28928152548144226, + "grad_norm": 2.498617649078369, + "learning_rate": 1.5326426088943854e-05, + "loss": 0.757, + "step": 3823 + }, + { + "epoch": 0.28935719420377587, + "grad_norm": 2.536444664001465, + "learning_rate": 1.532482399495706e-05, + "loss": 0.9326, + "step": 3824 + }, + { + "epoch": 0.2894328629261095, + "grad_norm": 2.7442028522491455, + "learning_rate": 1.532322150488933e-05, + "loss": 0.8168, + "step": 3825 + }, + { + "epoch": 0.2895085316484431, + "grad_norm": 2.647066116333008, + "learning_rate": 1.532161861884102e-05, + "loss": 0.7789, + "step": 3826 + }, + { + "epoch": 0.28958420037077676, + "grad_norm": 2.5589473247528076, + "learning_rate": 1.5320015336912505e-05, + "loss": 0.6668, + "step": 3827 + }, + { + "epoch": 0.28965986909311037, + "grad_norm": 1.8725637197494507, + "learning_rate": 1.531841165920419e-05, + "loss": 0.8211, + "step": 3828 + }, + { + "epoch": 0.289735537815444, + "grad_norm": 2.117074966430664, + "learning_rate": 1.53168075858165e-05, + "loss": 0.8106, + "step": 3829 + }, + { + "epoch": 0.2898112065377776, + "grad_norm": 2.3661158084869385, + "learning_rate": 1.5315203116849883e-05, + "loss": 0.6988, + "step": 3830 + }, + { + "epoch": 0.28988687526011125, + "grad_norm": 2.031886100769043, + "learning_rate": 1.5313598252404824e-05, + "loss": 0.7676, + "step": 3831 + }, + { + "epoch": 0.28996254398244486, + "grad_norm": 3.7571802139282227, + "learning_rate": 1.5311992992581824e-05, + "loss": 0.8097, + "step": 3832 + }, + { + "epoch": 0.2900382127047785, + "grad_norm": 2.4319801330566406, + "learning_rate": 1.5310387337481405e-05, + "loss": 0.7101, + "step": 3833 + }, + { + "epoch": 0.2901138814271121, + "grad_norm": 2.1088521480560303, + "learning_rate": 1.530878128720412e-05, + "loss": 0.9235, + "step": 3834 + }, + { + "epoch": 0.29018955014944575, + "grad_norm": 2.3383238315582275, + "learning_rate": 1.5307174841850546e-05, + "loss": 0.8772, + "step": 3835 + }, + { + "epoch": 0.29026521887177936, + "grad_norm": 2.2183845043182373, + "learning_rate": 1.530556800152129e-05, + "loss": 0.8155, + "step": 3836 + }, + { + "epoch": 0.290340887594113, + "grad_norm": 2.790646553039551, + "learning_rate": 1.530396076631696e-05, + "loss": 0.7333, + "step": 3837 + }, + { + "epoch": 0.2904165563164466, + "grad_norm": 1.850390076637268, + "learning_rate": 
1.5302353136338226e-05, + "loss": 0.8824, + "step": 3838 + }, + { + "epoch": 0.2904922250387802, + "grad_norm": 2.336447238922119, + "learning_rate": 1.530074511168575e-05, + "loss": 0.6944, + "step": 3839 + }, + { + "epoch": 0.29056789376111386, + "grad_norm": 2.158458709716797, + "learning_rate": 1.5299136692460238e-05, + "loss": 0.8595, + "step": 3840 + }, + { + "epoch": 0.29064356248344747, + "grad_norm": 2.0733799934387207, + "learning_rate": 1.5297527878762413e-05, + "loss": 0.6355, + "step": 3841 + }, + { + "epoch": 0.2907192312057811, + "grad_norm": 2.5459089279174805, + "learning_rate": 1.529591867069302e-05, + "loss": 0.7965, + "step": 3842 + }, + { + "epoch": 0.2907948999281147, + "grad_norm": 2.3149020671844482, + "learning_rate": 1.529430906835284e-05, + "loss": 0.7332, + "step": 3843 + }, + { + "epoch": 0.29087056865044836, + "grad_norm": 2.2165815830230713, + "learning_rate": 1.5292699071842665e-05, + "loss": 0.6736, + "step": 3844 + }, + { + "epoch": 0.29094623737278197, + "grad_norm": 2.079904794692993, + "learning_rate": 1.5291088681263325e-05, + "loss": 0.6289, + "step": 3845 + }, + { + "epoch": 0.2910219060951156, + "grad_norm": 2.122159957885742, + "learning_rate": 1.5289477896715662e-05, + "loss": 0.8419, + "step": 3846 + }, + { + "epoch": 0.2910975748174492, + "grad_norm": 1.8512182235717773, + "learning_rate": 1.5287866718300548e-05, + "loss": 0.6351, + "step": 3847 + }, + { + "epoch": 0.29117324353978286, + "grad_norm": 1.8471953868865967, + "learning_rate": 1.5286255146118886e-05, + "loss": 0.8519, + "step": 3848 + }, + { + "epoch": 0.29124891226211647, + "grad_norm": 1.9632765054702759, + "learning_rate": 1.5284643180271593e-05, + "loss": 0.8273, + "step": 3849 + }, + { + "epoch": 0.2913245809844501, + "grad_norm": 2.095710515975952, + "learning_rate": 1.5283030820859614e-05, + "loss": 0.7674, + "step": 3850 + }, + { + "epoch": 0.2914002497067837, + "grad_norm": 2.047574520111084, + "learning_rate": 1.528141806798393e-05, + "loss": 0.7336, + "step": 3851 + }, + { + "epoch": 0.2914759184291173, + "grad_norm": 3.1790666580200195, + "learning_rate": 1.5279804921745526e-05, + "loss": 0.6697, + "step": 3852 + }, + { + "epoch": 0.29155158715145096, + "grad_norm": 1.7570991516113281, + "learning_rate": 1.5278191382245424e-05, + "loss": 0.7305, + "step": 3853 + }, + { + "epoch": 0.2916272558737846, + "grad_norm": 6.570018291473389, + "learning_rate": 1.5276577449584677e-05, + "loss": 0.8719, + "step": 3854 + }, + { + "epoch": 0.2917029245961182, + "grad_norm": 2.8734898567199707, + "learning_rate": 1.5274963123864346e-05, + "loss": 0.7329, + "step": 3855 + }, + { + "epoch": 0.2917785933184518, + "grad_norm": 2.37735915184021, + "learning_rate": 1.527334840518553e-05, + "loss": 0.7074, + "step": 3856 + }, + { + "epoch": 0.29185426204078546, + "grad_norm": 2.6533119678497314, + "learning_rate": 1.5271733293649347e-05, + "loss": 0.7864, + "step": 3857 + }, + { + "epoch": 0.29192993076311907, + "grad_norm": 2.2681920528411865, + "learning_rate": 1.5270117789356937e-05, + "loss": 0.9383, + "step": 3858 + }, + { + "epoch": 0.2920055994854527, + "grad_norm": 2.4914307594299316, + "learning_rate": 1.5268501892409472e-05, + "loss": 0.7368, + "step": 3859 + }, + { + "epoch": 0.2920812682077863, + "grad_norm": 2.2388360500335693, + "learning_rate": 1.5266885602908145e-05, + "loss": 0.8739, + "step": 3860 + }, + { + "epoch": 0.29215693693011996, + "grad_norm": 1.967874526977539, + "learning_rate": 1.526526892095417e-05, + "loss": 0.6916, + "step": 3861 + }, + { + "epoch": 
0.29223260565245357, + "grad_norm": 2.43058443069458, + "learning_rate": 1.5263651846648794e-05, + "loss": 0.8727, + "step": 3862 + }, + { + "epoch": 0.2923082743747872, + "grad_norm": 2.0961925983428955, + "learning_rate": 1.5262034380093276e-05, + "loss": 0.8726, + "step": 3863 + }, + { + "epoch": 0.2923839430971208, + "grad_norm": 2.120222330093384, + "learning_rate": 1.5260416521388916e-05, + "loss": 0.6642, + "step": 3864 + }, + { + "epoch": 0.2924596118194544, + "grad_norm": 2.0524697303771973, + "learning_rate": 1.525879827063702e-05, + "loss": 0.6988, + "step": 3865 + }, + { + "epoch": 0.29253528054178807, + "grad_norm": 2.791609525680542, + "learning_rate": 1.5257179627938935e-05, + "loss": 0.8412, + "step": 3866 + }, + { + "epoch": 0.2926109492641217, + "grad_norm": 2.2559304237365723, + "learning_rate": 1.5255560593396025e-05, + "loss": 1.0062, + "step": 3867 + }, + { + "epoch": 0.2926866179864553, + "grad_norm": 2.290409803390503, + "learning_rate": 1.5253941167109677e-05, + "loss": 0.6786, + "step": 3868 + }, + { + "epoch": 0.2927622867087889, + "grad_norm": 3.779801368713379, + "learning_rate": 1.5252321349181305e-05, + "loss": 0.8315, + "step": 3869 + }, + { + "epoch": 0.29283795543112257, + "grad_norm": 2.773968458175659, + "learning_rate": 1.5250701139712347e-05, + "loss": 0.8372, + "step": 3870 + }, + { + "epoch": 0.2929136241534562, + "grad_norm": 2.577420473098755, + "learning_rate": 1.5249080538804266e-05, + "loss": 0.7793, + "step": 3871 + }, + { + "epoch": 0.2929892928757898, + "grad_norm": 2.174755573272705, + "learning_rate": 1.5247459546558554e-05, + "loss": 0.6802, + "step": 3872 + }, + { + "epoch": 0.2930649615981234, + "grad_norm": 2.247739315032959, + "learning_rate": 1.5245838163076712e-05, + "loss": 0.8264, + "step": 3873 + }, + { + "epoch": 0.29314063032045706, + "grad_norm": 2.067796468734741, + "learning_rate": 1.5244216388460285e-05, + "loss": 0.8444, + "step": 3874 + }, + { + "epoch": 0.2932162990427907, + "grad_norm": 2.176210880279541, + "learning_rate": 1.5242594222810835e-05, + "loss": 0.6093, + "step": 3875 + }, + { + "epoch": 0.2932919677651243, + "grad_norm": 2.150022029876709, + "learning_rate": 1.5240971666229939e-05, + "loss": 0.724, + "step": 3876 + }, + { + "epoch": 0.2933676364874579, + "grad_norm": 2.2023675441741943, + "learning_rate": 1.5239348718819215e-05, + "loss": 0.8256, + "step": 3877 + }, + { + "epoch": 0.2934433052097915, + "grad_norm": 2.1717021465301514, + "learning_rate": 1.5237725380680291e-05, + "loss": 0.8227, + "step": 3878 + }, + { + "epoch": 0.29351897393212517, + "grad_norm": 2.7426464557647705, + "learning_rate": 1.523610165191483e-05, + "loss": 0.8648, + "step": 3879 + }, + { + "epoch": 0.2935946426544588, + "grad_norm": 1.9793200492858887, + "learning_rate": 1.5234477532624512e-05, + "loss": 0.5645, + "step": 3880 + }, + { + "epoch": 0.2936703113767924, + "grad_norm": 2.058037281036377, + "learning_rate": 1.5232853022911048e-05, + "loss": 0.7363, + "step": 3881 + }, + { + "epoch": 0.293745980099126, + "grad_norm": 2.488274097442627, + "learning_rate": 1.5231228122876167e-05, + "loss": 0.8012, + "step": 3882 + }, + { + "epoch": 0.29382164882145967, + "grad_norm": 2.5905423164367676, + "learning_rate": 1.5229602832621628e-05, + "loss": 0.7683, + "step": 3883 + }, + { + "epoch": 0.2938973175437933, + "grad_norm": 2.189631938934326, + "learning_rate": 1.5227977152249211e-05, + "loss": 0.6833, + "step": 3884 + }, + { + "epoch": 0.2939729862661269, + "grad_norm": 3.1091902256011963, + "learning_rate": 
1.522635108186072e-05, + "loss": 0.6846, + "step": 3885 + }, + { + "epoch": 0.2940486549884605, + "grad_norm": 2.442972421646118, + "learning_rate": 1.5224724621557985e-05, + "loss": 0.7811, + "step": 3886 + }, + { + "epoch": 0.29412432371079417, + "grad_norm": 1.9295117855072021, + "learning_rate": 1.5223097771442863e-05, + "loss": 0.7857, + "step": 3887 + }, + { + "epoch": 0.2941999924331278, + "grad_norm": 2.101855516433716, + "learning_rate": 1.522147053161723e-05, + "loss": 0.7761, + "step": 3888 + }, + { + "epoch": 0.2942756611554614, + "grad_norm": 2.1722593307495117, + "learning_rate": 1.5219842902182986e-05, + "loss": 0.6753, + "step": 3889 + }, + { + "epoch": 0.294351329877795, + "grad_norm": 1.9771231412887573, + "learning_rate": 1.5218214883242067e-05, + "loss": 0.6856, + "step": 3890 + }, + { + "epoch": 0.29442699860012866, + "grad_norm": 2.3351590633392334, + "learning_rate": 1.5216586474896416e-05, + "loss": 0.6703, + "step": 3891 + }, + { + "epoch": 0.2945026673224623, + "grad_norm": 2.4224588871002197, + "learning_rate": 1.5214957677248015e-05, + "loss": 0.7411, + "step": 3892 + }, + { + "epoch": 0.2945783360447959, + "grad_norm": 1.6721079349517822, + "learning_rate": 1.5213328490398863e-05, + "loss": 0.8959, + "step": 3893 + }, + { + "epoch": 0.2946540047671295, + "grad_norm": 1.949367642402649, + "learning_rate": 1.5211698914450985e-05, + "loss": 0.564, + "step": 3894 + }, + { + "epoch": 0.2947296734894631, + "grad_norm": 2.243993043899536, + "learning_rate": 1.5210068949506428e-05, + "loss": 0.7671, + "step": 3895 + }, + { + "epoch": 0.2948053422117968, + "grad_norm": 1.8077340126037598, + "learning_rate": 1.5208438595667269e-05, + "loss": 0.8371, + "step": 3896 + }, + { + "epoch": 0.2948810109341304, + "grad_norm": 2.3621714115142822, + "learning_rate": 1.5206807853035604e-05, + "loss": 0.6225, + "step": 3897 + }, + { + "epoch": 0.294956679656464, + "grad_norm": 2.770918846130371, + "learning_rate": 1.5205176721713558e-05, + "loss": 0.6834, + "step": 3898 + }, + { + "epoch": 0.2950323483787976, + "grad_norm": 2.04931902885437, + "learning_rate": 1.5203545201803273e-05, + "loss": 0.7648, + "step": 3899 + }, + { + "epoch": 0.29510801710113127, + "grad_norm": 2.5135276317596436, + "learning_rate": 1.5201913293406924e-05, + "loss": 0.6688, + "step": 3900 + }, + { + "epoch": 0.2951836858234649, + "grad_norm": 1.7943437099456787, + "learning_rate": 1.5200280996626705e-05, + "loss": 0.5577, + "step": 3901 + }, + { + "epoch": 0.2952593545457985, + "grad_norm": 2.1226847171783447, + "learning_rate": 1.5198648311564836e-05, + "loss": 0.6473, + "step": 3902 + }, + { + "epoch": 0.2953350232681321, + "grad_norm": 2.3555779457092285, + "learning_rate": 1.5197015238323561e-05, + "loss": 0.9919, + "step": 3903 + }, + { + "epoch": 0.29541069199046577, + "grad_norm": 3.256469964981079, + "learning_rate": 1.5195381777005147e-05, + "loss": 1.0487, + "step": 3904 + }, + { + "epoch": 0.2954863607127994, + "grad_norm": 2.362062931060791, + "learning_rate": 1.5193747927711889e-05, + "loss": 0.7727, + "step": 3905 + }, + { + "epoch": 0.295562029435133, + "grad_norm": 1.6341859102249146, + "learning_rate": 1.5192113690546101e-05, + "loss": 0.6764, + "step": 3906 + }, + { + "epoch": 0.2956376981574666, + "grad_norm": 2.2178308963775635, + "learning_rate": 1.5190479065610129e-05, + "loss": 0.8589, + "step": 3907 + }, + { + "epoch": 0.2957133668798002, + "grad_norm": 2.287083148956299, + "learning_rate": 1.5188844053006334e-05, + "loss": 0.8661, + "step": 3908 + }, + { + "epoch": 
0.2957890356021339, + "grad_norm": 2.168025016784668, + "learning_rate": 1.5187208652837105e-05, + "loss": 0.7258, + "step": 3909 + }, + { + "epoch": 0.2958647043244675, + "grad_norm": 2.3210017681121826, + "learning_rate": 1.5185572865204861e-05, + "loss": 0.9169, + "step": 3910 + }, + { + "epoch": 0.2959403730468011, + "grad_norm": 2.4655609130859375, + "learning_rate": 1.5183936690212038e-05, + "loss": 0.9665, + "step": 3911 + }, + { + "epoch": 0.2960160417691347, + "grad_norm": 2.277387857437134, + "learning_rate": 1.5182300127961097e-05, + "loss": 0.8959, + "step": 3912 + }, + { + "epoch": 0.2960917104914684, + "grad_norm": 2.419633388519287, + "learning_rate": 1.5180663178554527e-05, + "loss": 0.7201, + "step": 3913 + }, + { + "epoch": 0.296167379213802, + "grad_norm": 2.0627200603485107, + "learning_rate": 1.5179025842094837e-05, + "loss": 0.8001, + "step": 3914 + }, + { + "epoch": 0.2962430479361356, + "grad_norm": 2.119424343109131, + "learning_rate": 1.5177388118684563e-05, + "loss": 0.7499, + "step": 3915 + }, + { + "epoch": 0.2963187166584692, + "grad_norm": 2.0635826587677, + "learning_rate": 1.5175750008426266e-05, + "loss": 0.7671, + "step": 3916 + }, + { + "epoch": 0.29639438538080287, + "grad_norm": 1.9896836280822754, + "learning_rate": 1.5174111511422528e-05, + "loss": 0.694, + "step": 3917 + }, + { + "epoch": 0.2964700541031365, + "grad_norm": 2.2198286056518555, + "learning_rate": 1.517247262777596e-05, + "loss": 0.8069, + "step": 3918 + }, + { + "epoch": 0.2965457228254701, + "grad_norm": 2.9682750701904297, + "learning_rate": 1.5170833357589188e-05, + "loss": 0.8306, + "step": 3919 + }, + { + "epoch": 0.2966213915478037, + "grad_norm": 5.279101848602295, + "learning_rate": 1.5169193700964875e-05, + "loss": 0.891, + "step": 3920 + }, + { + "epoch": 0.2966970602701373, + "grad_norm": 2.3643431663513184, + "learning_rate": 1.5167553658005695e-05, + "loss": 0.7815, + "step": 3921 + }, + { + "epoch": 0.296772728992471, + "grad_norm": 2.442836046218872, + "learning_rate": 1.516591322881436e-05, + "loss": 0.7634, + "step": 3922 + }, + { + "epoch": 0.2968483977148046, + "grad_norm": 2.029599905014038, + "learning_rate": 1.5164272413493597e-05, + "loss": 0.7216, + "step": 3923 + }, + { + "epoch": 0.2969240664371382, + "grad_norm": 2.4150075912475586, + "learning_rate": 1.5162631212146155e-05, + "loss": 0.8002, + "step": 3924 + }, + { + "epoch": 0.2969997351594718, + "grad_norm": 2.4529528617858887, + "learning_rate": 1.5160989624874815e-05, + "loss": 0.6906, + "step": 3925 + }, + { + "epoch": 0.2970754038818055, + "grad_norm": 2.5680036544799805, + "learning_rate": 1.5159347651782379e-05, + "loss": 0.7421, + "step": 3926 + }, + { + "epoch": 0.2971510726041391, + "grad_norm": 2.2897586822509766, + "learning_rate": 1.515770529297167e-05, + "loss": 0.7853, + "step": 3927 + }, + { + "epoch": 0.2972267413264727, + "grad_norm": 2.2733471393585205, + "learning_rate": 1.5156062548545538e-05, + "loss": 0.6197, + "step": 3928 + }, + { + "epoch": 0.2973024100488063, + "grad_norm": 2.1105287075042725, + "learning_rate": 1.515441941860686e-05, + "loss": 0.8367, + "step": 3929 + }, + { + "epoch": 0.29737807877114, + "grad_norm": 2.4715983867645264, + "learning_rate": 1.515277590325853e-05, + "loss": 0.6271, + "step": 3930 + }, + { + "epoch": 0.2974537474934736, + "grad_norm": 1.8005375862121582, + "learning_rate": 1.5151132002603475e-05, + "loss": 0.7891, + "step": 3931 + }, + { + "epoch": 0.2975294162158072, + "grad_norm": 2.270176410675049, + "learning_rate": 1.5149487716744637e-05, + 
"loss": 0.6982, + "step": 3932 + }, + { + "epoch": 0.2976050849381408, + "grad_norm": 2.324087142944336, + "learning_rate": 1.5147843045784989e-05, + "loss": 0.7359, + "step": 3933 + }, + { + "epoch": 0.2976807536604744, + "grad_norm": 2.229957103729248, + "learning_rate": 1.5146197989827526e-05, + "loss": 0.6015, + "step": 3934 + }, + { + "epoch": 0.2977564223828081, + "grad_norm": 1.9169422388076782, + "learning_rate": 1.5144552548975264e-05, + "loss": 0.6176, + "step": 3935 + }, + { + "epoch": 0.2978320911051417, + "grad_norm": 2.401707172393799, + "learning_rate": 1.5142906723331248e-05, + "loss": 0.7862, + "step": 3936 + }, + { + "epoch": 0.2979077598274753, + "grad_norm": 2.221273899078369, + "learning_rate": 1.5141260512998544e-05, + "loss": 0.7928, + "step": 3937 + }, + { + "epoch": 0.2979834285498089, + "grad_norm": 2.259052276611328, + "learning_rate": 1.5139613918080243e-05, + "loss": 0.8092, + "step": 3938 + }, + { + "epoch": 0.2980590972721426, + "grad_norm": 2.435391426086426, + "learning_rate": 1.5137966938679463e-05, + "loss": 0.7887, + "step": 3939 + }, + { + "epoch": 0.2981347659944762, + "grad_norm": 1.9297043085098267, + "learning_rate": 1.5136319574899338e-05, + "loss": 0.763, + "step": 3940 + }, + { + "epoch": 0.2982104347168098, + "grad_norm": 1.8579686880111694, + "learning_rate": 1.5134671826843034e-05, + "loss": 0.6998, + "step": 3941 + }, + { + "epoch": 0.2982861034391434, + "grad_norm": 2.0775258541107178, + "learning_rate": 1.5133023694613743e-05, + "loss": 0.7513, + "step": 3942 + }, + { + "epoch": 0.2983617721614771, + "grad_norm": 2.1866838932037354, + "learning_rate": 1.5131375178314666e-05, + "loss": 0.8225, + "step": 3943 + }, + { + "epoch": 0.2984374408838107, + "grad_norm": 1.9399791955947876, + "learning_rate": 1.5129726278049046e-05, + "loss": 0.7335, + "step": 3944 + }, + { + "epoch": 0.2985131096061443, + "grad_norm": 2.0289242267608643, + "learning_rate": 1.5128076993920142e-05, + "loss": 0.7298, + "step": 3945 + }, + { + "epoch": 0.2985887783284779, + "grad_norm": 1.8820334672927856, + "learning_rate": 1.5126427326031233e-05, + "loss": 0.7299, + "step": 3946 + }, + { + "epoch": 0.2986644470508115, + "grad_norm": 1.9488524198532104, + "learning_rate": 1.5124777274485631e-05, + "loss": 0.7782, + "step": 3947 + }, + { + "epoch": 0.2987401157731452, + "grad_norm": 2.3379132747650146, + "learning_rate": 1.5123126839386668e-05, + "loss": 0.7691, + "step": 3948 + }, + { + "epoch": 0.2988157844954788, + "grad_norm": 2.2485504150390625, + "learning_rate": 1.5121476020837695e-05, + "loss": 0.8717, + "step": 3949 + }, + { + "epoch": 0.2988914532178124, + "grad_norm": 2.426990509033203, + "learning_rate": 1.5119824818942093e-05, + "loss": 0.8312, + "step": 3950 + }, + { + "epoch": 0.298967121940146, + "grad_norm": 2.0571541786193848, + "learning_rate": 1.511817323380327e-05, + "loss": 0.7172, + "step": 3951 + }, + { + "epoch": 0.2990427906624797, + "grad_norm": 2.4347405433654785, + "learning_rate": 1.5116521265524652e-05, + "loss": 0.9351, + "step": 3952 + }, + { + "epoch": 0.2991184593848133, + "grad_norm": 2.8219993114471436, + "learning_rate": 1.5114868914209686e-05, + "loss": 0.6793, + "step": 3953 + }, + { + "epoch": 0.2991941281071469, + "grad_norm": 2.1219239234924316, + "learning_rate": 1.5113216179961852e-05, + "loss": 0.826, + "step": 3954 + }, + { + "epoch": 0.2992697968294805, + "grad_norm": 2.106398344039917, + "learning_rate": 1.511156306288465e-05, + "loss": 0.7407, + "step": 3955 + }, + { + "epoch": 0.2993454655518142, + "grad_norm": 
2.503176689147949, + "learning_rate": 1.5109909563081598e-05, + "loss": 0.7621, + "step": 3956 + }, + { + "epoch": 0.2994211342741478, + "grad_norm": 1.9739935398101807, + "learning_rate": 1.510825568065625e-05, + "loss": 0.6997, + "step": 3957 + }, + { + "epoch": 0.2994968029964814, + "grad_norm": 2.651623010635376, + "learning_rate": 1.5106601415712173e-05, + "loss": 0.7928, + "step": 3958 + }, + { + "epoch": 0.299572471718815, + "grad_norm": 2.1895413398742676, + "learning_rate": 1.5104946768352966e-05, + "loss": 0.6159, + "step": 3959 + }, + { + "epoch": 0.2996481404411486, + "grad_norm": 2.34374737739563, + "learning_rate": 1.5103291738682245e-05, + "loss": 0.7105, + "step": 3960 + }, + { + "epoch": 0.2997238091634823, + "grad_norm": 2.078537702560425, + "learning_rate": 1.5101636326803654e-05, + "loss": 0.7007, + "step": 3961 + }, + { + "epoch": 0.2997994778858159, + "grad_norm": 2.177887439727783, + "learning_rate": 1.5099980532820864e-05, + "loss": 0.6706, + "step": 3962 + }, + { + "epoch": 0.2998751466081495, + "grad_norm": 1.6755671501159668, + "learning_rate": 1.5098324356837562e-05, + "loss": 0.6632, + "step": 3963 + }, + { + "epoch": 0.2999508153304831, + "grad_norm": 3.093810796737671, + "learning_rate": 1.5096667798957465e-05, + "loss": 0.7289, + "step": 3964 + }, + { + "epoch": 0.3000264840528168, + "grad_norm": 2.284776449203491, + "learning_rate": 1.509501085928431e-05, + "loss": 0.8436, + "step": 3965 + }, + { + "epoch": 0.3001021527751504, + "grad_norm": 2.3349032402038574, + "learning_rate": 1.5093353537921863e-05, + "loss": 0.7769, + "step": 3966 + }, + { + "epoch": 0.300177821497484, + "grad_norm": 2.382188081741333, + "learning_rate": 1.5091695834973908e-05, + "loss": 0.7436, + "step": 3967 + }, + { + "epoch": 0.3002534902198176, + "grad_norm": 2.356771469116211, + "learning_rate": 1.5090037750544255e-05, + "loss": 0.7242, + "step": 3968 + }, + { + "epoch": 0.3003291589421513, + "grad_norm": 2.8641955852508545, + "learning_rate": 1.5088379284736744e-05, + "loss": 0.8392, + "step": 3969 + }, + { + "epoch": 0.3004048276644849, + "grad_norm": 2.1027886867523193, + "learning_rate": 1.5086720437655228e-05, + "loss": 0.6559, + "step": 3970 + }, + { + "epoch": 0.3004804963868185, + "grad_norm": 2.1024560928344727, + "learning_rate": 1.5085061209403593e-05, + "loss": 0.8123, + "step": 3971 + }, + { + "epoch": 0.3005561651091521, + "grad_norm": 3.435528516769409, + "learning_rate": 1.5083401600085741e-05, + "loss": 0.7778, + "step": 3972 + }, + { + "epoch": 0.30063183383148573, + "grad_norm": 2.4583373069763184, + "learning_rate": 1.5081741609805608e-05, + "loss": 0.6514, + "step": 3973 + }, + { + "epoch": 0.3007075025538194, + "grad_norm": 2.3399574756622314, + "learning_rate": 1.5080081238667143e-05, + "loss": 0.7155, + "step": 3974 + }, + { + "epoch": 0.300783171276153, + "grad_norm": 2.8221993446350098, + "learning_rate": 1.5078420486774327e-05, + "loss": 0.9117, + "step": 3975 + }, + { + "epoch": 0.3008588399984866, + "grad_norm": 2.924146890640259, + "learning_rate": 1.5076759354231156e-05, + "loss": 0.6425, + "step": 3976 + }, + { + "epoch": 0.3009345087208202, + "grad_norm": 2.2757415771484375, + "learning_rate": 1.5075097841141663e-05, + "loss": 0.9589, + "step": 3977 + }, + { + "epoch": 0.3010101774431539, + "grad_norm": 2.634446620941162, + "learning_rate": 1.5073435947609891e-05, + "loss": 0.7762, + "step": 3978 + }, + { + "epoch": 0.3010858461654875, + "grad_norm": 2.1489665508270264, + "learning_rate": 1.5071773673739918e-05, + "loss": 0.9189, + "step": 3979 + 
}, + { + "epoch": 0.3011615148878211, + "grad_norm": 3.0982818603515625, + "learning_rate": 1.507011101963584e-05, + "loss": 0.7364, + "step": 3980 + }, + { + "epoch": 0.3012371836101547, + "grad_norm": 3.0719218254089355, + "learning_rate": 1.5068447985401776e-05, + "loss": 0.6404, + "step": 3981 + }, + { + "epoch": 0.3013128523324884, + "grad_norm": 2.7266042232513428, + "learning_rate": 1.5066784571141874e-05, + "loss": 0.7629, + "step": 3982 + }, + { + "epoch": 0.301388521054822, + "grad_norm": 3.0816164016723633, + "learning_rate": 1.5065120776960294e-05, + "loss": 0.682, + "step": 3983 + }, + { + "epoch": 0.3014641897771556, + "grad_norm": 2.005387306213379, + "learning_rate": 1.5063456602961237e-05, + "loss": 0.6885, + "step": 3984 + }, + { + "epoch": 0.3015398584994892, + "grad_norm": 2.6402080059051514, + "learning_rate": 1.5061792049248918e-05, + "loss": 0.7662, + "step": 3985 + }, + { + "epoch": 0.30161552722182283, + "grad_norm": 2.094618797302246, + "learning_rate": 1.5060127115927572e-05, + "loss": 0.7406, + "step": 3986 + }, + { + "epoch": 0.3016911959441565, + "grad_norm": 2.5571320056915283, + "learning_rate": 1.5058461803101466e-05, + "loss": 0.6746, + "step": 3987 + }, + { + "epoch": 0.3017668646664901, + "grad_norm": 2.0813798904418945, + "learning_rate": 1.5056796110874885e-05, + "loss": 0.6651, + "step": 3988 + }, + { + "epoch": 0.3018425333888237, + "grad_norm": 1.9654511213302612, + "learning_rate": 1.5055130039352146e-05, + "loss": 0.7259, + "step": 3989 + }, + { + "epoch": 0.30191820211115733, + "grad_norm": 2.2641611099243164, + "learning_rate": 1.5053463588637577e-05, + "loss": 0.6931, + "step": 3990 + }, + { + "epoch": 0.301993870833491, + "grad_norm": 2.238877058029175, + "learning_rate": 1.5051796758835534e-05, + "loss": 0.7844, + "step": 3991 + }, + { + "epoch": 0.3020695395558246, + "grad_norm": 3.144057512283325, + "learning_rate": 1.505012955005041e-05, + "loss": 0.9696, + "step": 3992 + }, + { + "epoch": 0.3021452082781582, + "grad_norm": 1.9449888467788696, + "learning_rate": 1.5048461962386602e-05, + "loss": 0.7733, + "step": 3993 + }, + { + "epoch": 0.3022208770004918, + "grad_norm": 2.664398193359375, + "learning_rate": 1.5046793995948543e-05, + "loss": 0.8057, + "step": 3994 + }, + { + "epoch": 0.3022965457228255, + "grad_norm": 3.067143678665161, + "learning_rate": 1.504512565084069e-05, + "loss": 0.6894, + "step": 3995 + }, + { + "epoch": 0.3023722144451591, + "grad_norm": 2.999467372894287, + "learning_rate": 1.5043456927167511e-05, + "loss": 0.7235, + "step": 3996 + }, + { + "epoch": 0.3024478831674927, + "grad_norm": 2.0027337074279785, + "learning_rate": 1.5041787825033516e-05, + "loss": 0.5782, + "step": 3997 + }, + { + "epoch": 0.3025235518898263, + "grad_norm": 3.10082745552063, + "learning_rate": 1.5040118344543226e-05, + "loss": 0.6152, + "step": 3998 + }, + { + "epoch": 0.30259922061215994, + "grad_norm": 2.1244442462921143, + "learning_rate": 1.5038448485801188e-05, + "loss": 0.6947, + "step": 3999 + }, + { + "epoch": 0.3026748893344936, + "grad_norm": 3.423577070236206, + "learning_rate": 1.5036778248911973e-05, + "loss": 0.7686, + "step": 4000 + }, + { + "epoch": 0.3027505580568272, + "grad_norm": 2.019320249557495, + "learning_rate": 1.5035107633980182e-05, + "loss": 0.7844, + "step": 4001 + }, + { + "epoch": 0.3028262267791608, + "grad_norm": 1.8496774435043335, + "learning_rate": 1.503343664111043e-05, + "loss": 0.6277, + "step": 4002 + }, + { + "epoch": 0.30290189550149443, + "grad_norm": 2.151859998703003, + "learning_rate": 
1.5031765270407362e-05, + "loss": 0.6403, + "step": 4003 + }, + { + "epoch": 0.3029775642238281, + "grad_norm": 2.207608222961426, + "learning_rate": 1.5030093521975642e-05, + "loss": 0.9016, + "step": 4004 + }, + { + "epoch": 0.3030532329461617, + "grad_norm": 2.543726921081543, + "learning_rate": 1.5028421395919961e-05, + "loss": 0.8025, + "step": 4005 + }, + { + "epoch": 0.3031289016684953, + "grad_norm": 1.9269533157348633, + "learning_rate": 1.5026748892345037e-05, + "loss": 0.6987, + "step": 4006 + }, + { + "epoch": 0.30320457039082893, + "grad_norm": 2.5526182651519775, + "learning_rate": 1.5025076011355602e-05, + "loss": 0.7898, + "step": 4007 + }, + { + "epoch": 0.3032802391131626, + "grad_norm": 2.2586183547973633, + "learning_rate": 1.5023402753056422e-05, + "loss": 0.782, + "step": 4008 + }, + { + "epoch": 0.3033559078354962, + "grad_norm": 2.1747992038726807, + "learning_rate": 1.5021729117552276e-05, + "loss": 0.77, + "step": 4009 + }, + { + "epoch": 0.3034315765578298, + "grad_norm": 2.0999064445495605, + "learning_rate": 1.5020055104947979e-05, + "loss": 0.6958, + "step": 4010 + }, + { + "epoch": 0.30350724528016343, + "grad_norm": 2.135993003845215, + "learning_rate": 1.501838071534836e-05, + "loss": 0.633, + "step": 4011 + }, + { + "epoch": 0.3035829140024971, + "grad_norm": 3.0176174640655518, + "learning_rate": 1.5016705948858274e-05, + "loss": 0.6997, + "step": 4012 + }, + { + "epoch": 0.3036585827248307, + "grad_norm": 2.2656147480010986, + "learning_rate": 1.5015030805582602e-05, + "loss": 0.7354, + "step": 4013 + }, + { + "epoch": 0.3037342514471643, + "grad_norm": 2.75590443611145, + "learning_rate": 1.5013355285626243e-05, + "loss": 0.5873, + "step": 4014 + }, + { + "epoch": 0.3038099201694979, + "grad_norm": 2.2523720264434814, + "learning_rate": 1.501167938909413e-05, + "loss": 0.795, + "step": 4015 + }, + { + "epoch": 0.30388558889183154, + "grad_norm": 2.134129047393799, + "learning_rate": 1.501000311609121e-05, + "loss": 0.7115, + "step": 4016 + }, + { + "epoch": 0.3039612576141652, + "grad_norm": 2.428209066390991, + "learning_rate": 1.5008326466722451e-05, + "loss": 0.7494, + "step": 4017 + }, + { + "epoch": 0.3040369263364988, + "grad_norm": 2.9792733192443848, + "learning_rate": 1.500664944109286e-05, + "loss": 0.9121, + "step": 4018 + }, + { + "epoch": 0.3041125950588324, + "grad_norm": 3.070441961288452, + "learning_rate": 1.5004972039307451e-05, + "loss": 0.7321, + "step": 4019 + }, + { + "epoch": 0.30418826378116604, + "grad_norm": 2.1026179790496826, + "learning_rate": 1.5003294261471272e-05, + "loss": 0.5796, + "step": 4020 + }, + { + "epoch": 0.3042639325034997, + "grad_norm": 2.6371750831604004, + "learning_rate": 1.5001616107689388e-05, + "loss": 0.7323, + "step": 4021 + }, + { + "epoch": 0.3043396012258333, + "grad_norm": 2.2472572326660156, + "learning_rate": 1.4999937578066893e-05, + "loss": 0.8368, + "step": 4022 + }, + { + "epoch": 0.3044152699481669, + "grad_norm": 4.721674919128418, + "learning_rate": 1.4998258672708901e-05, + "loss": 0.552, + "step": 4023 + }, + { + "epoch": 0.30449093867050053, + "grad_norm": 4.181674480438232, + "learning_rate": 1.499657939172055e-05, + "loss": 0.7753, + "step": 4024 + }, + { + "epoch": 0.3045666073928342, + "grad_norm": 2.1796109676361084, + "learning_rate": 1.4994899735207e-05, + "loss": 0.6785, + "step": 4025 + }, + { + "epoch": 0.3046422761151678, + "grad_norm": 2.1673810482025146, + "learning_rate": 1.499321970327344e-05, + "loss": 0.743, + "step": 4026 + }, + { + "epoch": 0.3047179448375014, + 
"grad_norm": 2.26334285736084, + "learning_rate": 1.4991539296025078e-05, + "loss": 0.7859, + "step": 4027 + }, + { + "epoch": 0.30479361355983503, + "grad_norm": 2.063838005065918, + "learning_rate": 1.4989858513567147e-05, + "loss": 1.0079, + "step": 4028 + }, + { + "epoch": 0.30486928228216864, + "grad_norm": 2.168710708618164, + "learning_rate": 1.4988177356004902e-05, + "loss": 0.6894, + "step": 4029 + }, + { + "epoch": 0.3049449510045023, + "grad_norm": 1.7202786207199097, + "learning_rate": 1.4986495823443621e-05, + "loss": 0.8765, + "step": 4030 + }, + { + "epoch": 0.3050206197268359, + "grad_norm": 2.792025327682495, + "learning_rate": 1.4984813915988614e-05, + "loss": 0.6793, + "step": 4031 + }, + { + "epoch": 0.30509628844916953, + "grad_norm": 2.359570264816284, + "learning_rate": 1.4983131633745196e-05, + "loss": 0.7668, + "step": 4032 + }, + { + "epoch": 0.30517195717150314, + "grad_norm": 4.47695779800415, + "learning_rate": 1.4981448976818725e-05, + "loss": 0.8328, + "step": 4033 + }, + { + "epoch": 0.3052476258938368, + "grad_norm": 2.457249879837036, + "learning_rate": 1.4979765945314574e-05, + "loss": 0.6713, + "step": 4034 + }, + { + "epoch": 0.3053232946161704, + "grad_norm": 1.8373754024505615, + "learning_rate": 1.497808253933814e-05, + "loss": 0.6746, + "step": 4035 + }, + { + "epoch": 0.305398963338504, + "grad_norm": 2.597541332244873, + "learning_rate": 1.497639875899484e-05, + "loss": 0.6358, + "step": 4036 + }, + { + "epoch": 0.30547463206083764, + "grad_norm": 2.0362725257873535, + "learning_rate": 1.4974714604390118e-05, + "loss": 0.8613, + "step": 4037 + }, + { + "epoch": 0.3055503007831713, + "grad_norm": 2.046318292617798, + "learning_rate": 1.4973030075629447e-05, + "loss": 0.7307, + "step": 4038 + }, + { + "epoch": 0.3056259695055049, + "grad_norm": 2.4216933250427246, + "learning_rate": 1.4971345172818313e-05, + "loss": 0.7361, + "step": 4039 + }, + { + "epoch": 0.3057016382278385, + "grad_norm": 2.259838104248047, + "learning_rate": 1.4969659896062226e-05, + "loss": 0.8991, + "step": 4040 + }, + { + "epoch": 0.30577730695017213, + "grad_norm": 1.8823871612548828, + "learning_rate": 1.4967974245466731e-05, + "loss": 0.7365, + "step": 4041 + }, + { + "epoch": 0.30585297567250574, + "grad_norm": 2.1523728370666504, + "learning_rate": 1.4966288221137388e-05, + "loss": 0.7233, + "step": 4042 + }, + { + "epoch": 0.3059286443948394, + "grad_norm": 5.254127502441406, + "learning_rate": 1.4964601823179776e-05, + "loss": 0.6328, + "step": 4043 + }, + { + "epoch": 0.306004313117173, + "grad_norm": 1.8347951173782349, + "learning_rate": 1.4962915051699506e-05, + "loss": 0.7929, + "step": 4044 + }, + { + "epoch": 0.30607998183950663, + "grad_norm": 2.36879563331604, + "learning_rate": 1.4961227906802212e-05, + "loss": 0.6739, + "step": 4045 + }, + { + "epoch": 0.30615565056184024, + "grad_norm": 1.9774808883666992, + "learning_rate": 1.4959540388593543e-05, + "loss": 0.8385, + "step": 4046 + }, + { + "epoch": 0.3062313192841739, + "grad_norm": 1.8404202461242676, + "learning_rate": 1.4957852497179182e-05, + "loss": 0.7847, + "step": 4047 + }, + { + "epoch": 0.3063069880065075, + "grad_norm": 2.2674901485443115, + "learning_rate": 1.4956164232664825e-05, + "loss": 0.7625, + "step": 4048 + }, + { + "epoch": 0.30638265672884113, + "grad_norm": 1.9228596687316895, + "learning_rate": 1.4954475595156198e-05, + "loss": 0.7071, + "step": 4049 + }, + { + "epoch": 0.30645832545117474, + "grad_norm": 2.0531787872314453, + "learning_rate": 1.4952786584759053e-05, + "loss": 
0.7727, + "step": 4050 + }, + { + "epoch": 0.3065339941735084, + "grad_norm": 2.334456205368042, + "learning_rate": 1.4951097201579159e-05, + "loss": 0.7362, + "step": 4051 + }, + { + "epoch": 0.306609662895842, + "grad_norm": 2.5954339504241943, + "learning_rate": 1.4949407445722308e-05, + "loss": 0.6712, + "step": 4052 + }, + { + "epoch": 0.3066853316181756, + "grad_norm": 2.0414674282073975, + "learning_rate": 1.4947717317294321e-05, + "loss": 0.7052, + "step": 4053 + }, + { + "epoch": 0.30676100034050924, + "grad_norm": 2.296151638031006, + "learning_rate": 1.4946026816401037e-05, + "loss": 0.6226, + "step": 4054 + }, + { + "epoch": 0.30683666906284285, + "grad_norm": 1.8123749494552612, + "learning_rate": 1.4944335943148323e-05, + "loss": 0.7195, + "step": 4055 + }, + { + "epoch": 0.3069123377851765, + "grad_norm": 2.317089319229126, + "learning_rate": 1.4942644697642067e-05, + "loss": 0.728, + "step": 4056 + }, + { + "epoch": 0.3069880065075101, + "grad_norm": 2.25272536277771, + "learning_rate": 1.4940953079988179e-05, + "loss": 0.6677, + "step": 4057 + }, + { + "epoch": 0.30706367522984374, + "grad_norm": 2.574553966522217, + "learning_rate": 1.4939261090292592e-05, + "loss": 0.961, + "step": 4058 + }, + { + "epoch": 0.30713934395217735, + "grad_norm": 2.785632610321045, + "learning_rate": 1.4937568728661265e-05, + "loss": 0.7058, + "step": 4059 + }, + { + "epoch": 0.307215012674511, + "grad_norm": 2.4276599884033203, + "learning_rate": 1.4935875995200183e-05, + "loss": 0.7033, + "step": 4060 + }, + { + "epoch": 0.3072906813968446, + "grad_norm": 2.2110235691070557, + "learning_rate": 1.4934182890015345e-05, + "loss": 0.6735, + "step": 4061 + }, + { + "epoch": 0.30736635011917823, + "grad_norm": 2.3032000064849854, + "learning_rate": 1.4932489413212782e-05, + "loss": 0.8252, + "step": 4062 + }, + { + "epoch": 0.30744201884151184, + "grad_norm": 2.245957136154175, + "learning_rate": 1.4930795564898543e-05, + "loss": 0.9702, + "step": 4063 + }, + { + "epoch": 0.3075176875638455, + "grad_norm": 2.7905473709106445, + "learning_rate": 1.4929101345178703e-05, + "loss": 0.6039, + "step": 4064 + }, + { + "epoch": 0.3075933562861791, + "grad_norm": 2.3959288597106934, + "learning_rate": 1.4927406754159361e-05, + "loss": 0.6691, + "step": 4065 + }, + { + "epoch": 0.30766902500851273, + "grad_norm": 1.9613829851150513, + "learning_rate": 1.4925711791946636e-05, + "loss": 0.8457, + "step": 4066 + }, + { + "epoch": 0.30774469373084634, + "grad_norm": 2.6765880584716797, + "learning_rate": 1.492401645864667e-05, + "loss": 0.8385, + "step": 4067 + }, + { + "epoch": 0.30782036245317995, + "grad_norm": 2.0931665897369385, + "learning_rate": 1.4922320754365636e-05, + "loss": 0.6882, + "step": 4068 + }, + { + "epoch": 0.3078960311755136, + "grad_norm": 2.6953020095825195, + "learning_rate": 1.4920624679209723e-05, + "loss": 0.7359, + "step": 4069 + }, + { + "epoch": 0.30797169989784723, + "grad_norm": 2.3795089721679688, + "learning_rate": 1.4918928233285139e-05, + "loss": 0.7182, + "step": 4070 + }, + { + "epoch": 0.30804736862018084, + "grad_norm": 1.9124236106872559, + "learning_rate": 1.4917231416698126e-05, + "loss": 0.6546, + "step": 4071 + }, + { + "epoch": 0.30812303734251445, + "grad_norm": 2.0559885501861572, + "learning_rate": 1.4915534229554944e-05, + "loss": 0.6896, + "step": 4072 + }, + { + "epoch": 0.3081987060648481, + "grad_norm": 1.9751750230789185, + "learning_rate": 1.4913836671961874e-05, + "loss": 0.7115, + "step": 4073 + }, + { + "epoch": 0.3082743747871817, + "grad_norm": 
2.7253572940826416, + "learning_rate": 1.4912138744025223e-05, + "loss": 0.6748, + "step": 4074 + }, + { + "epoch": 0.30835004350951534, + "grad_norm": 2.238069534301758, + "learning_rate": 1.4910440445851325e-05, + "loss": 0.7391, + "step": 4075 + }, + { + "epoch": 0.30842571223184895, + "grad_norm": 2.452892780303955, + "learning_rate": 1.4908741777546527e-05, + "loss": 0.8675, + "step": 4076 + }, + { + "epoch": 0.3085013809541826, + "grad_norm": 2.315485715866089, + "learning_rate": 1.4907042739217208e-05, + "loss": 0.7192, + "step": 4077 + }, + { + "epoch": 0.3085770496765162, + "grad_norm": 2.2211544513702393, + "learning_rate": 1.4905343330969766e-05, + "loss": 0.7764, + "step": 4078 + }, + { + "epoch": 0.30865271839884983, + "grad_norm": 2.0151994228363037, + "learning_rate": 1.4903643552910628e-05, + "loss": 0.7262, + "step": 4079 + }, + { + "epoch": 0.30872838712118345, + "grad_norm": 2.6214284896850586, + "learning_rate": 1.4901943405146233e-05, + "loss": 0.7586, + "step": 4080 + }, + { + "epoch": 0.30880405584351706, + "grad_norm": 2.426795244216919, + "learning_rate": 1.4900242887783053e-05, + "loss": 0.7345, + "step": 4081 + }, + { + "epoch": 0.3088797245658507, + "grad_norm": 1.778331995010376, + "learning_rate": 1.4898542000927582e-05, + "loss": 0.606, + "step": 4082 + }, + { + "epoch": 0.30895539328818433, + "grad_norm": 2.1181724071502686, + "learning_rate": 1.4896840744686331e-05, + "loss": 0.7484, + "step": 4083 + }, + { + "epoch": 0.30903106201051794, + "grad_norm": 2.628530740737915, + "learning_rate": 1.4895139119165837e-05, + "loss": 0.7351, + "step": 4084 + }, + { + "epoch": 0.30910673073285155, + "grad_norm": 1.912762999534607, + "learning_rate": 1.489343712447267e-05, + "loss": 0.7039, + "step": 4085 + }, + { + "epoch": 0.3091823994551852, + "grad_norm": 2.245521306991577, + "learning_rate": 1.4891734760713405e-05, + "loss": 0.985, + "step": 4086 + }, + { + "epoch": 0.30925806817751883, + "grad_norm": 6.195135593414307, + "learning_rate": 1.4890032027994655e-05, + "loss": 0.7202, + "step": 4087 + }, + { + "epoch": 0.30933373689985244, + "grad_norm": 2.0465266704559326, + "learning_rate": 1.4888328926423048e-05, + "loss": 0.7164, + "step": 4088 + }, + { + "epoch": 0.30940940562218605, + "grad_norm": 2.153587818145752, + "learning_rate": 1.4886625456105235e-05, + "loss": 0.8307, + "step": 4089 + }, + { + "epoch": 0.3094850743445197, + "grad_norm": 2.2596750259399414, + "learning_rate": 1.48849216171479e-05, + "loss": 0.7311, + "step": 4090 + }, + { + "epoch": 0.3095607430668533, + "grad_norm": 1.8946340084075928, + "learning_rate": 1.4883217409657739e-05, + "loss": 0.7878, + "step": 4091 + }, + { + "epoch": 0.30963641178918694, + "grad_norm": 1.9765899181365967, + "learning_rate": 1.4881512833741475e-05, + "loss": 0.746, + "step": 4092 + }, + { + "epoch": 0.30971208051152055, + "grad_norm": 2.942574977874756, + "learning_rate": 1.4879807889505856e-05, + "loss": 0.6554, + "step": 4093 + }, + { + "epoch": 0.30978774923385416, + "grad_norm": 2.359882116317749, + "learning_rate": 1.4878102577057643e-05, + "loss": 0.6133, + "step": 4094 + }, + { + "epoch": 0.3098634179561878, + "grad_norm": 2.197938919067383, + "learning_rate": 1.487639689650364e-05, + "loss": 0.7502, + "step": 4095 + }, + { + "epoch": 0.30993908667852144, + "grad_norm": 2.5012736320495605, + "learning_rate": 1.4874690847950654e-05, + "loss": 0.7593, + "step": 4096 + }, + { + "epoch": 0.31001475540085505, + "grad_norm": 2.0770883560180664, + "learning_rate": 1.4872984431505528e-05, + "loss": 0.8841, + 
"step": 4097 + }, + { + "epoch": 0.31009042412318866, + "grad_norm": 2.287703275680542, + "learning_rate": 1.4871277647275122e-05, + "loss": 0.7934, + "step": 4098 + }, + { + "epoch": 0.3101660928455223, + "grad_norm": 1.9843212366104126, + "learning_rate": 1.486957049536632e-05, + "loss": 0.7782, + "step": 4099 + }, + { + "epoch": 0.31024176156785593, + "grad_norm": 2.0489273071289062, + "learning_rate": 1.4867862975886027e-05, + "loss": 0.7255, + "step": 4100 + }, + { + "epoch": 0.31031743029018954, + "grad_norm": 2.2509491443634033, + "learning_rate": 1.4866155088941175e-05, + "loss": 0.7689, + "step": 4101 + }, + { + "epoch": 0.31039309901252315, + "grad_norm": 2.3707261085510254, + "learning_rate": 1.4864446834638721e-05, + "loss": 0.8923, + "step": 4102 + }, + { + "epoch": 0.3104687677348568, + "grad_norm": 2.4059784412384033, + "learning_rate": 1.4862738213085634e-05, + "loss": 0.7605, + "step": 4103 + }, + { + "epoch": 0.31054443645719043, + "grad_norm": 2.0712780952453613, + "learning_rate": 1.4861029224388921e-05, + "loss": 0.7258, + "step": 4104 + }, + { + "epoch": 0.31062010517952404, + "grad_norm": 2.2427139282226562, + "learning_rate": 1.4859319868655602e-05, + "loss": 0.6634, + "step": 4105 + }, + { + "epoch": 0.31069577390185765, + "grad_norm": 2.8183469772338867, + "learning_rate": 1.4857610145992719e-05, + "loss": 0.7484, + "step": 4106 + }, + { + "epoch": 0.31077144262419126, + "grad_norm": 2.3078558444976807, + "learning_rate": 1.4855900056507343e-05, + "loss": 0.8037, + "step": 4107 + }, + { + "epoch": 0.31084711134652493, + "grad_norm": 1.7662575244903564, + "learning_rate": 1.4854189600306565e-05, + "loss": 0.5929, + "step": 4108 + }, + { + "epoch": 0.31092278006885854, + "grad_norm": 2.5993802547454834, + "learning_rate": 1.48524787774975e-05, + "loss": 0.7729, + "step": 4109 + }, + { + "epoch": 0.31099844879119215, + "grad_norm": 1.8144737482070923, + "learning_rate": 1.4850767588187285e-05, + "loss": 0.6611, + "step": 4110 + }, + { + "epoch": 0.31107411751352576, + "grad_norm": 1.6812607049942017, + "learning_rate": 1.4849056032483081e-05, + "loss": 0.7923, + "step": 4111 + }, + { + "epoch": 0.3111497862358594, + "grad_norm": 2.2537429332733154, + "learning_rate": 1.484734411049207e-05, + "loss": 0.7171, + "step": 4112 + }, + { + "epoch": 0.31122545495819304, + "grad_norm": 2.3459930419921875, + "learning_rate": 1.4845631822321456e-05, + "loss": 0.7479, + "step": 4113 + }, + { + "epoch": 0.31130112368052665, + "grad_norm": 1.9687094688415527, + "learning_rate": 1.484391916807847e-05, + "loss": 0.7927, + "step": 4114 + }, + { + "epoch": 0.31137679240286026, + "grad_norm": 2.511396646499634, + "learning_rate": 1.4842206147870365e-05, + "loss": 0.8989, + "step": 4115 + }, + { + "epoch": 0.3114524611251939, + "grad_norm": 2.6915669441223145, + "learning_rate": 1.4840492761804417e-05, + "loss": 0.8171, + "step": 4116 + }, + { + "epoch": 0.31152812984752754, + "grad_norm": 1.9705544710159302, + "learning_rate": 1.483877900998792e-05, + "loss": 0.7195, + "step": 4117 + }, + { + "epoch": 0.31160379856986115, + "grad_norm": 1.8884872198104858, + "learning_rate": 1.4837064892528197e-05, + "loss": 0.7556, + "step": 4118 + }, + { + "epoch": 0.31167946729219476, + "grad_norm": 2.239375591278076, + "learning_rate": 1.4835350409532592e-05, + "loss": 0.696, + "step": 4119 + }, + { + "epoch": 0.3117551360145284, + "grad_norm": 1.9161548614501953, + "learning_rate": 1.4833635561108469e-05, + "loss": 0.6043, + "step": 4120 + }, + { + "epoch": 0.31183080473686203, + "grad_norm": 
2.44989275932312, + "learning_rate": 1.483192034736322e-05, + "loss": 0.8091, + "step": 4121 + }, + { + "epoch": 0.31190647345919564, + "grad_norm": 2.2470176219940186, + "learning_rate": 1.4830204768404253e-05, + "loss": 0.7588, + "step": 4122 + }, + { + "epoch": 0.31198214218152925, + "grad_norm": 2.389315128326416, + "learning_rate": 1.482848882433901e-05, + "loss": 0.8738, + "step": 4123 + }, + { + "epoch": 0.31205781090386286, + "grad_norm": 2.3633134365081787, + "learning_rate": 1.4826772515274943e-05, + "loss": 0.7753, + "step": 4124 + }, + { + "epoch": 0.31213347962619653, + "grad_norm": 2.183661699295044, + "learning_rate": 1.4825055841319536e-05, + "loss": 0.8454, + "step": 4125 + }, + { + "epoch": 0.31220914834853014, + "grad_norm": 2.239809989929199, + "learning_rate": 1.4823338802580294e-05, + "loss": 0.7674, + "step": 4126 + }, + { + "epoch": 0.31228481707086375, + "grad_norm": 2.594881296157837, + "learning_rate": 1.4821621399164737e-05, + "loss": 0.8039, + "step": 4127 + }, + { + "epoch": 0.31236048579319736, + "grad_norm": 2.0808498859405518, + "learning_rate": 1.4819903631180423e-05, + "loss": 0.8132, + "step": 4128 + }, + { + "epoch": 0.31243615451553103, + "grad_norm": 2.28064227104187, + "learning_rate": 1.4818185498734914e-05, + "loss": 0.7763, + "step": 4129 + }, + { + "epoch": 0.31251182323786464, + "grad_norm": 2.041588068008423, + "learning_rate": 1.4816467001935815e-05, + "loss": 0.7704, + "step": 4130 + }, + { + "epoch": 0.31258749196019825, + "grad_norm": 2.0679996013641357, + "learning_rate": 1.4814748140890738e-05, + "loss": 0.6893, + "step": 4131 + }, + { + "epoch": 0.31266316068253186, + "grad_norm": 1.9504202604293823, + "learning_rate": 1.4813028915707326e-05, + "loss": 0.8287, + "step": 4132 + }, + { + "epoch": 0.3127388294048655, + "grad_norm": 2.44740891456604, + "learning_rate": 1.4811309326493244e-05, + "loss": 0.7966, + "step": 4133 + }, + { + "epoch": 0.31281449812719914, + "grad_norm": 2.279268264770508, + "learning_rate": 1.4809589373356173e-05, + "loss": 0.9172, + "step": 4134 + }, + { + "epoch": 0.31289016684953275, + "grad_norm": 2.8255929946899414, + "learning_rate": 1.4807869056403823e-05, + "loss": 0.7796, + "step": 4135 + }, + { + "epoch": 0.31296583557186636, + "grad_norm": 1.751339077949524, + "learning_rate": 1.480614837574393e-05, + "loss": 0.7826, + "step": 4136 + }, + { + "epoch": 0.31304150429419997, + "grad_norm": 2.050893545150757, + "learning_rate": 1.4804427331484249e-05, + "loss": 0.7079, + "step": 4137 + }, + { + "epoch": 0.31311717301653363, + "grad_norm": 2.0679502487182617, + "learning_rate": 1.480270592373255e-05, + "loss": 0.7366, + "step": 4138 + }, + { + "epoch": 0.31319284173886724, + "grad_norm": 2.277479887008667, + "learning_rate": 1.480098415259664e-05, + "loss": 0.701, + "step": 4139 + }, + { + "epoch": 0.31326851046120086, + "grad_norm": 3.157134771347046, + "learning_rate": 1.479926201818434e-05, + "loss": 0.748, + "step": 4140 + }, + { + "epoch": 0.31334417918353447, + "grad_norm": 2.37265682220459, + "learning_rate": 1.4797539520603497e-05, + "loss": 0.7892, + "step": 4141 + }, + { + "epoch": 0.31341984790586813, + "grad_norm": 1.8097896575927734, + "learning_rate": 1.4795816659961974e-05, + "loss": 0.8259, + "step": 4142 + }, + { + "epoch": 0.31349551662820174, + "grad_norm": 1.9213300943374634, + "learning_rate": 1.4794093436367668e-05, + "loss": 0.8412, + "step": 4143 + }, + { + "epoch": 0.31357118535053535, + "grad_norm": 1.8709452152252197, + "learning_rate": 1.479236984992849e-05, + "loss": 0.7159, + 
"step": 4144 + }, + { + "epoch": 0.31364685407286896, + "grad_norm": 2.649545192718506, + "learning_rate": 1.4790645900752377e-05, + "loss": 0.7186, + "step": 4145 + }, + { + "epoch": 0.31372252279520263, + "grad_norm": 2.275909662246704, + "learning_rate": 1.478892158894729e-05, + "loss": 0.6128, + "step": 4146 + }, + { + "epoch": 0.31379819151753624, + "grad_norm": 2.2135143280029297, + "learning_rate": 1.4787196914621208e-05, + "loss": 0.9426, + "step": 4147 + }, + { + "epoch": 0.31387386023986985, + "grad_norm": 2.2389373779296875, + "learning_rate": 1.4785471877882138e-05, + "loss": 0.7118, + "step": 4148 + }, + { + "epoch": 0.31394952896220346, + "grad_norm": 1.9949945211410522, + "learning_rate": 1.4783746478838108e-05, + "loss": 0.7185, + "step": 4149 + }, + { + "epoch": 0.31402519768453707, + "grad_norm": 1.941359519958496, + "learning_rate": 1.4782020717597163e-05, + "loss": 0.6794, + "step": 4150 + }, + { + "epoch": 0.31410086640687074, + "grad_norm": 2.1652402877807617, + "learning_rate": 1.478029459426738e-05, + "loss": 0.7128, + "step": 4151 + }, + { + "epoch": 0.31417653512920435, + "grad_norm": 2.5514955520629883, + "learning_rate": 1.4778568108956857e-05, + "loss": 0.8677, + "step": 4152 + }, + { + "epoch": 0.31425220385153796, + "grad_norm": 2.4251315593719482, + "learning_rate": 1.4776841261773706e-05, + "loss": 0.8487, + "step": 4153 + }, + { + "epoch": 0.31432787257387157, + "grad_norm": 2.3087351322174072, + "learning_rate": 1.477511405282607e-05, + "loss": 0.7846, + "step": 4154 + }, + { + "epoch": 0.31440354129620524, + "grad_norm": 1.8735215663909912, + "learning_rate": 1.4773386482222115e-05, + "loss": 0.7894, + "step": 4155 + }, + { + "epoch": 0.31447921001853885, + "grad_norm": 2.0851855278015137, + "learning_rate": 1.4771658550070024e-05, + "loss": 0.7316, + "step": 4156 + }, + { + "epoch": 0.31455487874087246, + "grad_norm": 2.0207672119140625, + "learning_rate": 1.4769930256478008e-05, + "loss": 0.7909, + "step": 4157 + }, + { + "epoch": 0.31463054746320607, + "grad_norm": 2.3125038146972656, + "learning_rate": 1.4768201601554295e-05, + "loss": 0.7246, + "step": 4158 + }, + { + "epoch": 0.31470621618553973, + "grad_norm": 2.17578387260437, + "learning_rate": 1.4766472585407142e-05, + "loss": 0.7306, + "step": 4159 + }, + { + "epoch": 0.31478188490787334, + "grad_norm": 1.933734655380249, + "learning_rate": 1.4764743208144827e-05, + "loss": 0.7355, + "step": 4160 + }, + { + "epoch": 0.31485755363020695, + "grad_norm": 2.213831901550293, + "learning_rate": 1.4763013469875644e-05, + "loss": 0.8326, + "step": 4161 + }, + { + "epoch": 0.31493322235254056, + "grad_norm": 2.065779209136963, + "learning_rate": 1.4761283370707919e-05, + "loss": 0.8269, + "step": 4162 + }, + { + "epoch": 0.3150088910748742, + "grad_norm": 2.665332794189453, + "learning_rate": 1.4759552910749993e-05, + "loss": 0.7556, + "step": 4163 + }, + { + "epoch": 0.31508455979720784, + "grad_norm": 1.8183536529541016, + "learning_rate": 1.4757822090110236e-05, + "loss": 0.7538, + "step": 4164 + }, + { + "epoch": 0.31516022851954145, + "grad_norm": 2.538285493850708, + "learning_rate": 1.4756090908897039e-05, + "loss": 0.8137, + "step": 4165 + }, + { + "epoch": 0.31523589724187506, + "grad_norm": 2.3421831130981445, + "learning_rate": 1.4754359367218808e-05, + "loss": 0.7443, + "step": 4166 + }, + { + "epoch": 0.3153115659642087, + "grad_norm": 3.463080406188965, + "learning_rate": 1.4752627465183985e-05, + "loss": 0.783, + "step": 4167 + }, + { + "epoch": 0.31538723468654234, + "grad_norm": 
2.2434184551239014, + "learning_rate": 1.4750895202901021e-05, + "loss": 0.8343, + "step": 4168 + }, + { + "epoch": 0.31546290340887595, + "grad_norm": 2.3080389499664307, + "learning_rate": 1.4749162580478401e-05, + "loss": 0.6915, + "step": 4169 + }, + { + "epoch": 0.31553857213120956, + "grad_norm": 1.77947998046875, + "learning_rate": 1.4747429598024625e-05, + "loss": 0.6928, + "step": 4170 + }, + { + "epoch": 0.31561424085354317, + "grad_norm": 1.8366894721984863, + "learning_rate": 1.4745696255648219e-05, + "loss": 0.7257, + "step": 4171 + }, + { + "epoch": 0.31568990957587684, + "grad_norm": 2.3540196418762207, + "learning_rate": 1.4743962553457729e-05, + "loss": 0.7414, + "step": 4172 + }, + { + "epoch": 0.31576557829821045, + "grad_norm": 2.082271099090576, + "learning_rate": 1.4742228491561723e-05, + "loss": 0.8058, + "step": 4173 + }, + { + "epoch": 0.31584124702054406, + "grad_norm": 1.8880605697631836, + "learning_rate": 1.4740494070068799e-05, + "loss": 0.8184, + "step": 4174 + }, + { + "epoch": 0.31591691574287767, + "grad_norm": 2.252822160720825, + "learning_rate": 1.4738759289087569e-05, + "loss": 0.8572, + "step": 4175 + }, + { + "epoch": 0.3159925844652113, + "grad_norm": 1.9916536808013916, + "learning_rate": 1.4737024148726668e-05, + "loss": 0.7948, + "step": 4176 + }, + { + "epoch": 0.31606825318754495, + "grad_norm": 2.2267072200775146, + "learning_rate": 1.4735288649094764e-05, + "loss": 0.7164, + "step": 4177 + }, + { + "epoch": 0.31614392190987856, + "grad_norm": 2.865279197692871, + "learning_rate": 1.4733552790300531e-05, + "loss": 0.8885, + "step": 4178 + }, + { + "epoch": 0.31621959063221217, + "grad_norm": 2.009626626968384, + "learning_rate": 1.473181657245268e-05, + "loss": 0.6854, + "step": 4179 + }, + { + "epoch": 0.3162952593545458, + "grad_norm": 2.28889536857605, + "learning_rate": 1.4730079995659935e-05, + "loss": 0.6812, + "step": 4180 + }, + { + "epoch": 0.31637092807687944, + "grad_norm": 1.8733904361724854, + "learning_rate": 1.4728343060031046e-05, + "loss": 0.7784, + "step": 4181 + }, + { + "epoch": 0.31644659679921305, + "grad_norm": 2.0452613830566406, + "learning_rate": 1.4726605765674788e-05, + "loss": 0.71, + "step": 4182 + }, + { + "epoch": 0.31652226552154666, + "grad_norm": 3.1773879528045654, + "learning_rate": 1.4724868112699957e-05, + "loss": 0.7515, + "step": 4183 + }, + { + "epoch": 0.3165979342438803, + "grad_norm": 1.9944331645965576, + "learning_rate": 1.4723130101215364e-05, + "loss": 0.686, + "step": 4184 + }, + { + "epoch": 0.31667360296621394, + "grad_norm": 2.6416072845458984, + "learning_rate": 1.4721391731329856e-05, + "loss": 0.8275, + "step": 4185 + }, + { + "epoch": 0.31674927168854755, + "grad_norm": 2.302402973175049, + "learning_rate": 1.4719653003152291e-05, + "loss": 0.6254, + "step": 4186 + }, + { + "epoch": 0.31682494041088116, + "grad_norm": 2.278822422027588, + "learning_rate": 1.4717913916791561e-05, + "loss": 0.9094, + "step": 4187 + }, + { + "epoch": 0.3169006091332148, + "grad_norm": 2.715550422668457, + "learning_rate": 1.4716174472356563e-05, + "loss": 0.8543, + "step": 4188 + }, + { + "epoch": 0.3169762778555484, + "grad_norm": 2.0558013916015625, + "learning_rate": 1.4714434669956228e-05, + "loss": 0.8176, + "step": 4189 + }, + { + "epoch": 0.31705194657788205, + "grad_norm": 2.050335645675659, + "learning_rate": 1.4712694509699517e-05, + "loss": 0.7816, + "step": 4190 + }, + { + "epoch": 0.31712761530021566, + "grad_norm": 2.250823497772217, + "learning_rate": 1.4710953991695394e-05, + "loss": 0.7854, 
+ "step": 4191 + }, + { + "epoch": 0.31720328402254927, + "grad_norm": 2.516578435897827, + "learning_rate": 1.4709213116052864e-05, + "loss": 0.7717, + "step": 4192 + }, + { + "epoch": 0.3172789527448829, + "grad_norm": 2.0898921489715576, + "learning_rate": 1.4707471882880942e-05, + "loss": 0.7474, + "step": 4193 + }, + { + "epoch": 0.31735462146721655, + "grad_norm": 2.837730646133423, + "learning_rate": 1.470573029228867e-05, + "loss": 0.6944, + "step": 4194 + }, + { + "epoch": 0.31743029018955016, + "grad_norm": 2.0054285526275635, + "learning_rate": 1.4703988344385113e-05, + "loss": 0.902, + "step": 4195 + }, + { + "epoch": 0.31750595891188377, + "grad_norm": 2.452274799346924, + "learning_rate": 1.4702246039279356e-05, + "loss": 0.7572, + "step": 4196 + }, + { + "epoch": 0.3175816276342174, + "grad_norm": 2.3126518726348877, + "learning_rate": 1.470050337708051e-05, + "loss": 0.8434, + "step": 4197 + }, + { + "epoch": 0.31765729635655104, + "grad_norm": 1.9818753004074097, + "learning_rate": 1.4698760357897703e-05, + "loss": 0.7367, + "step": 4198 + }, + { + "epoch": 0.31773296507888465, + "grad_norm": 2.2270803451538086, + "learning_rate": 1.4697016981840091e-05, + "loss": 0.7117, + "step": 4199 + }, + { + "epoch": 0.31780863380121827, + "grad_norm": 2.7719006538391113, + "learning_rate": 1.469527324901685e-05, + "loss": 0.902, + "step": 4200 + }, + { + "epoch": 0.3178843025235519, + "grad_norm": 2.4166553020477295, + "learning_rate": 1.4693529159537179e-05, + "loss": 0.7287, + "step": 4201 + }, + { + "epoch": 0.3179599712458855, + "grad_norm": 2.3354287147521973, + "learning_rate": 1.4691784713510294e-05, + "loss": 0.7441, + "step": 4202 + }, + { + "epoch": 0.31803563996821915, + "grad_norm": 2.553823709487915, + "learning_rate": 1.4690039911045443e-05, + "loss": 0.7037, + "step": 4203 + }, + { + "epoch": 0.31811130869055276, + "grad_norm": 3.147313356399536, + "learning_rate": 1.4688294752251888e-05, + "loss": 0.8055, + "step": 4204 + }, + { + "epoch": 0.3181869774128864, + "grad_norm": 2.2056877613067627, + "learning_rate": 1.4686549237238917e-05, + "loss": 0.6627, + "step": 4205 + }, + { + "epoch": 0.31826264613522, + "grad_norm": 2.1581954956054688, + "learning_rate": 1.4684803366115841e-05, + "loss": 0.7106, + "step": 4206 + }, + { + "epoch": 0.31833831485755365, + "grad_norm": 2.5941429138183594, + "learning_rate": 1.468305713899199e-05, + "loss": 0.8563, + "step": 4207 + }, + { + "epoch": 0.31841398357988726, + "grad_norm": 2.1362545490264893, + "learning_rate": 1.468131055597672e-05, + "loss": 0.7852, + "step": 4208 + }, + { + "epoch": 0.31848965230222087, + "grad_norm": 3.146714925765991, + "learning_rate": 1.4679563617179408e-05, + "loss": 0.8228, + "step": 4209 + }, + { + "epoch": 0.3185653210245545, + "grad_norm": 2.1198391914367676, + "learning_rate": 1.4677816322709452e-05, + "loss": 0.8172, + "step": 4210 + }, + { + "epoch": 0.31864098974688815, + "grad_norm": 2.367765426635742, + "learning_rate": 1.4676068672676274e-05, + "loss": 0.6736, + "step": 4211 + }, + { + "epoch": 0.31871665846922176, + "grad_norm": 1.9327032566070557, + "learning_rate": 1.4674320667189317e-05, + "loss": 0.5936, + "step": 4212 + }, + { + "epoch": 0.31879232719155537, + "grad_norm": 2.0183558464050293, + "learning_rate": 1.4672572306358048e-05, + "loss": 0.707, + "step": 4213 + }, + { + "epoch": 0.318867995913889, + "grad_norm": 2.5018794536590576, + "learning_rate": 1.4670823590291953e-05, + "loss": 0.8315, + "step": 4214 + }, + { + "epoch": 0.3189436646362226, + "grad_norm": 
1.932250738143921, + "learning_rate": 1.466907451910054e-05, + "loss": 0.6677, + "step": 4215 + }, + { + "epoch": 0.31901933335855626, + "grad_norm": 2.460291862487793, + "learning_rate": 1.4667325092893349e-05, + "loss": 0.7883, + "step": 4216 + }, + { + "epoch": 0.31909500208088987, + "grad_norm": 2.749185800552368, + "learning_rate": 1.466557531177993e-05, + "loss": 0.6801, + "step": 4217 + }, + { + "epoch": 0.3191706708032235, + "grad_norm": 2.338014602661133, + "learning_rate": 1.4663825175869858e-05, + "loss": 0.7228, + "step": 4218 + }, + { + "epoch": 0.3192463395255571, + "grad_norm": 2.1747679710388184, + "learning_rate": 1.4662074685272735e-05, + "loss": 0.6736, + "step": 4219 + }, + { + "epoch": 0.31932200824789075, + "grad_norm": 2.656726360321045, + "learning_rate": 1.4660323840098184e-05, + "loss": 0.6297, + "step": 4220 + }, + { + "epoch": 0.31939767697022436, + "grad_norm": 2.0806820392608643, + "learning_rate": 1.4658572640455842e-05, + "loss": 0.6698, + "step": 4221 + }, + { + "epoch": 0.319473345692558, + "grad_norm": 2.605621099472046, + "learning_rate": 1.4656821086455383e-05, + "loss": 0.8796, + "step": 4222 + }, + { + "epoch": 0.3195490144148916, + "grad_norm": 2.9140431880950928, + "learning_rate": 1.465506917820649e-05, + "loss": 0.6627, + "step": 4223 + }, + { + "epoch": 0.31962468313722525, + "grad_norm": 2.8789541721343994, + "learning_rate": 1.4653316915818876e-05, + "loss": 0.683, + "step": 4224 + }, + { + "epoch": 0.31970035185955886, + "grad_norm": 2.5387415885925293, + "learning_rate": 1.465156429940227e-05, + "loss": 0.8299, + "step": 4225 + }, + { + "epoch": 0.3197760205818925, + "grad_norm": 2.163238048553467, + "learning_rate": 1.4649811329066428e-05, + "loss": 0.7167, + "step": 4226 + }, + { + "epoch": 0.3198516893042261, + "grad_norm": 2.0733134746551514, + "learning_rate": 1.4648058004921126e-05, + "loss": 0.7044, + "step": 4227 + }, + { + "epoch": 0.3199273580265597, + "grad_norm": 2.783489227294922, + "learning_rate": 1.4646304327076165e-05, + "loss": 0.9027, + "step": 4228 + }, + { + "epoch": 0.32000302674889336, + "grad_norm": 2.3627331256866455, + "learning_rate": 1.4644550295641367e-05, + "loss": 0.8586, + "step": 4229 + }, + { + "epoch": 0.32007869547122697, + "grad_norm": 4.856930255889893, + "learning_rate": 1.464279591072657e-05, + "loss": 0.6494, + "step": 4230 + }, + { + "epoch": 0.3201543641935606, + "grad_norm": 2.0555548667907715, + "learning_rate": 1.4641041172441642e-05, + "loss": 0.7398, + "step": 4231 + }, + { + "epoch": 0.3202300329158942, + "grad_norm": 2.1479907035827637, + "learning_rate": 1.4639286080896468e-05, + "loss": 0.769, + "step": 4232 + }, + { + "epoch": 0.32030570163822786, + "grad_norm": 2.175365924835205, + "learning_rate": 1.4637530636200965e-05, + "loss": 0.6443, + "step": 4233 + }, + { + "epoch": 0.32038137036056147, + "grad_norm": 3.0681824684143066, + "learning_rate": 1.4635774838465055e-05, + "loss": 0.8433, + "step": 4234 + }, + { + "epoch": 0.3204570390828951, + "grad_norm": 2.8694117069244385, + "learning_rate": 1.46340186877987e-05, + "loss": 0.6974, + "step": 4235 + }, + { + "epoch": 0.3205327078052287, + "grad_norm": 3.192309856414795, + "learning_rate": 1.4632262184311872e-05, + "loss": 0.7687, + "step": 4236 + }, + { + "epoch": 0.32060837652756236, + "grad_norm": 2.460700750350952, + "learning_rate": 1.4630505328114569e-05, + "loss": 0.8654, + "step": 4237 + }, + { + "epoch": 0.32068404524989597, + "grad_norm": 2.13419771194458, + "learning_rate": 1.4628748119316807e-05, + "loss": 0.6749, + "step": 
4238 + }, + { + "epoch": 0.3207597139722296, + "grad_norm": 2.089223623275757, + "learning_rate": 1.4626990558028636e-05, + "loss": 0.8542, + "step": 4239 + }, + { + "epoch": 0.3208353826945632, + "grad_norm": 2.2618892192840576, + "learning_rate": 1.4625232644360117e-05, + "loss": 0.7101, + "step": 4240 + }, + { + "epoch": 0.32091105141689685, + "grad_norm": 2.1564035415649414, + "learning_rate": 1.4623474378421333e-05, + "loss": 0.7211, + "step": 4241 + }, + { + "epoch": 0.32098672013923046, + "grad_norm": 2.1531484127044678, + "learning_rate": 1.4621715760322398e-05, + "loss": 0.7455, + "step": 4242 + }, + { + "epoch": 0.3210623888615641, + "grad_norm": 2.0043506622314453, + "learning_rate": 1.4619956790173435e-05, + "loss": 0.8069, + "step": 4243 + }, + { + "epoch": 0.3211380575838977, + "grad_norm": 2.2013986110687256, + "learning_rate": 1.4618197468084605e-05, + "loss": 0.8981, + "step": 4244 + }, + { + "epoch": 0.3212137263062313, + "grad_norm": 2.18361496925354, + "learning_rate": 1.4616437794166073e-05, + "loss": 0.9738, + "step": 4245 + }, + { + "epoch": 0.32128939502856496, + "grad_norm": 2.511524200439453, + "learning_rate": 1.4614677768528046e-05, + "loss": 0.6794, + "step": 4246 + }, + { + "epoch": 0.32136506375089857, + "grad_norm": 2.317763090133667, + "learning_rate": 1.4612917391280734e-05, + "loss": 0.7569, + "step": 4247 + }, + { + "epoch": 0.3214407324732322, + "grad_norm": 2.617969274520874, + "learning_rate": 1.4611156662534382e-05, + "loss": 0.7191, + "step": 4248 + }, + { + "epoch": 0.3215164011955658, + "grad_norm": 2.400247097015381, + "learning_rate": 1.4609395582399249e-05, + "loss": 0.72, + "step": 4249 + }, + { + "epoch": 0.32159206991789946, + "grad_norm": 2.0886826515197754, + "learning_rate": 1.4607634150985624e-05, + "loss": 0.7317, + "step": 4250 + }, + { + "epoch": 0.32166773864023307, + "grad_norm": 3.8635036945343018, + "learning_rate": 1.460587236840381e-05, + "loss": 0.7681, + "step": 4251 + }, + { + "epoch": 0.3217434073625667, + "grad_norm": 2.07661771774292, + "learning_rate": 1.4604110234764138e-05, + "loss": 0.6945, + "step": 4252 + }, + { + "epoch": 0.3218190760849003, + "grad_norm": 2.098093032836914, + "learning_rate": 1.4602347750176957e-05, + "loss": 0.6292, + "step": 4253 + }, + { + "epoch": 0.32189474480723396, + "grad_norm": 2.4790546894073486, + "learning_rate": 1.4600584914752637e-05, + "loss": 0.7044, + "step": 4254 + }, + { + "epoch": 0.32197041352956757, + "grad_norm": 2.268695831298828, + "learning_rate": 1.4598821728601579e-05, + "loss": 0.7005, + "step": 4255 + }, + { + "epoch": 0.3220460822519012, + "grad_norm": 2.2798666954040527, + "learning_rate": 1.4597058191834192e-05, + "loss": 0.7742, + "step": 4256 + }, + { + "epoch": 0.3221217509742348, + "grad_norm": 3.1297192573547363, + "learning_rate": 1.4595294304560919e-05, + "loss": 0.7833, + "step": 4257 + }, + { + "epoch": 0.3221974196965684, + "grad_norm": 2.595353841781616, + "learning_rate": 1.4593530066892218e-05, + "loss": 0.7436, + "step": 4258 + }, + { + "epoch": 0.32227308841890207, + "grad_norm": 2.4126555919647217, + "learning_rate": 1.4591765478938577e-05, + "loss": 0.798, + "step": 4259 + }, + { + "epoch": 0.3223487571412357, + "grad_norm": 2.161207675933838, + "learning_rate": 1.4590000540810492e-05, + "loss": 0.6859, + "step": 4260 + }, + { + "epoch": 0.3224244258635693, + "grad_norm": 2.0117027759552, + "learning_rate": 1.4588235252618494e-05, + "loss": 0.7389, + "step": 4261 + }, + { + "epoch": 0.3225000945859029, + "grad_norm": 2.079958915710449, + 
"learning_rate": 1.458646961447313e-05, + "loss": 0.788, + "step": 4262 + }, + { + "epoch": 0.32257576330823656, + "grad_norm": 1.884169340133667, + "learning_rate": 1.458470362648497e-05, + "loss": 0.8014, + "step": 4263 + }, + { + "epoch": 0.3226514320305702, + "grad_norm": 2.48260760307312, + "learning_rate": 1.4582937288764604e-05, + "loss": 0.8323, + "step": 4264 + }, + { + "epoch": 0.3227271007529038, + "grad_norm": 2.3636391162872314, + "learning_rate": 1.458117060142265e-05, + "loss": 0.8989, + "step": 4265 + }, + { + "epoch": 0.3228027694752374, + "grad_norm": 1.976210594177246, + "learning_rate": 1.4579403564569741e-05, + "loss": 0.8588, + "step": 4266 + }, + { + "epoch": 0.32287843819757106, + "grad_norm": 8.20315170288086, + "learning_rate": 1.4577636178316533e-05, + "loss": 0.836, + "step": 4267 + }, + { + "epoch": 0.32295410691990467, + "grad_norm": 3.0736913681030273, + "learning_rate": 1.4575868442773708e-05, + "loss": 0.8805, + "step": 4268 + }, + { + "epoch": 0.3230297756422383, + "grad_norm": 2.023252010345459, + "learning_rate": 1.4574100358051967e-05, + "loss": 0.6687, + "step": 4269 + }, + { + "epoch": 0.3231054443645719, + "grad_norm": 2.3063673973083496, + "learning_rate": 1.4572331924262033e-05, + "loss": 0.7783, + "step": 4270 + }, + { + "epoch": 0.3231811130869055, + "grad_norm": 3.034928321838379, + "learning_rate": 1.4570563141514651e-05, + "loss": 0.9137, + "step": 4271 + }, + { + "epoch": 0.32325678180923917, + "grad_norm": 3.5648000240325928, + "learning_rate": 1.4568794009920588e-05, + "loss": 0.723, + "step": 4272 + }, + { + "epoch": 0.3233324505315728, + "grad_norm": 2.6870532035827637, + "learning_rate": 1.456702452959063e-05, + "loss": 0.8169, + "step": 4273 + }, + { + "epoch": 0.3234081192539064, + "grad_norm": 2.3009469509124756, + "learning_rate": 1.4565254700635593e-05, + "loss": 0.751, + "step": 4274 + }, + { + "epoch": 0.32348378797624, + "grad_norm": 1.9499804973602295, + "learning_rate": 1.4563484523166307e-05, + "loss": 0.6896, + "step": 4275 + }, + { + "epoch": 0.32355945669857367, + "grad_norm": 2.177910804748535, + "learning_rate": 1.4561713997293621e-05, + "loss": 0.6619, + "step": 4276 + }, + { + "epoch": 0.3236351254209073, + "grad_norm": 2.637498378753662, + "learning_rate": 1.4559943123128418e-05, + "loss": 0.8201, + "step": 4277 + }, + { + "epoch": 0.3237107941432409, + "grad_norm": 2.287648916244507, + "learning_rate": 1.4558171900781594e-05, + "loss": 0.8366, + "step": 4278 + }, + { + "epoch": 0.3237864628655745, + "grad_norm": 2.5055220127105713, + "learning_rate": 1.455640033036407e-05, + "loss": 0.7382, + "step": 4279 + }, + { + "epoch": 0.32386213158790816, + "grad_norm": 2.373974323272705, + "learning_rate": 1.4554628411986783e-05, + "loss": 0.8601, + "step": 4280 + }, + { + "epoch": 0.3239378003102418, + "grad_norm": 2.571704387664795, + "learning_rate": 1.45528561457607e-05, + "loss": 0.7664, + "step": 4281 + }, + { + "epoch": 0.3240134690325754, + "grad_norm": 2.084261655807495, + "learning_rate": 1.4551083531796807e-05, + "loss": 0.8355, + "step": 4282 + }, + { + "epoch": 0.324089137754909, + "grad_norm": 13.960339546203613, + "learning_rate": 1.4549310570206106e-05, + "loss": 0.6196, + "step": 4283 + }, + { + "epoch": 0.3241648064772426, + "grad_norm": 2.3920133113861084, + "learning_rate": 1.454753726109963e-05, + "loss": 0.718, + "step": 4284 + }, + { + "epoch": 0.3242404751995763, + "grad_norm": 4.138522148132324, + "learning_rate": 1.4545763604588427e-05, + "loss": 0.6937, + "step": 4285 + }, + { + "epoch": 
0.3243161439219099, + "grad_norm": 2.481703281402588, + "learning_rate": 1.454398960078357e-05, + "loss": 0.7318, + "step": 4286 + }, + { + "epoch": 0.3243918126442435, + "grad_norm": 2.211909532546997, + "learning_rate": 1.4542215249796151e-05, + "loss": 0.6617, + "step": 4287 + }, + { + "epoch": 0.3244674813665771, + "grad_norm": 2.126004457473755, + "learning_rate": 1.454044055173729e-05, + "loss": 0.6787, + "step": 4288 + }, + { + "epoch": 0.32454315008891077, + "grad_norm": 2.684769630432129, + "learning_rate": 1.4538665506718119e-05, + "loss": 0.6737, + "step": 4289 + }, + { + "epoch": 0.3246188188112444, + "grad_norm": 1.9614814519882202, + "learning_rate": 1.4536890114849804e-05, + "loss": 0.6616, + "step": 4290 + }, + { + "epoch": 0.324694487533578, + "grad_norm": 2.360924482345581, + "learning_rate": 1.4535114376243518e-05, + "loss": 0.7476, + "step": 4291 + }, + { + "epoch": 0.3247701562559116, + "grad_norm": 2.6671664714813232, + "learning_rate": 1.4533338291010469e-05, + "loss": 0.745, + "step": 4292 + }, + { + "epoch": 0.32484582497824527, + "grad_norm": 3.544961929321289, + "learning_rate": 1.453156185926188e-05, + "loss": 0.7626, + "step": 4293 + }, + { + "epoch": 0.3249214937005789, + "grad_norm": 1.3772363662719727, + "learning_rate": 1.4529785081108993e-05, + "loss": 0.8338, + "step": 4294 + }, + { + "epoch": 0.3249971624229125, + "grad_norm": 2.098928213119507, + "learning_rate": 1.4528007956663081e-05, + "loss": 0.8097, + "step": 4295 + }, + { + "epoch": 0.3250728311452461, + "grad_norm": 2.549992084503174, + "learning_rate": 1.452623048603543e-05, + "loss": 0.8661, + "step": 4296 + }, + { + "epoch": 0.3251484998675797, + "grad_norm": 2.1404833793640137, + "learning_rate": 1.4524452669337353e-05, + "loss": 0.6822, + "step": 4297 + }, + { + "epoch": 0.3252241685899134, + "grad_norm": 2.1892287731170654, + "learning_rate": 1.452267450668018e-05, + "loss": 0.7878, + "step": 4298 + }, + { + "epoch": 0.325299837312247, + "grad_norm": 2.762024164199829, + "learning_rate": 1.4520895998175267e-05, + "loss": 0.8672, + "step": 4299 + }, + { + "epoch": 0.3253755060345806, + "grad_norm": 2.249630928039551, + "learning_rate": 1.451911714393399e-05, + "loss": 0.7267, + "step": 4300 + }, + { + "epoch": 0.3254511747569142, + "grad_norm": 2.6956732273101807, + "learning_rate": 1.451733794406775e-05, + "loss": 0.7433, + "step": 4301 + }, + { + "epoch": 0.3255268434792479, + "grad_norm": 1.908390760421753, + "learning_rate": 1.4515558398687958e-05, + "loss": 0.838, + "step": 4302 + }, + { + "epoch": 0.3256025122015815, + "grad_norm": 1.9399058818817139, + "learning_rate": 1.4513778507906063e-05, + "loss": 0.7207, + "step": 4303 + }, + { + "epoch": 0.3256781809239151, + "grad_norm": 2.5229694843292236, + "learning_rate": 1.4511998271833522e-05, + "loss": 0.8095, + "step": 4304 + }, + { + "epoch": 0.3257538496462487, + "grad_norm": 1.9527181386947632, + "learning_rate": 1.4510217690581824e-05, + "loss": 0.6514, + "step": 4305 + }, + { + "epoch": 0.32582951836858237, + "grad_norm": 2.4306392669677734, + "learning_rate": 1.4508436764262467e-05, + "loss": 0.7142, + "step": 4306 + }, + { + "epoch": 0.325905187090916, + "grad_norm": 2.332416534423828, + "learning_rate": 1.4506655492986985e-05, + "loss": 0.6613, + "step": 4307 + }, + { + "epoch": 0.3259808558132496, + "grad_norm": 1.7992033958435059, + "learning_rate": 1.4504873876866928e-05, + "loss": 0.6745, + "step": 4308 + }, + { + "epoch": 0.3260565245355832, + "grad_norm": 2.053755283355713, + "learning_rate": 1.4503091916013861e-05, + 
"loss": 0.8298, + "step": 4309 + }, + { + "epoch": 0.3261321932579168, + "grad_norm": 2.6498591899871826, + "learning_rate": 1.4501309610539382e-05, + "loss": 0.7602, + "step": 4310 + }, + { + "epoch": 0.3262078619802505, + "grad_norm": 2.5992958545684814, + "learning_rate": 1.44995269605551e-05, + "loss": 0.7103, + "step": 4311 + }, + { + "epoch": 0.3262835307025841, + "grad_norm": 2.5793347358703613, + "learning_rate": 1.4497743966172652e-05, + "loss": 0.6369, + "step": 4312 + }, + { + "epoch": 0.3263591994249177, + "grad_norm": 2.428252696990967, + "learning_rate": 1.4495960627503695e-05, + "loss": 0.7442, + "step": 4313 + }, + { + "epoch": 0.3264348681472513, + "grad_norm": 2.169275999069214, + "learning_rate": 1.449417694465991e-05, + "loss": 0.6564, + "step": 4314 + }, + { + "epoch": 0.326510536869585, + "grad_norm": 2.4342880249023438, + "learning_rate": 1.449239291775299e-05, + "loss": 0.7119, + "step": 4315 + }, + { + "epoch": 0.3265862055919186, + "grad_norm": 2.3536922931671143, + "learning_rate": 1.4490608546894663e-05, + "loss": 0.7913, + "step": 4316 + }, + { + "epoch": 0.3266618743142522, + "grad_norm": 2.1272964477539062, + "learning_rate": 1.4488823832196671e-05, + "loss": 0.8102, + "step": 4317 + }, + { + "epoch": 0.3267375430365858, + "grad_norm": 2.338432550430298, + "learning_rate": 1.4487038773770778e-05, + "loss": 0.7231, + "step": 4318 + }, + { + "epoch": 0.3268132117589195, + "grad_norm": 2.033783197402954, + "learning_rate": 1.4485253371728769e-05, + "loss": 0.7513, + "step": 4319 + }, + { + "epoch": 0.3268888804812531, + "grad_norm": 2.384861946105957, + "learning_rate": 1.448346762618245e-05, + "loss": 0.6621, + "step": 4320 + }, + { + "epoch": 0.3269645492035867, + "grad_norm": 2.846778631210327, + "learning_rate": 1.4481681537243652e-05, + "loss": 0.739, + "step": 4321 + }, + { + "epoch": 0.3270402179259203, + "grad_norm": 2.581815242767334, + "learning_rate": 1.447989510502423e-05, + "loss": 0.6875, + "step": 4322 + }, + { + "epoch": 0.3271158866482539, + "grad_norm": 2.7722952365875244, + "learning_rate": 1.4478108329636053e-05, + "loss": 0.7614, + "step": 4323 + }, + { + "epoch": 0.3271915553705876, + "grad_norm": 1.9879074096679688, + "learning_rate": 1.4476321211191012e-05, + "loss": 0.8599, + "step": 4324 + }, + { + "epoch": 0.3272672240929212, + "grad_norm": 2.1524455547332764, + "learning_rate": 1.4474533749801024e-05, + "loss": 0.7467, + "step": 4325 + }, + { + "epoch": 0.3273428928152548, + "grad_norm": 1.877977728843689, + "learning_rate": 1.4472745945578023e-05, + "loss": 0.7021, + "step": 4326 + }, + { + "epoch": 0.3274185615375884, + "grad_norm": 2.359576940536499, + "learning_rate": 1.4470957798633974e-05, + "loss": 0.8945, + "step": 4327 + }, + { + "epoch": 0.3274942302599221, + "grad_norm": 2.371588706970215, + "learning_rate": 1.4469169309080853e-05, + "loss": 0.8712, + "step": 4328 + }, + { + "epoch": 0.3275698989822557, + "grad_norm": 2.2514874935150146, + "learning_rate": 1.4467380477030658e-05, + "loss": 0.645, + "step": 4329 + }, + { + "epoch": 0.3276455677045893, + "grad_norm": 1.858174204826355, + "learning_rate": 1.4465591302595415e-05, + "loss": 0.8199, + "step": 4330 + }, + { + "epoch": 0.3277212364269229, + "grad_norm": 2.374924421310425, + "learning_rate": 1.4463801785887165e-05, + "loss": 0.8076, + "step": 4331 + }, + { + "epoch": 0.3277969051492566, + "grad_norm": 2.052088737487793, + "learning_rate": 1.4462011927017977e-05, + "loss": 0.7276, + "step": 4332 + }, + { + "epoch": 0.3278725738715902, + "grad_norm": 
2.3067727088928223, + "learning_rate": 1.4460221726099936e-05, + "loss": 0.7695, + "step": 4333 + }, + { + "epoch": 0.3279482425939238, + "grad_norm": 2.285053014755249, + "learning_rate": 1.445843118324515e-05, + "loss": 0.8209, + "step": 4334 + }, + { + "epoch": 0.3280239113162574, + "grad_norm": 2.512913703918457, + "learning_rate": 1.4456640298565749e-05, + "loss": 0.7486, + "step": 4335 + }, + { + "epoch": 0.328099580038591, + "grad_norm": 2.5502045154571533, + "learning_rate": 1.4454849072173882e-05, + "loss": 0.8489, + "step": 4336 + }, + { + "epoch": 0.3281752487609247, + "grad_norm": 1.8892613649368286, + "learning_rate": 1.4453057504181723e-05, + "loss": 0.838, + "step": 4337 + }, + { + "epoch": 0.3282509174832583, + "grad_norm": 2.4267325401306152, + "learning_rate": 1.4451265594701467e-05, + "loss": 0.8521, + "step": 4338 + }, + { + "epoch": 0.3283265862055919, + "grad_norm": 2.127098321914673, + "learning_rate": 1.4449473343845326e-05, + "loss": 0.8956, + "step": 4339 + }, + { + "epoch": 0.3284022549279255, + "grad_norm": 1.7218542098999023, + "learning_rate": 1.444768075172554e-05, + "loss": 0.8289, + "step": 4340 + }, + { + "epoch": 0.3284779236502592, + "grad_norm": 2.7613525390625, + "learning_rate": 1.4445887818454365e-05, + "loss": 0.8301, + "step": 4341 + }, + { + "epoch": 0.3285535923725928, + "grad_norm": 2.5535645484924316, + "learning_rate": 1.4444094544144084e-05, + "loss": 0.7195, + "step": 4342 + }, + { + "epoch": 0.3286292610949264, + "grad_norm": 2.61027193069458, + "learning_rate": 1.4442300928906988e-05, + "loss": 0.8215, + "step": 4343 + }, + { + "epoch": 0.32870492981726, + "grad_norm": 2.108738899230957, + "learning_rate": 1.4440506972855407e-05, + "loss": 0.6972, + "step": 4344 + }, + { + "epoch": 0.3287805985395937, + "grad_norm": 2.198636054992676, + "learning_rate": 1.4438712676101686e-05, + "loss": 0.8218, + "step": 4345 + }, + { + "epoch": 0.3288562672619273, + "grad_norm": 2.278756618499756, + "learning_rate": 1.4436918038758184e-05, + "loss": 0.752, + "step": 4346 + }, + { + "epoch": 0.3289319359842609, + "grad_norm": 2.669025421142578, + "learning_rate": 1.4435123060937291e-05, + "loss": 0.8767, + "step": 4347 + }, + { + "epoch": 0.3290076047065945, + "grad_norm": 2.104459047317505, + "learning_rate": 1.443332774275141e-05, + "loss": 0.735, + "step": 4348 + }, + { + "epoch": 0.3290832734289282, + "grad_norm": 2.096700429916382, + "learning_rate": 1.4431532084312973e-05, + "loss": 0.7188, + "step": 4349 + }, + { + "epoch": 0.3291589421512618, + "grad_norm": 2.3591086864471436, + "learning_rate": 1.4429736085734429e-05, + "loss": 0.7212, + "step": 4350 + }, + { + "epoch": 0.3292346108735954, + "grad_norm": 2.4446537494659424, + "learning_rate": 1.4427939747128252e-05, + "loss": 0.7998, + "step": 4351 + }, + { + "epoch": 0.329310279595929, + "grad_norm": 1.968955397605896, + "learning_rate": 1.442614306860693e-05, + "loss": 0.7152, + "step": 4352 + }, + { + "epoch": 0.3293859483182626, + "grad_norm": 2.0163991451263428, + "learning_rate": 1.4424346050282977e-05, + "loss": 0.7869, + "step": 4353 + }, + { + "epoch": 0.3294616170405963, + "grad_norm": 2.4209890365600586, + "learning_rate": 1.4422548692268934e-05, + "loss": 0.8442, + "step": 4354 + }, + { + "epoch": 0.3295372857629299, + "grad_norm": 2.772582530975342, + "learning_rate": 1.442075099467735e-05, + "loss": 0.8652, + "step": 4355 + }, + { + "epoch": 0.3296129544852635, + "grad_norm": 2.462894916534424, + "learning_rate": 1.4418952957620806e-05, + "loss": 0.659, + "step": 4356 + }, + { + 
"epoch": 0.3296886232075971, + "grad_norm": 2.011859893798828, + "learning_rate": 1.4417154581211901e-05, + "loss": 0.7464, + "step": 4357 + }, + { + "epoch": 0.3297642919299308, + "grad_norm": 1.890229344367981, + "learning_rate": 1.4415355865563254e-05, + "loss": 0.8068, + "step": 4358 + }, + { + "epoch": 0.3298399606522644, + "grad_norm": 2.171454906463623, + "learning_rate": 1.441355681078751e-05, + "loss": 0.7567, + "step": 4359 + }, + { + "epoch": 0.329915629374598, + "grad_norm": 2.6762516498565674, + "learning_rate": 1.4411757416997329e-05, + "loss": 0.6791, + "step": 4360 + }, + { + "epoch": 0.3299912980969316, + "grad_norm": 2.497652530670166, + "learning_rate": 1.4409957684305392e-05, + "loss": 0.8792, + "step": 4361 + }, + { + "epoch": 0.3300669668192653, + "grad_norm": 3.6293368339538574, + "learning_rate": 1.440815761282441e-05, + "loss": 0.7134, + "step": 4362 + }, + { + "epoch": 0.3301426355415989, + "grad_norm": 2.222104787826538, + "learning_rate": 1.4406357202667102e-05, + "loss": 0.7128, + "step": 4363 + }, + { + "epoch": 0.3302183042639325, + "grad_norm": 2.209268093109131, + "learning_rate": 1.4404556453946224e-05, + "loss": 0.8376, + "step": 4364 + }, + { + "epoch": 0.3302939729862661, + "grad_norm": 1.9097639322280884, + "learning_rate": 1.440275536677454e-05, + "loss": 0.8323, + "step": 4365 + }, + { + "epoch": 0.3303696417085997, + "grad_norm": 2.237016201019287, + "learning_rate": 1.4400953941264837e-05, + "loss": 0.7609, + "step": 4366 + }, + { + "epoch": 0.3304453104309334, + "grad_norm": 1.9410020112991333, + "learning_rate": 1.4399152177529932e-05, + "loss": 0.6425, + "step": 4367 + }, + { + "epoch": 0.330520979153267, + "grad_norm": 2.629134178161621, + "learning_rate": 1.4397350075682652e-05, + "loss": 0.5823, + "step": 4368 + }, + { + "epoch": 0.3305966478756006, + "grad_norm": 2.269423246383667, + "learning_rate": 1.4395547635835856e-05, + "loss": 0.6952, + "step": 4369 + }, + { + "epoch": 0.3306723165979342, + "grad_norm": 3.469186544418335, + "learning_rate": 1.4393744858102417e-05, + "loss": 0.7122, + "step": 4370 + }, + { + "epoch": 0.3307479853202679, + "grad_norm": 2.325718879699707, + "learning_rate": 1.4391941742595224e-05, + "loss": 0.7564, + "step": 4371 + }, + { + "epoch": 0.3308236540426015, + "grad_norm": 2.014582872390747, + "learning_rate": 1.4390138289427204e-05, + "loss": 0.7626, + "step": 4372 + }, + { + "epoch": 0.3308993227649351, + "grad_norm": 2.6732168197631836, + "learning_rate": 1.438833449871129e-05, + "loss": 0.7594, + "step": 4373 + }, + { + "epoch": 0.3309749914872687, + "grad_norm": 1.793832778930664, + "learning_rate": 1.4386530370560439e-05, + "loss": 0.7993, + "step": 4374 + }, + { + "epoch": 0.3310506602096024, + "grad_norm": 2.3724443912506104, + "learning_rate": 1.4384725905087638e-05, + "loss": 0.8349, + "step": 4375 + }, + { + "epoch": 0.331126328931936, + "grad_norm": 2.2450218200683594, + "learning_rate": 1.4382921102405882e-05, + "loss": 0.7646, + "step": 4376 + }, + { + "epoch": 0.3312019976542696, + "grad_norm": 2.3006644248962402, + "learning_rate": 1.4381115962628197e-05, + "loss": 0.7745, + "step": 4377 + }, + { + "epoch": 0.3312776663766032, + "grad_norm": 2.5185351371765137, + "learning_rate": 1.4379310485867626e-05, + "loss": 0.7865, + "step": 4378 + }, + { + "epoch": 0.33135333509893683, + "grad_norm": 3.231492280960083, + "learning_rate": 1.4377504672237231e-05, + "loss": 0.7233, + "step": 4379 + }, + { + "epoch": 0.3314290038212705, + "grad_norm": 3.705059766769409, + "learning_rate": 
1.4375698521850104e-05, + "loss": 0.8611, + "step": 4380 + }, + { + "epoch": 0.3315046725436041, + "grad_norm": 2.2593576908111572, + "learning_rate": 1.4373892034819347e-05, + "loss": 0.7857, + "step": 4381 + }, + { + "epoch": 0.3315803412659377, + "grad_norm": 2.1838455200195312, + "learning_rate": 1.4372085211258087e-05, + "loss": 0.7414, + "step": 4382 + }, + { + "epoch": 0.3316560099882713, + "grad_norm": 2.32033371925354, + "learning_rate": 1.4370278051279481e-05, + "loss": 0.9, + "step": 4383 + }, + { + "epoch": 0.331731678710605, + "grad_norm": 2.2040934562683105, + "learning_rate": 1.4368470554996691e-05, + "loss": 0.7841, + "step": 4384 + }, + { + "epoch": 0.3318073474329386, + "grad_norm": 2.480590343475342, + "learning_rate": 1.4366662722522909e-05, + "loss": 0.7788, + "step": 4385 + }, + { + "epoch": 0.3318830161552722, + "grad_norm": 2.308894157409668, + "learning_rate": 1.4364854553971351e-05, + "loss": 0.8387, + "step": 4386 + }, + { + "epoch": 0.3319586848776058, + "grad_norm": 2.260097026824951, + "learning_rate": 1.4363046049455249e-05, + "loss": 0.8178, + "step": 4387 + }, + { + "epoch": 0.3320343535999395, + "grad_norm": 2.4592983722686768, + "learning_rate": 1.4361237209087857e-05, + "loss": 0.7605, + "step": 4388 + }, + { + "epoch": 0.3321100223222731, + "grad_norm": 2.417006254196167, + "learning_rate": 1.435942803298245e-05, + "loss": 0.7592, + "step": 4389 + }, + { + "epoch": 0.3321856910446067, + "grad_norm": 2.0404160022735596, + "learning_rate": 1.4357618521252326e-05, + "loss": 0.7705, + "step": 4390 + }, + { + "epoch": 0.3322613597669403, + "grad_norm": 1.8098578453063965, + "learning_rate": 1.4355808674010805e-05, + "loss": 0.7147, + "step": 4391 + }, + { + "epoch": 0.33233702848927393, + "grad_norm": 2.279453754425049, + "learning_rate": 1.4353998491371217e-05, + "loss": 0.8241, + "step": 4392 + }, + { + "epoch": 0.3324126972116076, + "grad_norm": 1.8218796253204346, + "learning_rate": 1.435218797344693e-05, + "loss": 0.8129, + "step": 4393 + }, + { + "epoch": 0.3324883659339412, + "grad_norm": 2.245039939880371, + "learning_rate": 1.4350377120351316e-05, + "loss": 0.8392, + "step": 4394 + }, + { + "epoch": 0.3325640346562748, + "grad_norm": 4.042459011077881, + "learning_rate": 1.4348565932197786e-05, + "loss": 0.8678, + "step": 4395 + }, + { + "epoch": 0.33263970337860843, + "grad_norm": 2.0353004932403564, + "learning_rate": 1.4346754409099758e-05, + "loss": 0.7939, + "step": 4396 + }, + { + "epoch": 0.3327153721009421, + "grad_norm": 2.1609416007995605, + "learning_rate": 1.4344942551170673e-05, + "loss": 0.7398, + "step": 4397 + }, + { + "epoch": 0.3327910408232757, + "grad_norm": 2.067974805831909, + "learning_rate": 1.4343130358523998e-05, + "loss": 0.79, + "step": 4398 + }, + { + "epoch": 0.3328667095456093, + "grad_norm": 1.9276955127716064, + "learning_rate": 1.4341317831273221e-05, + "loss": 0.7935, + "step": 4399 + }, + { + "epoch": 0.33294237826794293, + "grad_norm": 2.694018840789795, + "learning_rate": 1.4339504969531843e-05, + "loss": 0.7347, + "step": 4400 + }, + { + "epoch": 0.3330180469902766, + "grad_norm": 2.018950939178467, + "learning_rate": 1.4337691773413394e-05, + "loss": 0.5026, + "step": 4401 + }, + { + "epoch": 0.3330937157126102, + "grad_norm": 1.8888964653015137, + "learning_rate": 1.4335878243031423e-05, + "loss": 0.6937, + "step": 4402 + }, + { + "epoch": 0.3331693844349438, + "grad_norm": 2.4670050144195557, + "learning_rate": 1.4334064378499495e-05, + "loss": 0.7629, + "step": 4403 + }, + { + "epoch": 0.3332450531572774, + 
"grad_norm": 2.6195714473724365, + "learning_rate": 1.4332250179931207e-05, + "loss": 0.7759, + "step": 4404 + }, + { + "epoch": 0.33332072187961104, + "grad_norm": 2.684854507446289, + "learning_rate": 1.4330435647440165e-05, + "loss": 0.6832, + "step": 4405 + }, + { + "epoch": 0.3333963906019447, + "grad_norm": 2.092449903488159, + "learning_rate": 1.4328620781140001e-05, + "loss": 0.5791, + "step": 4406 + }, + { + "epoch": 0.3334720593242783, + "grad_norm": 1.9735100269317627, + "learning_rate": 1.432680558114437e-05, + "loss": 0.7944, + "step": 4407 + }, + { + "epoch": 0.3335477280466119, + "grad_norm": 2.075486421585083, + "learning_rate": 1.4324990047566943e-05, + "loss": 0.8772, + "step": 4408 + }, + { + "epoch": 0.33362339676894553, + "grad_norm": 1.8212954998016357, + "learning_rate": 1.4323174180521418e-05, + "loss": 0.7393, + "step": 4409 + }, + { + "epoch": 0.3336990654912792, + "grad_norm": 2.295945167541504, + "learning_rate": 1.4321357980121509e-05, + "loss": 0.7714, + "step": 4410 + }, + { + "epoch": 0.3337747342136128, + "grad_norm": 2.0244014263153076, + "learning_rate": 1.4319541446480951e-05, + "loss": 0.7639, + "step": 4411 + }, + { + "epoch": 0.3338504029359464, + "grad_norm": 2.317169189453125, + "learning_rate": 1.43177245797135e-05, + "loss": 0.8307, + "step": 4412 + }, + { + "epoch": 0.33392607165828003, + "grad_norm": 2.066453218460083, + "learning_rate": 1.431590737993294e-05, + "loss": 0.7536, + "step": 4413 + }, + { + "epoch": 0.3340017403806137, + "grad_norm": 2.337251663208008, + "learning_rate": 1.4314089847253063e-05, + "loss": 0.8076, + "step": 4414 + }, + { + "epoch": 0.3340774091029473, + "grad_norm": 2.4293739795684814, + "learning_rate": 1.4312271981787692e-05, + "loss": 0.7072, + "step": 4415 + }, + { + "epoch": 0.3341530778252809, + "grad_norm": 2.3487699031829834, + "learning_rate": 1.431045378365067e-05, + "loss": 0.8849, + "step": 4416 + }, + { + "epoch": 0.33422874654761453, + "grad_norm": 1.5888330936431885, + "learning_rate": 1.4308635252955854e-05, + "loss": 0.7692, + "step": 4417 + }, + { + "epoch": 0.33430441526994814, + "grad_norm": 2.819643974304199, + "learning_rate": 1.430681638981713e-05, + "loss": 0.7928, + "step": 4418 + }, + { + "epoch": 0.3343800839922818, + "grad_norm": 2.25228214263916, + "learning_rate": 1.4304997194348399e-05, + "loss": 0.6886, + "step": 4419 + }, + { + "epoch": 0.3344557527146154, + "grad_norm": 2.0968821048736572, + "learning_rate": 1.4303177666663582e-05, + "loss": 0.7954, + "step": 4420 + }, + { + "epoch": 0.33453142143694903, + "grad_norm": 2.138329267501831, + "learning_rate": 1.4301357806876632e-05, + "loss": 0.5807, + "step": 4421 + }, + { + "epoch": 0.33460709015928264, + "grad_norm": 2.1104984283447266, + "learning_rate": 1.4299537615101503e-05, + "loss": 0.7762, + "step": 4422 + }, + { + "epoch": 0.3346827588816163, + "grad_norm": 2.5498058795928955, + "learning_rate": 1.4297717091452193e-05, + "loss": 0.8644, + "step": 4423 + }, + { + "epoch": 0.3347584276039499, + "grad_norm": 2.219202995300293, + "learning_rate": 1.4295896236042702e-05, + "loss": 0.7394, + "step": 4424 + }, + { + "epoch": 0.3348340963262835, + "grad_norm": 2.217406988143921, + "learning_rate": 1.429407504898706e-05, + "loss": 0.8712, + "step": 4425 + }, + { + "epoch": 0.33490976504861714, + "grad_norm": 2.460085153579712, + "learning_rate": 1.4292253530399316e-05, + "loss": 0.8625, + "step": 4426 + }, + { + "epoch": 0.3349854337709508, + "grad_norm": 2.620077610015869, + "learning_rate": 1.429043168039354e-05, + "loss": 0.4811, + 
"step": 4427 + }, + { + "epoch": 0.3350611024932844, + "grad_norm": 2.3812079429626465, + "learning_rate": 1.4288609499083819e-05, + "loss": 0.736, + "step": 4428 + }, + { + "epoch": 0.335136771215618, + "grad_norm": 2.081484794616699, + "learning_rate": 1.4286786986584267e-05, + "loss": 0.7107, + "step": 4429 + }, + { + "epoch": 0.33521243993795163, + "grad_norm": 1.9492160081863403, + "learning_rate": 1.428496414300901e-05, + "loss": 0.8236, + "step": 4430 + }, + { + "epoch": 0.33528810866028524, + "grad_norm": 2.160243034362793, + "learning_rate": 1.428314096847221e-05, + "loss": 0.8853, + "step": 4431 + }, + { + "epoch": 0.3353637773826189, + "grad_norm": 2.322145462036133, + "learning_rate": 1.428131746308803e-05, + "loss": 0.8845, + "step": 4432 + }, + { + "epoch": 0.3354394461049525, + "grad_norm": 2.935598134994507, + "learning_rate": 1.427949362697067e-05, + "loss": 0.7397, + "step": 4433 + }, + { + "epoch": 0.33551511482728613, + "grad_norm": 2.1716995239257812, + "learning_rate": 1.4277669460234346e-05, + "loss": 0.6468, + "step": 4434 + }, + { + "epoch": 0.33559078354961974, + "grad_norm": 1.9296468496322632, + "learning_rate": 1.4275844962993288e-05, + "loss": 0.783, + "step": 4435 + }, + { + "epoch": 0.3356664522719534, + "grad_norm": 2.0291011333465576, + "learning_rate": 1.4274020135361758e-05, + "loss": 0.7367, + "step": 4436 + }, + { + "epoch": 0.335742120994287, + "grad_norm": 2.1033778190612793, + "learning_rate": 1.4272194977454024e-05, + "loss": 0.7526, + "step": 4437 + }, + { + "epoch": 0.33581778971662063, + "grad_norm": 2.0603630542755127, + "learning_rate": 1.427036948938439e-05, + "loss": 0.722, + "step": 4438 + }, + { + "epoch": 0.33589345843895424, + "grad_norm": 2.2816059589385986, + "learning_rate": 1.4268543671267173e-05, + "loss": 0.6255, + "step": 4439 + }, + { + "epoch": 0.3359691271612879, + "grad_norm": 2.037482976913452, + "learning_rate": 1.4266717523216709e-05, + "loss": 0.5998, + "step": 4440 + }, + { + "epoch": 0.3360447958836215, + "grad_norm": 1.928592562675476, + "learning_rate": 1.426489104534736e-05, + "loss": 0.8288, + "step": 4441 + }, + { + "epoch": 0.3361204646059551, + "grad_norm": 1.9821760654449463, + "learning_rate": 1.4263064237773506e-05, + "loss": 0.6743, + "step": 4442 + }, + { + "epoch": 0.33619613332828874, + "grad_norm": 2.9042348861694336, + "learning_rate": 1.4261237100609543e-05, + "loss": 0.9823, + "step": 4443 + }, + { + "epoch": 0.33627180205062235, + "grad_norm": 2.4665040969848633, + "learning_rate": 1.4259409633969901e-05, + "loss": 0.7231, + "step": 4444 + }, + { + "epoch": 0.336347470772956, + "grad_norm": 2.4456706047058105, + "learning_rate": 1.4257581837969012e-05, + "loss": 0.8436, + "step": 4445 + }, + { + "epoch": 0.3364231394952896, + "grad_norm": 2.440807342529297, + "learning_rate": 1.4255753712721347e-05, + "loss": 0.7894, + "step": 4446 + }, + { + "epoch": 0.33649880821762324, + "grad_norm": 1.8564308881759644, + "learning_rate": 1.4253925258341384e-05, + "loss": 0.7516, + "step": 4447 + }, + { + "epoch": 0.33657447693995685, + "grad_norm": 2.5682311058044434, + "learning_rate": 1.4252096474943626e-05, + "loss": 0.701, + "step": 4448 + }, + { + "epoch": 0.3366501456622905, + "grad_norm": 2.0853443145751953, + "learning_rate": 1.4250267362642604e-05, + "loss": 0.761, + "step": 4449 + }, + { + "epoch": 0.3367258143846241, + "grad_norm": 2.6252856254577637, + "learning_rate": 1.4248437921552855e-05, + "loss": 0.718, + "step": 4450 + }, + { + "epoch": 0.33680148310695773, + "grad_norm": 2.0729100704193115, + 
"learning_rate": 1.4246608151788947e-05, + "loss": 0.6689, + "step": 4451 + }, + { + "epoch": 0.33687715182929134, + "grad_norm": 3.019207715988159, + "learning_rate": 1.424477805346547e-05, + "loss": 0.7578, + "step": 4452 + }, + { + "epoch": 0.336952820551625, + "grad_norm": 1.869498610496521, + "learning_rate": 1.4242947626697024e-05, + "loss": 0.6262, + "step": 4453 + }, + { + "epoch": 0.3370284892739586, + "grad_norm": 9.356559753417969, + "learning_rate": 1.4241116871598241e-05, + "loss": 0.6689, + "step": 4454 + }, + { + "epoch": 0.33710415799629223, + "grad_norm": 2.5027689933776855, + "learning_rate": 1.423928578828377e-05, + "loss": 0.7085, + "step": 4455 + }, + { + "epoch": 0.33717982671862584, + "grad_norm": 1.68281090259552, + "learning_rate": 1.4237454376868275e-05, + "loss": 0.7766, + "step": 4456 + }, + { + "epoch": 0.33725549544095945, + "grad_norm": 3.305651903152466, + "learning_rate": 1.4235622637466449e-05, + "loss": 0.7257, + "step": 4457 + }, + { + "epoch": 0.3373311641632931, + "grad_norm": 2.6844162940979004, + "learning_rate": 1.4233790570192997e-05, + "loss": 0.7563, + "step": 4458 + }, + { + "epoch": 0.33740683288562673, + "grad_norm": 1.9760267734527588, + "learning_rate": 1.423195817516265e-05, + "loss": 0.6608, + "step": 4459 + }, + { + "epoch": 0.33748250160796034, + "grad_norm": 2.771921157836914, + "learning_rate": 1.4230125452490165e-05, + "loss": 0.8818, + "step": 4460 + }, + { + "epoch": 0.33755817033029395, + "grad_norm": 1.8411167860031128, + "learning_rate": 1.4228292402290303e-05, + "loss": 0.7705, + "step": 4461 + }, + { + "epoch": 0.3376338390526276, + "grad_norm": 8.400818824768066, + "learning_rate": 1.4226459024677864e-05, + "loss": 0.7038, + "step": 4462 + }, + { + "epoch": 0.3377095077749612, + "grad_norm": 2.2699685096740723, + "learning_rate": 1.4224625319767654e-05, + "loss": 0.7097, + "step": 4463 + }, + { + "epoch": 0.33778517649729484, + "grad_norm": 2.2146406173706055, + "learning_rate": 1.422279128767451e-05, + "loss": 0.7979, + "step": 4464 + }, + { + "epoch": 0.33786084521962845, + "grad_norm": 1.8910346031188965, + "learning_rate": 1.4220956928513283e-05, + "loss": 0.7868, + "step": 4465 + }, + { + "epoch": 0.3379365139419621, + "grad_norm": 2.668886423110962, + "learning_rate": 1.4219122242398842e-05, + "loss": 0.7427, + "step": 4466 + }, + { + "epoch": 0.3380121826642957, + "grad_norm": 2.642848014831543, + "learning_rate": 1.4217287229446089e-05, + "loss": 0.6824, + "step": 4467 + }, + { + "epoch": 0.33808785138662933, + "grad_norm": 2.7786381244659424, + "learning_rate": 1.4215451889769936e-05, + "loss": 0.8064, + "step": 4468 + }, + { + "epoch": 0.33816352010896294, + "grad_norm": 2.077474355697632, + "learning_rate": 1.4213616223485314e-05, + "loss": 0.7639, + "step": 4469 + }, + { + "epoch": 0.3382391888312966, + "grad_norm": 2.304389715194702, + "learning_rate": 1.4211780230707184e-05, + "loss": 0.7471, + "step": 4470 + }, + { + "epoch": 0.3383148575536302, + "grad_norm": 8.346323013305664, + "learning_rate": 1.4209943911550519e-05, + "loss": 0.6181, + "step": 4471 + }, + { + "epoch": 0.33839052627596383, + "grad_norm": 1.981086254119873, + "learning_rate": 1.4208107266130313e-05, + "loss": 0.656, + "step": 4472 + }, + { + "epoch": 0.33846619499829744, + "grad_norm": 2.607759714126587, + "learning_rate": 1.4206270294561587e-05, + "loss": 0.8316, + "step": 4473 + }, + { + "epoch": 0.33854186372063105, + "grad_norm": 2.3243844509124756, + "learning_rate": 1.4204432996959373e-05, + "loss": 0.7953, + "step": 4474 + }, + { + 
"epoch": 0.3386175324429647, + "grad_norm": 2.651670217514038, + "learning_rate": 1.4202595373438735e-05, + "loss": 0.7781, + "step": 4475 + }, + { + "epoch": 0.33869320116529833, + "grad_norm": 2.400404930114746, + "learning_rate": 1.4200757424114745e-05, + "loss": 0.6033, + "step": 4476 + }, + { + "epoch": 0.33876886988763194, + "grad_norm": 2.4451756477355957, + "learning_rate": 1.4198919149102506e-05, + "loss": 0.7409, + "step": 4477 + }, + { + "epoch": 0.33884453860996555, + "grad_norm": 1.9781309366226196, + "learning_rate": 1.4197080548517134e-05, + "loss": 0.8582, + "step": 4478 + }, + { + "epoch": 0.3389202073322992, + "grad_norm": 2.500493288040161, + "learning_rate": 1.4195241622473765e-05, + "loss": 0.6146, + "step": 4479 + }, + { + "epoch": 0.3389958760546328, + "grad_norm": 2.1779065132141113, + "learning_rate": 1.419340237108757e-05, + "loss": 0.7664, + "step": 4480 + }, + { + "epoch": 0.33907154477696644, + "grad_norm": 2.633241891860962, + "learning_rate": 1.4191562794473713e-05, + "loss": 0.7935, + "step": 4481 + }, + { + "epoch": 0.33914721349930005, + "grad_norm": 1.5423035621643066, + "learning_rate": 1.4189722892747406e-05, + "loss": 0.957, + "step": 4482 + }, + { + "epoch": 0.3392228822216337, + "grad_norm": 2.039738178253174, + "learning_rate": 1.4187882666023866e-05, + "loss": 0.6976, + "step": 4483 + }, + { + "epoch": 0.3392985509439673, + "grad_norm": 2.4406769275665283, + "learning_rate": 1.4186042114418331e-05, + "loss": 0.7359, + "step": 4484 + }, + { + "epoch": 0.33937421966630094, + "grad_norm": 2.1993062496185303, + "learning_rate": 1.4184201238046069e-05, + "loss": 0.6014, + "step": 4485 + }, + { + "epoch": 0.33944988838863455, + "grad_norm": 2.3027052879333496, + "learning_rate": 1.4182360037022355e-05, + "loss": 0.8225, + "step": 4486 + }, + { + "epoch": 0.33952555711096816, + "grad_norm": 2.9717302322387695, + "learning_rate": 1.4180518511462497e-05, + "loss": 0.7826, + "step": 4487 + }, + { + "epoch": 0.3396012258333018, + "grad_norm": 2.1658003330230713, + "learning_rate": 1.4178676661481813e-05, + "loss": 0.8349, + "step": 4488 + }, + { + "epoch": 0.33967689455563543, + "grad_norm": 2.308877944946289, + "learning_rate": 1.417683448719564e-05, + "loss": 0.6189, + "step": 4489 + }, + { + "epoch": 0.33975256327796904, + "grad_norm": 1.7710340023040771, + "learning_rate": 1.4174991988719355e-05, + "loss": 0.7385, + "step": 4490 + }, + { + "epoch": 0.33982823200030265, + "grad_norm": 2.1375606060028076, + "learning_rate": 1.4173149166168332e-05, + "loss": 0.739, + "step": 4491 + }, + { + "epoch": 0.3399039007226363, + "grad_norm": 2.5131471157073975, + "learning_rate": 1.4171306019657974e-05, + "loss": 0.7041, + "step": 4492 + }, + { + "epoch": 0.33997956944496993, + "grad_norm": 2.478649139404297, + "learning_rate": 1.416946254930371e-05, + "loss": 0.7395, + "step": 4493 + }, + { + "epoch": 0.34005523816730354, + "grad_norm": 2.2513961791992188, + "learning_rate": 1.416761875522098e-05, + "loss": 0.7747, + "step": 4494 + }, + { + "epoch": 0.34013090688963715, + "grad_norm": 2.2607710361480713, + "learning_rate": 1.416577463752525e-05, + "loss": 0.6677, + "step": 4495 + }, + { + "epoch": 0.3402065756119708, + "grad_norm": 1.9935152530670166, + "learning_rate": 1.4163930196332004e-05, + "loss": 0.7986, + "step": 4496 + }, + { + "epoch": 0.34028224433430443, + "grad_norm": 2.159668207168579, + "learning_rate": 1.4162085431756746e-05, + "loss": 1.073, + "step": 4497 + }, + { + "epoch": 0.34035791305663804, + "grad_norm": 1.8614026308059692, + 
"learning_rate": 1.4160240343915002e-05, + "loss": 0.5685, + "step": 4498 + }, + { + "epoch": 0.34043358177897165, + "grad_norm": 2.1379287242889404, + "learning_rate": 1.4158394932922315e-05, + "loss": 0.7633, + "step": 4499 + }, + { + "epoch": 0.34050925050130526, + "grad_norm": 2.2675228118896484, + "learning_rate": 1.4156549198894257e-05, + "loss": 0.8834, + "step": 4500 + }, + { + "epoch": 0.3405849192236389, + "grad_norm": 2.828331708908081, + "learning_rate": 1.415470314194641e-05, + "loss": 0.8714, + "step": 4501 + }, + { + "epoch": 0.34066058794597254, + "grad_norm": 2.267286777496338, + "learning_rate": 1.4152856762194377e-05, + "loss": 0.7331, + "step": 4502 + }, + { + "epoch": 0.34073625666830615, + "grad_norm": 2.4142282009124756, + "learning_rate": 1.415101005975379e-05, + "loss": 0.7392, + "step": 4503 + }, + { + "epoch": 0.34081192539063976, + "grad_norm": 2.212761878967285, + "learning_rate": 1.4149163034740291e-05, + "loss": 0.7835, + "step": 4504 + }, + { + "epoch": 0.3408875941129734, + "grad_norm": 2.463355541229248, + "learning_rate": 1.4147315687269547e-05, + "loss": 0.75, + "step": 4505 + }, + { + "epoch": 0.34096326283530703, + "grad_norm": 2.5899860858917236, + "learning_rate": 1.414546801745725e-05, + "loss": 0.6397, + "step": 4506 + }, + { + "epoch": 0.34103893155764065, + "grad_norm": 2.4131081104278564, + "learning_rate": 1.4143620025419099e-05, + "loss": 0.7682, + "step": 4507 + }, + { + "epoch": 0.34111460027997426, + "grad_norm": 2.420891284942627, + "learning_rate": 1.414177171127083e-05, + "loss": 0.7546, + "step": 4508 + }, + { + "epoch": 0.3411902690023079, + "grad_norm": 2.2223784923553467, + "learning_rate": 1.4139923075128185e-05, + "loss": 0.6996, + "step": 4509 + }, + { + "epoch": 0.34126593772464153, + "grad_norm": 2.2744736671447754, + "learning_rate": 1.413807411710693e-05, + "loss": 0.846, + "step": 4510 + }, + { + "epoch": 0.34134160644697514, + "grad_norm": 1.9180521965026855, + "learning_rate": 1.4136224837322857e-05, + "loss": 0.7467, + "step": 4511 + }, + { + "epoch": 0.34141727516930875, + "grad_norm": 2.1215381622314453, + "learning_rate": 1.413437523589177e-05, + "loss": 0.7741, + "step": 4512 + }, + { + "epoch": 0.34149294389164236, + "grad_norm": 2.879868984222412, + "learning_rate": 1.4132525312929501e-05, + "loss": 0.7951, + "step": 4513 + }, + { + "epoch": 0.34156861261397603, + "grad_norm": 2.1960246562957764, + "learning_rate": 1.4130675068551898e-05, + "loss": 0.7194, + "step": 4514 + }, + { + "epoch": 0.34164428133630964, + "grad_norm": 2.2980356216430664, + "learning_rate": 1.4128824502874824e-05, + "loss": 0.7676, + "step": 4515 + }, + { + "epoch": 0.34171995005864325, + "grad_norm": 2.5067970752716064, + "learning_rate": 1.412697361601417e-05, + "loss": 0.6625, + "step": 4516 + }, + { + "epoch": 0.34179561878097686, + "grad_norm": 2.2618825435638428, + "learning_rate": 1.4125122408085849e-05, + "loss": 0.7369, + "step": 4517 + }, + { + "epoch": 0.34187128750331053, + "grad_norm": 3.191148281097412, + "learning_rate": 1.4123270879205787e-05, + "loss": 0.694, + "step": 4518 + }, + { + "epoch": 0.34194695622564414, + "grad_norm": 1.9559108018875122, + "learning_rate": 1.412141902948993e-05, + "loss": 0.6441, + "step": 4519 + }, + { + "epoch": 0.34202262494797775, + "grad_norm": 2.6855931282043457, + "learning_rate": 1.4119566859054249e-05, + "loss": 0.8047, + "step": 4520 + }, + { + "epoch": 0.34209829367031136, + "grad_norm": 1.9309477806091309, + "learning_rate": 1.4117714368014732e-05, + "loss": 0.7086, + "step": 4521 + }, + 
{ + "epoch": 0.342173962392645, + "grad_norm": 2.5932657718658447, + "learning_rate": 1.4115861556487388e-05, + "loss": 0.8026, + "step": 4522 + }, + { + "epoch": 0.34224963111497864, + "grad_norm": 2.6737778186798096, + "learning_rate": 1.4114008424588249e-05, + "loss": 0.7373, + "step": 4523 + }, + { + "epoch": 0.34232529983731225, + "grad_norm": 1.9361664056777954, + "learning_rate": 1.411215497243336e-05, + "loss": 0.6394, + "step": 4524 + }, + { + "epoch": 0.34240096855964586, + "grad_norm": 2.822296619415283, + "learning_rate": 1.4110301200138793e-05, + "loss": 0.6913, + "step": 4525 + }, + { + "epoch": 0.34247663728197947, + "grad_norm": 1.9152315855026245, + "learning_rate": 1.4108447107820634e-05, + "loss": 0.7587, + "step": 4526 + }, + { + "epoch": 0.34255230600431313, + "grad_norm": 2.4261810779571533, + "learning_rate": 1.4106592695594997e-05, + "loss": 0.7208, + "step": 4527 + }, + { + "epoch": 0.34262797472664674, + "grad_norm": 2.7643346786499023, + "learning_rate": 1.4104737963578006e-05, + "loss": 0.7681, + "step": 4528 + }, + { + "epoch": 0.34270364344898036, + "grad_norm": 1.8809469938278198, + "learning_rate": 1.4102882911885817e-05, + "loss": 0.7987, + "step": 4529 + }, + { + "epoch": 0.34277931217131397, + "grad_norm": 2.6248059272766113, + "learning_rate": 1.4101027540634591e-05, + "loss": 0.7945, + "step": 4530 + }, + { + "epoch": 0.34285498089364763, + "grad_norm": 2.5834579467773438, + "learning_rate": 1.4099171849940526e-05, + "loss": 0.7192, + "step": 4531 + }, + { + "epoch": 0.34293064961598124, + "grad_norm": 2.4351603984832764, + "learning_rate": 1.4097315839919824e-05, + "loss": 0.79, + "step": 4532 + }, + { + "epoch": 0.34300631833831485, + "grad_norm": 1.948221206665039, + "learning_rate": 1.4095459510688717e-05, + "loss": 0.5718, + "step": 4533 + }, + { + "epoch": 0.34308198706064846, + "grad_norm": 2.2318930625915527, + "learning_rate": 1.4093602862363455e-05, + "loss": 0.673, + "step": 4534 + }, + { + "epoch": 0.34315765578298213, + "grad_norm": 2.6008236408233643, + "learning_rate": 1.4091745895060307e-05, + "loss": 0.7378, + "step": 4535 + }, + { + "epoch": 0.34323332450531574, + "grad_norm": 2.340876579284668, + "learning_rate": 1.4089888608895564e-05, + "loss": 0.6036, + "step": 4536 + }, + { + "epoch": 0.34330899322764935, + "grad_norm": 2.4017269611358643, + "learning_rate": 1.4088031003985535e-05, + "loss": 0.6923, + "step": 4537 + }, + { + "epoch": 0.34338466194998296, + "grad_norm": 1.9680914878845215, + "learning_rate": 1.4086173080446543e-05, + "loss": 0.91, + "step": 4538 + }, + { + "epoch": 0.34346033067231657, + "grad_norm": 1.9939541816711426, + "learning_rate": 1.4084314838394944e-05, + "loss": 0.7028, + "step": 4539 + }, + { + "epoch": 0.34353599939465024, + "grad_norm": 2.0983216762542725, + "learning_rate": 1.4082456277947105e-05, + "loss": 0.8404, + "step": 4540 + }, + { + "epoch": 0.34361166811698385, + "grad_norm": 2.5122082233428955, + "learning_rate": 1.4080597399219415e-05, + "loss": 0.7113, + "step": 4541 + }, + { + "epoch": 0.34368733683931746, + "grad_norm": 2.6036436557769775, + "learning_rate": 1.4078738202328287e-05, + "loss": 0.7351, + "step": 4542 + }, + { + "epoch": 0.34376300556165107, + "grad_norm": 2.5619027614593506, + "learning_rate": 1.4076878687390143e-05, + "loss": 0.8871, + "step": 4543 + }, + { + "epoch": 0.34383867428398474, + "grad_norm": 2.4395925998687744, + "learning_rate": 1.4075018854521434e-05, + "loss": 0.7969, + "step": 4544 + }, + { + "epoch": 0.34391434300631835, + "grad_norm": 2.3042728900909424, 
+ "learning_rate": 1.4073158703838632e-05, + "loss": 0.6841, + "step": 4545 + }, + { + "epoch": 0.34399001172865196, + "grad_norm": 1.9354417324066162, + "learning_rate": 1.4071298235458222e-05, + "loss": 0.6648, + "step": 4546 + }, + { + "epoch": 0.34406568045098557, + "grad_norm": 2.020469903945923, + "learning_rate": 1.4069437449496715e-05, + "loss": 0.8337, + "step": 4547 + }, + { + "epoch": 0.34414134917331923, + "grad_norm": 2.3265795707702637, + "learning_rate": 1.4067576346070637e-05, + "loss": 0.7247, + "step": 4548 + }, + { + "epoch": 0.34421701789565284, + "grad_norm": 1.9919472932815552, + "learning_rate": 1.4065714925296538e-05, + "loss": 0.7174, + "step": 4549 + }, + { + "epoch": 0.34429268661798645, + "grad_norm": 4.500394821166992, + "learning_rate": 1.4063853187290988e-05, + "loss": 0.8554, + "step": 4550 + }, + { + "epoch": 0.34436835534032006, + "grad_norm": 3.009972095489502, + "learning_rate": 1.4061991132170571e-05, + "loss": 0.9941, + "step": 4551 + }, + { + "epoch": 0.3444440240626537, + "grad_norm": 2.4552001953125, + "learning_rate": 1.4060128760051897e-05, + "loss": 0.7654, + "step": 4552 + }, + { + "epoch": 0.34451969278498734, + "grad_norm": 1.9885838031768799, + "learning_rate": 1.4058266071051593e-05, + "loss": 0.858, + "step": 4553 + }, + { + "epoch": 0.34459536150732095, + "grad_norm": 2.1382341384887695, + "learning_rate": 1.4056403065286308e-05, + "loss": 0.8359, + "step": 4554 + }, + { + "epoch": 0.34467103022965456, + "grad_norm": 1.8705214262008667, + "learning_rate": 1.4054539742872708e-05, + "loss": 0.6685, + "step": 4555 + }, + { + "epoch": 0.3447466989519882, + "grad_norm": 2.376476287841797, + "learning_rate": 1.405267610392748e-05, + "loss": 0.9648, + "step": 4556 + }, + { + "epoch": 0.34482236767432184, + "grad_norm": 2.8174350261688232, + "learning_rate": 1.4050812148567337e-05, + "loss": 0.7507, + "step": 4557 + }, + { + "epoch": 0.34489803639665545, + "grad_norm": 2.0304601192474365, + "learning_rate": 1.4048947876908994e-05, + "loss": 1.0117, + "step": 4558 + }, + { + "epoch": 0.34497370511898906, + "grad_norm": 2.247481107711792, + "learning_rate": 1.4047083289069209e-05, + "loss": 0.7993, + "step": 4559 + }, + { + "epoch": 0.34504937384132267, + "grad_norm": 2.2741150856018066, + "learning_rate": 1.4045218385164743e-05, + "loss": 0.8526, + "step": 4560 + }, + { + "epoch": 0.34512504256365634, + "grad_norm": 1.9066141843795776, + "learning_rate": 1.4043353165312383e-05, + "loss": 0.676, + "step": 4561 + }, + { + "epoch": 0.34520071128598995, + "grad_norm": 2.4865188598632812, + "learning_rate": 1.4041487629628936e-05, + "loss": 0.7842, + "step": 4562 + }, + { + "epoch": 0.34527638000832356, + "grad_norm": 2.2557551860809326, + "learning_rate": 1.4039621778231228e-05, + "loss": 0.8724, + "step": 4563 + }, + { + "epoch": 0.34535204873065717, + "grad_norm": 3.9412360191345215, + "learning_rate": 1.4037755611236103e-05, + "loss": 0.6768, + "step": 4564 + }, + { + "epoch": 0.3454277174529908, + "grad_norm": 2.0502843856811523, + "learning_rate": 1.403588912876043e-05, + "loss": 0.614, + "step": 4565 + }, + { + "epoch": 0.34550338617532445, + "grad_norm": 2.2128398418426514, + "learning_rate": 1.403402233092109e-05, + "loss": 0.8441, + "step": 4566 + }, + { + "epoch": 0.34557905489765806, + "grad_norm": 1.9671826362609863, + "learning_rate": 1.403215521783499e-05, + "loss": 0.7818, + "step": 4567 + }, + { + "epoch": 0.34565472361999167, + "grad_norm": 2.188419818878174, + "learning_rate": 1.4030287789619055e-05, + "loss": 0.7472, + "step": 4568 + 
}, + { + "epoch": 0.3457303923423253, + "grad_norm": 2.099836587905884, + "learning_rate": 1.4028420046390227e-05, + "loss": 0.7362, + "step": 4569 + }, + { + "epoch": 0.34580606106465894, + "grad_norm": 2.1307973861694336, + "learning_rate": 1.4026551988265472e-05, + "loss": 0.824, + "step": 4570 + }, + { + "epoch": 0.34588172978699255, + "grad_norm": 2.5330209732055664, + "learning_rate": 1.4024683615361774e-05, + "loss": 0.8464, + "step": 4571 + }, + { + "epoch": 0.34595739850932616, + "grad_norm": 2.512056589126587, + "learning_rate": 1.4022814927796137e-05, + "loss": 0.8106, + "step": 4572 + }, + { + "epoch": 0.3460330672316598, + "grad_norm": 2.4882543087005615, + "learning_rate": 1.4020945925685584e-05, + "loss": 0.7155, + "step": 4573 + }, + { + "epoch": 0.34610873595399344, + "grad_norm": 2.098118782043457, + "learning_rate": 1.4019076609147158e-05, + "loss": 0.6978, + "step": 4574 + }, + { + "epoch": 0.34618440467632705, + "grad_norm": 2.1440021991729736, + "learning_rate": 1.401720697829792e-05, + "loss": 0.6937, + "step": 4575 + }, + { + "epoch": 0.34626007339866066, + "grad_norm": 2.386068820953369, + "learning_rate": 1.401533703325495e-05, + "loss": 0.7601, + "step": 4576 + }, + { + "epoch": 0.34633574212099427, + "grad_norm": 2.6994619369506836, + "learning_rate": 1.4013466774135355e-05, + "loss": 0.5682, + "step": 4577 + }, + { + "epoch": 0.3464114108433279, + "grad_norm": 2.2204699516296387, + "learning_rate": 1.4011596201056259e-05, + "loss": 0.7343, + "step": 4578 + }, + { + "epoch": 0.34648707956566155, + "grad_norm": 2.2778122425079346, + "learning_rate": 1.4009725314134795e-05, + "loss": 0.673, + "step": 4579 + }, + { + "epoch": 0.34656274828799516, + "grad_norm": 2.2870404720306396, + "learning_rate": 1.4007854113488132e-05, + "loss": 0.7098, + "step": 4580 + }, + { + "epoch": 0.34663841701032877, + "grad_norm": 1.8866722583770752, + "learning_rate": 1.4005982599233442e-05, + "loss": 0.6607, + "step": 4581 + }, + { + "epoch": 0.3467140857326624, + "grad_norm": 1.7772208452224731, + "learning_rate": 1.4004110771487935e-05, + "loss": 0.8816, + "step": 4582 + }, + { + "epoch": 0.34678975445499605, + "grad_norm": 2.5590288639068604, + "learning_rate": 1.4002238630368825e-05, + "loss": 0.8935, + "step": 4583 + }, + { + "epoch": 0.34686542317732966, + "grad_norm": 2.385871171951294, + "learning_rate": 1.4000366175993354e-05, + "loss": 0.8033, + "step": 4584 + }, + { + "epoch": 0.34694109189966327, + "grad_norm": 2.1621901988983154, + "learning_rate": 1.3998493408478778e-05, + "loss": 0.8084, + "step": 4585 + }, + { + "epoch": 0.3470167606219969, + "grad_norm": 2.374621868133545, + "learning_rate": 1.3996620327942377e-05, + "loss": 0.7405, + "step": 4586 + }, + { + "epoch": 0.34709242934433054, + "grad_norm": 1.7292189598083496, + "learning_rate": 1.3994746934501451e-05, + "loss": 0.759, + "step": 4587 + }, + { + "epoch": 0.34716809806666415, + "grad_norm": 1.871042013168335, + "learning_rate": 1.3992873228273317e-05, + "loss": 0.6202, + "step": 4588 + }, + { + "epoch": 0.34724376678899777, + "grad_norm": 2.0830862522125244, + "learning_rate": 1.3990999209375314e-05, + "loss": 0.915, + "step": 4589 + }, + { + "epoch": 0.3473194355113314, + "grad_norm": 2.267789363861084, + "learning_rate": 1.3989124877924795e-05, + "loss": 0.8031, + "step": 4590 + }, + { + "epoch": 0.34739510423366504, + "grad_norm": 2.1552555561065674, + "learning_rate": 1.3987250234039143e-05, + "loss": 0.818, + "step": 4591 + }, + { + "epoch": 0.34747077295599865, + "grad_norm": 1.9342949390411377, + 
"learning_rate": 1.3985375277835748e-05, + "loss": 0.7326, + "step": 4592 + }, + { + "epoch": 0.34754644167833226, + "grad_norm": 2.7303926944732666, + "learning_rate": 1.3983500009432028e-05, + "loss": 0.7703, + "step": 4593 + }, + { + "epoch": 0.3476221104006659, + "grad_norm": 2.3340682983398438, + "learning_rate": 1.3981624428945419e-05, + "loss": 0.6769, + "step": 4594 + }, + { + "epoch": 0.3476977791229995, + "grad_norm": 2.0905210971832275, + "learning_rate": 1.3979748536493376e-05, + "loss": 0.8423, + "step": 4595 + }, + { + "epoch": 0.34777344784533315, + "grad_norm": 2.2317705154418945, + "learning_rate": 1.3977872332193375e-05, + "loss": 0.6386, + "step": 4596 + }, + { + "epoch": 0.34784911656766676, + "grad_norm": 2.2130250930786133, + "learning_rate": 1.3975995816162904e-05, + "loss": 0.8846, + "step": 4597 + }, + { + "epoch": 0.34792478529000037, + "grad_norm": 2.068493127822876, + "learning_rate": 1.3974118988519486e-05, + "loss": 0.838, + "step": 4598 + }, + { + "epoch": 0.348000454012334, + "grad_norm": 2.0455193519592285, + "learning_rate": 1.3972241849380645e-05, + "loss": 0.6782, + "step": 4599 + }, + { + "epoch": 0.34807612273466765, + "grad_norm": 1.4974215030670166, + "learning_rate": 1.3970364398863938e-05, + "loss": 0.7652, + "step": 4600 + }, + { + "epoch": 0.34815179145700126, + "grad_norm": 1.7549113035202026, + "learning_rate": 1.3968486637086936e-05, + "loss": 0.8342, + "step": 4601 + }, + { + "epoch": 0.34822746017933487, + "grad_norm": 2.3468220233917236, + "learning_rate": 1.3966608564167231e-05, + "loss": 0.6665, + "step": 4602 + }, + { + "epoch": 0.3483031289016685, + "grad_norm": 2.2296173572540283, + "learning_rate": 1.3964730180222437e-05, + "loss": 0.8537, + "step": 4603 + }, + { + "epoch": 0.34837879762400215, + "grad_norm": 2.597890853881836, + "learning_rate": 1.3962851485370178e-05, + "loss": 0.7739, + "step": 4604 + }, + { + "epoch": 0.34845446634633576, + "grad_norm": 2.384274482727051, + "learning_rate": 1.3960972479728105e-05, + "loss": 0.7863, + "step": 4605 + }, + { + "epoch": 0.34853013506866937, + "grad_norm": 2.0478515625, + "learning_rate": 1.3959093163413893e-05, + "loss": 0.8538, + "step": 4606 + }, + { + "epoch": 0.348605803791003, + "grad_norm": 2.03800630569458, + "learning_rate": 1.3957213536545227e-05, + "loss": 0.716, + "step": 4607 + }, + { + "epoch": 0.3486814725133366, + "grad_norm": 2.1581063270568848, + "learning_rate": 1.3955333599239813e-05, + "loss": 0.6596, + "step": 4608 + }, + { + "epoch": 0.34875714123567025, + "grad_norm": 2.4075028896331787, + "learning_rate": 1.3953453351615387e-05, + "loss": 0.7188, + "step": 4609 + }, + { + "epoch": 0.34883280995800386, + "grad_norm": 2.663573980331421, + "learning_rate": 1.3951572793789685e-05, + "loss": 0.6679, + "step": 4610 + }, + { + "epoch": 0.3489084786803375, + "grad_norm": 1.9524000883102417, + "learning_rate": 1.3949691925880481e-05, + "loss": 0.8243, + "step": 4611 + }, + { + "epoch": 0.3489841474026711, + "grad_norm": 2.518850803375244, + "learning_rate": 1.3947810748005563e-05, + "loss": 0.7894, + "step": 4612 + }, + { + "epoch": 0.34905981612500475, + "grad_norm": 3.2514965534210205, + "learning_rate": 1.3945929260282729e-05, + "loss": 0.9017, + "step": 4613 + }, + { + "epoch": 0.34913548484733836, + "grad_norm": 2.429962635040283, + "learning_rate": 1.3944047462829808e-05, + "loss": 0.7169, + "step": 4614 + }, + { + "epoch": 0.349211153569672, + "grad_norm": 2.0547587871551514, + "learning_rate": 1.3942165355764644e-05, + "loss": 0.7357, + "step": 4615 + }, + { + 
"epoch": 0.3492868222920056, + "grad_norm": 1.767414927482605, + "learning_rate": 1.3940282939205102e-05, + "loss": 0.8615, + "step": 4616 + }, + { + "epoch": 0.34936249101433925, + "grad_norm": 3.1079351902008057, + "learning_rate": 1.3938400213269061e-05, + "loss": 0.7191, + "step": 4617 + }, + { + "epoch": 0.34943815973667286, + "grad_norm": 2.7524807453155518, + "learning_rate": 1.3936517178074428e-05, + "loss": 0.8091, + "step": 4618 + }, + { + "epoch": 0.34951382845900647, + "grad_norm": 2.906195878982544, + "learning_rate": 1.3934633833739122e-05, + "loss": 0.7484, + "step": 4619 + }, + { + "epoch": 0.3495894971813401, + "grad_norm": 2.340843677520752, + "learning_rate": 1.3932750180381083e-05, + "loss": 0.6663, + "step": 4620 + }, + { + "epoch": 0.3496651659036737, + "grad_norm": 2.3411765098571777, + "learning_rate": 1.3930866218118278e-05, + "loss": 0.7908, + "step": 4621 + }, + { + "epoch": 0.34974083462600736, + "grad_norm": 2.4406328201293945, + "learning_rate": 1.3928981947068676e-05, + "loss": 0.8513, + "step": 4622 + }, + { + "epoch": 0.34981650334834097, + "grad_norm": 2.0467100143432617, + "learning_rate": 1.3927097367350286e-05, + "loss": 0.7571, + "step": 4623 + }, + { + "epoch": 0.3498921720706746, + "grad_norm": 11.604252815246582, + "learning_rate": 1.3925212479081125e-05, + "loss": 0.7548, + "step": 4624 + }, + { + "epoch": 0.3499678407930082, + "grad_norm": 2.3513424396514893, + "learning_rate": 1.3923327282379224e-05, + "loss": 0.7268, + "step": 4625 + }, + { + "epoch": 0.35004350951534186, + "grad_norm": 2.104701042175293, + "learning_rate": 1.3921441777362647e-05, + "loss": 0.7942, + "step": 4626 + }, + { + "epoch": 0.35011917823767547, + "grad_norm": 2.1579172611236572, + "learning_rate": 1.3919555964149467e-05, + "loss": 0.7785, + "step": 4627 + }, + { + "epoch": 0.3501948469600091, + "grad_norm": 1.857549786567688, + "learning_rate": 1.391766984285778e-05, + "loss": 0.7444, + "step": 4628 + }, + { + "epoch": 0.3502705156823427, + "grad_norm": 2.2894248962402344, + "learning_rate": 1.3915783413605705e-05, + "loss": 0.6749, + "step": 4629 + }, + { + "epoch": 0.35034618440467635, + "grad_norm": 2.237196207046509, + "learning_rate": 1.3913896676511369e-05, + "loss": 0.845, + "step": 4630 + }, + { + "epoch": 0.35042185312700996, + "grad_norm": 1.8301403522491455, + "learning_rate": 1.3912009631692933e-05, + "loss": 0.8037, + "step": 4631 + }, + { + "epoch": 0.3504975218493436, + "grad_norm": 2.5242412090301514, + "learning_rate": 1.3910122279268563e-05, + "loss": 0.8057, + "step": 4632 + }, + { + "epoch": 0.3505731905716772, + "grad_norm": 2.2044260501861572, + "learning_rate": 1.3908234619356456e-05, + "loss": 0.8019, + "step": 4633 + }, + { + "epoch": 0.3506488592940108, + "grad_norm": 2.227987289428711, + "learning_rate": 1.3906346652074823e-05, + "loss": 0.7116, + "step": 4634 + }, + { + "epoch": 0.35072452801634446, + "grad_norm": 2.293186902999878, + "learning_rate": 1.3904458377541892e-05, + "loss": 0.6701, + "step": 4635 + }, + { + "epoch": 0.35080019673867807, + "grad_norm": 2.3717048168182373, + "learning_rate": 1.3902569795875918e-05, + "loss": 0.8981, + "step": 4636 + }, + { + "epoch": 0.3508758654610117, + "grad_norm": 2.319140672683716, + "learning_rate": 1.3900680907195162e-05, + "loss": 0.718, + "step": 4637 + }, + { + "epoch": 0.3509515341833453, + "grad_norm": 2.073474168777466, + "learning_rate": 1.389879171161792e-05, + "loss": 0.5373, + "step": 4638 + }, + { + "epoch": 0.35102720290567896, + "grad_norm": 1.9484330415725708, + "learning_rate": 
1.3896902209262496e-05, + "loss": 0.7226, + "step": 4639 + }, + { + "epoch": 0.35110287162801257, + "grad_norm": 2.9964051246643066, + "learning_rate": 1.3895012400247216e-05, + "loss": 0.6546, + "step": 4640 + }, + { + "epoch": 0.3511785403503462, + "grad_norm": 2.1370325088500977, + "learning_rate": 1.3893122284690426e-05, + "loss": 0.9182, + "step": 4641 + }, + { + "epoch": 0.3512542090726798, + "grad_norm": 2.6059162616729736, + "learning_rate": 1.3891231862710495e-05, + "loss": 0.8695, + "step": 4642 + }, + { + "epoch": 0.35132987779501346, + "grad_norm": 2.8685076236724854, + "learning_rate": 1.3889341134425802e-05, + "loss": 0.6888, + "step": 4643 + }, + { + "epoch": 0.35140554651734707, + "grad_norm": 2.9174864292144775, + "learning_rate": 1.3887450099954757e-05, + "loss": 0.7698, + "step": 4644 + }, + { + "epoch": 0.3514812152396807, + "grad_norm": 2.5367772579193115, + "learning_rate": 1.3885558759415778e-05, + "loss": 0.7116, + "step": 4645 + }, + { + "epoch": 0.3515568839620143, + "grad_norm": 2.0750954151153564, + "learning_rate": 1.3883667112927305e-05, + "loss": 0.6243, + "step": 4646 + }, + { + "epoch": 0.3516325526843479, + "grad_norm": 1.9517443180084229, + "learning_rate": 1.3881775160607804e-05, + "loss": 0.596, + "step": 4647 + }, + { + "epoch": 0.35170822140668156, + "grad_norm": 2.155714988708496, + "learning_rate": 1.387988290257575e-05, + "loss": 0.636, + "step": 4648 + }, + { + "epoch": 0.3517838901290152, + "grad_norm": 2.3737261295318604, + "learning_rate": 1.3877990338949647e-05, + "loss": 0.5942, + "step": 4649 + }, + { + "epoch": 0.3518595588513488, + "grad_norm": 2.2594027519226074, + "learning_rate": 1.3876097469848013e-05, + "loss": 0.7346, + "step": 4650 + }, + { + "epoch": 0.3519352275736824, + "grad_norm": 2.5711894035339355, + "learning_rate": 1.3874204295389382e-05, + "loss": 0.7492, + "step": 4651 + }, + { + "epoch": 0.35201089629601606, + "grad_norm": 1.7782737016677856, + "learning_rate": 1.3872310815692313e-05, + "loss": 0.7328, + "step": 4652 + }, + { + "epoch": 0.3520865650183497, + "grad_norm": 2.088243246078491, + "learning_rate": 1.3870417030875383e-05, + "loss": 0.6446, + "step": 4653 + }, + { + "epoch": 0.3521622337406833, + "grad_norm": 1.7141531705856323, + "learning_rate": 1.3868522941057184e-05, + "loss": 0.6885, + "step": 4654 + }, + { + "epoch": 0.3522379024630169, + "grad_norm": 1.8343784809112549, + "learning_rate": 1.3866628546356334e-05, + "loss": 0.7294, + "step": 4655 + }, + { + "epoch": 0.35231357118535056, + "grad_norm": 2.9176583290100098, + "learning_rate": 1.386473384689146e-05, + "loss": 0.6369, + "step": 4656 + }, + { + "epoch": 0.35238923990768417, + "grad_norm": 2.6107664108276367, + "learning_rate": 1.3862838842781222e-05, + "loss": 0.6926, + "step": 4657 + }, + { + "epoch": 0.3524649086300178, + "grad_norm": 2.2640960216522217, + "learning_rate": 1.3860943534144288e-05, + "loss": 0.6889, + "step": 4658 + }, + { + "epoch": 0.3525405773523514, + "grad_norm": 2.1459038257598877, + "learning_rate": 1.3859047921099342e-05, + "loss": 0.8295, + "step": 4659 + }, + { + "epoch": 0.352616246074685, + "grad_norm": 2.1899123191833496, + "learning_rate": 1.3857152003765108e-05, + "loss": 0.7613, + "step": 4660 + }, + { + "epoch": 0.35269191479701867, + "grad_norm": 2.238300085067749, + "learning_rate": 1.3855255782260297e-05, + "loss": 0.7159, + "step": 4661 + }, + { + "epoch": 0.3527675835193523, + "grad_norm": 2.169271945953369, + "learning_rate": 1.3853359256703668e-05, + "loss": 0.7097, + "step": 4662 + }, + { + "epoch": 
0.3528432522416859, + "grad_norm": 2.406660795211792, + "learning_rate": 1.3851462427213986e-05, + "loss": 0.9081, + "step": 4663 + }, + { + "epoch": 0.3529189209640195, + "grad_norm": 2.2769057750701904, + "learning_rate": 1.3849565293910034e-05, + "loss": 0.7397, + "step": 4664 + }, + { + "epoch": 0.35299458968635317, + "grad_norm": 2.031386137008667, + "learning_rate": 1.3847667856910621e-05, + "loss": 0.8047, + "step": 4665 + }, + { + "epoch": 0.3530702584086868, + "grad_norm": 2.512640953063965, + "learning_rate": 1.3845770116334561e-05, + "loss": 0.8456, + "step": 4666 + }, + { + "epoch": 0.3531459271310204, + "grad_norm": 2.3293848037719727, + "learning_rate": 1.384387207230071e-05, + "loss": 0.6356, + "step": 4667 + }, + { + "epoch": 0.353221595853354, + "grad_norm": 2.598417043685913, + "learning_rate": 1.384197372492792e-05, + "loss": 0.7213, + "step": 4668 + }, + { + "epoch": 0.35329726457568766, + "grad_norm": 2.296818733215332, + "learning_rate": 1.3840075074335074e-05, + "loss": 0.7332, + "step": 4669 + }, + { + "epoch": 0.3533729332980213, + "grad_norm": 1.8637245893478394, + "learning_rate": 1.3838176120641071e-05, + "loss": 0.4631, + "step": 4670 + }, + { + "epoch": 0.3534486020203549, + "grad_norm": 2.56315541267395, + "learning_rate": 1.3836276863964834e-05, + "loss": 0.7563, + "step": 4671 + }, + { + "epoch": 0.3535242707426885, + "grad_norm": 2.7444026470184326, + "learning_rate": 1.3834377304425298e-05, + "loss": 0.7095, + "step": 4672 + }, + { + "epoch": 0.3535999394650221, + "grad_norm": 1.994140386581421, + "learning_rate": 1.3832477442141416e-05, + "loss": 0.6721, + "step": 4673 + }, + { + "epoch": 0.35367560818735577, + "grad_norm": 2.4888455867767334, + "learning_rate": 1.3830577277232164e-05, + "loss": 0.7671, + "step": 4674 + }, + { + "epoch": 0.3537512769096894, + "grad_norm": 1.9443211555480957, + "learning_rate": 1.3828676809816543e-05, + "loss": 0.795, + "step": 4675 + }, + { + "epoch": 0.353826945632023, + "grad_norm": 2.254079818725586, + "learning_rate": 1.3826776040013563e-05, + "loss": 0.6806, + "step": 4676 + }, + { + "epoch": 0.3539026143543566, + "grad_norm": 2.345444440841675, + "learning_rate": 1.3824874967942251e-05, + "loss": 0.6459, + "step": 4677 + }, + { + "epoch": 0.35397828307669027, + "grad_norm": 2.6580867767333984, + "learning_rate": 1.382297359372167e-05, + "loss": 0.9527, + "step": 4678 + }, + { + "epoch": 0.3540539517990239, + "grad_norm": 2.4587290287017822, + "learning_rate": 1.3821071917470877e-05, + "loss": 0.8039, + "step": 4679 + }, + { + "epoch": 0.3541296205213575, + "grad_norm": 1.9164375066757202, + "learning_rate": 1.3819169939308969e-05, + "loss": 0.7868, + "step": 4680 + }, + { + "epoch": 0.3542052892436911, + "grad_norm": 2.0153324604034424, + "learning_rate": 1.3817267659355055e-05, + "loss": 0.739, + "step": 4681 + }, + { + "epoch": 0.35428095796602477, + "grad_norm": 2.2942817211151123, + "learning_rate": 1.3815365077728255e-05, + "loss": 0.8144, + "step": 4682 + }, + { + "epoch": 0.3543566266883584, + "grad_norm": 2.601985216140747, + "learning_rate": 1.3813462194547724e-05, + "loss": 0.7122, + "step": 4683 + }, + { + "epoch": 0.354432295410692, + "grad_norm": 1.8438481092453003, + "learning_rate": 1.3811559009932615e-05, + "loss": 0.7519, + "step": 4684 + }, + { + "epoch": 0.3545079641330256, + "grad_norm": 2.120218276977539, + "learning_rate": 1.3809655524002124e-05, + "loss": 0.7314, + "step": 4685 + }, + { + "epoch": 0.3545836328553592, + "grad_norm": 2.4902069568634033, + "learning_rate": 
1.3807751736875446e-05, + "loss": 0.6525, + "step": 4686 + }, + { + "epoch": 0.3546593015776929, + "grad_norm": 2.555546998977661, + "learning_rate": 1.3805847648671803e-05, + "loss": 0.7323, + "step": 4687 + }, + { + "epoch": 0.3547349703000265, + "grad_norm": 2.1880481243133545, + "learning_rate": 1.3803943259510439e-05, + "loss": 0.792, + "step": 4688 + }, + { + "epoch": 0.3548106390223601, + "grad_norm": 2.3539915084838867, + "learning_rate": 1.3802038569510606e-05, + "loss": 0.7203, + "step": 4689 + }, + { + "epoch": 0.3548863077446937, + "grad_norm": 2.6515328884124756, + "learning_rate": 1.3800133578791591e-05, + "loss": 0.8069, + "step": 4690 + }, + { + "epoch": 0.3549619764670274, + "grad_norm": 1.9614640474319458, + "learning_rate": 1.3798228287472683e-05, + "loss": 0.7215, + "step": 4691 + }, + { + "epoch": 0.355037645189361, + "grad_norm": 2.137275218963623, + "learning_rate": 1.37963226956732e-05, + "loss": 0.8132, + "step": 4692 + }, + { + "epoch": 0.3551133139116946, + "grad_norm": 1.8678892850875854, + "learning_rate": 1.3794416803512477e-05, + "loss": 0.5912, + "step": 4693 + }, + { + "epoch": 0.3551889826340282, + "grad_norm": 2.0661749839782715, + "learning_rate": 1.379251061110987e-05, + "loss": 0.7803, + "step": 4694 + }, + { + "epoch": 0.35526465135636187, + "grad_norm": 2.396214485168457, + "learning_rate": 1.3790604118584744e-05, + "loss": 0.7889, + "step": 4695 + }, + { + "epoch": 0.3553403200786955, + "grad_norm": 2.3043839931488037, + "learning_rate": 1.3788697326056494e-05, + "loss": 0.8395, + "step": 4696 + }, + { + "epoch": 0.3554159888010291, + "grad_norm": 2.3201632499694824, + "learning_rate": 1.3786790233644529e-05, + "loss": 0.875, + "step": 4697 + }, + { + "epoch": 0.3554916575233627, + "grad_norm": 2.381060838699341, + "learning_rate": 1.3784882841468276e-05, + "loss": 0.8286, + "step": 4698 + }, + { + "epoch": 0.35556732624569637, + "grad_norm": 2.4320642948150635, + "learning_rate": 1.3782975149647184e-05, + "loss": 0.6862, + "step": 4699 + }, + { + "epoch": 0.35564299496803, + "grad_norm": 2.320983409881592, + "learning_rate": 1.378106715830072e-05, + "loss": 0.6916, + "step": 4700 + }, + { + "epoch": 0.3557186636903636, + "grad_norm": 2.415428876876831, + "learning_rate": 1.3779158867548367e-05, + "loss": 0.6812, + "step": 4701 + }, + { + "epoch": 0.3557943324126972, + "grad_norm": 2.3122177124023438, + "learning_rate": 1.3777250277509621e-05, + "loss": 0.6497, + "step": 4702 + }, + { + "epoch": 0.3558700011350308, + "grad_norm": 2.4607081413269043, + "learning_rate": 1.3775341388304019e-05, + "loss": 0.6586, + "step": 4703 + }, + { + "epoch": 0.3559456698573645, + "grad_norm": 2.4338831901550293, + "learning_rate": 1.3773432200051093e-05, + "loss": 0.7655, + "step": 4704 + }, + { + "epoch": 0.3560213385796981, + "grad_norm": 3.2349586486816406, + "learning_rate": 1.3771522712870401e-05, + "loss": 0.7787, + "step": 4705 + }, + { + "epoch": 0.3560970073020317, + "grad_norm": 2.1032230854034424, + "learning_rate": 1.3769612926881526e-05, + "loss": 0.7262, + "step": 4706 + }, + { + "epoch": 0.3561726760243653, + "grad_norm": 2.523385524749756, + "learning_rate": 1.3767702842204059e-05, + "loss": 0.7117, + "step": 4707 + }, + { + "epoch": 0.356248344746699, + "grad_norm": 2.341728687286377, + "learning_rate": 1.3765792458957624e-05, + "loss": 0.7133, + "step": 4708 + }, + { + "epoch": 0.3563240134690326, + "grad_norm": 2.2252790927886963, + "learning_rate": 1.3763881777261847e-05, + "loss": 0.771, + "step": 4709 + }, + { + "epoch": 0.3563996821913662, + 
"grad_norm": 2.088966131210327, + "learning_rate": 1.3761970797236386e-05, + "loss": 0.7424, + "step": 4710 + }, + { + "epoch": 0.3564753509136998, + "grad_norm": 2.176795244216919, + "learning_rate": 1.3760059519000912e-05, + "loss": 0.7256, + "step": 4711 + }, + { + "epoch": 0.3565510196360335, + "grad_norm": 2.541574716567993, + "learning_rate": 1.3758147942675115e-05, + "loss": 0.7764, + "step": 4712 + }, + { + "epoch": 0.3566266883583671, + "grad_norm": 2.2796478271484375, + "learning_rate": 1.3756236068378706e-05, + "loss": 0.7737, + "step": 4713 + }, + { + "epoch": 0.3567023570807007, + "grad_norm": 2.664128541946411, + "learning_rate": 1.3754323896231409e-05, + "loss": 0.8546, + "step": 4714 + }, + { + "epoch": 0.3567780258030343, + "grad_norm": 2.4556145668029785, + "learning_rate": 1.3752411426352971e-05, + "loss": 0.7036, + "step": 4715 + }, + { + "epoch": 0.3568536945253679, + "grad_norm": 2.207688093185425, + "learning_rate": 1.375049865886316e-05, + "loss": 0.7345, + "step": 4716 + }, + { + "epoch": 0.3569293632477016, + "grad_norm": 2.008202075958252, + "learning_rate": 1.3748585593881757e-05, + "loss": 0.7968, + "step": 4717 + }, + { + "epoch": 0.3570050319700352, + "grad_norm": 1.9544659852981567, + "learning_rate": 1.3746672231528565e-05, + "loss": 0.7988, + "step": 4718 + }, + { + "epoch": 0.3570807006923688, + "grad_norm": 2.3357014656066895, + "learning_rate": 1.3744758571923408e-05, + "loss": 0.7303, + "step": 4719 + }, + { + "epoch": 0.3571563694147024, + "grad_norm": 2.123999834060669, + "learning_rate": 1.3742844615186122e-05, + "loss": 0.7725, + "step": 4720 + }, + { + "epoch": 0.3572320381370361, + "grad_norm": 2.152657985687256, + "learning_rate": 1.3740930361436565e-05, + "loss": 0.8359, + "step": 4721 + }, + { + "epoch": 0.3573077068593697, + "grad_norm": 2.6231911182403564, + "learning_rate": 1.3739015810794616e-05, + "loss": 0.8116, + "step": 4722 + }, + { + "epoch": 0.3573833755817033, + "grad_norm": 2.069714307785034, + "learning_rate": 1.3737100963380164e-05, + "loss": 0.7383, + "step": 4723 + }, + { + "epoch": 0.3574590443040369, + "grad_norm": 2.1583962440490723, + "learning_rate": 1.3735185819313134e-05, + "loss": 0.7683, + "step": 4724 + }, + { + "epoch": 0.3575347130263706, + "grad_norm": 2.315765619277954, + "learning_rate": 1.3733270378713448e-05, + "loss": 0.708, + "step": 4725 + }, + { + "epoch": 0.3576103817487042, + "grad_norm": 1.5999698638916016, + "learning_rate": 1.3731354641701064e-05, + "loss": 0.7547, + "step": 4726 + }, + { + "epoch": 0.3576860504710378, + "grad_norm": 2.330720901489258, + "learning_rate": 1.3729438608395951e-05, + "loss": 0.7628, + "step": 4727 + }, + { + "epoch": 0.3577617191933714, + "grad_norm": 2.1309075355529785, + "learning_rate": 1.3727522278918094e-05, + "loss": 0.632, + "step": 4728 + }, + { + "epoch": 0.357837387915705, + "grad_norm": 2.6685056686401367, + "learning_rate": 1.3725605653387502e-05, + "loss": 0.8374, + "step": 4729 + }, + { + "epoch": 0.3579130566380387, + "grad_norm": 1.8115298748016357, + "learning_rate": 1.3723688731924195e-05, + "loss": 0.6296, + "step": 4730 + }, + { + "epoch": 0.3579887253603723, + "grad_norm": 2.554457664489746, + "learning_rate": 1.3721771514648227e-05, + "loss": 0.6815, + "step": 4731 + }, + { + "epoch": 0.3580643940827059, + "grad_norm": 2.170767068862915, + "learning_rate": 1.3719854001679654e-05, + "loss": 0.7365, + "step": 4732 + }, + { + "epoch": 0.3581400628050395, + "grad_norm": 1.8011586666107178, + "learning_rate": 1.3717936193138555e-05, + "loss": 0.707, + 
"step": 4733 + }, + { + "epoch": 0.3582157315273732, + "grad_norm": 2.423759937286377, + "learning_rate": 1.371601808914503e-05, + "loss": 0.7133, + "step": 4734 + }, + { + "epoch": 0.3582914002497068, + "grad_norm": 2.005772113800049, + "learning_rate": 1.3714099689819203e-05, + "loss": 0.711, + "step": 4735 + }, + { + "epoch": 0.3583670689720404, + "grad_norm": 2.3678839206695557, + "learning_rate": 1.3712180995281207e-05, + "loss": 0.7133, + "step": 4736 + }, + { + "epoch": 0.358442737694374, + "grad_norm": 1.848137617111206, + "learning_rate": 1.3710262005651195e-05, + "loss": 0.5987, + "step": 4737 + }, + { + "epoch": 0.3585184064167077, + "grad_norm": 1.9859153032302856, + "learning_rate": 1.370834272104934e-05, + "loss": 0.6771, + "step": 4738 + }, + { + "epoch": 0.3585940751390413, + "grad_norm": 1.895273208618164, + "learning_rate": 1.3706423141595834e-05, + "loss": 0.7763, + "step": 4739 + }, + { + "epoch": 0.3586697438613749, + "grad_norm": 2.161606788635254, + "learning_rate": 1.370450326741089e-05, + "loss": 0.7725, + "step": 4740 + }, + { + "epoch": 0.3587454125837085, + "grad_norm": 2.1437206268310547, + "learning_rate": 1.3702583098614734e-05, + "loss": 0.7627, + "step": 4741 + }, + { + "epoch": 0.3588210813060421, + "grad_norm": 2.3979063034057617, + "learning_rate": 1.3700662635327618e-05, + "loss": 0.8281, + "step": 4742 + }, + { + "epoch": 0.3588967500283758, + "grad_norm": 1.9266215562820435, + "learning_rate": 1.36987418776698e-05, + "loss": 0.7498, + "step": 4743 + }, + { + "epoch": 0.3589724187507094, + "grad_norm": 1.760246992111206, + "learning_rate": 1.369682082576157e-05, + "loss": 0.7122, + "step": 4744 + }, + { + "epoch": 0.359048087473043, + "grad_norm": 2.123413562774658, + "learning_rate": 1.369489947972323e-05, + "loss": 0.6687, + "step": 4745 + }, + { + "epoch": 0.3591237561953766, + "grad_norm": 1.9650204181671143, + "learning_rate": 1.3692977839675095e-05, + "loss": 0.6699, + "step": 4746 + }, + { + "epoch": 0.3591994249177103, + "grad_norm": 2.239622116088867, + "learning_rate": 1.3691055905737511e-05, + "loss": 0.6695, + "step": 4747 + }, + { + "epoch": 0.3592750936400439, + "grad_norm": 2.251483201980591, + "learning_rate": 1.3689133678030834e-05, + "loss": 0.7291, + "step": 4748 + }, + { + "epoch": 0.3593507623623775, + "grad_norm": 2.2021195888519287, + "learning_rate": 1.368721115667544e-05, + "loss": 0.682, + "step": 4749 + }, + { + "epoch": 0.3594264310847111, + "grad_norm": 2.117192506790161, + "learning_rate": 1.3685288341791724e-05, + "loss": 0.8493, + "step": 4750 + }, + { + "epoch": 0.3595020998070448, + "grad_norm": 1.8998351097106934, + "learning_rate": 1.3683365233500096e-05, + "loss": 0.6437, + "step": 4751 + }, + { + "epoch": 0.3595777685293784, + "grad_norm": 2.776381731033325, + "learning_rate": 1.3681441831920991e-05, + "loss": 0.8318, + "step": 4752 + }, + { + "epoch": 0.359653437251712, + "grad_norm": 2.061583995819092, + "learning_rate": 1.3679518137174854e-05, + "loss": 0.6635, + "step": 4753 + }, + { + "epoch": 0.3597291059740456, + "grad_norm": 2.730015516281128, + "learning_rate": 1.367759414938216e-05, + "loss": 0.8673, + "step": 4754 + }, + { + "epoch": 0.3598047746963792, + "grad_norm": 1.971379041671753, + "learning_rate": 1.3675669868663386e-05, + "loss": 0.6937, + "step": 4755 + }, + { + "epoch": 0.3598804434187129, + "grad_norm": 2.2239530086517334, + "learning_rate": 1.3673745295139044e-05, + "loss": 0.7792, + "step": 4756 + }, + { + "epoch": 0.3599561121410465, + "grad_norm": 2.4830453395843506, + "learning_rate": 
1.3671820428929654e-05, + "loss": 0.8029, + "step": 4757 + }, + { + "epoch": 0.3600317808633801, + "grad_norm": 3.1367461681365967, + "learning_rate": 1.3669895270155762e-05, + "loss": 0.7619, + "step": 4758 + }, + { + "epoch": 0.3601074495857137, + "grad_norm": 2.4550986289978027, + "learning_rate": 1.3667969818937922e-05, + "loss": 0.6998, + "step": 4759 + }, + { + "epoch": 0.3601831183080474, + "grad_norm": 2.109701633453369, + "learning_rate": 1.3666044075396713e-05, + "loss": 0.7235, + "step": 4760 + }, + { + "epoch": 0.360258787030381, + "grad_norm": 2.4037647247314453, + "learning_rate": 1.3664118039652732e-05, + "loss": 0.8367, + "step": 4761 + }, + { + "epoch": 0.3603344557527146, + "grad_norm": 2.3242437839508057, + "learning_rate": 1.3662191711826594e-05, + "loss": 0.6881, + "step": 4762 + }, + { + "epoch": 0.3604101244750482, + "grad_norm": 2.0473146438598633, + "learning_rate": 1.3660265092038933e-05, + "loss": 0.6282, + "step": 4763 + }, + { + "epoch": 0.3604857931973819, + "grad_norm": 2.0871689319610596, + "learning_rate": 1.3658338180410396e-05, + "loss": 0.7049, + "step": 4764 + }, + { + "epoch": 0.3605614619197155, + "grad_norm": 2.224372625350952, + "learning_rate": 1.3656410977061659e-05, + "loss": 0.8585, + "step": 4765 + }, + { + "epoch": 0.3606371306420491, + "grad_norm": 2.71907114982605, + "learning_rate": 1.3654483482113403e-05, + "loss": 0.6829, + "step": 4766 + }, + { + "epoch": 0.3607127993643827, + "grad_norm": 2.1918084621429443, + "learning_rate": 1.365255569568634e-05, + "loss": 0.744, + "step": 4767 + }, + { + "epoch": 0.36078846808671633, + "grad_norm": 1.7974071502685547, + "learning_rate": 1.3650627617901187e-05, + "loss": 0.8826, + "step": 4768 + }, + { + "epoch": 0.36086413680905, + "grad_norm": 2.0619354248046875, + "learning_rate": 1.3648699248878694e-05, + "loss": 0.771, + "step": 4769 + }, + { + "epoch": 0.3609398055313836, + "grad_norm": 2.1140151023864746, + "learning_rate": 1.3646770588739617e-05, + "loss": 0.8686, + "step": 4770 + }, + { + "epoch": 0.3610154742537172, + "grad_norm": 1.8046315908432007, + "learning_rate": 1.3644841637604734e-05, + "loss": 0.6257, + "step": 4771 + }, + { + "epoch": 0.3610911429760508, + "grad_norm": 2.37791109085083, + "learning_rate": 1.3642912395594848e-05, + "loss": 0.7334, + "step": 4772 + }, + { + "epoch": 0.3611668116983845, + "grad_norm": 1.7635067701339722, + "learning_rate": 1.3640982862830768e-05, + "loss": 0.6244, + "step": 4773 + }, + { + "epoch": 0.3612424804207181, + "grad_norm": 2.401155471801758, + "learning_rate": 1.3639053039433334e-05, + "loss": 0.6292, + "step": 4774 + }, + { + "epoch": 0.3613181491430517, + "grad_norm": 1.8144967555999756, + "learning_rate": 1.3637122925523391e-05, + "loss": 0.8272, + "step": 4775 + }, + { + "epoch": 0.3613938178653853, + "grad_norm": 2.4782304763793945, + "learning_rate": 1.3635192521221815e-05, + "loss": 0.7252, + "step": 4776 + }, + { + "epoch": 0.361469486587719, + "grad_norm": 1.9335335493087769, + "learning_rate": 1.363326182664949e-05, + "loss": 0.8413, + "step": 4777 + }, + { + "epoch": 0.3615451553100526, + "grad_norm": 1.8808588981628418, + "learning_rate": 1.363133084192732e-05, + "loss": 0.6569, + "step": 4778 + }, + { + "epoch": 0.3616208240323862, + "grad_norm": 1.8410663604736328, + "learning_rate": 1.3629399567176237e-05, + "loss": 0.7255, + "step": 4779 + }, + { + "epoch": 0.3616964927547198, + "grad_norm": 1.8325263261795044, + "learning_rate": 1.3627468002517179e-05, + "loss": 0.7194, + "step": 4780 + }, + { + "epoch": 
0.36177216147705343, + "grad_norm": 2.408630847930908, + "learning_rate": 1.3625536148071109e-05, + "loss": 0.7562, + "step": 4781 + }, + { + "epoch": 0.3618478301993871, + "grad_norm": 2.4441497325897217, + "learning_rate": 1.3623604003959004e-05, + "loss": 0.7213, + "step": 4782 + }, + { + "epoch": 0.3619234989217207, + "grad_norm": 2.6353988647460938, + "learning_rate": 1.3621671570301858e-05, + "loss": 0.8314, + "step": 4783 + }, + { + "epoch": 0.3619991676440543, + "grad_norm": 2.562260866165161, + "learning_rate": 1.3619738847220694e-05, + "loss": 0.6955, + "step": 4784 + }, + { + "epoch": 0.36207483636638793, + "grad_norm": 2.1167261600494385, + "learning_rate": 1.361780583483654e-05, + "loss": 0.6659, + "step": 4785 + }, + { + "epoch": 0.3621505050887216, + "grad_norm": 2.0852859020233154, + "learning_rate": 1.3615872533270452e-05, + "loss": 0.8003, + "step": 4786 + }, + { + "epoch": 0.3622261738110552, + "grad_norm": 2.634650945663452, + "learning_rate": 1.3613938942643491e-05, + "loss": 0.851, + "step": 4787 + }, + { + "epoch": 0.3623018425333888, + "grad_norm": 2.0236427783966064, + "learning_rate": 1.3612005063076753e-05, + "loss": 0.6714, + "step": 4788 + }, + { + "epoch": 0.36237751125572243, + "grad_norm": 2.218775510787964, + "learning_rate": 1.361007089469134e-05, + "loss": 0.8034, + "step": 4789 + }, + { + "epoch": 0.3624531799780561, + "grad_norm": 1.8466893434524536, + "learning_rate": 1.3608136437608379e-05, + "loss": 0.8057, + "step": 4790 + }, + { + "epoch": 0.3625288487003897, + "grad_norm": 1.9202516078948975, + "learning_rate": 1.3606201691949005e-05, + "loss": 0.8247, + "step": 4791 + }, + { + "epoch": 0.3626045174227233, + "grad_norm": 2.387627601623535, + "learning_rate": 1.3604266657834388e-05, + "loss": 0.6645, + "step": 4792 + }, + { + "epoch": 0.3626801861450569, + "grad_norm": 2.0650217533111572, + "learning_rate": 1.36023313353857e-05, + "loss": 0.6833, + "step": 4793 + }, + { + "epoch": 0.36275585486739054, + "grad_norm": 2.685912847518921, + "learning_rate": 1.3600395724724133e-05, + "loss": 0.7034, + "step": 4794 + }, + { + "epoch": 0.3628315235897242, + "grad_norm": 2.143637180328369, + "learning_rate": 1.3598459825970912e-05, + "loss": 0.6371, + "step": 4795 + }, + { + "epoch": 0.3629071923120578, + "grad_norm": 2.5087201595306396, + "learning_rate": 1.3596523639247263e-05, + "loss": 0.6024, + "step": 4796 + }, + { + "epoch": 0.3629828610343914, + "grad_norm": 2.4101240634918213, + "learning_rate": 1.3594587164674435e-05, + "loss": 0.7716, + "step": 4797 + }, + { + "epoch": 0.36305852975672503, + "grad_norm": 2.613996982574463, + "learning_rate": 1.3592650402373699e-05, + "loss": 0.7144, + "step": 4798 + }, + { + "epoch": 0.3631341984790587, + "grad_norm": 3.9812352657318115, + "learning_rate": 1.359071335246634e-05, + "loss": 0.6945, + "step": 4799 + }, + { + "epoch": 0.3632098672013923, + "grad_norm": 2.4522552490234375, + "learning_rate": 1.3588776015073662e-05, + "loss": 0.7366, + "step": 4800 + }, + { + "epoch": 0.3632855359237259, + "grad_norm": 2.1591553688049316, + "learning_rate": 1.3586838390316987e-05, + "loss": 0.6077, + "step": 4801 + }, + { + "epoch": 0.36336120464605953, + "grad_norm": 2.159881114959717, + "learning_rate": 1.3584900478317658e-05, + "loss": 0.6745, + "step": 4802 + }, + { + "epoch": 0.3634368733683932, + "grad_norm": 2.3063745498657227, + "learning_rate": 1.3582962279197031e-05, + "loss": 0.7862, + "step": 4803 + }, + { + "epoch": 0.3635125420907268, + "grad_norm": 2.219144582748413, + "learning_rate": 
1.3581023793076485e-05, + "loss": 0.8561, + "step": 4804 + }, + { + "epoch": 0.3635882108130604, + "grad_norm": 6.310362815856934, + "learning_rate": 1.3579085020077409e-05, + "loss": 0.6899, + "step": 4805 + }, + { + "epoch": 0.36366387953539403, + "grad_norm": 2.508697748184204, + "learning_rate": 1.3577145960321223e-05, + "loss": 0.8235, + "step": 4806 + }, + { + "epoch": 0.36373954825772764, + "grad_norm": 2.050865888595581, + "learning_rate": 1.357520661392935e-05, + "loss": 0.6781, + "step": 4807 + }, + { + "epoch": 0.3638152169800613, + "grad_norm": 2.487555742263794, + "learning_rate": 1.357326698102324e-05, + "loss": 0.7999, + "step": 4808 + }, + { + "epoch": 0.3638908857023949, + "grad_norm": 2.141352415084839, + "learning_rate": 1.3571327061724362e-05, + "loss": 0.7551, + "step": 4809 + }, + { + "epoch": 0.36396655442472853, + "grad_norm": 2.2181026935577393, + "learning_rate": 1.3569386856154194e-05, + "loss": 0.8361, + "step": 4810 + }, + { + "epoch": 0.36404222314706214, + "grad_norm": 1.389456033706665, + "learning_rate": 1.3567446364434246e-05, + "loss": 0.8809, + "step": 4811 + }, + { + "epoch": 0.3641178918693958, + "grad_norm": 2.0791516304016113, + "learning_rate": 1.356550558668603e-05, + "loss": 0.6581, + "step": 4812 + }, + { + "epoch": 0.3641935605917294, + "grad_norm": 2.7910525798797607, + "learning_rate": 1.3563564523031091e-05, + "loss": 0.7416, + "step": 4813 + }, + { + "epoch": 0.364269229314063, + "grad_norm": 2.5519115924835205, + "learning_rate": 1.3561623173590978e-05, + "loss": 0.7204, + "step": 4814 + }, + { + "epoch": 0.36434489803639664, + "grad_norm": 2.1502325534820557, + "learning_rate": 1.3559681538487269e-05, + "loss": 0.8517, + "step": 4815 + }, + { + "epoch": 0.3644205667587303, + "grad_norm": 2.1360151767730713, + "learning_rate": 1.3557739617841558e-05, + "loss": 0.7458, + "step": 4816 + }, + { + "epoch": 0.3644962354810639, + "grad_norm": 2.29506254196167, + "learning_rate": 1.3555797411775447e-05, + "loss": 0.705, + "step": 4817 + }, + { + "epoch": 0.3645719042033975, + "grad_norm": 2.2674145698547363, + "learning_rate": 1.3553854920410568e-05, + "loss": 0.7909, + "step": 4818 + }, + { + "epoch": 0.36464757292573113, + "grad_norm": 2.701314926147461, + "learning_rate": 1.3551912143868564e-05, + "loss": 0.7936, + "step": 4819 + }, + { + "epoch": 0.3647232416480648, + "grad_norm": 2.063055992126465, + "learning_rate": 1.35499690822711e-05, + "loss": 0.6838, + "step": 4820 + }, + { + "epoch": 0.3647989103703984, + "grad_norm": 2.2792537212371826, + "learning_rate": 1.3548025735739852e-05, + "loss": 0.7913, + "step": 4821 + }, + { + "epoch": 0.364874579092732, + "grad_norm": 2.7173550128936768, + "learning_rate": 1.3546082104396528e-05, + "loss": 0.7174, + "step": 4822 + }, + { + "epoch": 0.36495024781506563, + "grad_norm": 2.250936508178711, + "learning_rate": 1.3544138188362835e-05, + "loss": 0.8527, + "step": 4823 + }, + { + "epoch": 0.36502591653739924, + "grad_norm": 2.110093355178833, + "learning_rate": 1.354219398776051e-05, + "loss": 0.9162, + "step": 4824 + }, + { + "epoch": 0.3651015852597329, + "grad_norm": 2.133039951324463, + "learning_rate": 1.354024950271131e-05, + "loss": 0.7488, + "step": 4825 + }, + { + "epoch": 0.3651772539820665, + "grad_norm": 1.9301999807357788, + "learning_rate": 1.3538304733337e-05, + "loss": 0.7952, + "step": 4826 + }, + { + "epoch": 0.36525292270440013, + "grad_norm": 2.617494821548462, + "learning_rate": 1.3536359679759369e-05, + "loss": 0.6829, + "step": 4827 + }, + { + "epoch": 0.36532859142673374, + 
"grad_norm": 2.2446556091308594, + "learning_rate": 1.3534414342100221e-05, + "loss": 0.684, + "step": 4828 + }, + { + "epoch": 0.3654042601490674, + "grad_norm": 2.0811996459960938, + "learning_rate": 1.3532468720481382e-05, + "loss": 0.6984, + "step": 4829 + }, + { + "epoch": 0.365479928871401, + "grad_norm": 2.136030673980713, + "learning_rate": 1.3530522815024692e-05, + "loss": 0.6672, + "step": 4830 + }, + { + "epoch": 0.3655555975937346, + "grad_norm": 2.017378330230713, + "learning_rate": 1.3528576625852012e-05, + "loss": 0.8001, + "step": 4831 + }, + { + "epoch": 0.36563126631606824, + "grad_norm": 2.336763620376587, + "learning_rate": 1.3526630153085214e-05, + "loss": 0.8168, + "step": 4832 + }, + { + "epoch": 0.3657069350384019, + "grad_norm": 2.5141499042510986, + "learning_rate": 1.352468339684619e-05, + "loss": 0.7466, + "step": 4833 + }, + { + "epoch": 0.3657826037607355, + "grad_norm": 2.2805662155151367, + "learning_rate": 1.3522736357256866e-05, + "loss": 0.6801, + "step": 4834 + }, + { + "epoch": 0.3658582724830691, + "grad_norm": 2.272472381591797, + "learning_rate": 1.3520789034439158e-05, + "loss": 0.74, + "step": 4835 + }, + { + "epoch": 0.36593394120540274, + "grad_norm": 2.805711507797241, + "learning_rate": 1.351884142851502e-05, + "loss": 0.6199, + "step": 4836 + }, + { + "epoch": 0.36600960992773635, + "grad_norm": 2.3359363079071045, + "learning_rate": 1.3516893539606415e-05, + "loss": 0.6921, + "step": 4837 + }, + { + "epoch": 0.36608527865007, + "grad_norm": 2.1814374923706055, + "learning_rate": 1.3514945367835328e-05, + "loss": 0.6558, + "step": 4838 + }, + { + "epoch": 0.3661609473724036, + "grad_norm": 1.8459466695785522, + "learning_rate": 1.3512996913323758e-05, + "loss": 0.6672, + "step": 4839 + }, + { + "epoch": 0.36623661609473723, + "grad_norm": 2.3520541191101074, + "learning_rate": 1.3511048176193727e-05, + "loss": 0.842, + "step": 4840 + }, + { + "epoch": 0.36631228481707084, + "grad_norm": 1.4185298681259155, + "learning_rate": 1.3509099156567269e-05, + "loss": 0.9015, + "step": 4841 + }, + { + "epoch": 0.3663879535394045, + "grad_norm": 1.6182681322097778, + "learning_rate": 1.3507149854566433e-05, + "loss": 0.7128, + "step": 4842 + }, + { + "epoch": 0.3664636222617381, + "grad_norm": 2.5852813720703125, + "learning_rate": 1.3505200270313298e-05, + "loss": 0.7549, + "step": 4843 + }, + { + "epoch": 0.36653929098407173, + "grad_norm": 2.689218044281006, + "learning_rate": 1.3503250403929951e-05, + "loss": 0.6497, + "step": 4844 + }, + { + "epoch": 0.36661495970640534, + "grad_norm": 2.295428514480591, + "learning_rate": 1.3501300255538499e-05, + "loss": 0.8129, + "step": 4845 + }, + { + "epoch": 0.366690628428739, + "grad_norm": 2.2084271907806396, + "learning_rate": 1.3499349825261065e-05, + "loss": 0.6761, + "step": 4846 + }, + { + "epoch": 0.3667662971510726, + "grad_norm": 2.2909162044525146, + "learning_rate": 1.3497399113219792e-05, + "loss": 0.8429, + "step": 4847 + }, + { + "epoch": 0.36684196587340623, + "grad_norm": 2.24273419380188, + "learning_rate": 1.349544811953684e-05, + "loss": 0.7073, + "step": 4848 + }, + { + "epoch": 0.36691763459573984, + "grad_norm": 1.7732197046279907, + "learning_rate": 1.3493496844334386e-05, + "loss": 0.6471, + "step": 4849 + }, + { + "epoch": 0.36699330331807345, + "grad_norm": 2.0910346508026123, + "learning_rate": 1.3491545287734628e-05, + "loss": 0.7475, + "step": 4850 + }, + { + "epoch": 0.3670689720404071, + "grad_norm": 2.107093095779419, + "learning_rate": 1.3489593449859774e-05, + "loss": 
0.7203, + "step": 4851 + }, + { + "epoch": 0.3671446407627407, + "grad_norm": 2.1504039764404297, + "learning_rate": 1.348764133083206e-05, + "loss": 0.6333, + "step": 4852 + }, + { + "epoch": 0.36722030948507434, + "grad_norm": 1.9339256286621094, + "learning_rate": 1.3485688930773729e-05, + "loss": 0.6488, + "step": 4853 + }, + { + "epoch": 0.36729597820740795, + "grad_norm": 2.6421656608581543, + "learning_rate": 1.348373624980705e-05, + "loss": 0.6136, + "step": 4854 + }, + { + "epoch": 0.3673716469297416, + "grad_norm": 2.230567693710327, + "learning_rate": 1.3481783288054306e-05, + "loss": 0.7886, + "step": 4855 + }, + { + "epoch": 0.3674473156520752, + "grad_norm": 2.4279305934906006, + "learning_rate": 1.3479830045637794e-05, + "loss": 0.8438, + "step": 4856 + }, + { + "epoch": 0.36752298437440883, + "grad_norm": 2.177305221557617, + "learning_rate": 1.3477876522679835e-05, + "loss": 0.6058, + "step": 4857 + }, + { + "epoch": 0.36759865309674244, + "grad_norm": 2.3177402019500732, + "learning_rate": 1.3475922719302765e-05, + "loss": 0.8804, + "step": 4858 + }, + { + "epoch": 0.3676743218190761, + "grad_norm": 2.5845775604248047, + "learning_rate": 1.3473968635628939e-05, + "loss": 0.7402, + "step": 4859 + }, + { + "epoch": 0.3677499905414097, + "grad_norm": 2.1138968467712402, + "learning_rate": 1.3472014271780725e-05, + "loss": 0.5874, + "step": 4860 + }, + { + "epoch": 0.36782565926374333, + "grad_norm": 2.863762378692627, + "learning_rate": 1.3470059627880516e-05, + "loss": 0.5876, + "step": 4861 + }, + { + "epoch": 0.36790132798607694, + "grad_norm": 2.387801170349121, + "learning_rate": 1.3468104704050713e-05, + "loss": 0.7677, + "step": 4862 + }, + { + "epoch": 0.36797699670841055, + "grad_norm": 2.5543148517608643, + "learning_rate": 1.3466149500413742e-05, + "loss": 0.8206, + "step": 4863 + }, + { + "epoch": 0.3680526654307442, + "grad_norm": 2.3808276653289795, + "learning_rate": 1.3464194017092043e-05, + "loss": 0.8768, + "step": 4864 + }, + { + "epoch": 0.36812833415307783, + "grad_norm": 3.0910837650299072, + "learning_rate": 1.3462238254208076e-05, + "loss": 0.7585, + "step": 4865 + }, + { + "epoch": 0.36820400287541144, + "grad_norm": 2.5641915798187256, + "learning_rate": 1.3460282211884317e-05, + "loss": 0.911, + "step": 4866 + }, + { + "epoch": 0.36827967159774505, + "grad_norm": 2.5306897163391113, + "learning_rate": 1.345832589024326e-05, + "loss": 0.7569, + "step": 4867 + }, + { + "epoch": 0.3683553403200787, + "grad_norm": 2.912998914718628, + "learning_rate": 1.3456369289407418e-05, + "loss": 0.6358, + "step": 4868 + }, + { + "epoch": 0.3684310090424123, + "grad_norm": 2.1079254150390625, + "learning_rate": 1.3454412409499314e-05, + "loss": 0.612, + "step": 4869 + }, + { + "epoch": 0.36850667776474594, + "grad_norm": 2.1984755992889404, + "learning_rate": 1.3452455250641498e-05, + "loss": 0.6124, + "step": 4870 + }, + { + "epoch": 0.36858234648707955, + "grad_norm": 2.0500175952911377, + "learning_rate": 1.3450497812956535e-05, + "loss": 0.6647, + "step": 4871 + }, + { + "epoch": 0.3686580152094132, + "grad_norm": 2.169865846633911, + "learning_rate": 1.3448540096567004e-05, + "loss": 0.6461, + "step": 4872 + }, + { + "epoch": 0.3687336839317468, + "grad_norm": 4.670175075531006, + "learning_rate": 1.3446582101595503e-05, + "loss": 0.6869, + "step": 4873 + }, + { + "epoch": 0.36880935265408044, + "grad_norm": 1.9263705015182495, + "learning_rate": 1.3444623828164646e-05, + "loss": 0.729, + "step": 4874 + }, + { + "epoch": 0.36888502137641405, + "grad_norm": 
2.334681510925293, + "learning_rate": 1.3442665276397076e-05, + "loss": 0.8118, + "step": 4875 + }, + { + "epoch": 0.36896069009874766, + "grad_norm": 2.442364454269409, + "learning_rate": 1.3440706446415433e-05, + "loss": 0.7216, + "step": 4876 + }, + { + "epoch": 0.3690363588210813, + "grad_norm": 3.1642048358917236, + "learning_rate": 1.3438747338342389e-05, + "loss": 0.7009, + "step": 4877 + }, + { + "epoch": 0.36911202754341493, + "grad_norm": 2.8377344608306885, + "learning_rate": 1.3436787952300629e-05, + "loss": 0.758, + "step": 4878 + }, + { + "epoch": 0.36918769626574854, + "grad_norm": 2.899456262588501, + "learning_rate": 1.3434828288412859e-05, + "loss": 0.6575, + "step": 4879 + }, + { + "epoch": 0.36926336498808215, + "grad_norm": 2.240098476409912, + "learning_rate": 1.34328683468018e-05, + "loss": 0.7384, + "step": 4880 + }, + { + "epoch": 0.3693390337104158, + "grad_norm": 2.007436513900757, + "learning_rate": 1.3430908127590185e-05, + "loss": 0.757, + "step": 4881 + }, + { + "epoch": 0.36941470243274943, + "grad_norm": 1.9840151071548462, + "learning_rate": 1.342894763090077e-05, + "loss": 0.6856, + "step": 4882 + }, + { + "epoch": 0.36949037115508304, + "grad_norm": 2.434241771697998, + "learning_rate": 1.3426986856856331e-05, + "loss": 0.8133, + "step": 4883 + }, + { + "epoch": 0.36956603987741665, + "grad_norm": 2.2398934364318848, + "learning_rate": 1.3425025805579656e-05, + "loss": 0.7309, + "step": 4884 + }, + { + "epoch": 0.3696417085997503, + "grad_norm": 1.9073582887649536, + "learning_rate": 1.3423064477193551e-05, + "loss": 0.8249, + "step": 4885 + }, + { + "epoch": 0.36971737732208393, + "grad_norm": 2.0606343746185303, + "learning_rate": 1.3421102871820848e-05, + "loss": 0.7246, + "step": 4886 + }, + { + "epoch": 0.36979304604441754, + "grad_norm": 2.2094714641571045, + "learning_rate": 1.341914098958438e-05, + "loss": 0.7137, + "step": 4887 + }, + { + "epoch": 0.36986871476675115, + "grad_norm": 2.191936731338501, + "learning_rate": 1.341717883060701e-05, + "loss": 0.6196, + "step": 4888 + }, + { + "epoch": 0.36994438348908476, + "grad_norm": 1.9013859033584595, + "learning_rate": 1.3415216395011615e-05, + "loss": 0.7521, + "step": 4889 + }, + { + "epoch": 0.3700200522114184, + "grad_norm": 2.026242733001709, + "learning_rate": 1.3413253682921088e-05, + "loss": 0.5991, + "step": 4890 + }, + { + "epoch": 0.37009572093375204, + "grad_norm": 1.986952304840088, + "learning_rate": 1.3411290694458343e-05, + "loss": 0.6441, + "step": 4891 + }, + { + "epoch": 0.37017138965608565, + "grad_norm": 1.9378926753997803, + "learning_rate": 1.3409327429746304e-05, + "loss": 0.7499, + "step": 4892 + }, + { + "epoch": 0.37024705837841926, + "grad_norm": 2.1102092266082764, + "learning_rate": 1.3407363888907925e-05, + "loss": 0.7828, + "step": 4893 + }, + { + "epoch": 0.3703227271007529, + "grad_norm": 1.809720516204834, + "learning_rate": 1.340540007206616e-05, + "loss": 0.7066, + "step": 4894 + }, + { + "epoch": 0.37039839582308653, + "grad_norm": 2.9328057765960693, + "learning_rate": 1.3403435979343995e-05, + "loss": 0.8314, + "step": 4895 + }, + { + "epoch": 0.37047406454542015, + "grad_norm": 1.9853700399398804, + "learning_rate": 1.3401471610864426e-05, + "loss": 0.8033, + "step": 4896 + }, + { + "epoch": 0.37054973326775376, + "grad_norm": 2.24923038482666, + "learning_rate": 1.3399506966750466e-05, + "loss": 0.779, + "step": 4897 + }, + { + "epoch": 0.3706254019900874, + "grad_norm": 2.0755980014801025, + "learning_rate": 1.3397542047125156e-05, + "loss": 0.6533, + 
"step": 4898 + }, + { + "epoch": 0.37070107071242103, + "grad_norm": 1.9873055219650269, + "learning_rate": 1.3395576852111535e-05, + "loss": 0.9006, + "step": 4899 + }, + { + "epoch": 0.37077673943475464, + "grad_norm": 1.7531121969223022, + "learning_rate": 1.3393611381832675e-05, + "loss": 0.7746, + "step": 4900 + }, + { + "epoch": 0.37085240815708825, + "grad_norm": 2.0152571201324463, + "learning_rate": 1.3391645636411661e-05, + "loss": 0.6594, + "step": 4901 + }, + { + "epoch": 0.37092807687942186, + "grad_norm": 1.8413496017456055, + "learning_rate": 1.3389679615971593e-05, + "loss": 0.793, + "step": 4902 + }, + { + "epoch": 0.37100374560175553, + "grad_norm": 2.1065332889556885, + "learning_rate": 1.338771332063559e-05, + "loss": 0.7577, + "step": 4903 + }, + { + "epoch": 0.37107941432408914, + "grad_norm": 2.4198694229125977, + "learning_rate": 1.3385746750526784e-05, + "loss": 0.7809, + "step": 4904 + }, + { + "epoch": 0.37115508304642275, + "grad_norm": 2.0909576416015625, + "learning_rate": 1.3383779905768336e-05, + "loss": 0.6867, + "step": 4905 + }, + { + "epoch": 0.37123075176875636, + "grad_norm": 2.1321213245391846, + "learning_rate": 1.3381812786483408e-05, + "loss": 0.7633, + "step": 4906 + }, + { + "epoch": 0.37130642049109003, + "grad_norm": 3.150540351867676, + "learning_rate": 1.3379845392795192e-05, + "loss": 0.7857, + "step": 4907 + }, + { + "epoch": 0.37138208921342364, + "grad_norm": 2.434208631515503, + "learning_rate": 1.337787772482689e-05, + "loss": 0.8088, + "step": 4908 + }, + { + "epoch": 0.37145775793575725, + "grad_norm": 3.1003522872924805, + "learning_rate": 1.3375909782701728e-05, + "loss": 0.775, + "step": 4909 + }, + { + "epoch": 0.37153342665809086, + "grad_norm": 2.1573758125305176, + "learning_rate": 1.337394156654294e-05, + "loss": 0.6811, + "step": 4910 + }, + { + "epoch": 0.3716090953804245, + "grad_norm": 2.903069019317627, + "learning_rate": 1.3371973076473783e-05, + "loss": 0.6592, + "step": 4911 + }, + { + "epoch": 0.37168476410275814, + "grad_norm": 2.0902886390686035, + "learning_rate": 1.3370004312617533e-05, + "loss": 0.772, + "step": 4912 + }, + { + "epoch": 0.37176043282509175, + "grad_norm": 2.2064809799194336, + "learning_rate": 1.3368035275097477e-05, + "loss": 0.6503, + "step": 4913 + }, + { + "epoch": 0.37183610154742536, + "grad_norm": 2.047928810119629, + "learning_rate": 1.3366065964036927e-05, + "loss": 0.7396, + "step": 4914 + }, + { + "epoch": 0.37191177026975897, + "grad_norm": 2.1945924758911133, + "learning_rate": 1.3364096379559203e-05, + "loss": 0.757, + "step": 4915 + }, + { + "epoch": 0.37198743899209263, + "grad_norm": 2.299427032470703, + "learning_rate": 1.3362126521787649e-05, + "loss": 0.8464, + "step": 4916 + }, + { + "epoch": 0.37206310771442624, + "grad_norm": 2.420886278152466, + "learning_rate": 1.3360156390845623e-05, + "loss": 0.7771, + "step": 4917 + }, + { + "epoch": 0.37213877643675985, + "grad_norm": 2.385572910308838, + "learning_rate": 1.33581859868565e-05, + "loss": 0.7385, + "step": 4918 + }, + { + "epoch": 0.37221444515909347, + "grad_norm": 2.7393147945404053, + "learning_rate": 1.3356215309943676e-05, + "loss": 0.8137, + "step": 4919 + }, + { + "epoch": 0.37229011388142713, + "grad_norm": 1.9456791877746582, + "learning_rate": 1.3354244360230558e-05, + "loss": 0.783, + "step": 4920 + }, + { + "epoch": 0.37236578260376074, + "grad_norm": 1.7846276760101318, + "learning_rate": 1.3352273137840579e-05, + "loss": 0.5752, + "step": 4921 + }, + { + "epoch": 0.37244145132609435, + "grad_norm": 
2.710305690765381, + "learning_rate": 1.3350301642897174e-05, + "loss": 0.6808, + "step": 4922 + }, + { + "epoch": 0.37251712004842796, + "grad_norm": 2.421003818511963, + "learning_rate": 1.3348329875523812e-05, + "loss": 0.7287, + "step": 4923 + }, + { + "epoch": 0.37259278877076163, + "grad_norm": 2.223174810409546, + "learning_rate": 1.3346357835843968e-05, + "loss": 0.6746, + "step": 4924 + }, + { + "epoch": 0.37266845749309524, + "grad_norm": 2.102065086364746, + "learning_rate": 1.334438552398114e-05, + "loss": 0.6813, + "step": 4925 + }, + { + "epoch": 0.37274412621542885, + "grad_norm": 2.145731210708618, + "learning_rate": 1.334241294005884e-05, + "loss": 0.7174, + "step": 4926 + }, + { + "epoch": 0.37281979493776246, + "grad_norm": 2.4170005321502686, + "learning_rate": 1.3340440084200594e-05, + "loss": 0.7821, + "step": 4927 + }, + { + "epoch": 0.3728954636600961, + "grad_norm": 2.3091304302215576, + "learning_rate": 1.3338466956529953e-05, + "loss": 0.6898, + "step": 4928 + }, + { + "epoch": 0.37297113238242974, + "grad_norm": 2.37115216255188, + "learning_rate": 1.3336493557170476e-05, + "loss": 0.6841, + "step": 4929 + }, + { + "epoch": 0.37304680110476335, + "grad_norm": 2.1162335872650146, + "learning_rate": 1.3334519886245749e-05, + "loss": 0.5999, + "step": 4930 + }, + { + "epoch": 0.37312246982709696, + "grad_norm": 1.9648668766021729, + "learning_rate": 1.3332545943879367e-05, + "loss": 0.7191, + "step": 4931 + }, + { + "epoch": 0.37319813854943057, + "grad_norm": 2.149312973022461, + "learning_rate": 1.3330571730194945e-05, + "loss": 0.6175, + "step": 4932 + }, + { + "epoch": 0.37327380727176424, + "grad_norm": 2.0248279571533203, + "learning_rate": 1.3328597245316115e-05, + "loss": 0.6749, + "step": 4933 + }, + { + "epoch": 0.37334947599409785, + "grad_norm": 2.0984058380126953, + "learning_rate": 1.3326622489366525e-05, + "loss": 0.6864, + "step": 4934 + }, + { + "epoch": 0.37342514471643146, + "grad_norm": 1.864095687866211, + "learning_rate": 1.3324647462469841e-05, + "loss": 0.8771, + "step": 4935 + }, + { + "epoch": 0.37350081343876507, + "grad_norm": 2.171860933303833, + "learning_rate": 1.3322672164749742e-05, + "loss": 0.6689, + "step": 4936 + }, + { + "epoch": 0.37357648216109873, + "grad_norm": 2.0032029151916504, + "learning_rate": 1.3320696596329935e-05, + "loss": 0.7789, + "step": 4937 + }, + { + "epoch": 0.37365215088343234, + "grad_norm": 1.7304359674453735, + "learning_rate": 1.3318720757334126e-05, + "loss": 0.5789, + "step": 4938 + }, + { + "epoch": 0.37372781960576595, + "grad_norm": 2.128831624984741, + "learning_rate": 1.3316744647886063e-05, + "loss": 0.7089, + "step": 4939 + }, + { + "epoch": 0.37380348832809956, + "grad_norm": 2.6077706813812256, + "learning_rate": 1.3314768268109483e-05, + "loss": 0.6515, + "step": 4940 + }, + { + "epoch": 0.37387915705043323, + "grad_norm": 2.1214993000030518, + "learning_rate": 1.3312791618128161e-05, + "loss": 0.689, + "step": 4941 + }, + { + "epoch": 0.37395482577276684, + "grad_norm": 2.303440809249878, + "learning_rate": 1.3310814698065876e-05, + "loss": 0.8364, + "step": 4942 + }, + { + "epoch": 0.37403049449510045, + "grad_norm": 2.154649496078491, + "learning_rate": 1.3308837508046431e-05, + "loss": 0.7713, + "step": 4943 + }, + { + "epoch": 0.37410616321743406, + "grad_norm": 1.6795384883880615, + "learning_rate": 1.3306860048193649e-05, + "loss": 0.5444, + "step": 4944 + }, + { + "epoch": 0.3741818319397677, + "grad_norm": 1.9569112062454224, + "learning_rate": 1.3304882318631358e-05, + "loss": 
0.7279, + "step": 4945 + }, + { + "epoch": 0.37425750066210134, + "grad_norm": 2.33054518699646, + "learning_rate": 1.3302904319483413e-05, + "loss": 0.8949, + "step": 4946 + }, + { + "epoch": 0.37433316938443495, + "grad_norm": 2.1638824939727783, + "learning_rate": 1.3300926050873681e-05, + "loss": 0.6947, + "step": 4947 + }, + { + "epoch": 0.37440883810676856, + "grad_norm": 2.491903066635132, + "learning_rate": 1.3298947512926052e-05, + "loss": 0.7139, + "step": 4948 + }, + { + "epoch": 0.37448450682910217, + "grad_norm": 3.184447765350342, + "learning_rate": 1.3296968705764422e-05, + "loss": 0.9073, + "step": 4949 + }, + { + "epoch": 0.37456017555143584, + "grad_norm": 2.1764798164367676, + "learning_rate": 1.3294989629512715e-05, + "loss": 0.8647, + "step": 4950 + }, + { + "epoch": 0.37463584427376945, + "grad_norm": 2.0613832473754883, + "learning_rate": 1.3293010284294867e-05, + "loss": 0.7197, + "step": 4951 + }, + { + "epoch": 0.37471151299610306, + "grad_norm": 2.1119112968444824, + "learning_rate": 1.3291030670234827e-05, + "loss": 0.7002, + "step": 4952 + }, + { + "epoch": 0.37478718171843667, + "grad_norm": 2.2408530712127686, + "learning_rate": 1.328905078745657e-05, + "loss": 0.6455, + "step": 4953 + }, + { + "epoch": 0.37486285044077033, + "grad_norm": 2.5578296184539795, + "learning_rate": 1.3287070636084077e-05, + "loss": 0.7374, + "step": 4954 + }, + { + "epoch": 0.37493851916310394, + "grad_norm": 2.552988052368164, + "learning_rate": 1.3285090216241359e-05, + "loss": 0.7484, + "step": 4955 + }, + { + "epoch": 0.37501418788543756, + "grad_norm": 2.490983724594116, + "learning_rate": 1.328310952805243e-05, + "loss": 0.8299, + "step": 4956 + }, + { + "epoch": 0.37508985660777117, + "grad_norm": 2.3703956604003906, + "learning_rate": 1.3281128571641329e-05, + "loss": 0.6885, + "step": 4957 + }, + { + "epoch": 0.3751655253301048, + "grad_norm": 2.2803077697753906, + "learning_rate": 1.3279147347132111e-05, + "loss": 0.6471, + "step": 4958 + }, + { + "epoch": 0.37524119405243844, + "grad_norm": 2.4633090496063232, + "learning_rate": 1.327716585464884e-05, + "loss": 0.7724, + "step": 4959 + }, + { + "epoch": 0.37531686277477205, + "grad_norm": 1.877164363861084, + "learning_rate": 1.3275184094315617e-05, + "loss": 0.8416, + "step": 4960 + }, + { + "epoch": 0.37539253149710566, + "grad_norm": 2.59460711479187, + "learning_rate": 1.3273202066256534e-05, + "loss": 0.8155, + "step": 4961 + }, + { + "epoch": 0.3754682002194393, + "grad_norm": 1.790168285369873, + "learning_rate": 1.3271219770595716e-05, + "loss": 0.7016, + "step": 4962 + }, + { + "epoch": 0.37554386894177294, + "grad_norm": 2.1875531673431396, + "learning_rate": 1.3269237207457305e-05, + "loss": 0.7702, + "step": 4963 + }, + { + "epoch": 0.37561953766410655, + "grad_norm": 1.8856444358825684, + "learning_rate": 1.3267254376965449e-05, + "loss": 0.7305, + "step": 4964 + }, + { + "epoch": 0.37569520638644016, + "grad_norm": 2.1934876441955566, + "learning_rate": 1.3265271279244324e-05, + "loss": 0.7793, + "step": 4965 + }, + { + "epoch": 0.37577087510877377, + "grad_norm": 1.9856321811676025, + "learning_rate": 1.3263287914418111e-05, + "loss": 0.7695, + "step": 4966 + }, + { + "epoch": 0.37584654383110744, + "grad_norm": 2.125422239303589, + "learning_rate": 1.3261304282611025e-05, + "loss": 0.9119, + "step": 4967 + }, + { + "epoch": 0.37592221255344105, + "grad_norm": 2.8048200607299805, + "learning_rate": 1.3259320383947279e-05, + "loss": 0.7251, + "step": 4968 + }, + { + "epoch": 0.37599788127577466, + 
"grad_norm": 2.0084269046783447, + "learning_rate": 1.3257336218551115e-05, + "loss": 0.6035, + "step": 4969 + }, + { + "epoch": 0.37607354999810827, + "grad_norm": 2.4063563346862793, + "learning_rate": 1.3255351786546786e-05, + "loss": 0.7344, + "step": 4970 + }, + { + "epoch": 0.3761492187204419, + "grad_norm": 2.7517759799957275, + "learning_rate": 1.3253367088058567e-05, + "loss": 0.7425, + "step": 4971 + }, + { + "epoch": 0.37622488744277555, + "grad_norm": 2.8988542556762695, + "learning_rate": 1.3251382123210743e-05, + "loss": 0.7319, + "step": 4972 + }, + { + "epoch": 0.37630055616510916, + "grad_norm": 2.421642541885376, + "learning_rate": 1.324939689212762e-05, + "loss": 0.888, + "step": 4973 + }, + { + "epoch": 0.37637622488744277, + "grad_norm": 2.56626558303833, + "learning_rate": 1.324741139493352e-05, + "loss": 0.654, + "step": 4974 + }, + { + "epoch": 0.3764518936097764, + "grad_norm": 1.87079918384552, + "learning_rate": 1.3245425631752777e-05, + "loss": 0.7278, + "step": 4975 + }, + { + "epoch": 0.37652756233211004, + "grad_norm": 2.264610767364502, + "learning_rate": 1.3243439602709754e-05, + "loss": 0.5887, + "step": 4976 + }, + { + "epoch": 0.37660323105444365, + "grad_norm": 2.095689058303833, + "learning_rate": 1.3241453307928816e-05, + "loss": 0.7291, + "step": 4977 + }, + { + "epoch": 0.37667889977677727, + "grad_norm": 1.5480690002441406, + "learning_rate": 1.3239466747534355e-05, + "loss": 0.6431, + "step": 4978 + }, + { + "epoch": 0.3767545684991109, + "grad_norm": 2.456465721130371, + "learning_rate": 1.3237479921650772e-05, + "loss": 0.6346, + "step": 4979 + }, + { + "epoch": 0.37683023722144454, + "grad_norm": 1.8623820543289185, + "learning_rate": 1.323549283040249e-05, + "loss": 0.7686, + "step": 4980 + }, + { + "epoch": 0.37690590594377815, + "grad_norm": 3.514700174331665, + "learning_rate": 1.3233505473913951e-05, + "loss": 0.6733, + "step": 4981 + }, + { + "epoch": 0.37698157466611176, + "grad_norm": 2.421954393386841, + "learning_rate": 1.3231517852309602e-05, + "loss": 0.6811, + "step": 4982 + }, + { + "epoch": 0.3770572433884454, + "grad_norm": 3.265939950942993, + "learning_rate": 1.3229529965713925e-05, + "loss": 0.8858, + "step": 4983 + }, + { + "epoch": 0.377132912110779, + "grad_norm": 2.31626558303833, + "learning_rate": 1.3227541814251395e-05, + "loss": 0.8614, + "step": 4984 + }, + { + "epoch": 0.37720858083311265, + "grad_norm": 1.9577152729034424, + "learning_rate": 1.3225553398046527e-05, + "loss": 0.7351, + "step": 4985 + }, + { + "epoch": 0.37728424955544626, + "grad_norm": 1.793585181236267, + "learning_rate": 1.3223564717223837e-05, + "loss": 0.7094, + "step": 4986 + }, + { + "epoch": 0.37735991827777987, + "grad_norm": 2.228999137878418, + "learning_rate": 1.3221575771907864e-05, + "loss": 0.6827, + "step": 4987 + }, + { + "epoch": 0.3774355870001135, + "grad_norm": 2.4481256008148193, + "learning_rate": 1.321958656222316e-05, + "loss": 0.7283, + "step": 4988 + }, + { + "epoch": 0.37751125572244715, + "grad_norm": 2.0559909343719482, + "learning_rate": 1.32175970882943e-05, + "loss": 0.6605, + "step": 4989 + }, + { + "epoch": 0.37758692444478076, + "grad_norm": 2.7114646434783936, + "learning_rate": 1.3215607350245869e-05, + "loss": 0.7703, + "step": 4990 + }, + { + "epoch": 0.37766259316711437, + "grad_norm": 2.345587968826294, + "learning_rate": 1.3213617348202471e-05, + "loss": 0.8246, + "step": 4991 + }, + { + "epoch": 0.377738261889448, + "grad_norm": 2.660860538482666, + "learning_rate": 1.3211627082288725e-05, + "loss": 
0.7403, + "step": 4992 + }, + { + "epoch": 0.37781393061178165, + "grad_norm": 2.2472615242004395, + "learning_rate": 1.320963655262927e-05, + "loss": 0.7548, + "step": 4993 + }, + { + "epoch": 0.37788959933411526, + "grad_norm": 2.2617874145507812, + "learning_rate": 1.3207645759348759e-05, + "loss": 0.7703, + "step": 4994 + }, + { + "epoch": 0.37796526805644887, + "grad_norm": 2.100846290588379, + "learning_rate": 1.3205654702571858e-05, + "loss": 0.7233, + "step": 4995 + }, + { + "epoch": 0.3780409367787825, + "grad_norm": 2.057562828063965, + "learning_rate": 1.320366338242326e-05, + "loss": 0.6308, + "step": 4996 + }, + { + "epoch": 0.3781166055011161, + "grad_norm": 2.615999937057495, + "learning_rate": 1.3201671799027663e-05, + "loss": 0.7651, + "step": 4997 + }, + { + "epoch": 0.37819227422344975, + "grad_norm": 2.463115692138672, + "learning_rate": 1.319967995250979e-05, + "loss": 0.7944, + "step": 4998 + }, + { + "epoch": 0.37826794294578336, + "grad_norm": 2.52860689163208, + "learning_rate": 1.3197687842994374e-05, + "loss": 0.7515, + "step": 4999 + }, + { + "epoch": 0.378343611668117, + "grad_norm": 1.9558433294296265, + "learning_rate": 1.3195695470606167e-05, + "loss": 0.8407, + "step": 5000 + }, + { + "epoch": 0.3784192803904506, + "grad_norm": 2.168161153793335, + "learning_rate": 1.319370283546994e-05, + "loss": 0.7869, + "step": 5001 + }, + { + "epoch": 0.37849494911278425, + "grad_norm": 2.3098533153533936, + "learning_rate": 1.3191709937710478e-05, + "loss": 0.7613, + "step": 5002 + }, + { + "epoch": 0.37857061783511786, + "grad_norm": 2.507798910140991, + "learning_rate": 1.3189716777452581e-05, + "loss": 0.6891, + "step": 5003 + }, + { + "epoch": 0.3786462865574515, + "grad_norm": 2.061244249343872, + "learning_rate": 1.318772335482107e-05, + "loss": 0.72, + "step": 5004 + }, + { + "epoch": 0.3787219552797851, + "grad_norm": 2.2283413410186768, + "learning_rate": 1.3185729669940776e-05, + "loss": 0.7914, + "step": 5005 + }, + { + "epoch": 0.37879762400211875, + "grad_norm": 2.303812026977539, + "learning_rate": 1.3183735722936554e-05, + "loss": 0.7093, + "step": 5006 + }, + { + "epoch": 0.37887329272445236, + "grad_norm": 2.085308313369751, + "learning_rate": 1.3181741513933265e-05, + "loss": 0.8941, + "step": 5007 + }, + { + "epoch": 0.37894896144678597, + "grad_norm": 1.7646946907043457, + "learning_rate": 1.3179747043055802e-05, + "loss": 0.6162, + "step": 5008 + }, + { + "epoch": 0.3790246301691196, + "grad_norm": 2.3402299880981445, + "learning_rate": 1.3177752310429057e-05, + "loss": 0.7871, + "step": 5009 + }, + { + "epoch": 0.3791002988914532, + "grad_norm": 2.1623239517211914, + "learning_rate": 1.317575731617795e-05, + "loss": 0.832, + "step": 5010 + }, + { + "epoch": 0.37917596761378686, + "grad_norm": 2.153862953186035, + "learning_rate": 1.3173762060427414e-05, + "loss": 0.6814, + "step": 5011 + }, + { + "epoch": 0.37925163633612047, + "grad_norm": 2.1609599590301514, + "learning_rate": 1.31717665433024e-05, + "loss": 0.7195, + "step": 5012 + }, + { + "epoch": 0.3793273050584541, + "grad_norm": 1.9634231328964233, + "learning_rate": 1.316977076492787e-05, + "loss": 0.6328, + "step": 5013 + }, + { + "epoch": 0.3794029737807877, + "grad_norm": 2.0286974906921387, + "learning_rate": 1.316777472542881e-05, + "loss": 0.8721, + "step": 5014 + }, + { + "epoch": 0.37947864250312136, + "grad_norm": 4.155838966369629, + "learning_rate": 1.3165778424930214e-05, + "loss": 0.8739, + "step": 5015 + }, + { + "epoch": 0.37955431122545497, + "grad_norm": 
2.0985116958618164, + "learning_rate": 1.31637818635571e-05, + "loss": 0.692, + "step": 5016 + }, + { + "epoch": 0.3796299799477886, + "grad_norm": 2.7467665672302246, + "learning_rate": 1.3161785041434501e-05, + "loss": 0.7331, + "step": 5017 + }, + { + "epoch": 0.3797056486701222, + "grad_norm": 6.863169193267822, + "learning_rate": 1.3159787958687457e-05, + "loss": 0.8108, + "step": 5018 + }, + { + "epoch": 0.37978131739245585, + "grad_norm": 2.4260308742523193, + "learning_rate": 1.3157790615441042e-05, + "loss": 0.6481, + "step": 5019 + }, + { + "epoch": 0.37985698611478946, + "grad_norm": 2.156952142715454, + "learning_rate": 1.3155793011820327e-05, + "loss": 0.6652, + "step": 5020 + }, + { + "epoch": 0.3799326548371231, + "grad_norm": 2.060511350631714, + "learning_rate": 1.3153795147950412e-05, + "loss": 0.7653, + "step": 5021 + }, + { + "epoch": 0.3800083235594567, + "grad_norm": 1.6299368143081665, + "learning_rate": 1.3151797023956411e-05, + "loss": 0.9353, + "step": 5022 + }, + { + "epoch": 0.3800839922817903, + "grad_norm": 2.0761969089508057, + "learning_rate": 1.3149798639963451e-05, + "loss": 0.6743, + "step": 5023 + }, + { + "epoch": 0.38015966100412396, + "grad_norm": 2.3090572357177734, + "learning_rate": 1.3147799996096682e-05, + "loss": 0.8114, + "step": 5024 + }, + { + "epoch": 0.38023532972645757, + "grad_norm": 2.1611216068267822, + "learning_rate": 1.3145801092481256e-05, + "loss": 0.8595, + "step": 5025 + }, + { + "epoch": 0.3803109984487912, + "grad_norm": 2.3790178298950195, + "learning_rate": 1.3143801929242359e-05, + "loss": 0.6275, + "step": 5026 + }, + { + "epoch": 0.3803866671711248, + "grad_norm": 2.3763949871063232, + "learning_rate": 1.3141802506505183e-05, + "loss": 0.8345, + "step": 5027 + }, + { + "epoch": 0.38046233589345846, + "grad_norm": 2.2187368869781494, + "learning_rate": 1.3139802824394936e-05, + "loss": 0.7276, + "step": 5028 + }, + { + "epoch": 0.38053800461579207, + "grad_norm": 1.9624474048614502, + "learning_rate": 1.313780288303685e-05, + "loss": 0.6358, + "step": 5029 + }, + { + "epoch": 0.3806136733381257, + "grad_norm": 1.6875008344650269, + "learning_rate": 1.3135802682556162e-05, + "loss": 0.745, + "step": 5030 + }, + { + "epoch": 0.3806893420604593, + "grad_norm": 3.063140392303467, + "learning_rate": 1.3133802223078132e-05, + "loss": 0.8704, + "step": 5031 + }, + { + "epoch": 0.38076501078279296, + "grad_norm": 2.5540196895599365, + "learning_rate": 1.3131801504728037e-05, + "loss": 0.6593, + "step": 5032 + }, + { + "epoch": 0.38084067950512657, + "grad_norm": 2.9757909774780273, + "learning_rate": 1.3129800527631167e-05, + "loss": 0.7676, + "step": 5033 + }, + { + "epoch": 0.3809163482274602, + "grad_norm": 2.7554965019226074, + "learning_rate": 1.3127799291912833e-05, + "loss": 0.7257, + "step": 5034 + }, + { + "epoch": 0.3809920169497938, + "grad_norm": 2.5679843425750732, + "learning_rate": 1.3125797797698358e-05, + "loss": 0.7173, + "step": 5035 + }, + { + "epoch": 0.3810676856721274, + "grad_norm": 2.0927176475524902, + "learning_rate": 1.3123796045113075e-05, + "loss": 0.707, + "step": 5036 + }, + { + "epoch": 0.38114335439446106, + "grad_norm": 3.1577799320220947, + "learning_rate": 1.312179403428235e-05, + "loss": 0.8109, + "step": 5037 + }, + { + "epoch": 0.3812190231167947, + "grad_norm": 1.8900063037872314, + "learning_rate": 1.3119791765331549e-05, + "loss": 0.7693, + "step": 5038 + }, + { + "epoch": 0.3812946918391283, + "grad_norm": 2.2258424758911133, + "learning_rate": 1.3117789238386063e-05, + "loss": 0.6866, + 
"step": 5039 + }, + { + "epoch": 0.3813703605614619, + "grad_norm": 1.6877254247665405, + "learning_rate": 1.3115786453571299e-05, + "loss": 0.6474, + "step": 5040 + }, + { + "epoch": 0.38144602928379556, + "grad_norm": 2.2569453716278076, + "learning_rate": 1.311378341101267e-05, + "loss": 0.6449, + "step": 5041 + }, + { + "epoch": 0.3815216980061292, + "grad_norm": 2.3201940059661865, + "learning_rate": 1.3111780110835622e-05, + "loss": 0.7697, + "step": 5042 + }, + { + "epoch": 0.3815973667284628, + "grad_norm": 2.2311851978302, + "learning_rate": 1.3109776553165604e-05, + "loss": 0.5872, + "step": 5043 + }, + { + "epoch": 0.3816730354507964, + "grad_norm": 1.6546425819396973, + "learning_rate": 1.3107772738128085e-05, + "loss": 0.6899, + "step": 5044 + }, + { + "epoch": 0.38174870417313006, + "grad_norm": 2.160982131958008, + "learning_rate": 1.3105768665848551e-05, + "loss": 0.7574, + "step": 5045 + }, + { + "epoch": 0.38182437289546367, + "grad_norm": 2.2722971439361572, + "learning_rate": 1.3103764336452501e-05, + "loss": 0.75, + "step": 5046 + }, + { + "epoch": 0.3819000416177973, + "grad_norm": 2.3404366970062256, + "learning_rate": 1.310175975006546e-05, + "loss": 0.6267, + "step": 5047 + }, + { + "epoch": 0.3819757103401309, + "grad_norm": 1.7116867303848267, + "learning_rate": 1.3099754906812952e-05, + "loss": 0.6965, + "step": 5048 + }, + { + "epoch": 0.38205137906246456, + "grad_norm": 1.8109760284423828, + "learning_rate": 1.3097749806820535e-05, + "loss": 0.6197, + "step": 5049 + }, + { + "epoch": 0.38212704778479817, + "grad_norm": 2.044471025466919, + "learning_rate": 1.309574445021377e-05, + "loss": 0.8108, + "step": 5050 + }, + { + "epoch": 0.3822027165071318, + "grad_norm": 2.0608906745910645, + "learning_rate": 1.309373883711824e-05, + "loss": 0.7047, + "step": 5051 + }, + { + "epoch": 0.3822783852294654, + "grad_norm": 5.626868724822998, + "learning_rate": 1.3091732967659546e-05, + "loss": 0.9076, + "step": 5052 + }, + { + "epoch": 0.382354053951799, + "grad_norm": 2.2120423316955566, + "learning_rate": 1.3089726841963296e-05, + "loss": 0.7146, + "step": 5053 + }, + { + "epoch": 0.38242972267413267, + "grad_norm": 2.1795167922973633, + "learning_rate": 1.3087720460155122e-05, + "loss": 0.7101, + "step": 5054 + }, + { + "epoch": 0.3825053913964663, + "grad_norm": 2.11128568649292, + "learning_rate": 1.3085713822360676e-05, + "loss": 0.8643, + "step": 5055 + }, + { + "epoch": 0.3825810601187999, + "grad_norm": 2.028358221054077, + "learning_rate": 1.3083706928705612e-05, + "loss": 0.7917, + "step": 5056 + }, + { + "epoch": 0.3826567288411335, + "grad_norm": 2.1836349964141846, + "learning_rate": 1.3081699779315615e-05, + "loss": 0.9105, + "step": 5057 + }, + { + "epoch": 0.38273239756346716, + "grad_norm": 1.974503517150879, + "learning_rate": 1.3079692374316374e-05, + "loss": 0.7627, + "step": 5058 + }, + { + "epoch": 0.3828080662858008, + "grad_norm": 2.411986827850342, + "learning_rate": 1.3077684713833602e-05, + "loss": 0.8903, + "step": 5059 + }, + { + "epoch": 0.3828837350081344, + "grad_norm": 2.188807725906372, + "learning_rate": 1.3075676797993023e-05, + "loss": 0.6245, + "step": 5060 + }, + { + "epoch": 0.382959403730468, + "grad_norm": 3.615983247756958, + "learning_rate": 1.3073668626920381e-05, + "loss": 0.7468, + "step": 5061 + }, + { + "epoch": 0.38303507245280166, + "grad_norm": 2.4570207595825195, + "learning_rate": 1.3071660200741436e-05, + "loss": 0.8592, + "step": 5062 + }, + { + "epoch": 0.38311074117513527, + "grad_norm": 2.342355966567993, + 
"learning_rate": 1.3069651519581959e-05, + "loss": 0.6396, + "step": 5063 + }, + { + "epoch": 0.3831864098974689, + "grad_norm": 2.1598551273345947, + "learning_rate": 1.3067642583567737e-05, + "loss": 0.5799, + "step": 5064 + }, + { + "epoch": 0.3832620786198025, + "grad_norm": 1.9163577556610107, + "learning_rate": 1.3065633392824586e-05, + "loss": 0.658, + "step": 5065 + }, + { + "epoch": 0.3833377473421361, + "grad_norm": 2.467026472091675, + "learning_rate": 1.3063623947478318e-05, + "loss": 0.7139, + "step": 5066 + }, + { + "epoch": 0.38341341606446977, + "grad_norm": 1.7132724523544312, + "learning_rate": 1.3061614247654775e-05, + "loss": 0.7353, + "step": 5067 + }, + { + "epoch": 0.3834890847868034, + "grad_norm": 1.9084765911102295, + "learning_rate": 1.3059604293479815e-05, + "loss": 0.6118, + "step": 5068 + }, + { + "epoch": 0.383564753509137, + "grad_norm": 1.8953239917755127, + "learning_rate": 1.3057594085079298e-05, + "loss": 0.8124, + "step": 5069 + }, + { + "epoch": 0.3836404222314706, + "grad_norm": 2.3783981800079346, + "learning_rate": 1.305558362257912e-05, + "loss": 0.763, + "step": 5070 + }, + { + "epoch": 0.38371609095380427, + "grad_norm": 2.3013756275177, + "learning_rate": 1.3053572906105177e-05, + "loss": 0.6881, + "step": 5071 + }, + { + "epoch": 0.3837917596761379, + "grad_norm": 2.113539934158325, + "learning_rate": 1.3051561935783388e-05, + "loss": 0.8303, + "step": 5072 + }, + { + "epoch": 0.3838674283984715, + "grad_norm": 2.5680975914001465, + "learning_rate": 1.3049550711739684e-05, + "loss": 0.7595, + "step": 5073 + }, + { + "epoch": 0.3839430971208051, + "grad_norm": 1.8435100317001343, + "learning_rate": 1.3047539234100018e-05, + "loss": 0.6678, + "step": 5074 + }, + { + "epoch": 0.38401876584313877, + "grad_norm": 2.025412082672119, + "learning_rate": 1.3045527502990358e-05, + "loss": 0.8392, + "step": 5075 + }, + { + "epoch": 0.3840944345654724, + "grad_norm": 2.096165895462036, + "learning_rate": 1.3043515518536674e-05, + "loss": 0.8409, + "step": 5076 + }, + { + "epoch": 0.384170103287806, + "grad_norm": 1.9506720304489136, + "learning_rate": 1.3041503280864974e-05, + "loss": 0.7444, + "step": 5077 + }, + { + "epoch": 0.3842457720101396, + "grad_norm": 1.969355583190918, + "learning_rate": 1.3039490790101266e-05, + "loss": 0.6558, + "step": 5078 + }, + { + "epoch": 0.3843214407324732, + "grad_norm": 1.9142673015594482, + "learning_rate": 1.303747804637158e-05, + "loss": 0.7879, + "step": 5079 + }, + { + "epoch": 0.3843971094548069, + "grad_norm": 1.9106582403182983, + "learning_rate": 1.3035465049801958e-05, + "loss": 0.7209, + "step": 5080 + }, + { + "epoch": 0.3844727781771405, + "grad_norm": 2.3156635761260986, + "learning_rate": 1.3033451800518464e-05, + "loss": 0.8002, + "step": 5081 + }, + { + "epoch": 0.3845484468994741, + "grad_norm": 2.1822335720062256, + "learning_rate": 1.3031438298647174e-05, + "loss": 0.7506, + "step": 5082 + }, + { + "epoch": 0.3846241156218077, + "grad_norm": 2.149963617324829, + "learning_rate": 1.3029424544314173e-05, + "loss": 0.6489, + "step": 5083 + }, + { + "epoch": 0.38469978434414137, + "grad_norm": 2.2395076751708984, + "learning_rate": 1.3027410537645578e-05, + "loss": 0.6394, + "step": 5084 + }, + { + "epoch": 0.384775453066475, + "grad_norm": 2.702310562133789, + "learning_rate": 1.3025396278767511e-05, + "loss": 0.7583, + "step": 5085 + }, + { + "epoch": 0.3848511217888086, + "grad_norm": 6.533085823059082, + "learning_rate": 1.3023381767806106e-05, + "loss": 0.759, + "step": 5086 + }, + { + "epoch": 
0.3849267905111422, + "grad_norm": 2.2441484928131104, + "learning_rate": 1.302136700488752e-05, + "loss": 0.6975, + "step": 5087 + }, + { + "epoch": 0.38500245923347587, + "grad_norm": 2.1438467502593994, + "learning_rate": 1.301935199013793e-05, + "loss": 0.6995, + "step": 5088 + }, + { + "epoch": 0.3850781279558095, + "grad_norm": 2.285844564437866, + "learning_rate": 1.3017336723683519e-05, + "loss": 0.7527, + "step": 5089 + }, + { + "epoch": 0.3851537966781431, + "grad_norm": 2.3817970752716064, + "learning_rate": 1.3015321205650483e-05, + "loss": 0.6128, + "step": 5090 + }, + { + "epoch": 0.3852294654004767, + "grad_norm": 2.551360845565796, + "learning_rate": 1.3013305436165049e-05, + "loss": 0.7447, + "step": 5091 + }, + { + "epoch": 0.3853051341228103, + "grad_norm": 2.2289671897888184, + "learning_rate": 1.3011289415353446e-05, + "loss": 0.8124, + "step": 5092 + }, + { + "epoch": 0.385380802845144, + "grad_norm": 2.501476526260376, + "learning_rate": 1.300927314334193e-05, + "loss": 0.6713, + "step": 5093 + }, + { + "epoch": 0.3854564715674776, + "grad_norm": 1.9687072038650513, + "learning_rate": 1.300725662025676e-05, + "loss": 0.6829, + "step": 5094 + }, + { + "epoch": 0.3855321402898112, + "grad_norm": 2.438424825668335, + "learning_rate": 1.3005239846224218e-05, + "loss": 0.791, + "step": 5095 + }, + { + "epoch": 0.3856078090121448, + "grad_norm": 2.1504287719726562, + "learning_rate": 1.3003222821370605e-05, + "loss": 0.7567, + "step": 5096 + }, + { + "epoch": 0.3856834777344785, + "grad_norm": 2.486421823501587, + "learning_rate": 1.3001205545822228e-05, + "loss": 0.5951, + "step": 5097 + }, + { + "epoch": 0.3857591464568121, + "grad_norm": 1.9564738273620605, + "learning_rate": 1.299918801970542e-05, + "loss": 0.8513, + "step": 5098 + }, + { + "epoch": 0.3858348151791457, + "grad_norm": 2.0457041263580322, + "learning_rate": 1.2997170243146524e-05, + "loss": 0.7346, + "step": 5099 + }, + { + "epoch": 0.3859104839014793, + "grad_norm": 1.925238847732544, + "learning_rate": 1.2995152216271898e-05, + "loss": 0.7619, + "step": 5100 + }, + { + "epoch": 0.385986152623813, + "grad_norm": 2.6253859996795654, + "learning_rate": 1.2993133939207918e-05, + "loss": 0.8293, + "step": 5101 + }, + { + "epoch": 0.3860618213461466, + "grad_norm": 2.5588762760162354, + "learning_rate": 1.2991115412080976e-05, + "loss": 0.7825, + "step": 5102 + }, + { + "epoch": 0.3861374900684802, + "grad_norm": 1.8942152261734009, + "learning_rate": 1.2989096635017476e-05, + "loss": 0.7372, + "step": 5103 + }, + { + "epoch": 0.3862131587908138, + "grad_norm": 2.209826946258545, + "learning_rate": 1.2987077608143845e-05, + "loss": 0.575, + "step": 5104 + }, + { + "epoch": 0.3862888275131474, + "grad_norm": 2.1787028312683105, + "learning_rate": 1.2985058331586516e-05, + "loss": 0.8091, + "step": 5105 + }, + { + "epoch": 0.3863644962354811, + "grad_norm": 2.4378044605255127, + "learning_rate": 1.2983038805471949e-05, + "loss": 0.5765, + "step": 5106 + }, + { + "epoch": 0.3864401649578147, + "grad_norm": 2.141134023666382, + "learning_rate": 1.2981019029926606e-05, + "loss": 0.7571, + "step": 5107 + }, + { + "epoch": 0.3865158336801483, + "grad_norm": 4.0216064453125, + "learning_rate": 1.2978999005076976e-05, + "loss": 0.8407, + "step": 5108 + }, + { + "epoch": 0.3865915024024819, + "grad_norm": 2.1336159706115723, + "learning_rate": 1.2976978731049559e-05, + "loss": 0.794, + "step": 5109 + }, + { + "epoch": 0.3866671711248156, + "grad_norm": 2.151615858078003, + "learning_rate": 1.2974958207970868e-05, + 
"loss": 0.6768, + "step": 5110 + }, + { + "epoch": 0.3867428398471492, + "grad_norm": 1.8506669998168945, + "learning_rate": 1.2972937435967443e-05, + "loss": 0.6728, + "step": 5111 + }, + { + "epoch": 0.3868185085694828, + "grad_norm": 2.7511610984802246, + "learning_rate": 1.2970916415165822e-05, + "loss": 0.7382, + "step": 5112 + }, + { + "epoch": 0.3868941772918164, + "grad_norm": 2.5931057929992676, + "learning_rate": 1.296889514569257e-05, + "loss": 0.8731, + "step": 5113 + }, + { + "epoch": 0.3869698460141501, + "grad_norm": 3.5109941959381104, + "learning_rate": 1.296687362767427e-05, + "loss": 0.8754, + "step": 5114 + }, + { + "epoch": 0.3870455147364837, + "grad_norm": 2.4638400077819824, + "learning_rate": 1.2964851861237511e-05, + "loss": 0.7972, + "step": 5115 + }, + { + "epoch": 0.3871211834588173, + "grad_norm": 2.775519371032715, + "learning_rate": 1.2962829846508908e-05, + "loss": 0.6752, + "step": 5116 + }, + { + "epoch": 0.3871968521811509, + "grad_norm": 2.1849584579467773, + "learning_rate": 1.2960807583615081e-05, + "loss": 0.8533, + "step": 5117 + }, + { + "epoch": 0.3872725209034845, + "grad_norm": 2.431049346923828, + "learning_rate": 1.295878507268267e-05, + "loss": 0.7406, + "step": 5118 + }, + { + "epoch": 0.3873481896258182, + "grad_norm": 2.2878475189208984, + "learning_rate": 1.2956762313838335e-05, + "loss": 0.7887, + "step": 5119 + }, + { + "epoch": 0.3874238583481518, + "grad_norm": 2.3642971515655518, + "learning_rate": 1.2954739307208746e-05, + "loss": 0.723, + "step": 5120 + }, + { + "epoch": 0.3874995270704854, + "grad_norm": 1.9779037237167358, + "learning_rate": 1.295271605292059e-05, + "loss": 0.644, + "step": 5121 + }, + { + "epoch": 0.387575195792819, + "grad_norm": 2.5578744411468506, + "learning_rate": 1.2950692551100573e-05, + "loss": 0.7849, + "step": 5122 + }, + { + "epoch": 0.3876508645151527, + "grad_norm": 2.2762012481689453, + "learning_rate": 1.2948668801875408e-05, + "loss": 0.7393, + "step": 5123 + }, + { + "epoch": 0.3877265332374863, + "grad_norm": 2.143754720687866, + "learning_rate": 1.2946644805371833e-05, + "loss": 0.7024, + "step": 5124 + }, + { + "epoch": 0.3878022019598199, + "grad_norm": 2.0929954051971436, + "learning_rate": 1.2944620561716592e-05, + "loss": 0.717, + "step": 5125 + }, + { + "epoch": 0.3878778706821535, + "grad_norm": 2.0429515838623047, + "learning_rate": 1.2942596071036455e-05, + "loss": 0.7081, + "step": 5126 + }, + { + "epoch": 0.3879535394044872, + "grad_norm": 2.1083133220672607, + "learning_rate": 1.2940571333458201e-05, + "loss": 0.6678, + "step": 5127 + }, + { + "epoch": 0.3880292081268208, + "grad_norm": 2.166097640991211, + "learning_rate": 1.2938546349108623e-05, + "loss": 0.6017, + "step": 5128 + }, + { + "epoch": 0.3881048768491544, + "grad_norm": 1.7243160009384155, + "learning_rate": 1.2936521118114534e-05, + "loss": 0.6601, + "step": 5129 + }, + { + "epoch": 0.388180545571488, + "grad_norm": 2.28934383392334, + "learning_rate": 1.2934495640602759e-05, + "loss": 0.7419, + "step": 5130 + }, + { + "epoch": 0.3882562142938216, + "grad_norm": 2.0433170795440674, + "learning_rate": 1.2932469916700144e-05, + "loss": 0.8201, + "step": 5131 + }, + { + "epoch": 0.3883318830161553, + "grad_norm": 2.5278637409210205, + "learning_rate": 1.2930443946533543e-05, + "loss": 0.7638, + "step": 5132 + }, + { + "epoch": 0.3884075517384889, + "grad_norm": 1.9174318313598633, + "learning_rate": 1.2928417730229827e-05, + "loss": 0.7162, + "step": 5133 + }, + { + "epoch": 0.3884832204608225, + "grad_norm": 
2.190006732940674, + "learning_rate": 1.2926391267915892e-05, + "loss": 0.7597, + "step": 5134 + }, + { + "epoch": 0.3885588891831561, + "grad_norm": 2.2163407802581787, + "learning_rate": 1.292436455971863e-05, + "loss": 0.5617, + "step": 5135 + }, + { + "epoch": 0.3886345579054898, + "grad_norm": 1.9766048192977905, + "learning_rate": 1.2922337605764971e-05, + "loss": 0.7222, + "step": 5136 + }, + { + "epoch": 0.3887102266278234, + "grad_norm": 2.149446964263916, + "learning_rate": 1.2920310406181842e-05, + "loss": 0.7806, + "step": 5137 + }, + { + "epoch": 0.388785895350157, + "grad_norm": 1.846808910369873, + "learning_rate": 1.2918282961096197e-05, + "loss": 0.7699, + "step": 5138 + }, + { + "epoch": 0.3888615640724906, + "grad_norm": 2.0156519412994385, + "learning_rate": 1.2916255270635001e-05, + "loss": 0.6868, + "step": 5139 + }, + { + "epoch": 0.3889372327948243, + "grad_norm": 1.861183524131775, + "learning_rate": 1.2914227334925231e-05, + "loss": 0.6657, + "step": 5140 + }, + { + "epoch": 0.3890129015171579, + "grad_norm": 2.640993118286133, + "learning_rate": 1.2912199154093886e-05, + "loss": 0.627, + "step": 5141 + }, + { + "epoch": 0.3890885702394915, + "grad_norm": 2.4647865295410156, + "learning_rate": 1.2910170728267974e-05, + "loss": 0.7462, + "step": 5142 + }, + { + "epoch": 0.3891642389618251, + "grad_norm": 2.260634422302246, + "learning_rate": 1.2908142057574526e-05, + "loss": 0.8352, + "step": 5143 + }, + { + "epoch": 0.3892399076841587, + "grad_norm": 2.117558002471924, + "learning_rate": 1.2906113142140582e-05, + "loss": 0.8288, + "step": 5144 + }, + { + "epoch": 0.3893155764064924, + "grad_norm": 2.3098366260528564, + "learning_rate": 1.29040839820932e-05, + "loss": 0.7885, + "step": 5145 + }, + { + "epoch": 0.389391245128826, + "grad_norm": 1.80618155002594, + "learning_rate": 1.2902054577559451e-05, + "loss": 0.7721, + "step": 5146 + }, + { + "epoch": 0.3894669138511596, + "grad_norm": 1.6692981719970703, + "learning_rate": 1.2900024928666424e-05, + "loss": 0.6533, + "step": 5147 + }, + { + "epoch": 0.3895425825734932, + "grad_norm": 2.307391405105591, + "learning_rate": 1.2897995035541223e-05, + "loss": 0.6559, + "step": 5148 + }, + { + "epoch": 0.3896182512958269, + "grad_norm": 2.1332476139068604, + "learning_rate": 1.2895964898310961e-05, + "loss": 0.7055, + "step": 5149 + }, + { + "epoch": 0.3896939200181605, + "grad_norm": 2.272970199584961, + "learning_rate": 1.289393451710278e-05, + "loss": 0.7819, + "step": 5150 + }, + { + "epoch": 0.3897695887404941, + "grad_norm": 2.7969746589660645, + "learning_rate": 1.289190389204382e-05, + "loss": 0.7392, + "step": 5151 + }, + { + "epoch": 0.3898452574628277, + "grad_norm": 2.1883418560028076, + "learning_rate": 1.2889873023261257e-05, + "loss": 0.7967, + "step": 5152 + }, + { + "epoch": 0.3899209261851614, + "grad_norm": 1.9223883152008057, + "learning_rate": 1.288784191088226e-05, + "loss": 0.7576, + "step": 5153 + }, + { + "epoch": 0.389996594907495, + "grad_norm": 2.0606937408447266, + "learning_rate": 1.2885810555034028e-05, + "loss": 0.7704, + "step": 5154 + }, + { + "epoch": 0.3900722636298286, + "grad_norm": 1.9041752815246582, + "learning_rate": 1.2883778955843772e-05, + "loss": 0.8243, + "step": 5155 + }, + { + "epoch": 0.3901479323521622, + "grad_norm": 2.1987617015838623, + "learning_rate": 1.2881747113438716e-05, + "loss": 0.8491, + "step": 5156 + }, + { + "epoch": 0.39022360107449583, + "grad_norm": 2.152064323425293, + "learning_rate": 1.2879715027946101e-05, + "loss": 0.7676, + "step": 5157 + }, + 
{ + "epoch": 0.3902992697968295, + "grad_norm": 2.4647457599639893, + "learning_rate": 1.2877682699493179e-05, + "loss": 0.7452, + "step": 5158 + }, + { + "epoch": 0.3903749385191631, + "grad_norm": 1.774983286857605, + "learning_rate": 1.2875650128207228e-05, + "loss": 0.8262, + "step": 5159 + }, + { + "epoch": 0.3904506072414967, + "grad_norm": 1.9229451417922974, + "learning_rate": 1.2873617314215528e-05, + "loss": 0.7605, + "step": 5160 + }, + { + "epoch": 0.3905262759638303, + "grad_norm": 2.0117905139923096, + "learning_rate": 1.2871584257645385e-05, + "loss": 0.6922, + "step": 5161 + }, + { + "epoch": 0.390601944686164, + "grad_norm": 2.2805237770080566, + "learning_rate": 1.2869550958624115e-05, + "loss": 0.9432, + "step": 5162 + }, + { + "epoch": 0.3906776134084976, + "grad_norm": 2.6723804473876953, + "learning_rate": 1.2867517417279045e-05, + "loss": 0.741, + "step": 5163 + }, + { + "epoch": 0.3907532821308312, + "grad_norm": 1.9901678562164307, + "learning_rate": 1.2865483633737528e-05, + "loss": 0.7327, + "step": 5164 + }, + { + "epoch": 0.3908289508531648, + "grad_norm": 2.1943933963775635, + "learning_rate": 1.286344960812692e-05, + "loss": 0.8089, + "step": 5165 + }, + { + "epoch": 0.3909046195754985, + "grad_norm": 2.47472882270813, + "learning_rate": 1.2861415340574604e-05, + "loss": 0.7517, + "step": 5166 + }, + { + "epoch": 0.3909802882978321, + "grad_norm": 2.1459341049194336, + "learning_rate": 1.2859380831207969e-05, + "loss": 0.7393, + "step": 5167 + }, + { + "epoch": 0.3910559570201657, + "grad_norm": 2.2298531532287598, + "learning_rate": 1.2857346080154425e-05, + "loss": 0.7545, + "step": 5168 + }, + { + "epoch": 0.3911316257424993, + "grad_norm": 2.5509769916534424, + "learning_rate": 1.2855311087541393e-05, + "loss": 0.9029, + "step": 5169 + }, + { + "epoch": 0.391207294464833, + "grad_norm": 2.2877771854400635, + "learning_rate": 1.285327585349631e-05, + "loss": 0.6986, + "step": 5170 + }, + { + "epoch": 0.3912829631871666, + "grad_norm": 1.683161735534668, + "learning_rate": 1.2851240378146632e-05, + "loss": 0.8001, + "step": 5171 + }, + { + "epoch": 0.3913586319095002, + "grad_norm": 1.9525566101074219, + "learning_rate": 1.2849204661619822e-05, + "loss": 0.7955, + "step": 5172 + }, + { + "epoch": 0.3914343006318338, + "grad_norm": 2.2705700397491455, + "learning_rate": 1.284716870404337e-05, + "loss": 0.734, + "step": 5173 + }, + { + "epoch": 0.39150996935416743, + "grad_norm": 1.9373785257339478, + "learning_rate": 1.2845132505544766e-05, + "loss": 0.7796, + "step": 5174 + }, + { + "epoch": 0.3915856380765011, + "grad_norm": 2.5509033203125, + "learning_rate": 1.284309606625153e-05, + "loss": 0.7603, + "step": 5175 + }, + { + "epoch": 0.3916613067988347, + "grad_norm": 3.8573920726776123, + "learning_rate": 1.2841059386291191e-05, + "loss": 0.681, + "step": 5176 + }, + { + "epoch": 0.3917369755211683, + "grad_norm": 2.2207155227661133, + "learning_rate": 1.2839022465791285e-05, + "loss": 0.7496, + "step": 5177 + }, + { + "epoch": 0.39181264424350193, + "grad_norm": 3.7512471675872803, + "learning_rate": 1.283698530487938e-05, + "loss": 0.6177, + "step": 5178 + }, + { + "epoch": 0.3918883129658356, + "grad_norm": 2.095038890838623, + "learning_rate": 1.283494790368304e-05, + "loss": 0.7261, + "step": 5179 + }, + { + "epoch": 0.3919639816881692, + "grad_norm": 2.247019052505493, + "learning_rate": 1.2832910262329862e-05, + "loss": 0.7003, + "step": 5180 + }, + { + "epoch": 0.3920396504105028, + "grad_norm": 2.304305076599121, + "learning_rate": 
1.2830872380947447e-05, + "loss": 0.7956, + "step": 5181 + }, + { + "epoch": 0.3921153191328364, + "grad_norm": 1.8406106233596802, + "learning_rate": 1.282883425966341e-05, + "loss": 0.882, + "step": 5182 + }, + { + "epoch": 0.3921909878551701, + "grad_norm": 3.2401795387268066, + "learning_rate": 1.2826795898605389e-05, + "loss": 0.7532, + "step": 5183 + }, + { + "epoch": 0.3922666565775037, + "grad_norm": 2.3485889434814453, + "learning_rate": 1.282475729790103e-05, + "loss": 0.595, + "step": 5184 + }, + { + "epoch": 0.3923423252998373, + "grad_norm": 1.8012150526046753, + "learning_rate": 1.2822718457678001e-05, + "loss": 0.6598, + "step": 5185 + }, + { + "epoch": 0.3924179940221709, + "grad_norm": 2.22017502784729, + "learning_rate": 1.2820679378063978e-05, + "loss": 0.6302, + "step": 5186 + }, + { + "epoch": 0.39249366274450453, + "grad_norm": 2.1888411045074463, + "learning_rate": 1.2818640059186653e-05, + "loss": 0.7243, + "step": 5187 + }, + { + "epoch": 0.3925693314668382, + "grad_norm": 2.191774845123291, + "learning_rate": 1.2816600501173737e-05, + "loss": 0.7592, + "step": 5188 + }, + { + "epoch": 0.3926450001891718, + "grad_norm": 2.351590871810913, + "learning_rate": 1.2814560704152955e-05, + "loss": 0.6887, + "step": 5189 + }, + { + "epoch": 0.3927206689115054, + "grad_norm": 2.8945960998535156, + "learning_rate": 1.2812520668252039e-05, + "loss": 0.6931, + "step": 5190 + }, + { + "epoch": 0.39279633763383903, + "grad_norm": 2.134856700897217, + "learning_rate": 1.281048039359875e-05, + "loss": 0.8368, + "step": 5191 + }, + { + "epoch": 0.3928720063561727, + "grad_norm": 2.4972636699676514, + "learning_rate": 1.2808439880320855e-05, + "loss": 0.837, + "step": 5192 + }, + { + "epoch": 0.3929476750785063, + "grad_norm": 2.14408016204834, + "learning_rate": 1.2806399128546137e-05, + "loss": 0.7233, + "step": 5193 + }, + { + "epoch": 0.3930233438008399, + "grad_norm": 1.9677777290344238, + "learning_rate": 1.2804358138402394e-05, + "loss": 0.6967, + "step": 5194 + }, + { + "epoch": 0.39309901252317353, + "grad_norm": 1.9467759132385254, + "learning_rate": 1.280231691001744e-05, + "loss": 0.7552, + "step": 5195 + }, + { + "epoch": 0.3931746812455072, + "grad_norm": 2.6038689613342285, + "learning_rate": 1.2800275443519102e-05, + "loss": 0.7232, + "step": 5196 + }, + { + "epoch": 0.3932503499678408, + "grad_norm": 2.1820638179779053, + "learning_rate": 1.2798233739035222e-05, + "loss": 0.8716, + "step": 5197 + }, + { + "epoch": 0.3933260186901744, + "grad_norm": 2.153744697570801, + "learning_rate": 1.2796191796693666e-05, + "loss": 0.7718, + "step": 5198 + }, + { + "epoch": 0.393401687412508, + "grad_norm": 2.0024526119232178, + "learning_rate": 1.2794149616622297e-05, + "loss": 0.7673, + "step": 5199 + }, + { + "epoch": 0.39347735613484164, + "grad_norm": 1.9214941263198853, + "learning_rate": 1.2792107198949008e-05, + "loss": 0.6543, + "step": 5200 + }, + { + "epoch": 0.3935530248571753, + "grad_norm": 2.2117514610290527, + "learning_rate": 1.2790064543801701e-05, + "loss": 0.7172, + "step": 5201 + }, + { + "epoch": 0.3936286935795089, + "grad_norm": 2.943007707595825, + "learning_rate": 1.2788021651308295e-05, + "loss": 0.7764, + "step": 5202 + }, + { + "epoch": 0.3937043623018425, + "grad_norm": 2.1258654594421387, + "learning_rate": 1.278597852159672e-05, + "loss": 0.7708, + "step": 5203 + }, + { + "epoch": 0.39378003102417614, + "grad_norm": 2.93727445602417, + "learning_rate": 1.2783935154794924e-05, + "loss": 0.779, + "step": 5204 + }, + { + "epoch": 0.3938556997465098, 
+ "grad_norm": 1.9181923866271973, + "learning_rate": 1.2781891551030873e-05, + "loss": 0.7721, + "step": 5205 + }, + { + "epoch": 0.3939313684688434, + "grad_norm": 2.5555663108825684, + "learning_rate": 1.2779847710432538e-05, + "loss": 0.7231, + "step": 5206 + }, + { + "epoch": 0.394007037191177, + "grad_norm": 2.6176486015319824, + "learning_rate": 1.2777803633127914e-05, + "loss": 0.7323, + "step": 5207 + }, + { + "epoch": 0.39408270591351063, + "grad_norm": 2.785477876663208, + "learning_rate": 1.2775759319245007e-05, + "loss": 0.6702, + "step": 5208 + }, + { + "epoch": 0.3941583746358443, + "grad_norm": 2.178852081298828, + "learning_rate": 1.2773714768911842e-05, + "loss": 0.6602, + "step": 5209 + }, + { + "epoch": 0.3942340433581779, + "grad_norm": 2.0223734378814697, + "learning_rate": 1.277166998225645e-05, + "loss": 0.7339, + "step": 5210 + }, + { + "epoch": 0.3943097120805115, + "grad_norm": 3.054377555847168, + "learning_rate": 1.2769624959406885e-05, + "loss": 0.8046, + "step": 5211 + }, + { + "epoch": 0.39438538080284513, + "grad_norm": 2.5661230087280273, + "learning_rate": 1.2767579700491215e-05, + "loss": 0.9283, + "step": 5212 + }, + { + "epoch": 0.39446104952517874, + "grad_norm": 2.2824318408966064, + "learning_rate": 1.2765534205637514e-05, + "loss": 0.7967, + "step": 5213 + }, + { + "epoch": 0.3945367182475124, + "grad_norm": 2.100961923599243, + "learning_rate": 1.2763488474973886e-05, + "loss": 0.589, + "step": 5214 + }, + { + "epoch": 0.394612386969846, + "grad_norm": 2.079869508743286, + "learning_rate": 1.2761442508628432e-05, + "loss": 0.8719, + "step": 5215 + }, + { + "epoch": 0.39468805569217963, + "grad_norm": 2.3895928859710693, + "learning_rate": 1.2759396306729288e-05, + "loss": 0.8063, + "step": 5216 + }, + { + "epoch": 0.39476372441451324, + "grad_norm": 2.4894683361053467, + "learning_rate": 1.2757349869404585e-05, + "loss": 0.7269, + "step": 5217 + }, + { + "epoch": 0.3948393931368469, + "grad_norm": 2.383577346801758, + "learning_rate": 1.275530319678248e-05, + "loss": 0.7615, + "step": 5218 + }, + { + "epoch": 0.3949150618591805, + "grad_norm": 2.04361629486084, + "learning_rate": 1.2753256288991145e-05, + "loss": 0.5673, + "step": 5219 + }, + { + "epoch": 0.3949907305815141, + "grad_norm": 2.225693941116333, + "learning_rate": 1.2751209146158758e-05, + "loss": 0.8495, + "step": 5220 + }, + { + "epoch": 0.39506639930384774, + "grad_norm": 2.191380739212036, + "learning_rate": 1.2749161768413526e-05, + "loss": 0.6206, + "step": 5221 + }, + { + "epoch": 0.3951420680261814, + "grad_norm": 1.8933615684509277, + "learning_rate": 1.2747114155883653e-05, + "loss": 0.6419, + "step": 5222 + }, + { + "epoch": 0.395217736748515, + "grad_norm": 2.0595176219940186, + "learning_rate": 1.2745066308697374e-05, + "loss": 0.8184, + "step": 5223 + }, + { + "epoch": 0.3952934054708486, + "grad_norm": 2.14420485496521, + "learning_rate": 1.274301822698293e-05, + "loss": 0.7094, + "step": 5224 + }, + { + "epoch": 0.39536907419318223, + "grad_norm": 2.4212982654571533, + "learning_rate": 1.274096991086858e-05, + "loss": 0.6066, + "step": 5225 + }, + { + "epoch": 0.39544474291551585, + "grad_norm": 2.151181221008301, + "learning_rate": 1.2738921360482592e-05, + "loss": 0.7381, + "step": 5226 + }, + { + "epoch": 0.3955204116378495, + "grad_norm": 2.084139823913574, + "learning_rate": 1.2736872575953256e-05, + "loss": 0.7257, + "step": 5227 + }, + { + "epoch": 0.3955960803601831, + "grad_norm": 2.635713577270508, + "learning_rate": 1.2734823557408872e-05, + "loss": 0.6161, 
+ "step": 5228 + }, + { + "epoch": 0.39567174908251673, + "grad_norm": 2.126365900039673, + "learning_rate": 1.2732774304977758e-05, + "loss": 0.7688, + "step": 5229 + }, + { + "epoch": 0.39574741780485034, + "grad_norm": 1.905297040939331, + "learning_rate": 1.2730724818788245e-05, + "loss": 0.6682, + "step": 5230 + }, + { + "epoch": 0.395823086527184, + "grad_norm": 2.004648447036743, + "learning_rate": 1.2728675098968672e-05, + "loss": 0.5425, + "step": 5231 + }, + { + "epoch": 0.3958987552495176, + "grad_norm": 2.102177381515503, + "learning_rate": 1.272662514564741e-05, + "loss": 0.7984, + "step": 5232 + }, + { + "epoch": 0.39597442397185123, + "grad_norm": 2.1269235610961914, + "learning_rate": 1.2724574958952827e-05, + "loss": 0.8195, + "step": 5233 + }, + { + "epoch": 0.39605009269418484, + "grad_norm": 2.0913987159729004, + "learning_rate": 1.2722524539013312e-05, + "loss": 0.8296, + "step": 5234 + }, + { + "epoch": 0.3961257614165185, + "grad_norm": 2.1840765476226807, + "learning_rate": 1.2720473885957271e-05, + "loss": 0.6681, + "step": 5235 + }, + { + "epoch": 0.3962014301388521, + "grad_norm": 2.1150200366973877, + "learning_rate": 1.271842299991312e-05, + "loss": 0.6953, + "step": 5236 + }, + { + "epoch": 0.39627709886118573, + "grad_norm": 2.1616060733795166, + "learning_rate": 1.2716371881009295e-05, + "loss": 0.7757, + "step": 5237 + }, + { + "epoch": 0.39635276758351934, + "grad_norm": 2.005535840988159, + "learning_rate": 1.2714320529374241e-05, + "loss": 0.7313, + "step": 5238 + }, + { + "epoch": 0.39642843630585295, + "grad_norm": 1.6546169519424438, + "learning_rate": 1.2712268945136425e-05, + "loss": 0.8232, + "step": 5239 + }, + { + "epoch": 0.3965041050281866, + "grad_norm": 1.8763610124588013, + "learning_rate": 1.271021712842432e-05, + "loss": 0.7103, + "step": 5240 + }, + { + "epoch": 0.3965797737505202, + "grad_norm": 2.4322524070739746, + "learning_rate": 1.2708165079366417e-05, + "loss": 0.7621, + "step": 5241 + }, + { + "epoch": 0.39665544247285384, + "grad_norm": 2.386225700378418, + "learning_rate": 1.2706112798091226e-05, + "loss": 0.8655, + "step": 5242 + }, + { + "epoch": 0.39673111119518745, + "grad_norm": 1.8513740301132202, + "learning_rate": 1.2704060284727262e-05, + "loss": 0.7369, + "step": 5243 + }, + { + "epoch": 0.3968067799175211, + "grad_norm": 2.741036891937256, + "learning_rate": 1.2702007539403062e-05, + "loss": 0.7186, + "step": 5244 + }, + { + "epoch": 0.3968824486398547, + "grad_norm": 2.038377046585083, + "learning_rate": 1.2699954562247177e-05, + "loss": 0.8323, + "step": 5245 + }, + { + "epoch": 0.39695811736218833, + "grad_norm": 1.7875847816467285, + "learning_rate": 1.2697901353388168e-05, + "loss": 0.8214, + "step": 5246 + }, + { + "epoch": 0.39703378608452194, + "grad_norm": 2.403543472290039, + "learning_rate": 1.269584791295462e-05, + "loss": 0.8744, + "step": 5247 + }, + { + "epoch": 0.3971094548068556, + "grad_norm": 1.7908776998519897, + "learning_rate": 1.269379424107512e-05, + "loss": 0.6912, + "step": 5248 + }, + { + "epoch": 0.3971851235291892, + "grad_norm": 2.3532586097717285, + "learning_rate": 1.2691740337878277e-05, + "loss": 0.6537, + "step": 5249 + }, + { + "epoch": 0.39726079225152283, + "grad_norm": 2.0470757484436035, + "learning_rate": 1.2689686203492713e-05, + "loss": 0.7524, + "step": 5250 + }, + { + "epoch": 0.39733646097385644, + "grad_norm": 1.9975119829177856, + "learning_rate": 1.2687631838047064e-05, + "loss": 0.7166, + "step": 5251 + }, + { + "epoch": 0.39741212969619005, + "grad_norm": 
1.9511202573776245, + "learning_rate": 1.2685577241669984e-05, + "loss": 0.7518, + "step": 5252 + }, + { + "epoch": 0.3974877984185237, + "grad_norm": 2.32716703414917, + "learning_rate": 1.2683522414490138e-05, + "loss": 0.7032, + "step": 5253 + }, + { + "epoch": 0.39756346714085733, + "grad_norm": 2.9506001472473145, + "learning_rate": 1.2681467356636202e-05, + "loss": 0.8021, + "step": 5254 + }, + { + "epoch": 0.39763913586319094, + "grad_norm": 1.7488332986831665, + "learning_rate": 1.2679412068236875e-05, + "loss": 0.5693, + "step": 5255 + }, + { + "epoch": 0.39771480458552455, + "grad_norm": 2.3912007808685303, + "learning_rate": 1.2677356549420862e-05, + "loss": 0.8525, + "step": 5256 + }, + { + "epoch": 0.3977904733078582, + "grad_norm": 2.2952723503112793, + "learning_rate": 1.2675300800316889e-05, + "loss": 0.7448, + "step": 5257 + }, + { + "epoch": 0.3978661420301918, + "grad_norm": 3.195134401321411, + "learning_rate": 1.2673244821053692e-05, + "loss": 0.7458, + "step": 5258 + }, + { + "epoch": 0.39794181075252544, + "grad_norm": 2.2581069469451904, + "learning_rate": 1.267118861176002e-05, + "loss": 0.6457, + "step": 5259 + }, + { + "epoch": 0.39801747947485905, + "grad_norm": 2.550224542617798, + "learning_rate": 1.266913217256465e-05, + "loss": 0.8612, + "step": 5260 + }, + { + "epoch": 0.3980931481971927, + "grad_norm": 2.841343879699707, + "learning_rate": 1.2667075503596348e-05, + "loss": 0.6993, + "step": 5261 + }, + { + "epoch": 0.3981688169195263, + "grad_norm": 2.263087034225464, + "learning_rate": 1.2665018604983924e-05, + "loss": 0.7111, + "step": 5262 + }, + { + "epoch": 0.39824448564185994, + "grad_norm": 2.2085769176483154, + "learning_rate": 1.2662961476856177e-05, + "loss": 0.7621, + "step": 5263 + }, + { + "epoch": 0.39832015436419355, + "grad_norm": 1.9521921873092651, + "learning_rate": 1.2660904119341937e-05, + "loss": 0.7074, + "step": 5264 + }, + { + "epoch": 0.39839582308652716, + "grad_norm": 1.6472288370132446, + "learning_rate": 1.265884653257004e-05, + "loss": 0.7033, + "step": 5265 + }, + { + "epoch": 0.3984714918088608, + "grad_norm": 2.273076057434082, + "learning_rate": 1.2656788716669338e-05, + "loss": 0.6346, + "step": 5266 + }, + { + "epoch": 0.39854716053119443, + "grad_norm": 2.0756750106811523, + "learning_rate": 1.26547306717687e-05, + "loss": 0.7563, + "step": 5267 + }, + { + "epoch": 0.39862282925352804, + "grad_norm": 1.9330674409866333, + "learning_rate": 1.2652672397997006e-05, + "loss": 0.7495, + "step": 5268 + }, + { + "epoch": 0.39869849797586165, + "grad_norm": 1.7233151197433472, + "learning_rate": 1.2650613895483152e-05, + "loss": 0.6678, + "step": 5269 + }, + { + "epoch": 0.3987741666981953, + "grad_norm": 1.934144377708435, + "learning_rate": 1.2648555164356047e-05, + "loss": 0.7826, + "step": 5270 + }, + { + "epoch": 0.39884983542052893, + "grad_norm": 2.4113855361938477, + "learning_rate": 1.2646496204744618e-05, + "loss": 0.6297, + "step": 5271 + }, + { + "epoch": 0.39892550414286254, + "grad_norm": 1.8144162893295288, + "learning_rate": 1.2644437016777803e-05, + "loss": 0.5998, + "step": 5272 + }, + { + "epoch": 0.39900117286519615, + "grad_norm": 1.9845155477523804, + "learning_rate": 1.2642377600584556e-05, + "loss": 0.7491, + "step": 5273 + }, + { + "epoch": 0.3990768415875298, + "grad_norm": 2.8481497764587402, + "learning_rate": 1.264031795629384e-05, + "loss": 0.5937, + "step": 5274 + }, + { + "epoch": 0.39915251030986343, + "grad_norm": 2.570568323135376, + "learning_rate": 1.263825808403464e-05, + "loss": 0.8319, + 
"step": 5275 + }, + { + "epoch": 0.39922817903219704, + "grad_norm": 2.246908187866211, + "learning_rate": 1.2636197983935953e-05, + "loss": 0.6984, + "step": 5276 + }, + { + "epoch": 0.39930384775453065, + "grad_norm": 2.2146944999694824, + "learning_rate": 1.2634137656126784e-05, + "loss": 0.7518, + "step": 5277 + }, + { + "epoch": 0.3993795164768643, + "grad_norm": 2.186021327972412, + "learning_rate": 1.2632077100736164e-05, + "loss": 0.7488, + "step": 5278 + }, + { + "epoch": 0.3994551851991979, + "grad_norm": 1.7041964530944824, + "learning_rate": 1.2630016317893127e-05, + "loss": 0.6456, + "step": 5279 + }, + { + "epoch": 0.39953085392153154, + "grad_norm": 1.9374867677688599, + "learning_rate": 1.2627955307726726e-05, + "loss": 0.6803, + "step": 5280 + }, + { + "epoch": 0.39960652264386515, + "grad_norm": 2.4757678508758545, + "learning_rate": 1.2625894070366033e-05, + "loss": 0.6904, + "step": 5281 + }, + { + "epoch": 0.39968219136619876, + "grad_norm": 2.042297840118408, + "learning_rate": 1.2623832605940122e-05, + "loss": 0.7731, + "step": 5282 + }, + { + "epoch": 0.3997578600885324, + "grad_norm": 1.9081017971038818, + "learning_rate": 1.2621770914578095e-05, + "loss": 0.7498, + "step": 5283 + }, + { + "epoch": 0.39983352881086603, + "grad_norm": 1.9273154735565186, + "learning_rate": 1.2619708996409056e-05, + "loss": 0.7335, + "step": 5284 + }, + { + "epoch": 0.39990919753319965, + "grad_norm": 2.073868989944458, + "learning_rate": 1.2617646851562134e-05, + "loss": 0.7556, + "step": 5285 + }, + { + "epoch": 0.39998486625553326, + "grad_norm": 1.6271218061447144, + "learning_rate": 1.2615584480166465e-05, + "loss": 0.7457, + "step": 5286 + }, + { + "epoch": 0.4000605349778669, + "grad_norm": 3.472792387008667, + "learning_rate": 1.2613521882351204e-05, + "loss": 0.7719, + "step": 5287 + }, + { + "epoch": 0.40013620370020053, + "grad_norm": 2.107931613922119, + "learning_rate": 1.2611459058245511e-05, + "loss": 0.6264, + "step": 5288 + }, + { + "epoch": 0.40021187242253414, + "grad_norm": 2.132664203643799, + "learning_rate": 1.2609396007978573e-05, + "loss": 0.6974, + "step": 5289 + }, + { + "epoch": 0.40028754114486775, + "grad_norm": 1.6444696187973022, + "learning_rate": 1.2607332731679584e-05, + "loss": 0.6695, + "step": 5290 + }, + { + "epoch": 0.4003632098672014, + "grad_norm": 2.9400112628936768, + "learning_rate": 1.260526922947775e-05, + "loss": 0.7455, + "step": 5291 + }, + { + "epoch": 0.40043887858953503, + "grad_norm": 3.270721197128296, + "learning_rate": 1.2603205501502296e-05, + "loss": 0.7693, + "step": 5292 + }, + { + "epoch": 0.40051454731186864, + "grad_norm": 1.8366196155548096, + "learning_rate": 1.260114154788246e-05, + "loss": 0.6392, + "step": 5293 + }, + { + "epoch": 0.40059021603420225, + "grad_norm": 2.108292818069458, + "learning_rate": 1.2599077368747494e-05, + "loss": 0.7311, + "step": 5294 + }, + { + "epoch": 0.40066588475653586, + "grad_norm": 2.0967061519622803, + "learning_rate": 1.259701296422666e-05, + "loss": 0.7679, + "step": 5295 + }, + { + "epoch": 0.4007415534788695, + "grad_norm": 1.9956434965133667, + "learning_rate": 1.2594948334449241e-05, + "loss": 0.7826, + "step": 5296 + }, + { + "epoch": 0.40081722220120314, + "grad_norm": 2.254016160964966, + "learning_rate": 1.259288347954453e-05, + "loss": 0.5853, + "step": 5297 + }, + { + "epoch": 0.40089289092353675, + "grad_norm": 1.7637948989868164, + "learning_rate": 1.2590818399641833e-05, + "loss": 0.8752, + "step": 5298 + }, + { + "epoch": 0.40096855964587036, + "grad_norm": 
2.54941987991333, + "learning_rate": 1.2588753094870477e-05, + "loss": 0.7963, + "step": 5299 + }, + { + "epoch": 0.401044228368204, + "grad_norm": 1.4599640369415283, + "learning_rate": 1.2586687565359791e-05, + "loss": 0.7636, + "step": 5300 + }, + { + "epoch": 0.40111989709053764, + "grad_norm": 2.2230331897735596, + "learning_rate": 1.2584621811239133e-05, + "loss": 0.7579, + "step": 5301 + }, + { + "epoch": 0.40119556581287125, + "grad_norm": 1.8923826217651367, + "learning_rate": 1.2582555832637862e-05, + "loss": 0.6866, + "step": 5302 + }, + { + "epoch": 0.40127123453520486, + "grad_norm": 2.1083552837371826, + "learning_rate": 1.2580489629685354e-05, + "loss": 0.6996, + "step": 5303 + }, + { + "epoch": 0.4013469032575385, + "grad_norm": 2.3579092025756836, + "learning_rate": 1.2578423202511008e-05, + "loss": 0.7097, + "step": 5304 + }, + { + "epoch": 0.40142257197987213, + "grad_norm": 2.3279149532318115, + "learning_rate": 1.2576356551244226e-05, + "loss": 0.7892, + "step": 5305 + }, + { + "epoch": 0.40149824070220574, + "grad_norm": 1.9012402296066284, + "learning_rate": 1.2574289676014431e-05, + "loss": 0.6017, + "step": 5306 + }, + { + "epoch": 0.40157390942453935, + "grad_norm": 1.9098631143569946, + "learning_rate": 1.2572222576951054e-05, + "loss": 0.7714, + "step": 5307 + }, + { + "epoch": 0.40164957814687297, + "grad_norm": 2.4168155193328857, + "learning_rate": 1.2570155254183544e-05, + "loss": 0.8486, + "step": 5308 + }, + { + "epoch": 0.40172524686920663, + "grad_norm": 2.088871479034424, + "learning_rate": 1.2568087707841367e-05, + "loss": 0.7525, + "step": 5309 + }, + { + "epoch": 0.40180091559154024, + "grad_norm": 2.0699868202209473, + "learning_rate": 1.2566019938053996e-05, + "loss": 0.6048, + "step": 5310 + }, + { + "epoch": 0.40187658431387385, + "grad_norm": 1.6784697771072388, + "learning_rate": 1.2563951944950923e-05, + "loss": 0.8357, + "step": 5311 + }, + { + "epoch": 0.40195225303620746, + "grad_norm": 2.897984504699707, + "learning_rate": 1.2561883728661652e-05, + "loss": 0.6136, + "step": 5312 + }, + { + "epoch": 0.40202792175854113, + "grad_norm": 2.1926939487457275, + "learning_rate": 1.2559815289315701e-05, + "loss": 0.6236, + "step": 5313 + }, + { + "epoch": 0.40210359048087474, + "grad_norm": 2.2458455562591553, + "learning_rate": 1.2557746627042605e-05, + "loss": 0.8109, + "step": 5314 + }, + { + "epoch": 0.40217925920320835, + "grad_norm": 2.5638132095336914, + "learning_rate": 1.2555677741971905e-05, + "loss": 0.7555, + "step": 5315 + }, + { + "epoch": 0.40225492792554196, + "grad_norm": 1.7509515285491943, + "learning_rate": 1.2553608634233166e-05, + "loss": 0.8178, + "step": 5316 + }, + { + "epoch": 0.4023305966478756, + "grad_norm": 2.2792065143585205, + "learning_rate": 1.2551539303955962e-05, + "loss": 0.8282, + "step": 5317 + }, + { + "epoch": 0.40240626537020924, + "grad_norm": 1.9566348791122437, + "learning_rate": 1.2549469751269876e-05, + "loss": 0.7285, + "step": 5318 + }, + { + "epoch": 0.40248193409254285, + "grad_norm": 2.4450414180755615, + "learning_rate": 1.2547399976304517e-05, + "loss": 0.7842, + "step": 5319 + }, + { + "epoch": 0.40255760281487646, + "grad_norm": 2.2559216022491455, + "learning_rate": 1.2545329979189495e-05, + "loss": 0.7553, + "step": 5320 + }, + { + "epoch": 0.40263327153721007, + "grad_norm": 2.124101400375366, + "learning_rate": 1.2543259760054444e-05, + "loss": 0.6608, + "step": 5321 + }, + { + "epoch": 0.40270894025954374, + "grad_norm": 16.932872772216797, + "learning_rate": 1.2541189319029008e-05, + 
"loss": 0.7524, + "step": 5322 + }, + { + "epoch": 0.40278460898187735, + "grad_norm": 1.7447752952575684, + "learning_rate": 1.2539118656242839e-05, + "loss": 0.7034, + "step": 5323 + }, + { + "epoch": 0.40286027770421096, + "grad_norm": 2.0638234615325928, + "learning_rate": 1.2537047771825618e-05, + "loss": 0.6932, + "step": 5324 + }, + { + "epoch": 0.40293594642654457, + "grad_norm": 2.157304048538208, + "learning_rate": 1.2534976665907024e-05, + "loss": 0.7061, + "step": 5325 + }, + { + "epoch": 0.40301161514887823, + "grad_norm": 1.8160690069198608, + "learning_rate": 1.2532905338616756e-05, + "loss": 0.8272, + "step": 5326 + }, + { + "epoch": 0.40308728387121184, + "grad_norm": 2.0002903938293457, + "learning_rate": 1.2530833790084527e-05, + "loss": 0.6858, + "step": 5327 + }, + { + "epoch": 0.40316295259354545, + "grad_norm": 2.2748143672943115, + "learning_rate": 1.252876202044007e-05, + "loss": 0.8348, + "step": 5328 + }, + { + "epoch": 0.40323862131587906, + "grad_norm": 2.3512051105499268, + "learning_rate": 1.2526690029813123e-05, + "loss": 0.9381, + "step": 5329 + }, + { + "epoch": 0.40331429003821273, + "grad_norm": 2.0239651203155518, + "learning_rate": 1.2524617818333437e-05, + "loss": 0.6707, + "step": 5330 + }, + { + "epoch": 0.40338995876054634, + "grad_norm": 2.503915786743164, + "learning_rate": 1.2522545386130781e-05, + "loss": 0.6709, + "step": 5331 + }, + { + "epoch": 0.40346562748287995, + "grad_norm": 2.1918065547943115, + "learning_rate": 1.2520472733334942e-05, + "loss": 0.901, + "step": 5332 + }, + { + "epoch": 0.40354129620521356, + "grad_norm": 2.0927445888519287, + "learning_rate": 1.2518399860075714e-05, + "loss": 0.8102, + "step": 5333 + }, + { + "epoch": 0.4036169649275472, + "grad_norm": 1.9232884645462036, + "learning_rate": 1.2516326766482908e-05, + "loss": 0.7762, + "step": 5334 + }, + { + "epoch": 0.40369263364988084, + "grad_norm": 2.297513961791992, + "learning_rate": 1.2514253452686346e-05, + "loss": 0.8568, + "step": 5335 + }, + { + "epoch": 0.40376830237221445, + "grad_norm": 2.033656597137451, + "learning_rate": 1.2512179918815865e-05, + "loss": 0.8079, + "step": 5336 + }, + { + "epoch": 0.40384397109454806, + "grad_norm": 1.996671438217163, + "learning_rate": 1.2510106165001317e-05, + "loss": 0.6862, + "step": 5337 + }, + { + "epoch": 0.40391963981688167, + "grad_norm": 2.0469861030578613, + "learning_rate": 1.250803219137257e-05, + "loss": 0.7625, + "step": 5338 + }, + { + "epoch": 0.40399530853921534, + "grad_norm": 1.9562182426452637, + "learning_rate": 1.25059579980595e-05, + "loss": 0.8611, + "step": 5339 + }, + { + "epoch": 0.40407097726154895, + "grad_norm": 2.2968802452087402, + "learning_rate": 1.2503883585192003e-05, + "loss": 0.6937, + "step": 5340 + }, + { + "epoch": 0.40414664598388256, + "grad_norm": 2.1029908657073975, + "learning_rate": 1.2501808952899976e-05, + "loss": 0.6784, + "step": 5341 + }, + { + "epoch": 0.40422231470621617, + "grad_norm": 1.7134768962860107, + "learning_rate": 1.2499734101313355e-05, + "loss": 0.9551, + "step": 5342 + }, + { + "epoch": 0.40429798342854983, + "grad_norm": 2.0898208618164062, + "learning_rate": 1.2497659030562058e-05, + "loss": 0.6126, + "step": 5343 + }, + { + "epoch": 0.40437365215088344, + "grad_norm": 1.7897844314575195, + "learning_rate": 1.2495583740776043e-05, + "loss": 0.9229, + "step": 5344 + }, + { + "epoch": 0.40444932087321706, + "grad_norm": 1.9922789335250854, + "learning_rate": 1.2493508232085271e-05, + "loss": 0.7787, + "step": 5345 + }, + { + "epoch": 
0.40452498959555067, + "grad_norm": 2.0527071952819824, + "learning_rate": 1.2491432504619707e-05, + "loss": 0.764, + "step": 5346 + }, + { + "epoch": 0.4046006583178843, + "grad_norm": 1.69284188747406, + "learning_rate": 1.2489356558509353e-05, + "loss": 0.5887, + "step": 5347 + }, + { + "epoch": 0.40467632704021794, + "grad_norm": 2.769381284713745, + "learning_rate": 1.2487280393884202e-05, + "loss": 0.855, + "step": 5348 + }, + { + "epoch": 0.40475199576255155, + "grad_norm": 1.9943363666534424, + "learning_rate": 1.2485204010874276e-05, + "loss": 0.6139, + "step": 5349 + }, + { + "epoch": 0.40482766448488516, + "grad_norm": 2.07372784614563, + "learning_rate": 1.2483127409609598e-05, + "loss": 0.8462, + "step": 5350 + }, + { + "epoch": 0.4049033332072188, + "grad_norm": 2.265497922897339, + "learning_rate": 1.248105059022022e-05, + "loss": 0.6575, + "step": 5351 + }, + { + "epoch": 0.40497900192955244, + "grad_norm": 2.506788969039917, + "learning_rate": 1.2478973552836195e-05, + "loss": 0.7756, + "step": 5352 + }, + { + "epoch": 0.40505467065188605, + "grad_norm": 1.914207935333252, + "learning_rate": 1.2476896297587592e-05, + "loss": 0.7362, + "step": 5353 + }, + { + "epoch": 0.40513033937421966, + "grad_norm": 2.050699234008789, + "learning_rate": 1.2474818824604498e-05, + "loss": 0.6945, + "step": 5354 + }, + { + "epoch": 0.40520600809655327, + "grad_norm": 2.0397143363952637, + "learning_rate": 1.2472741134017008e-05, + "loss": 0.7752, + "step": 5355 + }, + { + "epoch": 0.40528167681888694, + "grad_norm": 2.459721803665161, + "learning_rate": 1.2470663225955239e-05, + "loss": 0.8221, + "step": 5356 + }, + { + "epoch": 0.40535734554122055, + "grad_norm": 1.8895254135131836, + "learning_rate": 1.2468585100549311e-05, + "loss": 0.6238, + "step": 5357 + }, + { + "epoch": 0.40543301426355416, + "grad_norm": 2.120483875274658, + "learning_rate": 1.2466506757929369e-05, + "loss": 0.6015, + "step": 5358 + }, + { + "epoch": 0.40550868298588777, + "grad_norm": 1.9569705724716187, + "learning_rate": 1.2464428198225558e-05, + "loss": 0.704, + "step": 5359 + }, + { + "epoch": 0.4055843517082214, + "grad_norm": 1.8289756774902344, + "learning_rate": 1.2462349421568047e-05, + "loss": 0.701, + "step": 5360 + }, + { + "epoch": 0.40566002043055505, + "grad_norm": 1.8984501361846924, + "learning_rate": 1.246027042808702e-05, + "loss": 0.6924, + "step": 5361 + }, + { + "epoch": 0.40573568915288866, + "grad_norm": 2.5578982830047607, + "learning_rate": 1.2458191217912664e-05, + "loss": 0.8879, + "step": 5362 + }, + { + "epoch": 0.40581135787522227, + "grad_norm": 2.1358728408813477, + "learning_rate": 1.2456111791175193e-05, + "loss": 0.7446, + "step": 5363 + }, + { + "epoch": 0.4058870265975559, + "grad_norm": 2.0823116302490234, + "learning_rate": 1.2454032148004819e-05, + "loss": 0.8014, + "step": 5364 + }, + { + "epoch": 0.40596269531988954, + "grad_norm": 1.6264188289642334, + "learning_rate": 1.2451952288531781e-05, + "loss": 0.7815, + "step": 5365 + }, + { + "epoch": 0.40603836404222315, + "grad_norm": 2.0880937576293945, + "learning_rate": 1.2449872212886328e-05, + "loss": 0.6668, + "step": 5366 + }, + { + "epoch": 0.40611403276455676, + "grad_norm": 2.2300379276275635, + "learning_rate": 1.2447791921198715e-05, + "loss": 0.7545, + "step": 5367 + }, + { + "epoch": 0.4061897014868904, + "grad_norm": 1.5105425119400024, + "learning_rate": 1.2445711413599226e-05, + "loss": 0.8274, + "step": 5368 + }, + { + "epoch": 0.40626537020922404, + "grad_norm": 2.3562936782836914, + "learning_rate": 
1.2443630690218137e-05, + "loss": 0.8011, + "step": 5369 + }, + { + "epoch": 0.40634103893155765, + "grad_norm": 2.463721990585327, + "learning_rate": 1.2441549751185762e-05, + "loss": 0.706, + "step": 5370 + }, + { + "epoch": 0.40641670765389126, + "grad_norm": 2.0283122062683105, + "learning_rate": 1.2439468596632408e-05, + "loss": 0.7052, + "step": 5371 + }, + { + "epoch": 0.4064923763762249, + "grad_norm": 1.6645065546035767, + "learning_rate": 1.2437387226688404e-05, + "loss": 0.5734, + "step": 5372 + }, + { + "epoch": 0.4065680450985585, + "grad_norm": 2.11430025100708, + "learning_rate": 1.2435305641484095e-05, + "loss": 0.7679, + "step": 5373 + }, + { + "epoch": 0.40664371382089215, + "grad_norm": 2.3016443252563477, + "learning_rate": 1.2433223841149837e-05, + "loss": 0.7524, + "step": 5374 + }, + { + "epoch": 0.40671938254322576, + "grad_norm": 2.4958677291870117, + "learning_rate": 1.2431141825815998e-05, + "loss": 0.7631, + "step": 5375 + }, + { + "epoch": 0.40679505126555937, + "grad_norm": 1.683720350265503, + "learning_rate": 1.2429059595612957e-05, + "loss": 0.7059, + "step": 5376 + }, + { + "epoch": 0.406870719987893, + "grad_norm": 1.871661901473999, + "learning_rate": 1.2426977150671117e-05, + "loss": 0.6518, + "step": 5377 + }, + { + "epoch": 0.40694638871022665, + "grad_norm": 1.8744332790374756, + "learning_rate": 1.2424894491120879e-05, + "loss": 0.7192, + "step": 5378 + }, + { + "epoch": 0.40702205743256026, + "grad_norm": 2.238365888595581, + "learning_rate": 1.2422811617092675e-05, + "loss": 0.7441, + "step": 5379 + }, + { + "epoch": 0.40709772615489387, + "grad_norm": 2.139251232147217, + "learning_rate": 1.2420728528716933e-05, + "loss": 0.7847, + "step": 5380 + }, + { + "epoch": 0.4071733948772275, + "grad_norm": 2.121941328048706, + "learning_rate": 1.241864522612411e-05, + "loss": 0.781, + "step": 5381 + }, + { + "epoch": 0.40724906359956115, + "grad_norm": 2.219752788543701, + "learning_rate": 1.2416561709444665e-05, + "loss": 0.815, + "step": 5382 + }, + { + "epoch": 0.40732473232189476, + "grad_norm": 2.355746030807495, + "learning_rate": 1.2414477978809075e-05, + "loss": 0.8222, + "step": 5383 + }, + { + "epoch": 0.40740040104422837, + "grad_norm": 2.5740647315979004, + "learning_rate": 1.241239403434783e-05, + "loss": 0.8416, + "step": 5384 + }, + { + "epoch": 0.407476069766562, + "grad_norm": 2.1180238723754883, + "learning_rate": 1.2410309876191433e-05, + "loss": 0.8138, + "step": 5385 + }, + { + "epoch": 0.4075517384888956, + "grad_norm": 2.1506288051605225, + "learning_rate": 1.2408225504470402e-05, + "loss": 0.7021, + "step": 5386 + }, + { + "epoch": 0.40762740721122925, + "grad_norm": 3.2336843013763428, + "learning_rate": 1.2406140919315265e-05, + "loss": 0.8422, + "step": 5387 + }, + { + "epoch": 0.40770307593356286, + "grad_norm": 2.023808240890503, + "learning_rate": 1.2404056120856568e-05, + "loss": 0.7343, + "step": 5388 + }, + { + "epoch": 0.4077787446558965, + "grad_norm": 1.8890466690063477, + "learning_rate": 1.2401971109224865e-05, + "loss": 0.6938, + "step": 5389 + }, + { + "epoch": 0.4078544133782301, + "grad_norm": 2.454148530960083, + "learning_rate": 1.239988588455073e-05, + "loss": 0.8247, + "step": 5390 + }, + { + "epoch": 0.40793008210056375, + "grad_norm": 2.348931312561035, + "learning_rate": 1.2397800446964743e-05, + "loss": 0.6928, + "step": 5391 + }, + { + "epoch": 0.40800575082289736, + "grad_norm": 1.7471752166748047, + "learning_rate": 1.2395714796597503e-05, + "loss": 0.7767, + "step": 5392 + }, + { + "epoch": 
0.408081419545231, + "grad_norm": 2.375242233276367, + "learning_rate": 1.239362893357962e-05, + "loss": 0.6983, + "step": 5393 + }, + { + "epoch": 0.4081570882675646, + "grad_norm": 2.8043153285980225, + "learning_rate": 1.2391542858041716e-05, + "loss": 0.6071, + "step": 5394 + }, + { + "epoch": 0.40823275698989825, + "grad_norm": 2.196038246154785, + "learning_rate": 1.238945657011443e-05, + "loss": 0.7422, + "step": 5395 + }, + { + "epoch": 0.40830842571223186, + "grad_norm": 2.552044153213501, + "learning_rate": 1.2387370069928408e-05, + "loss": 0.8483, + "step": 5396 + }, + { + "epoch": 0.40838409443456547, + "grad_norm": 2.0079550743103027, + "learning_rate": 1.2385283357614319e-05, + "loss": 0.7005, + "step": 5397 + }, + { + "epoch": 0.4084597631568991, + "grad_norm": 3.333538055419922, + "learning_rate": 1.2383196433302832e-05, + "loss": 0.6432, + "step": 5398 + }, + { + "epoch": 0.40853543187923275, + "grad_norm": 2.925452947616577, + "learning_rate": 1.2381109297124649e-05, + "loss": 0.7974, + "step": 5399 + }, + { + "epoch": 0.40861110060156636, + "grad_norm": 2.683720350265503, + "learning_rate": 1.2379021949210461e-05, + "loss": 0.7273, + "step": 5400 + }, + { + "epoch": 0.40868676932389997, + "grad_norm": 2.577501058578491, + "learning_rate": 1.2376934389690992e-05, + "loss": 0.8398, + "step": 5401 + }, + { + "epoch": 0.4087624380462336, + "grad_norm": 1.819061040878296, + "learning_rate": 1.2374846618696968e-05, + "loss": 0.6676, + "step": 5402 + }, + { + "epoch": 0.4088381067685672, + "grad_norm": 1.9940983057022095, + "learning_rate": 1.2372758636359129e-05, + "loss": 0.7512, + "step": 5403 + }, + { + "epoch": 0.40891377549090085, + "grad_norm": 2.4478976726531982, + "learning_rate": 1.2370670442808242e-05, + "loss": 0.7858, + "step": 5404 + }, + { + "epoch": 0.40898944421323447, + "grad_norm": 1.8682414293289185, + "learning_rate": 1.2368582038175066e-05, + "loss": 0.6828, + "step": 5405 + }, + { + "epoch": 0.4090651129355681, + "grad_norm": 1.9492807388305664, + "learning_rate": 1.2366493422590389e-05, + "loss": 0.5985, + "step": 5406 + }, + { + "epoch": 0.4091407816579017, + "grad_norm": 2.0559017658233643, + "learning_rate": 1.2364404596185005e-05, + "loss": 0.7456, + "step": 5407 + }, + { + "epoch": 0.40921645038023535, + "grad_norm": 1.9211013317108154, + "learning_rate": 1.2362315559089724e-05, + "loss": 0.8048, + "step": 5408 + }, + { + "epoch": 0.40929211910256896, + "grad_norm": 2.1894242763519287, + "learning_rate": 1.2360226311435368e-05, + "loss": 0.6719, + "step": 5409 + }, + { + "epoch": 0.4093677878249026, + "grad_norm": 2.1584041118621826, + "learning_rate": 1.235813685335277e-05, + "loss": 0.7722, + "step": 5410 + }, + { + "epoch": 0.4094434565472362, + "grad_norm": 2.0298099517822266, + "learning_rate": 1.235604718497278e-05, + "loss": 0.7631, + "step": 5411 + }, + { + "epoch": 0.40951912526956985, + "grad_norm": 2.1612000465393066, + "learning_rate": 1.2353957306426264e-05, + "loss": 0.7931, + "step": 5412 + }, + { + "epoch": 0.40959479399190346, + "grad_norm": 2.1110432147979736, + "learning_rate": 1.2351867217844091e-05, + "loss": 0.7961, + "step": 5413 + }, + { + "epoch": 0.40967046271423707, + "grad_norm": 2.3404643535614014, + "learning_rate": 1.2349776919357153e-05, + "loss": 0.7194, + "step": 5414 + }, + { + "epoch": 0.4097461314365707, + "grad_norm": 2.3472416400909424, + "learning_rate": 1.2347686411096347e-05, + "loss": 0.7346, + "step": 5415 + }, + { + "epoch": 0.4098218001589043, + "grad_norm": 2.5707545280456543, + "learning_rate": 
1.2345595693192594e-05, + "loss": 0.7142, + "step": 5416 + }, + { + "epoch": 0.40989746888123796, + "grad_norm": 1.96560800075531, + "learning_rate": 1.2343504765776816e-05, + "loss": 0.8404, + "step": 5417 + }, + { + "epoch": 0.40997313760357157, + "grad_norm": 1.9135947227478027, + "learning_rate": 1.2341413628979957e-05, + "loss": 0.697, + "step": 5418 + }, + { + "epoch": 0.4100488063259052, + "grad_norm": 2.3961076736450195, + "learning_rate": 1.2339322282932964e-05, + "loss": 0.7307, + "step": 5419 + }, + { + "epoch": 0.4101244750482388, + "grad_norm": 2.5133774280548096, + "learning_rate": 1.2337230727766815e-05, + "loss": 0.6817, + "step": 5420 + }, + { + "epoch": 0.41020014377057246, + "grad_norm": 2.319206714630127, + "learning_rate": 1.233513896361248e-05, + "loss": 0.7036, + "step": 5421 + }, + { + "epoch": 0.41027581249290607, + "grad_norm": 2.3848443031311035, + "learning_rate": 1.2333046990600959e-05, + "loss": 0.7175, + "step": 5422 + }, + { + "epoch": 0.4103514812152397, + "grad_norm": 3.0399367809295654, + "learning_rate": 1.2330954808863253e-05, + "loss": 0.649, + "step": 5423 + }, + { + "epoch": 0.4104271499375733, + "grad_norm": 2.1274526119232178, + "learning_rate": 1.2328862418530381e-05, + "loss": 0.7032, + "step": 5424 + }, + { + "epoch": 0.41050281865990695, + "grad_norm": 2.8769493103027344, + "learning_rate": 1.2326769819733382e-05, + "loss": 0.7368, + "step": 5425 + }, + { + "epoch": 0.41057848738224056, + "grad_norm": 2.2980008125305176, + "learning_rate": 1.2324677012603294e-05, + "loss": 0.7648, + "step": 5426 + }, + { + "epoch": 0.4106541561045742, + "grad_norm": 2.084308624267578, + "learning_rate": 1.232258399727118e-05, + "loss": 0.7854, + "step": 5427 + }, + { + "epoch": 0.4107298248269078, + "grad_norm": 1.9208357334136963, + "learning_rate": 1.232049077386811e-05, + "loss": 0.6656, + "step": 5428 + }, + { + "epoch": 0.4108054935492414, + "grad_norm": 1.9888135194778442, + "learning_rate": 1.2318397342525164e-05, + "loss": 0.693, + "step": 5429 + }, + { + "epoch": 0.41088116227157506, + "grad_norm": 2.4940552711486816, + "learning_rate": 1.2316303703373448e-05, + "loss": 0.8753, + "step": 5430 + }, + { + "epoch": 0.4109568309939087, + "grad_norm": 1.7741725444793701, + "learning_rate": 1.2314209856544064e-05, + "loss": 0.5688, + "step": 5431 + }, + { + "epoch": 0.4110324997162423, + "grad_norm": 2.1441521644592285, + "learning_rate": 1.2312115802168144e-05, + "loss": 0.7552, + "step": 5432 + }, + { + "epoch": 0.4111081684385759, + "grad_norm": 2.1466495990753174, + "learning_rate": 1.2310021540376815e-05, + "loss": 0.7369, + "step": 5433 + }, + { + "epoch": 0.41118383716090956, + "grad_norm": 2.301936149597168, + "learning_rate": 1.2307927071301235e-05, + "loss": 0.7292, + "step": 5434 + }, + { + "epoch": 0.41125950588324317, + "grad_norm": 1.9242736101150513, + "learning_rate": 1.230583239507256e-05, + "loss": 0.7407, + "step": 5435 + }, + { + "epoch": 0.4113351746055768, + "grad_norm": 2.5096495151519775, + "learning_rate": 1.2303737511821969e-05, + "loss": 0.6824, + "step": 5436 + }, + { + "epoch": 0.4114108433279104, + "grad_norm": 2.5667378902435303, + "learning_rate": 1.2301642421680649e-05, + "loss": 0.7517, + "step": 5437 + }, + { + "epoch": 0.41148651205024406, + "grad_norm": 2.038986921310425, + "learning_rate": 1.2299547124779803e-05, + "loss": 0.6629, + "step": 5438 + }, + { + "epoch": 0.41156218077257767, + "grad_norm": 2.1099348068237305, + "learning_rate": 1.2297451621250643e-05, + "loss": 0.7357, + "step": 5439 + }, + { + "epoch": 
0.4116378494949113, + "grad_norm": 1.7685575485229492, + "learning_rate": 1.2295355911224398e-05, + "loss": 0.8865, + "step": 5440 + }, + { + "epoch": 0.4117135182172449, + "grad_norm": 2.0220160484313965, + "learning_rate": 1.2293259994832306e-05, + "loss": 0.7703, + "step": 5441 + }, + { + "epoch": 0.4117891869395785, + "grad_norm": 1.9990547895431519, + "learning_rate": 1.2291163872205624e-05, + "loss": 0.8773, + "step": 5442 + }, + { + "epoch": 0.41186485566191217, + "grad_norm": 2.4579837322235107, + "learning_rate": 1.2289067543475613e-05, + "loss": 0.7528, + "step": 5443 + }, + { + "epoch": 0.4119405243842458, + "grad_norm": 2.3881900310516357, + "learning_rate": 1.2286971008773552e-05, + "loss": 0.8378, + "step": 5444 + }, + { + "epoch": 0.4120161931065794, + "grad_norm": 2.1781957149505615, + "learning_rate": 1.228487426823074e-05, + "loss": 0.824, + "step": 5445 + }, + { + "epoch": 0.412091861828913, + "grad_norm": 2.6942150592803955, + "learning_rate": 1.2282777321978474e-05, + "loss": 0.8154, + "step": 5446 + }, + { + "epoch": 0.41216753055124666, + "grad_norm": 1.4597055912017822, + "learning_rate": 1.2280680170148075e-05, + "loss": 0.7794, + "step": 5447 + }, + { + "epoch": 0.4122431992735803, + "grad_norm": 2.1186511516571045, + "learning_rate": 1.2278582812870874e-05, + "loss": 0.6442, + "step": 5448 + }, + { + "epoch": 0.4123188679959139, + "grad_norm": 1.8283412456512451, + "learning_rate": 1.2276485250278211e-05, + "loss": 0.8088, + "step": 5449 + }, + { + "epoch": 0.4123945367182475, + "grad_norm": 2.1991498470306396, + "learning_rate": 1.2274387482501444e-05, + "loss": 0.7892, + "step": 5450 + }, + { + "epoch": 0.41247020544058116, + "grad_norm": 1.786555290222168, + "learning_rate": 1.2272289509671943e-05, + "loss": 0.6558, + "step": 5451 + }, + { + "epoch": 0.41254587416291477, + "grad_norm": 1.8185572624206543, + "learning_rate": 1.227019133192109e-05, + "loss": 0.8318, + "step": 5452 + }, + { + "epoch": 0.4126215428852484, + "grad_norm": 2.238388776779175, + "learning_rate": 1.2268092949380277e-05, + "loss": 0.6967, + "step": 5453 + }, + { + "epoch": 0.412697211607582, + "grad_norm": 1.840320348739624, + "learning_rate": 1.2265994362180915e-05, + "loss": 0.7751, + "step": 5454 + }, + { + "epoch": 0.4127728803299156, + "grad_norm": 2.3982057571411133, + "learning_rate": 1.2263895570454424e-05, + "loss": 0.7067, + "step": 5455 + }, + { + "epoch": 0.41284854905224927, + "grad_norm": 3.4811136722564697, + "learning_rate": 1.2261796574332232e-05, + "loss": 0.6473, + "step": 5456 + }, + { + "epoch": 0.4129242177745829, + "grad_norm": 1.9703245162963867, + "learning_rate": 1.225969737394579e-05, + "loss": 0.7064, + "step": 5457 + }, + { + "epoch": 0.4129998864969165, + "grad_norm": 2.2226948738098145, + "learning_rate": 1.2257597969426555e-05, + "loss": 0.6056, + "step": 5458 + }, + { + "epoch": 0.4130755552192501, + "grad_norm": 2.0323541164398193, + "learning_rate": 1.2255498360905998e-05, + "loss": 0.7867, + "step": 5459 + }, + { + "epoch": 0.41315122394158377, + "grad_norm": 1.938133716583252, + "learning_rate": 1.2253398548515604e-05, + "loss": 0.6533, + "step": 5460 + }, + { + "epoch": 0.4132268926639174, + "grad_norm": 2.4267141819000244, + "learning_rate": 1.2251298532386874e-05, + "loss": 0.6883, + "step": 5461 + }, + { + "epoch": 0.413302561386251, + "grad_norm": 2.085056781768799, + "learning_rate": 1.224919831265131e-05, + "loss": 0.6782, + "step": 5462 + }, + { + "epoch": 0.4133782301085846, + "grad_norm": 1.7166036367416382, + "learning_rate": 
1.2247097889440441e-05, + "loss": 0.8407, + "step": 5463 + }, + { + "epoch": 0.41345389883091826, + "grad_norm": 1.9741954803466797, + "learning_rate": 1.2244997262885797e-05, + "loss": 0.6178, + "step": 5464 + }, + { + "epoch": 0.4135295675532519, + "grad_norm": 3.0332493782043457, + "learning_rate": 1.224289643311893e-05, + "loss": 0.5895, + "step": 5465 + }, + { + "epoch": 0.4136052362755855, + "grad_norm": 2.2371206283569336, + "learning_rate": 1.2240795400271402e-05, + "loss": 0.6845, + "step": 5466 + }, + { + "epoch": 0.4136809049979191, + "grad_norm": 2.590519666671753, + "learning_rate": 1.223869416447478e-05, + "loss": 0.7363, + "step": 5467 + }, + { + "epoch": 0.4137565737202527, + "grad_norm": 2.10429310798645, + "learning_rate": 1.2236592725860656e-05, + "loss": 0.7608, + "step": 5468 + }, + { + "epoch": 0.4138322424425864, + "grad_norm": 2.429518461227417, + "learning_rate": 1.2234491084560629e-05, + "loss": 0.6788, + "step": 5469 + }, + { + "epoch": 0.41390791116492, + "grad_norm": 2.2328531742095947, + "learning_rate": 1.2232389240706306e-05, + "loss": 0.7914, + "step": 5470 + }, + { + "epoch": 0.4139835798872536, + "grad_norm": 1.950385570526123, + "learning_rate": 1.2230287194429316e-05, + "loss": 0.777, + "step": 5471 + }, + { + "epoch": 0.4140592486095872, + "grad_norm": 2.1990959644317627, + "learning_rate": 1.2228184945861291e-05, + "loss": 0.8321, + "step": 5472 + }, + { + "epoch": 0.41413491733192087, + "grad_norm": 2.1966259479522705, + "learning_rate": 1.2226082495133886e-05, + "loss": 0.644, + "step": 5473 + }, + { + "epoch": 0.4142105860542545, + "grad_norm": 2.390727996826172, + "learning_rate": 1.2223979842378756e-05, + "loss": 0.6661, + "step": 5474 + }, + { + "epoch": 0.4142862547765881, + "grad_norm": 2.415733575820923, + "learning_rate": 1.2221876987727586e-05, + "loss": 0.7288, + "step": 5475 + }, + { + "epoch": 0.4143619234989217, + "grad_norm": 2.3557534217834473, + "learning_rate": 1.2219773931312057e-05, + "loss": 0.6913, + "step": 5476 + }, + { + "epoch": 0.41443759222125537, + "grad_norm": 2.2471041679382324, + "learning_rate": 1.221767067326387e-05, + "loss": 0.7311, + "step": 5477 + }, + { + "epoch": 0.414513260943589, + "grad_norm": 1.8936131000518799, + "learning_rate": 1.221556721371474e-05, + "loss": 0.6176, + "step": 5478 + }, + { + "epoch": 0.4145889296659226, + "grad_norm": 1.792964220046997, + "learning_rate": 1.2213463552796388e-05, + "loss": 0.6454, + "step": 5479 + }, + { + "epoch": 0.4146645983882562, + "grad_norm": 2.177844762802124, + "learning_rate": 1.2211359690640556e-05, + "loss": 0.8097, + "step": 5480 + }, + { + "epoch": 0.4147402671105898, + "grad_norm": 1.9635275602340698, + "learning_rate": 1.2209255627378992e-05, + "loss": 0.7149, + "step": 5481 + }, + { + "epoch": 0.4148159358329235, + "grad_norm": 1.988793969154358, + "learning_rate": 1.2207151363143462e-05, + "loss": 0.7471, + "step": 5482 + }, + { + "epoch": 0.4148916045552571, + "grad_norm": 2.298090696334839, + "learning_rate": 1.220504689806574e-05, + "loss": 0.8539, + "step": 5483 + }, + { + "epoch": 0.4149672732775907, + "grad_norm": 1.7271684408187866, + "learning_rate": 1.2202942232277616e-05, + "loss": 0.8253, + "step": 5484 + }, + { + "epoch": 0.4150429419999243, + "grad_norm": 1.700923204421997, + "learning_rate": 1.2200837365910887e-05, + "loss": 0.7333, + "step": 5485 + }, + { + "epoch": 0.415118610722258, + "grad_norm": 2.043684482574463, + "learning_rate": 1.2198732299097373e-05, + "loss": 0.7908, + "step": 5486 + }, + { + "epoch": 0.4151942794445916, + 
"grad_norm": 1.729766845703125, + "learning_rate": 1.2196627031968894e-05, + "loss": 0.691, + "step": 5487 + }, + { + "epoch": 0.4152699481669252, + "grad_norm": 2.202939033508301, + "learning_rate": 1.2194521564657293e-05, + "loss": 0.8334, + "step": 5488 + }, + { + "epoch": 0.4153456168892588, + "grad_norm": 1.9363715648651123, + "learning_rate": 1.2192415897294418e-05, + "loss": 0.7155, + "step": 5489 + }, + { + "epoch": 0.4154212856115925, + "grad_norm": 2.62788724899292, + "learning_rate": 1.2190310030012132e-05, + "loss": 0.6351, + "step": 5490 + }, + { + "epoch": 0.4154969543339261, + "grad_norm": 1.856323480606079, + "learning_rate": 1.2188203962942318e-05, + "loss": 0.7471, + "step": 5491 + }, + { + "epoch": 0.4155726230562597, + "grad_norm": 2.280324935913086, + "learning_rate": 1.2186097696216856e-05, + "loss": 0.6655, + "step": 5492 + }, + { + "epoch": 0.4156482917785933, + "grad_norm": 21.438453674316406, + "learning_rate": 1.2183991229967652e-05, + "loss": 0.615, + "step": 5493 + }, + { + "epoch": 0.4157239605009269, + "grad_norm": 2.4099819660186768, + "learning_rate": 1.2181884564326618e-05, + "loss": 0.7488, + "step": 5494 + }, + { + "epoch": 0.4157996292232606, + "grad_norm": 2.0276858806610107, + "learning_rate": 1.2179777699425683e-05, + "loss": 0.7315, + "step": 5495 + }, + { + "epoch": 0.4158752979455942, + "grad_norm": 2.4924838542938232, + "learning_rate": 1.2177670635396786e-05, + "loss": 0.6686, + "step": 5496 + }, + { + "epoch": 0.4159509666679278, + "grad_norm": 2.914191961288452, + "learning_rate": 1.2175563372371872e-05, + "loss": 0.9143, + "step": 5497 + }, + { + "epoch": 0.4160266353902614, + "grad_norm": 3.372140884399414, + "learning_rate": 1.217345591048291e-05, + "loss": 0.6866, + "step": 5498 + }, + { + "epoch": 0.4161023041125951, + "grad_norm": 3.612203598022461, + "learning_rate": 1.2171348249861874e-05, + "loss": 0.7108, + "step": 5499 + }, + { + "epoch": 0.4161779728349287, + "grad_norm": 1.8624509572982788, + "learning_rate": 1.2169240390640753e-05, + "loss": 0.8142, + "step": 5500 + }, + { + "epoch": 0.4162536415572623, + "grad_norm": 2.179865837097168, + "learning_rate": 1.216713233295155e-05, + "loss": 0.7727, + "step": 5501 + }, + { + "epoch": 0.4163293102795959, + "grad_norm": 2.4808688163757324, + "learning_rate": 1.2165024076926276e-05, + "loss": 0.7229, + "step": 5502 + }, + { + "epoch": 0.4164049790019296, + "grad_norm": 1.9209163188934326, + "learning_rate": 1.2162915622696955e-05, + "loss": 0.6675, + "step": 5503 + }, + { + "epoch": 0.4164806477242632, + "grad_norm": 2.1031787395477295, + "learning_rate": 1.216080697039563e-05, + "loss": 0.7635, + "step": 5504 + }, + { + "epoch": 0.4165563164465968, + "grad_norm": 2.6245055198669434, + "learning_rate": 1.215869812015435e-05, + "loss": 0.7576, + "step": 5505 + }, + { + "epoch": 0.4166319851689304, + "grad_norm": 1.9226709604263306, + "learning_rate": 1.2156589072105175e-05, + "loss": 0.7822, + "step": 5506 + }, + { + "epoch": 0.4167076538912641, + "grad_norm": 2.297623872756958, + "learning_rate": 1.2154479826380185e-05, + "loss": 0.8283, + "step": 5507 + }, + { + "epoch": 0.4167833226135977, + "grad_norm": 2.166672706604004, + "learning_rate": 1.215237038311146e-05, + "loss": 0.8331, + "step": 5508 + }, + { + "epoch": 0.4168589913359313, + "grad_norm": 1.8679463863372803, + "learning_rate": 1.215026074243111e-05, + "loss": 0.749, + "step": 5509 + }, + { + "epoch": 0.4169346600582649, + "grad_norm": 2.0886306762695312, + "learning_rate": 1.2148150904471246e-05, + "loss": 0.6835, + 
"step": 5510 + }, + { + "epoch": 0.4170103287805985, + "grad_norm": 1.7681407928466797, + "learning_rate": 1.2146040869363986e-05, + "loss": 0.768, + "step": 5511 + }, + { + "epoch": 0.4170859975029322, + "grad_norm": 2.234034538269043, + "learning_rate": 1.2143930637241473e-05, + "loss": 0.7622, + "step": 5512 + }, + { + "epoch": 0.4171616662252658, + "grad_norm": 2.314732789993286, + "learning_rate": 1.2141820208235851e-05, + "loss": 0.778, + "step": 5513 + }, + { + "epoch": 0.4172373349475994, + "grad_norm": 2.147493362426758, + "learning_rate": 1.213970958247929e-05, + "loss": 0.644, + "step": 5514 + }, + { + "epoch": 0.417313003669933, + "grad_norm": 2.0052413940429688, + "learning_rate": 1.2137598760103958e-05, + "loss": 0.8898, + "step": 5515 + }, + { + "epoch": 0.4173886723922667, + "grad_norm": 2.1926968097686768, + "learning_rate": 1.2135487741242043e-05, + "loss": 0.8162, + "step": 5516 + }, + { + "epoch": 0.4174643411146003, + "grad_norm": 2.2886886596679688, + "learning_rate": 1.2133376526025745e-05, + "loss": 0.6808, + "step": 5517 + }, + { + "epoch": 0.4175400098369339, + "grad_norm": 2.391803503036499, + "learning_rate": 1.2131265114587274e-05, + "loss": 0.7002, + "step": 5518 + }, + { + "epoch": 0.4176156785592675, + "grad_norm": 2.0181946754455566, + "learning_rate": 1.2129153507058856e-05, + "loss": 0.7994, + "step": 5519 + }, + { + "epoch": 0.4176913472816012, + "grad_norm": 2.0882043838500977, + "learning_rate": 1.2127041703572722e-05, + "loss": 0.6383, + "step": 5520 + }, + { + "epoch": 0.4177670160039348, + "grad_norm": 1.928208351135254, + "learning_rate": 1.2124929704261123e-05, + "loss": 0.745, + "step": 5521 + }, + { + "epoch": 0.4178426847262684, + "grad_norm": 2.641408681869507, + "learning_rate": 1.212281750925632e-05, + "loss": 1.0537, + "step": 5522 + }, + { + "epoch": 0.417918353448602, + "grad_norm": 2.605942726135254, + "learning_rate": 1.2120705118690581e-05, + "loss": 0.6757, + "step": 5523 + }, + { + "epoch": 0.4179940221709356, + "grad_norm": 2.2262070178985596, + "learning_rate": 1.2118592532696196e-05, + "loss": 0.8022, + "step": 5524 + }, + { + "epoch": 0.4180696908932693, + "grad_norm": 2.1038734912872314, + "learning_rate": 1.2116479751405461e-05, + "loss": 0.6194, + "step": 5525 + }, + { + "epoch": 0.4181453596156029, + "grad_norm": 1.9260424375534058, + "learning_rate": 1.2114366774950681e-05, + "loss": 0.6886, + "step": 5526 + }, + { + "epoch": 0.4182210283379365, + "grad_norm": 1.8009731769561768, + "learning_rate": 1.2112253603464182e-05, + "loss": 0.5855, + "step": 5527 + }, + { + "epoch": 0.4182966970602701, + "grad_norm": 4.206608772277832, + "learning_rate": 1.2110140237078297e-05, + "loss": 0.8015, + "step": 5528 + }, + { + "epoch": 0.4183723657826038, + "grad_norm": 2.875774621963501, + "learning_rate": 1.2108026675925371e-05, + "loss": 0.7709, + "step": 5529 + }, + { + "epoch": 0.4184480345049374, + "grad_norm": 2.3251543045043945, + "learning_rate": 1.2105912920137762e-05, + "loss": 0.7194, + "step": 5530 + }, + { + "epoch": 0.418523703227271, + "grad_norm": 1.7996389865875244, + "learning_rate": 1.2103798969847836e-05, + "loss": 0.805, + "step": 5531 + }, + { + "epoch": 0.4185993719496046, + "grad_norm": 2.323073148727417, + "learning_rate": 1.2101684825187985e-05, + "loss": 0.7145, + "step": 5532 + }, + { + "epoch": 0.4186750406719383, + "grad_norm": 3.068136692047119, + "learning_rate": 1.2099570486290597e-05, + "loss": 0.8114, + "step": 5533 + }, + { + "epoch": 0.4187507093942719, + "grad_norm": 2.661367416381836, + 
"learning_rate": 1.209745595328808e-05, + "loss": 0.7873, + "step": 5534 + }, + { + "epoch": 0.4188263781166055, + "grad_norm": 1.9999775886535645, + "learning_rate": 1.2095341226312853e-05, + "loss": 0.7032, + "step": 5535 + }, + { + "epoch": 0.4189020468389391, + "grad_norm": 2.4388561248779297, + "learning_rate": 1.2093226305497341e-05, + "loss": 0.9638, + "step": 5536 + }, + { + "epoch": 0.4189777155612727, + "grad_norm": 2.280811309814453, + "learning_rate": 1.2091111190974e-05, + "loss": 0.7426, + "step": 5537 + }, + { + "epoch": 0.4190533842836064, + "grad_norm": 2.1886045932769775, + "learning_rate": 1.2088995882875275e-05, + "loss": 0.7784, + "step": 5538 + }, + { + "epoch": 0.41912905300594, + "grad_norm": 2.459237813949585, + "learning_rate": 1.208688038133364e-05, + "loss": 0.8087, + "step": 5539 + }, + { + "epoch": 0.4192047217282736, + "grad_norm": 2.0118658542633057, + "learning_rate": 1.2084764686481569e-05, + "loss": 0.7171, + "step": 5540 + }, + { + "epoch": 0.4192803904506072, + "grad_norm": 2.3412704467773438, + "learning_rate": 1.2082648798451555e-05, + "loss": 0.7725, + "step": 5541 + }, + { + "epoch": 0.4193560591729409, + "grad_norm": 1.796249270439148, + "learning_rate": 1.2080532717376106e-05, + "loss": 0.7044, + "step": 5542 + }, + { + "epoch": 0.4194317278952745, + "grad_norm": 2.0164694786071777, + "learning_rate": 1.2078416443387731e-05, + "loss": 0.7137, + "step": 5543 + }, + { + "epoch": 0.4195073966176081, + "grad_norm": 1.7637386322021484, + "learning_rate": 1.2076299976618965e-05, + "loss": 0.7083, + "step": 5544 + }, + { + "epoch": 0.4195830653399417, + "grad_norm": 1.95462167263031, + "learning_rate": 1.207418331720234e-05, + "loss": 0.7147, + "step": 5545 + }, + { + "epoch": 0.4196587340622754, + "grad_norm": 1.7692989110946655, + "learning_rate": 1.2072066465270415e-05, + "loss": 0.7749, + "step": 5546 + }, + { + "epoch": 0.419734402784609, + "grad_norm": 1.8411818742752075, + "learning_rate": 1.2069949420955753e-05, + "loss": 0.6869, + "step": 5547 + }, + { + "epoch": 0.4198100715069426, + "grad_norm": 1.8122678995132446, + "learning_rate": 1.2067832184390928e-05, + "loss": 0.7162, + "step": 5548 + }, + { + "epoch": 0.4198857402292762, + "grad_norm": 1.7828391790390015, + "learning_rate": 1.206571475570853e-05, + "loss": 0.6865, + "step": 5549 + }, + { + "epoch": 0.4199614089516098, + "grad_norm": 2.397252082824707, + "learning_rate": 1.2063597135041156e-05, + "loss": 0.658, + "step": 5550 + }, + { + "epoch": 0.4200370776739435, + "grad_norm": 2.136765956878662, + "learning_rate": 1.2061479322521422e-05, + "loss": 0.935, + "step": 5551 + }, + { + "epoch": 0.4201127463962771, + "grad_norm": 1.9939488172531128, + "learning_rate": 1.2059361318281949e-05, + "loss": 0.7466, + "step": 5552 + }, + { + "epoch": 0.4201884151186107, + "grad_norm": 2.698948860168457, + "learning_rate": 1.2057243122455378e-05, + "loss": 0.6457, + "step": 5553 + }, + { + "epoch": 0.4202640838409443, + "grad_norm": 18.516450881958008, + "learning_rate": 1.2055124735174352e-05, + "loss": 0.5688, + "step": 5554 + }, + { + "epoch": 0.420339752563278, + "grad_norm": 2.335066556930542, + "learning_rate": 1.2053006156571534e-05, + "loss": 0.7693, + "step": 5555 + }, + { + "epoch": 0.4204154212856116, + "grad_norm": 2.284088373184204, + "learning_rate": 1.2050887386779595e-05, + "loss": 0.6985, + "step": 5556 + }, + { + "epoch": 0.4204910900079452, + "grad_norm": 1.7979247570037842, + "learning_rate": 1.2048768425931222e-05, + "loss": 0.8248, + "step": 5557 + }, + { + "epoch": 
0.4205667587302788, + "grad_norm": 1.9598959684371948, + "learning_rate": 1.204664927415911e-05, + "loss": 0.8005, + "step": 5558 + }, + { + "epoch": 0.4206424274526125, + "grad_norm": 1.7757333517074585, + "learning_rate": 1.2044529931595964e-05, + "loss": 0.6955, + "step": 5559 + }, + { + "epoch": 0.4207180961749461, + "grad_norm": 2.177375078201294, + "learning_rate": 1.2042410398374509e-05, + "loss": 0.6263, + "step": 5560 + }, + { + "epoch": 0.4207937648972797, + "grad_norm": 2.13222336769104, + "learning_rate": 1.2040290674627471e-05, + "loss": 0.7584, + "step": 5561 + }, + { + "epoch": 0.4208694336196133, + "grad_norm": 1.8539749383926392, + "learning_rate": 1.20381707604876e-05, + "loss": 0.7735, + "step": 5562 + }, + { + "epoch": 0.42094510234194693, + "grad_norm": 2.649493455886841, + "learning_rate": 1.2036050656087648e-05, + "loss": 0.9243, + "step": 5563 + }, + { + "epoch": 0.4210207710642806, + "grad_norm": 2.361145257949829, + "learning_rate": 1.2033930361560386e-05, + "loss": 0.6677, + "step": 5564 + }, + { + "epoch": 0.4210964397866142, + "grad_norm": 1.8541384935379028, + "learning_rate": 1.2031809877038592e-05, + "loss": 0.9055, + "step": 5565 + }, + { + "epoch": 0.4211721085089478, + "grad_norm": 2.4618043899536133, + "learning_rate": 1.2029689202655054e-05, + "loss": 0.8678, + "step": 5566 + }, + { + "epoch": 0.42124777723128143, + "grad_norm": 2.1291258335113525, + "learning_rate": 1.2027568338542583e-05, + "loss": 0.7327, + "step": 5567 + }, + { + "epoch": 0.4213234459536151, + "grad_norm": 2.079526424407959, + "learning_rate": 1.2025447284833987e-05, + "loss": 0.7069, + "step": 5568 + }, + { + "epoch": 0.4213991146759487, + "grad_norm": 1.8269448280334473, + "learning_rate": 1.2023326041662096e-05, + "loss": 0.7895, + "step": 5569 + }, + { + "epoch": 0.4214747833982823, + "grad_norm": 1.9055373668670654, + "learning_rate": 1.2021204609159753e-05, + "loss": 0.5952, + "step": 5570 + }, + { + "epoch": 0.4215504521206159, + "grad_norm": 2.0945003032684326, + "learning_rate": 1.2019082987459806e-05, + "loss": 0.7579, + "step": 5571 + }, + { + "epoch": 0.4216261208429496, + "grad_norm": 1.7482681274414062, + "learning_rate": 1.2016961176695113e-05, + "loss": 0.6244, + "step": 5572 + }, + { + "epoch": 0.4217017895652832, + "grad_norm": 2.3275108337402344, + "learning_rate": 1.2014839176998557e-05, + "loss": 0.6763, + "step": 5573 + }, + { + "epoch": 0.4217774582876168, + "grad_norm": 1.9964745044708252, + "learning_rate": 1.2012716988503021e-05, + "loss": 0.8104, + "step": 5574 + }, + { + "epoch": 0.4218531270099504, + "grad_norm": 1.7609212398529053, + "learning_rate": 1.20105946113414e-05, + "loss": 0.7107, + "step": 5575 + }, + { + "epoch": 0.42192879573228403, + "grad_norm": 2.2987399101257324, + "learning_rate": 1.200847204564661e-05, + "loss": 0.7246, + "step": 5576 + }, + { + "epoch": 0.4220044644546177, + "grad_norm": 1.8618190288543701, + "learning_rate": 1.2006349291551564e-05, + "loss": 0.6966, + "step": 5577 + }, + { + "epoch": 0.4220801331769513, + "grad_norm": 1.7116061449050903, + "learning_rate": 1.2004226349189208e-05, + "loss": 0.8733, + "step": 5578 + }, + { + "epoch": 0.4221558018992849, + "grad_norm": 2.474656105041504, + "learning_rate": 1.2002103218692479e-05, + "loss": 0.8025, + "step": 5579 + }, + { + "epoch": 0.42223147062161853, + "grad_norm": 1.4539145231246948, + "learning_rate": 1.1999979900194335e-05, + "loss": 0.8741, + "step": 5580 + }, + { + "epoch": 0.4223071393439522, + "grad_norm": 2.465669631958008, + "learning_rate": 
1.1997856393827749e-05, + "loss": 0.7923, + "step": 5581 + }, + { + "epoch": 0.4223828080662858, + "grad_norm": 2.1911604404449463, + "learning_rate": 1.1995732699725697e-05, + "loss": 0.7288, + "step": 5582 + }, + { + "epoch": 0.4224584767886194, + "grad_norm": 1.682003378868103, + "learning_rate": 1.1993608818021176e-05, + "loss": 0.7595, + "step": 5583 + }, + { + "epoch": 0.42253414551095303, + "grad_norm": 2.285404920578003, + "learning_rate": 1.1991484748847187e-05, + "loss": 0.6823, + "step": 5584 + }, + { + "epoch": 0.4226098142332867, + "grad_norm": 3.933152675628662, + "learning_rate": 1.1989360492336747e-05, + "loss": 0.8576, + "step": 5585 + }, + { + "epoch": 0.4226854829556203, + "grad_norm": 2.1262471675872803, + "learning_rate": 1.1987236048622886e-05, + "loss": 0.7692, + "step": 5586 + }, + { + "epoch": 0.4227611516779539, + "grad_norm": 2.068648099899292, + "learning_rate": 1.198511141783864e-05, + "loss": 0.885, + "step": 5587 + }, + { + "epoch": 0.4228368204002875, + "grad_norm": 2.879906177520752, + "learning_rate": 1.1982986600117065e-05, + "loss": 0.744, + "step": 5588 + }, + { + "epoch": 0.42291248912262114, + "grad_norm": 2.1112852096557617, + "learning_rate": 1.198086159559122e-05, + "loss": 0.6636, + "step": 5589 + }, + { + "epoch": 0.4229881578449548, + "grad_norm": 2.5208778381347656, + "learning_rate": 1.1978736404394177e-05, + "loss": 0.7342, + "step": 5590 + }, + { + "epoch": 0.4230638265672884, + "grad_norm": 2.7024381160736084, + "learning_rate": 1.1976611026659029e-05, + "loss": 0.6204, + "step": 5591 + }, + { + "epoch": 0.423139495289622, + "grad_norm": 1.8119574785232544, + "learning_rate": 1.1974485462518872e-05, + "loss": 0.7252, + "step": 5592 + }, + { + "epoch": 0.42321516401195564, + "grad_norm": 2.16031813621521, + "learning_rate": 1.1972359712106811e-05, + "loss": 0.7198, + "step": 5593 + }, + { + "epoch": 0.4232908327342893, + "grad_norm": 1.8880118131637573, + "learning_rate": 1.1970233775555975e-05, + "loss": 0.7329, + "step": 5594 + }, + { + "epoch": 0.4233665014566229, + "grad_norm": 1.758371114730835, + "learning_rate": 1.196810765299949e-05, + "loss": 0.7565, + "step": 5595 + }, + { + "epoch": 0.4234421701789565, + "grad_norm": 2.201699733734131, + "learning_rate": 1.1965981344570504e-05, + "loss": 0.6688, + "step": 5596 + }, + { + "epoch": 0.42351783890129013, + "grad_norm": 1.9135266542434692, + "learning_rate": 1.1963854850402173e-05, + "loss": 0.7328, + "step": 5597 + }, + { + "epoch": 0.4235935076236238, + "grad_norm": 2.1251890659332275, + "learning_rate": 1.1961728170627666e-05, + "loss": 0.7701, + "step": 5598 + }, + { + "epoch": 0.4236691763459574, + "grad_norm": 2.2974348068237305, + "learning_rate": 1.1959601305380163e-05, + "loss": 0.6692, + "step": 5599 + }, + { + "epoch": 0.423744845068291, + "grad_norm": 2.4548261165618896, + "learning_rate": 1.1957474254792851e-05, + "loss": 0.7951, + "step": 5600 + }, + { + "epoch": 0.42382051379062463, + "grad_norm": 2.009052276611328, + "learning_rate": 1.195534701899894e-05, + "loss": 0.7192, + "step": 5601 + }, + { + "epoch": 0.42389618251295824, + "grad_norm": 2.0583083629608154, + "learning_rate": 1.1953219598131634e-05, + "loss": 0.6207, + "step": 5602 + }, + { + "epoch": 0.4239718512352919, + "grad_norm": 1.8193392753601074, + "learning_rate": 1.1951091992324167e-05, + "loss": 0.7451, + "step": 5603 + }, + { + "epoch": 0.4240475199576255, + "grad_norm": 2.209012269973755, + "learning_rate": 1.1948964201709775e-05, + "loss": 0.6402, + "step": 5604 + }, + { + "epoch": 
0.42412318867995913, + "grad_norm": 2.069322347640991, + "learning_rate": 1.1946836226421708e-05, + "loss": 0.7, + "step": 5605 + }, + { + "epoch": 0.42419885740229274, + "grad_norm": 1.7103984355926514, + "learning_rate": 1.1944708066593225e-05, + "loss": 0.674, + "step": 5606 + }, + { + "epoch": 0.4242745261246264, + "grad_norm": 2.117616891860962, + "learning_rate": 1.1942579722357596e-05, + "loss": 0.6814, + "step": 5607 + }, + { + "epoch": 0.42435019484696, + "grad_norm": 4.542725086212158, + "learning_rate": 1.1940451193848108e-05, + "loss": 0.7538, + "step": 5608 + }, + { + "epoch": 0.4244258635692936, + "grad_norm": 2.359140157699585, + "learning_rate": 1.1938322481198056e-05, + "loss": 0.6849, + "step": 5609 + }, + { + "epoch": 0.42450153229162724, + "grad_norm": 3.171555995941162, + "learning_rate": 1.1936193584540747e-05, + "loss": 0.7442, + "step": 5610 + }, + { + "epoch": 0.4245772010139609, + "grad_norm": 2.4823436737060547, + "learning_rate": 1.19340645040095e-05, + "loss": 0.9193, + "step": 5611 + }, + { + "epoch": 0.4246528697362945, + "grad_norm": 1.8430533409118652, + "learning_rate": 1.1931935239737643e-05, + "loss": 0.6416, + "step": 5612 + }, + { + "epoch": 0.4247285384586281, + "grad_norm": 2.7708842754364014, + "learning_rate": 1.1929805791858518e-05, + "loss": 0.7639, + "step": 5613 + }, + { + "epoch": 0.42480420718096173, + "grad_norm": 2.5303053855895996, + "learning_rate": 1.1927676160505476e-05, + "loss": 0.7355, + "step": 5614 + }, + { + "epoch": 0.42487987590329535, + "grad_norm": 2.337359666824341, + "learning_rate": 1.1925546345811889e-05, + "loss": 0.8643, + "step": 5615 + }, + { + "epoch": 0.424955544625629, + "grad_norm": 2.4965403079986572, + "learning_rate": 1.1923416347911123e-05, + "loss": 0.7514, + "step": 5616 + }, + { + "epoch": 0.4250312133479626, + "grad_norm": 1.8858367204666138, + "learning_rate": 1.192128616693657e-05, + "loss": 0.6455, + "step": 5617 + }, + { + "epoch": 0.42510688207029623, + "grad_norm": 1.8443236351013184, + "learning_rate": 1.1919155803021628e-05, + "loss": 0.8063, + "step": 5618 + }, + { + "epoch": 0.42518255079262984, + "grad_norm": 1.9932689666748047, + "learning_rate": 1.1917025256299713e-05, + "loss": 0.6858, + "step": 5619 + }, + { + "epoch": 0.4252582195149635, + "grad_norm": 2.4727776050567627, + "learning_rate": 1.1914894526904236e-05, + "loss": 0.8452, + "step": 5620 + }, + { + "epoch": 0.4253338882372971, + "grad_norm": 1.8388804197311401, + "learning_rate": 1.1912763614968638e-05, + "loss": 0.8343, + "step": 5621 + }, + { + "epoch": 0.42540955695963073, + "grad_norm": 1.9806253910064697, + "learning_rate": 1.1910632520626363e-05, + "loss": 0.7089, + "step": 5622 + }, + { + "epoch": 0.42548522568196434, + "grad_norm": 2.018436908721924, + "learning_rate": 1.1908501244010862e-05, + "loss": 0.7397, + "step": 5623 + }, + { + "epoch": 0.425560894404298, + "grad_norm": 2.8145549297332764, + "learning_rate": 1.190636978525561e-05, + "loss": 0.6912, + "step": 5624 + }, + { + "epoch": 0.4256365631266316, + "grad_norm": 1.973929762840271, + "learning_rate": 1.190423814449408e-05, + "loss": 0.6269, + "step": 5625 + }, + { + "epoch": 0.42571223184896523, + "grad_norm": 1.5904262065887451, + "learning_rate": 1.1902106321859764e-05, + "loss": 0.8579, + "step": 5626 + }, + { + "epoch": 0.42578790057129884, + "grad_norm": 1.901921033859253, + "learning_rate": 1.189997431748616e-05, + "loss": 0.595, + "step": 5627 + }, + { + "epoch": 0.4258635692936325, + "grad_norm": 1.7810097932815552, + "learning_rate": 
1.189784213150679e-05, + "loss": 0.6377, + "step": 5628 + }, + { + "epoch": 0.4259392380159661, + "grad_norm": 1.7638015747070312, + "learning_rate": 1.189570976405517e-05, + "loss": 0.6832, + "step": 5629 + }, + { + "epoch": 0.4260149067382997, + "grad_norm": 1.9742308855056763, + "learning_rate": 1.189357721526484e-05, + "loss": 0.7322, + "step": 5630 + }, + { + "epoch": 0.42609057546063334, + "grad_norm": 2.480738639831543, + "learning_rate": 1.1891444485269344e-05, + "loss": 0.735, + "step": 5631 + }, + { + "epoch": 0.42616624418296695, + "grad_norm": 1.8731553554534912, + "learning_rate": 1.1889311574202242e-05, + "loss": 0.7125, + "step": 5632 + }, + { + "epoch": 0.4262419129053006, + "grad_norm": 2.045454263687134, + "learning_rate": 1.1887178482197109e-05, + "loss": 0.7475, + "step": 5633 + }, + { + "epoch": 0.4263175816276342, + "grad_norm": 2.2472641468048096, + "learning_rate": 1.1885045209387514e-05, + "loss": 0.6585, + "step": 5634 + }, + { + "epoch": 0.42639325034996783, + "grad_norm": 1.9361463785171509, + "learning_rate": 1.1882911755907062e-05, + "loss": 0.6429, + "step": 5635 + }, + { + "epoch": 0.42646891907230144, + "grad_norm": 1.8335850238800049, + "learning_rate": 1.1880778121889349e-05, + "loss": 0.554, + "step": 5636 + }, + { + "epoch": 0.4265445877946351, + "grad_norm": 2.1291892528533936, + "learning_rate": 1.1878644307467992e-05, + "loss": 0.8045, + "step": 5637 + }, + { + "epoch": 0.4266202565169687, + "grad_norm": 2.041837692260742, + "learning_rate": 1.187651031277662e-05, + "loss": 0.7902, + "step": 5638 + }, + { + "epoch": 0.42669592523930233, + "grad_norm": 1.9485516548156738, + "learning_rate": 1.1874376137948867e-05, + "loss": 0.6962, + "step": 5639 + }, + { + "epoch": 0.42677159396163594, + "grad_norm": 2.171895742416382, + "learning_rate": 1.1872241783118386e-05, + "loss": 0.6273, + "step": 5640 + }, + { + "epoch": 0.4268472626839696, + "grad_norm": 1.8618667125701904, + "learning_rate": 1.187010724841883e-05, + "loss": 0.6796, + "step": 5641 + }, + { + "epoch": 0.4269229314063032, + "grad_norm": 1.9820287227630615, + "learning_rate": 1.1867972533983879e-05, + "loss": 0.7338, + "step": 5642 + }, + { + "epoch": 0.42699860012863683, + "grad_norm": 1.5241734981536865, + "learning_rate": 1.1865837639947209e-05, + "loss": 0.6571, + "step": 5643 + }, + { + "epoch": 0.42707426885097044, + "grad_norm": 1.9339543581008911, + "learning_rate": 1.1863702566442516e-05, + "loss": 0.7887, + "step": 5644 + }, + { + "epoch": 0.42714993757330405, + "grad_norm": 4.0906572341918945, + "learning_rate": 1.1861567313603511e-05, + "loss": 0.6873, + "step": 5645 + }, + { + "epoch": 0.4272256062956377, + "grad_norm": 2.432317018508911, + "learning_rate": 1.1859431881563899e-05, + "loss": 0.6544, + "step": 5646 + }, + { + "epoch": 0.4273012750179713, + "grad_norm": 2.0724246501922607, + "learning_rate": 1.185729627045742e-05, + "loss": 0.6835, + "step": 5647 + }, + { + "epoch": 0.42737694374030494, + "grad_norm": 2.3478708267211914, + "learning_rate": 1.1855160480417801e-05, + "loss": 0.8091, + "step": 5648 + }, + { + "epoch": 0.42745261246263855, + "grad_norm": 1.9543095827102661, + "learning_rate": 1.1853024511578802e-05, + "loss": 0.7343, + "step": 5649 + }, + { + "epoch": 0.4275282811849722, + "grad_norm": 1.7966238260269165, + "learning_rate": 1.1850888364074179e-05, + "loss": 0.6051, + "step": 5650 + }, + { + "epoch": 0.4276039499073058, + "grad_norm": 1.587853193283081, + "learning_rate": 1.1848752038037708e-05, + "loss": 0.6063, + "step": 5651 + }, + { + "epoch": 
0.42767961862963944, + "grad_norm": 1.883578896522522, + "learning_rate": 1.1846615533603168e-05, + "loss": 0.8186, + "step": 5652 + }, + { + "epoch": 0.42775528735197305, + "grad_norm": 2.1655025482177734, + "learning_rate": 1.1844478850904357e-05, + "loss": 0.7779, + "step": 5653 + }, + { + "epoch": 0.4278309560743067, + "grad_norm": 2.625882387161255, + "learning_rate": 1.1842341990075081e-05, + "loss": 0.7361, + "step": 5654 + }, + { + "epoch": 0.4279066247966403, + "grad_norm": 1.935105323791504, + "learning_rate": 1.1840204951249152e-05, + "loss": 0.7115, + "step": 5655 + }, + { + "epoch": 0.42798229351897393, + "grad_norm": 2.1714651584625244, + "learning_rate": 1.1838067734560408e-05, + "loss": 0.8187, + "step": 5656 + }, + { + "epoch": 0.42805796224130754, + "grad_norm": 2.1692118644714355, + "learning_rate": 1.183593034014268e-05, + "loss": 0.6285, + "step": 5657 + }, + { + "epoch": 0.42813363096364115, + "grad_norm": 1.921869158744812, + "learning_rate": 1.1833792768129824e-05, + "loss": 0.6029, + "step": 5658 + }, + { + "epoch": 0.4282092996859748, + "grad_norm": 2.104144811630249, + "learning_rate": 1.1831655018655696e-05, + "loss": 0.7716, + "step": 5659 + }, + { + "epoch": 0.42828496840830843, + "grad_norm": 2.422243595123291, + "learning_rate": 1.1829517091854176e-05, + "loss": 0.7995, + "step": 5660 + }, + { + "epoch": 0.42836063713064204, + "grad_norm": 1.9468094110488892, + "learning_rate": 1.1827378987859144e-05, + "loss": 0.7132, + "step": 5661 + }, + { + "epoch": 0.42843630585297565, + "grad_norm": 1.8024096488952637, + "learning_rate": 1.1825240706804489e-05, + "loss": 0.5364, + "step": 5662 + }, + { + "epoch": 0.4285119745753093, + "grad_norm": 2.2247347831726074, + "learning_rate": 1.1823102248824128e-05, + "loss": 0.7529, + "step": 5663 + }, + { + "epoch": 0.42858764329764293, + "grad_norm": 1.928809404373169, + "learning_rate": 1.182096361405197e-05, + "loss": 0.7429, + "step": 5664 + }, + { + "epoch": 0.42866331201997654, + "grad_norm": 1.948986530303955, + "learning_rate": 1.181882480262195e-05, + "loss": 0.7047, + "step": 5665 + }, + { + "epoch": 0.42873898074231015, + "grad_norm": 2.77934193611145, + "learning_rate": 1.1816685814668e-05, + "loss": 0.8807, + "step": 5666 + }, + { + "epoch": 0.4288146494646438, + "grad_norm": 2.046111583709717, + "learning_rate": 1.1814546650324078e-05, + "loss": 0.8026, + "step": 5667 + }, + { + "epoch": 0.4288903181869774, + "grad_norm": 1.4681226015090942, + "learning_rate": 1.181240730972414e-05, + "loss": 0.8824, + "step": 5668 + }, + { + "epoch": 0.42896598690931104, + "grad_norm": 1.7636147737503052, + "learning_rate": 1.1810267793002158e-05, + "loss": 0.7005, + "step": 5669 + }, + { + "epoch": 0.42904165563164465, + "grad_norm": 2.328195571899414, + "learning_rate": 1.180812810029212e-05, + "loss": 0.7529, + "step": 5670 + }, + { + "epoch": 0.42911732435397826, + "grad_norm": 2.7056803703308105, + "learning_rate": 1.1805988231728015e-05, + "loss": 0.7472, + "step": 5671 + }, + { + "epoch": 0.4291929930763119, + "grad_norm": 2.115111827850342, + "learning_rate": 1.1803848187443853e-05, + "loss": 0.8469, + "step": 5672 + }, + { + "epoch": 0.42926866179864553, + "grad_norm": 1.9240214824676514, + "learning_rate": 1.1801707967573647e-05, + "loss": 0.7624, + "step": 5673 + }, + { + "epoch": 0.42934433052097914, + "grad_norm": 1.9669137001037598, + "learning_rate": 1.179956757225143e-05, + "loss": 0.69, + "step": 5674 + }, + { + "epoch": 0.42941999924331276, + "grad_norm": 1.713476300239563, + "learning_rate": 
1.1797427001611232e-05, + "loss": 0.7508, + "step": 5675 + }, + { + "epoch": 0.4294956679656464, + "grad_norm": 2.3911690711975098, + "learning_rate": 1.179528625578711e-05, + "loss": 0.6873, + "step": 5676 + }, + { + "epoch": 0.42957133668798003, + "grad_norm": 2.203371047973633, + "learning_rate": 1.1793145334913121e-05, + "loss": 0.7431, + "step": 5677 + }, + { + "epoch": 0.42964700541031364, + "grad_norm": 1.975543737411499, + "learning_rate": 1.1791004239123336e-05, + "loss": 0.7112, + "step": 5678 + }, + { + "epoch": 0.42972267413264725, + "grad_norm": 1.8592123985290527, + "learning_rate": 1.1788862968551842e-05, + "loss": 0.6954, + "step": 5679 + }, + { + "epoch": 0.4297983428549809, + "grad_norm": 2.5964951515197754, + "learning_rate": 1.1786721523332723e-05, + "loss": 0.6297, + "step": 5680 + }, + { + "epoch": 0.42987401157731453, + "grad_norm": 2.0938498973846436, + "learning_rate": 1.1784579903600093e-05, + "loss": 0.6312, + "step": 5681 + }, + { + "epoch": 0.42994968029964814, + "grad_norm": 1.6955291032791138, + "learning_rate": 1.1782438109488063e-05, + "loss": 0.7806, + "step": 5682 + }, + { + "epoch": 0.43002534902198175, + "grad_norm": 2.2497923374176025, + "learning_rate": 1.1780296141130756e-05, + "loss": 0.7267, + "step": 5683 + }, + { + "epoch": 0.43010101774431536, + "grad_norm": 1.6915230751037598, + "learning_rate": 1.1778153998662316e-05, + "loss": 0.708, + "step": 5684 + }, + { + "epoch": 0.430176686466649, + "grad_norm": 1.8355711698532104, + "learning_rate": 1.1776011682216882e-05, + "loss": 0.6188, + "step": 5685 + }, + { + "epoch": 0.43025235518898264, + "grad_norm": 1.9086350202560425, + "learning_rate": 1.1773869191928624e-05, + "loss": 0.6782, + "step": 5686 + }, + { + "epoch": 0.43032802391131625, + "grad_norm": 2.346781015396118, + "learning_rate": 1.17717265279317e-05, + "loss": 0.694, + "step": 5687 + }, + { + "epoch": 0.43040369263364986, + "grad_norm": 2.6471588611602783, + "learning_rate": 1.17695836903603e-05, + "loss": 0.6686, + "step": 5688 + }, + { + "epoch": 0.4304793613559835, + "grad_norm": 2.062077045440674, + "learning_rate": 1.1767440679348607e-05, + "loss": 0.7097, + "step": 5689 + }, + { + "epoch": 0.43055503007831714, + "grad_norm": 2.0753626823425293, + "learning_rate": 1.1765297495030831e-05, + "loss": 0.6988, + "step": 5690 + }, + { + "epoch": 0.43063069880065075, + "grad_norm": 2.3270702362060547, + "learning_rate": 1.1763154137541183e-05, + "loss": 0.7583, + "step": 5691 + }, + { + "epoch": 0.43070636752298436, + "grad_norm": 1.7685538530349731, + "learning_rate": 1.1761010607013883e-05, + "loss": 0.6572, + "step": 5692 + }, + { + "epoch": 0.430782036245318, + "grad_norm": 1.7218018770217896, + "learning_rate": 1.175886690358317e-05, + "loss": 0.5876, + "step": 5693 + }, + { + "epoch": 0.43085770496765163, + "grad_norm": 1.9361426830291748, + "learning_rate": 1.1756723027383286e-05, + "loss": 0.6186, + "step": 5694 + }, + { + "epoch": 0.43093337368998524, + "grad_norm": 2.054652214050293, + "learning_rate": 1.1754578978548493e-05, + "loss": 0.7866, + "step": 5695 + }, + { + "epoch": 0.43100904241231885, + "grad_norm": 3.165121555328369, + "learning_rate": 1.1752434757213053e-05, + "loss": 0.6932, + "step": 5696 + }, + { + "epoch": 0.43108471113465247, + "grad_norm": 1.8090955018997192, + "learning_rate": 1.1750290363511248e-05, + "loss": 0.7486, + "step": 5697 + }, + { + "epoch": 0.43116037985698613, + "grad_norm": 2.016489028930664, + "learning_rate": 1.1748145797577363e-05, + "loss": 0.6689, + "step": 5698 + }, + { + "epoch": 
0.43123604857931974, + "grad_norm": 2.5373289585113525, + "learning_rate": 1.17460010595457e-05, + "loss": 0.8227, + "step": 5699 + }, + { + "epoch": 0.43131171730165335, + "grad_norm": 1.8351738452911377, + "learning_rate": 1.1743856149550568e-05, + "loss": 0.6326, + "step": 5700 + }, + { + "epoch": 0.43138738602398696, + "grad_norm": 2.076626777648926, + "learning_rate": 1.174171106772629e-05, + "loss": 0.8595, + "step": 5701 + }, + { + "epoch": 0.43146305474632063, + "grad_norm": 2.6194770336151123, + "learning_rate": 1.1739565814207198e-05, + "loss": 0.8026, + "step": 5702 + }, + { + "epoch": 0.43153872346865424, + "grad_norm": 2.0482687950134277, + "learning_rate": 1.173742038912763e-05, + "loss": 0.6438, + "step": 5703 + }, + { + "epoch": 0.43161439219098785, + "grad_norm": 3.11622953414917, + "learning_rate": 1.173527479262195e-05, + "loss": 0.7928, + "step": 5704 + }, + { + "epoch": 0.43169006091332146, + "grad_norm": 1.6607571840286255, + "learning_rate": 1.1733129024824512e-05, + "loss": 0.6947, + "step": 5705 + }, + { + "epoch": 0.4317657296356551, + "grad_norm": 2.05531907081604, + "learning_rate": 1.1730983085869693e-05, + "loss": 0.7901, + "step": 5706 + }, + { + "epoch": 0.43184139835798874, + "grad_norm": 2.3457841873168945, + "learning_rate": 1.172883697589188e-05, + "loss": 0.7528, + "step": 5707 + }, + { + "epoch": 0.43191706708032235, + "grad_norm": 2.2732810974121094, + "learning_rate": 1.1726690695025472e-05, + "loss": 0.6971, + "step": 5708 + }, + { + "epoch": 0.43199273580265596, + "grad_norm": 1.925724744796753, + "learning_rate": 1.1724544243404873e-05, + "loss": 0.853, + "step": 5709 + }, + { + "epoch": 0.43206840452498957, + "grad_norm": 1.8667192459106445, + "learning_rate": 1.1722397621164502e-05, + "loss": 0.5859, + "step": 5710 + }, + { + "epoch": 0.43214407324732323, + "grad_norm": 2.398282289505005, + "learning_rate": 1.1720250828438785e-05, + "loss": 0.7186, + "step": 5711 + }, + { + "epoch": 0.43221974196965685, + "grad_norm": 2.3136038780212402, + "learning_rate": 1.1718103865362161e-05, + "loss": 0.849, + "step": 5712 + }, + { + "epoch": 0.43229541069199046, + "grad_norm": 1.7996965646743774, + "learning_rate": 1.1715956732069083e-05, + "loss": 0.7615, + "step": 5713 + }, + { + "epoch": 0.43237107941432407, + "grad_norm": 1.8926506042480469, + "learning_rate": 1.171380942869401e-05, + "loss": 0.6777, + "step": 5714 + }, + { + "epoch": 0.43244674813665773, + "grad_norm": 1.8787177801132202, + "learning_rate": 1.1711661955371416e-05, + "loss": 0.7415, + "step": 5715 + }, + { + "epoch": 0.43252241685899134, + "grad_norm": 1.7595826387405396, + "learning_rate": 1.1709514312235777e-05, + "loss": 0.756, + "step": 5716 + }, + { + "epoch": 0.43259808558132495, + "grad_norm": 1.8866539001464844, + "learning_rate": 1.1707366499421589e-05, + "loss": 0.7147, + "step": 5717 + }, + { + "epoch": 0.43267375430365856, + "grad_norm": 1.6653908491134644, + "learning_rate": 1.1705218517063353e-05, + "loss": 0.676, + "step": 5718 + }, + { + "epoch": 0.43274942302599223, + "grad_norm": 2.470182180404663, + "learning_rate": 1.1703070365295584e-05, + "loss": 0.6446, + "step": 5719 + }, + { + "epoch": 0.43282509174832584, + "grad_norm": 2.4232730865478516, + "learning_rate": 1.1700922044252808e-05, + "loss": 0.7754, + "step": 5720 + }, + { + "epoch": 0.43290076047065945, + "grad_norm": 1.9250999689102173, + "learning_rate": 1.1698773554069555e-05, + "loss": 0.5243, + "step": 5721 + }, + { + "epoch": 0.43297642919299306, + "grad_norm": 2.12267804145813, + "learning_rate": 
1.1696624894880376e-05, + "loss": 0.6571, + "step": 5722 + }, + { + "epoch": 0.4330520979153267, + "grad_norm": 1.7465804815292358, + "learning_rate": 1.1694476066819821e-05, + "loss": 0.6991, + "step": 5723 + }, + { + "epoch": 0.43312776663766034, + "grad_norm": 3.37947154045105, + "learning_rate": 1.1692327070022462e-05, + "loss": 0.7811, + "step": 5724 + }, + { + "epoch": 0.43320343535999395, + "grad_norm": 2.1732442378997803, + "learning_rate": 1.1690177904622874e-05, + "loss": 0.7455, + "step": 5725 + }, + { + "epoch": 0.43327910408232756, + "grad_norm": 1.863910436630249, + "learning_rate": 1.1688028570755642e-05, + "loss": 0.8409, + "step": 5726 + }, + { + "epoch": 0.43335477280466117, + "grad_norm": 1.823136806488037, + "learning_rate": 1.1685879068555369e-05, + "loss": 0.6706, + "step": 5727 + }, + { + "epoch": 0.43343044152699484, + "grad_norm": 1.8565260171890259, + "learning_rate": 1.168372939815666e-05, + "loss": 0.6175, + "step": 5728 + }, + { + "epoch": 0.43350611024932845, + "grad_norm": 2.070004463195801, + "learning_rate": 1.1681579559694136e-05, + "loss": 0.7205, + "step": 5729 + }, + { + "epoch": 0.43358177897166206, + "grad_norm": 2.2450389862060547, + "learning_rate": 1.167942955330243e-05, + "loss": 0.7572, + "step": 5730 + }, + { + "epoch": 0.43365744769399567, + "grad_norm": 1.7799854278564453, + "learning_rate": 1.1677279379116174e-05, + "loss": 0.7553, + "step": 5731 + }, + { + "epoch": 0.43373311641632933, + "grad_norm": 2.37202787399292, + "learning_rate": 1.1675129037270028e-05, + "loss": 0.637, + "step": 5732 + }, + { + "epoch": 0.43380878513866294, + "grad_norm": 2.0018792152404785, + "learning_rate": 1.1672978527898647e-05, + "loss": 0.7219, + "step": 5733 + }, + { + "epoch": 0.43388445386099656, + "grad_norm": 2.104686975479126, + "learning_rate": 1.1670827851136704e-05, + "loss": 0.7433, + "step": 5734 + }, + { + "epoch": 0.43396012258333017, + "grad_norm": 2.32853102684021, + "learning_rate": 1.1668677007118884e-05, + "loss": 0.7045, + "step": 5735 + }, + { + "epoch": 0.4340357913056638, + "grad_norm": 1.7697525024414062, + "learning_rate": 1.166652599597988e-05, + "loss": 0.7692, + "step": 5736 + }, + { + "epoch": 0.43411146002799744, + "grad_norm": 2.1800074577331543, + "learning_rate": 1.166437481785439e-05, + "loss": 0.7022, + "step": 5737 + }, + { + "epoch": 0.43418712875033105, + "grad_norm": 2.2426414489746094, + "learning_rate": 1.1662223472877135e-05, + "loss": 0.7946, + "step": 5738 + }, + { + "epoch": 0.43426279747266466, + "grad_norm": 2.2983815670013428, + "learning_rate": 1.1660071961182834e-05, + "loss": 0.7674, + "step": 5739 + }, + { + "epoch": 0.4343384661949983, + "grad_norm": 2.3444814682006836, + "learning_rate": 1.1657920282906221e-05, + "loss": 0.6454, + "step": 5740 + }, + { + "epoch": 0.43441413491733194, + "grad_norm": 2.3554506301879883, + "learning_rate": 1.1655768438182046e-05, + "loss": 0.795, + "step": 5741 + }, + { + "epoch": 0.43448980363966555, + "grad_norm": 2.302736520767212, + "learning_rate": 1.1653616427145061e-05, + "loss": 0.7287, + "step": 5742 + }, + { + "epoch": 0.43456547236199916, + "grad_norm": 1.9690250158309937, + "learning_rate": 1.1651464249930032e-05, + "loss": 0.662, + "step": 5743 + }, + { + "epoch": 0.43464114108433277, + "grad_norm": 1.9559441804885864, + "learning_rate": 1.1649311906671735e-05, + "loss": 0.8327, + "step": 5744 + }, + { + "epoch": 0.43471680980666644, + "grad_norm": 1.8690423965454102, + "learning_rate": 1.1647159397504958e-05, + "loss": 0.5716, + "step": 5745 + }, + { + "epoch": 
0.43479247852900005, + "grad_norm": 1.6188991069793701, + "learning_rate": 1.1645006722564499e-05, + "loss": 0.6871, + "step": 5746 + }, + { + "epoch": 0.43486814725133366, + "grad_norm": 2.0028879642486572, + "learning_rate": 1.1642853881985162e-05, + "loss": 0.7658, + "step": 5747 + }, + { + "epoch": 0.43494381597366727, + "grad_norm": 1.7991002798080444, + "learning_rate": 1.1640700875901768e-05, + "loss": 0.6294, + "step": 5748 + }, + { + "epoch": 0.43501948469600094, + "grad_norm": 1.7511948347091675, + "learning_rate": 1.1638547704449142e-05, + "loss": 0.6803, + "step": 5749 + }, + { + "epoch": 0.43509515341833455, + "grad_norm": 2.2755792140960693, + "learning_rate": 1.163639436776213e-05, + "loss": 0.634, + "step": 5750 + }, + { + "epoch": 0.43517082214066816, + "grad_norm": 1.9088243246078491, + "learning_rate": 1.1634240865975571e-05, + "loss": 0.6084, + "step": 5751 + }, + { + "epoch": 0.43524649086300177, + "grad_norm": 1.6567586660385132, + "learning_rate": 1.163208719922433e-05, + "loss": 0.6281, + "step": 5752 + }, + { + "epoch": 0.4353221595853354, + "grad_norm": 1.8782658576965332, + "learning_rate": 1.1629933367643274e-05, + "loss": 0.7178, + "step": 5753 + }, + { + "epoch": 0.43539782830766904, + "grad_norm": 2.370513677597046, + "learning_rate": 1.1627779371367286e-05, + "loss": 0.7157, + "step": 5754 + }, + { + "epoch": 0.43547349703000265, + "grad_norm": 2.0037007331848145, + "learning_rate": 1.1625625210531255e-05, + "loss": 0.7701, + "step": 5755 + }, + { + "epoch": 0.43554916575233626, + "grad_norm": 1.9564191102981567, + "learning_rate": 1.162347088527008e-05, + "loss": 0.8741, + "step": 5756 + }, + { + "epoch": 0.4356248344746699, + "grad_norm": 2.218358039855957, + "learning_rate": 1.1621316395718674e-05, + "loss": 0.7502, + "step": 5757 + }, + { + "epoch": 0.43570050319700354, + "grad_norm": 2.1146786212921143, + "learning_rate": 1.1619161742011953e-05, + "loss": 0.7886, + "step": 5758 + }, + { + "epoch": 0.43577617191933715, + "grad_norm": 1.9309403896331787, + "learning_rate": 1.1617006924284856e-05, + "loss": 0.7796, + "step": 5759 + }, + { + "epoch": 0.43585184064167076, + "grad_norm": 2.1848180294036865, + "learning_rate": 1.1614851942672319e-05, + "loss": 0.8031, + "step": 5760 + }, + { + "epoch": 0.4359275093640044, + "grad_norm": 2.1754567623138428, + "learning_rate": 1.1612696797309298e-05, + "loss": 0.7759, + "step": 5761 + }, + { + "epoch": 0.43600317808633804, + "grad_norm": 1.7848371267318726, + "learning_rate": 1.1610541488330753e-05, + "loss": 0.7262, + "step": 5762 + }, + { + "epoch": 0.43607884680867165, + "grad_norm": 2.1866798400878906, + "learning_rate": 1.1608386015871655e-05, + "loss": 0.7978, + "step": 5763 + }, + { + "epoch": 0.43615451553100526, + "grad_norm": 1.765702486038208, + "learning_rate": 1.1606230380066988e-05, + "loss": 0.7182, + "step": 5764 + }, + { + "epoch": 0.43623018425333887, + "grad_norm": 2.1728196144104004, + "learning_rate": 1.1604074581051746e-05, + "loss": 0.748, + "step": 5765 + }, + { + "epoch": 0.4363058529756725, + "grad_norm": 2.886596441268921, + "learning_rate": 1.1601918618960933e-05, + "loss": 0.8474, + "step": 5766 + }, + { + "epoch": 0.43638152169800615, + "grad_norm": 2.492180824279785, + "learning_rate": 1.1599762493929555e-05, + "loss": 0.7185, + "step": 5767 + }, + { + "epoch": 0.43645719042033976, + "grad_norm": 2.229836940765381, + "learning_rate": 1.1597606206092645e-05, + "loss": 0.7957, + "step": 5768 + }, + { + "epoch": 0.43653285914267337, + "grad_norm": 1.8042664527893066, + 
"learning_rate": 1.1595449755585232e-05, + "loss": 0.7395, + "step": 5769 + }, + { + "epoch": 0.436608527865007, + "grad_norm": 1.6901711225509644, + "learning_rate": 1.159329314254236e-05, + "loss": 0.6354, + "step": 5770 + }, + { + "epoch": 0.43668419658734065, + "grad_norm": 1.7246809005737305, + "learning_rate": 1.1591136367099087e-05, + "loss": 0.7282, + "step": 5771 + }, + { + "epoch": 0.43675986530967426, + "grad_norm": 1.6390856504440308, + "learning_rate": 1.1588979429390467e-05, + "loss": 0.7898, + "step": 5772 + }, + { + "epoch": 0.43683553403200787, + "grad_norm": 2.199267625808716, + "learning_rate": 1.1586822329551588e-05, + "loss": 0.8082, + "step": 5773 + }, + { + "epoch": 0.4369112027543415, + "grad_norm": 2.898261070251465, + "learning_rate": 1.1584665067717527e-05, + "loss": 0.6785, + "step": 5774 + }, + { + "epoch": 0.43698687147667514, + "grad_norm": 2.123633623123169, + "learning_rate": 1.1582507644023377e-05, + "loss": 0.7712, + "step": 5775 + }, + { + "epoch": 0.43706254019900875, + "grad_norm": 2.2638285160064697, + "learning_rate": 1.1580350058604246e-05, + "loss": 0.7443, + "step": 5776 + }, + { + "epoch": 0.43713820892134236, + "grad_norm": 1.94474458694458, + "learning_rate": 1.1578192311595247e-05, + "loss": 0.7249, + "step": 5777 + }, + { + "epoch": 0.437213877643676, + "grad_norm": 2.0058271884918213, + "learning_rate": 1.1576034403131511e-05, + "loss": 0.5709, + "step": 5778 + }, + { + "epoch": 0.4372895463660096, + "grad_norm": 2.0627119541168213, + "learning_rate": 1.1573876333348165e-05, + "loss": 0.6974, + "step": 5779 + }, + { + "epoch": 0.43736521508834325, + "grad_norm": 2.220038652420044, + "learning_rate": 1.157171810238036e-05, + "loss": 0.8202, + "step": 5780 + }, + { + "epoch": 0.43744088381067686, + "grad_norm": 1.8286783695220947, + "learning_rate": 1.1569559710363249e-05, + "loss": 0.6777, + "step": 5781 + }, + { + "epoch": 0.43751655253301047, + "grad_norm": 2.0850884914398193, + "learning_rate": 1.1567401157431998e-05, + "loss": 0.6338, + "step": 5782 + }, + { + "epoch": 0.4375922212553441, + "grad_norm": 2.0958290100097656, + "learning_rate": 1.1565242443721783e-05, + "loss": 0.7985, + "step": 5783 + }, + { + "epoch": 0.43766788997767775, + "grad_norm": 2.5265495777130127, + "learning_rate": 1.156308356936779e-05, + "loss": 0.7152, + "step": 5784 + }, + { + "epoch": 0.43774355870001136, + "grad_norm": 1.9527240991592407, + "learning_rate": 1.1560924534505212e-05, + "loss": 0.7323, + "step": 5785 + }, + { + "epoch": 0.43781922742234497, + "grad_norm": 2.079576253890991, + "learning_rate": 1.1558765339269255e-05, + "loss": 0.7322, + "step": 5786 + }, + { + "epoch": 0.4378948961446786, + "grad_norm": 2.6876163482666016, + "learning_rate": 1.1556605983795142e-05, + "loss": 0.7538, + "step": 5787 + }, + { + "epoch": 0.43797056486701225, + "grad_norm": 2.0275208950042725, + "learning_rate": 1.1554446468218087e-05, + "loss": 0.7103, + "step": 5788 + }, + { + "epoch": 0.43804623358934586, + "grad_norm": 2.2174196243286133, + "learning_rate": 1.1552286792673335e-05, + "loss": 0.7053, + "step": 5789 + }, + { + "epoch": 0.43812190231167947, + "grad_norm": 2.1054906845092773, + "learning_rate": 1.1550126957296128e-05, + "loss": 0.7026, + "step": 5790 + }, + { + "epoch": 0.4381975710340131, + "grad_norm": 2.7534801959991455, + "learning_rate": 1.1547966962221726e-05, + "loss": 0.7907, + "step": 5791 + }, + { + "epoch": 0.4382732397563467, + "grad_norm": 2.1715431213378906, + "learning_rate": 1.154580680758539e-05, + "loss": 0.7555, + "step": 5792 + }, 
+ { + "epoch": 0.43834890847868035, + "grad_norm": 2.3663253784179688, + "learning_rate": 1.1543646493522395e-05, + "loss": 0.7672, + "step": 5793 + }, + { + "epoch": 0.43842457720101397, + "grad_norm": 2.317469358444214, + "learning_rate": 1.1541486020168034e-05, + "loss": 0.8387, + "step": 5794 + }, + { + "epoch": 0.4385002459233476, + "grad_norm": 2.1923418045043945, + "learning_rate": 1.1539325387657593e-05, + "loss": 0.774, + "step": 5795 + }, + { + "epoch": 0.4385759146456812, + "grad_norm": 2.3220553398132324, + "learning_rate": 1.1537164596126386e-05, + "loss": 0.7811, + "step": 5796 + }, + { + "epoch": 0.43865158336801485, + "grad_norm": 2.2181203365325928, + "learning_rate": 1.1535003645709725e-05, + "loss": 0.616, + "step": 5797 + }, + { + "epoch": 0.43872725209034846, + "grad_norm": 2.3010993003845215, + "learning_rate": 1.1532842536542936e-05, + "loss": 0.615, + "step": 5798 + }, + { + "epoch": 0.4388029208126821, + "grad_norm": 2.1476845741271973, + "learning_rate": 1.1530681268761356e-05, + "loss": 0.8901, + "step": 5799 + }, + { + "epoch": 0.4388785895350157, + "grad_norm": 1.7837895154953003, + "learning_rate": 1.1528519842500328e-05, + "loss": 0.8305, + "step": 5800 + }, + { + "epoch": 0.43895425825734935, + "grad_norm": 2.1396026611328125, + "learning_rate": 1.1526358257895216e-05, + "loss": 0.8109, + "step": 5801 + }, + { + "epoch": 0.43902992697968296, + "grad_norm": 2.0845377445220947, + "learning_rate": 1.1524196515081372e-05, + "loss": 0.8193, + "step": 5802 + }, + { + "epoch": 0.43910559570201657, + "grad_norm": 1.9465970993041992, + "learning_rate": 1.1522034614194178e-05, + "loss": 0.7111, + "step": 5803 + }, + { + "epoch": 0.4391812644243502, + "grad_norm": 1.8505274057388306, + "learning_rate": 1.1519872555369022e-05, + "loss": 0.7731, + "step": 5804 + }, + { + "epoch": 0.4392569331466838, + "grad_norm": 2.0143327713012695, + "learning_rate": 1.1517710338741297e-05, + "loss": 0.5693, + "step": 5805 + }, + { + "epoch": 0.43933260186901746, + "grad_norm": 2.1907799243927, + "learning_rate": 1.1515547964446403e-05, + "loss": 0.7013, + "step": 5806 + }, + { + "epoch": 0.43940827059135107, + "grad_norm": 2.1983025074005127, + "learning_rate": 1.1513385432619763e-05, + "loss": 0.8154, + "step": 5807 + }, + { + "epoch": 0.4394839393136847, + "grad_norm": 1.9378697872161865, + "learning_rate": 1.1511222743396797e-05, + "loss": 0.6557, + "step": 5808 + }, + { + "epoch": 0.4395596080360183, + "grad_norm": 1.7255088090896606, + "learning_rate": 1.150905989691294e-05, + "loss": 0.6812, + "step": 5809 + }, + { + "epoch": 0.43963527675835196, + "grad_norm": 1.7912418842315674, + "learning_rate": 1.1506896893303637e-05, + "loss": 0.642, + "step": 5810 + }, + { + "epoch": 0.43971094548068557, + "grad_norm": 2.0049731731414795, + "learning_rate": 1.1504733732704342e-05, + "loss": 0.7161, + "step": 5811 + }, + { + "epoch": 0.4397866142030192, + "grad_norm": 1.776609182357788, + "learning_rate": 1.1502570415250522e-05, + "loss": 0.6908, + "step": 5812 + }, + { + "epoch": 0.4398622829253528, + "grad_norm": 1.9277169704437256, + "learning_rate": 1.1500406941077642e-05, + "loss": 0.7799, + "step": 5813 + }, + { + "epoch": 0.43993795164768645, + "grad_norm": 1.8909525871276855, + "learning_rate": 1.1498243310321198e-05, + "loss": 0.7749, + "step": 5814 + }, + { + "epoch": 0.44001362037002006, + "grad_norm": 2.2037034034729004, + "learning_rate": 1.1496079523116677e-05, + "loss": 0.9047, + "step": 5815 + }, + { + "epoch": 0.4400892890923537, + "grad_norm": 1.9161611795425415, + 
"learning_rate": 1.1493915579599582e-05, + "loss": 0.6534, + "step": 5816 + }, + { + "epoch": 0.4401649578146873, + "grad_norm": 2.0024547576904297, + "learning_rate": 1.1491751479905425e-05, + "loss": 0.7846, + "step": 5817 + }, + { + "epoch": 0.4402406265370209, + "grad_norm": 1.8573756217956543, + "learning_rate": 1.1489587224169733e-05, + "loss": 0.7107, + "step": 5818 + }, + { + "epoch": 0.44031629525935456, + "grad_norm": 2.4334030151367188, + "learning_rate": 1.1487422812528037e-05, + "loss": 0.5948, + "step": 5819 + }, + { + "epoch": 0.4403919639816882, + "grad_norm": 1.6953381299972534, + "learning_rate": 1.1485258245115878e-05, + "loss": 0.761, + "step": 5820 + }, + { + "epoch": 0.4404676327040218, + "grad_norm": 1.992057204246521, + "learning_rate": 1.148309352206881e-05, + "loss": 0.6255, + "step": 5821 + }, + { + "epoch": 0.4405433014263554, + "grad_norm": 1.9691619873046875, + "learning_rate": 1.1480928643522396e-05, + "loss": 0.6193, + "step": 5822 + }, + { + "epoch": 0.44061897014868906, + "grad_norm": 2.430366277694702, + "learning_rate": 1.1478763609612204e-05, + "loss": 0.7825, + "step": 5823 + }, + { + "epoch": 0.44069463887102267, + "grad_norm": 1.9845337867736816, + "learning_rate": 1.1476598420473817e-05, + "loss": 0.5983, + "step": 5824 + }, + { + "epoch": 0.4407703075933563, + "grad_norm": 1.902275800704956, + "learning_rate": 1.147443307624283e-05, + "loss": 0.7193, + "step": 5825 + }, + { + "epoch": 0.4408459763156899, + "grad_norm": 2.0569605827331543, + "learning_rate": 1.1472267577054838e-05, + "loss": 0.8241, + "step": 5826 + }, + { + "epoch": 0.44092164503802356, + "grad_norm": 1.7399667501449585, + "learning_rate": 1.1470101923045453e-05, + "loss": 0.62, + "step": 5827 + }, + { + "epoch": 0.44099731376035717, + "grad_norm": 2.0543341636657715, + "learning_rate": 1.14679361143503e-05, + "loss": 0.6139, + "step": 5828 + }, + { + "epoch": 0.4410729824826908, + "grad_norm": 2.1195523738861084, + "learning_rate": 1.1465770151105e-05, + "loss": 0.7834, + "step": 5829 + }, + { + "epoch": 0.4411486512050244, + "grad_norm": 2.0110557079315186, + "learning_rate": 1.1463604033445203e-05, + "loss": 0.5876, + "step": 5830 + }, + { + "epoch": 0.441224319927358, + "grad_norm": 1.9073213338851929, + "learning_rate": 1.1461437761506548e-05, + "loss": 0.7183, + "step": 5831 + }, + { + "epoch": 0.44129998864969167, + "grad_norm": 1.7596300840377808, + "learning_rate": 1.1459271335424703e-05, + "loss": 0.7434, + "step": 5832 + }, + { + "epoch": 0.4413756573720253, + "grad_norm": 2.102022647857666, + "learning_rate": 1.1457104755335332e-05, + "loss": 0.7105, + "step": 5833 + }, + { + "epoch": 0.4414513260943589, + "grad_norm": 1.682158350944519, + "learning_rate": 1.1454938021374112e-05, + "loss": 0.6027, + "step": 5834 + }, + { + "epoch": 0.4415269948166925, + "grad_norm": 2.463780641555786, + "learning_rate": 1.1452771133676736e-05, + "loss": 0.7158, + "step": 5835 + }, + { + "epoch": 0.44160266353902616, + "grad_norm": 1.8532681465148926, + "learning_rate": 1.1450604092378895e-05, + "loss": 0.7358, + "step": 5836 + }, + { + "epoch": 0.4416783322613598, + "grad_norm": 2.06534743309021, + "learning_rate": 1.1448436897616304e-05, + "loss": 0.6122, + "step": 5837 + }, + { + "epoch": 0.4417540009836934, + "grad_norm": 1.474158763885498, + "learning_rate": 1.144626954952467e-05, + "loss": 0.869, + "step": 5838 + }, + { + "epoch": 0.441829669706027, + "grad_norm": 2.5886144638061523, + "learning_rate": 1.1444102048239729e-05, + "loss": 0.8578, + "step": 5839 + }, + { + "epoch": 
0.44190533842836066, + "grad_norm": 2.0663976669311523, + "learning_rate": 1.1441934393897208e-05, + "loss": 0.6776, + "step": 5840 + }, + { + "epoch": 0.44198100715069427, + "grad_norm": 3.3373160362243652, + "learning_rate": 1.1439766586632861e-05, + "loss": 0.6511, + "step": 5841 + }, + { + "epoch": 0.4420566758730279, + "grad_norm": 1.776739239692688, + "learning_rate": 1.1437598626582438e-05, + "loss": 0.732, + "step": 5842 + }, + { + "epoch": 0.4421323445953615, + "grad_norm": 1.9327268600463867, + "learning_rate": 1.1435430513881705e-05, + "loss": 0.7917, + "step": 5843 + }, + { + "epoch": 0.4422080133176951, + "grad_norm": 2.4494521617889404, + "learning_rate": 1.1433262248666438e-05, + "loss": 0.7542, + "step": 5844 + }, + { + "epoch": 0.44228368204002877, + "grad_norm": 2.140435218811035, + "learning_rate": 1.1431093831072414e-05, + "loss": 0.5638, + "step": 5845 + }, + { + "epoch": 0.4423593507623624, + "grad_norm": 2.0090110301971436, + "learning_rate": 1.1428925261235437e-05, + "loss": 0.6552, + "step": 5846 + }, + { + "epoch": 0.442435019484696, + "grad_norm": 2.2307565212249756, + "learning_rate": 1.14267565392913e-05, + "loss": 0.9066, + "step": 5847 + }, + { + "epoch": 0.4425106882070296, + "grad_norm": 1.7167367935180664, + "learning_rate": 1.142458766537582e-05, + "loss": 0.7067, + "step": 5848 + }, + { + "epoch": 0.44258635692936327, + "grad_norm": 1.934412956237793, + "learning_rate": 1.1422418639624818e-05, + "loss": 0.641, + "step": 5849 + }, + { + "epoch": 0.4426620256516969, + "grad_norm": 1.84349524974823, + "learning_rate": 1.142024946217413e-05, + "loss": 0.8343, + "step": 5850 + }, + { + "epoch": 0.4427376943740305, + "grad_norm": 1.82757568359375, + "learning_rate": 1.1418080133159588e-05, + "loss": 0.7165, + "step": 5851 + }, + { + "epoch": 0.4428133630963641, + "grad_norm": 1.902925968170166, + "learning_rate": 1.1415910652717046e-05, + "loss": 0.7806, + "step": 5852 + }, + { + "epoch": 0.44288903181869776, + "grad_norm": 2.100587844848633, + "learning_rate": 1.1413741020982369e-05, + "loss": 0.7068, + "step": 5853 + }, + { + "epoch": 0.4429647005410314, + "grad_norm": 2.090022325515747, + "learning_rate": 1.1411571238091419e-05, + "loss": 0.6824, + "step": 5854 + }, + { + "epoch": 0.443040369263365, + "grad_norm": 1.6007441282272339, + "learning_rate": 1.1409401304180081e-05, + "loss": 0.7933, + "step": 5855 + }, + { + "epoch": 0.4431160379856986, + "grad_norm": 2.1241252422332764, + "learning_rate": 1.1407231219384238e-05, + "loss": 0.7396, + "step": 5856 + }, + { + "epoch": 0.44319170670803226, + "grad_norm": 2.4907209873199463, + "learning_rate": 1.140506098383979e-05, + "loss": 0.8325, + "step": 5857 + }, + { + "epoch": 0.4432673754303659, + "grad_norm": 1.7543824911117554, + "learning_rate": 1.1402890597682648e-05, + "loss": 0.6119, + "step": 5858 + }, + { + "epoch": 0.4433430441526995, + "grad_norm": 2.4518723487854004, + "learning_rate": 1.1400720061048718e-05, + "loss": 0.8612, + "step": 5859 + }, + { + "epoch": 0.4434187128750331, + "grad_norm": 2.1455647945404053, + "learning_rate": 1.139854937407394e-05, + "loss": 0.7573, + "step": 5860 + }, + { + "epoch": 0.4434943815973667, + "grad_norm": 2.118077039718628, + "learning_rate": 1.1396378536894239e-05, + "loss": 0.6258, + "step": 5861 + }, + { + "epoch": 0.44357005031970037, + "grad_norm": 1.8771562576293945, + "learning_rate": 1.1394207549645564e-05, + "loss": 0.5765, + "step": 5862 + }, + { + "epoch": 0.443645719042034, + "grad_norm": 2.192807912826538, + "learning_rate": 1.1392036412463868e-05, 
+ "loss": 0.6963, + "step": 5863 + }, + { + "epoch": 0.4437213877643676, + "grad_norm": 3.553529739379883, + "learning_rate": 1.1389865125485116e-05, + "loss": 0.6827, + "step": 5864 + }, + { + "epoch": 0.4437970564867012, + "grad_norm": 3.7018728256225586, + "learning_rate": 1.138769368884528e-05, + "loss": 0.6063, + "step": 5865 + }, + { + "epoch": 0.44387272520903487, + "grad_norm": 2.078188896179199, + "learning_rate": 1.138552210268034e-05, + "loss": 0.7483, + "step": 5866 + }, + { + "epoch": 0.4439483939313685, + "grad_norm": 2.0784387588500977, + "learning_rate": 1.1383350367126292e-05, + "loss": 0.7824, + "step": 5867 + }, + { + "epoch": 0.4440240626537021, + "grad_norm": 3.4652624130249023, + "learning_rate": 1.1381178482319136e-05, + "loss": 0.7509, + "step": 5868 + }, + { + "epoch": 0.4440997313760357, + "grad_norm": 1.8125836849212646, + "learning_rate": 1.1379006448394882e-05, + "loss": 0.6492, + "step": 5869 + }, + { + "epoch": 0.44417540009836937, + "grad_norm": 2.023577928543091, + "learning_rate": 1.1376834265489545e-05, + "loss": 0.6456, + "step": 5870 + }, + { + "epoch": 0.444251068820703, + "grad_norm": 2.251408100128174, + "learning_rate": 1.1374661933739165e-05, + "loss": 0.7316, + "step": 5871 + }, + { + "epoch": 0.4443267375430366, + "grad_norm": 1.5530261993408203, + "learning_rate": 1.137248945327977e-05, + "loss": 0.9224, + "step": 5872 + }, + { + "epoch": 0.4444024062653702, + "grad_norm": 1.9940237998962402, + "learning_rate": 1.1370316824247414e-05, + "loss": 0.6529, + "step": 5873 + }, + { + "epoch": 0.4444780749877038, + "grad_norm": 2.0414655208587646, + "learning_rate": 1.1368144046778151e-05, + "loss": 0.6643, + "step": 5874 + }, + { + "epoch": 0.4445537437100375, + "grad_norm": 2.2049062252044678, + "learning_rate": 1.1365971121008047e-05, + "loss": 0.586, + "step": 5875 + }, + { + "epoch": 0.4446294124323711, + "grad_norm": 3.8549551963806152, + "learning_rate": 1.1363798047073183e-05, + "loss": 0.9112, + "step": 5876 + }, + { + "epoch": 0.4447050811547047, + "grad_norm": 2.402311325073242, + "learning_rate": 1.1361624825109634e-05, + "loss": 0.7245, + "step": 5877 + }, + { + "epoch": 0.4447807498770383, + "grad_norm": 2.2628328800201416, + "learning_rate": 1.1359451455253505e-05, + "loss": 0.786, + "step": 5878 + }, + { + "epoch": 0.44485641859937197, + "grad_norm": 2.2788891792297363, + "learning_rate": 1.1357277937640893e-05, + "loss": 0.751, + "step": 5879 + }, + { + "epoch": 0.4449320873217056, + "grad_norm": 2.0168333053588867, + "learning_rate": 1.135510427240791e-05, + "loss": 0.7533, + "step": 5880 + }, + { + "epoch": 0.4450077560440392, + "grad_norm": 1.9024062156677246, + "learning_rate": 1.1352930459690684e-05, + "loss": 0.677, + "step": 5881 + }, + { + "epoch": 0.4450834247663728, + "grad_norm": 1.9839564561843872, + "learning_rate": 1.135075649962534e-05, + "loss": 0.7093, + "step": 5882 + }, + { + "epoch": 0.44515909348870647, + "grad_norm": 2.0501761436462402, + "learning_rate": 1.1348582392348022e-05, + "loss": 0.733, + "step": 5883 + }, + { + "epoch": 0.4452347622110401, + "grad_norm": 2.069188356399536, + "learning_rate": 1.1346408137994876e-05, + "loss": 0.7962, + "step": 5884 + }, + { + "epoch": 0.4453104309333737, + "grad_norm": 2.593379497528076, + "learning_rate": 1.1344233736702065e-05, + "loss": 0.6942, + "step": 5885 + }, + { + "epoch": 0.4453860996557073, + "grad_norm": 2.6586804389953613, + "learning_rate": 1.1342059188605756e-05, + "loss": 0.7377, + "step": 5886 + }, + { + "epoch": 0.4454617683780409, + "grad_norm": 
2.206529140472412, + "learning_rate": 1.1339884493842124e-05, + "loss": 0.6509, + "step": 5887 + }, + { + "epoch": 0.4455374371003746, + "grad_norm": 2.4398305416107178, + "learning_rate": 1.1337709652547357e-05, + "loss": 0.825, + "step": 5888 + }, + { + "epoch": 0.4456131058227082, + "grad_norm": 2.344985008239746, + "learning_rate": 1.1335534664857651e-05, + "loss": 0.675, + "step": 5889 + }, + { + "epoch": 0.4456887745450418, + "grad_norm": 2.7695236206054688, + "learning_rate": 1.1333359530909208e-05, + "loss": 0.6979, + "step": 5890 + }, + { + "epoch": 0.4457644432673754, + "grad_norm": 3.5418498516082764, + "learning_rate": 1.1331184250838249e-05, + "loss": 0.6195, + "step": 5891 + }, + { + "epoch": 0.4458401119897091, + "grad_norm": 2.1728100776672363, + "learning_rate": 1.132900882478099e-05, + "loss": 0.7099, + "step": 5892 + }, + { + "epoch": 0.4459157807120427, + "grad_norm": 2.0681023597717285, + "learning_rate": 1.1326833252873663e-05, + "loss": 0.7016, + "step": 5893 + }, + { + "epoch": 0.4459914494343763, + "grad_norm": 2.0414974689483643, + "learning_rate": 1.1324657535252514e-05, + "loss": 0.6981, + "step": 5894 + }, + { + "epoch": 0.4460671181567099, + "grad_norm": 2.562387228012085, + "learning_rate": 1.1322481672053791e-05, + "loss": 0.6492, + "step": 5895 + }, + { + "epoch": 0.4461427868790436, + "grad_norm": 1.7492594718933105, + "learning_rate": 1.1320305663413752e-05, + "loss": 0.5471, + "step": 5896 + }, + { + "epoch": 0.4462184556013772, + "grad_norm": 2.4081857204437256, + "learning_rate": 1.1318129509468671e-05, + "loss": 0.7666, + "step": 5897 + }, + { + "epoch": 0.4462941243237108, + "grad_norm": 2.3385374546051025, + "learning_rate": 1.1315953210354821e-05, + "loss": 0.6716, + "step": 5898 + }, + { + "epoch": 0.4463697930460444, + "grad_norm": 2.440551280975342, + "learning_rate": 1.1313776766208492e-05, + "loss": 0.9059, + "step": 5899 + }, + { + "epoch": 0.446445461768378, + "grad_norm": 1.830227017402649, + "learning_rate": 1.1311600177165972e-05, + "loss": 0.6836, + "step": 5900 + }, + { + "epoch": 0.4465211304907117, + "grad_norm": 1.9618531465530396, + "learning_rate": 1.130942344336358e-05, + "loss": 0.7531, + "step": 5901 + }, + { + "epoch": 0.4465967992130453, + "grad_norm": 1.9825726747512817, + "learning_rate": 1.1307246564937618e-05, + "loss": 0.8805, + "step": 5902 + }, + { + "epoch": 0.4466724679353789, + "grad_norm": 2.091987133026123, + "learning_rate": 1.1305069542024414e-05, + "loss": 0.7716, + "step": 5903 + }, + { + "epoch": 0.4467481366577125, + "grad_norm": 1.93959641456604, + "learning_rate": 1.1302892374760301e-05, + "loss": 0.6985, + "step": 5904 + }, + { + "epoch": 0.4468238053800462, + "grad_norm": 2.1887693405151367, + "learning_rate": 1.130071506328162e-05, + "loss": 0.7668, + "step": 5905 + }, + { + "epoch": 0.4468994741023798, + "grad_norm": 1.8061445951461792, + "learning_rate": 1.1298537607724716e-05, + "loss": 0.6938, + "step": 5906 + }, + { + "epoch": 0.4469751428247134, + "grad_norm": 1.748567819595337, + "learning_rate": 1.1296360008225957e-05, + "loss": 0.6903, + "step": 5907 + }, + { + "epoch": 0.447050811547047, + "grad_norm": 1.7834432125091553, + "learning_rate": 1.1294182264921704e-05, + "loss": 0.6602, + "step": 5908 + }, + { + "epoch": 0.4471264802693807, + "grad_norm": 1.9683499336242676, + "learning_rate": 1.1292004377948338e-05, + "loss": 0.7615, + "step": 5909 + }, + { + "epoch": 0.4472021489917143, + "grad_norm": 2.5268006324768066, + "learning_rate": 1.1289826347442247e-05, + "loss": 0.6007, + "step": 5910 + 
}, + { + "epoch": 0.4472778177140479, + "grad_norm": 2.608851671218872, + "learning_rate": 1.1287648173539822e-05, + "loss": 0.7841, + "step": 5911 + }, + { + "epoch": 0.4473534864363815, + "grad_norm": 2.4634876251220703, + "learning_rate": 1.128546985637747e-05, + "loss": 0.6308, + "step": 5912 + }, + { + "epoch": 0.4474291551587151, + "grad_norm": 1.7287302017211914, + "learning_rate": 1.1283291396091601e-05, + "loss": 0.6105, + "step": 5913 + }, + { + "epoch": 0.4475048238810488, + "grad_norm": 1.982318639755249, + "learning_rate": 1.1281112792818641e-05, + "loss": 0.7053, + "step": 5914 + }, + { + "epoch": 0.4475804926033824, + "grad_norm": 1.8996347188949585, + "learning_rate": 1.1278934046695023e-05, + "loss": 0.7603, + "step": 5915 + }, + { + "epoch": 0.447656161325716, + "grad_norm": 2.161860942840576, + "learning_rate": 1.1276755157857179e-05, + "loss": 0.7217, + "step": 5916 + }, + { + "epoch": 0.4477318300480496, + "grad_norm": 2.7637171745300293, + "learning_rate": 1.1274576126441568e-05, + "loss": 0.7831, + "step": 5917 + }, + { + "epoch": 0.4478074987703833, + "grad_norm": 1.9695764780044556, + "learning_rate": 1.127239695258464e-05, + "loss": 0.6614, + "step": 5918 + }, + { + "epoch": 0.4478831674927169, + "grad_norm": 2.0457887649536133, + "learning_rate": 1.1270217636422864e-05, + "loss": 0.7391, + "step": 5919 + }, + { + "epoch": 0.4479588362150505, + "grad_norm": 1.87351393699646, + "learning_rate": 1.1268038178092718e-05, + "loss": 0.8303, + "step": 5920 + }, + { + "epoch": 0.4480345049373841, + "grad_norm": 2.1492748260498047, + "learning_rate": 1.1265858577730685e-05, + "loss": 0.6984, + "step": 5921 + }, + { + "epoch": 0.4481101736597178, + "grad_norm": 2.0137827396392822, + "learning_rate": 1.1263678835473263e-05, + "loss": 0.7522, + "step": 5922 + }, + { + "epoch": 0.4481858423820514, + "grad_norm": 2.2012124061584473, + "learning_rate": 1.1261498951456948e-05, + "loss": 0.6075, + "step": 5923 + }, + { + "epoch": 0.448261511104385, + "grad_norm": 2.0582940578460693, + "learning_rate": 1.1259318925818253e-05, + "loss": 0.6671, + "step": 5924 + }, + { + "epoch": 0.4483371798267186, + "grad_norm": 2.405733823776245, + "learning_rate": 1.1257138758693701e-05, + "loss": 0.8391, + "step": 5925 + }, + { + "epoch": 0.4484128485490522, + "grad_norm": 3.624671697616577, + "learning_rate": 1.1254958450219817e-05, + "loss": 0.6537, + "step": 5926 + }, + { + "epoch": 0.4484885172713859, + "grad_norm": 2.217015504837036, + "learning_rate": 1.1252778000533143e-05, + "loss": 0.6828, + "step": 5927 + }, + { + "epoch": 0.4485641859937195, + "grad_norm": 2.174923896789551, + "learning_rate": 1.1250597409770225e-05, + "loss": 0.7816, + "step": 5928 + }, + { + "epoch": 0.4486398547160531, + "grad_norm": 1.7993848323822021, + "learning_rate": 1.1248416678067619e-05, + "loss": 0.6842, + "step": 5929 + }, + { + "epoch": 0.4487155234383867, + "grad_norm": 1.861826777458191, + "learning_rate": 1.1246235805561887e-05, + "loss": 0.8071, + "step": 5930 + }, + { + "epoch": 0.4487911921607204, + "grad_norm": 2.257115125656128, + "learning_rate": 1.1244054792389602e-05, + "loss": 0.7332, + "step": 5931 + }, + { + "epoch": 0.448866860883054, + "grad_norm": 2.5872914791107178, + "learning_rate": 1.1241873638687348e-05, + "loss": 0.6017, + "step": 5932 + }, + { + "epoch": 0.4489425296053876, + "grad_norm": 1.890411138534546, + "learning_rate": 1.1239692344591719e-05, + "loss": 0.6682, + "step": 5933 + }, + { + "epoch": 0.4490181983277212, + "grad_norm": 3.7354846000671387, + "learning_rate": 
1.1237510910239306e-05, + "loss": 0.7684, + "step": 5934 + }, + { + "epoch": 0.4490938670500549, + "grad_norm": 1.9048963785171509, + "learning_rate": 1.1235329335766728e-05, + "loss": 0.524, + "step": 5935 + }, + { + "epoch": 0.4491695357723885, + "grad_norm": 1.8189629316329956, + "learning_rate": 1.1233147621310594e-05, + "loss": 0.6492, + "step": 5936 + }, + { + "epoch": 0.4492452044947221, + "grad_norm": 1.712294101715088, + "learning_rate": 1.1230965767007535e-05, + "loss": 0.7487, + "step": 5937 + }, + { + "epoch": 0.4493208732170557, + "grad_norm": 2.2259769439697266, + "learning_rate": 1.1228783772994184e-05, + "loss": 0.7662, + "step": 5938 + }, + { + "epoch": 0.4493965419393893, + "grad_norm": 3.1922950744628906, + "learning_rate": 1.122660163940718e-05, + "loss": 0.8065, + "step": 5939 + }, + { + "epoch": 0.449472210661723, + "grad_norm": 2.1241049766540527, + "learning_rate": 1.1224419366383186e-05, + "loss": 0.6927, + "step": 5940 + }, + { + "epoch": 0.4495478793840566, + "grad_norm": 2.3622326850891113, + "learning_rate": 1.1222236954058853e-05, + "loss": 0.8493, + "step": 5941 + }, + { + "epoch": 0.4496235481063902, + "grad_norm": 3.5606555938720703, + "learning_rate": 1.1220054402570854e-05, + "loss": 0.773, + "step": 5942 + }, + { + "epoch": 0.4496992168287238, + "grad_norm": 2.492074966430664, + "learning_rate": 1.1217871712055869e-05, + "loss": 0.6058, + "step": 5943 + }, + { + "epoch": 0.4497748855510575, + "grad_norm": 2.104963779449463, + "learning_rate": 1.1215688882650582e-05, + "loss": 0.7597, + "step": 5944 + }, + { + "epoch": 0.4498505542733911, + "grad_norm": 1.9802522659301758, + "learning_rate": 1.1213505914491695e-05, + "loss": 0.7904, + "step": 5945 + }, + { + "epoch": 0.4499262229957247, + "grad_norm": 1.8964923620224, + "learning_rate": 1.1211322807715906e-05, + "loss": 0.7552, + "step": 5946 + }, + { + "epoch": 0.4500018917180583, + "grad_norm": 3.3813583850860596, + "learning_rate": 1.1209139562459929e-05, + "loss": 0.6773, + "step": 5947 + }, + { + "epoch": 0.450077560440392, + "grad_norm": 2.5931599140167236, + "learning_rate": 1.120695617886049e-05, + "loss": 0.6421, + "step": 5948 + }, + { + "epoch": 0.4501532291627256, + "grad_norm": 2.148244857788086, + "learning_rate": 1.1204772657054314e-05, + "loss": 0.8242, + "step": 5949 + }, + { + "epoch": 0.4502288978850592, + "grad_norm": 1.9248651266098022, + "learning_rate": 1.1202588997178144e-05, + "loss": 0.737, + "step": 5950 + }, + { + "epoch": 0.4503045666073928, + "grad_norm": 2.1882691383361816, + "learning_rate": 1.1200405199368729e-05, + "loss": 0.641, + "step": 5951 + }, + { + "epoch": 0.45038023532972643, + "grad_norm": 2.8311820030212402, + "learning_rate": 1.119822126376282e-05, + "loss": 0.6549, + "step": 5952 + }, + { + "epoch": 0.4504559040520601, + "grad_norm": 2.2649013996124268, + "learning_rate": 1.1196037190497188e-05, + "loss": 0.7611, + "step": 5953 + }, + { + "epoch": 0.4505315727743937, + "grad_norm": 1.7124543190002441, + "learning_rate": 1.1193852979708604e-05, + "loss": 0.7877, + "step": 5954 + }, + { + "epoch": 0.4506072414967273, + "grad_norm": 2.419224739074707, + "learning_rate": 1.119166863153385e-05, + "loss": 0.7871, + "step": 5955 + }, + { + "epoch": 0.45068291021906093, + "grad_norm": 2.265690565109253, + "learning_rate": 1.1189484146109719e-05, + "loss": 0.5847, + "step": 5956 + }, + { + "epoch": 0.4507585789413946, + "grad_norm": 2.1658334732055664, + "learning_rate": 1.1187299523573007e-05, + "loss": 0.6962, + "step": 5957 + }, + { + "epoch": 0.4508342476637282, + 
"grad_norm": 2.0252439975738525, + "learning_rate": 1.1185114764060528e-05, + "loss": 0.7378, + "step": 5958 + }, + { + "epoch": 0.4509099163860618, + "grad_norm": 1.994943380355835, + "learning_rate": 1.118292986770909e-05, + "loss": 0.6885, + "step": 5959 + }, + { + "epoch": 0.4509855851083954, + "grad_norm": 2.032151699066162, + "learning_rate": 1.1180744834655526e-05, + "loss": 0.7695, + "step": 5960 + }, + { + "epoch": 0.4510612538307291, + "grad_norm": 1.8477638959884644, + "learning_rate": 1.1178559665036666e-05, + "loss": 0.8245, + "step": 5961 + }, + { + "epoch": 0.4511369225530627, + "grad_norm": 1.867470145225525, + "learning_rate": 1.1176374358989354e-05, + "loss": 0.6492, + "step": 5962 + }, + { + "epoch": 0.4512125912753963, + "grad_norm": 2.083955764770508, + "learning_rate": 1.117418891665044e-05, + "loss": 0.6438, + "step": 5963 + }, + { + "epoch": 0.4512882599977299, + "grad_norm": 2.1489977836608887, + "learning_rate": 1.1172003338156787e-05, + "loss": 0.6843, + "step": 5964 + }, + { + "epoch": 0.45136392872006353, + "grad_norm": 1.905900478363037, + "learning_rate": 1.1169817623645256e-05, + "loss": 0.6142, + "step": 5965 + }, + { + "epoch": 0.4514395974423972, + "grad_norm": 2.060368537902832, + "learning_rate": 1.116763177325273e-05, + "loss": 0.76, + "step": 5966 + }, + { + "epoch": 0.4515152661647308, + "grad_norm": 2.1221015453338623, + "learning_rate": 1.1165445787116088e-05, + "loss": 0.7409, + "step": 5967 + }, + { + "epoch": 0.4515909348870644, + "grad_norm": 1.9896661043167114, + "learning_rate": 1.116325966537223e-05, + "loss": 0.865, + "step": 5968 + }, + { + "epoch": 0.45166660360939803, + "grad_norm": 1.9330137968063354, + "learning_rate": 1.1161073408158054e-05, + "loss": 0.8041, + "step": 5969 + }, + { + "epoch": 0.4517422723317317, + "grad_norm": 1.8360910415649414, + "learning_rate": 1.115888701561047e-05, + "loss": 0.7317, + "step": 5970 + }, + { + "epoch": 0.4518179410540653, + "grad_norm": 2.239154815673828, + "learning_rate": 1.11567004878664e-05, + "loss": 0.765, + "step": 5971 + }, + { + "epoch": 0.4518936097763989, + "grad_norm": 2.8562796115875244, + "learning_rate": 1.115451382506277e-05, + "loss": 0.9, + "step": 5972 + }, + { + "epoch": 0.45196927849873253, + "grad_norm": 1.8659065961837769, + "learning_rate": 1.1152327027336513e-05, + "loss": 0.6336, + "step": 5973 + }, + { + "epoch": 0.4520449472210662, + "grad_norm": 2.5955421924591064, + "learning_rate": 1.1150140094824579e-05, + "loss": 0.6623, + "step": 5974 + }, + { + "epoch": 0.4521206159433998, + "grad_norm": 1.7861441373825073, + "learning_rate": 1.1147953027663919e-05, + "loss": 0.6716, + "step": 5975 + }, + { + "epoch": 0.4521962846657334, + "grad_norm": 1.989698052406311, + "learning_rate": 1.114576582599149e-05, + "loss": 0.5853, + "step": 5976 + }, + { + "epoch": 0.452271953388067, + "grad_norm": 1.9923795461654663, + "learning_rate": 1.1143578489944266e-05, + "loss": 0.7264, + "step": 5977 + }, + { + "epoch": 0.4523476221104007, + "grad_norm": 2.052943229675293, + "learning_rate": 1.1141391019659223e-05, + "loss": 0.6532, + "step": 5978 + }, + { + "epoch": 0.4524232908327343, + "grad_norm": 1.8937102556228638, + "learning_rate": 1.113920341527335e-05, + "loss": 0.6145, + "step": 5979 + }, + { + "epoch": 0.4524989595550679, + "grad_norm": 1.7291990518569946, + "learning_rate": 1.1137015676923637e-05, + "loss": 0.7514, + "step": 5980 + }, + { + "epoch": 0.4525746282774015, + "grad_norm": 1.8814363479614258, + "learning_rate": 1.1134827804747093e-05, + "loss": 0.7184, + "step": 
5981 + }, + { + "epoch": 0.45265029699973514, + "grad_norm": 2.006896495819092, + "learning_rate": 1.1132639798880728e-05, + "loss": 0.6344, + "step": 5982 + }, + { + "epoch": 0.4527259657220688, + "grad_norm": 2.146019458770752, + "learning_rate": 1.1130451659461559e-05, + "loss": 0.6921, + "step": 5983 + }, + { + "epoch": 0.4528016344444024, + "grad_norm": 10.404562950134277, + "learning_rate": 1.1128263386626617e-05, + "loss": 0.5599, + "step": 5984 + }, + { + "epoch": 0.452877303166736, + "grad_norm": 1.8004459142684937, + "learning_rate": 1.1126074980512936e-05, + "loss": 0.6701, + "step": 5985 + }, + { + "epoch": 0.45295297188906963, + "grad_norm": 2.264495611190796, + "learning_rate": 1.1123886441257567e-05, + "loss": 0.7605, + "step": 5986 + }, + { + "epoch": 0.4530286406114033, + "grad_norm": 1.9421061277389526, + "learning_rate": 1.1121697768997556e-05, + "loss": 0.7667, + "step": 5987 + }, + { + "epoch": 0.4531043093337369, + "grad_norm": 3.031816244125366, + "learning_rate": 1.1119508963869971e-05, + "loss": 0.5885, + "step": 5988 + }, + { + "epoch": 0.4531799780560705, + "grad_norm": 3.6335830688476562, + "learning_rate": 1.1117320026011878e-05, + "loss": 0.6176, + "step": 5989 + }, + { + "epoch": 0.45325564677840413, + "grad_norm": 2.454843282699585, + "learning_rate": 1.1115130955560357e-05, + "loss": 0.7809, + "step": 5990 + }, + { + "epoch": 0.4533313155007378, + "grad_norm": 1.9949727058410645, + "learning_rate": 1.1112941752652495e-05, + "loss": 0.7147, + "step": 5991 + }, + { + "epoch": 0.4534069842230714, + "grad_norm": 1.9766342639923096, + "learning_rate": 1.1110752417425386e-05, + "loss": 0.8628, + "step": 5992 + }, + { + "epoch": 0.453482652945405, + "grad_norm": 2.25211763381958, + "learning_rate": 1.1108562950016133e-05, + "loss": 0.7544, + "step": 5993 + }, + { + "epoch": 0.45355832166773863, + "grad_norm": 2.635415554046631, + "learning_rate": 1.1106373350561848e-05, + "loss": 0.8409, + "step": 5994 + }, + { + "epoch": 0.45363399039007224, + "grad_norm": 2.761585235595703, + "learning_rate": 1.110418361919965e-05, + "loss": 0.8435, + "step": 5995 + }, + { + "epoch": 0.4537096591124059, + "grad_norm": 1.7957862615585327, + "learning_rate": 1.110199375606667e-05, + "loss": 0.6987, + "step": 5996 + }, + { + "epoch": 0.4537853278347395, + "grad_norm": 1.743152141571045, + "learning_rate": 1.1099803761300043e-05, + "loss": 0.5961, + "step": 5997 + }, + { + "epoch": 0.4538609965570731, + "grad_norm": 1.9768725633621216, + "learning_rate": 1.1097613635036912e-05, + "loss": 0.731, + "step": 5998 + }, + { + "epoch": 0.45393666527940674, + "grad_norm": 2.327970504760742, + "learning_rate": 1.109542337741443e-05, + "loss": 0.8248, + "step": 5999 + }, + { + "epoch": 0.4540123340017404, + "grad_norm": 2.5516083240509033, + "learning_rate": 1.1093232988569757e-05, + "loss": 0.7137, + "step": 6000 + }, + { + "epoch": 0.454088002724074, + "grad_norm": 2.588467836380005, + "learning_rate": 1.1091042468640066e-05, + "loss": 0.7983, + "step": 6001 + }, + { + "epoch": 0.4541636714464076, + "grad_norm": 2.372370481491089, + "learning_rate": 1.1088851817762537e-05, + "loss": 0.7425, + "step": 6002 + }, + { + "epoch": 0.45423934016874123, + "grad_norm": 2.482089042663574, + "learning_rate": 1.1086661036074342e-05, + "loss": 0.6915, + "step": 6003 + }, + { + "epoch": 0.4543150088910749, + "grad_norm": 2.0456271171569824, + "learning_rate": 1.108447012371269e-05, + "loss": 0.6623, + "step": 6004 + }, + { + "epoch": 0.4543906776134085, + "grad_norm": 2.8427894115448, + "learning_rate": 
1.1082279080814775e-05, + "loss": 0.7134, + "step": 6005 + }, + { + "epoch": 0.4544663463357421, + "grad_norm": 2.3383491039276123, + "learning_rate": 1.1080087907517808e-05, + "loss": 0.8108, + "step": 6006 + }, + { + "epoch": 0.45454201505807573, + "grad_norm": 2.1955339908599854, + "learning_rate": 1.107789660395901e-05, + "loss": 0.6805, + "step": 6007 + }, + { + "epoch": 0.45461768378040934, + "grad_norm": 2.1509621143341064, + "learning_rate": 1.1075705170275605e-05, + "loss": 0.7771, + "step": 6008 + }, + { + "epoch": 0.454693352502743, + "grad_norm": 2.429506778717041, + "learning_rate": 1.107351360660483e-05, + "loss": 0.6084, + "step": 6009 + }, + { + "epoch": 0.4547690212250766, + "grad_norm": 2.3158512115478516, + "learning_rate": 1.1071321913083925e-05, + "loss": 0.7337, + "step": 6010 + }, + { + "epoch": 0.45484468994741023, + "grad_norm": 1.9755150079727173, + "learning_rate": 1.1069130089850142e-05, + "loss": 0.9059, + "step": 6011 + }, + { + "epoch": 0.45492035866974384, + "grad_norm": 1.9800193309783936, + "learning_rate": 1.1066938137040742e-05, + "loss": 0.9518, + "step": 6012 + }, + { + "epoch": 0.4549960273920775, + "grad_norm": 2.4362120628356934, + "learning_rate": 1.106474605479299e-05, + "loss": 0.8131, + "step": 6013 + }, + { + "epoch": 0.4550716961144111, + "grad_norm": 3.1265878677368164, + "learning_rate": 1.106255384324416e-05, + "loss": 0.8113, + "step": 6014 + }, + { + "epoch": 0.45514736483674473, + "grad_norm": 2.1288368701934814, + "learning_rate": 1.106036150253154e-05, + "loss": 0.6329, + "step": 6015 + }, + { + "epoch": 0.45522303355907834, + "grad_norm": 2.2900583744049072, + "learning_rate": 1.1058169032792419e-05, + "loss": 0.6617, + "step": 6016 + }, + { + "epoch": 0.455298702281412, + "grad_norm": 2.1186749935150146, + "learning_rate": 1.1055976434164094e-05, + "loss": 0.76, + "step": 6017 + }, + { + "epoch": 0.4553743710037456, + "grad_norm": 1.967383623123169, + "learning_rate": 1.1053783706783876e-05, + "loss": 0.7049, + "step": 6018 + }, + { + "epoch": 0.4554500397260792, + "grad_norm": 2.262080430984497, + "learning_rate": 1.1051590850789076e-05, + "loss": 0.739, + "step": 6019 + }, + { + "epoch": 0.45552570844841284, + "grad_norm": 2.2179148197174072, + "learning_rate": 1.1049397866317026e-05, + "loss": 0.8633, + "step": 6020 + }, + { + "epoch": 0.45560137717074645, + "grad_norm": 2.158219575881958, + "learning_rate": 1.1047204753505052e-05, + "loss": 0.7621, + "step": 6021 + }, + { + "epoch": 0.4556770458930801, + "grad_norm": 2.1269586086273193, + "learning_rate": 1.1045011512490493e-05, + "loss": 0.7006, + "step": 6022 + }, + { + "epoch": 0.4557527146154137, + "grad_norm": 2.0919365882873535, + "learning_rate": 1.1042818143410702e-05, + "loss": 0.6958, + "step": 6023 + }, + { + "epoch": 0.45582838333774733, + "grad_norm": 2.181525230407715, + "learning_rate": 1.1040624646403027e-05, + "loss": 0.6319, + "step": 6024 + }, + { + "epoch": 0.45590405206008094, + "grad_norm": 2.3236260414123535, + "learning_rate": 1.1038431021604841e-05, + "loss": 0.8105, + "step": 6025 + }, + { + "epoch": 0.4559797207824146, + "grad_norm": 2.2050108909606934, + "learning_rate": 1.1036237269153509e-05, + "loss": 0.7843, + "step": 6026 + }, + { + "epoch": 0.4560553895047482, + "grad_norm": 2.168041467666626, + "learning_rate": 1.1034043389186414e-05, + "loss": 0.8211, + "step": 6027 + }, + { + "epoch": 0.45613105822708183, + "grad_norm": 2.095221996307373, + "learning_rate": 1.1031849381840942e-05, + "loss": 0.7797, + "step": 6028 + }, + { + "epoch": 
0.45620672694941544, + "grad_norm": 2.5357155799865723, + "learning_rate": 1.102965524725449e-05, + "loss": 0.748, + "step": 6029 + }, + { + "epoch": 0.4562823956717491, + "grad_norm": 2.3060734272003174, + "learning_rate": 1.1027460985564464e-05, + "loss": 0.6879, + "step": 6030 + }, + { + "epoch": 0.4563580643940827, + "grad_norm": 1.986255407333374, + "learning_rate": 1.102526659690827e-05, + "loss": 0.5767, + "step": 6031 + }, + { + "epoch": 0.45643373311641633, + "grad_norm": 1.7908231019973755, + "learning_rate": 1.1023072081423334e-05, + "loss": 0.5617, + "step": 6032 + }, + { + "epoch": 0.45650940183874994, + "grad_norm": 2.0068113803863525, + "learning_rate": 1.102087743924708e-05, + "loss": 0.8403, + "step": 6033 + }, + { + "epoch": 0.45658507056108355, + "grad_norm": 2.3249096870422363, + "learning_rate": 1.1018682670516945e-05, + "loss": 0.6172, + "step": 6034 + }, + { + "epoch": 0.4566607392834172, + "grad_norm": 2.054591178894043, + "learning_rate": 1.101648777537037e-05, + "loss": 0.7662, + "step": 6035 + }, + { + "epoch": 0.4567364080057508, + "grad_norm": 2.0367980003356934, + "learning_rate": 1.101429275394481e-05, + "loss": 0.6568, + "step": 6036 + }, + { + "epoch": 0.45681207672808444, + "grad_norm": 1.8275066614151, + "learning_rate": 1.1012097606377722e-05, + "loss": 0.7773, + "step": 6037 + }, + { + "epoch": 0.45688774545041805, + "grad_norm": 1.9169228076934814, + "learning_rate": 1.1009902332806577e-05, + "loss": 0.6533, + "step": 6038 + }, + { + "epoch": 0.4569634141727517, + "grad_norm": 2.531177520751953, + "learning_rate": 1.1007706933368843e-05, + "loss": 0.8055, + "step": 6039 + }, + { + "epoch": 0.4570390828950853, + "grad_norm": 2.8043148517608643, + "learning_rate": 1.1005511408202008e-05, + "loss": 0.8399, + "step": 6040 + }, + { + "epoch": 0.45711475161741894, + "grad_norm": 1.9905619621276855, + "learning_rate": 1.1003315757443565e-05, + "loss": 0.5476, + "step": 6041 + }, + { + "epoch": 0.45719042033975255, + "grad_norm": 2.0711894035339355, + "learning_rate": 1.1001119981231004e-05, + "loss": 0.6972, + "step": 6042 + }, + { + "epoch": 0.4572660890620862, + "grad_norm": 2.3095309734344482, + "learning_rate": 1.0998924079701843e-05, + "loss": 0.6728, + "step": 6043 + }, + { + "epoch": 0.4573417577844198, + "grad_norm": 3.2638471126556396, + "learning_rate": 1.0996728052993586e-05, + "loss": 0.838, + "step": 6044 + }, + { + "epoch": 0.45741742650675343, + "grad_norm": 2.1741018295288086, + "learning_rate": 1.0994531901243763e-05, + "loss": 0.7435, + "step": 6045 + }, + { + "epoch": 0.45749309522908704, + "grad_norm": 2.157972812652588, + "learning_rate": 1.0992335624589902e-05, + "loss": 0.7142, + "step": 6046 + }, + { + "epoch": 0.45756876395142065, + "grad_norm": 3.9512946605682373, + "learning_rate": 1.099013922316954e-05, + "loss": 0.5924, + "step": 6047 + }, + { + "epoch": 0.4576444326737543, + "grad_norm": 1.8326383829116821, + "learning_rate": 1.0987942697120223e-05, + "loss": 0.7153, + "step": 6048 + }, + { + "epoch": 0.45772010139608793, + "grad_norm": 2.491291046142578, + "learning_rate": 1.09857460465795e-05, + "loss": 0.7102, + "step": 6049 + }, + { + "epoch": 0.45779577011842154, + "grad_norm": 2.1547534465789795, + "learning_rate": 1.0983549271684944e-05, + "loss": 0.6816, + "step": 6050 + }, + { + "epoch": 0.45787143884075515, + "grad_norm": 2.5720443725585938, + "learning_rate": 1.0981352372574111e-05, + "loss": 0.7271, + "step": 6051 + }, + { + "epoch": 0.4579471075630888, + "grad_norm": 2.2329049110412598, + "learning_rate": 
1.0979155349384587e-05, + "loss": 0.6481, + "step": 6052 + }, + { + "epoch": 0.45802277628542243, + "grad_norm": 2.2942488193511963, + "learning_rate": 1.0976958202253951e-05, + "loss": 0.7957, + "step": 6053 + }, + { + "epoch": 0.45809844500775604, + "grad_norm": 1.748425006866455, + "learning_rate": 1.0974760931319801e-05, + "loss": 0.9268, + "step": 6054 + }, + { + "epoch": 0.45817411373008965, + "grad_norm": 5.090272903442383, + "learning_rate": 1.0972563536719736e-05, + "loss": 0.7559, + "step": 6055 + }, + { + "epoch": 0.4582497824524233, + "grad_norm": 2.0367588996887207, + "learning_rate": 1.097036601859136e-05, + "loss": 0.7083, + "step": 6056 + }, + { + "epoch": 0.4583254511747569, + "grad_norm": 2.288196325302124, + "learning_rate": 1.096816837707229e-05, + "loss": 0.7582, + "step": 6057 + }, + { + "epoch": 0.45840111989709054, + "grad_norm": 1.7706087827682495, + "learning_rate": 1.096597061230015e-05, + "loss": 0.6756, + "step": 6058 + }, + { + "epoch": 0.45847678861942415, + "grad_norm": 2.1202590465545654, + "learning_rate": 1.0963772724412575e-05, + "loss": 0.7529, + "step": 6059 + }, + { + "epoch": 0.45855245734175776, + "grad_norm": 4.25075626373291, + "learning_rate": 1.0961574713547196e-05, + "loss": 0.8093, + "step": 6060 + }, + { + "epoch": 0.4586281260640914, + "grad_norm": 2.7584314346313477, + "learning_rate": 1.0959376579841669e-05, + "loss": 0.6798, + "step": 6061 + }, + { + "epoch": 0.45870379478642503, + "grad_norm": 2.2744717597961426, + "learning_rate": 1.095717832343364e-05, + "loss": 0.573, + "step": 6062 + }, + { + "epoch": 0.45877946350875864, + "grad_norm": 1.9462858438491821, + "learning_rate": 1.0954979944460773e-05, + "loss": 0.7179, + "step": 6063 + }, + { + "epoch": 0.45885513223109226, + "grad_norm": 2.249580144882202, + "learning_rate": 1.0952781443060742e-05, + "loss": 0.6776, + "step": 6064 + }, + { + "epoch": 0.4589308009534259, + "grad_norm": 2.2577133178710938, + "learning_rate": 1.0950582819371215e-05, + "loss": 0.7104, + "step": 6065 + }, + { + "epoch": 0.45900646967575953, + "grad_norm": 1.8561004400253296, + "learning_rate": 1.094838407352989e-05, + "loss": 0.6328, + "step": 6066 + }, + { + "epoch": 0.45908213839809314, + "grad_norm": 2.133049249649048, + "learning_rate": 1.0946185205674447e-05, + "loss": 0.803, + "step": 6067 + }, + { + "epoch": 0.45915780712042675, + "grad_norm": 2.2266693115234375, + "learning_rate": 1.0943986215942597e-05, + "loss": 0.7626, + "step": 6068 + }, + { + "epoch": 0.4592334758427604, + "grad_norm": 2.2593750953674316, + "learning_rate": 1.0941787104472038e-05, + "loss": 0.5993, + "step": 6069 + }, + { + "epoch": 0.45930914456509403, + "grad_norm": 1.5973234176635742, + "learning_rate": 1.0939587871400493e-05, + "loss": 0.9396, + "step": 6070 + }, + { + "epoch": 0.45938481328742764, + "grad_norm": 1.9865094423294067, + "learning_rate": 1.0937388516865681e-05, + "loss": 0.843, + "step": 6071 + }, + { + "epoch": 0.45946048200976125, + "grad_norm": 2.004659414291382, + "learning_rate": 1.093518904100533e-05, + "loss": 0.5817, + "step": 6072 + }, + { + "epoch": 0.45953615073209486, + "grad_norm": 1.994409441947937, + "learning_rate": 1.0932989443957188e-05, + "loss": 0.6163, + "step": 6073 + }, + { + "epoch": 0.4596118194544285, + "grad_norm": 2.4279356002807617, + "learning_rate": 1.0930789725858994e-05, + "loss": 0.6187, + "step": 6074 + }, + { + "epoch": 0.45968748817676214, + "grad_norm": 1.8175309896469116, + "learning_rate": 1.0928589886848499e-05, + "loss": 0.7091, + "step": 6075 + }, + { + "epoch": 
0.45976315689909575, + "grad_norm": 2.018789768218994, + "learning_rate": 1.092638992706347e-05, + "loss": 0.5512, + "step": 6076 + }, + { + "epoch": 0.45983882562142936, + "grad_norm": 1.8385061025619507, + "learning_rate": 1.0924189846641673e-05, + "loss": 0.665, + "step": 6077 + }, + { + "epoch": 0.459914494343763, + "grad_norm": 2.198543071746826, + "learning_rate": 1.0921989645720883e-05, + "loss": 0.7314, + "step": 6078 + }, + { + "epoch": 0.45999016306609664, + "grad_norm": 3.3511157035827637, + "learning_rate": 1.0919789324438886e-05, + "loss": 0.7007, + "step": 6079 + }, + { + "epoch": 0.46006583178843025, + "grad_norm": 2.0471951961517334, + "learning_rate": 1.0917588882933472e-05, + "loss": 0.8203, + "step": 6080 + }, + { + "epoch": 0.46014150051076386, + "grad_norm": 2.699324131011963, + "learning_rate": 1.091538832134244e-05, + "loss": 0.5743, + "step": 6081 + }, + { + "epoch": 0.4602171692330975, + "grad_norm": 2.1900722980499268, + "learning_rate": 1.0913187639803598e-05, + "loss": 0.7094, + "step": 6082 + }, + { + "epoch": 0.46029283795543113, + "grad_norm": 2.500459671020508, + "learning_rate": 1.0910986838454754e-05, + "loss": 0.6425, + "step": 6083 + }, + { + "epoch": 0.46036850667776474, + "grad_norm": 2.3873353004455566, + "learning_rate": 1.0908785917433737e-05, + "loss": 0.6988, + "step": 6084 + }, + { + "epoch": 0.46044417540009835, + "grad_norm": 2.539494037628174, + "learning_rate": 1.090658487687837e-05, + "loss": 0.6685, + "step": 6085 + }, + { + "epoch": 0.46051984412243196, + "grad_norm": 2.0410473346710205, + "learning_rate": 1.0904383716926491e-05, + "loss": 0.8181, + "step": 6086 + }, + { + "epoch": 0.46059551284476563, + "grad_norm": 2.1296074390411377, + "learning_rate": 1.0902182437715947e-05, + "loss": 0.6946, + "step": 6087 + }, + { + "epoch": 0.46067118156709924, + "grad_norm": 2.1949639320373535, + "learning_rate": 1.0899981039384581e-05, + "loss": 0.7152, + "step": 6088 + }, + { + "epoch": 0.46074685028943285, + "grad_norm": 2.5032870769500732, + "learning_rate": 1.0897779522070262e-05, + "loss": 0.7305, + "step": 6089 + }, + { + "epoch": 0.46082251901176646, + "grad_norm": 1.8629069328308105, + "learning_rate": 1.0895577885910846e-05, + "loss": 0.7541, + "step": 6090 + }, + { + "epoch": 0.46089818773410013, + "grad_norm": 1.527066707611084, + "learning_rate": 1.0893376131044219e-05, + "loss": 0.6971, + "step": 6091 + }, + { + "epoch": 0.46097385645643374, + "grad_norm": 1.9106502532958984, + "learning_rate": 1.089117425760825e-05, + "loss": 0.7487, + "step": 6092 + }, + { + "epoch": 0.46104952517876735, + "grad_norm": 2.104304075241089, + "learning_rate": 1.0888972265740833e-05, + "loss": 0.7364, + "step": 6093 + }, + { + "epoch": 0.46112519390110096, + "grad_norm": 2.0999743938446045, + "learning_rate": 1.0886770155579864e-05, + "loss": 0.7293, + "step": 6094 + }, + { + "epoch": 0.4612008626234346, + "grad_norm": 2.2601325511932373, + "learning_rate": 1.0884567927263243e-05, + "loss": 0.7555, + "step": 6095 + }, + { + "epoch": 0.46127653134576824, + "grad_norm": 1.8899502754211426, + "learning_rate": 1.0882365580928885e-05, + "loss": 0.6945, + "step": 6096 + }, + { + "epoch": 0.46135220006810185, + "grad_norm": 2.614907741546631, + "learning_rate": 1.0880163116714706e-05, + "loss": 0.8823, + "step": 6097 + }, + { + "epoch": 0.46142786879043546, + "grad_norm": 1.670644760131836, + "learning_rate": 1.087796053475863e-05, + "loss": 0.8269, + "step": 6098 + }, + { + "epoch": 0.4615035375127691, + "grad_norm": 2.111875295639038, + "learning_rate": 
1.0875757835198592e-05, + "loss": 0.6693, + "step": 6099 + }, + { + "epoch": 0.46157920623510273, + "grad_norm": 1.9727673530578613, + "learning_rate": 1.0873555018172533e-05, + "loss": 0.661, + "step": 6100 + }, + { + "epoch": 0.46165487495743635, + "grad_norm": 2.154547691345215, + "learning_rate": 1.0871352083818397e-05, + "loss": 0.6215, + "step": 6101 + }, + { + "epoch": 0.46173054367976996, + "grad_norm": 3.549818992614746, + "learning_rate": 1.0869149032274142e-05, + "loss": 0.8293, + "step": 6102 + }, + { + "epoch": 0.46180621240210357, + "grad_norm": 2.007596731185913, + "learning_rate": 1.0866945863677728e-05, + "loss": 0.8542, + "step": 6103 + }, + { + "epoch": 0.46188188112443723, + "grad_norm": 1.8901540040969849, + "learning_rate": 1.0864742578167123e-05, + "loss": 0.7118, + "step": 6104 + }, + { + "epoch": 0.46195754984677084, + "grad_norm": 2.25508975982666, + "learning_rate": 1.0862539175880313e-05, + "loss": 0.7328, + "step": 6105 + }, + { + "epoch": 0.46203321856910445, + "grad_norm": 1.9641987085342407, + "learning_rate": 1.086033565695527e-05, + "loss": 0.7337, + "step": 6106 + }, + { + "epoch": 0.46210888729143806, + "grad_norm": 1.9229612350463867, + "learning_rate": 1.0858132021529995e-05, + "loss": 0.7118, + "step": 6107 + }, + { + "epoch": 0.46218455601377173, + "grad_norm": 2.1345152854919434, + "learning_rate": 1.0855928269742479e-05, + "loss": 0.7995, + "step": 6108 + }, + { + "epoch": 0.46226022473610534, + "grad_norm": 1.999732255935669, + "learning_rate": 1.0853724401730733e-05, + "loss": 0.5753, + "step": 6109 + }, + { + "epoch": 0.46233589345843895, + "grad_norm": 1.7890396118164062, + "learning_rate": 1.0851520417632772e-05, + "loss": 0.7044, + "step": 6110 + }, + { + "epoch": 0.46241156218077256, + "grad_norm": 1.8957141637802124, + "learning_rate": 1.0849316317586611e-05, + "loss": 0.8104, + "step": 6111 + }, + { + "epoch": 0.46248723090310623, + "grad_norm": 1.9614812135696411, + "learning_rate": 1.0847112101730284e-05, + "loss": 0.7579, + "step": 6112 + }, + { + "epoch": 0.46256289962543984, + "grad_norm": 4.7304205894470215, + "learning_rate": 1.0844907770201818e-05, + "loss": 0.666, + "step": 6113 + }, + { + "epoch": 0.46263856834777345, + "grad_norm": 2.0775561332702637, + "learning_rate": 1.0842703323139265e-05, + "loss": 0.6668, + "step": 6114 + }, + { + "epoch": 0.46271423707010706, + "grad_norm": 1.9007248878479004, + "learning_rate": 1.0840498760680668e-05, + "loss": 0.6426, + "step": 6115 + }, + { + "epoch": 0.46278990579244067, + "grad_norm": 1.8677881956100464, + "learning_rate": 1.0838294082964087e-05, + "loss": 0.7154, + "step": 6116 + }, + { + "epoch": 0.46286557451477434, + "grad_norm": 2.1474485397338867, + "learning_rate": 1.0836089290127581e-05, + "loss": 0.8626, + "step": 6117 + }, + { + "epoch": 0.46294124323710795, + "grad_norm": 1.910867691040039, + "learning_rate": 1.083388438230923e-05, + "loss": 0.8349, + "step": 6118 + }, + { + "epoch": 0.46301691195944156, + "grad_norm": 1.9479092359542847, + "learning_rate": 1.0831679359647104e-05, + "loss": 0.7569, + "step": 6119 + }, + { + "epoch": 0.46309258068177517, + "grad_norm": 1.7901252508163452, + "learning_rate": 1.0829474222279293e-05, + "loss": 0.8097, + "step": 6120 + }, + { + "epoch": 0.46316824940410883, + "grad_norm": 2.567704677581787, + "learning_rate": 1.0827268970343888e-05, + "loss": 0.601, + "step": 6121 + }, + { + "epoch": 0.46324391812644244, + "grad_norm": 2.589970350265503, + "learning_rate": 1.082506360397899e-05, + "loss": 0.698, + "step": 6122 + }, + { + 
"epoch": 0.46331958684877605, + "grad_norm": 2.233529567718506, + "learning_rate": 1.082285812332271e-05, + "loss": 0.8512, + "step": 6123 + }, + { + "epoch": 0.46339525557110967, + "grad_norm": 2.331575870513916, + "learning_rate": 1.0820652528513151e-05, + "loss": 0.7811, + "step": 6124 + }, + { + "epoch": 0.46347092429344333, + "grad_norm": 2.2309775352478027, + "learning_rate": 1.081844681968845e-05, + "loss": 0.6453, + "step": 6125 + }, + { + "epoch": 0.46354659301577694, + "grad_norm": 1.7300264835357666, + "learning_rate": 1.0816240996986723e-05, + "loss": 0.6944, + "step": 6126 + }, + { + "epoch": 0.46362226173811055, + "grad_norm": 2.187654972076416, + "learning_rate": 1.0814035060546112e-05, + "loss": 0.7591, + "step": 6127 + }, + { + "epoch": 0.46369793046044416, + "grad_norm": 2.0955562591552734, + "learning_rate": 1.081182901050476e-05, + "loss": 0.7257, + "step": 6128 + }, + { + "epoch": 0.4637735991827778, + "grad_norm": 2.2414302825927734, + "learning_rate": 1.080962284700081e-05, + "loss": 0.6971, + "step": 6129 + }, + { + "epoch": 0.46384926790511144, + "grad_norm": 4.362318992614746, + "learning_rate": 1.0807416570172429e-05, + "loss": 0.7791, + "step": 6130 + }, + { + "epoch": 0.46392493662744505, + "grad_norm": 1.8338907957077026, + "learning_rate": 1.0805210180157772e-05, + "loss": 0.6372, + "step": 6131 + }, + { + "epoch": 0.46400060534977866, + "grad_norm": 2.5617542266845703, + "learning_rate": 1.080300367709502e-05, + "loss": 0.7845, + "step": 6132 + }, + { + "epoch": 0.46407627407211227, + "grad_norm": 1.9908982515335083, + "learning_rate": 1.0800797061122341e-05, + "loss": 0.5978, + "step": 6133 + }, + { + "epoch": 0.46415194279444594, + "grad_norm": 2.4405834674835205, + "learning_rate": 1.079859033237793e-05, + "loss": 0.7342, + "step": 6134 + }, + { + "epoch": 0.46422761151677955, + "grad_norm": 1.8507134914398193, + "learning_rate": 1.0796383490999975e-05, + "loss": 0.6845, + "step": 6135 + }, + { + "epoch": 0.46430328023911316, + "grad_norm": 2.517188787460327, + "learning_rate": 1.0794176537126674e-05, + "loss": 0.7715, + "step": 6136 + }, + { + "epoch": 0.46437894896144677, + "grad_norm": 1.7793668508529663, + "learning_rate": 1.0791969470896235e-05, + "loss": 0.8915, + "step": 6137 + }, + { + "epoch": 0.46445461768378044, + "grad_norm": 1.7618346214294434, + "learning_rate": 1.0789762292446869e-05, + "loss": 0.6284, + "step": 6138 + }, + { + "epoch": 0.46453028640611405, + "grad_norm": 2.0393192768096924, + "learning_rate": 1.0787555001916803e-05, + "loss": 0.6054, + "step": 6139 + }, + { + "epoch": 0.46460595512844766, + "grad_norm": 2.2760608196258545, + "learning_rate": 1.078534759944426e-05, + "loss": 0.8148, + "step": 6140 + }, + { + "epoch": 0.46468162385078127, + "grad_norm": 1.8736259937286377, + "learning_rate": 1.0783140085167477e-05, + "loss": 0.6872, + "step": 6141 + }, + { + "epoch": 0.4647572925731149, + "grad_norm": 2.1906280517578125, + "learning_rate": 1.0780932459224692e-05, + "loss": 0.7743, + "step": 6142 + }, + { + "epoch": 0.46483296129544854, + "grad_norm": 2.7648184299468994, + "learning_rate": 1.077872472175416e-05, + "loss": 0.6033, + "step": 6143 + }, + { + "epoch": 0.46490863001778215, + "grad_norm": 1.824427843093872, + "learning_rate": 1.077651687289413e-05, + "loss": 0.5442, + "step": 6144 + }, + { + "epoch": 0.46498429874011576, + "grad_norm": 3.1149511337280273, + "learning_rate": 1.0774308912782866e-05, + "loss": 0.9675, + "step": 6145 + }, + { + "epoch": 0.4650599674624494, + "grad_norm": 1.6089766025543213, + 
"learning_rate": 1.0772100841558644e-05, + "loss": 0.5172, + "step": 6146 + }, + { + "epoch": 0.46513563618478304, + "grad_norm": 9.786401748657227, + "learning_rate": 1.0769892659359731e-05, + "loss": 0.7237, + "step": 6147 + }, + { + "epoch": 0.46521130490711665, + "grad_norm": 1.9488837718963623, + "learning_rate": 1.0767684366324418e-05, + "loss": 0.8311, + "step": 6148 + }, + { + "epoch": 0.46528697362945026, + "grad_norm": 2.2119038105010986, + "learning_rate": 1.076547596259099e-05, + "loss": 0.7903, + "step": 6149 + }, + { + "epoch": 0.4653626423517839, + "grad_norm": 2.353832483291626, + "learning_rate": 1.076326744829775e-05, + "loss": 0.7647, + "step": 6150 + }, + { + "epoch": 0.46543831107411754, + "grad_norm": 2.166916847229004, + "learning_rate": 1.0761058823582999e-05, + "loss": 0.8551, + "step": 6151 + }, + { + "epoch": 0.46551397979645115, + "grad_norm": 2.20060658454895, + "learning_rate": 1.0758850088585045e-05, + "loss": 0.7732, + "step": 6152 + }, + { + "epoch": 0.46558964851878476, + "grad_norm": 1.9233969449996948, + "learning_rate": 1.0756641243442212e-05, + "loss": 0.7602, + "step": 6153 + }, + { + "epoch": 0.46566531724111837, + "grad_norm": 2.2503979206085205, + "learning_rate": 1.0754432288292825e-05, + "loss": 0.7605, + "step": 6154 + }, + { + "epoch": 0.465740985963452, + "grad_norm": 2.2500691413879395, + "learning_rate": 1.075222322327521e-05, + "loss": 0.7364, + "step": 6155 + }, + { + "epoch": 0.46581665468578565, + "grad_norm": 2.3264994621276855, + "learning_rate": 1.0750014048527709e-05, + "loss": 0.807, + "step": 6156 + }, + { + "epoch": 0.46589232340811926, + "grad_norm": 2.227979898452759, + "learning_rate": 1.074780476418867e-05, + "loss": 0.6017, + "step": 6157 + }, + { + "epoch": 0.46596799213045287, + "grad_norm": 2.266706705093384, + "learning_rate": 1.0745595370396444e-05, + "loss": 0.7044, + "step": 6158 + }, + { + "epoch": 0.4660436608527865, + "grad_norm": 2.1310250759124756, + "learning_rate": 1.074338586728939e-05, + "loss": 0.7331, + "step": 6159 + }, + { + "epoch": 0.46611932957512014, + "grad_norm": 2.122551202774048, + "learning_rate": 1.0741176255005873e-05, + "loss": 0.7152, + "step": 6160 + }, + { + "epoch": 0.46619499829745376, + "grad_norm": 2.330875873565674, + "learning_rate": 1.0738966533684268e-05, + "loss": 0.7346, + "step": 6161 + }, + { + "epoch": 0.46627066701978737, + "grad_norm": 1.9120339155197144, + "learning_rate": 1.0736756703462951e-05, + "loss": 0.7467, + "step": 6162 + }, + { + "epoch": 0.466346335742121, + "grad_norm": 2.05835223197937, + "learning_rate": 1.0734546764480316e-05, + "loss": 0.7593, + "step": 6163 + }, + { + "epoch": 0.46642200446445464, + "grad_norm": 2.023322820663452, + "learning_rate": 1.0732336716874753e-05, + "loss": 0.9205, + "step": 6164 + }, + { + "epoch": 0.46649767318678825, + "grad_norm": 1.8999871015548706, + "learning_rate": 1.073012656078466e-05, + "loss": 0.6393, + "step": 6165 + }, + { + "epoch": 0.46657334190912186, + "grad_norm": 2.1797525882720947, + "learning_rate": 1.0727916296348444e-05, + "loss": 0.8688, + "step": 6166 + }, + { + "epoch": 0.4666490106314555, + "grad_norm": 2.276228904724121, + "learning_rate": 1.0725705923704521e-05, + "loss": 0.7856, + "step": 6167 + }, + { + "epoch": 0.4667246793537891, + "grad_norm": 2.254700183868408, + "learning_rate": 1.0723495442991314e-05, + "loss": 0.5862, + "step": 6168 + }, + { + "epoch": 0.46680034807612275, + "grad_norm": 1.760817527770996, + "learning_rate": 1.0721284854347248e-05, + "loss": 0.6833, + "step": 6169 + }, + { + 
"epoch": 0.46687601679845636, + "grad_norm": 3.151423692703247, + "learning_rate": 1.0719074157910752e-05, + "loss": 0.8858, + "step": 6170 + }, + { + "epoch": 0.46695168552078997, + "grad_norm": 1.7372065782546997, + "learning_rate": 1.0716863353820278e-05, + "loss": 0.6091, + "step": 6171 + }, + { + "epoch": 0.4670273542431236, + "grad_norm": 1.8967386484146118, + "learning_rate": 1.0714652442214266e-05, + "loss": 0.8024, + "step": 6172 + }, + { + "epoch": 0.46710302296545725, + "grad_norm": 1.915652871131897, + "learning_rate": 1.0712441423231172e-05, + "loss": 0.6228, + "step": 6173 + }, + { + "epoch": 0.46717869168779086, + "grad_norm": 2.134735345840454, + "learning_rate": 1.0710230297009458e-05, + "loss": 0.8067, + "step": 6174 + }, + { + "epoch": 0.46725436041012447, + "grad_norm": 2.2443509101867676, + "learning_rate": 1.070801906368759e-05, + "loss": 0.7097, + "step": 6175 + }, + { + "epoch": 0.4673300291324581, + "grad_norm": 2.271360158920288, + "learning_rate": 1.0705807723404044e-05, + "loss": 0.6685, + "step": 6176 + }, + { + "epoch": 0.46740569785479175, + "grad_norm": 2.679190158843994, + "learning_rate": 1.0703596276297303e-05, + "loss": 0.7566, + "step": 6177 + }, + { + "epoch": 0.46748136657712536, + "grad_norm": 2.672213315963745, + "learning_rate": 1.0701384722505851e-05, + "loss": 0.6706, + "step": 6178 + }, + { + "epoch": 0.46755703529945897, + "grad_norm": 2.041059732437134, + "learning_rate": 1.0699173062168183e-05, + "loss": 0.6699, + "step": 6179 + }, + { + "epoch": 0.4676327040217926, + "grad_norm": 2.066025972366333, + "learning_rate": 1.0696961295422806e-05, + "loss": 0.6701, + "step": 6180 + }, + { + "epoch": 0.4677083727441262, + "grad_norm": 2.1812968254089355, + "learning_rate": 1.0694749422408223e-05, + "loss": 0.6914, + "step": 6181 + }, + { + "epoch": 0.46778404146645985, + "grad_norm": 2.1719706058502197, + "learning_rate": 1.0692537443262949e-05, + "loss": 0.8392, + "step": 6182 + }, + { + "epoch": 0.46785971018879347, + "grad_norm": 2.272714614868164, + "learning_rate": 1.0690325358125506e-05, + "loss": 0.8093, + "step": 6183 + }, + { + "epoch": 0.4679353789111271, + "grad_norm": 2.0720415115356445, + "learning_rate": 1.0688113167134421e-05, + "loss": 0.5, + "step": 6184 + }, + { + "epoch": 0.4680110476334607, + "grad_norm": 2.3994011878967285, + "learning_rate": 1.0685900870428232e-05, + "loss": 0.8163, + "step": 6185 + }, + { + "epoch": 0.46808671635579435, + "grad_norm": 1.8097761869430542, + "learning_rate": 1.0683688468145474e-05, + "loss": 0.761, + "step": 6186 + }, + { + "epoch": 0.46816238507812796, + "grad_norm": 1.8033745288848877, + "learning_rate": 1.0681475960424703e-05, + "loss": 0.6908, + "step": 6187 + }, + { + "epoch": 0.4682380538004616, + "grad_norm": 1.8102970123291016, + "learning_rate": 1.0679263347404466e-05, + "loss": 0.6382, + "step": 6188 + }, + { + "epoch": 0.4683137225227952, + "grad_norm": 2.0066001415252686, + "learning_rate": 1.0677050629223325e-05, + "loss": 0.6318, + "step": 6189 + }, + { + "epoch": 0.46838939124512885, + "grad_norm": 1.7005255222320557, + "learning_rate": 1.0674837806019852e-05, + "loss": 0.7265, + "step": 6190 + }, + { + "epoch": 0.46846505996746246, + "grad_norm": 2.1413681507110596, + "learning_rate": 1.0672624877932618e-05, + "loss": 0.6977, + "step": 6191 + }, + { + "epoch": 0.46854072868979607, + "grad_norm": 2.8440780639648438, + "learning_rate": 1.0670411845100205e-05, + "loss": 0.8837, + "step": 6192 + }, + { + "epoch": 0.4686163974121297, + "grad_norm": 1.863978385925293, + 
"learning_rate": 1.0668198707661198e-05, + "loss": 0.5702, + "step": 6193 + }, + { + "epoch": 0.4686920661344633, + "grad_norm": 2.1832001209259033, + "learning_rate": 1.0665985465754193e-05, + "loss": 0.6077, + "step": 6194 + }, + { + "epoch": 0.46876773485679696, + "grad_norm": 2.120635747909546, + "learning_rate": 1.066377211951779e-05, + "loss": 0.732, + "step": 6195 + }, + { + "epoch": 0.46884340357913057, + "grad_norm": 2.332000255584717, + "learning_rate": 1.0661558669090595e-05, + "loss": 0.6214, + "step": 6196 + }, + { + "epoch": 0.4689190723014642, + "grad_norm": 2.0023016929626465, + "learning_rate": 1.0659345114611225e-05, + "loss": 0.7018, + "step": 6197 + }, + { + "epoch": 0.4689947410237978, + "grad_norm": 2.0073201656341553, + "learning_rate": 1.0657131456218291e-05, + "loss": 0.7069, + "step": 6198 + }, + { + "epoch": 0.46907040974613146, + "grad_norm": 2.4780099391937256, + "learning_rate": 1.065491769405043e-05, + "loss": 0.7045, + "step": 6199 + }, + { + "epoch": 0.46914607846846507, + "grad_norm": 3.5818912982940674, + "learning_rate": 1.0652703828246268e-05, + "loss": 0.6786, + "step": 6200 + }, + { + "epoch": 0.4692217471907987, + "grad_norm": 2.6084370613098145, + "learning_rate": 1.0650489858944447e-05, + "loss": 0.7227, + "step": 6201 + }, + { + "epoch": 0.4692974159131323, + "grad_norm": 1.8275518417358398, + "learning_rate": 1.0648275786283613e-05, + "loss": 0.7012, + "step": 6202 + }, + { + "epoch": 0.46937308463546595, + "grad_norm": 2.5070960521698, + "learning_rate": 1.0646061610402418e-05, + "loss": 0.9001, + "step": 6203 + }, + { + "epoch": 0.46944875335779956, + "grad_norm": 2.149526596069336, + "learning_rate": 1.0643847331439523e-05, + "loss": 0.8065, + "step": 6204 + }, + { + "epoch": 0.4695244220801332, + "grad_norm": 2.6355538368225098, + "learning_rate": 1.0641632949533589e-05, + "loss": 0.8699, + "step": 6205 + }, + { + "epoch": 0.4696000908024668, + "grad_norm": 1.9515386819839478, + "learning_rate": 1.0639418464823292e-05, + "loss": 0.7724, + "step": 6206 + }, + { + "epoch": 0.46967575952480045, + "grad_norm": 1.7659491300582886, + "learning_rate": 1.0637203877447305e-05, + "loss": 0.724, + "step": 6207 + }, + { + "epoch": 0.46975142824713406, + "grad_norm": 2.187346935272217, + "learning_rate": 1.0634989187544317e-05, + "loss": 0.7288, + "step": 6208 + }, + { + "epoch": 0.4698270969694677, + "grad_norm": 2.0950958728790283, + "learning_rate": 1.0632774395253019e-05, + "loss": 0.7164, + "step": 6209 + }, + { + "epoch": 0.4699027656918013, + "grad_norm": 1.7800686359405518, + "learning_rate": 1.063055950071211e-05, + "loss": 0.9003, + "step": 6210 + }, + { + "epoch": 0.4699784344141349, + "grad_norm": 2.117051124572754, + "learning_rate": 1.0628344504060288e-05, + "loss": 0.5784, + "step": 6211 + }, + { + "epoch": 0.47005410313646856, + "grad_norm": 1.9472649097442627, + "learning_rate": 1.0626129405436266e-05, + "loss": 0.7032, + "step": 6212 + }, + { + "epoch": 0.47012977185880217, + "grad_norm": 2.002767562866211, + "learning_rate": 1.0623914204978761e-05, + "loss": 0.7656, + "step": 6213 + }, + { + "epoch": 0.4702054405811358, + "grad_norm": 2.2014105319976807, + "learning_rate": 1.0621698902826497e-05, + "loss": 0.888, + "step": 6214 + }, + { + "epoch": 0.4702811093034694, + "grad_norm": 2.010322332382202, + "learning_rate": 1.0619483499118204e-05, + "loss": 0.7655, + "step": 6215 + }, + { + "epoch": 0.47035677802580306, + "grad_norm": 2.8622562885284424, + "learning_rate": 1.0617267993992612e-05, + "loss": 0.8364, + "step": 6216 + }, + { + 
"epoch": 0.47043244674813667, + "grad_norm": 1.905005931854248, + "learning_rate": 1.061505238758847e-05, + "loss": 0.6593, + "step": 6217 + }, + { + "epoch": 0.4705081154704703, + "grad_norm": 1.9572088718414307, + "learning_rate": 1.0612836680044525e-05, + "loss": 0.6098, + "step": 6218 + }, + { + "epoch": 0.4705837841928039, + "grad_norm": 2.0442428588867188, + "learning_rate": 1.0610620871499529e-05, + "loss": 0.7608, + "step": 6219 + }, + { + "epoch": 0.47065945291513755, + "grad_norm": 2.0061099529266357, + "learning_rate": 1.0608404962092244e-05, + "loss": 0.7947, + "step": 6220 + }, + { + "epoch": 0.47073512163747117, + "grad_norm": 2.0030205249786377, + "learning_rate": 1.0606188951961438e-05, + "loss": 0.706, + "step": 6221 + }, + { + "epoch": 0.4708107903598048, + "grad_norm": 2.660520315170288, + "learning_rate": 1.0603972841245887e-05, + "loss": 0.665, + "step": 6222 + }, + { + "epoch": 0.4708864590821384, + "grad_norm": 1.7287667989730835, + "learning_rate": 1.0601756630084367e-05, + "loss": 0.6559, + "step": 6223 + }, + { + "epoch": 0.470962127804472, + "grad_norm": 1.7627499103546143, + "learning_rate": 1.0599540318615667e-05, + "loss": 0.7481, + "step": 6224 + }, + { + "epoch": 0.47103779652680566, + "grad_norm": 2.1454079151153564, + "learning_rate": 1.0597323906978577e-05, + "loss": 0.6736, + "step": 6225 + }, + { + "epoch": 0.4711134652491393, + "grad_norm": 1.9202808141708374, + "learning_rate": 1.05951073953119e-05, + "loss": 0.7624, + "step": 6226 + }, + { + "epoch": 0.4711891339714729, + "grad_norm": 2.2088475227355957, + "learning_rate": 1.0592890783754437e-05, + "loss": 0.7322, + "step": 6227 + }, + { + "epoch": 0.4712648026938065, + "grad_norm": 2.0040664672851562, + "learning_rate": 1.0590674072445002e-05, + "loss": 0.7205, + "step": 6228 + }, + { + "epoch": 0.47134047141614016, + "grad_norm": 2.295071601867676, + "learning_rate": 1.0588457261522413e-05, + "loss": 0.6734, + "step": 6229 + }, + { + "epoch": 0.47141614013847377, + "grad_norm": 2.2637734413146973, + "learning_rate": 1.0586240351125489e-05, + "loss": 0.8409, + "step": 6230 + }, + { + "epoch": 0.4714918088608074, + "grad_norm": 2.1231727600097656, + "learning_rate": 1.0584023341393069e-05, + "loss": 0.7858, + "step": 6231 + }, + { + "epoch": 0.471567477583141, + "grad_norm": 2.0771729946136475, + "learning_rate": 1.0581806232463978e-05, + "loss": 0.7433, + "step": 6232 + }, + { + "epoch": 0.47164314630547466, + "grad_norm": 2.2591214179992676, + "learning_rate": 1.0579589024477068e-05, + "loss": 0.7656, + "step": 6233 + }, + { + "epoch": 0.47171881502780827, + "grad_norm": 2.270939826965332, + "learning_rate": 1.0577371717571182e-05, + "loss": 0.8632, + "step": 6234 + }, + { + "epoch": 0.4717944837501419, + "grad_norm": 2.1328177452087402, + "learning_rate": 1.057515431188518e-05, + "loss": 0.7845, + "step": 6235 + }, + { + "epoch": 0.4718701524724755, + "grad_norm": 2.4155235290527344, + "learning_rate": 1.0572936807557919e-05, + "loss": 0.6961, + "step": 6236 + }, + { + "epoch": 0.4719458211948091, + "grad_norm": 2.18650221824646, + "learning_rate": 1.0570719204728265e-05, + "loss": 0.69, + "step": 6237 + }, + { + "epoch": 0.47202148991714277, + "grad_norm": 2.087130308151245, + "learning_rate": 1.05685015035351e-05, + "loss": 0.7847, + "step": 6238 + }, + { + "epoch": 0.4720971586394764, + "grad_norm": 1.8233695030212402, + "learning_rate": 1.0566283704117292e-05, + "loss": 0.7336, + "step": 6239 + }, + { + "epoch": 0.47217282736181, + "grad_norm": 2.3203165531158447, + "learning_rate": 
1.0564065806613736e-05, + "loss": 0.6075, + "step": 6240 + }, + { + "epoch": 0.4722484960841436, + "grad_norm": 1.858660101890564, + "learning_rate": 1.056184781116332e-05, + "loss": 0.6882, + "step": 6241 + }, + { + "epoch": 0.47232416480647726, + "grad_norm": 2.2066173553466797, + "learning_rate": 1.055962971790494e-05, + "loss": 0.6147, + "step": 6242 + }, + { + "epoch": 0.4723998335288109, + "grad_norm": 2.0067367553710938, + "learning_rate": 1.0557411526977506e-05, + "loss": 0.7309, + "step": 6243 + }, + { + "epoch": 0.4724755022511445, + "grad_norm": 2.2683217525482178, + "learning_rate": 1.055519323851992e-05, + "loss": 0.6129, + "step": 6244 + }, + { + "epoch": 0.4725511709734781, + "grad_norm": 2.247870683670044, + "learning_rate": 1.0552974852671111e-05, + "loss": 0.7197, + "step": 6245 + }, + { + "epoch": 0.47262683969581176, + "grad_norm": 2.4525437355041504, + "learning_rate": 1.0550756369569987e-05, + "loss": 0.5801, + "step": 6246 + }, + { + "epoch": 0.4727025084181454, + "grad_norm": 2.0607056617736816, + "learning_rate": 1.0548537789355486e-05, + "loss": 0.6399, + "step": 6247 + }, + { + "epoch": 0.472778177140479, + "grad_norm": 1.7411423921585083, + "learning_rate": 1.054631911216654e-05, + "loss": 0.6645, + "step": 6248 + }, + { + "epoch": 0.4728538458628126, + "grad_norm": 2.4376776218414307, + "learning_rate": 1.0544100338142088e-05, + "loss": 0.7827, + "step": 6249 + }, + { + "epoch": 0.4729295145851462, + "grad_norm": 1.9150795936584473, + "learning_rate": 1.0541881467421081e-05, + "loss": 0.8126, + "step": 6250 + }, + { + "epoch": 0.47300518330747987, + "grad_norm": 2.8938183784484863, + "learning_rate": 1.053966250014247e-05, + "loss": 0.8907, + "step": 6251 + }, + { + "epoch": 0.4730808520298135, + "grad_norm": 2.5442142486572266, + "learning_rate": 1.0537443436445213e-05, + "loss": 0.7493, + "step": 6252 + }, + { + "epoch": 0.4731565207521471, + "grad_norm": 5.999145984649658, + "learning_rate": 1.0535224276468274e-05, + "loss": 0.7522, + "step": 6253 + }, + { + "epoch": 0.4732321894744807, + "grad_norm": 3.0991311073303223, + "learning_rate": 1.0533005020350627e-05, + "loss": 0.8252, + "step": 6254 + }, + { + "epoch": 0.47330785819681437, + "grad_norm": 3.191121816635132, + "learning_rate": 1.0530785668231243e-05, + "loss": 0.7184, + "step": 6255 + }, + { + "epoch": 0.473383526919148, + "grad_norm": 2.009726047515869, + "learning_rate": 1.0528566220249113e-05, + "loss": 0.7366, + "step": 6256 + }, + { + "epoch": 0.4734591956414816, + "grad_norm": 2.614388942718506, + "learning_rate": 1.052634667654322e-05, + "loss": 0.8348, + "step": 6257 + }, + { + "epoch": 0.4735348643638152, + "grad_norm": 2.2507081031799316, + "learning_rate": 1.0524127037252564e-05, + "loss": 0.7253, + "step": 6258 + }, + { + "epoch": 0.47361053308614887, + "grad_norm": 2.0302281379699707, + "learning_rate": 1.0521907302516143e-05, + "loss": 0.7483, + "step": 6259 + }, + { + "epoch": 0.4736862018084825, + "grad_norm": 2.1905441284179688, + "learning_rate": 1.0519687472472962e-05, + "loss": 0.7226, + "step": 6260 + }, + { + "epoch": 0.4737618705308161, + "grad_norm": 1.9951703548431396, + "learning_rate": 1.0517467547262038e-05, + "loss": 0.8689, + "step": 6261 + }, + { + "epoch": 0.4738375392531497, + "grad_norm": 2.1481285095214844, + "learning_rate": 1.0515247527022386e-05, + "loss": 0.8369, + "step": 6262 + }, + { + "epoch": 0.4739132079754833, + "grad_norm": 1.8148024082183838, + "learning_rate": 1.0513027411893035e-05, + "loss": 0.7932, + "step": 6263 + }, + { + "epoch": 
0.473988876697817, + "grad_norm": 2.1687381267547607, + "learning_rate": 1.0510807202013016e-05, + "loss": 0.7735, + "step": 6264 + }, + { + "epoch": 0.4740645454201506, + "grad_norm": 1.8470525741577148, + "learning_rate": 1.0508586897521359e-05, + "loss": 0.8987, + "step": 6265 + }, + { + "epoch": 0.4741402141424842, + "grad_norm": 2.30938458442688, + "learning_rate": 1.0506366498557113e-05, + "loss": 0.6753, + "step": 6266 + }, + { + "epoch": 0.4742158828648178, + "grad_norm": 2.1527512073516846, + "learning_rate": 1.0504146005259323e-05, + "loss": 0.8064, + "step": 6267 + }, + { + "epoch": 0.47429155158715147, + "grad_norm": 2.357869863510132, + "learning_rate": 1.050192541776705e-05, + "loss": 0.8538, + "step": 6268 + }, + { + "epoch": 0.4743672203094851, + "grad_norm": 2.084754705429077, + "learning_rate": 1.0499704736219345e-05, + "loss": 0.7007, + "step": 6269 + }, + { + "epoch": 0.4744428890318187, + "grad_norm": 1.8246986865997314, + "learning_rate": 1.049748396075528e-05, + "loss": 0.6057, + "step": 6270 + }, + { + "epoch": 0.4745185577541523, + "grad_norm": 3.0122311115264893, + "learning_rate": 1.0495263091513926e-05, + "loss": 0.6435, + "step": 6271 + }, + { + "epoch": 0.47459422647648597, + "grad_norm": 2.0100696086883545, + "learning_rate": 1.0493042128634361e-05, + "loss": 0.6919, + "step": 6272 + }, + { + "epoch": 0.4746698951988196, + "grad_norm": 2.3327362537384033, + "learning_rate": 1.0490821072255667e-05, + "loss": 0.6692, + "step": 6273 + }, + { + "epoch": 0.4747455639211532, + "grad_norm": 2.43101167678833, + "learning_rate": 1.0488599922516941e-05, + "loss": 0.7824, + "step": 6274 + }, + { + "epoch": 0.4748212326434868, + "grad_norm": 1.8376708030700684, + "learning_rate": 1.048637867955727e-05, + "loss": 0.7273, + "step": 6275 + }, + { + "epoch": 0.4748969013658204, + "grad_norm": 1.9170385599136353, + "learning_rate": 1.0484157343515756e-05, + "loss": 0.6116, + "step": 6276 + }, + { + "epoch": 0.4749725700881541, + "grad_norm": 1.9089981317520142, + "learning_rate": 1.0481935914531513e-05, + "loss": 0.7632, + "step": 6277 + }, + { + "epoch": 0.4750482388104877, + "grad_norm": 2.5500640869140625, + "learning_rate": 1.0479714392743645e-05, + "loss": 0.643, + "step": 6278 + }, + { + "epoch": 0.4751239075328213, + "grad_norm": 2.08392596244812, + "learning_rate": 1.0477492778291281e-05, + "loss": 0.7338, + "step": 6279 + }, + { + "epoch": 0.4751995762551549, + "grad_norm": 2.564549446105957, + "learning_rate": 1.0475271071313535e-05, + "loss": 0.7122, + "step": 6280 + }, + { + "epoch": 0.4752752449774886, + "grad_norm": 1.9711978435516357, + "learning_rate": 1.0473049271949547e-05, + "loss": 0.7447, + "step": 6281 + }, + { + "epoch": 0.4753509136998222, + "grad_norm": 1.9568284749984741, + "learning_rate": 1.0470827380338448e-05, + "loss": 0.66, + "step": 6282 + }, + { + "epoch": 0.4754265824221558, + "grad_norm": 2.3648922443389893, + "learning_rate": 1.046860539661938e-05, + "loss": 0.7885, + "step": 6283 + }, + { + "epoch": 0.4755022511444894, + "grad_norm": 2.0861058235168457, + "learning_rate": 1.0466383320931494e-05, + "loss": 0.8754, + "step": 6284 + }, + { + "epoch": 0.4755779198668231, + "grad_norm": 2.200965642929077, + "learning_rate": 1.046416115341394e-05, + "loss": 0.8098, + "step": 6285 + }, + { + "epoch": 0.4756535885891567, + "grad_norm": 2.617201328277588, + "learning_rate": 1.0461938894205882e-05, + "loss": 0.6633, + "step": 6286 + }, + { + "epoch": 0.4757292573114903, + "grad_norm": 2.2858943939208984, + "learning_rate": 1.0459716543446477e-05, + 
"loss": 0.729, + "step": 6287 + }, + { + "epoch": 0.4758049260338239, + "grad_norm": 2.4813878536224365, + "learning_rate": 1.0457494101274904e-05, + "loss": 0.6442, + "step": 6288 + }, + { + "epoch": 0.4758805947561575, + "grad_norm": 2.3235511779785156, + "learning_rate": 1.0455271567830336e-05, + "loss": 0.7065, + "step": 6289 + }, + { + "epoch": 0.4759562634784912, + "grad_norm": 2.1012747287750244, + "learning_rate": 1.0453048943251956e-05, + "loss": 0.8561, + "step": 6290 + }, + { + "epoch": 0.4760319322008248, + "grad_norm": 2.3040177822113037, + "learning_rate": 1.045082622767895e-05, + "loss": 0.6949, + "step": 6291 + }, + { + "epoch": 0.4761076009231584, + "grad_norm": 2.2570884227752686, + "learning_rate": 1.0448603421250513e-05, + "loss": 0.77, + "step": 6292 + }, + { + "epoch": 0.476183269645492, + "grad_norm": 2.1080000400543213, + "learning_rate": 1.0446380524105847e-05, + "loss": 0.8376, + "step": 6293 + }, + { + "epoch": 0.4762589383678257, + "grad_norm": 2.6243064403533936, + "learning_rate": 1.0444157536384152e-05, + "loss": 0.7462, + "step": 6294 + }, + { + "epoch": 0.4763346070901593, + "grad_norm": 2.161816358566284, + "learning_rate": 1.0441934458224642e-05, + "loss": 0.8241, + "step": 6295 + }, + { + "epoch": 0.4764102758124929, + "grad_norm": 2.4741382598876953, + "learning_rate": 1.043971128976653e-05, + "loss": 0.733, + "step": 6296 + }, + { + "epoch": 0.4764859445348265, + "grad_norm": 2.5977022647857666, + "learning_rate": 1.0437488031149042e-05, + "loss": 0.6428, + "step": 6297 + }, + { + "epoch": 0.4765616132571602, + "grad_norm": 2.8118369579315186, + "learning_rate": 1.0435264682511405e-05, + "loss": 0.8184, + "step": 6298 + }, + { + "epoch": 0.4766372819794938, + "grad_norm": 2.1726109981536865, + "learning_rate": 1.0433041243992852e-05, + "loss": 0.6495, + "step": 6299 + }, + { + "epoch": 0.4767129507018274, + "grad_norm": 1.90822172164917, + "learning_rate": 1.0430817715732622e-05, + "loss": 0.7302, + "step": 6300 + }, + { + "epoch": 0.476788619424161, + "grad_norm": 2.576930522918701, + "learning_rate": 1.0428594097869953e-05, + "loss": 0.8132, + "step": 6301 + }, + { + "epoch": 0.4768642881464946, + "grad_norm": 1.8655272722244263, + "learning_rate": 1.0426370390544107e-05, + "loss": 0.8921, + "step": 6302 + }, + { + "epoch": 0.4769399568688283, + "grad_norm": 2.354846715927124, + "learning_rate": 1.042414659389433e-05, + "loss": 0.803, + "step": 6303 + }, + { + "epoch": 0.4770156255911619, + "grad_norm": 2.417755603790283, + "learning_rate": 1.0421922708059892e-05, + "loss": 0.8491, + "step": 6304 + }, + { + "epoch": 0.4770912943134955, + "grad_norm": 1.6831603050231934, + "learning_rate": 1.041969873318005e-05, + "loss": 0.6529, + "step": 6305 + }, + { + "epoch": 0.4771669630358291, + "grad_norm": 2.6375699043273926, + "learning_rate": 1.0417474669394084e-05, + "loss": 0.8822, + "step": 6306 + }, + { + "epoch": 0.4772426317581628, + "grad_norm": 2.0903217792510986, + "learning_rate": 1.041525051684127e-05, + "loss": 0.6572, + "step": 6307 + }, + { + "epoch": 0.4773183004804964, + "grad_norm": 1.7691537141799927, + "learning_rate": 1.0413026275660887e-05, + "loss": 0.7839, + "step": 6308 + }, + { + "epoch": 0.47739396920283, + "grad_norm": 2.0674221515655518, + "learning_rate": 1.0410801945992233e-05, + "loss": 0.6806, + "step": 6309 + }, + { + "epoch": 0.4774696379251636, + "grad_norm": 2.6139473915100098, + "learning_rate": 1.0408577527974595e-05, + "loss": 0.7133, + "step": 6310 + }, + { + "epoch": 0.4775453066474973, + "grad_norm": 
2.3850035667419434, + "learning_rate": 1.0406353021747277e-05, + "loss": 0.6617, + "step": 6311 + }, + { + "epoch": 0.4776209753698309, + "grad_norm": 2.5777697563171387, + "learning_rate": 1.0404128427449584e-05, + "loss": 0.8671, + "step": 6312 + }, + { + "epoch": 0.4776966440921645, + "grad_norm": 2.2594873905181885, + "learning_rate": 1.0401903745220831e-05, + "loss": 0.7522, + "step": 6313 + }, + { + "epoch": 0.4777723128144981, + "grad_norm": 2.265115737915039, + "learning_rate": 1.0399678975200328e-05, + "loss": 0.7659, + "step": 6314 + }, + { + "epoch": 0.4778479815368317, + "grad_norm": 2.038884162902832, + "learning_rate": 1.03974541175274e-05, + "loss": 0.6763, + "step": 6315 + }, + { + "epoch": 0.4779236502591654, + "grad_norm": 2.0255823135375977, + "learning_rate": 1.0395229172341377e-05, + "loss": 0.7657, + "step": 6316 + }, + { + "epoch": 0.477999318981499, + "grad_norm": 1.9382271766662598, + "learning_rate": 1.0393004139781586e-05, + "loss": 0.5579, + "step": 6317 + }, + { + "epoch": 0.4780749877038326, + "grad_norm": 2.1081786155700684, + "learning_rate": 1.0390779019987379e-05, + "loss": 0.7453, + "step": 6318 + }, + { + "epoch": 0.4781506564261662, + "grad_norm": 1.9590938091278076, + "learning_rate": 1.0388553813098082e-05, + "loss": 0.7372, + "step": 6319 + }, + { + "epoch": 0.4782263251484999, + "grad_norm": 1.6101315021514893, + "learning_rate": 1.0386328519253061e-05, + "loss": 0.8169, + "step": 6320 + }, + { + "epoch": 0.4783019938708335, + "grad_norm": 3.1328699588775635, + "learning_rate": 1.0384103138591659e-05, + "loss": 0.7454, + "step": 6321 + }, + { + "epoch": 0.4783776625931671, + "grad_norm": 2.7711310386657715, + "learning_rate": 1.0381877671253245e-05, + "loss": 0.5887, + "step": 6322 + }, + { + "epoch": 0.4784533313155007, + "grad_norm": 2.25168514251709, + "learning_rate": 1.037965211737718e-05, + "loss": 0.6437, + "step": 6323 + }, + { + "epoch": 0.4785290000378344, + "grad_norm": 3.660417318344116, + "learning_rate": 1.0377426477102837e-05, + "loss": 0.7606, + "step": 6324 + }, + { + "epoch": 0.478604668760168, + "grad_norm": 2.1805953979492188, + "learning_rate": 1.0375200750569595e-05, + "loss": 0.651, + "step": 6325 + }, + { + "epoch": 0.4786803374825016, + "grad_norm": 2.296247720718384, + "learning_rate": 1.037297493791683e-05, + "loss": 0.7059, + "step": 6326 + }, + { + "epoch": 0.4787560062048352, + "grad_norm": 2.1568257808685303, + "learning_rate": 1.037074903928394e-05, + "loss": 0.7095, + "step": 6327 + }, + { + "epoch": 0.4788316749271689, + "grad_norm": 2.124258041381836, + "learning_rate": 1.0368523054810308e-05, + "loss": 0.775, + "step": 6328 + }, + { + "epoch": 0.4789073436495025, + "grad_norm": 2.2516794204711914, + "learning_rate": 1.0366296984635335e-05, + "loss": 0.8205, + "step": 6329 + }, + { + "epoch": 0.4789830123718361, + "grad_norm": 2.5653510093688965, + "learning_rate": 1.0364070828898425e-05, + "loss": 0.6034, + "step": 6330 + }, + { + "epoch": 0.4790586810941697, + "grad_norm": 2.1148502826690674, + "learning_rate": 1.0361844587738991e-05, + "loss": 0.6733, + "step": 6331 + }, + { + "epoch": 0.4791343498165033, + "grad_norm": 2.0348715782165527, + "learning_rate": 1.0359618261296443e-05, + "loss": 0.6521, + "step": 6332 + }, + { + "epoch": 0.479210018538837, + "grad_norm": 1.895645022392273, + "learning_rate": 1.0357391849710202e-05, + "loss": 0.7377, + "step": 6333 + }, + { + "epoch": 0.4792856872611706, + "grad_norm": 2.0529367923736572, + "learning_rate": 1.0355165353119692e-05, + "loss": 0.7686, + "step": 6334 + 
}, + { + "epoch": 0.4793613559835042, + "grad_norm": 2.688235282897949, + "learning_rate": 1.0352938771664346e-05, + "loss": 0.7611, + "step": 6335 + }, + { + "epoch": 0.4794370247058378, + "grad_norm": 2.2325501441955566, + "learning_rate": 1.0350712105483598e-05, + "loss": 0.7466, + "step": 6336 + }, + { + "epoch": 0.4795126934281715, + "grad_norm": 2.4246463775634766, + "learning_rate": 1.0348485354716888e-05, + "loss": 0.738, + "step": 6337 + }, + { + "epoch": 0.4795883621505051, + "grad_norm": 2.519249439239502, + "learning_rate": 1.0346258519503663e-05, + "loss": 0.6102, + "step": 6338 + }, + { + "epoch": 0.4796640308728387, + "grad_norm": 2.213670253753662, + "learning_rate": 1.0344031599983377e-05, + "loss": 0.72, + "step": 6339 + }, + { + "epoch": 0.4797396995951723, + "grad_norm": 2.312347173690796, + "learning_rate": 1.0341804596295483e-05, + "loss": 0.7855, + "step": 6340 + }, + { + "epoch": 0.479815368317506, + "grad_norm": 2.551732063293457, + "learning_rate": 1.033957750857945e-05, + "loss": 0.6129, + "step": 6341 + }, + { + "epoch": 0.4798910370398396, + "grad_norm": 2.086526870727539, + "learning_rate": 1.0337350336974735e-05, + "loss": 0.8321, + "step": 6342 + }, + { + "epoch": 0.4799667057621732, + "grad_norm": 1.661421298980713, + "learning_rate": 1.033512308162082e-05, + "loss": 0.6088, + "step": 6343 + }, + { + "epoch": 0.4800423744845068, + "grad_norm": 1.759372591972351, + "learning_rate": 1.0332895742657175e-05, + "loss": 0.7405, + "step": 6344 + }, + { + "epoch": 0.48011804320684043, + "grad_norm": 1.9246104955673218, + "learning_rate": 1.0330668320223293e-05, + "loss": 0.6529, + "step": 6345 + }, + { + "epoch": 0.4801937119291741, + "grad_norm": 2.265521764755249, + "learning_rate": 1.0328440814458652e-05, + "loss": 0.6742, + "step": 6346 + }, + { + "epoch": 0.4802693806515077, + "grad_norm": 2.2895123958587646, + "learning_rate": 1.0326213225502754e-05, + "loss": 0.7062, + "step": 6347 + }, + { + "epoch": 0.4803450493738413, + "grad_norm": 2.0983073711395264, + "learning_rate": 1.0323985553495094e-05, + "loss": 0.8804, + "step": 6348 + }, + { + "epoch": 0.4804207180961749, + "grad_norm": 2.0943613052368164, + "learning_rate": 1.0321757798575176e-05, + "loss": 0.7967, + "step": 6349 + }, + { + "epoch": 0.4804963868185086, + "grad_norm": 1.9753507375717163, + "learning_rate": 1.0319529960882508e-05, + "loss": 0.7633, + "step": 6350 + }, + { + "epoch": 0.4805720555408422, + "grad_norm": 1.9423227310180664, + "learning_rate": 1.0317302040556607e-05, + "loss": 0.9147, + "step": 6351 + }, + { + "epoch": 0.4806477242631758, + "grad_norm": 2.10290265083313, + "learning_rate": 1.0315074037736991e-05, + "loss": 0.6627, + "step": 6352 + }, + { + "epoch": 0.4807233929855094, + "grad_norm": 2.183432102203369, + "learning_rate": 1.0312845952563187e-05, + "loss": 0.7181, + "step": 6353 + }, + { + "epoch": 0.4807990617078431, + "grad_norm": 1.9517886638641357, + "learning_rate": 1.0310617785174721e-05, + "loss": 0.6473, + "step": 6354 + }, + { + "epoch": 0.4808747304301767, + "grad_norm": 1.996443510055542, + "learning_rate": 1.0308389535711133e-05, + "loss": 0.5921, + "step": 6355 + }, + { + "epoch": 0.4809503991525103, + "grad_norm": 1.8111634254455566, + "learning_rate": 1.0306161204311958e-05, + "loss": 0.5869, + "step": 6356 + }, + { + "epoch": 0.4810260678748439, + "grad_norm": 2.648256301879883, + "learning_rate": 1.0303932791116744e-05, + "loss": 0.768, + "step": 6357 + }, + { + "epoch": 0.48110173659717753, + "grad_norm": 2.27350115776062, + "learning_rate": 
1.0301704296265043e-05, + "loss": 0.6906, + "step": 6358 + }, + { + "epoch": 0.4811774053195112, + "grad_norm": 1.7875982522964478, + "learning_rate": 1.0299475719896409e-05, + "loss": 0.6928, + "step": 6359 + }, + { + "epoch": 0.4812530740418448, + "grad_norm": 2.582258939743042, + "learning_rate": 1.0297247062150398e-05, + "loss": 0.588, + "step": 6360 + }, + { + "epoch": 0.4813287427641784, + "grad_norm": 2.343061685562134, + "learning_rate": 1.0295018323166583e-05, + "loss": 0.8526, + "step": 6361 + }, + { + "epoch": 0.48140441148651203, + "grad_norm": 2.1170144081115723, + "learning_rate": 1.0292789503084532e-05, + "loss": 0.7459, + "step": 6362 + }, + { + "epoch": 0.4814800802088457, + "grad_norm": 2.0151526927948, + "learning_rate": 1.029056060204382e-05, + "loss": 0.7721, + "step": 6363 + }, + { + "epoch": 0.4815557489311793, + "grad_norm": 1.9160104990005493, + "learning_rate": 1.0288331620184032e-05, + "loss": 0.6751, + "step": 6364 + }, + { + "epoch": 0.4816314176535129, + "grad_norm": 2.3645200729370117, + "learning_rate": 1.0286102557644746e-05, + "loss": 0.6832, + "step": 6365 + }, + { + "epoch": 0.4817070863758465, + "grad_norm": 1.6131598949432373, + "learning_rate": 1.0283873414565564e-05, + "loss": 0.6133, + "step": 6366 + }, + { + "epoch": 0.4817827550981802, + "grad_norm": 2.991684913635254, + "learning_rate": 1.0281644191086073e-05, + "loss": 0.6093, + "step": 6367 + }, + { + "epoch": 0.4818584238205138, + "grad_norm": 2.0066182613372803, + "learning_rate": 1.0279414887345876e-05, + "loss": 0.7508, + "step": 6368 + }, + { + "epoch": 0.4819340925428474, + "grad_norm": 2.1001648902893066, + "learning_rate": 1.0277185503484583e-05, + "loss": 0.8123, + "step": 6369 + }, + { + "epoch": 0.482009761265181, + "grad_norm": 2.06756591796875, + "learning_rate": 1.0274956039641801e-05, + "loss": 0.6822, + "step": 6370 + }, + { + "epoch": 0.48208542998751464, + "grad_norm": 2.315427541732788, + "learning_rate": 1.027272649595715e-05, + "loss": 0.7408, + "step": 6371 + }, + { + "epoch": 0.4821610987098483, + "grad_norm": 3.052438735961914, + "learning_rate": 1.0270496872570249e-05, + "loss": 0.6234, + "step": 6372 + }, + { + "epoch": 0.4822367674321819, + "grad_norm": 1.8254213333129883, + "learning_rate": 1.0268267169620725e-05, + "loss": 0.7506, + "step": 6373 + }, + { + "epoch": 0.4823124361545155, + "grad_norm": 2.1949892044067383, + "learning_rate": 1.0266037387248206e-05, + "loss": 0.7951, + "step": 6374 + }, + { + "epoch": 0.48238810487684913, + "grad_norm": 2.8453803062438965, + "learning_rate": 1.0263807525592332e-05, + "loss": 0.8468, + "step": 6375 + }, + { + "epoch": 0.4824637735991828, + "grad_norm": 2.467980146408081, + "learning_rate": 1.0261577584792743e-05, + "loss": 0.7182, + "step": 6376 + }, + { + "epoch": 0.4825394423215164, + "grad_norm": 2.0588455200195312, + "learning_rate": 1.0259347564989087e-05, + "loss": 0.7689, + "step": 6377 + }, + { + "epoch": 0.48261511104385, + "grad_norm": 2.141855001449585, + "learning_rate": 1.0257117466321015e-05, + "loss": 0.8404, + "step": 6378 + }, + { + "epoch": 0.48269077976618363, + "grad_norm": 1.7582626342773438, + "learning_rate": 1.0254887288928176e-05, + "loss": 0.6482, + "step": 6379 + }, + { + "epoch": 0.4827664484885173, + "grad_norm": 2.70973539352417, + "learning_rate": 1.0252657032950239e-05, + "loss": 0.7863, + "step": 6380 + }, + { + "epoch": 0.4828421172108509, + "grad_norm": 2.2643096446990967, + "learning_rate": 1.0250426698526867e-05, + "loss": 0.6601, + "step": 6381 + }, + { + "epoch": 0.4829177859331845, 
+ "grad_norm": 2.705983877182007, + "learning_rate": 1.0248196285797733e-05, + "loss": 0.6439, + "step": 6382 + }, + { + "epoch": 0.48299345465551813, + "grad_norm": 1.8606898784637451, + "learning_rate": 1.0245965794902505e-05, + "loss": 0.7542, + "step": 6383 + }, + { + "epoch": 0.48306912337785174, + "grad_norm": 2.3897595405578613, + "learning_rate": 1.0243735225980873e-05, + "loss": 0.6444, + "step": 6384 + }, + { + "epoch": 0.4831447921001854, + "grad_norm": 3.5943784713745117, + "learning_rate": 1.0241504579172518e-05, + "loss": 0.6732, + "step": 6385 + }, + { + "epoch": 0.483220460822519, + "grad_norm": 2.2588038444519043, + "learning_rate": 1.023927385461713e-05, + "loss": 0.635, + "step": 6386 + }, + { + "epoch": 0.4832961295448526, + "grad_norm": 1.9176634550094604, + "learning_rate": 1.0237043052454404e-05, + "loss": 0.7008, + "step": 6387 + }, + { + "epoch": 0.48337179826718624, + "grad_norm": 2.0969033241271973, + "learning_rate": 1.023481217282404e-05, + "loss": 0.8144, + "step": 6388 + }, + { + "epoch": 0.4834474669895199, + "grad_norm": 2.217078924179077, + "learning_rate": 1.0232581215865748e-05, + "loss": 0.6075, + "step": 6389 + }, + { + "epoch": 0.4835231357118535, + "grad_norm": 2.35626220703125, + "learning_rate": 1.0230350181719231e-05, + "loss": 0.8421, + "step": 6390 + }, + { + "epoch": 0.4835988044341871, + "grad_norm": 2.1566507816314697, + "learning_rate": 1.0228119070524205e-05, + "loss": 0.7397, + "step": 6391 + }, + { + "epoch": 0.48367447315652073, + "grad_norm": 2.613382577896118, + "learning_rate": 1.0225887882420394e-05, + "loss": 0.7971, + "step": 6392 + }, + { + "epoch": 0.4837501418788544, + "grad_norm": 6.792184829711914, + "learning_rate": 1.0223656617547517e-05, + "loss": 0.7269, + "step": 6393 + }, + { + "epoch": 0.483825810601188, + "grad_norm": 2.764080047607422, + "learning_rate": 1.0221425276045305e-05, + "loss": 0.7354, + "step": 6394 + }, + { + "epoch": 0.4839014793235216, + "grad_norm": 2.654021978378296, + "learning_rate": 1.0219193858053493e-05, + "loss": 0.6826, + "step": 6395 + }, + { + "epoch": 0.48397714804585523, + "grad_norm": 2.122959613800049, + "learning_rate": 1.0216962363711816e-05, + "loss": 0.6705, + "step": 6396 + }, + { + "epoch": 0.48405281676818884, + "grad_norm": 2.246718645095825, + "learning_rate": 1.0214730793160018e-05, + "loss": 0.6594, + "step": 6397 + }, + { + "epoch": 0.4841284854905225, + "grad_norm": 2.107835054397583, + "learning_rate": 1.0212499146537853e-05, + "loss": 0.6163, + "step": 6398 + }, + { + "epoch": 0.4842041542128561, + "grad_norm": 2.2131471633911133, + "learning_rate": 1.0210267423985067e-05, + "loss": 0.6586, + "step": 6399 + }, + { + "epoch": 0.48427982293518973, + "grad_norm": 2.5556445121765137, + "learning_rate": 1.0208035625641424e-05, + "loss": 0.6997, + "step": 6400 + }, + { + "epoch": 0.48435549165752334, + "grad_norm": 1.8932169675827026, + "learning_rate": 1.020580375164668e-05, + "loss": 0.8308, + "step": 6401 + }, + { + "epoch": 0.484431160379857, + "grad_norm": 1.881028413772583, + "learning_rate": 1.0203571802140605e-05, + "loss": 0.6717, + "step": 6402 + }, + { + "epoch": 0.4845068291021906, + "grad_norm": 2.3895459175109863, + "learning_rate": 1.020133977726297e-05, + "loss": 0.6854, + "step": 6403 + }, + { + "epoch": 0.4845824978245242, + "grad_norm": 1.8925831317901611, + "learning_rate": 1.0199107677153554e-05, + "loss": 0.6973, + "step": 6404 + }, + { + "epoch": 0.48465816654685784, + "grad_norm": 2.5870819091796875, + "learning_rate": 1.0196875501952137e-05, + "loss": 
0.6679, + "step": 6405 + }, + { + "epoch": 0.4847338352691915, + "grad_norm": 2.1854963302612305, + "learning_rate": 1.01946432517985e-05, + "loss": 0.7334, + "step": 6406 + }, + { + "epoch": 0.4848095039915251, + "grad_norm": 3.5086848735809326, + "learning_rate": 1.0192410926832446e-05, + "loss": 0.6914, + "step": 6407 + }, + { + "epoch": 0.4848851727138587, + "grad_norm": 3.082146167755127, + "learning_rate": 1.0190178527193761e-05, + "loss": 0.7652, + "step": 6408 + }, + { + "epoch": 0.48496084143619234, + "grad_norm": 2.8593039512634277, + "learning_rate": 1.0187946053022247e-05, + "loss": 0.6731, + "step": 6409 + }, + { + "epoch": 0.48503651015852595, + "grad_norm": 2.5862269401550293, + "learning_rate": 1.0185713504457709e-05, + "loss": 0.6637, + "step": 6410 + }, + { + "epoch": 0.4851121788808596, + "grad_norm": 1.955764889717102, + "learning_rate": 1.0183480881639952e-05, + "loss": 0.6526, + "step": 6411 + }, + { + "epoch": 0.4851878476031932, + "grad_norm": 2.400613307952881, + "learning_rate": 1.01812481847088e-05, + "loss": 0.7565, + "step": 6412 + }, + { + "epoch": 0.48526351632552683, + "grad_norm": 1.8675727844238281, + "learning_rate": 1.0179015413804063e-05, + "loss": 0.6738, + "step": 6413 + }, + { + "epoch": 0.48533918504786044, + "grad_norm": 2.350315809249878, + "learning_rate": 1.0176782569065568e-05, + "loss": 0.7441, + "step": 6414 + }, + { + "epoch": 0.4854148537701941, + "grad_norm": 2.3835151195526123, + "learning_rate": 1.0174549650633142e-05, + "loss": 0.6982, + "step": 6415 + }, + { + "epoch": 0.4854905224925277, + "grad_norm": 2.2682459354400635, + "learning_rate": 1.0172316658646619e-05, + "loss": 0.6537, + "step": 6416 + }, + { + "epoch": 0.48556619121486133, + "grad_norm": 1.8722403049468994, + "learning_rate": 1.0170083593245836e-05, + "loss": 0.8612, + "step": 6417 + }, + { + "epoch": 0.48564185993719494, + "grad_norm": 1.8039960861206055, + "learning_rate": 1.0167850454570632e-05, + "loss": 0.7195, + "step": 6418 + }, + { + "epoch": 0.4857175286595286, + "grad_norm": 2.1905548572540283, + "learning_rate": 1.0165617242760855e-05, + "loss": 0.8805, + "step": 6419 + }, + { + "epoch": 0.4857931973818622, + "grad_norm": 3.8410537242889404, + "learning_rate": 1.0163383957956357e-05, + "loss": 0.7325, + "step": 6420 + }, + { + "epoch": 0.48586886610419583, + "grad_norm": 1.9922008514404297, + "learning_rate": 1.0161150600296993e-05, + "loss": 0.762, + "step": 6421 + }, + { + "epoch": 0.48594453482652944, + "grad_norm": 2.191408157348633, + "learning_rate": 1.0158917169922622e-05, + "loss": 0.7489, + "step": 6422 + }, + { + "epoch": 0.48602020354886305, + "grad_norm": 1.968904733657837, + "learning_rate": 1.0156683666973112e-05, + "loss": 0.6926, + "step": 6423 + }, + { + "epoch": 0.4860958722711967, + "grad_norm": 2.3216192722320557, + "learning_rate": 1.0154450091588326e-05, + "loss": 0.7792, + "step": 6424 + }, + { + "epoch": 0.4861715409935303, + "grad_norm": 1.8544453382492065, + "learning_rate": 1.0152216443908144e-05, + "loss": 0.6568, + "step": 6425 + }, + { + "epoch": 0.48624720971586394, + "grad_norm": 2.2261478900909424, + "learning_rate": 1.0149982724072439e-05, + "loss": 0.7715, + "step": 6426 + }, + { + "epoch": 0.48632287843819755, + "grad_norm": 2.093865394592285, + "learning_rate": 1.0147748932221098e-05, + "loss": 0.738, + "step": 6427 + }, + { + "epoch": 0.4863985471605312, + "grad_norm": 2.6762888431549072, + "learning_rate": 1.0145515068494007e-05, + "loss": 0.7401, + "step": 6428 + }, + { + "epoch": 0.4864742158828648, + "grad_norm": 
1.9878803491592407, + "learning_rate": 1.0143281133031056e-05, + "loss": 0.6209, + "step": 6429 + }, + { + "epoch": 0.48654988460519843, + "grad_norm": 2.1690900325775146, + "learning_rate": 1.0141047125972145e-05, + "loss": 0.7118, + "step": 6430 + }, + { + "epoch": 0.48662555332753205, + "grad_norm": 2.016566038131714, + "learning_rate": 1.013881304745717e-05, + "loss": 0.6214, + "step": 6431 + }, + { + "epoch": 0.4867012220498657, + "grad_norm": 1.7480603456497192, + "learning_rate": 1.0136578897626037e-05, + "loss": 0.7928, + "step": 6432 + }, + { + "epoch": 0.4867768907721993, + "grad_norm": 2.419851064682007, + "learning_rate": 1.013434467661866e-05, + "loss": 0.7508, + "step": 6433 + }, + { + "epoch": 0.48685255949453293, + "grad_norm": 3.0105323791503906, + "learning_rate": 1.0132110384574949e-05, + "loss": 0.7791, + "step": 6434 + }, + { + "epoch": 0.48692822821686654, + "grad_norm": 1.9230643510818481, + "learning_rate": 1.0129876021634826e-05, + "loss": 0.7613, + "step": 6435 + }, + { + "epoch": 0.4870038969392002, + "grad_norm": 2.396361827850342, + "learning_rate": 1.0127641587938213e-05, + "loss": 0.6539, + "step": 6436 + }, + { + "epoch": 0.4870795656615338, + "grad_norm": 2.384631395339966, + "learning_rate": 1.0125407083625034e-05, + "loss": 0.661, + "step": 6437 + }, + { + "epoch": 0.48715523438386743, + "grad_norm": 1.9804085493087769, + "learning_rate": 1.0123172508835224e-05, + "loss": 0.6685, + "step": 6438 + }, + { + "epoch": 0.48723090310620104, + "grad_norm": 2.5004003047943115, + "learning_rate": 1.0120937863708718e-05, + "loss": 0.937, + "step": 6439 + }, + { + "epoch": 0.48730657182853465, + "grad_norm": 1.8940950632095337, + "learning_rate": 1.0118703148385458e-05, + "loss": 0.7562, + "step": 6440 + }, + { + "epoch": 0.4873822405508683, + "grad_norm": 1.8344001770019531, + "learning_rate": 1.0116468363005388e-05, + "loss": 0.6854, + "step": 6441 + }, + { + "epoch": 0.48745790927320193, + "grad_norm": 2.383427858352661, + "learning_rate": 1.011423350770846e-05, + "loss": 0.7046, + "step": 6442 + }, + { + "epoch": 0.48753357799553554, + "grad_norm": 2.256309747695923, + "learning_rate": 1.0111998582634623e-05, + "loss": 0.7245, + "step": 6443 + }, + { + "epoch": 0.48760924671786915, + "grad_norm": 7.691187381744385, + "learning_rate": 1.0109763587923842e-05, + "loss": 0.7306, + "step": 6444 + }, + { + "epoch": 0.4876849154402028, + "grad_norm": 2.102891206741333, + "learning_rate": 1.0107528523716071e-05, + "loss": 0.6399, + "step": 6445 + }, + { + "epoch": 0.4877605841625364, + "grad_norm": 2.5367937088012695, + "learning_rate": 1.0105293390151287e-05, + "loss": 0.6951, + "step": 6446 + }, + { + "epoch": 0.48783625288487004, + "grad_norm": 1.8842484951019287, + "learning_rate": 1.0103058187369451e-05, + "loss": 0.7347, + "step": 6447 + }, + { + "epoch": 0.48791192160720365, + "grad_norm": 1.78765869140625, + "learning_rate": 1.0100822915510547e-05, + "loss": 0.6858, + "step": 6448 + }, + { + "epoch": 0.4879875903295373, + "grad_norm": 1.82582688331604, + "learning_rate": 1.0098587574714548e-05, + "loss": 0.7601, + "step": 6449 + }, + { + "epoch": 0.4880632590518709, + "grad_norm": 2.041457176208496, + "learning_rate": 1.0096352165121444e-05, + "loss": 0.6765, + "step": 6450 + }, + { + "epoch": 0.48813892777420453, + "grad_norm": 2.2032086849212646, + "learning_rate": 1.0094116686871222e-05, + "loss": 0.6296, + "step": 6451 + }, + { + "epoch": 0.48821459649653814, + "grad_norm": 2.4517290592193604, + "learning_rate": 1.0091881140103873e-05, + "loss": 0.8565, + 
"step": 6452 + }, + { + "epoch": 0.48829026521887176, + "grad_norm": 2.434957504272461, + "learning_rate": 1.0089645524959398e-05, + "loss": 0.8896, + "step": 6453 + }, + { + "epoch": 0.4883659339412054, + "grad_norm": 2.1253631114959717, + "learning_rate": 1.0087409841577793e-05, + "loss": 0.5887, + "step": 6454 + }, + { + "epoch": 0.48844160266353903, + "grad_norm": 2.2699928283691406, + "learning_rate": 1.0085174090099066e-05, + "loss": 0.7519, + "step": 6455 + }, + { + "epoch": 0.48851727138587264, + "grad_norm": 2.0873475074768066, + "learning_rate": 1.008293827066323e-05, + "loss": 0.6143, + "step": 6456 + }, + { + "epoch": 0.48859294010820625, + "grad_norm": 2.778334140777588, + "learning_rate": 1.0080702383410296e-05, + "loss": 0.7002, + "step": 6457 + }, + { + "epoch": 0.4886686088305399, + "grad_norm": 2.2571263313293457, + "learning_rate": 1.0078466428480285e-05, + "loss": 0.6735, + "step": 6458 + }, + { + "epoch": 0.48874427755287353, + "grad_norm": 2.0266454219818115, + "learning_rate": 1.0076230406013216e-05, + "loss": 0.782, + "step": 6459 + }, + { + "epoch": 0.48881994627520714, + "grad_norm": 1.8740894794464111, + "learning_rate": 1.0073994316149117e-05, + "loss": 0.6624, + "step": 6460 + }, + { + "epoch": 0.48889561499754075, + "grad_norm": 2.3470191955566406, + "learning_rate": 1.0071758159028023e-05, + "loss": 0.6661, + "step": 6461 + }, + { + "epoch": 0.4889712837198744, + "grad_norm": 2.5701887607574463, + "learning_rate": 1.0069521934789965e-05, + "loss": 0.6928, + "step": 6462 + }, + { + "epoch": 0.489046952442208, + "grad_norm": 1.8447167873382568, + "learning_rate": 1.0067285643574983e-05, + "loss": 0.5812, + "step": 6463 + }, + { + "epoch": 0.48912262116454164, + "grad_norm": 2.286895751953125, + "learning_rate": 1.0065049285523126e-05, + "loss": 0.751, + "step": 6464 + }, + { + "epoch": 0.48919828988687525, + "grad_norm": 1.9406249523162842, + "learning_rate": 1.0062812860774435e-05, + "loss": 0.7846, + "step": 6465 + }, + { + "epoch": 0.48927395860920886, + "grad_norm": 2.343668222427368, + "learning_rate": 1.0060576369468964e-05, + "loss": 0.6193, + "step": 6466 + }, + { + "epoch": 0.4893496273315425, + "grad_norm": 2.254185199737549, + "learning_rate": 1.0058339811746774e-05, + "loss": 0.7514, + "step": 6467 + }, + { + "epoch": 0.48942529605387614, + "grad_norm": 1.851659893989563, + "learning_rate": 1.0056103187747916e-05, + "loss": 0.6014, + "step": 6468 + }, + { + "epoch": 0.48950096477620975, + "grad_norm": 1.9072391986846924, + "learning_rate": 1.0053866497612465e-05, + "loss": 0.6593, + "step": 6469 + }, + { + "epoch": 0.48957663349854336, + "grad_norm": 2.4362573623657227, + "learning_rate": 1.0051629741480483e-05, + "loss": 0.6269, + "step": 6470 + }, + { + "epoch": 0.489652302220877, + "grad_norm": 2.2436280250549316, + "learning_rate": 1.004939291949205e-05, + "loss": 0.6636, + "step": 6471 + }, + { + "epoch": 0.48972797094321063, + "grad_norm": 1.9060099124908447, + "learning_rate": 1.0047156031787233e-05, + "loss": 0.7561, + "step": 6472 + }, + { + "epoch": 0.48980363966554424, + "grad_norm": 2.352036714553833, + "learning_rate": 1.0044919078506122e-05, + "loss": 0.6255, + "step": 6473 + }, + { + "epoch": 0.48987930838787785, + "grad_norm": 1.977865219116211, + "learning_rate": 1.0042682059788798e-05, + "loss": 0.6376, + "step": 6474 + }, + { + "epoch": 0.4899549771102115, + "grad_norm": 3.1520273685455322, + "learning_rate": 1.0040444975775348e-05, + "loss": 0.8148, + "step": 6475 + }, + { + "epoch": 0.49003064583254513, + "grad_norm": 
1.8340657949447632, + "learning_rate": 1.0038207826605871e-05, + "loss": 0.5468, + "step": 6476 + }, + { + "epoch": 0.49010631455487874, + "grad_norm": 2.7415192127227783, + "learning_rate": 1.003597061242046e-05, + "loss": 0.7775, + "step": 6477 + }, + { + "epoch": 0.49018198327721235, + "grad_norm": 2.266130208969116, + "learning_rate": 1.003373333335922e-05, + "loss": 0.7789, + "step": 6478 + }, + { + "epoch": 0.49025765199954596, + "grad_norm": 2.030848503112793, + "learning_rate": 1.0031495989562255e-05, + "loss": 0.709, + "step": 6479 + }, + { + "epoch": 0.49033332072187963, + "grad_norm": 1.9253787994384766, + "learning_rate": 1.0029258581169675e-05, + "loss": 0.8155, + "step": 6480 + }, + { + "epoch": 0.49040898944421324, + "grad_norm": 2.235677480697632, + "learning_rate": 1.0027021108321597e-05, + "loss": 0.6628, + "step": 6481 + }, + { + "epoch": 0.49048465816654685, + "grad_norm": 2.522493600845337, + "learning_rate": 1.002478357115813e-05, + "loss": 0.6663, + "step": 6482 + }, + { + "epoch": 0.49056032688888046, + "grad_norm": 2.5765788555145264, + "learning_rate": 1.0022545969819403e-05, + "loss": 0.6466, + "step": 6483 + }, + { + "epoch": 0.4906359956112141, + "grad_norm": 2.641915798187256, + "learning_rate": 1.0020308304445539e-05, + "loss": 0.777, + "step": 6484 + }, + { + "epoch": 0.49071166433354774, + "grad_norm": 4.392044544219971, + "learning_rate": 1.0018070575176672e-05, + "loss": 0.6881, + "step": 6485 + }, + { + "epoch": 0.49078733305588135, + "grad_norm": 1.90240478515625, + "learning_rate": 1.0015832782152928e-05, + "loss": 0.7021, + "step": 6486 + }, + { + "epoch": 0.49086300177821496, + "grad_norm": 1.9071075916290283, + "learning_rate": 1.0013594925514453e-05, + "loss": 0.6333, + "step": 6487 + }, + { + "epoch": 0.4909386705005486, + "grad_norm": 1.997644066810608, + "learning_rate": 1.0011357005401386e-05, + "loss": 0.6004, + "step": 6488 + }, + { + "epoch": 0.49101433922288223, + "grad_norm": 2.4626305103302, + "learning_rate": 1.000911902195387e-05, + "loss": 0.7033, + "step": 6489 + }, + { + "epoch": 0.49109000794521585, + "grad_norm": 1.9787263870239258, + "learning_rate": 1.0006880975312061e-05, + "loss": 0.6778, + "step": 6490 + }, + { + "epoch": 0.49116567666754946, + "grad_norm": 2.7848620414733887, + "learning_rate": 1.0004642865616104e-05, + "loss": 0.5979, + "step": 6491 + }, + { + "epoch": 0.49124134538988307, + "grad_norm": 2.4936132431030273, + "learning_rate": 1.0002404693006164e-05, + "loss": 0.7636, + "step": 6492 + }, + { + "epoch": 0.49131701411221673, + "grad_norm": 1.9078294038772583, + "learning_rate": 1.0000166457622396e-05, + "loss": 0.6217, + "step": 6493 + }, + { + "epoch": 0.49139268283455034, + "grad_norm": 2.200657844543457, + "learning_rate": 9.997928159604974e-06, + "loss": 0.698, + "step": 6494 + }, + { + "epoch": 0.49146835155688395, + "grad_norm": 2.0636913776397705, + "learning_rate": 9.99568979909406e-06, + "loss": 0.7468, + "step": 6495 + }, + { + "epoch": 0.49154402027921756, + "grad_norm": 2.1066038608551025, + "learning_rate": 9.993451376229832e-06, + "loss": 0.5998, + "step": 6496 + }, + { + "epoch": 0.49161968900155123, + "grad_norm": 2.909907579421997, + "learning_rate": 9.991212891152469e-06, + "loss": 0.7663, + "step": 6497 + }, + { + "epoch": 0.49169535772388484, + "grad_norm": 2.411532402038574, + "learning_rate": 9.988974344002143e-06, + "loss": 0.727, + "step": 6498 + }, + { + "epoch": 0.49177102644621845, + "grad_norm": 2.1404001712799072, + "learning_rate": 9.986735734919048e-06, + "loss": 0.7662, + "step": 
6499 + }, + { + "epoch": 0.49184669516855206, + "grad_norm": 1.9575718641281128, + "learning_rate": 9.984497064043367e-06, + "loss": 0.7276, + "step": 6500 + }, + { + "epoch": 0.4919223638908857, + "grad_norm": 2.085799217224121, + "learning_rate": 9.982258331515298e-06, + "loss": 0.8749, + "step": 6501 + }, + { + "epoch": 0.49199803261321934, + "grad_norm": 2.514505624771118, + "learning_rate": 9.980019537475034e-06, + "loss": 0.7701, + "step": 6502 + }, + { + "epoch": 0.49207370133555295, + "grad_norm": 2.4015519618988037, + "learning_rate": 9.977780682062779e-06, + "loss": 0.7562, + "step": 6503 + }, + { + "epoch": 0.49214937005788656, + "grad_norm": 2.25130033493042, + "learning_rate": 9.975541765418734e-06, + "loss": 0.9941, + "step": 6504 + }, + { + "epoch": 0.49222503878022017, + "grad_norm": 2.4228641986846924, + "learning_rate": 9.973302787683106e-06, + "loss": 0.7318, + "step": 6505 + }, + { + "epoch": 0.49230070750255384, + "grad_norm": 2.58864688873291, + "learning_rate": 9.971063748996113e-06, + "loss": 0.6254, + "step": 6506 + }, + { + "epoch": 0.49237637622488745, + "grad_norm": 2.389697790145874, + "learning_rate": 9.968824649497963e-06, + "loss": 0.6684, + "step": 6507 + }, + { + "epoch": 0.49245204494722106, + "grad_norm": 2.1879706382751465, + "learning_rate": 9.966585489328885e-06, + "loss": 0.7188, + "step": 6508 + }, + { + "epoch": 0.49252771366955467, + "grad_norm": 2.2431092262268066, + "learning_rate": 9.964346268629092e-06, + "loss": 0.727, + "step": 6509 + }, + { + "epoch": 0.49260338239188833, + "grad_norm": 2.4414150714874268, + "learning_rate": 9.962106987538822e-06, + "loss": 0.9228, + "step": 6510 + }, + { + "epoch": 0.49267905111422194, + "grad_norm": 2.145207166671753, + "learning_rate": 9.959867646198299e-06, + "loss": 0.611, + "step": 6511 + }, + { + "epoch": 0.49275471983655555, + "grad_norm": 2.1104111671447754, + "learning_rate": 9.957628244747755e-06, + "loss": 0.8066, + "step": 6512 + }, + { + "epoch": 0.49283038855888917, + "grad_norm": 2.5595433712005615, + "learning_rate": 9.95538878332744e-06, + "loss": 0.6367, + "step": 6513 + }, + { + "epoch": 0.49290605728122283, + "grad_norm": 2.492157459259033, + "learning_rate": 9.953149262077583e-06, + "loss": 0.811, + "step": 6514 + }, + { + "epoch": 0.49298172600355644, + "grad_norm": 2.075108528137207, + "learning_rate": 9.95090968113844e-06, + "loss": 0.6916, + "step": 6515 + }, + { + "epoch": 0.49305739472589005, + "grad_norm": 3.124265432357788, + "learning_rate": 9.948670040650253e-06, + "loss": 0.6464, + "step": 6516 + }, + { + "epoch": 0.49313306344822366, + "grad_norm": 3.6622323989868164, + "learning_rate": 9.946430340753285e-06, + "loss": 0.5503, + "step": 6517 + }, + { + "epoch": 0.4932087321705573, + "grad_norm": 2.151686191558838, + "learning_rate": 9.944190581587787e-06, + "loss": 0.697, + "step": 6518 + }, + { + "epoch": 0.49328440089289094, + "grad_norm": 1.9287328720092773, + "learning_rate": 9.941950763294019e-06, + "loss": 0.9503, + "step": 6519 + }, + { + "epoch": 0.49336006961522455, + "grad_norm": 2.108152389526367, + "learning_rate": 9.93971088601225e-06, + "loss": 0.6141, + "step": 6520 + }, + { + "epoch": 0.49343573833755816, + "grad_norm": 1.9373310804367065, + "learning_rate": 9.937470949882741e-06, + "loss": 0.8837, + "step": 6521 + }, + { + "epoch": 0.49351140705989177, + "grad_norm": 2.2187082767486572, + "learning_rate": 9.935230955045775e-06, + "loss": 0.7105, + "step": 6522 + }, + { + "epoch": 0.49358707578222544, + "grad_norm": 2.0548200607299805, + "learning_rate": 
9.932990901641616e-06, + "loss": 0.7974, + "step": 6523 + }, + { + "epoch": 0.49366274450455905, + "grad_norm": 2.771439790725708, + "learning_rate": 9.930750789810554e-06, + "loss": 0.7277, + "step": 6524 + }, + { + "epoch": 0.49373841322689266, + "grad_norm": 2.260519504547119, + "learning_rate": 9.928510619692862e-06, + "loss": 0.7294, + "step": 6525 + }, + { + "epoch": 0.49381408194922627, + "grad_norm": 2.2592484951019287, + "learning_rate": 9.92627039142884e-06, + "loss": 0.6168, + "step": 6526 + }, + { + "epoch": 0.49388975067155994, + "grad_norm": 2.256322145462036, + "learning_rate": 9.924030105158762e-06, + "loss": 0.7252, + "step": 6527 + }, + { + "epoch": 0.49396541939389355, + "grad_norm": 2.2179460525512695, + "learning_rate": 9.921789761022933e-06, + "loss": 0.715, + "step": 6528 + }, + { + "epoch": 0.49404108811622716, + "grad_norm": 2.221290111541748, + "learning_rate": 9.919549359161649e-06, + "loss": 0.7855, + "step": 6529 + }, + { + "epoch": 0.49411675683856077, + "grad_norm": 2.497633218765259, + "learning_rate": 9.917308899715208e-06, + "loss": 0.6075, + "step": 6530 + }, + { + "epoch": 0.4941924255608944, + "grad_norm": 2.1124684810638428, + "learning_rate": 9.915068382823918e-06, + "loss": 0.7536, + "step": 6531 + }, + { + "epoch": 0.49426809428322804, + "grad_norm": 1.9984757900238037, + "learning_rate": 9.912827808628085e-06, + "loss": 0.5954, + "step": 6532 + }, + { + "epoch": 0.49434376300556165, + "grad_norm": 2.670492172241211, + "learning_rate": 9.910587177268025e-06, + "loss": 0.7755, + "step": 6533 + }, + { + "epoch": 0.49441943172789526, + "grad_norm": 1.9507744312286377, + "learning_rate": 9.908346488884048e-06, + "loss": 0.6622, + "step": 6534 + }, + { + "epoch": 0.4944951004502289, + "grad_norm": 2.2064528465270996, + "learning_rate": 9.906105743616476e-06, + "loss": 0.7481, + "step": 6535 + }, + { + "epoch": 0.49457076917256254, + "grad_norm": 2.032932758331299, + "learning_rate": 9.903864941605631e-06, + "loss": 0.6665, + "step": 6536 + }, + { + "epoch": 0.49464643789489615, + "grad_norm": 1.6875627040863037, + "learning_rate": 9.901624082991842e-06, + "loss": 0.6829, + "step": 6537 + }, + { + "epoch": 0.49472210661722976, + "grad_norm": 2.333106517791748, + "learning_rate": 9.899383167915438e-06, + "loss": 0.7117, + "step": 6538 + }, + { + "epoch": 0.4947977753395634, + "grad_norm": 2.2066781520843506, + "learning_rate": 9.897142196516745e-06, + "loss": 0.628, + "step": 6539 + }, + { + "epoch": 0.49487344406189704, + "grad_norm": 1.9764480590820312, + "learning_rate": 9.894901168936112e-06, + "loss": 0.7689, + "step": 6540 + }, + { + "epoch": 0.49494911278423065, + "grad_norm": 1.911521077156067, + "learning_rate": 9.892660085313872e-06, + "loss": 0.6511, + "step": 6541 + }, + { + "epoch": 0.49502478150656426, + "grad_norm": 2.4168167114257812, + "learning_rate": 9.890418945790369e-06, + "loss": 0.6539, + "step": 6542 + }, + { + "epoch": 0.49510045022889787, + "grad_norm": 2.118840217590332, + "learning_rate": 9.88817775050595e-06, + "loss": 0.7646, + "step": 6543 + }, + { + "epoch": 0.4951761189512315, + "grad_norm": 2.163649082183838, + "learning_rate": 9.885936499600972e-06, + "loss": 0.8885, + "step": 6544 + }, + { + "epoch": 0.49525178767356515, + "grad_norm": 1.633584976196289, + "learning_rate": 9.883695193215784e-06, + "loss": 0.7617, + "step": 6545 + }, + { + "epoch": 0.49532745639589876, + "grad_norm": 2.28901743888855, + "learning_rate": 9.881453831490741e-06, + "loss": 0.6508, + "step": 6546 + }, + { + "epoch": 0.49540312511823237, + 
"grad_norm": 1.933939814567566, + "learning_rate": 9.879212414566212e-06, + "loss": 0.6829, + "step": 6547 + }, + { + "epoch": 0.495478793840566, + "grad_norm": 2.058504104614258, + "learning_rate": 9.876970942582555e-06, + "loss": 0.7911, + "step": 6548 + }, + { + "epoch": 0.49555446256289964, + "grad_norm": 4.5961127281188965, + "learning_rate": 9.874729415680145e-06, + "loss": 0.6346, + "step": 6549 + }, + { + "epoch": 0.49563013128523326, + "grad_norm": 2.232285499572754, + "learning_rate": 9.872487833999343e-06, + "loss": 0.8858, + "step": 6550 + }, + { + "epoch": 0.49570580000756687, + "grad_norm": 2.6463253498077393, + "learning_rate": 9.870246197680539e-06, + "loss": 0.7364, + "step": 6551 + }, + { + "epoch": 0.4957814687299005, + "grad_norm": 2.0343997478485107, + "learning_rate": 9.868004506864098e-06, + "loss": 0.6202, + "step": 6552 + }, + { + "epoch": 0.49585713745223414, + "grad_norm": 2.176643133163452, + "learning_rate": 9.86576276169041e-06, + "loss": 0.6113, + "step": 6553 + }, + { + "epoch": 0.49593280617456775, + "grad_norm": 2.058581829071045, + "learning_rate": 9.863520962299858e-06, + "loss": 0.6814, + "step": 6554 + }, + { + "epoch": 0.49600847489690136, + "grad_norm": 1.982015609741211, + "learning_rate": 9.861279108832825e-06, + "loss": 0.7254, + "step": 6555 + }, + { + "epoch": 0.496084143619235, + "grad_norm": 2.0388500690460205, + "learning_rate": 9.859037201429715e-06, + "loss": 0.7784, + "step": 6556 + }, + { + "epoch": 0.49615981234156864, + "grad_norm": 2.746868133544922, + "learning_rate": 9.85679524023091e-06, + "loss": 0.7083, + "step": 6557 + }, + { + "epoch": 0.49623548106390225, + "grad_norm": 1.9284957647323608, + "learning_rate": 9.854553225376823e-06, + "loss": 0.6415, + "step": 6558 + }, + { + "epoch": 0.49631114978623586, + "grad_norm": 2.158855438232422, + "learning_rate": 9.852311157007845e-06, + "loss": 0.4699, + "step": 6559 + }, + { + "epoch": 0.49638681850856947, + "grad_norm": 1.9091185331344604, + "learning_rate": 9.850069035264388e-06, + "loss": 0.7226, + "step": 6560 + }, + { + "epoch": 0.4964624872309031, + "grad_norm": 2.265294313430786, + "learning_rate": 9.84782686028686e-06, + "loss": 0.7521, + "step": 6561 + }, + { + "epoch": 0.49653815595323675, + "grad_norm": 2.8000192642211914, + "learning_rate": 9.845584632215667e-06, + "loss": 0.8844, + "step": 6562 + }, + { + "epoch": 0.49661382467557036, + "grad_norm": 2.02162504196167, + "learning_rate": 9.843342351191232e-06, + "loss": 0.7957, + "step": 6563 + }, + { + "epoch": 0.49668949339790397, + "grad_norm": 2.197127103805542, + "learning_rate": 9.841100017353972e-06, + "loss": 0.791, + "step": 6564 + }, + { + "epoch": 0.4967651621202376, + "grad_norm": 1.8492189645767212, + "learning_rate": 9.838857630844305e-06, + "loss": 0.7472, + "step": 6565 + }, + { + "epoch": 0.49684083084257125, + "grad_norm": 2.151035785675049, + "learning_rate": 9.836615191802663e-06, + "loss": 0.5994, + "step": 6566 + }, + { + "epoch": 0.49691649956490486, + "grad_norm": 2.232987642288208, + "learning_rate": 9.834372700369472e-06, + "loss": 0.8005, + "step": 6567 + }, + { + "epoch": 0.49699216828723847, + "grad_norm": 1.6914528608322144, + "learning_rate": 9.832130156685163e-06, + "loss": 0.795, + "step": 6568 + }, + { + "epoch": 0.4970678370095721, + "grad_norm": 2.227271318435669, + "learning_rate": 9.829887560890171e-06, + "loss": 0.7877, + "step": 6569 + }, + { + "epoch": 0.49714350573190574, + "grad_norm": 2.173454523086548, + "learning_rate": 9.827644913124937e-06, + "loss": 0.7479, + "step": 6570 + 
}, + { + "epoch": 0.49721917445423935, + "grad_norm": 2.1114470958709717, + "learning_rate": 9.8254022135299e-06, + "loss": 0.5879, + "step": 6571 + }, + { + "epoch": 0.49729484317657296, + "grad_norm": 1.8386335372924805, + "learning_rate": 9.82315946224551e-06, + "loss": 0.572, + "step": 6572 + }, + { + "epoch": 0.4973705118989066, + "grad_norm": 2.5144834518432617, + "learning_rate": 9.820916659412208e-06, + "loss": 0.6295, + "step": 6573 + }, + { + "epoch": 0.4974461806212402, + "grad_norm": 2.596182107925415, + "learning_rate": 9.818673805170454e-06, + "loss": 0.7412, + "step": 6574 + }, + { + "epoch": 0.49752184934357385, + "grad_norm": 1.809083342552185, + "learning_rate": 9.816430899660695e-06, + "loss": 0.7819, + "step": 6575 + }, + { + "epoch": 0.49759751806590746, + "grad_norm": 1.6025587320327759, + "learning_rate": 9.814187943023394e-06, + "loss": 0.6106, + "step": 6576 + }, + { + "epoch": 0.4976731867882411, + "grad_norm": 2.2268176078796387, + "learning_rate": 9.811944935399011e-06, + "loss": 0.7396, + "step": 6577 + }, + { + "epoch": 0.4977488555105747, + "grad_norm": 2.883540391921997, + "learning_rate": 9.809701876928007e-06, + "loss": 0.7096, + "step": 6578 + }, + { + "epoch": 0.49782452423290835, + "grad_norm": 2.127419948577881, + "learning_rate": 9.807458767750857e-06, + "loss": 0.5795, + "step": 6579 + }, + { + "epoch": 0.49790019295524196, + "grad_norm": 2.2806951999664307, + "learning_rate": 9.805215608008025e-06, + "loss": 0.7534, + "step": 6580 + }, + { + "epoch": 0.49797586167757557, + "grad_norm": 2.44822359085083, + "learning_rate": 9.802972397839987e-06, + "loss": 0.6425, + "step": 6581 + }, + { + "epoch": 0.4980515303999092, + "grad_norm": 2.3223483562469482, + "learning_rate": 9.800729137387221e-06, + "loss": 0.6621, + "step": 6582 + }, + { + "epoch": 0.49812719912224285, + "grad_norm": 2.271935224533081, + "learning_rate": 9.798485826790205e-06, + "loss": 0.6618, + "step": 6583 + }, + { + "epoch": 0.49820286784457646, + "grad_norm": 1.8532313108444214, + "learning_rate": 9.796242466189427e-06, + "loss": 0.615, + "step": 6584 + }, + { + "epoch": 0.49827853656691007, + "grad_norm": 1.9369428157806396, + "learning_rate": 9.793999055725368e-06, + "loss": 0.7089, + "step": 6585 + }, + { + "epoch": 0.4983542052892437, + "grad_norm": 2.717416286468506, + "learning_rate": 9.79175559553852e-06, + "loss": 0.6442, + "step": 6586 + }, + { + "epoch": 0.4984298740115773, + "grad_norm": 2.2628488540649414, + "learning_rate": 9.789512085769375e-06, + "loss": 0.6872, + "step": 6587 + }, + { + "epoch": 0.49850554273391096, + "grad_norm": 2.6931004524230957, + "learning_rate": 9.787268526558431e-06, + "loss": 0.6916, + "step": 6588 + }, + { + "epoch": 0.49858121145624457, + "grad_norm": 2.3270750045776367, + "learning_rate": 9.785024918046185e-06, + "loss": 0.7876, + "step": 6589 + }, + { + "epoch": 0.4986568801785782, + "grad_norm": 2.268930673599243, + "learning_rate": 9.782781260373143e-06, + "loss": 0.6831, + "step": 6590 + }, + { + "epoch": 0.4987325489009118, + "grad_norm": 3.6653621196746826, + "learning_rate": 9.780537553679803e-06, + "loss": 0.6437, + "step": 6591 + }, + { + "epoch": 0.49880821762324545, + "grad_norm": 3.2328827381134033, + "learning_rate": 9.778293798106676e-06, + "loss": 0.668, + "step": 6592 + }, + { + "epoch": 0.49888388634557906, + "grad_norm": 2.2856032848358154, + "learning_rate": 9.776049993794277e-06, + "loss": 0.8449, + "step": 6593 + }, + { + "epoch": 0.4989595550679127, + "grad_norm": 1.9682530164718628, + "learning_rate": 
9.773806140883115e-06, + "loss": 0.8408, + "step": 6594 + }, + { + "epoch": 0.4990352237902463, + "grad_norm": 2.026069402694702, + "learning_rate": 9.771562239513712e-06, + "loss": 1.0183, + "step": 6595 + }, + { + "epoch": 0.49911089251257995, + "grad_norm": 2.4245307445526123, + "learning_rate": 9.769318289826581e-06, + "loss": 0.666, + "step": 6596 + }, + { + "epoch": 0.49918656123491356, + "grad_norm": 1.815333604812622, + "learning_rate": 9.767074291962257e-06, + "loss": 0.7764, + "step": 6597 + }, + { + "epoch": 0.49926222995724717, + "grad_norm": 1.9312478303909302, + "learning_rate": 9.764830246061256e-06, + "loss": 0.6955, + "step": 6598 + }, + { + "epoch": 0.4993378986795808, + "grad_norm": 2.2235007286071777, + "learning_rate": 9.762586152264112e-06, + "loss": 0.799, + "step": 6599 + }, + { + "epoch": 0.4994135674019144, + "grad_norm": 2.057189464569092, + "learning_rate": 9.760342010711359e-06, + "loss": 0.8393, + "step": 6600 + }, + { + "epoch": 0.49948923612424806, + "grad_norm": 2.18095326423645, + "learning_rate": 9.758097821543523e-06, + "loss": 0.5707, + "step": 6601 + }, + { + "epoch": 0.49956490484658167, + "grad_norm": 2.4955899715423584, + "learning_rate": 9.755853584901159e-06, + "loss": 0.6642, + "step": 6602 + }, + { + "epoch": 0.4996405735689153, + "grad_norm": 2.098681688308716, + "learning_rate": 9.753609300924791e-06, + "loss": 0.7294, + "step": 6603 + }, + { + "epoch": 0.4997162422912489, + "grad_norm": 1.8592923879623413, + "learning_rate": 9.751364969754975e-06, + "loss": 0.79, + "step": 6604 + }, + { + "epoch": 0.49979191101358256, + "grad_norm": 2.4341955184936523, + "learning_rate": 9.749120591532253e-06, + "loss": 0.7224, + "step": 6605 + }, + { + "epoch": 0.49986757973591617, + "grad_norm": 2.533416748046875, + "learning_rate": 9.746876166397175e-06, + "loss": 0.6489, + "step": 6606 + }, + { + "epoch": 0.4999432484582498, + "grad_norm": 1.9363288879394531, + "learning_rate": 9.7446316944903e-06, + "loss": 0.6737, + "step": 6607 + }, + { + "epoch": 0.5000189171805834, + "grad_norm": 2.0590407848358154, + "learning_rate": 9.742387175952178e-06, + "loss": 0.6893, + "step": 6608 + }, + { + "epoch": 0.500094585902917, + "grad_norm": 1.818794846534729, + "learning_rate": 9.740142610923369e-06, + "loss": 0.7371, + "step": 6609 + }, + { + "epoch": 0.5001702546252507, + "grad_norm": 2.1174185276031494, + "learning_rate": 9.737897999544437e-06, + "loss": 0.7349, + "step": 6610 + }, + { + "epoch": 0.5002459233475843, + "grad_norm": 2.133242607116699, + "learning_rate": 9.735653341955944e-06, + "loss": 0.7632, + "step": 6611 + }, + { + "epoch": 0.5003215920699179, + "grad_norm": 2.435183525085449, + "learning_rate": 9.73340863829846e-06, + "loss": 0.612, + "step": 6612 + }, + { + "epoch": 0.5003972607922516, + "grad_norm": 2.1215646266937256, + "learning_rate": 9.731163888712557e-06, + "loss": 0.6771, + "step": 6613 + }, + { + "epoch": 0.5004729295145851, + "grad_norm": 1.9832267761230469, + "learning_rate": 9.728919093338804e-06, + "loss": 0.8126, + "step": 6614 + }, + { + "epoch": 0.5005485982369188, + "grad_norm": 2.078450918197632, + "learning_rate": 9.726674252317781e-06, + "loss": 0.5838, + "step": 6615 + }, + { + "epoch": 0.5006242669592524, + "grad_norm": 2.8075807094573975, + "learning_rate": 9.724429365790064e-06, + "loss": 0.5838, + "step": 6616 + }, + { + "epoch": 0.500699935681586, + "grad_norm": 2.4533121585845947, + "learning_rate": 9.722184433896237e-06, + "loss": 0.712, + "step": 6617 + }, + { + "epoch": 0.5007756044039197, + "grad_norm": 
3.052152633666992, + "learning_rate": 9.71993945677689e-06, + "loss": 0.742, + "step": 6618 + }, + { + "epoch": 0.5008512731262532, + "grad_norm": 2.4858107566833496, + "learning_rate": 9.717694434572599e-06, + "loss": 0.7089, + "step": 6619 + }, + { + "epoch": 0.5009269418485869, + "grad_norm": 1.992408037185669, + "learning_rate": 9.715449367423966e-06, + "loss": 0.6742, + "step": 6620 + }, + { + "epoch": 0.5010026105709205, + "grad_norm": 2.0677504539489746, + "learning_rate": 9.713204255471577e-06, + "loss": 0.5632, + "step": 6621 + }, + { + "epoch": 0.5010782792932541, + "grad_norm": 2.122046947479248, + "learning_rate": 9.71095909885603e-06, + "loss": 0.7803, + "step": 6622 + }, + { + "epoch": 0.5011539480155878, + "grad_norm": 1.7793095111846924, + "learning_rate": 9.708713897717928e-06, + "loss": 0.7774, + "step": 6623 + }, + { + "epoch": 0.5012296167379214, + "grad_norm": 2.6335649490356445, + "learning_rate": 9.706468652197866e-06, + "loss": 0.7956, + "step": 6624 + }, + { + "epoch": 0.501305285460255, + "grad_norm": 2.394765853881836, + "learning_rate": 9.704223362436454e-06, + "loss": 0.7354, + "step": 6625 + }, + { + "epoch": 0.5013809541825887, + "grad_norm": 2.1570701599121094, + "learning_rate": 9.701978028574298e-06, + "loss": 0.6291, + "step": 6626 + }, + { + "epoch": 0.5014566229049222, + "grad_norm": 2.289980411529541, + "learning_rate": 9.699732650752005e-06, + "loss": 0.7032, + "step": 6627 + }, + { + "epoch": 0.5015322916272559, + "grad_norm": 2.2219419479370117, + "learning_rate": 9.697487229110192e-06, + "loss": 0.7864, + "step": 6628 + }, + { + "epoch": 0.5016079603495895, + "grad_norm": 2.5143167972564697, + "learning_rate": 9.695241763789474e-06, + "loss": 0.6695, + "step": 6629 + }, + { + "epoch": 0.5016836290719231, + "grad_norm": 2.949692726135254, + "learning_rate": 9.692996254930464e-06, + "loss": 0.733, + "step": 6630 + }, + { + "epoch": 0.5017592977942568, + "grad_norm": 1.9106590747833252, + "learning_rate": 9.690750702673792e-06, + "loss": 0.7533, + "step": 6631 + }, + { + "epoch": 0.5018349665165903, + "grad_norm": 2.054891586303711, + "learning_rate": 9.688505107160074e-06, + "loss": 0.7022, + "step": 6632 + }, + { + "epoch": 0.501910635238924, + "grad_norm": 1.7698699235916138, + "learning_rate": 9.686259468529938e-06, + "loss": 0.6956, + "step": 6633 + }, + { + "epoch": 0.5019863039612577, + "grad_norm": 2.303417205810547, + "learning_rate": 9.684013786924014e-06, + "loss": 0.6195, + "step": 6634 + }, + { + "epoch": 0.5020619726835912, + "grad_norm": 1.9892022609710693, + "learning_rate": 9.681768062482937e-06, + "loss": 0.6891, + "step": 6635 + }, + { + "epoch": 0.5021376414059249, + "grad_norm": 2.3008720874786377, + "learning_rate": 9.679522295347342e-06, + "loss": 0.6299, + "step": 6636 + }, + { + "epoch": 0.5022133101282585, + "grad_norm": 2.5181405544281006, + "learning_rate": 9.677276485657857e-06, + "loss": 0.6741, + "step": 6637 + }, + { + "epoch": 0.5022889788505921, + "grad_norm": 2.3965871334075928, + "learning_rate": 9.67503063355513e-06, + "loss": 0.758, + "step": 6638 + }, + { + "epoch": 0.5023646475729258, + "grad_norm": 2.2800729274749756, + "learning_rate": 9.672784739179801e-06, + "loss": 0.7284, + "step": 6639 + }, + { + "epoch": 0.5024403162952593, + "grad_norm": 2.018068790435791, + "learning_rate": 9.670538802672516e-06, + "loss": 0.5924, + "step": 6640 + }, + { + "epoch": 0.502515985017593, + "grad_norm": 2.180234670639038, + "learning_rate": 9.668292824173925e-06, + "loss": 0.8005, + "step": 6641 + }, + { + "epoch": 
0.5025916537399266, + "grad_norm": 2.2746224403381348, + "learning_rate": 9.666046803824671e-06, + "loss": 0.7719, + "step": 6642 + }, + { + "epoch": 0.5026673224622602, + "grad_norm": 3.1850080490112305, + "learning_rate": 9.663800741765416e-06, + "loss": 0.8144, + "step": 6643 + }, + { + "epoch": 0.5027429911845939, + "grad_norm": 2.118246078491211, + "learning_rate": 9.661554638136808e-06, + "loss": 0.7937, + "step": 6644 + }, + { + "epoch": 0.5028186599069274, + "grad_norm": 1.9210091829299927, + "learning_rate": 9.659308493079511e-06, + "loss": 0.5085, + "step": 6645 + }, + { + "epoch": 0.5028943286292611, + "grad_norm": 2.1599907875061035, + "learning_rate": 9.657062306734182e-06, + "loss": 0.6544, + "step": 6646 + }, + { + "epoch": 0.5029699973515948, + "grad_norm": 2.152679443359375, + "learning_rate": 9.654816079241487e-06, + "loss": 0.8208, + "step": 6647 + }, + { + "epoch": 0.5030456660739283, + "grad_norm": 2.152647018432617, + "learning_rate": 9.652569810742093e-06, + "loss": 0.6365, + "step": 6648 + }, + { + "epoch": 0.503121334796262, + "grad_norm": 2.0750904083251953, + "learning_rate": 9.650323501376666e-06, + "loss": 0.6757, + "step": 6649 + }, + { + "epoch": 0.5031970035185956, + "grad_norm": 2.493428945541382, + "learning_rate": 9.648077151285877e-06, + "loss": 0.7954, + "step": 6650 + }, + { + "epoch": 0.5032726722409292, + "grad_norm": 1.762840986251831, + "learning_rate": 9.645830760610401e-06, + "loss": 0.8119, + "step": 6651 + }, + { + "epoch": 0.5033483409632629, + "grad_norm": 2.1899425983428955, + "learning_rate": 9.643584329490914e-06, + "loss": 0.6388, + "step": 6652 + }, + { + "epoch": 0.5034240096855964, + "grad_norm": 2.0570414066314697, + "learning_rate": 9.641337858068094e-06, + "loss": 0.7231, + "step": 6653 + }, + { + "epoch": 0.5034996784079301, + "grad_norm": 2.0795912742614746, + "learning_rate": 9.639091346482624e-06, + "loss": 0.7492, + "step": 6654 + }, + { + "epoch": 0.5035753471302638, + "grad_norm": 2.199233293533325, + "learning_rate": 9.636844794875187e-06, + "loss": 0.6797, + "step": 6655 + }, + { + "epoch": 0.5036510158525973, + "grad_norm": 2.3205556869506836, + "learning_rate": 9.634598203386467e-06, + "loss": 0.6839, + "step": 6656 + }, + { + "epoch": 0.503726684574931, + "grad_norm": 2.707512855529785, + "learning_rate": 9.632351572157156e-06, + "loss": 0.8496, + "step": 6657 + }, + { + "epoch": 0.5038023532972645, + "grad_norm": 1.746833086013794, + "learning_rate": 9.630104901327944e-06, + "loss": 0.8147, + "step": 6658 + }, + { + "epoch": 0.5038780220195982, + "grad_norm": 1.9517803192138672, + "learning_rate": 9.62785819103953e-06, + "loss": 0.6498, + "step": 6659 + }, + { + "epoch": 0.5039536907419319, + "grad_norm": 1.9872726202011108, + "learning_rate": 9.625611441432598e-06, + "loss": 0.6999, + "step": 6660 + }, + { + "epoch": 0.5040293594642654, + "grad_norm": 2.2390949726104736, + "learning_rate": 9.623364652647858e-06, + "loss": 0.6743, + "step": 6661 + }, + { + "epoch": 0.5041050281865991, + "grad_norm": 2.458838939666748, + "learning_rate": 9.621117824826008e-06, + "loss": 0.7239, + "step": 6662 + }, + { + "epoch": 0.5041806969089327, + "grad_norm": 2.7684452533721924, + "learning_rate": 9.618870958107747e-06, + "loss": 0.67, + "step": 6663 + }, + { + "epoch": 0.5042563656312663, + "grad_norm": 2.003108501434326, + "learning_rate": 9.61662405263379e-06, + "loss": 0.6692, + "step": 6664 + }, + { + "epoch": 0.5043320343536, + "grad_norm": 1.8543363809585571, + "learning_rate": 9.614377108544836e-06, + "loss": 0.7839, + 
"step": 6665 + }, + { + "epoch": 0.5044077030759335, + "grad_norm": 1.8645862340927124, + "learning_rate": 9.612130125981603e-06, + "loss": 0.6441, + "step": 6666 + }, + { + "epoch": 0.5044833717982672, + "grad_norm": 2.3227145671844482, + "learning_rate": 9.6098831050848e-06, + "loss": 0.7593, + "step": 6667 + }, + { + "epoch": 0.5045590405206009, + "grad_norm": 2.1270523071289062, + "learning_rate": 9.607636045995145e-06, + "loss": 0.7418, + "step": 6668 + }, + { + "epoch": 0.5046347092429344, + "grad_norm": 2.077716827392578, + "learning_rate": 9.605388948853355e-06, + "loss": 0.7234, + "step": 6669 + }, + { + "epoch": 0.5047103779652681, + "grad_norm": 1.9619203805923462, + "learning_rate": 9.60314181380015e-06, + "loss": 0.6765, + "step": 6670 + }, + { + "epoch": 0.5047860466876016, + "grad_norm": 1.965520977973938, + "learning_rate": 9.600894640976257e-06, + "loss": 0.7541, + "step": 6671 + }, + { + "epoch": 0.5048617154099353, + "grad_norm": 1.9246147871017456, + "learning_rate": 9.598647430522397e-06, + "loss": 0.7266, + "step": 6672 + }, + { + "epoch": 0.504937384132269, + "grad_norm": 2.243699550628662, + "learning_rate": 9.596400182579299e-06, + "loss": 0.6776, + "step": 6673 + }, + { + "epoch": 0.5050130528546025, + "grad_norm": 2.064436912536621, + "learning_rate": 9.594152897287689e-06, + "loss": 0.661, + "step": 6674 + }, + { + "epoch": 0.5050887215769362, + "grad_norm": 2.3579468727111816, + "learning_rate": 9.591905574788305e-06, + "loss": 0.7555, + "step": 6675 + }, + { + "epoch": 0.5051643902992698, + "grad_norm": 2.24985408782959, + "learning_rate": 9.58965821522188e-06, + "loss": 0.6199, + "step": 6676 + }, + { + "epoch": 0.5052400590216034, + "grad_norm": 2.8891539573669434, + "learning_rate": 9.587410818729151e-06, + "loss": 0.7357, + "step": 6677 + }, + { + "epoch": 0.5053157277439371, + "grad_norm": 1.8925280570983887, + "learning_rate": 9.585163385450857e-06, + "loss": 0.6271, + "step": 6678 + }, + { + "epoch": 0.5053913964662706, + "grad_norm": 2.103595495223999, + "learning_rate": 9.582915915527738e-06, + "loss": 0.7584, + "step": 6679 + }, + { + "epoch": 0.5054670651886043, + "grad_norm": 1.9762611389160156, + "learning_rate": 9.580668409100539e-06, + "loss": 0.6955, + "step": 6680 + }, + { + "epoch": 0.505542733910938, + "grad_norm": 1.8071695566177368, + "learning_rate": 9.578420866310004e-06, + "loss": 0.7404, + "step": 6681 + }, + { + "epoch": 0.5056184026332715, + "grad_norm": 1.5950270891189575, + "learning_rate": 9.576173287296889e-06, + "loss": 0.8587, + "step": 6682 + }, + { + "epoch": 0.5056940713556052, + "grad_norm": 1.9475131034851074, + "learning_rate": 9.573925672201932e-06, + "loss": 0.7747, + "step": 6683 + }, + { + "epoch": 0.5057697400779387, + "grad_norm": 1.9169731140136719, + "learning_rate": 9.5716780211659e-06, + "loss": 0.5432, + "step": 6684 + }, + { + "epoch": 0.5058454088002724, + "grad_norm": 2.5060229301452637, + "learning_rate": 9.569430334329538e-06, + "loss": 0.6395, + "step": 6685 + }, + { + "epoch": 0.5059210775226061, + "grad_norm": 1.836045503616333, + "learning_rate": 9.567182611833605e-06, + "loss": 0.7145, + "step": 6686 + }, + { + "epoch": 0.5059967462449396, + "grad_norm": 1.950330138206482, + "learning_rate": 9.564934853818867e-06, + "loss": 0.6531, + "step": 6687 + }, + { + "epoch": 0.5060724149672733, + "grad_norm": 2.326371192932129, + "learning_rate": 9.562687060426075e-06, + "loss": 0.7988, + "step": 6688 + }, + { + "epoch": 0.506148083689607, + "grad_norm": 1.9932469129562378, + "learning_rate": 
9.560439231796005e-06, + "loss": 0.6736, + "step": 6689 + }, + { + "epoch": 0.5062237524119405, + "grad_norm": 1.9955806732177734, + "learning_rate": 9.558191368069414e-06, + "loss": 0.6104, + "step": 6690 + }, + { + "epoch": 0.5062994211342742, + "grad_norm": 2.626847505569458, + "learning_rate": 9.555943469387074e-06, + "loss": 0.7226, + "step": 6691 + }, + { + "epoch": 0.5063750898566077, + "grad_norm": 2.498826026916504, + "learning_rate": 9.553695535889759e-06, + "loss": 0.7776, + "step": 6692 + }, + { + "epoch": 0.5064507585789414, + "grad_norm": 2.252244710922241, + "learning_rate": 9.551447567718236e-06, + "loss": 0.9246, + "step": 6693 + }, + { + "epoch": 0.5065264273012751, + "grad_norm": 2.162478446960449, + "learning_rate": 9.549199565013286e-06, + "loss": 0.6745, + "step": 6694 + }, + { + "epoch": 0.5066020960236086, + "grad_norm": 2.2146854400634766, + "learning_rate": 9.546951527915681e-06, + "loss": 0.834, + "step": 6695 + }, + { + "epoch": 0.5066777647459423, + "grad_norm": 2.3544445037841797, + "learning_rate": 9.5447034565662e-06, + "loss": 0.8678, + "step": 6696 + }, + { + "epoch": 0.5067534334682758, + "grad_norm": 1.949468970298767, + "learning_rate": 9.54245535110563e-06, + "loss": 0.6463, + "step": 6697 + }, + { + "epoch": 0.5068291021906095, + "grad_norm": 2.0964138507843018, + "learning_rate": 9.540207211674751e-06, + "loss": 0.8263, + "step": 6698 + }, + { + "epoch": 0.5069047709129432, + "grad_norm": 1.5557283163070679, + "learning_rate": 9.537959038414345e-06, + "loss": 0.6854, + "step": 6699 + }, + { + "epoch": 0.5069804396352767, + "grad_norm": 1.7753989696502686, + "learning_rate": 9.53571083146521e-06, + "loss": 0.5893, + "step": 6700 + }, + { + "epoch": 0.5070561083576104, + "grad_norm": 1.9944871664047241, + "learning_rate": 9.533462590968125e-06, + "loss": 0.7938, + "step": 6701 + }, + { + "epoch": 0.5071317770799441, + "grad_norm": 2.079556465148926, + "learning_rate": 9.531214317063891e-06, + "loss": 0.7135, + "step": 6702 + }, + { + "epoch": 0.5072074458022776, + "grad_norm": 2.094221591949463, + "learning_rate": 9.528966009893297e-06, + "loss": 0.8092, + "step": 6703 + }, + { + "epoch": 0.5072831145246113, + "grad_norm": 2.0519769191741943, + "learning_rate": 9.526717669597139e-06, + "loss": 0.7803, + "step": 6704 + }, + { + "epoch": 0.5073587832469448, + "grad_norm": 1.9500595331192017, + "learning_rate": 9.52446929631622e-06, + "loss": 0.6599, + "step": 6705 + }, + { + "epoch": 0.5074344519692785, + "grad_norm": 2.6499404907226562, + "learning_rate": 9.522220890191332e-06, + "loss": 0.6721, + "step": 6706 + }, + { + "epoch": 0.5075101206916122, + "grad_norm": 1.9413082599639893, + "learning_rate": 9.51997245136329e-06, + "loss": 0.6661, + "step": 6707 + }, + { + "epoch": 0.5075857894139457, + "grad_norm": 1.8632111549377441, + "learning_rate": 9.517723979972886e-06, + "loss": 0.6466, + "step": 6708 + }, + { + "epoch": 0.5076614581362794, + "grad_norm": 2.5629031658172607, + "learning_rate": 9.515475476160934e-06, + "loss": 0.7257, + "step": 6709 + }, + { + "epoch": 0.5077371268586129, + "grad_norm": 2.516348361968994, + "learning_rate": 9.513226940068241e-06, + "loss": 0.8455, + "step": 6710 + }, + { + "epoch": 0.5078127955809466, + "grad_norm": 2.427999973297119, + "learning_rate": 9.510978371835613e-06, + "loss": 0.7796, + "step": 6711 + }, + { + "epoch": 0.5078884643032803, + "grad_norm": 2.137712001800537, + "learning_rate": 9.508729771603872e-06, + "loss": 0.7222, + "step": 6712 + }, + { + "epoch": 0.5079641330256138, + "grad_norm": 
2.6136555671691895, + "learning_rate": 9.506481139513824e-06, + "loss": 0.8969, + "step": 6713 + }, + { + "epoch": 0.5080398017479475, + "grad_norm": 2.343489646911621, + "learning_rate": 9.50423247570629e-06, + "loss": 0.7039, + "step": 6714 + }, + { + "epoch": 0.5081154704702812, + "grad_norm": 2.309749126434326, + "learning_rate": 9.501983780322088e-06, + "loss": 0.7666, + "step": 6715 + }, + { + "epoch": 0.5081911391926147, + "grad_norm": 2.360842704772949, + "learning_rate": 9.499735053502037e-06, + "loss": 0.772, + "step": 6716 + }, + { + "epoch": 0.5082668079149484, + "grad_norm": 1.918006181716919, + "learning_rate": 9.497486295386962e-06, + "loss": 0.7022, + "step": 6717 + }, + { + "epoch": 0.5083424766372819, + "grad_norm": 2.2111973762512207, + "learning_rate": 9.495237506117685e-06, + "loss": 0.5871, + "step": 6718 + }, + { + "epoch": 0.5084181453596156, + "grad_norm": 2.214198350906372, + "learning_rate": 9.492988685835031e-06, + "loss": 0.6968, + "step": 6719 + }, + { + "epoch": 0.5084938140819493, + "grad_norm": 1.9260320663452148, + "learning_rate": 9.490739834679834e-06, + "loss": 0.8053, + "step": 6720 + }, + { + "epoch": 0.5085694828042828, + "grad_norm": 2.5409858226776123, + "learning_rate": 9.488490952792924e-06, + "loss": 0.6891, + "step": 6721 + }, + { + "epoch": 0.5086451515266165, + "grad_norm": 2.6639885902404785, + "learning_rate": 9.486242040315125e-06, + "loss": 0.6264, + "step": 6722 + }, + { + "epoch": 0.50872082024895, + "grad_norm": 2.6699607372283936, + "learning_rate": 9.48399309738728e-06, + "loss": 0.7295, + "step": 6723 + }, + { + "epoch": 0.5087964889712837, + "grad_norm": 1.702734351158142, + "learning_rate": 9.481744124150222e-06, + "loss": 0.705, + "step": 6724 + }, + { + "epoch": 0.5088721576936174, + "grad_norm": 2.3843774795532227, + "learning_rate": 9.479495120744786e-06, + "loss": 0.7714, + "step": 6725 + }, + { + "epoch": 0.5089478264159509, + "grad_norm": 2.0906484127044678, + "learning_rate": 9.477246087311816e-06, + "loss": 0.7696, + "step": 6726 + }, + { + "epoch": 0.5090234951382846, + "grad_norm": 2.4674184322357178, + "learning_rate": 9.474997023992152e-06, + "loss": 0.7912, + "step": 6727 + }, + { + "epoch": 0.5090991638606183, + "grad_norm": 1.9919297695159912, + "learning_rate": 9.472747930926641e-06, + "loss": 0.6566, + "step": 6728 + }, + { + "epoch": 0.5091748325829518, + "grad_norm": 1.8796985149383545, + "learning_rate": 9.470498808256121e-06, + "loss": 0.6215, + "step": 6729 + }, + { + "epoch": 0.5092505013052855, + "grad_norm": 2.5070700645446777, + "learning_rate": 9.46824965612145e-06, + "loss": 0.7519, + "step": 6730 + }, + { + "epoch": 0.509326170027619, + "grad_norm": 3.2498910427093506, + "learning_rate": 9.466000474663466e-06, + "loss": 0.6886, + "step": 6731 + }, + { + "epoch": 0.5094018387499527, + "grad_norm": 2.552865743637085, + "learning_rate": 9.463751264023028e-06, + "loss": 0.6306, + "step": 6732 + }, + { + "epoch": 0.5094775074722864, + "grad_norm": 2.403729200363159, + "learning_rate": 9.461502024340982e-06, + "loss": 0.6939, + "step": 6733 + }, + { + "epoch": 0.5095531761946199, + "grad_norm": 1.9017412662506104, + "learning_rate": 9.45925275575819e-06, + "loss": 0.8801, + "step": 6734 + }, + { + "epoch": 0.5096288449169536, + "grad_norm": 2.5211994647979736, + "learning_rate": 9.457003458415504e-06, + "loss": 0.6766, + "step": 6735 + }, + { + "epoch": 0.5097045136392871, + "grad_norm": 1.963262915611267, + "learning_rate": 9.454754132453783e-06, + "loss": 0.6982, + "step": 6736 + }, + { + "epoch": 
0.5097801823616208, + "grad_norm": 2.2367377281188965, + "learning_rate": 9.452504778013888e-06, + "loss": 0.807, + "step": 6737 + }, + { + "epoch": 0.5098558510839545, + "grad_norm": 2.5869593620300293, + "learning_rate": 9.450255395236678e-06, + "loss": 0.7474, + "step": 6738 + }, + { + "epoch": 0.509931519806288, + "grad_norm": 3.3216679096221924, + "learning_rate": 9.448005984263022e-06, + "loss": 0.581, + "step": 6739 + }, + { + "epoch": 0.5100071885286217, + "grad_norm": 3.6230344772338867, + "learning_rate": 9.44575654523378e-06, + "loss": 0.7932, + "step": 6740 + }, + { + "epoch": 0.5100828572509554, + "grad_norm": 2.0393731594085693, + "learning_rate": 9.443507078289822e-06, + "loss": 0.7265, + "step": 6741 + }, + { + "epoch": 0.5101585259732889, + "grad_norm": 2.0096278190612793, + "learning_rate": 9.441257583572017e-06, + "loss": 0.7329, + "step": 6742 + }, + { + "epoch": 0.5102341946956226, + "grad_norm": 2.389496088027954, + "learning_rate": 9.439008061221235e-06, + "loss": 0.752, + "step": 6743 + }, + { + "epoch": 0.5103098634179561, + "grad_norm": 2.3090922832489014, + "learning_rate": 9.436758511378348e-06, + "loss": 0.7655, + "step": 6744 + }, + { + "epoch": 0.5103855321402898, + "grad_norm": 1.957068920135498, + "learning_rate": 9.434508934184228e-06, + "loss": 0.6015, + "step": 6745 + }, + { + "epoch": 0.5104612008626235, + "grad_norm": 2.256654739379883, + "learning_rate": 9.432259329779758e-06, + "loss": 0.7901, + "step": 6746 + }, + { + "epoch": 0.510536869584957, + "grad_norm": 2.0302345752716064, + "learning_rate": 9.430009698305804e-06, + "loss": 0.8028, + "step": 6747 + }, + { + "epoch": 0.5106125383072907, + "grad_norm": 2.3182132244110107, + "learning_rate": 9.427760039903258e-06, + "loss": 0.6554, + "step": 6748 + }, + { + "epoch": 0.5106882070296243, + "grad_norm": 2.3157958984375, + "learning_rate": 9.425510354712992e-06, + "loss": 0.6733, + "step": 6749 + }, + { + "epoch": 0.5107638757519579, + "grad_norm": 1.7436319589614868, + "learning_rate": 9.423260642875892e-06, + "loss": 0.7008, + "step": 6750 + }, + { + "epoch": 0.5108395444742916, + "grad_norm": 2.456576108932495, + "learning_rate": 9.421010904532843e-06, + "loss": 0.5907, + "step": 6751 + }, + { + "epoch": 0.5109152131966251, + "grad_norm": 2.08577561378479, + "learning_rate": 9.418761139824726e-06, + "loss": 0.6606, + "step": 6752 + }, + { + "epoch": 0.5109908819189588, + "grad_norm": 2.5024051666259766, + "learning_rate": 9.416511348892434e-06, + "loss": 0.7737, + "step": 6753 + }, + { + "epoch": 0.5110665506412925, + "grad_norm": 2.1575140953063965, + "learning_rate": 9.414261531876855e-06, + "loss": 0.7219, + "step": 6754 + }, + { + "epoch": 0.511142219363626, + "grad_norm": 1.9586470127105713, + "learning_rate": 9.412011688918878e-06, + "loss": 0.8176, + "step": 6755 + }, + { + "epoch": 0.5112178880859597, + "grad_norm": 2.153289556503296, + "learning_rate": 9.409761820159396e-06, + "loss": 0.8414, + "step": 6756 + }, + { + "epoch": 0.5112935568082932, + "grad_norm": 2.560614585876465, + "learning_rate": 9.407511925739306e-06, + "loss": 0.6309, + "step": 6757 + }, + { + "epoch": 0.5113692255306269, + "grad_norm": 2.1728456020355225, + "learning_rate": 9.405262005799498e-06, + "loss": 0.8317, + "step": 6758 + }, + { + "epoch": 0.5114448942529606, + "grad_norm": 2.0352866649627686, + "learning_rate": 9.403012060480872e-06, + "loss": 0.6428, + "step": 6759 + }, + { + "epoch": 0.5115205629752941, + "grad_norm": 2.638589859008789, + "learning_rate": 9.400762089924329e-06, + "loss": 0.7381, + 
"step": 6760 + }, + { + "epoch": 0.5115962316976278, + "grad_norm": 2.571859836578369, + "learning_rate": 9.398512094270768e-06, + "loss": 0.7103, + "step": 6761 + }, + { + "epoch": 0.5116719004199614, + "grad_norm": 1.9200884103775024, + "learning_rate": 9.396262073661092e-06, + "loss": 0.7302, + "step": 6762 + }, + { + "epoch": 0.511747569142295, + "grad_norm": 2.4631552696228027, + "learning_rate": 9.394012028236199e-06, + "loss": 0.6403, + "step": 6763 + }, + { + "epoch": 0.5118232378646287, + "grad_norm": 2.1766726970672607, + "learning_rate": 9.391761958137e-06, + "loss": 0.8078, + "step": 6764 + }, + { + "epoch": 0.5118989065869622, + "grad_norm": 1.9873024225234985, + "learning_rate": 9.389511863504403e-06, + "loss": 0.6108, + "step": 6765 + }, + { + "epoch": 0.5119745753092959, + "grad_norm": 2.2387278079986572, + "learning_rate": 9.38726174447931e-06, + "loss": 0.806, + "step": 6766 + }, + { + "epoch": 0.5120502440316296, + "grad_norm": 2.1650137901306152, + "learning_rate": 9.385011601202637e-06, + "loss": 0.6439, + "step": 6767 + }, + { + "epoch": 0.5121259127539631, + "grad_norm": 2.864428758621216, + "learning_rate": 9.382761433815289e-06, + "loss": 0.7591, + "step": 6768 + }, + { + "epoch": 0.5122015814762968, + "grad_norm": 2.0299293994903564, + "learning_rate": 9.380511242458185e-06, + "loss": 0.7405, + "step": 6769 + }, + { + "epoch": 0.5122772501986304, + "grad_norm": 2.2550573348999023, + "learning_rate": 9.378261027272231e-06, + "loss": 0.7065, + "step": 6770 + }, + { + "epoch": 0.512352918920964, + "grad_norm": 1.7595993280410767, + "learning_rate": 9.376010788398354e-06, + "loss": 0.7129, + "step": 6771 + }, + { + "epoch": 0.5124285876432977, + "grad_norm": 2.2322864532470703, + "learning_rate": 9.373760525977464e-06, + "loss": 0.7328, + "step": 6772 + }, + { + "epoch": 0.5125042563656312, + "grad_norm": 1.9751027822494507, + "learning_rate": 9.37151024015048e-06, + "loss": 0.6518, + "step": 6773 + }, + { + "epoch": 0.5125799250879649, + "grad_norm": 2.4681308269500732, + "learning_rate": 9.369259931058326e-06, + "loss": 0.6701, + "step": 6774 + }, + { + "epoch": 0.5126555938102985, + "grad_norm": 2.20025372505188, + "learning_rate": 9.367009598841916e-06, + "loss": 0.7454, + "step": 6775 + }, + { + "epoch": 0.5127312625326321, + "grad_norm": 2.411095380783081, + "learning_rate": 9.36475924364218e-06, + "loss": 0.8019, + "step": 6776 + }, + { + "epoch": 0.5128069312549658, + "grad_norm": 2.401850938796997, + "learning_rate": 9.362508865600039e-06, + "loss": 0.6141, + "step": 6777 + }, + { + "epoch": 0.5128825999772993, + "grad_norm": 2.024711847305298, + "learning_rate": 9.360258464856422e-06, + "loss": 0.6127, + "step": 6778 + }, + { + "epoch": 0.512958268699633, + "grad_norm": 2.656041145324707, + "learning_rate": 9.358008041552254e-06, + "loss": 0.7047, + "step": 6779 + }, + { + "epoch": 0.5130339374219667, + "grad_norm": 2.6526777744293213, + "learning_rate": 9.355757595828464e-06, + "loss": 0.831, + "step": 6780 + }, + { + "epoch": 0.5131096061443002, + "grad_norm": 2.158024787902832, + "learning_rate": 9.353507127825985e-06, + "loss": 0.6814, + "step": 6781 + }, + { + "epoch": 0.5131852748666339, + "grad_norm": 2.018216848373413, + "learning_rate": 9.351256637685745e-06, + "loss": 0.6801, + "step": 6782 + }, + { + "epoch": 0.5132609435889675, + "grad_norm": 2.381420135498047, + "learning_rate": 9.349006125548676e-06, + "loss": 0.708, + "step": 6783 + }, + { + "epoch": 0.5133366123113011, + "grad_norm": 2.565141201019287, + "learning_rate": 
9.346755591555718e-06, + "loss": 0.7645, + "step": 6784 + }, + { + "epoch": 0.5134122810336348, + "grad_norm": 2.3605058193206787, + "learning_rate": 9.344505035847804e-06, + "loss": 0.6184, + "step": 6785 + }, + { + "epoch": 0.5134879497559683, + "grad_norm": 2.587228536605835, + "learning_rate": 9.342254458565865e-06, + "loss": 0.7598, + "step": 6786 + }, + { + "epoch": 0.513563618478302, + "grad_norm": 1.9587780237197876, + "learning_rate": 9.34000385985085e-06, + "loss": 0.7085, + "step": 6787 + }, + { + "epoch": 0.5136392872006356, + "grad_norm": 1.9432623386383057, + "learning_rate": 9.337753239843691e-06, + "loss": 0.6803, + "step": 6788 + }, + { + "epoch": 0.5137149559229692, + "grad_norm": 2.544887065887451, + "learning_rate": 9.335502598685333e-06, + "loss": 0.8014, + "step": 6789 + }, + { + "epoch": 0.5137906246453029, + "grad_norm": 2.4957051277160645, + "learning_rate": 9.333251936516718e-06, + "loss": 0.7523, + "step": 6790 + }, + { + "epoch": 0.5138662933676365, + "grad_norm": 2.6802759170532227, + "learning_rate": 9.331001253478786e-06, + "loss": 0.6507, + "step": 6791 + }, + { + "epoch": 0.5139419620899701, + "grad_norm": 2.0066659450531006, + "learning_rate": 9.328750549712486e-06, + "loss": 0.675, + "step": 6792 + }, + { + "epoch": 0.5140176308123038, + "grad_norm": 2.4365880489349365, + "learning_rate": 9.326499825358763e-06, + "loss": 0.6524, + "step": 6793 + }, + { + "epoch": 0.5140932995346373, + "grad_norm": 1.9508459568023682, + "learning_rate": 9.324249080558565e-06, + "loss": 0.7767, + "step": 6794 + }, + { + "epoch": 0.514168968256971, + "grad_norm": 2.4417145252227783, + "learning_rate": 9.321998315452841e-06, + "loss": 0.7902, + "step": 6795 + }, + { + "epoch": 0.5142446369793046, + "grad_norm": 2.5067169666290283, + "learning_rate": 9.319747530182542e-06, + "loss": 0.7099, + "step": 6796 + }, + { + "epoch": 0.5143203057016382, + "grad_norm": 2.2060787677764893, + "learning_rate": 9.31749672488862e-06, + "loss": 0.6742, + "step": 6797 + }, + { + "epoch": 0.5143959744239719, + "grad_norm": 2.209672689437866, + "learning_rate": 9.315245899712022e-06, + "loss": 0.594, + "step": 6798 + }, + { + "epoch": 0.5144716431463054, + "grad_norm": 2.720317840576172, + "learning_rate": 9.312995054793708e-06, + "loss": 0.8389, + "step": 6799 + }, + { + "epoch": 0.5145473118686391, + "grad_norm": 2.3275961875915527, + "learning_rate": 9.310744190274631e-06, + "loss": 0.7015, + "step": 6800 + }, + { + "epoch": 0.5146229805909727, + "grad_norm": 2.169250726699829, + "learning_rate": 9.308493306295748e-06, + "loss": 0.6436, + "step": 6801 + }, + { + "epoch": 0.5146986493133063, + "grad_norm": 2.38690185546875, + "learning_rate": 9.306242402998016e-06, + "loss": 0.8256, + "step": 6802 + }, + { + "epoch": 0.51477431803564, + "grad_norm": 2.143653631210327, + "learning_rate": 9.303991480522397e-06, + "loss": 0.727, + "step": 6803 + }, + { + "epoch": 0.5148499867579736, + "grad_norm": 2.64727783203125, + "learning_rate": 9.301740539009845e-06, + "loss": 0.7609, + "step": 6804 + }, + { + "epoch": 0.5149256554803072, + "grad_norm": 2.108668565750122, + "learning_rate": 9.299489578601326e-06, + "loss": 0.7427, + "step": 6805 + }, + { + "epoch": 0.5150013242026409, + "grad_norm": 2.263934850692749, + "learning_rate": 9.2972385994378e-06, + "loss": 0.7691, + "step": 6806 + }, + { + "epoch": 0.5150769929249744, + "grad_norm": 2.89751935005188, + "learning_rate": 9.294987601660231e-06, + "loss": 0.7268, + "step": 6807 + }, + { + "epoch": 0.5151526616473081, + "grad_norm": 
2.1842520236968994, + "learning_rate": 9.292736585409588e-06, + "loss": 0.7398, + "step": 6808 + }, + { + "epoch": 0.5152283303696417, + "grad_norm": 2.5846338272094727, + "learning_rate": 9.290485550826828e-06, + "loss": 0.7019, + "step": 6809 + }, + { + "epoch": 0.5153039990919753, + "grad_norm": 2.2591192722320557, + "learning_rate": 9.288234498052927e-06, + "loss": 0.7099, + "step": 6810 + }, + { + "epoch": 0.515379667814309, + "grad_norm": 2.2069311141967773, + "learning_rate": 9.285983427228849e-06, + "loss": 0.658, + "step": 6811 + }, + { + "epoch": 0.5154553365366425, + "grad_norm": 2.1312320232391357, + "learning_rate": 9.283732338495562e-06, + "loss": 0.7175, + "step": 6812 + }, + { + "epoch": 0.5155310052589762, + "grad_norm": 3.7413275241851807, + "learning_rate": 9.28148123199404e-06, + "loss": 0.6437, + "step": 6813 + }, + { + "epoch": 0.5156066739813098, + "grad_norm": 2.0996272563934326, + "learning_rate": 9.27923010786525e-06, + "loss": 0.6743, + "step": 6814 + }, + { + "epoch": 0.5156823427036434, + "grad_norm": 2.2065136432647705, + "learning_rate": 9.27697896625017e-06, + "loss": 0.7956, + "step": 6815 + }, + { + "epoch": 0.5157580114259771, + "grad_norm": 2.435471534729004, + "learning_rate": 9.274727807289772e-06, + "loss": 0.5638, + "step": 6816 + }, + { + "epoch": 0.5158336801483107, + "grad_norm": 2.6238391399383545, + "learning_rate": 9.272476631125027e-06, + "loss": 0.8299, + "step": 6817 + }, + { + "epoch": 0.5159093488706443, + "grad_norm": 2.0177040100097656, + "learning_rate": 9.270225437896916e-06, + "loss": 0.7226, + "step": 6818 + }, + { + "epoch": 0.515985017592978, + "grad_norm": 2.575056552886963, + "learning_rate": 9.267974227746415e-06, + "loss": 0.6647, + "step": 6819 + }, + { + "epoch": 0.5160606863153115, + "grad_norm": 2.2004969120025635, + "learning_rate": 9.265723000814501e-06, + "loss": 0.739, + "step": 6820 + }, + { + "epoch": 0.5161363550376452, + "grad_norm": 1.8651002645492554, + "learning_rate": 9.263471757242153e-06, + "loss": 0.6448, + "step": 6821 + }, + { + "epoch": 0.5162120237599788, + "grad_norm": 2.0674023628234863, + "learning_rate": 9.261220497170349e-06, + "loss": 0.6993, + "step": 6822 + }, + { + "epoch": 0.5162876924823124, + "grad_norm": 2.2824978828430176, + "learning_rate": 9.258969220740075e-06, + "loss": 0.7217, + "step": 6823 + }, + { + "epoch": 0.5163633612046461, + "grad_norm": 2.708824634552002, + "learning_rate": 9.25671792809231e-06, + "loss": 0.6483, + "step": 6824 + }, + { + "epoch": 0.5164390299269797, + "grad_norm": 4.072855472564697, + "learning_rate": 9.254466619368038e-06, + "loss": 0.5936, + "step": 6825 + }, + { + "epoch": 0.5165146986493133, + "grad_norm": 8.13134765625, + "learning_rate": 9.252215294708247e-06, + "loss": 0.7496, + "step": 6826 + }, + { + "epoch": 0.5165903673716469, + "grad_norm": 2.2548294067382812, + "learning_rate": 9.249963954253913e-06, + "loss": 0.5736, + "step": 6827 + }, + { + "epoch": 0.5166660360939805, + "grad_norm": 2.1357603073120117, + "learning_rate": 9.247712598146028e-06, + "loss": 0.7561, + "step": 6828 + }, + { + "epoch": 0.5167417048163142, + "grad_norm": 1.8459572792053223, + "learning_rate": 9.245461226525584e-06, + "loss": 0.6388, + "step": 6829 + }, + { + "epoch": 0.5168173735386478, + "grad_norm": 2.7423040866851807, + "learning_rate": 9.24320983953356e-06, + "loss": 0.8378, + "step": 6830 + }, + { + "epoch": 0.5168930422609814, + "grad_norm": 2.403897523880005, + "learning_rate": 9.24095843731095e-06, + "loss": 0.603, + "step": 6831 + }, + { + "epoch": 
0.5169687109833151, + "grad_norm": 2.0380334854125977, + "learning_rate": 9.23870701999874e-06, + "loss": 0.6979, + "step": 6832 + }, + { + "epoch": 0.5170443797056486, + "grad_norm": 2.8560407161712646, + "learning_rate": 9.23645558773793e-06, + "loss": 0.7479, + "step": 6833 + }, + { + "epoch": 0.5171200484279823, + "grad_norm": 2.2404606342315674, + "learning_rate": 9.234204140669502e-06, + "loss": 0.5442, + "step": 6834 + }, + { + "epoch": 0.5171957171503159, + "grad_norm": 1.846641182899475, + "learning_rate": 9.231952678934456e-06, + "loss": 0.8732, + "step": 6835 + }, + { + "epoch": 0.5172713858726495, + "grad_norm": 2.014122486114502, + "learning_rate": 9.229701202673781e-06, + "loss": 0.6284, + "step": 6836 + }, + { + "epoch": 0.5173470545949832, + "grad_norm": 1.9352043867111206, + "learning_rate": 9.227449712028475e-06, + "loss": 0.7152, + "step": 6837 + }, + { + "epoch": 0.5174227233173168, + "grad_norm": 2.4555435180664062, + "learning_rate": 9.225198207139533e-06, + "loss": 0.6324, + "step": 6838 + }, + { + "epoch": 0.5174983920396504, + "grad_norm": 2.332766056060791, + "learning_rate": 9.222946688147949e-06, + "loss": 0.668, + "step": 6839 + }, + { + "epoch": 0.5175740607619841, + "grad_norm": 2.4688680171966553, + "learning_rate": 9.220695155194724e-06, + "loss": 0.7283, + "step": 6840 + }, + { + "epoch": 0.5176497294843176, + "grad_norm": 2.0865702629089355, + "learning_rate": 9.218443608420855e-06, + "loss": 0.6561, + "step": 6841 + }, + { + "epoch": 0.5177253982066513, + "grad_norm": 1.7171027660369873, + "learning_rate": 9.216192047967337e-06, + "loss": 0.537, + "step": 6842 + }, + { + "epoch": 0.5178010669289849, + "grad_norm": 2.2573938369750977, + "learning_rate": 9.213940473975178e-06, + "loss": 0.7413, + "step": 6843 + }, + { + "epoch": 0.5178767356513185, + "grad_norm": 1.9172108173370361, + "learning_rate": 9.211688886585373e-06, + "loss": 0.5654, + "step": 6844 + }, + { + "epoch": 0.5179524043736522, + "grad_norm": 2.324502468109131, + "learning_rate": 9.209437285938926e-06, + "loss": 0.6488, + "step": 6845 + }, + { + "epoch": 0.5180280730959858, + "grad_norm": 1.878939151763916, + "learning_rate": 9.207185672176837e-06, + "loss": 0.7264, + "step": 6846 + }, + { + "epoch": 0.5181037418183194, + "grad_norm": 2.321331262588501, + "learning_rate": 9.204934045440111e-06, + "loss": 0.8104, + "step": 6847 + }, + { + "epoch": 0.518179410540653, + "grad_norm": 2.2414441108703613, + "learning_rate": 9.202682405869753e-06, + "loss": 0.8987, + "step": 6848 + }, + { + "epoch": 0.5182550792629866, + "grad_norm": 2.0336124897003174, + "learning_rate": 9.20043075360677e-06, + "loss": 0.7724, + "step": 6849 + }, + { + "epoch": 0.5183307479853203, + "grad_norm": 2.182743787765503, + "learning_rate": 9.198179088792159e-06, + "loss": 0.8737, + "step": 6850 + }, + { + "epoch": 0.5184064167076539, + "grad_norm": 1.6060758829116821, + "learning_rate": 9.195927411566938e-06, + "loss": 0.6383, + "step": 6851 + }, + { + "epoch": 0.5184820854299875, + "grad_norm": 1.7125988006591797, + "learning_rate": 9.193675722072106e-06, + "loss": 0.6993, + "step": 6852 + }, + { + "epoch": 0.5185577541523212, + "grad_norm": 2.322448968887329, + "learning_rate": 9.191424020448673e-06, + "loss": 0.7036, + "step": 6853 + }, + { + "epoch": 0.5186334228746547, + "grad_norm": 2.939868688583374, + "learning_rate": 9.189172306837653e-06, + "loss": 0.6713, + "step": 6854 + }, + { + "epoch": 0.5187090915969884, + "grad_norm": 4.6412811279296875, + "learning_rate": 9.186920581380045e-06, + "loss": 0.6601, + 
"step": 6855 + }, + { + "epoch": 0.518784760319322, + "grad_norm": 7.659454822540283, + "learning_rate": 9.184668844216872e-06, + "loss": 0.8464, + "step": 6856 + }, + { + "epoch": 0.5188604290416556, + "grad_norm": 6.378112316131592, + "learning_rate": 9.182417095489135e-06, + "loss": 0.7449, + "step": 6857 + }, + { + "epoch": 0.5189360977639893, + "grad_norm": 1.7814310789108276, + "learning_rate": 9.180165335337848e-06, + "loss": 0.7629, + "step": 6858 + }, + { + "epoch": 0.5190117664863229, + "grad_norm": 2.039429187774658, + "learning_rate": 9.177913563904029e-06, + "loss": 0.7624, + "step": 6859 + }, + { + "epoch": 0.5190874352086565, + "grad_norm": 2.5181028842926025, + "learning_rate": 9.175661781328684e-06, + "loss": 0.709, + "step": 6860 + }, + { + "epoch": 0.5191631039309901, + "grad_norm": 2.226398229598999, + "learning_rate": 9.173409987752834e-06, + "loss": 0.6182, + "step": 6861 + }, + { + "epoch": 0.5192387726533237, + "grad_norm": 1.9586197137832642, + "learning_rate": 9.171158183317486e-06, + "loss": 0.6997, + "step": 6862 + }, + { + "epoch": 0.5193144413756574, + "grad_norm": 2.897284984588623, + "learning_rate": 9.16890636816366e-06, + "loss": 0.8039, + "step": 6863 + }, + { + "epoch": 0.519390110097991, + "grad_norm": 2.2634551525115967, + "learning_rate": 9.166654542432372e-06, + "loss": 0.7075, + "step": 6864 + }, + { + "epoch": 0.5194657788203246, + "grad_norm": 3.2109482288360596, + "learning_rate": 9.164402706264635e-06, + "loss": 0.6153, + "step": 6865 + }, + { + "epoch": 0.5195414475426583, + "grad_norm": 2.3147075176239014, + "learning_rate": 9.16215085980147e-06, + "loss": 0.7481, + "step": 6866 + }, + { + "epoch": 0.5196171162649919, + "grad_norm": 5.927982330322266, + "learning_rate": 9.159899003183894e-06, + "loss": 0.6095, + "step": 6867 + }, + { + "epoch": 0.5196927849873255, + "grad_norm": 1.7811951637268066, + "learning_rate": 9.157647136552926e-06, + "loss": 0.8572, + "step": 6868 + }, + { + "epoch": 0.5197684537096591, + "grad_norm": 1.9003556966781616, + "learning_rate": 9.155395260049584e-06, + "loss": 0.7132, + "step": 6869 + }, + { + "epoch": 0.5198441224319927, + "grad_norm": 2.136584520339966, + "learning_rate": 9.153143373814887e-06, + "loss": 0.685, + "step": 6870 + }, + { + "epoch": 0.5199197911543264, + "grad_norm": 3.125502109527588, + "learning_rate": 9.150891477989858e-06, + "loss": 0.8047, + "step": 6871 + }, + { + "epoch": 0.51999545987666, + "grad_norm": 3.0765082836151123, + "learning_rate": 9.148639572715517e-06, + "loss": 0.5735, + "step": 6872 + }, + { + "epoch": 0.5200711285989936, + "grad_norm": 2.166370153427124, + "learning_rate": 9.146387658132881e-06, + "loss": 0.8376, + "step": 6873 + }, + { + "epoch": 0.5201467973213272, + "grad_norm": 2.2477190494537354, + "learning_rate": 9.144135734382983e-06, + "loss": 0.7244, + "step": 6874 + }, + { + "epoch": 0.5202224660436608, + "grad_norm": 2.6588950157165527, + "learning_rate": 9.141883801606836e-06, + "loss": 0.6419, + "step": 6875 + }, + { + "epoch": 0.5202981347659945, + "grad_norm": 2.5737180709838867, + "learning_rate": 9.139631859945466e-06, + "loss": 0.8748, + "step": 6876 + }, + { + "epoch": 0.5203738034883281, + "grad_norm": 1.9617338180541992, + "learning_rate": 9.137379909539897e-06, + "loss": 0.7089, + "step": 6877 + }, + { + "epoch": 0.5204494722106617, + "grad_norm": 2.4707319736480713, + "learning_rate": 9.135127950531153e-06, + "loss": 0.6812, + "step": 6878 + }, + { + "epoch": 0.5205251409329954, + "grad_norm": 1.5463811159133911, + "learning_rate": 
9.132875983060262e-06, + "loss": 0.8144, + "step": 6879 + }, + { + "epoch": 0.520600809655329, + "grad_norm": 2.1439154148101807, + "learning_rate": 9.130624007268247e-06, + "loss": 0.7185, + "step": 6880 + }, + { + "epoch": 0.5206764783776626, + "grad_norm": 2.178358554840088, + "learning_rate": 9.128372023296132e-06, + "loss": 0.7119, + "step": 6881 + }, + { + "epoch": 0.5207521470999962, + "grad_norm": 2.2699880599975586, + "learning_rate": 9.126120031284947e-06, + "loss": 0.7, + "step": 6882 + }, + { + "epoch": 0.5208278158223298, + "grad_norm": 2.0759565830230713, + "learning_rate": 9.123868031375716e-06, + "loss": 0.6736, + "step": 6883 + }, + { + "epoch": 0.5209034845446635, + "grad_norm": 2.5509610176086426, + "learning_rate": 9.121616023709473e-06, + "loss": 0.8851, + "step": 6884 + }, + { + "epoch": 0.5209791532669971, + "grad_norm": 2.314539909362793, + "learning_rate": 9.119364008427239e-06, + "loss": 0.8735, + "step": 6885 + }, + { + "epoch": 0.5210548219893307, + "grad_norm": 1.9383503198623657, + "learning_rate": 9.117111985670043e-06, + "loss": 0.7673, + "step": 6886 + }, + { + "epoch": 0.5211304907116643, + "grad_norm": 2.7739522457122803, + "learning_rate": 9.114859955578916e-06, + "loss": 0.736, + "step": 6887 + }, + { + "epoch": 0.521206159433998, + "grad_norm": 1.9783974885940552, + "learning_rate": 9.112607918294887e-06, + "loss": 0.868, + "step": 6888 + }, + { + "epoch": 0.5212818281563316, + "grad_norm": 2.2085728645324707, + "learning_rate": 9.110355873958987e-06, + "loss": 0.7107, + "step": 6889 + }, + { + "epoch": 0.5213574968786652, + "grad_norm": 2.447173833847046, + "learning_rate": 9.108103822712246e-06, + "loss": 0.7465, + "step": 6890 + }, + { + "epoch": 0.5214331656009988, + "grad_norm": 2.1307895183563232, + "learning_rate": 9.105851764695691e-06, + "loss": 0.6948, + "step": 6891 + }, + { + "epoch": 0.5215088343233325, + "grad_norm": 2.2277281284332275, + "learning_rate": 9.103599700050358e-06, + "loss": 0.6859, + "step": 6892 + }, + { + "epoch": 0.5215845030456661, + "grad_norm": 1.6344878673553467, + "learning_rate": 9.101347628917278e-06, + "loss": 0.6701, + "step": 6893 + }, + { + "epoch": 0.5216601717679997, + "grad_norm": 2.285489797592163, + "learning_rate": 9.09909555143748e-06, + "loss": 0.5954, + "step": 6894 + }, + { + "epoch": 0.5217358404903333, + "grad_norm": 2.104238986968994, + "learning_rate": 9.096843467751999e-06, + "loss": 0.6608, + "step": 6895 + }, + { + "epoch": 0.521811509212667, + "grad_norm": 2.4383769035339355, + "learning_rate": 9.094591378001864e-06, + "loss": 0.7292, + "step": 6896 + }, + { + "epoch": 0.5218871779350006, + "grad_norm": 2.041077136993408, + "learning_rate": 9.092339282328115e-06, + "loss": 0.7528, + "step": 6897 + }, + { + "epoch": 0.5219628466573342, + "grad_norm": 2.0433545112609863, + "learning_rate": 9.09008718087178e-06, + "loss": 0.5971, + "step": 6898 + }, + { + "epoch": 0.5220385153796678, + "grad_norm": 2.3326566219329834, + "learning_rate": 9.087835073773893e-06, + "loss": 0.6639, + "step": 6899 + }, + { + "epoch": 0.5221141841020014, + "grad_norm": 1.7605972290039062, + "learning_rate": 9.085582961175493e-06, + "loss": 0.5656, + "step": 6900 + }, + { + "epoch": 0.522189852824335, + "grad_norm": 2.2532219886779785, + "learning_rate": 9.083330843217606e-06, + "loss": 0.6726, + "step": 6901 + }, + { + "epoch": 0.5222655215466687, + "grad_norm": 2.456960678100586, + "learning_rate": 9.081078720041277e-06, + "loss": 0.8113, + "step": 6902 + }, + { + "epoch": 0.5223411902690023, + "grad_norm": 
2.177849292755127, + "learning_rate": 9.078826591787532e-06, + "loss": 0.6898, + "step": 6903 + }, + { + "epoch": 0.5224168589913359, + "grad_norm": 2.2699694633483887, + "learning_rate": 9.076574458597413e-06, + "loss": 0.7005, + "step": 6904 + }, + { + "epoch": 0.5224925277136696, + "grad_norm": 2.6185245513916016, + "learning_rate": 9.074322320611954e-06, + "loss": 0.6497, + "step": 6905 + }, + { + "epoch": 0.5225681964360032, + "grad_norm": 2.110748767852783, + "learning_rate": 9.07207017797219e-06, + "loss": 0.8304, + "step": 6906 + }, + { + "epoch": 0.5226438651583368, + "grad_norm": 2.2064247131347656, + "learning_rate": 9.069818030819162e-06, + "loss": 0.714, + "step": 6907 + }, + { + "epoch": 0.5227195338806704, + "grad_norm": 1.88336181640625, + "learning_rate": 9.067565879293898e-06, + "loss": 0.8837, + "step": 6908 + }, + { + "epoch": 0.522795202603004, + "grad_norm": 2.301023244857788, + "learning_rate": 9.065313723537443e-06, + "loss": 0.8356, + "step": 6909 + }, + { + "epoch": 0.5228708713253377, + "grad_norm": 2.310317039489746, + "learning_rate": 9.06306156369083e-06, + "loss": 0.8133, + "step": 6910 + }, + { + "epoch": 0.5229465400476713, + "grad_norm": 2.0655336380004883, + "learning_rate": 9.060809399895099e-06, + "loss": 0.6615, + "step": 6911 + }, + { + "epoch": 0.5230222087700049, + "grad_norm": 2.006929874420166, + "learning_rate": 9.058557232291283e-06, + "loss": 0.6776, + "step": 6912 + }, + { + "epoch": 0.5230978774923385, + "grad_norm": 2.7849390506744385, + "learning_rate": 9.05630506102043e-06, + "loss": 0.6462, + "step": 6913 + }, + { + "epoch": 0.5231735462146722, + "grad_norm": 1.9213398694992065, + "learning_rate": 9.054052886223568e-06, + "loss": 0.6398, + "step": 6914 + }, + { + "epoch": 0.5232492149370058, + "grad_norm": 2.0991053581237793, + "learning_rate": 9.051800708041741e-06, + "loss": 0.6703, + "step": 6915 + }, + { + "epoch": 0.5233248836593394, + "grad_norm": 1.9303232431411743, + "learning_rate": 9.049548526615986e-06, + "loss": 0.7027, + "step": 6916 + }, + { + "epoch": 0.523400552381673, + "grad_norm": 1.928173303604126, + "learning_rate": 9.04729634208734e-06, + "loss": 0.7511, + "step": 6917 + }, + { + "epoch": 0.5234762211040067, + "grad_norm": 2.1134893894195557, + "learning_rate": 9.045044154596846e-06, + "loss": 0.6348, + "step": 6918 + }, + { + "epoch": 0.5235518898263403, + "grad_norm": 2.236544609069824, + "learning_rate": 9.04279196428554e-06, + "loss": 0.7813, + "step": 6919 + }, + { + "epoch": 0.5236275585486739, + "grad_norm": 2.3019282817840576, + "learning_rate": 9.040539771294464e-06, + "loss": 0.8056, + "step": 6920 + }, + { + "epoch": 0.5237032272710075, + "grad_norm": 2.164768934249878, + "learning_rate": 9.038287575764656e-06, + "loss": 0.7114, + "step": 6921 + }, + { + "epoch": 0.5237788959933412, + "grad_norm": 2.2749412059783936, + "learning_rate": 9.036035377837156e-06, + "loss": 0.8045, + "step": 6922 + }, + { + "epoch": 0.5238545647156748, + "grad_norm": 1.8921995162963867, + "learning_rate": 9.033783177653006e-06, + "loss": 0.9092, + "step": 6923 + }, + { + "epoch": 0.5239302334380084, + "grad_norm": 1.9506926536560059, + "learning_rate": 9.03153097535324e-06, + "loss": 0.6368, + "step": 6924 + }, + { + "epoch": 0.524005902160342, + "grad_norm": 2.1694705486297607, + "learning_rate": 9.029278771078905e-06, + "loss": 0.809, + "step": 6925 + }, + { + "epoch": 0.5240815708826756, + "grad_norm": 4.481318950653076, + "learning_rate": 9.02702656497104e-06, + "loss": 0.7019, + "step": 6926 + }, + { + "epoch": 
0.5241572396050093, + "grad_norm": 1.8642576932907104, + "learning_rate": 9.024774357170681e-06, + "loss": 0.7031, + "step": 6927 + }, + { + "epoch": 0.5242329083273429, + "grad_norm": 1.5886856317520142, + "learning_rate": 9.022522147818873e-06, + "loss": 0.6229, + "step": 6928 + }, + { + "epoch": 0.5243085770496765, + "grad_norm": 2.0907092094421387, + "learning_rate": 9.020269937056657e-06, + "loss": 0.6835, + "step": 6929 + }, + { + "epoch": 0.5243842457720101, + "grad_norm": 2.4200832843780518, + "learning_rate": 9.01801772502507e-06, + "loss": 0.8053, + "step": 6930 + }, + { + "epoch": 0.5244599144943438, + "grad_norm": 1.7767586708068848, + "learning_rate": 9.015765511865156e-06, + "loss": 0.8483, + "step": 6931 + }, + { + "epoch": 0.5245355832166774, + "grad_norm": 1.9560530185699463, + "learning_rate": 9.013513297717954e-06, + "loss": 0.7639, + "step": 6932 + }, + { + "epoch": 0.524611251939011, + "grad_norm": 2.2985103130340576, + "learning_rate": 9.011261082724503e-06, + "loss": 0.7953, + "step": 6933 + }, + { + "epoch": 0.5246869206613446, + "grad_norm": 2.1248481273651123, + "learning_rate": 9.00900886702585e-06, + "loss": 0.7215, + "step": 6934 + }, + { + "epoch": 0.5247625893836783, + "grad_norm": 1.9643497467041016, + "learning_rate": 9.006756650763031e-06, + "loss": 0.6989, + "step": 6935 + }, + { + "epoch": 0.5248382581060119, + "grad_norm": 2.1842288970947266, + "learning_rate": 9.004504434077092e-06, + "loss": 0.7278, + "step": 6936 + }, + { + "epoch": 0.5249139268283455, + "grad_norm": 2.345261812210083, + "learning_rate": 9.002252217109065e-06, + "loss": 0.7213, + "step": 6937 + }, + { + "epoch": 0.5249895955506791, + "grad_norm": 2.2348873615264893, + "learning_rate": 9e-06, + "loss": 0.6579, + "step": 6938 + }, + { + "epoch": 0.5250652642730127, + "grad_norm": 2.263463258743286, + "learning_rate": 8.997747782890936e-06, + "loss": 0.7455, + "step": 6939 + }, + { + "epoch": 0.5251409329953464, + "grad_norm": 2.1686911582946777, + "learning_rate": 8.995495565922914e-06, + "loss": 0.7449, + "step": 6940 + }, + { + "epoch": 0.52521660171768, + "grad_norm": 2.4088521003723145, + "learning_rate": 8.99324334923697e-06, + "loss": 0.7396, + "step": 6941 + }, + { + "epoch": 0.5252922704400136, + "grad_norm": 2.31754207611084, + "learning_rate": 8.99099113297415e-06, + "loss": 0.6586, + "step": 6942 + }, + { + "epoch": 0.5253679391623473, + "grad_norm": 1.726416826248169, + "learning_rate": 8.988738917275497e-06, + "loss": 0.8426, + "step": 6943 + }, + { + "epoch": 0.5254436078846809, + "grad_norm": 2.5192909240722656, + "learning_rate": 8.986486702282048e-06, + "loss": 0.7568, + "step": 6944 + }, + { + "epoch": 0.5255192766070145, + "grad_norm": 2.0664658546447754, + "learning_rate": 8.98423448813485e-06, + "loss": 0.5892, + "step": 6945 + }, + { + "epoch": 0.5255949453293481, + "grad_norm": 3.337775230407715, + "learning_rate": 8.981982274974932e-06, + "loss": 0.7521, + "step": 6946 + }, + { + "epoch": 0.5256706140516817, + "grad_norm": 2.2166309356689453, + "learning_rate": 8.979730062943344e-06, + "loss": 0.8307, + "step": 6947 + }, + { + "epoch": 0.5257462827740154, + "grad_norm": 1.8791322708129883, + "learning_rate": 8.977477852181128e-06, + "loss": 0.6808, + "step": 6948 + }, + { + "epoch": 0.525821951496349, + "grad_norm": 2.134373664855957, + "learning_rate": 8.97522564282932e-06, + "loss": 0.7119, + "step": 6949 + }, + { + "epoch": 0.5258976202186826, + "grad_norm": 2.081508159637451, + "learning_rate": 8.972973435028964e-06, + "loss": 0.6941, + "step": 6950 + }, + { 
+ "epoch": 0.5259732889410162, + "grad_norm": 2.069390296936035, + "learning_rate": 8.970721228921096e-06, + "loss": 0.7172, + "step": 6951 + }, + { + "epoch": 0.5260489576633498, + "grad_norm": 2.029240369796753, + "learning_rate": 8.968469024646759e-06, + "loss": 0.7142, + "step": 6952 + }, + { + "epoch": 0.5261246263856835, + "grad_norm": 2.3885130882263184, + "learning_rate": 8.966216822346996e-06, + "loss": 0.6241, + "step": 6953 + }, + { + "epoch": 0.5262002951080171, + "grad_norm": 1.988101840019226, + "learning_rate": 8.963964622162846e-06, + "loss": 0.6023, + "step": 6954 + }, + { + "epoch": 0.5262759638303507, + "grad_norm": 1.8047088384628296, + "learning_rate": 8.961712424235346e-06, + "loss": 0.6205, + "step": 6955 + }, + { + "epoch": 0.5263516325526844, + "grad_norm": 2.6779801845550537, + "learning_rate": 8.959460228705535e-06, + "loss": 0.9106, + "step": 6956 + }, + { + "epoch": 0.526427301275018, + "grad_norm": 2.134953022003174, + "learning_rate": 8.957208035714461e-06, + "loss": 0.7214, + "step": 6957 + }, + { + "epoch": 0.5265029699973516, + "grad_norm": 1.9478284120559692, + "learning_rate": 8.954955845403156e-06, + "loss": 0.8335, + "step": 6958 + }, + { + "epoch": 0.5265786387196852, + "grad_norm": 1.8003441095352173, + "learning_rate": 8.952703657912662e-06, + "loss": 0.7422, + "step": 6959 + }, + { + "epoch": 0.5266543074420188, + "grad_norm": 1.7551347017288208, + "learning_rate": 8.950451473384017e-06, + "loss": 0.6058, + "step": 6960 + }, + { + "epoch": 0.5267299761643525, + "grad_norm": 2.0923233032226562, + "learning_rate": 8.948199291958263e-06, + "loss": 0.8269, + "step": 6961 + }, + { + "epoch": 0.5268056448866861, + "grad_norm": 1.9860175848007202, + "learning_rate": 8.945947113776432e-06, + "loss": 0.6191, + "step": 6962 + }, + { + "epoch": 0.5268813136090197, + "grad_norm": 2.2504024505615234, + "learning_rate": 8.943694938979572e-06, + "loss": 0.6805, + "step": 6963 + }, + { + "epoch": 0.5269569823313534, + "grad_norm": 2.547034978866577, + "learning_rate": 8.941442767708717e-06, + "loss": 0.6573, + "step": 6964 + }, + { + "epoch": 0.5270326510536869, + "grad_norm": 1.7897727489471436, + "learning_rate": 8.939190600104905e-06, + "loss": 0.7814, + "step": 6965 + }, + { + "epoch": 0.5271083197760206, + "grad_norm": 2.5294055938720703, + "learning_rate": 8.936938436309175e-06, + "loss": 0.7107, + "step": 6966 + }, + { + "epoch": 0.5271839884983542, + "grad_norm": 2.606584310531616, + "learning_rate": 8.934686276462558e-06, + "loss": 0.6996, + "step": 6967 + }, + { + "epoch": 0.5272596572206878, + "grad_norm": 1.6937854290008545, + "learning_rate": 8.932434120706104e-06, + "loss": 0.836, + "step": 6968 + }, + { + "epoch": 0.5273353259430215, + "grad_norm": 2.6257095336914062, + "learning_rate": 8.93018196918084e-06, + "loss": 0.6491, + "step": 6969 + }, + { + "epoch": 0.5274109946653551, + "grad_norm": 3.0788724422454834, + "learning_rate": 8.927929822027812e-06, + "loss": 0.6871, + "step": 6970 + }, + { + "epoch": 0.5274866633876887, + "grad_norm": 2.0726146697998047, + "learning_rate": 8.925677679388048e-06, + "loss": 0.7104, + "step": 6971 + }, + { + "epoch": 0.5275623321100223, + "grad_norm": 2.652439832687378, + "learning_rate": 8.923425541402586e-06, + "loss": 0.6209, + "step": 6972 + }, + { + "epoch": 0.5276380008323559, + "grad_norm": 2.571249485015869, + "learning_rate": 8.921173408212468e-06, + "loss": 0.8608, + "step": 6973 + }, + { + "epoch": 0.5277136695546896, + "grad_norm": 2.138179302215576, + "learning_rate": 8.918921279958725e-06, + 
"loss": 0.7663, + "step": 6974 + }, + { + "epoch": 0.5277893382770232, + "grad_norm": 2.010223388671875, + "learning_rate": 8.916669156782396e-06, + "loss": 0.7324, + "step": 6975 + }, + { + "epoch": 0.5278650069993568, + "grad_norm": 1.760847806930542, + "learning_rate": 8.914417038824511e-06, + "loss": 0.736, + "step": 6976 + }, + { + "epoch": 0.5279406757216905, + "grad_norm": 2.2697741985321045, + "learning_rate": 8.912164926226107e-06, + "loss": 0.868, + "step": 6977 + }, + { + "epoch": 0.528016344444024, + "grad_norm": 1.8922936916351318, + "learning_rate": 8.909912819128223e-06, + "loss": 0.5409, + "step": 6978 + }, + { + "epoch": 0.5280920131663577, + "grad_norm": 1.831129789352417, + "learning_rate": 8.907660717671887e-06, + "loss": 0.667, + "step": 6979 + }, + { + "epoch": 0.5281676818886913, + "grad_norm": 2.529381275177002, + "learning_rate": 8.905408621998138e-06, + "loss": 0.8239, + "step": 6980 + }, + { + "epoch": 0.5282433506110249, + "grad_norm": 2.6161415576934814, + "learning_rate": 8.903156532248005e-06, + "loss": 0.7192, + "step": 6981 + }, + { + "epoch": 0.5283190193333586, + "grad_norm": 2.2429845333099365, + "learning_rate": 8.90090444856252e-06, + "loss": 0.6279, + "step": 6982 + }, + { + "epoch": 0.5283946880556922, + "grad_norm": 2.3764171600341797, + "learning_rate": 8.898652371082722e-06, + "loss": 0.6857, + "step": 6983 + }, + { + "epoch": 0.5284703567780258, + "grad_norm": 2.131578207015991, + "learning_rate": 8.896400299949642e-06, + "loss": 0.6394, + "step": 6984 + }, + { + "epoch": 0.5285460255003595, + "grad_norm": 2.332397699356079, + "learning_rate": 8.894148235304309e-06, + "loss": 0.6967, + "step": 6985 + }, + { + "epoch": 0.528621694222693, + "grad_norm": 2.148642063140869, + "learning_rate": 8.891896177287758e-06, + "loss": 0.6876, + "step": 6986 + }, + { + "epoch": 0.5286973629450267, + "grad_norm": 2.2178142070770264, + "learning_rate": 8.889644126041014e-06, + "loss": 0.5579, + "step": 6987 + }, + { + "epoch": 0.5287730316673603, + "grad_norm": 2.4631600379943848, + "learning_rate": 8.887392081705112e-06, + "loss": 0.596, + "step": 6988 + }, + { + "epoch": 0.5288487003896939, + "grad_norm": 2.4894394874572754, + "learning_rate": 8.885140044421086e-06, + "loss": 0.7106, + "step": 6989 + }, + { + "epoch": 0.5289243691120276, + "grad_norm": 2.318631410598755, + "learning_rate": 8.88288801432996e-06, + "loss": 0.678, + "step": 6990 + }, + { + "epoch": 0.5290000378343611, + "grad_norm": 2.5202953815460205, + "learning_rate": 8.880635991572765e-06, + "loss": 0.8136, + "step": 6991 + }, + { + "epoch": 0.5290757065566948, + "grad_norm": 2.017930746078491, + "learning_rate": 8.878383976290529e-06, + "loss": 0.7493, + "step": 6992 + }, + { + "epoch": 0.5291513752790284, + "grad_norm": 2.3018083572387695, + "learning_rate": 8.876131968624282e-06, + "loss": 0.6035, + "step": 6993 + }, + { + "epoch": 0.529227044001362, + "grad_norm": 2.4072072505950928, + "learning_rate": 8.873879968715055e-06, + "loss": 0.772, + "step": 6994 + }, + { + "epoch": 0.5293027127236957, + "grad_norm": 3.478423595428467, + "learning_rate": 8.87162797670387e-06, + "loss": 0.6318, + "step": 6995 + }, + { + "epoch": 0.5293783814460293, + "grad_norm": 1.5796644687652588, + "learning_rate": 8.869375992731757e-06, + "loss": 0.5953, + "step": 6996 + }, + { + "epoch": 0.5294540501683629, + "grad_norm": 2.066748857498169, + "learning_rate": 8.867124016939742e-06, + "loss": 0.6553, + "step": 6997 + }, + { + "epoch": 0.5295297188906966, + "grad_norm": 2.06510591506958, + "learning_rate": 
8.864872049468846e-06, + "loss": 0.6526, + "step": 6998 + }, + { + "epoch": 0.5296053876130301, + "grad_norm": 2.0738534927368164, + "learning_rate": 8.862620090460104e-06, + "loss": 0.8406, + "step": 6999 + }, + { + "epoch": 0.5296810563353638, + "grad_norm": 2.1421914100646973, + "learning_rate": 8.860368140054536e-06, + "loss": 0.714, + "step": 7000 + }, + { + "epoch": 0.5297567250576974, + "grad_norm": 1.9644984006881714, + "learning_rate": 8.858116198393166e-06, + "loss": 0.628, + "step": 7001 + }, + { + "epoch": 0.529832393780031, + "grad_norm": 2.3363261222839355, + "learning_rate": 8.85586426561702e-06, + "loss": 0.8578, + "step": 7002 + }, + { + "epoch": 0.5299080625023647, + "grad_norm": 2.117711305618286, + "learning_rate": 8.853612341867116e-06, + "loss": 0.6332, + "step": 7003 + }, + { + "epoch": 0.5299837312246982, + "grad_norm": 2.1709940433502197, + "learning_rate": 8.851360427284485e-06, + "loss": 0.7398, + "step": 7004 + }, + { + "epoch": 0.5300593999470319, + "grad_norm": 1.8637499809265137, + "learning_rate": 8.849108522010144e-06, + "loss": 0.6726, + "step": 7005 + }, + { + "epoch": 0.5301350686693656, + "grad_norm": 2.101854085922241, + "learning_rate": 8.846856626185113e-06, + "loss": 0.6772, + "step": 7006 + }, + { + "epoch": 0.5302107373916991, + "grad_norm": 1.8691213130950928, + "learning_rate": 8.84460473995042e-06, + "loss": 0.7488, + "step": 7007 + }, + { + "epoch": 0.5302864061140328, + "grad_norm": 2.332582473754883, + "learning_rate": 8.842352863447076e-06, + "loss": 0.6498, + "step": 7008 + }, + { + "epoch": 0.5303620748363664, + "grad_norm": 1.9999667406082153, + "learning_rate": 8.840100996816106e-06, + "loss": 0.6713, + "step": 7009 + }, + { + "epoch": 0.5304377435587, + "grad_norm": 2.319936752319336, + "learning_rate": 8.837849140198531e-06, + "loss": 0.6997, + "step": 7010 + }, + { + "epoch": 0.5305134122810337, + "grad_norm": 2.1495556831359863, + "learning_rate": 8.835597293735367e-06, + "loss": 0.7488, + "step": 7011 + }, + { + "epoch": 0.5305890810033672, + "grad_norm": 1.9620660543441772, + "learning_rate": 8.833345457567632e-06, + "loss": 0.6522, + "step": 7012 + }, + { + "epoch": 0.5306647497257009, + "grad_norm": 1.9892656803131104, + "learning_rate": 8.83109363183634e-06, + "loss": 0.6923, + "step": 7013 + }, + { + "epoch": 0.5307404184480345, + "grad_norm": 1.7837767601013184, + "learning_rate": 8.828841816682515e-06, + "loss": 0.6985, + "step": 7014 + }, + { + "epoch": 0.5308160871703681, + "grad_norm": 2.1873769760131836, + "learning_rate": 8.826590012247167e-06, + "loss": 0.7771, + "step": 7015 + }, + { + "epoch": 0.5308917558927018, + "grad_norm": 2.4407780170440674, + "learning_rate": 8.824338218671317e-06, + "loss": 0.8917, + "step": 7016 + }, + { + "epoch": 0.5309674246150353, + "grad_norm": 2.155855178833008, + "learning_rate": 8.822086436095973e-06, + "loss": 0.7085, + "step": 7017 + }, + { + "epoch": 0.531043093337369, + "grad_norm": 2.3129708766937256, + "learning_rate": 8.819834664662149e-06, + "loss": 0.8051, + "step": 7018 + }, + { + "epoch": 0.5311187620597027, + "grad_norm": 1.8992316722869873, + "learning_rate": 8.817582904510867e-06, + "loss": 0.624, + "step": 7019 + }, + { + "epoch": 0.5311944307820362, + "grad_norm": 1.9501327276229858, + "learning_rate": 8.81533115578313e-06, + "loss": 0.7945, + "step": 7020 + }, + { + "epoch": 0.5312700995043699, + "grad_norm": 2.9170970916748047, + "learning_rate": 8.813079418619955e-06, + "loss": 0.5384, + "step": 7021 + }, + { + "epoch": 0.5313457682267035, + "grad_norm": 
1.9943617582321167, + "learning_rate": 8.81082769316235e-06, + "loss": 0.8106, + "step": 7022 + }, + { + "epoch": 0.5314214369490371, + "grad_norm": 1.9144606590270996, + "learning_rate": 8.808575979551325e-06, + "loss": 0.8237, + "step": 7023 + }, + { + "epoch": 0.5314971056713708, + "grad_norm": 2.214576482772827, + "learning_rate": 8.806324277927895e-06, + "loss": 0.6764, + "step": 7024 + }, + { + "epoch": 0.5315727743937043, + "grad_norm": 2.135948657989502, + "learning_rate": 8.804072588433063e-06, + "loss": 0.5962, + "step": 7025 + }, + { + "epoch": 0.531648443116038, + "grad_norm": 2.7250607013702393, + "learning_rate": 8.801820911207842e-06, + "loss": 0.6888, + "step": 7026 + }, + { + "epoch": 0.5317241118383716, + "grad_norm": 3.235295534133911, + "learning_rate": 8.799569246393234e-06, + "loss": 0.6464, + "step": 7027 + }, + { + "epoch": 0.5317997805607052, + "grad_norm": 2.087533473968506, + "learning_rate": 8.797317594130245e-06, + "loss": 0.6293, + "step": 7028 + }, + { + "epoch": 0.5318754492830389, + "grad_norm": 5.911130905151367, + "learning_rate": 8.795065954559888e-06, + "loss": 0.7318, + "step": 7029 + }, + { + "epoch": 0.5319511180053724, + "grad_norm": 1.9690461158752441, + "learning_rate": 8.792814327823165e-06, + "loss": 0.5787, + "step": 7030 + }, + { + "epoch": 0.5320267867277061, + "grad_norm": 2.165632486343384, + "learning_rate": 8.790562714061076e-06, + "loss": 0.7131, + "step": 7031 + }, + { + "epoch": 0.5321024554500398, + "grad_norm": 2.008331060409546, + "learning_rate": 8.78831111341463e-06, + "loss": 0.5816, + "step": 7032 + }, + { + "epoch": 0.5321781241723733, + "grad_norm": 2.393400192260742, + "learning_rate": 8.786059526024823e-06, + "loss": 0.6825, + "step": 7033 + }, + { + "epoch": 0.532253792894707, + "grad_norm": 2.293968915939331, + "learning_rate": 8.78380795203266e-06, + "loss": 0.6761, + "step": 7034 + }, + { + "epoch": 0.5323294616170406, + "grad_norm": 2.2142651081085205, + "learning_rate": 8.781556391579148e-06, + "loss": 0.7001, + "step": 7035 + }, + { + "epoch": 0.5324051303393742, + "grad_norm": 1.9812933206558228, + "learning_rate": 8.779304844805278e-06, + "loss": 0.6993, + "step": 7036 + }, + { + "epoch": 0.5324807990617079, + "grad_norm": 1.889985203742981, + "learning_rate": 8.777053311852053e-06, + "loss": 0.7246, + "step": 7037 + }, + { + "epoch": 0.5325564677840414, + "grad_norm": 2.5295422077178955, + "learning_rate": 8.77480179286047e-06, + "loss": 0.5645, + "step": 7038 + }, + { + "epoch": 0.5326321365063751, + "grad_norm": 1.9612340927124023, + "learning_rate": 8.772550287971525e-06, + "loss": 0.6118, + "step": 7039 + }, + { + "epoch": 0.5327078052287088, + "grad_norm": 2.831002712249756, + "learning_rate": 8.77029879732622e-06, + "loss": 0.7933, + "step": 7040 + }, + { + "epoch": 0.5327834739510423, + "grad_norm": 2.5195281505584717, + "learning_rate": 8.768047321065547e-06, + "loss": 0.66, + "step": 7041 + }, + { + "epoch": 0.532859142673376, + "grad_norm": 2.25597882270813, + "learning_rate": 8.765795859330498e-06, + "loss": 0.5955, + "step": 7042 + }, + { + "epoch": 0.5329348113957095, + "grad_norm": 2.0232596397399902, + "learning_rate": 8.763544412262074e-06, + "loss": 0.7174, + "step": 7043 + }, + { + "epoch": 0.5330104801180432, + "grad_norm": 2.8326871395111084, + "learning_rate": 8.761292980001259e-06, + "loss": 0.6191, + "step": 7044 + }, + { + "epoch": 0.5330861488403769, + "grad_norm": 2.0261335372924805, + "learning_rate": 8.759041562689053e-06, + "loss": 0.7789, + "step": 7045 + }, + { + "epoch": 
0.5331618175627104, + "grad_norm": 2.5931825637817383, + "learning_rate": 8.756790160466445e-06, + "loss": 0.7508, + "step": 7046 + }, + { + "epoch": 0.5332374862850441, + "grad_norm": 2.142396926879883, + "learning_rate": 8.75453877347442e-06, + "loss": 0.6213, + "step": 7047 + }, + { + "epoch": 0.5333131550073777, + "grad_norm": 2.075147867202759, + "learning_rate": 8.752287401853974e-06, + "loss": 0.5699, + "step": 7048 + }, + { + "epoch": 0.5333888237297113, + "grad_norm": 2.620086908340454, + "learning_rate": 8.750036045746087e-06, + "loss": 0.6605, + "step": 7049 + }, + { + "epoch": 0.533464492452045, + "grad_norm": 2.613399028778076, + "learning_rate": 8.747784705291756e-06, + "loss": 0.7942, + "step": 7050 + }, + { + "epoch": 0.5335401611743785, + "grad_norm": 2.060634136199951, + "learning_rate": 8.745533380631963e-06, + "loss": 0.5541, + "step": 7051 + }, + { + "epoch": 0.5336158298967122, + "grad_norm": 1.947361946105957, + "learning_rate": 8.743282071907692e-06, + "loss": 0.8374, + "step": 7052 + }, + { + "epoch": 0.5336914986190459, + "grad_norm": 2.726940393447876, + "learning_rate": 8.741030779259927e-06, + "loss": 0.815, + "step": 7053 + }, + { + "epoch": 0.5337671673413794, + "grad_norm": 2.2680883407592773, + "learning_rate": 8.738779502829651e-06, + "loss": 0.7913, + "step": 7054 + }, + { + "epoch": 0.5338428360637131, + "grad_norm": 1.992996096611023, + "learning_rate": 8.736528242757849e-06, + "loss": 0.6763, + "step": 7055 + }, + { + "epoch": 0.5339185047860466, + "grad_norm": 2.075242519378662, + "learning_rate": 8.7342769991855e-06, + "loss": 0.6267, + "step": 7056 + }, + { + "epoch": 0.5339941735083803, + "grad_norm": 2.2018284797668457, + "learning_rate": 8.732025772253586e-06, + "loss": 0.7294, + "step": 7057 + }, + { + "epoch": 0.534069842230714, + "grad_norm": 1.962738275527954, + "learning_rate": 8.729774562103084e-06, + "loss": 0.6908, + "step": 7058 + }, + { + "epoch": 0.5341455109530475, + "grad_norm": 1.9997669458389282, + "learning_rate": 8.727523368874971e-06, + "loss": 0.9049, + "step": 7059 + }, + { + "epoch": 0.5342211796753812, + "grad_norm": 2.593151092529297, + "learning_rate": 8.725272192710229e-06, + "loss": 0.7374, + "step": 7060 + }, + { + "epoch": 0.5342968483977149, + "grad_norm": 1.8062269687652588, + "learning_rate": 8.72302103374983e-06, + "loss": 0.7373, + "step": 7061 + }, + { + "epoch": 0.5343725171200484, + "grad_norm": 1.9314745664596558, + "learning_rate": 8.720769892134751e-06, + "loss": 0.7628, + "step": 7062 + }, + { + "epoch": 0.5344481858423821, + "grad_norm": 2.1951372623443604, + "learning_rate": 8.718518768005963e-06, + "loss": 0.8498, + "step": 7063 + }, + { + "epoch": 0.5345238545647156, + "grad_norm": 3.0464980602264404, + "learning_rate": 8.716267661504437e-06, + "loss": 0.591, + "step": 7064 + }, + { + "epoch": 0.5345995232870493, + "grad_norm": 2.5231964588165283, + "learning_rate": 8.714016572771154e-06, + "loss": 0.6656, + "step": 7065 + }, + { + "epoch": 0.534675192009383, + "grad_norm": 1.8425929546356201, + "learning_rate": 8.711765501947074e-06, + "loss": 0.608, + "step": 7066 + }, + { + "epoch": 0.5347508607317165, + "grad_norm": 2.2831332683563232, + "learning_rate": 8.709514449173173e-06, + "loss": 0.691, + "step": 7067 + }, + { + "epoch": 0.5348265294540502, + "grad_norm": 1.8244147300720215, + "learning_rate": 8.707263414590416e-06, + "loss": 0.6913, + "step": 7068 + }, + { + "epoch": 0.5349021981763837, + "grad_norm": 1.8097063302993774, + "learning_rate": 8.705012398339768e-06, + "loss": 0.6704, + "step": 
7069 + }, + { + "epoch": 0.5349778668987174, + "grad_norm": 1.9643720388412476, + "learning_rate": 8.7027614005622e-06, + "loss": 0.5699, + "step": 7070 + }, + { + "epoch": 0.5350535356210511, + "grad_norm": 2.7243692874908447, + "learning_rate": 8.700510421398676e-06, + "loss": 0.7782, + "step": 7071 + }, + { + "epoch": 0.5351292043433846, + "grad_norm": 1.7081341743469238, + "learning_rate": 8.698259460990155e-06, + "loss": 0.7373, + "step": 7072 + }, + { + "epoch": 0.5352048730657183, + "grad_norm": 1.9497652053833008, + "learning_rate": 8.696008519477607e-06, + "loss": 0.5009, + "step": 7073 + }, + { + "epoch": 0.535280541788052, + "grad_norm": 2.6480894088745117, + "learning_rate": 8.693757597001985e-06, + "loss": 0.7564, + "step": 7074 + }, + { + "epoch": 0.5353562105103855, + "grad_norm": 1.7461856603622437, + "learning_rate": 8.691506693704252e-06, + "loss": 0.6427, + "step": 7075 + }, + { + "epoch": 0.5354318792327192, + "grad_norm": 2.0621261596679688, + "learning_rate": 8.68925580972537e-06, + "loss": 0.7876, + "step": 7076 + }, + { + "epoch": 0.5355075479550527, + "grad_norm": 2.9666640758514404, + "learning_rate": 8.687004945206293e-06, + "loss": 0.6868, + "step": 7077 + }, + { + "epoch": 0.5355832166773864, + "grad_norm": 1.9791302680969238, + "learning_rate": 8.68475410028798e-06, + "loss": 0.7977, + "step": 7078 + }, + { + "epoch": 0.5356588853997201, + "grad_norm": 2.7180376052856445, + "learning_rate": 8.682503275111385e-06, + "loss": 0.8495, + "step": 7079 + }, + { + "epoch": 0.5357345541220536, + "grad_norm": 2.0174508094787598, + "learning_rate": 8.680252469817459e-06, + "loss": 0.7696, + "step": 7080 + }, + { + "epoch": 0.5358102228443873, + "grad_norm": 2.2519609928131104, + "learning_rate": 8.678001684547159e-06, + "loss": 0.6735, + "step": 7081 + }, + { + "epoch": 0.535885891566721, + "grad_norm": 2.086402654647827, + "learning_rate": 8.675750919441436e-06, + "loss": 0.7296, + "step": 7082 + }, + { + "epoch": 0.5359615602890545, + "grad_norm": 2.2186553478240967, + "learning_rate": 8.67350017464124e-06, + "loss": 0.7418, + "step": 7083 + }, + { + "epoch": 0.5360372290113882, + "grad_norm": 2.314690589904785, + "learning_rate": 8.671249450287517e-06, + "loss": 0.8358, + "step": 7084 + }, + { + "epoch": 0.5361128977337217, + "grad_norm": 2.289297580718994, + "learning_rate": 8.668998746521215e-06, + "loss": 0.8058, + "step": 7085 + }, + { + "epoch": 0.5361885664560554, + "grad_norm": 2.941833972930908, + "learning_rate": 8.666748063483284e-06, + "loss": 0.7801, + "step": 7086 + }, + { + "epoch": 0.5362642351783891, + "grad_norm": 2.7016420364379883, + "learning_rate": 8.66449740131467e-06, + "loss": 0.6243, + "step": 7087 + }, + { + "epoch": 0.5363399039007226, + "grad_norm": 2.9800827503204346, + "learning_rate": 8.66224676015631e-06, + "loss": 0.8864, + "step": 7088 + }, + { + "epoch": 0.5364155726230563, + "grad_norm": 2.015385389328003, + "learning_rate": 8.659996140149154e-06, + "loss": 0.7499, + "step": 7089 + }, + { + "epoch": 0.5364912413453898, + "grad_norm": 2.1981661319732666, + "learning_rate": 8.657745541434134e-06, + "loss": 0.6763, + "step": 7090 + }, + { + "epoch": 0.5365669100677235, + "grad_norm": 1.9867689609527588, + "learning_rate": 8.655494964152199e-06, + "loss": 0.6845, + "step": 7091 + }, + { + "epoch": 0.5366425787900572, + "grad_norm": 2.070387125015259, + "learning_rate": 8.653244408444284e-06, + "loss": 0.7047, + "step": 7092 + }, + { + "epoch": 0.5367182475123907, + "grad_norm": 1.7507538795471191, + "learning_rate": 
8.650993874451324e-06, + "loss": 0.9036, + "step": 7093 + }, + { + "epoch": 0.5367939162347244, + "grad_norm": 2.0866281986236572, + "learning_rate": 8.648743362314259e-06, + "loss": 0.7097, + "step": 7094 + }, + { + "epoch": 0.536869584957058, + "grad_norm": 3.4941627979278564, + "learning_rate": 8.646492872174018e-06, + "loss": 0.6882, + "step": 7095 + }, + { + "epoch": 0.5369452536793916, + "grad_norm": 1.8780581951141357, + "learning_rate": 8.644242404171536e-06, + "loss": 0.7404, + "step": 7096 + }, + { + "epoch": 0.5370209224017253, + "grad_norm": 1.995606541633606, + "learning_rate": 8.641991958447748e-06, + "loss": 0.7844, + "step": 7097 + }, + { + "epoch": 0.5370965911240588, + "grad_norm": 2.5743753910064697, + "learning_rate": 8.63974153514358e-06, + "loss": 0.7448, + "step": 7098 + }, + { + "epoch": 0.5371722598463925, + "grad_norm": 2.6984145641326904, + "learning_rate": 8.637491134399965e-06, + "loss": 0.7314, + "step": 7099 + }, + { + "epoch": 0.5372479285687262, + "grad_norm": 2.029510021209717, + "learning_rate": 8.63524075635782e-06, + "loss": 0.6158, + "step": 7100 + }, + { + "epoch": 0.5373235972910597, + "grad_norm": 2.0353198051452637, + "learning_rate": 8.632990401158086e-06, + "loss": 0.6718, + "step": 7101 + }, + { + "epoch": 0.5373992660133934, + "grad_norm": 1.9957705736160278, + "learning_rate": 8.630740068941678e-06, + "loss": 0.7376, + "step": 7102 + }, + { + "epoch": 0.5374749347357269, + "grad_norm": 2.259077787399292, + "learning_rate": 8.628489759849522e-06, + "loss": 0.5768, + "step": 7103 + }, + { + "epoch": 0.5375506034580606, + "grad_norm": 2.07145619392395, + "learning_rate": 8.626239474022538e-06, + "loss": 0.591, + "step": 7104 + }, + { + "epoch": 0.5376262721803943, + "grad_norm": 1.6986279487609863, + "learning_rate": 8.623989211601645e-06, + "loss": 0.9477, + "step": 7105 + }, + { + "epoch": 0.5377019409027278, + "grad_norm": 2.296743392944336, + "learning_rate": 8.621738972727768e-06, + "loss": 0.6912, + "step": 7106 + }, + { + "epoch": 0.5377776096250615, + "grad_norm": 9.655526161193848, + "learning_rate": 8.619488757541817e-06, + "loss": 0.678, + "step": 7107 + }, + { + "epoch": 0.5378532783473952, + "grad_norm": 1.8935918807983398, + "learning_rate": 8.617238566184713e-06, + "loss": 0.7756, + "step": 7108 + }, + { + "epoch": 0.5379289470697287, + "grad_norm": 2.278226137161255, + "learning_rate": 8.614988398797366e-06, + "loss": 0.7101, + "step": 7109 + }, + { + "epoch": 0.5380046157920624, + "grad_norm": 2.2196240425109863, + "learning_rate": 8.612738255520689e-06, + "loss": 0.7687, + "step": 7110 + }, + { + "epoch": 0.5380802845143959, + "grad_norm": 2.080031156539917, + "learning_rate": 8.610488136495599e-06, + "loss": 0.7437, + "step": 7111 + }, + { + "epoch": 0.5381559532367296, + "grad_norm": 1.6355682611465454, + "learning_rate": 8.608238041863e-06, + "loss": 0.6928, + "step": 7112 + }, + { + "epoch": 0.5382316219590633, + "grad_norm": 2.384467124938965, + "learning_rate": 8.605987971763803e-06, + "loss": 0.788, + "step": 7113 + }, + { + "epoch": 0.5383072906813968, + "grad_norm": 2.125551223754883, + "learning_rate": 8.603737926338912e-06, + "loss": 0.6659, + "step": 7114 + }, + { + "epoch": 0.5383829594037305, + "grad_norm": 2.0060691833496094, + "learning_rate": 8.601487905729235e-06, + "loss": 0.6208, + "step": 7115 + }, + { + "epoch": 0.538458628126064, + "grad_norm": 2.356447458267212, + "learning_rate": 8.59923791007567e-06, + "loss": 0.7137, + "step": 7116 + }, + { + "epoch": 0.5385342968483977, + "grad_norm": 
2.69724178314209, + "learning_rate": 8.596987939519128e-06, + "loss": 0.7895, + "step": 7117 + }, + { + "epoch": 0.5386099655707314, + "grad_norm": 2.0127129554748535, + "learning_rate": 8.594737994200504e-06, + "loss": 0.5838, + "step": 7118 + }, + { + "epoch": 0.5386856342930649, + "grad_norm": 2.2311911582946777, + "learning_rate": 8.592488074260698e-06, + "loss": 0.7257, + "step": 7119 + }, + { + "epoch": 0.5387613030153986, + "grad_norm": 2.5712006092071533, + "learning_rate": 8.590238179840606e-06, + "loss": 0.7806, + "step": 7120 + }, + { + "epoch": 0.5388369717377323, + "grad_norm": 2.380955219268799, + "learning_rate": 8.587988311081122e-06, + "loss": 0.7459, + "step": 7121 + }, + { + "epoch": 0.5389126404600658, + "grad_norm": 2.549931764602661, + "learning_rate": 8.585738468123147e-06, + "loss": 0.6662, + "step": 7122 + }, + { + "epoch": 0.5389883091823995, + "grad_norm": 2.1435601711273193, + "learning_rate": 8.583488651107566e-06, + "loss": 0.7061, + "step": 7123 + }, + { + "epoch": 0.539063977904733, + "grad_norm": 5.311253070831299, + "learning_rate": 8.581238860175276e-06, + "loss": 0.8853, + "step": 7124 + }, + { + "epoch": 0.5391396466270667, + "grad_norm": 2.5951385498046875, + "learning_rate": 8.578989095467161e-06, + "loss": 0.6598, + "step": 7125 + }, + { + "epoch": 0.5392153153494004, + "grad_norm": 2.184601068496704, + "learning_rate": 8.576739357124107e-06, + "loss": 0.6537, + "step": 7126 + }, + { + "epoch": 0.5392909840717339, + "grad_norm": 2.4066100120544434, + "learning_rate": 8.57448964528701e-06, + "loss": 0.7578, + "step": 7127 + }, + { + "epoch": 0.5393666527940676, + "grad_norm": 3.0579781532287598, + "learning_rate": 8.572239960096742e-06, + "loss": 0.7298, + "step": 7128 + }, + { + "epoch": 0.5394423215164011, + "grad_norm": 1.7824645042419434, + "learning_rate": 8.569990301694196e-06, + "loss": 0.5753, + "step": 7129 + }, + { + "epoch": 0.5395179902387348, + "grad_norm": 2.685590982437134, + "learning_rate": 8.567740670220246e-06, + "loss": 0.7501, + "step": 7130 + }, + { + "epoch": 0.5395936589610685, + "grad_norm": 2.148221254348755, + "learning_rate": 8.565491065815771e-06, + "loss": 0.5976, + "step": 7131 + }, + { + "epoch": 0.539669327683402, + "grad_norm": 2.3486313819885254, + "learning_rate": 8.563241488621652e-06, + "loss": 0.8495, + "step": 7132 + }, + { + "epoch": 0.5397449964057357, + "grad_norm": 2.0380804538726807, + "learning_rate": 8.560991938778767e-06, + "loss": 0.6935, + "step": 7133 + }, + { + "epoch": 0.5398206651280694, + "grad_norm": 2.2446913719177246, + "learning_rate": 8.558742416427985e-06, + "loss": 0.6925, + "step": 7134 + }, + { + "epoch": 0.5398963338504029, + "grad_norm": 2.0997154712677, + "learning_rate": 8.55649292171018e-06, + "loss": 0.5679, + "step": 7135 + }, + { + "epoch": 0.5399720025727366, + "grad_norm": 2.4768741130828857, + "learning_rate": 8.55424345476622e-06, + "loss": 0.7029, + "step": 7136 + }, + { + "epoch": 0.5400476712950701, + "grad_norm": 3.763378858566284, + "learning_rate": 8.551994015736978e-06, + "loss": 0.6053, + "step": 7137 + }, + { + "epoch": 0.5401233400174038, + "grad_norm": 1.940919280052185, + "learning_rate": 8.549744604763322e-06, + "loss": 0.6981, + "step": 7138 + }, + { + "epoch": 0.5401990087397375, + "grad_norm": 2.1327764987945557, + "learning_rate": 8.547495221986114e-06, + "loss": 0.8293, + "step": 7139 + }, + { + "epoch": 0.540274677462071, + "grad_norm": 2.6629951000213623, + "learning_rate": 8.54524586754622e-06, + "loss": 0.8285, + "step": 7140 + }, + { + "epoch": 
0.5403503461844047, + "grad_norm": 2.080131769180298, + "learning_rate": 8.542996541584498e-06, + "loss": 0.7531, + "step": 7141 + }, + { + "epoch": 0.5404260149067383, + "grad_norm": 1.9729998111724854, + "learning_rate": 8.540747244241811e-06, + "loss": 0.9104, + "step": 7142 + }, + { + "epoch": 0.5405016836290719, + "grad_norm": 1.957764744758606, + "learning_rate": 8.53849797565902e-06, + "loss": 0.836, + "step": 7143 + }, + { + "epoch": 0.5405773523514056, + "grad_norm": 2.3214311599731445, + "learning_rate": 8.536248735976976e-06, + "loss": 0.6771, + "step": 7144 + }, + { + "epoch": 0.5406530210737391, + "grad_norm": 1.9388372898101807, + "learning_rate": 8.533999525336536e-06, + "loss": 0.6914, + "step": 7145 + }, + { + "epoch": 0.5407286897960728, + "grad_norm": 2.0969862937927246, + "learning_rate": 8.531750343878551e-06, + "loss": 0.7405, + "step": 7146 + }, + { + "epoch": 0.5408043585184065, + "grad_norm": 2.0790417194366455, + "learning_rate": 8.529501191743876e-06, + "loss": 0.7561, + "step": 7147 + }, + { + "epoch": 0.54088002724074, + "grad_norm": 1.9571138620376587, + "learning_rate": 8.527252069073359e-06, + "loss": 0.8044, + "step": 7148 + }, + { + "epoch": 0.5409556959630737, + "grad_norm": 2.024289131164551, + "learning_rate": 8.525002976007848e-06, + "loss": 0.5776, + "step": 7149 + }, + { + "epoch": 0.5410313646854072, + "grad_norm": 2.4422504901885986, + "learning_rate": 8.522753912688184e-06, + "loss": 0.7017, + "step": 7150 + }, + { + "epoch": 0.5411070334077409, + "grad_norm": 2.0276260375976562, + "learning_rate": 8.520504879255214e-06, + "loss": 0.5853, + "step": 7151 + }, + { + "epoch": 0.5411827021300746, + "grad_norm": 2.331113576889038, + "learning_rate": 8.51825587584978e-06, + "loss": 0.6115, + "step": 7152 + }, + { + "epoch": 0.5412583708524081, + "grad_norm": 1.7973954677581787, + "learning_rate": 8.516006902612721e-06, + "loss": 0.5986, + "step": 7153 + }, + { + "epoch": 0.5413340395747418, + "grad_norm": 2.5584282875061035, + "learning_rate": 8.513757959684877e-06, + "loss": 0.7094, + "step": 7154 + }, + { + "epoch": 0.5414097082970754, + "grad_norm": 2.177795886993408, + "learning_rate": 8.51150904720708e-06, + "loss": 0.6867, + "step": 7155 + }, + { + "epoch": 0.541485377019409, + "grad_norm": 2.400580644607544, + "learning_rate": 8.509260165320168e-06, + "loss": 0.6919, + "step": 7156 + }, + { + "epoch": 0.5415610457417427, + "grad_norm": 1.9860256910324097, + "learning_rate": 8.50701131416497e-06, + "loss": 0.6899, + "step": 7157 + }, + { + "epoch": 0.5416367144640762, + "grad_norm": 1.862237572669983, + "learning_rate": 8.504762493882317e-06, + "loss": 0.6249, + "step": 7158 + }, + { + "epoch": 0.5417123831864099, + "grad_norm": 2.687009811401367, + "learning_rate": 8.50251370461304e-06, + "loss": 0.6726, + "step": 7159 + }, + { + "epoch": 0.5417880519087436, + "grad_norm": 1.91978120803833, + "learning_rate": 8.500264946497967e-06, + "loss": 0.5824, + "step": 7160 + }, + { + "epoch": 0.5418637206310771, + "grad_norm": 2.6464669704437256, + "learning_rate": 8.498016219677915e-06, + "loss": 0.6439, + "step": 7161 + }, + { + "epoch": 0.5419393893534108, + "grad_norm": 2.026582717895508, + "learning_rate": 8.49576752429371e-06, + "loss": 0.76, + "step": 7162 + }, + { + "epoch": 0.5420150580757443, + "grad_norm": 2.627919912338257, + "learning_rate": 8.493518860486177e-06, + "loss": 0.7907, + "step": 7163 + }, + { + "epoch": 0.542090726798078, + "grad_norm": 2.2149717807769775, + "learning_rate": 8.49127022839613e-06, + "loss": 0.6007, + "step": 7164 
+ }, + { + "epoch": 0.5421663955204117, + "grad_norm": 1.838017225265503, + "learning_rate": 8.489021628164388e-06, + "loss": 0.6642, + "step": 7165 + }, + { + "epoch": 0.5422420642427452, + "grad_norm": 2.120692014694214, + "learning_rate": 8.486773059931763e-06, + "loss": 0.5709, + "step": 7166 + }, + { + "epoch": 0.5423177329650789, + "grad_norm": 2.953522205352783, + "learning_rate": 8.484524523839067e-06, + "loss": 0.6147, + "step": 7167 + }, + { + "epoch": 0.5423934016874125, + "grad_norm": 1.9977935552597046, + "learning_rate": 8.482276020027114e-06, + "loss": 0.7301, + "step": 7168 + }, + { + "epoch": 0.5424690704097461, + "grad_norm": 2.271860361099243, + "learning_rate": 8.480027548636714e-06, + "loss": 0.6553, + "step": 7169 + }, + { + "epoch": 0.5425447391320798, + "grad_norm": 1.9562822580337524, + "learning_rate": 8.477779109808668e-06, + "loss": 0.638, + "step": 7170 + }, + { + "epoch": 0.5426204078544133, + "grad_norm": 2.308135509490967, + "learning_rate": 8.475530703683784e-06, + "loss": 0.781, + "step": 7171 + }, + { + "epoch": 0.542696076576747, + "grad_norm": 1.8787046670913696, + "learning_rate": 8.47328233040286e-06, + "loss": 0.6161, + "step": 7172 + }, + { + "epoch": 0.5427717452990807, + "grad_norm": 2.3262102603912354, + "learning_rate": 8.471033990106703e-06, + "loss": 0.6771, + "step": 7173 + }, + { + "epoch": 0.5428474140214142, + "grad_norm": 2.1242523193359375, + "learning_rate": 8.46878568293611e-06, + "loss": 0.8264, + "step": 7174 + }, + { + "epoch": 0.5429230827437479, + "grad_norm": 3.241060256958008, + "learning_rate": 8.466537409031875e-06, + "loss": 0.5887, + "step": 7175 + }, + { + "epoch": 0.5429987514660815, + "grad_norm": 1.77642023563385, + "learning_rate": 8.464289168534794e-06, + "loss": 0.6155, + "step": 7176 + }, + { + "epoch": 0.5430744201884151, + "grad_norm": 2.4458696842193604, + "learning_rate": 8.462040961585655e-06, + "loss": 0.6388, + "step": 7177 + }, + { + "epoch": 0.5431500889107488, + "grad_norm": 1.5858855247497559, + "learning_rate": 8.459792788325251e-06, + "loss": 0.6843, + "step": 7178 + }, + { + "epoch": 0.5432257576330823, + "grad_norm": 2.241321325302124, + "learning_rate": 8.457544648894372e-06, + "loss": 0.6885, + "step": 7179 + }, + { + "epoch": 0.543301426355416, + "grad_norm": 1.9236705303192139, + "learning_rate": 8.4552965434338e-06, + "loss": 0.6897, + "step": 7180 + }, + { + "epoch": 0.5433770950777496, + "grad_norm": 2.625894069671631, + "learning_rate": 8.453048472084323e-06, + "loss": 0.7572, + "step": 7181 + }, + { + "epoch": 0.5434527638000832, + "grad_norm": 1.9380083084106445, + "learning_rate": 8.450800434986716e-06, + "loss": 0.8159, + "step": 7182 + }, + { + "epoch": 0.5435284325224169, + "grad_norm": 2.195018768310547, + "learning_rate": 8.448552432281763e-06, + "loss": 0.7674, + "step": 7183 + }, + { + "epoch": 0.5436041012447504, + "grad_norm": 3.2216532230377197, + "learning_rate": 8.446304464110243e-06, + "loss": 0.7002, + "step": 7184 + }, + { + "epoch": 0.5436797699670841, + "grad_norm": 2.094017505645752, + "learning_rate": 8.444056530612926e-06, + "loss": 0.6522, + "step": 7185 + }, + { + "epoch": 0.5437554386894178, + "grad_norm": 2.447477340698242, + "learning_rate": 8.441808631930588e-06, + "loss": 0.8029, + "step": 7186 + }, + { + "epoch": 0.5438311074117513, + "grad_norm": 1.7024062871932983, + "learning_rate": 8.439560768203996e-06, + "loss": 0.6894, + "step": 7187 + }, + { + "epoch": 0.543906776134085, + "grad_norm": 1.634425163269043, + "learning_rate": 8.437312939573925e-06, + "loss": 
0.6593, + "step": 7188 + }, + { + "epoch": 0.5439824448564186, + "grad_norm": 2.105696678161621, + "learning_rate": 8.435065146181135e-06, + "loss": 0.7249, + "step": 7189 + }, + { + "epoch": 0.5440581135787522, + "grad_norm": 2.9935972690582275, + "learning_rate": 8.432817388166395e-06, + "loss": 0.632, + "step": 7190 + }, + { + "epoch": 0.5441337823010859, + "grad_norm": 2.017632007598877, + "learning_rate": 8.430569665670464e-06, + "loss": 0.8673, + "step": 7191 + }, + { + "epoch": 0.5442094510234194, + "grad_norm": 1.9083147048950195, + "learning_rate": 8.428321978834104e-06, + "loss": 0.6792, + "step": 7192 + }, + { + "epoch": 0.5442851197457531, + "grad_norm": 2.1017110347747803, + "learning_rate": 8.426074327798067e-06, + "loss": 0.6695, + "step": 7193 + }, + { + "epoch": 0.5443607884680867, + "grad_norm": 1.9872255325317383, + "learning_rate": 8.423826712703114e-06, + "loss": 0.6914, + "step": 7194 + }, + { + "epoch": 0.5444364571904203, + "grad_norm": 2.61326003074646, + "learning_rate": 8.421579133689997e-06, + "loss": 0.6831, + "step": 7195 + }, + { + "epoch": 0.544512125912754, + "grad_norm": 2.0831680297851562, + "learning_rate": 8.419331590899463e-06, + "loss": 0.662, + "step": 7196 + }, + { + "epoch": 0.5445877946350876, + "grad_norm": 1.7519179582595825, + "learning_rate": 8.417084084472267e-06, + "loss": 0.7071, + "step": 7197 + }, + { + "epoch": 0.5446634633574212, + "grad_norm": 1.9903972148895264, + "learning_rate": 8.414836614549145e-06, + "loss": 0.7907, + "step": 7198 + }, + { + "epoch": 0.5447391320797549, + "grad_norm": 3.3325815200805664, + "learning_rate": 8.412589181270849e-06, + "loss": 0.6182, + "step": 7199 + }, + { + "epoch": 0.5448148008020884, + "grad_norm": 2.1972944736480713, + "learning_rate": 8.410341784778121e-06, + "loss": 0.8044, + "step": 7200 + }, + { + "epoch": 0.5448904695244221, + "grad_norm": 2.348930835723877, + "learning_rate": 8.408094425211695e-06, + "loss": 0.7072, + "step": 7201 + }, + { + "epoch": 0.5449661382467557, + "grad_norm": 2.407869338989258, + "learning_rate": 8.405847102712313e-06, + "loss": 0.7567, + "step": 7202 + }, + { + "epoch": 0.5450418069690893, + "grad_norm": 2.114015579223633, + "learning_rate": 8.403599817420702e-06, + "loss": 0.8147, + "step": 7203 + }, + { + "epoch": 0.545117475691423, + "grad_norm": 2.1249501705169678, + "learning_rate": 8.401352569477605e-06, + "loss": 0.6118, + "step": 7204 + }, + { + "epoch": 0.5451931444137565, + "grad_norm": 2.1897776126861572, + "learning_rate": 8.399105359023743e-06, + "loss": 0.6311, + "step": 7205 + }, + { + "epoch": 0.5452688131360902, + "grad_norm": 2.278143882751465, + "learning_rate": 8.39685818619985e-06, + "loss": 0.8238, + "step": 7206 + }, + { + "epoch": 0.5453444818584238, + "grad_norm": 1.9254010915756226, + "learning_rate": 8.394611051146647e-06, + "loss": 0.6331, + "step": 7207 + }, + { + "epoch": 0.5454201505807574, + "grad_norm": 1.6860229969024658, + "learning_rate": 8.392363954004855e-06, + "loss": 0.7736, + "step": 7208 + }, + { + "epoch": 0.5454958193030911, + "grad_norm": 2.540018081665039, + "learning_rate": 8.390116894915201e-06, + "loss": 0.6795, + "step": 7209 + }, + { + "epoch": 0.5455714880254247, + "grad_norm": 2.2471187114715576, + "learning_rate": 8.387869874018399e-06, + "loss": 0.7135, + "step": 7210 + }, + { + "epoch": 0.5456471567477583, + "grad_norm": 2.69797682762146, + "learning_rate": 8.385622891455167e-06, + "loss": 0.6507, + "step": 7211 + }, + { + "epoch": 0.545722825470092, + "grad_norm": 3.0235390663146973, + "learning_rate": 
8.383375947366214e-06, + "loss": 0.6393, + "step": 7212 + }, + { + "epoch": 0.5457984941924255, + "grad_norm": 2.0341522693634033, + "learning_rate": 8.381129041892252e-06, + "loss": 0.8328, + "step": 7213 + }, + { + "epoch": 0.5458741629147592, + "grad_norm": 1.5147240161895752, + "learning_rate": 8.378882175173996e-06, + "loss": 0.5524, + "step": 7214 + }, + { + "epoch": 0.5459498316370928, + "grad_norm": 2.072648286819458, + "learning_rate": 8.376635347352143e-06, + "loss": 0.6394, + "step": 7215 + }, + { + "epoch": 0.5460255003594264, + "grad_norm": 2.308685064315796, + "learning_rate": 8.374388558567405e-06, + "loss": 0.7882, + "step": 7216 + }, + { + "epoch": 0.5461011690817601, + "grad_norm": 2.836092472076416, + "learning_rate": 8.372141808960474e-06, + "loss": 0.6588, + "step": 7217 + }, + { + "epoch": 0.5461768378040937, + "grad_norm": 2.7137436866760254, + "learning_rate": 8.369895098672053e-06, + "loss": 0.6675, + "step": 7218 + }, + { + "epoch": 0.5462525065264273, + "grad_norm": 1.9144799709320068, + "learning_rate": 8.367648427842842e-06, + "loss": 0.8562, + "step": 7219 + }, + { + "epoch": 0.5463281752487609, + "grad_norm": 2.2741446495056152, + "learning_rate": 8.365401796613534e-06, + "loss": 0.6291, + "step": 7220 + }, + { + "epoch": 0.5464038439710945, + "grad_norm": 2.282801866531372, + "learning_rate": 8.363155205124815e-06, + "loss": 0.6835, + "step": 7221 + }, + { + "epoch": 0.5464795126934282, + "grad_norm": 2.0239760875701904, + "learning_rate": 8.36090865351738e-06, + "loss": 0.606, + "step": 7222 + }, + { + "epoch": 0.5465551814157618, + "grad_norm": 2.0465428829193115, + "learning_rate": 8.358662141931906e-06, + "loss": 0.8214, + "step": 7223 + }, + { + "epoch": 0.5466308501380954, + "grad_norm": 1.8509844541549683, + "learning_rate": 8.356415670509085e-06, + "loss": 0.6769, + "step": 7224 + }, + { + "epoch": 0.5467065188604291, + "grad_norm": 2.0568501949310303, + "learning_rate": 8.3541692393896e-06, + "loss": 0.6617, + "step": 7225 + }, + { + "epoch": 0.5467821875827626, + "grad_norm": 2.1013340950012207, + "learning_rate": 8.351922848714125e-06, + "loss": 0.7638, + "step": 7226 + }, + { + "epoch": 0.5468578563050963, + "grad_norm": 2.2861831188201904, + "learning_rate": 8.349676498623337e-06, + "loss": 0.743, + "step": 7227 + }, + { + "epoch": 0.5469335250274299, + "grad_norm": 2.278883457183838, + "learning_rate": 8.347430189257907e-06, + "loss": 0.6331, + "step": 7228 + }, + { + "epoch": 0.5470091937497635, + "grad_norm": 1.7488274574279785, + "learning_rate": 8.345183920758512e-06, + "loss": 0.7192, + "step": 7229 + }, + { + "epoch": 0.5470848624720972, + "grad_norm": 1.8391131162643433, + "learning_rate": 8.342937693265819e-06, + "loss": 0.8068, + "step": 7230 + }, + { + "epoch": 0.5471605311944308, + "grad_norm": 2.0929038524627686, + "learning_rate": 8.340691506920491e-06, + "loss": 0.6009, + "step": 7231 + }, + { + "epoch": 0.5472361999167644, + "grad_norm": 2.610123872756958, + "learning_rate": 8.338445361863193e-06, + "loss": 0.841, + "step": 7232 + }, + { + "epoch": 0.547311868639098, + "grad_norm": 2.2190845012664795, + "learning_rate": 8.336199258234588e-06, + "loss": 0.7216, + "step": 7233 + }, + { + "epoch": 0.5473875373614316, + "grad_norm": 1.9688743352890015, + "learning_rate": 8.33395319617533e-06, + "loss": 0.63, + "step": 7234 + }, + { + "epoch": 0.5474632060837653, + "grad_norm": 2.508888006210327, + "learning_rate": 8.331707175826077e-06, + "loss": 0.6036, + "step": 7235 + }, + { + "epoch": 0.5475388748060989, + "grad_norm": 
2.6636786460876465, + "learning_rate": 8.329461197327484e-06, + "loss": 0.7058, + "step": 7236 + }, + { + "epoch": 0.5476145435284325, + "grad_norm": 2.2972769737243652, + "learning_rate": 8.3272152608202e-06, + "loss": 0.6575, + "step": 7237 + }, + { + "epoch": 0.5476902122507662, + "grad_norm": 2.279362440109253, + "learning_rate": 8.324969366444874e-06, + "loss": 0.6517, + "step": 7238 + }, + { + "epoch": 0.5477658809730998, + "grad_norm": 2.3690998554229736, + "learning_rate": 8.322723514342143e-06, + "loss": 0.6411, + "step": 7239 + }, + { + "epoch": 0.5478415496954334, + "grad_norm": 1.700605034828186, + "learning_rate": 8.320477704652662e-06, + "loss": 0.7621, + "step": 7240 + }, + { + "epoch": 0.547917218417767, + "grad_norm": 2.426114320755005, + "learning_rate": 8.318231937517063e-06, + "loss": 0.6546, + "step": 7241 + }, + { + "epoch": 0.5479928871401006, + "grad_norm": 2.4113972187042236, + "learning_rate": 8.315986213075986e-06, + "loss": 0.6745, + "step": 7242 + }, + { + "epoch": 0.5480685558624343, + "grad_norm": 2.2647318840026855, + "learning_rate": 8.313740531470065e-06, + "loss": 0.8873, + "step": 7243 + }, + { + "epoch": 0.5481442245847679, + "grad_norm": 2.242077112197876, + "learning_rate": 8.311494892839929e-06, + "loss": 0.737, + "step": 7244 + }, + { + "epoch": 0.5482198933071015, + "grad_norm": 3.1310784816741943, + "learning_rate": 8.30924929732621e-06, + "loss": 0.8376, + "step": 7245 + }, + { + "epoch": 0.5482955620294351, + "grad_norm": 2.1693949699401855, + "learning_rate": 8.307003745069537e-06, + "loss": 0.7175, + "step": 7246 + }, + { + "epoch": 0.5483712307517687, + "grad_norm": 2.6031949520111084, + "learning_rate": 8.30475823621053e-06, + "loss": 0.5512, + "step": 7247 + }, + { + "epoch": 0.5484468994741024, + "grad_norm": 2.4191761016845703, + "learning_rate": 8.30251277088981e-06, + "loss": 0.6359, + "step": 7248 + }, + { + "epoch": 0.548522568196436, + "grad_norm": 1.9725650548934937, + "learning_rate": 8.300267349247993e-06, + "loss": 0.7108, + "step": 7249 + }, + { + "epoch": 0.5485982369187696, + "grad_norm": 2.7272510528564453, + "learning_rate": 8.298021971425704e-06, + "loss": 0.6975, + "step": 7250 + }, + { + "epoch": 0.5486739056411033, + "grad_norm": 2.1538898944854736, + "learning_rate": 8.295776637563546e-06, + "loss": 0.8058, + "step": 7251 + }, + { + "epoch": 0.5487495743634369, + "grad_norm": 2.162531852722168, + "learning_rate": 8.293531347802136e-06, + "loss": 0.7727, + "step": 7252 + }, + { + "epoch": 0.5488252430857705, + "grad_norm": 2.4503231048583984, + "learning_rate": 8.291286102282076e-06, + "loss": 0.7097, + "step": 7253 + }, + { + "epoch": 0.5489009118081041, + "grad_norm": 2.264075994491577, + "learning_rate": 8.289040901143969e-06, + "loss": 0.5951, + "step": 7254 + }, + { + "epoch": 0.5489765805304377, + "grad_norm": 2.111943244934082, + "learning_rate": 8.286795744528425e-06, + "loss": 0.6764, + "step": 7255 + }, + { + "epoch": 0.5490522492527714, + "grad_norm": 1.6367228031158447, + "learning_rate": 8.284550632576037e-06, + "loss": 0.599, + "step": 7256 + }, + { + "epoch": 0.549127917975105, + "grad_norm": 2.353544235229492, + "learning_rate": 8.282305565427402e-06, + "loss": 0.6703, + "step": 7257 + }, + { + "epoch": 0.5492035866974386, + "grad_norm": 2.2156119346618652, + "learning_rate": 8.280060543223115e-06, + "loss": 0.6398, + "step": 7258 + }, + { + "epoch": 0.5492792554197722, + "grad_norm": 2.4467849731445312, + "learning_rate": 8.27781556610376e-06, + "loss": 0.705, + "step": 7259 + }, + { + "epoch": 
0.5493549241421058, + "grad_norm": 1.8530616760253906, + "learning_rate": 8.275570634209936e-06, + "loss": 0.7576, + "step": 7260 + }, + { + "epoch": 0.5494305928644395, + "grad_norm": 2.1063315868377686, + "learning_rate": 8.273325747682223e-06, + "loss": 0.6817, + "step": 7261 + }, + { + "epoch": 0.5495062615867731, + "grad_norm": 3.19496488571167, + "learning_rate": 8.271080906661197e-06, + "loss": 0.7781, + "step": 7262 + }, + { + "epoch": 0.5495819303091067, + "grad_norm": 1.902632713317871, + "learning_rate": 8.268836111287447e-06, + "loss": 0.6575, + "step": 7263 + }, + { + "epoch": 0.5496575990314404, + "grad_norm": 1.7759108543395996, + "learning_rate": 8.26659136170154e-06, + "loss": 0.6641, + "step": 7264 + }, + { + "epoch": 0.549733267753774, + "grad_norm": 1.6828055381774902, + "learning_rate": 8.264346658044056e-06, + "loss": 0.6752, + "step": 7265 + }, + { + "epoch": 0.5498089364761076, + "grad_norm": 2.365657329559326, + "learning_rate": 8.262102000455565e-06, + "loss": 0.7019, + "step": 7266 + }, + { + "epoch": 0.5498846051984412, + "grad_norm": 1.8660743236541748, + "learning_rate": 8.259857389076632e-06, + "loss": 0.6905, + "step": 7267 + }, + { + "epoch": 0.5499602739207748, + "grad_norm": 2.5874767303466797, + "learning_rate": 8.257612824047825e-06, + "loss": 0.7434, + "step": 7268 + }, + { + "epoch": 0.5500359426431085, + "grad_norm": 1.9300464391708374, + "learning_rate": 8.255368305509703e-06, + "loss": 0.7444, + "step": 7269 + }, + { + "epoch": 0.5501116113654421, + "grad_norm": 2.1249427795410156, + "learning_rate": 8.253123833602823e-06, + "loss": 0.675, + "step": 7270 + }, + { + "epoch": 0.5501872800877757, + "grad_norm": 2.6504623889923096, + "learning_rate": 8.25087940846775e-06, + "loss": 0.6052, + "step": 7271 + }, + { + "epoch": 0.5502629488101093, + "grad_norm": 1.6666854619979858, + "learning_rate": 8.248635030245026e-06, + "loss": 0.9488, + "step": 7272 + }, + { + "epoch": 0.550338617532443, + "grad_norm": 1.920323133468628, + "learning_rate": 8.246390699075211e-06, + "loss": 0.7232, + "step": 7273 + }, + { + "epoch": 0.5504142862547766, + "grad_norm": 1.9666804075241089, + "learning_rate": 8.244146415098847e-06, + "loss": 0.6699, + "step": 7274 + }, + { + "epoch": 0.5504899549771102, + "grad_norm": 2.455787181854248, + "learning_rate": 8.241902178456474e-06, + "loss": 0.7122, + "step": 7275 + }, + { + "epoch": 0.5505656236994438, + "grad_norm": 2.3213722705841064, + "learning_rate": 8.239657989288643e-06, + "loss": 0.8173, + "step": 7276 + }, + { + "epoch": 0.5506412924217775, + "grad_norm": 1.930747628211975, + "learning_rate": 8.23741384773589e-06, + "loss": 0.646, + "step": 7277 + }, + { + "epoch": 0.5507169611441111, + "grad_norm": 2.0072009563446045, + "learning_rate": 8.235169753938745e-06, + "loss": 0.7275, + "step": 7278 + }, + { + "epoch": 0.5507926298664447, + "grad_norm": 2.398528814315796, + "learning_rate": 8.232925708037748e-06, + "loss": 0.6895, + "step": 7279 + }, + { + "epoch": 0.5508682985887783, + "grad_norm": 2.015350103378296, + "learning_rate": 8.230681710173418e-06, + "loss": 0.6356, + "step": 7280 + }, + { + "epoch": 0.550943967311112, + "grad_norm": 2.454782485961914, + "learning_rate": 8.22843776048629e-06, + "loss": 0.7882, + "step": 7281 + }, + { + "epoch": 0.5510196360334456, + "grad_norm": 2.29709529876709, + "learning_rate": 8.226193859116887e-06, + "loss": 0.6594, + "step": 7282 + }, + { + "epoch": 0.5510953047557792, + "grad_norm": 2.140209197998047, + "learning_rate": 8.223950006205725e-06, + "loss": 0.6665, + "step": 
7283 + }, + { + "epoch": 0.5511709734781128, + "grad_norm": 1.968939185142517, + "learning_rate": 8.221706201893326e-06, + "loss": 0.7324, + "step": 7284 + }, + { + "epoch": 0.5512466422004464, + "grad_norm": 1.8887324333190918, + "learning_rate": 8.219462446320199e-06, + "loss": 0.5153, + "step": 7285 + }, + { + "epoch": 0.5513223109227801, + "grad_norm": 2.2595646381378174, + "learning_rate": 8.21721873962686e-06, + "loss": 0.8726, + "step": 7286 + }, + { + "epoch": 0.5513979796451137, + "grad_norm": 5.0413899421691895, + "learning_rate": 8.214975081953816e-06, + "loss": 0.7368, + "step": 7287 + }, + { + "epoch": 0.5514736483674473, + "grad_norm": 2.2042415142059326, + "learning_rate": 8.21273147344157e-06, + "loss": 0.6438, + "step": 7288 + }, + { + "epoch": 0.5515493170897809, + "grad_norm": 2.034449815750122, + "learning_rate": 8.210487914230627e-06, + "loss": 0.655, + "step": 7289 + }, + { + "epoch": 0.5516249858121146, + "grad_norm": 2.396224021911621, + "learning_rate": 8.208244404461479e-06, + "loss": 0.6204, + "step": 7290 + }, + { + "epoch": 0.5517006545344482, + "grad_norm": 2.4055793285369873, + "learning_rate": 8.206000944274634e-06, + "loss": 0.7578, + "step": 7291 + }, + { + "epoch": 0.5517763232567818, + "grad_norm": 2.2324295043945312, + "learning_rate": 8.203757533810575e-06, + "loss": 0.6712, + "step": 7292 + }, + { + "epoch": 0.5518519919791154, + "grad_norm": 2.729771614074707, + "learning_rate": 8.201514173209797e-06, + "loss": 0.7397, + "step": 7293 + }, + { + "epoch": 0.551927660701449, + "grad_norm": 1.7570246458053589, + "learning_rate": 8.199270862612781e-06, + "loss": 0.7682, + "step": 7294 + }, + { + "epoch": 0.5520033294237827, + "grad_norm": 2.058879852294922, + "learning_rate": 8.197027602160013e-06, + "loss": 0.5722, + "step": 7295 + }, + { + "epoch": 0.5520789981461163, + "grad_norm": 2.095630645751953, + "learning_rate": 8.194784391991977e-06, + "loss": 0.7722, + "step": 7296 + }, + { + "epoch": 0.5521546668684499, + "grad_norm": 2.408841371536255, + "learning_rate": 8.192541232249145e-06, + "loss": 0.7326, + "step": 7297 + }, + { + "epoch": 0.5522303355907835, + "grad_norm": 2.27298641204834, + "learning_rate": 8.190298123071993e-06, + "loss": 0.7144, + "step": 7298 + }, + { + "epoch": 0.5523060043131172, + "grad_norm": 2.675877809524536, + "learning_rate": 8.188055064600991e-06, + "loss": 0.874, + "step": 7299 + }, + { + "epoch": 0.5523816730354508, + "grad_norm": 2.062973976135254, + "learning_rate": 8.185812056976605e-06, + "loss": 0.6534, + "step": 7300 + }, + { + "epoch": 0.5524573417577844, + "grad_norm": 2.2476651668548584, + "learning_rate": 8.183569100339305e-06, + "loss": 0.8483, + "step": 7301 + }, + { + "epoch": 0.552533010480118, + "grad_norm": 2.406301259994507, + "learning_rate": 8.181326194829548e-06, + "loss": 0.8172, + "step": 7302 + }, + { + "epoch": 0.5526086792024517, + "grad_norm": 2.0138354301452637, + "learning_rate": 8.179083340587794e-06, + "loss": 0.6932, + "step": 7303 + }, + { + "epoch": 0.5526843479247853, + "grad_norm": 1.9023960828781128, + "learning_rate": 8.176840537754493e-06, + "loss": 0.643, + "step": 7304 + }, + { + "epoch": 0.5527600166471189, + "grad_norm": 2.067840576171875, + "learning_rate": 8.1745977864701e-06, + "loss": 0.8828, + "step": 7305 + }, + { + "epoch": 0.5528356853694525, + "grad_norm": 2.148322105407715, + "learning_rate": 8.172355086875064e-06, + "loss": 0.7668, + "step": 7306 + }, + { + "epoch": 0.5529113540917862, + "grad_norm": 2.0931055545806885, + "learning_rate": 8.17011243910983e-06, + 
"loss": 0.6585, + "step": 7307 + }, + { + "epoch": 0.5529870228141198, + "grad_norm": 1.9692755937576294, + "learning_rate": 8.167869843314839e-06, + "loss": 0.7746, + "step": 7308 + }, + { + "epoch": 0.5530626915364534, + "grad_norm": 2.0854835510253906, + "learning_rate": 8.165627299630532e-06, + "loss": 0.7212, + "step": 7309 + }, + { + "epoch": 0.553138360258787, + "grad_norm": 2.4114372730255127, + "learning_rate": 8.163384808197339e-06, + "loss": 0.7573, + "step": 7310 + }, + { + "epoch": 0.5532140289811207, + "grad_norm": 2.3823318481445312, + "learning_rate": 8.161142369155693e-06, + "loss": 0.6895, + "step": 7311 + }, + { + "epoch": 0.5532896977034543, + "grad_norm": 2.2626543045043945, + "learning_rate": 8.158899982646032e-06, + "loss": 0.7496, + "step": 7312 + }, + { + "epoch": 0.5533653664257879, + "grad_norm": 2.219508647918701, + "learning_rate": 8.156657648808769e-06, + "loss": 0.6999, + "step": 7313 + }, + { + "epoch": 0.5534410351481215, + "grad_norm": 2.2591121196746826, + "learning_rate": 8.154415367784335e-06, + "loss": 0.7347, + "step": 7314 + }, + { + "epoch": 0.5535167038704552, + "grad_norm": 2.205310344696045, + "learning_rate": 8.152173139713146e-06, + "loss": 0.7123, + "step": 7315 + }, + { + "epoch": 0.5535923725927888, + "grad_norm": 2.087782382965088, + "learning_rate": 8.149930964735612e-06, + "loss": 0.6367, + "step": 7316 + }, + { + "epoch": 0.5536680413151224, + "grad_norm": 2.4498722553253174, + "learning_rate": 8.147688842992155e-06, + "loss": 0.614, + "step": 7317 + }, + { + "epoch": 0.553743710037456, + "grad_norm": 3.0710363388061523, + "learning_rate": 8.14544677462318e-06, + "loss": 0.7032, + "step": 7318 + }, + { + "epoch": 0.5538193787597896, + "grad_norm": 2.3299739360809326, + "learning_rate": 8.14320475976909e-06, + "loss": 0.854, + "step": 7319 + }, + { + "epoch": 0.5538950474821233, + "grad_norm": 1.8151921033859253, + "learning_rate": 8.140962798570289e-06, + "loss": 0.5846, + "step": 7320 + }, + { + "epoch": 0.5539707162044569, + "grad_norm": 2.084280014038086, + "learning_rate": 8.138720891167174e-06, + "loss": 0.713, + "step": 7321 + }, + { + "epoch": 0.5540463849267905, + "grad_norm": 2.037383794784546, + "learning_rate": 8.136479037700146e-06, + "loss": 0.6499, + "step": 7322 + }, + { + "epoch": 0.5541220536491241, + "grad_norm": 2.4484384059906006, + "learning_rate": 8.134237238309593e-06, + "loss": 0.7387, + "step": 7323 + }, + { + "epoch": 0.5541977223714578, + "grad_norm": 2.2470922470092773, + "learning_rate": 8.131995493135903e-06, + "loss": 0.7522, + "step": 7324 + }, + { + "epoch": 0.5542733910937914, + "grad_norm": 2.310148239135742, + "learning_rate": 8.129753802319467e-06, + "loss": 0.7125, + "step": 7325 + }, + { + "epoch": 0.554349059816125, + "grad_norm": 1.8426295518875122, + "learning_rate": 8.127512166000656e-06, + "loss": 0.8074, + "step": 7326 + }, + { + "epoch": 0.5544247285384586, + "grad_norm": 1.8981022834777832, + "learning_rate": 8.125270584319857e-06, + "loss": 0.5748, + "step": 7327 + }, + { + "epoch": 0.5545003972607923, + "grad_norm": 2.154419183731079, + "learning_rate": 8.123029057417446e-06, + "loss": 0.7373, + "step": 7328 + }, + { + "epoch": 0.5545760659831259, + "grad_norm": 1.6180402040481567, + "learning_rate": 8.12078758543379e-06, + "loss": 0.6536, + "step": 7329 + }, + { + "epoch": 0.5546517347054595, + "grad_norm": 2.104201555252075, + "learning_rate": 8.11854616850926e-06, + "loss": 0.7068, + "step": 7330 + }, + { + "epoch": 0.5547274034277931, + "grad_norm": 1.8646724224090576, + 
"learning_rate": 8.116304806784218e-06, + "loss": 0.5606, + "step": 7331 + }, + { + "epoch": 0.5548030721501267, + "grad_norm": 2.118800163269043, + "learning_rate": 8.11406350039903e-06, + "loss": 0.7093, + "step": 7332 + }, + { + "epoch": 0.5548787408724604, + "grad_norm": 2.5664494037628174, + "learning_rate": 8.11182224949405e-06, + "loss": 0.7353, + "step": 7333 + }, + { + "epoch": 0.554954409594794, + "grad_norm": 2.0426931381225586, + "learning_rate": 8.109581054209633e-06, + "loss": 0.6415, + "step": 7334 + }, + { + "epoch": 0.5550300783171276, + "grad_norm": 1.8999990224838257, + "learning_rate": 8.10733991468613e-06, + "loss": 0.6621, + "step": 7335 + }, + { + "epoch": 0.5551057470394613, + "grad_norm": 2.4535303115844727, + "learning_rate": 8.105098831063887e-06, + "loss": 0.6392, + "step": 7336 + }, + { + "epoch": 0.5551814157617949, + "grad_norm": 1.9093875885009766, + "learning_rate": 8.102857803483254e-06, + "loss": 0.6715, + "step": 7337 + }, + { + "epoch": 0.5552570844841285, + "grad_norm": 1.9944705963134766, + "learning_rate": 8.100616832084564e-06, + "loss": 0.6541, + "step": 7338 + }, + { + "epoch": 0.5553327532064621, + "grad_norm": 1.7408812046051025, + "learning_rate": 8.09837591700816e-06, + "loss": 0.7232, + "step": 7339 + }, + { + "epoch": 0.5554084219287957, + "grad_norm": 2.030344247817993, + "learning_rate": 8.09613505839437e-06, + "loss": 0.7087, + "step": 7340 + }, + { + "epoch": 0.5554840906511294, + "grad_norm": 1.8891116380691528, + "learning_rate": 8.093894256383525e-06, + "loss": 0.737, + "step": 7341 + }, + { + "epoch": 0.555559759373463, + "grad_norm": 1.9823884963989258, + "learning_rate": 8.091653511115954e-06, + "loss": 0.7039, + "step": 7342 + }, + { + "epoch": 0.5556354280957966, + "grad_norm": 2.3397977352142334, + "learning_rate": 8.089412822731979e-06, + "loss": 0.7968, + "step": 7343 + }, + { + "epoch": 0.5557110968181302, + "grad_norm": 1.816161870956421, + "learning_rate": 8.087172191371917e-06, + "loss": 0.6302, + "step": 7344 + }, + { + "epoch": 0.5557867655404638, + "grad_norm": 2.0150673389434814, + "learning_rate": 8.084931617176084e-06, + "loss": 0.5506, + "step": 7345 + }, + { + "epoch": 0.5558624342627975, + "grad_norm": 2.0042967796325684, + "learning_rate": 8.082691100284796e-06, + "loss": 0.6103, + "step": 7346 + }, + { + "epoch": 0.5559381029851311, + "grad_norm": 1.9429584741592407, + "learning_rate": 8.080450640838353e-06, + "loss": 0.8005, + "step": 7347 + }, + { + "epoch": 0.5560137717074647, + "grad_norm": 1.7641698122024536, + "learning_rate": 8.078210238977067e-06, + "loss": 0.6253, + "step": 7348 + }, + { + "epoch": 0.5560894404297984, + "grad_norm": 3.233842372894287, + "learning_rate": 8.075969894841239e-06, + "loss": 0.6349, + "step": 7349 + }, + { + "epoch": 0.556165109152132, + "grad_norm": 1.9746023416519165, + "learning_rate": 8.073729608571166e-06, + "loss": 0.6736, + "step": 7350 + }, + { + "epoch": 0.5562407778744656, + "grad_norm": 2.0112850666046143, + "learning_rate": 8.071489380307138e-06, + "loss": 0.7653, + "step": 7351 + }, + { + "epoch": 0.5563164465967992, + "grad_norm": 2.242575168609619, + "learning_rate": 8.069249210189447e-06, + "loss": 0.741, + "step": 7352 + }, + { + "epoch": 0.5563921153191328, + "grad_norm": 1.929771065711975, + "learning_rate": 8.067009098358384e-06, + "loss": 0.6086, + "step": 7353 + }, + { + "epoch": 0.5564677840414665, + "grad_norm": 2.020859718322754, + "learning_rate": 8.064769044954229e-06, + "loss": 0.7699, + "step": 7354 + }, + { + "epoch": 0.5565434527638001, + 
"grad_norm": 2.042544364929199, + "learning_rate": 8.06252905011726e-06, + "loss": 0.6731, + "step": 7355 + }, + { + "epoch": 0.5566191214861337, + "grad_norm": 2.6606943607330322, + "learning_rate": 8.060289113987754e-06, + "loss": 0.6402, + "step": 7356 + }, + { + "epoch": 0.5566947902084673, + "grad_norm": 1.789626955986023, + "learning_rate": 8.058049236705982e-06, + "loss": 0.8114, + "step": 7357 + }, + { + "epoch": 0.5567704589308009, + "grad_norm": 1.9699000120162964, + "learning_rate": 8.055809418412215e-06, + "loss": 0.7457, + "step": 7358 + }, + { + "epoch": 0.5568461276531346, + "grad_norm": 2.0746593475341797, + "learning_rate": 8.053569659246716e-06, + "loss": 0.855, + "step": 7359 + }, + { + "epoch": 0.5569217963754682, + "grad_norm": 2.2033884525299072, + "learning_rate": 8.051329959349748e-06, + "loss": 0.6439, + "step": 7360 + }, + { + "epoch": 0.5569974650978018, + "grad_norm": 1.9378942251205444, + "learning_rate": 8.049090318861563e-06, + "loss": 0.7654, + "step": 7361 + }, + { + "epoch": 0.5570731338201355, + "grad_norm": 2.35151743888855, + "learning_rate": 8.046850737922418e-06, + "loss": 0.7722, + "step": 7362 + }, + { + "epoch": 0.5571488025424691, + "grad_norm": 2.5247981548309326, + "learning_rate": 8.044611216672562e-06, + "loss": 0.6724, + "step": 7363 + }, + { + "epoch": 0.5572244712648027, + "grad_norm": 2.331882953643799, + "learning_rate": 8.042371755252245e-06, + "loss": 0.6965, + "step": 7364 + }, + { + "epoch": 0.5573001399871363, + "grad_norm": 2.167255401611328, + "learning_rate": 8.040132353801705e-06, + "loss": 0.7462, + "step": 7365 + }, + { + "epoch": 0.5573758087094699, + "grad_norm": 2.5408060550689697, + "learning_rate": 8.037893012461182e-06, + "loss": 0.6558, + "step": 7366 + }, + { + "epoch": 0.5574514774318036, + "grad_norm": 1.810550332069397, + "learning_rate": 8.035653731370906e-06, + "loss": 0.6378, + "step": 7367 + }, + { + "epoch": 0.5575271461541372, + "grad_norm": 2.1500096321105957, + "learning_rate": 8.033414510671117e-06, + "loss": 0.6658, + "step": 7368 + }, + { + "epoch": 0.5576028148764708, + "grad_norm": 2.4346792697906494, + "learning_rate": 8.031175350502037e-06, + "loss": 0.6172, + "step": 7369 + }, + { + "epoch": 0.5576784835988045, + "grad_norm": 2.380704641342163, + "learning_rate": 8.02893625100389e-06, + "loss": 0.6924, + "step": 7370 + }, + { + "epoch": 0.557754152321138, + "grad_norm": 1.7149864435195923, + "learning_rate": 8.026697212316896e-06, + "loss": 0.7118, + "step": 7371 + }, + { + "epoch": 0.5578298210434717, + "grad_norm": 1.924574851989746, + "learning_rate": 8.02445823458127e-06, + "loss": 0.6955, + "step": 7372 + }, + { + "epoch": 0.5579054897658053, + "grad_norm": 2.193007707595825, + "learning_rate": 8.022219317937223e-06, + "loss": 0.6082, + "step": 7373 + }, + { + "epoch": 0.5579811584881389, + "grad_norm": 1.9797512292861938, + "learning_rate": 8.019980462524968e-06, + "loss": 0.84, + "step": 7374 + }, + { + "epoch": 0.5580568272104726, + "grad_norm": 2.113743543624878, + "learning_rate": 8.017741668484704e-06, + "loss": 0.8963, + "step": 7375 + }, + { + "epoch": 0.5581324959328062, + "grad_norm": 2.2391629219055176, + "learning_rate": 8.015502935956636e-06, + "loss": 0.7656, + "step": 7376 + }, + { + "epoch": 0.5582081646551398, + "grad_norm": 2.6514220237731934, + "learning_rate": 8.013264265080955e-06, + "loss": 0.673, + "step": 7377 + }, + { + "epoch": 0.5582838333774734, + "grad_norm": 2.256282091140747, + "learning_rate": 8.011025655997858e-06, + "loss": 0.7703, + "step": 7378 + }, + { + 
"epoch": 0.558359502099807, + "grad_norm": 2.2570066452026367, + "learning_rate": 8.008787108847533e-06, + "loss": 0.7589, + "step": 7379 + }, + { + "epoch": 0.5584351708221407, + "grad_norm": 2.5450544357299805, + "learning_rate": 8.006548623770168e-06, + "loss": 0.6944, + "step": 7380 + }, + { + "epoch": 0.5585108395444743, + "grad_norm": 2.3593268394470215, + "learning_rate": 8.00431020090594e-06, + "loss": 0.6387, + "step": 7381 + }, + { + "epoch": 0.5585865082668079, + "grad_norm": 1.9944931268692017, + "learning_rate": 8.002071840395026e-06, + "loss": 0.6667, + "step": 7382 + }, + { + "epoch": 0.5586621769891416, + "grad_norm": 1.908489465713501, + "learning_rate": 7.999833542377605e-06, + "loss": 0.635, + "step": 7383 + }, + { + "epoch": 0.5587378457114751, + "grad_norm": 2.8577051162719727, + "learning_rate": 7.997595306993838e-06, + "loss": 0.6809, + "step": 7384 + }, + { + "epoch": 0.5588135144338088, + "grad_norm": 2.8183300495147705, + "learning_rate": 7.995357134383898e-06, + "loss": 0.7129, + "step": 7385 + }, + { + "epoch": 0.5588891831561424, + "grad_norm": 2.3807883262634277, + "learning_rate": 7.993119024687943e-06, + "loss": 0.6761, + "step": 7386 + }, + { + "epoch": 0.558964851878476, + "grad_norm": 1.9848699569702148, + "learning_rate": 7.990880978046132e-06, + "loss": 0.5648, + "step": 7387 + }, + { + "epoch": 0.5590405206008097, + "grad_norm": 1.9537297487258911, + "learning_rate": 7.988642994598616e-06, + "loss": 0.6346, + "step": 7388 + }, + { + "epoch": 0.5591161893231433, + "grad_norm": 2.2345707416534424, + "learning_rate": 7.986405074485547e-06, + "loss": 0.6992, + "step": 7389 + }, + { + "epoch": 0.5591918580454769, + "grad_norm": 1.7011851072311401, + "learning_rate": 7.984167217847072e-06, + "loss": 0.6245, + "step": 7390 + }, + { + "epoch": 0.5592675267678106, + "grad_norm": 1.5342859029769897, + "learning_rate": 7.98192942482333e-06, + "loss": 0.71, + "step": 7391 + }, + { + "epoch": 0.5593431954901441, + "grad_norm": 1.9042041301727295, + "learning_rate": 7.979691695554464e-06, + "loss": 0.8046, + "step": 7392 + }, + { + "epoch": 0.5594188642124778, + "grad_norm": 4.508663654327393, + "learning_rate": 7.977454030180597e-06, + "loss": 0.661, + "step": 7393 + }, + { + "epoch": 0.5594945329348114, + "grad_norm": 2.309535503387451, + "learning_rate": 7.975216428841871e-06, + "loss": 0.7014, + "step": 7394 + }, + { + "epoch": 0.559570201657145, + "grad_norm": 2.226212739944458, + "learning_rate": 7.972978891678407e-06, + "loss": 0.6736, + "step": 7395 + }, + { + "epoch": 0.5596458703794787, + "grad_norm": 2.99662446975708, + "learning_rate": 7.970741418830327e-06, + "loss": 0.7326, + "step": 7396 + }, + { + "epoch": 0.5597215391018122, + "grad_norm": 2.0158193111419678, + "learning_rate": 7.968504010437746e-06, + "loss": 0.7136, + "step": 7397 + }, + { + "epoch": 0.5597972078241459, + "grad_norm": 2.631486654281616, + "learning_rate": 7.966266666640778e-06, + "loss": 0.7801, + "step": 7398 + }, + { + "epoch": 0.5598728765464795, + "grad_norm": 2.236232280731201, + "learning_rate": 7.96402938757954e-06, + "loss": 0.7771, + "step": 7399 + }, + { + "epoch": 0.5599485452688131, + "grad_norm": 1.8148475885391235, + "learning_rate": 7.96179217339413e-06, + "loss": 0.6843, + "step": 7400 + }, + { + "epoch": 0.5600242139911468, + "grad_norm": 2.4410221576690674, + "learning_rate": 7.959555024224654e-06, + "loss": 0.606, + "step": 7401 + }, + { + "epoch": 0.5600998827134804, + "grad_norm": 1.8520560264587402, + "learning_rate": 7.957317940211205e-06, + "loss": 0.6385, 
+ "step": 7402 + }, + { + "epoch": 0.560175551435814, + "grad_norm": 3.167067050933838, + "learning_rate": 7.955080921493879e-06, + "loss": 0.7738, + "step": 7403 + }, + { + "epoch": 0.5602512201581477, + "grad_norm": 1.9839564561843872, + "learning_rate": 7.952843968212768e-06, + "loss": 0.8493, + "step": 7404 + }, + { + "epoch": 0.5603268888804812, + "grad_norm": 3.0600149631500244, + "learning_rate": 7.950607080507951e-06, + "loss": 0.6369, + "step": 7405 + }, + { + "epoch": 0.5604025576028149, + "grad_norm": 2.1107337474823, + "learning_rate": 7.948370258519519e-06, + "loss": 0.6013, + "step": 7406 + }, + { + "epoch": 0.5604782263251485, + "grad_norm": 2.616274118423462, + "learning_rate": 7.946133502387537e-06, + "loss": 0.731, + "step": 7407 + }, + { + "epoch": 0.5605538950474821, + "grad_norm": 2.0208301544189453, + "learning_rate": 7.943896812252083e-06, + "loss": 0.6894, + "step": 7408 + }, + { + "epoch": 0.5606295637698158, + "grad_norm": 4.073803424835205, + "learning_rate": 7.941660188253228e-06, + "loss": 0.5672, + "step": 7409 + }, + { + "epoch": 0.5607052324921493, + "grad_norm": 2.3524041175842285, + "learning_rate": 7.939423630531038e-06, + "loss": 0.7285, + "step": 7410 + }, + { + "epoch": 0.560780901214483, + "grad_norm": 2.553096294403076, + "learning_rate": 7.937187139225567e-06, + "loss": 0.708, + "step": 7411 + }, + { + "epoch": 0.5608565699368167, + "grad_norm": 2.174004554748535, + "learning_rate": 7.93495071447688e-06, + "loss": 0.6833, + "step": 7412 + }, + { + "epoch": 0.5609322386591502, + "grad_norm": 2.332759380340576, + "learning_rate": 7.932714356425018e-06, + "loss": 0.6054, + "step": 7413 + }, + { + "epoch": 0.5610079073814839, + "grad_norm": 2.4970386028289795, + "learning_rate": 7.930478065210035e-06, + "loss": 0.7808, + "step": 7414 + }, + { + "epoch": 0.5610835761038175, + "grad_norm": 1.955387830734253, + "learning_rate": 7.92824184097198e-06, + "loss": 0.7495, + "step": 7415 + }, + { + "epoch": 0.5611592448261511, + "grad_norm": 1.921034574508667, + "learning_rate": 7.926005683850883e-06, + "loss": 0.7805, + "step": 7416 + }, + { + "epoch": 0.5612349135484848, + "grad_norm": 2.5919716358184814, + "learning_rate": 7.923769593986788e-06, + "loss": 0.6509, + "step": 7417 + }, + { + "epoch": 0.5613105822708183, + "grad_norm": 2.1961042881011963, + "learning_rate": 7.921533571519717e-06, + "loss": 0.6006, + "step": 7418 + }, + { + "epoch": 0.561386250993152, + "grad_norm": 2.0641181468963623, + "learning_rate": 7.919297616589703e-06, + "loss": 0.6438, + "step": 7419 + }, + { + "epoch": 0.5614619197154856, + "grad_norm": 2.4737486839294434, + "learning_rate": 7.917061729336771e-06, + "loss": 0.7045, + "step": 7420 + }, + { + "epoch": 0.5615375884378192, + "grad_norm": 2.176301956176758, + "learning_rate": 7.914825909900935e-06, + "loss": 0.7689, + "step": 7421 + }, + { + "epoch": 0.5616132571601529, + "grad_norm": 2.713766574859619, + "learning_rate": 7.91259015842221e-06, + "loss": 0.7377, + "step": 7422 + }, + { + "epoch": 0.5616889258824864, + "grad_norm": 1.8123505115509033, + "learning_rate": 7.910354475040606e-06, + "loss": 0.7732, + "step": 7423 + }, + { + "epoch": 0.5617645946048201, + "grad_norm": 2.313922643661499, + "learning_rate": 7.908118859896127e-06, + "loss": 0.658, + "step": 7424 + }, + { + "epoch": 0.5618402633271538, + "grad_norm": 2.6944148540496826, + "learning_rate": 7.905883313128779e-06, + "loss": 0.6323, + "step": 7425 + }, + { + "epoch": 0.5619159320494873, + "grad_norm": 2.4101080894470215, + "learning_rate": 
7.903647834878557e-06, + "loss": 0.7163, + "step": 7426 + }, + { + "epoch": 0.561991600771821, + "grad_norm": 2.65920352935791, + "learning_rate": 7.901412425285453e-06, + "loss": 0.776, + "step": 7427 + }, + { + "epoch": 0.5620672694941546, + "grad_norm": 2.0206611156463623, + "learning_rate": 7.899177084489457e-06, + "loss": 0.6004, + "step": 7428 + }, + { + "epoch": 0.5621429382164882, + "grad_norm": 2.1238224506378174, + "learning_rate": 7.89694181263055e-06, + "loss": 0.7055, + "step": 7429 + }, + { + "epoch": 0.5622186069388219, + "grad_norm": 2.1638028621673584, + "learning_rate": 7.894706609848717e-06, + "loss": 0.7816, + "step": 7430 + }, + { + "epoch": 0.5622942756611554, + "grad_norm": 2.223090171813965, + "learning_rate": 7.89247147628393e-06, + "loss": 0.6012, + "step": 7431 + }, + { + "epoch": 0.5623699443834891, + "grad_norm": 3.290097951889038, + "learning_rate": 7.890236412076162e-06, + "loss": 0.7829, + "step": 7432 + }, + { + "epoch": 0.5624456131058228, + "grad_norm": 1.7183538675308228, + "learning_rate": 7.888001417365379e-06, + "loss": 0.6503, + "step": 7433 + }, + { + "epoch": 0.5625212818281563, + "grad_norm": 2.0650956630706787, + "learning_rate": 7.885766492291543e-06, + "loss": 0.7117, + "step": 7434 + }, + { + "epoch": 0.56259695055049, + "grad_norm": 2.289998769760132, + "learning_rate": 7.883531636994612e-06, + "loss": 0.8198, + "step": 7435 + }, + { + "epoch": 0.5626726192728235, + "grad_norm": 2.176126718521118, + "learning_rate": 7.881296851614544e-06, + "loss": 0.7623, + "step": 7436 + }, + { + "epoch": 0.5627482879951572, + "grad_norm": 2.0864546298980713, + "learning_rate": 7.879062136291284e-06, + "loss": 0.7213, + "step": 7437 + }, + { + "epoch": 0.5628239567174909, + "grad_norm": 2.1789047718048096, + "learning_rate": 7.876827491164778e-06, + "loss": 0.638, + "step": 7438 + }, + { + "epoch": 0.5628996254398244, + "grad_norm": 2.4413700103759766, + "learning_rate": 7.874592916374966e-06, + "loss": 0.5869, + "step": 7439 + }, + { + "epoch": 0.5629752941621581, + "grad_norm": 2.1864519119262695, + "learning_rate": 7.87235841206179e-06, + "loss": 0.7605, + "step": 7440 + }, + { + "epoch": 0.5630509628844917, + "grad_norm": 2.2284204959869385, + "learning_rate": 7.870123978365174e-06, + "loss": 0.7422, + "step": 7441 + }, + { + "epoch": 0.5631266316068253, + "grad_norm": 1.9402129650115967, + "learning_rate": 7.867889615425052e-06, + "loss": 0.6012, + "step": 7442 + }, + { + "epoch": 0.563202300329159, + "grad_norm": 1.827885627746582, + "learning_rate": 7.865655323381342e-06, + "loss": 0.6708, + "step": 7443 + }, + { + "epoch": 0.5632779690514925, + "grad_norm": 2.1613986492156982, + "learning_rate": 7.863421102373963e-06, + "loss": 0.5724, + "step": 7444 + }, + { + "epoch": 0.5633536377738262, + "grad_norm": 2.0706324577331543, + "learning_rate": 7.861186952542832e-06, + "loss": 0.8642, + "step": 7445 + }, + { + "epoch": 0.5634293064961599, + "grad_norm": 2.373830795288086, + "learning_rate": 7.85895287402786e-06, + "loss": 0.6601, + "step": 7446 + }, + { + "epoch": 0.5635049752184934, + "grad_norm": 1.8708213567733765, + "learning_rate": 7.856718866968947e-06, + "loss": 0.6013, + "step": 7447 + }, + { + "epoch": 0.5635806439408271, + "grad_norm": 2.5757904052734375, + "learning_rate": 7.854484931505997e-06, + "loss": 0.7932, + "step": 7448 + }, + { + "epoch": 0.5636563126631606, + "grad_norm": 1.908565640449524, + "learning_rate": 7.852251067778903e-06, + "loss": 0.6691, + "step": 7449 + }, + { + "epoch": 0.5637319813854943, + "grad_norm": 
4.402620792388916, + "learning_rate": 7.850017275927563e-06, + "loss": 0.7278, + "step": 7450 + }, + { + "epoch": 0.563807650107828, + "grad_norm": 2.60870623588562, + "learning_rate": 7.847783556091858e-06, + "loss": 0.7348, + "step": 7451 + }, + { + "epoch": 0.5638833188301615, + "grad_norm": 2.4514048099517822, + "learning_rate": 7.845549908411676e-06, + "loss": 0.8417, + "step": 7452 + }, + { + "epoch": 0.5639589875524952, + "grad_norm": 2.5910489559173584, + "learning_rate": 7.843316333026892e-06, + "loss": 0.6838, + "step": 7453 + }, + { + "epoch": 0.5640346562748288, + "grad_norm": 2.305745840072632, + "learning_rate": 7.841082830077378e-06, + "loss": 0.4627, + "step": 7454 + }, + { + "epoch": 0.5641103249971624, + "grad_norm": 3.0249183177948, + "learning_rate": 7.838849399703007e-06, + "loss": 0.6928, + "step": 7455 + }, + { + "epoch": 0.5641859937194961, + "grad_norm": 2.0935657024383545, + "learning_rate": 7.836616042043643e-06, + "loss": 0.6973, + "step": 7456 + }, + { + "epoch": 0.5642616624418296, + "grad_norm": 2.505596399307251, + "learning_rate": 7.834382757239145e-06, + "loss": 0.6782, + "step": 7457 + }, + { + "epoch": 0.5643373311641633, + "grad_norm": 2.2332210540771484, + "learning_rate": 7.832149545429372e-06, + "loss": 0.7438, + "step": 7458 + }, + { + "epoch": 0.564412999886497, + "grad_norm": 2.181175470352173, + "learning_rate": 7.829916406754166e-06, + "loss": 0.6196, + "step": 7459 + }, + { + "epoch": 0.5644886686088305, + "grad_norm": 1.7004423141479492, + "learning_rate": 7.827683341353381e-06, + "loss": 0.6966, + "step": 7460 + }, + { + "epoch": 0.5645643373311642, + "grad_norm": 2.624783992767334, + "learning_rate": 7.825450349366859e-06, + "loss": 0.8012, + "step": 7461 + }, + { + "epoch": 0.5646400060534977, + "grad_norm": 1.9809978008270264, + "learning_rate": 7.823217430934434e-06, + "loss": 0.7437, + "step": 7462 + }, + { + "epoch": 0.5647156747758314, + "grad_norm": 3.563035011291504, + "learning_rate": 7.82098458619594e-06, + "loss": 0.6683, + "step": 7463 + }, + { + "epoch": 0.5647913434981651, + "grad_norm": 2.226811408996582, + "learning_rate": 7.818751815291204e-06, + "loss": 0.7564, + "step": 7464 + }, + { + "epoch": 0.5648670122204986, + "grad_norm": 1.755900502204895, + "learning_rate": 7.816519118360046e-06, + "loss": 0.62, + "step": 7465 + }, + { + "epoch": 0.5649426809428323, + "grad_norm": 8.155805587768555, + "learning_rate": 7.814286495542293e-06, + "loss": 0.7387, + "step": 7466 + }, + { + "epoch": 0.565018349665166, + "grad_norm": 2.6799027919769287, + "learning_rate": 7.812053946977755e-06, + "loss": 0.583, + "step": 7467 + }, + { + "epoch": 0.5650940183874995, + "grad_norm": 2.1574854850769043, + "learning_rate": 7.80982147280624e-06, + "loss": 0.564, + "step": 7468 + }, + { + "epoch": 0.5651696871098332, + "grad_norm": 2.8113937377929688, + "learning_rate": 7.807589073167556e-06, + "loss": 0.7773, + "step": 7469 + }, + { + "epoch": 0.5652453558321667, + "grad_norm": 2.041391611099243, + "learning_rate": 7.805356748201497e-06, + "loss": 0.7095, + "step": 7470 + }, + { + "epoch": 0.5653210245545004, + "grad_norm": 2.0056307315826416, + "learning_rate": 7.803124498047865e-06, + "loss": 0.6397, + "step": 7471 + }, + { + "epoch": 0.5653966932768341, + "grad_norm": 2.5884063243865967, + "learning_rate": 7.80089232284645e-06, + "loss": 0.7664, + "step": 7472 + }, + { + "epoch": 0.5654723619991676, + "grad_norm": 2.3261334896087646, + "learning_rate": 7.798660222737033e-06, + "loss": 0.7108, + "step": 7473 + }, + { + "epoch": 
0.5655480307215013, + "grad_norm": 1.9324620962142944, + "learning_rate": 7.7964281978594e-06, + "loss": 0.7209, + "step": 7474 + }, + { + "epoch": 0.5656236994438348, + "grad_norm": 2.28661847114563, + "learning_rate": 7.794196248353323e-06, + "loss": 0.6863, + "step": 7475 + }, + { + "epoch": 0.5656993681661685, + "grad_norm": 3.6437911987304688, + "learning_rate": 7.791964374358579e-06, + "loss": 0.7256, + "step": 7476 + }, + { + "epoch": 0.5657750368885022, + "grad_norm": 2.1123297214508057, + "learning_rate": 7.789732576014934e-06, + "loss": 0.7905, + "step": 7477 + }, + { + "epoch": 0.5658507056108357, + "grad_norm": 2.2938523292541504, + "learning_rate": 7.787500853462149e-06, + "loss": 0.579, + "step": 7478 + }, + { + "epoch": 0.5659263743331694, + "grad_norm": 2.4166362285614014, + "learning_rate": 7.785269206839984e-06, + "loss": 0.6481, + "step": 7479 + }, + { + "epoch": 0.5660020430555031, + "grad_norm": 1.8204790353775024, + "learning_rate": 7.783037636288185e-06, + "loss": 0.7452, + "step": 7480 + }, + { + "epoch": 0.5660777117778366, + "grad_norm": 2.0831942558288574, + "learning_rate": 7.78080614194651e-06, + "loss": 0.5121, + "step": 7481 + }, + { + "epoch": 0.5661533805001703, + "grad_norm": 2.7705237865448, + "learning_rate": 7.778574723954695e-06, + "loss": 0.7819, + "step": 7482 + }, + { + "epoch": 0.5662290492225038, + "grad_norm": 2.23626708984375, + "learning_rate": 7.776343382452485e-06, + "loss": 0.609, + "step": 7483 + }, + { + "epoch": 0.5663047179448375, + "grad_norm": 2.0028553009033203, + "learning_rate": 7.774112117579608e-06, + "loss": 0.701, + "step": 7484 + }, + { + "epoch": 0.5663803866671712, + "grad_norm": 4.554917812347412, + "learning_rate": 7.771880929475792e-06, + "loss": 0.6932, + "step": 7485 + }, + { + "epoch": 0.5664560553895047, + "grad_norm": 1.90889310836792, + "learning_rate": 7.76964981828077e-06, + "loss": 0.7258, + "step": 7486 + }, + { + "epoch": 0.5665317241118384, + "grad_norm": 2.3061931133270264, + "learning_rate": 7.767418784134253e-06, + "loss": 0.7325, + "step": 7487 + }, + { + "epoch": 0.5666073928341719, + "grad_norm": 2.0186426639556885, + "learning_rate": 7.76518782717596e-06, + "loss": 0.7392, + "step": 7488 + }, + { + "epoch": 0.5666830615565056, + "grad_norm": 2.2334370613098145, + "learning_rate": 7.762956947545598e-06, + "loss": 0.7617, + "step": 7489 + }, + { + "epoch": 0.5667587302788393, + "grad_norm": 2.3827006816864014, + "learning_rate": 7.760726145382871e-06, + "loss": 0.648, + "step": 7490 + }, + { + "epoch": 0.5668343990011728, + "grad_norm": 4.122293472290039, + "learning_rate": 7.758495420827485e-06, + "loss": 0.7135, + "step": 7491 + }, + { + "epoch": 0.5669100677235065, + "grad_norm": 2.1730690002441406, + "learning_rate": 7.75626477401913e-06, + "loss": 0.6489, + "step": 7492 + }, + { + "epoch": 0.5669857364458402, + "grad_norm": 2.0568599700927734, + "learning_rate": 7.754034205097497e-06, + "loss": 0.7312, + "step": 7493 + }, + { + "epoch": 0.5670614051681737, + "grad_norm": 2.0819613933563232, + "learning_rate": 7.751803714202273e-06, + "loss": 0.7455, + "step": 7494 + }, + { + "epoch": 0.5671370738905074, + "grad_norm": 2.208782434463501, + "learning_rate": 7.749573301473133e-06, + "loss": 0.6664, + "step": 7495 + }, + { + "epoch": 0.5672127426128409, + "grad_norm": 2.119398593902588, + "learning_rate": 7.74734296704976e-06, + "loss": 0.5872, + "step": 7496 + }, + { + "epoch": 0.5672884113351746, + "grad_norm": 1.802098274230957, + "learning_rate": 7.745112711071824e-06, + "loss": 0.8714, + "step": 7497 
+ }, + { + "epoch": 0.5673640800575083, + "grad_norm": 2.2059485912323, + "learning_rate": 7.742882533678988e-06, + "loss": 0.7653, + "step": 7498 + }, + { + "epoch": 0.5674397487798418, + "grad_norm": 3.0473945140838623, + "learning_rate": 7.740652435010915e-06, + "loss": 0.7615, + "step": 7499 + }, + { + "epoch": 0.5675154175021755, + "grad_norm": 2.383582830429077, + "learning_rate": 7.738422415207257e-06, + "loss": 0.8784, + "step": 7500 + }, + { + "epoch": 0.567591086224509, + "grad_norm": 2.5827648639678955, + "learning_rate": 7.736192474407667e-06, + "loss": 0.7992, + "step": 7501 + }, + { + "epoch": 0.5676667549468427, + "grad_norm": 2.1168980598449707, + "learning_rate": 7.733962612751795e-06, + "loss": 0.7262, + "step": 7502 + }, + { + "epoch": 0.5677424236691764, + "grad_norm": 1.6200450658798218, + "learning_rate": 7.731732830379278e-06, + "loss": 0.6776, + "step": 7503 + }, + { + "epoch": 0.5678180923915099, + "grad_norm": 2.622553825378418, + "learning_rate": 7.729503127429755e-06, + "loss": 0.7279, + "step": 7504 + }, + { + "epoch": 0.5678937611138436, + "grad_norm": 1.9916034936904907, + "learning_rate": 7.727273504042853e-06, + "loss": 0.815, + "step": 7505 + }, + { + "epoch": 0.5679694298361773, + "grad_norm": 2.347259521484375, + "learning_rate": 7.7250439603582e-06, + "loss": 0.4899, + "step": 7506 + }, + { + "epoch": 0.5680450985585108, + "grad_norm": 2.532034397125244, + "learning_rate": 7.722814496515418e-06, + "loss": 0.7588, + "step": 7507 + }, + { + "epoch": 0.5681207672808445, + "grad_norm": 2.015144109725952, + "learning_rate": 7.720585112654124e-06, + "loss": 0.7181, + "step": 7508 + }, + { + "epoch": 0.568196436003178, + "grad_norm": 2.0816779136657715, + "learning_rate": 7.718355808913931e-06, + "loss": 0.7219, + "step": 7509 + }, + { + "epoch": 0.5682721047255117, + "grad_norm": 2.1527559757232666, + "learning_rate": 7.71612658543444e-06, + "loss": 0.6645, + "step": 7510 + }, + { + "epoch": 0.5683477734478454, + "grad_norm": 2.143007278442383, + "learning_rate": 7.713897442355251e-06, + "loss": 0.6128, + "step": 7511 + }, + { + "epoch": 0.5684234421701789, + "grad_norm": 2.356175184249878, + "learning_rate": 7.711668379815969e-06, + "loss": 0.7695, + "step": 7512 + }, + { + "epoch": 0.5684991108925126, + "grad_norm": 2.8489623069763184, + "learning_rate": 7.70943939795618e-06, + "loss": 0.8197, + "step": 7513 + }, + { + "epoch": 0.5685747796148461, + "grad_norm": 1.8885236978530884, + "learning_rate": 7.707210496915469e-06, + "loss": 0.6958, + "step": 7514 + }, + { + "epoch": 0.5686504483371798, + "grad_norm": 1.7150541543960571, + "learning_rate": 7.70498167683342e-06, + "loss": 0.6678, + "step": 7515 + }, + { + "epoch": 0.5687261170595135, + "grad_norm": 2.1104001998901367, + "learning_rate": 7.702752937849603e-06, + "loss": 0.6665, + "step": 7516 + }, + { + "epoch": 0.568801785781847, + "grad_norm": 2.0098612308502197, + "learning_rate": 7.700524280103593e-06, + "loss": 0.7077, + "step": 7517 + }, + { + "epoch": 0.5688774545041807, + "grad_norm": 2.4959285259246826, + "learning_rate": 7.69829570373496e-06, + "loss": 0.7196, + "step": 7518 + }, + { + "epoch": 0.5689531232265144, + "grad_norm": 1.7228821516036987, + "learning_rate": 7.696067208883257e-06, + "loss": 0.7246, + "step": 7519 + }, + { + "epoch": 0.5690287919488479, + "grad_norm": 1.951913833618164, + "learning_rate": 7.693838795688046e-06, + "loss": 0.5224, + "step": 7520 + }, + { + "epoch": 0.5691044606711816, + "grad_norm": 1.957993745803833, + "learning_rate": 7.691610464288869e-06, + "loss": 
0.6015, + "step": 7521 + }, + { + "epoch": 0.5691801293935151, + "grad_norm": 2.1833786964416504, + "learning_rate": 7.689382214825279e-06, + "loss": 0.7048, + "step": 7522 + }, + { + "epoch": 0.5692557981158488, + "grad_norm": 2.1259801387786865, + "learning_rate": 7.687154047436815e-06, + "loss": 0.6339, + "step": 7523 + }, + { + "epoch": 0.5693314668381825, + "grad_norm": 3.2358062267303467, + "learning_rate": 7.68492596226301e-06, + "loss": 0.9118, + "step": 7524 + }, + { + "epoch": 0.569407135560516, + "grad_norm": 3.1258087158203125, + "learning_rate": 7.682697959443396e-06, + "loss": 0.733, + "step": 7525 + }, + { + "epoch": 0.5694828042828497, + "grad_norm": 1.9816136360168457, + "learning_rate": 7.680470039117491e-06, + "loss": 0.5748, + "step": 7526 + }, + { + "epoch": 0.5695584730051833, + "grad_norm": 1.8756901025772095, + "learning_rate": 7.678242201424825e-06, + "loss": 0.7879, + "step": 7527 + }, + { + "epoch": 0.5696341417275169, + "grad_norm": 2.385270595550537, + "learning_rate": 7.676014446504906e-06, + "loss": 0.6642, + "step": 7528 + }, + { + "epoch": 0.5697098104498506, + "grad_norm": 1.8264906406402588, + "learning_rate": 7.673786774497248e-06, + "loss": 0.5951, + "step": 7529 + }, + { + "epoch": 0.5697854791721841, + "grad_norm": 3.006786823272705, + "learning_rate": 7.671559185541348e-06, + "loss": 0.6021, + "step": 7530 + }, + { + "epoch": 0.5698611478945178, + "grad_norm": 2.019592523574829, + "learning_rate": 7.669331679776708e-06, + "loss": 0.6055, + "step": 7531 + }, + { + "epoch": 0.5699368166168515, + "grad_norm": 2.036536693572998, + "learning_rate": 7.667104257342825e-06, + "loss": 0.7984, + "step": 7532 + }, + { + "epoch": 0.570012485339185, + "grad_norm": 2.4801836013793945, + "learning_rate": 7.664876918379182e-06, + "loss": 0.747, + "step": 7533 + }, + { + "epoch": 0.5700881540615187, + "grad_norm": 2.761587381362915, + "learning_rate": 7.662649663025267e-06, + "loss": 0.7959, + "step": 7534 + }, + { + "epoch": 0.5701638227838522, + "grad_norm": 1.7771096229553223, + "learning_rate": 7.660422491420554e-06, + "loss": 0.6943, + "step": 7535 + }, + { + "epoch": 0.5702394915061859, + "grad_norm": 2.084876537322998, + "learning_rate": 7.658195403704516e-06, + "loss": 0.6666, + "step": 7536 + }, + { + "epoch": 0.5703151602285196, + "grad_norm": 1.841198444366455, + "learning_rate": 7.655968400016624e-06, + "loss": 0.791, + "step": 7537 + }, + { + "epoch": 0.5703908289508531, + "grad_norm": 1.9432196617126465, + "learning_rate": 7.653741480496337e-06, + "loss": 0.8101, + "step": 7538 + }, + { + "epoch": 0.5704664976731868, + "grad_norm": 1.6360357999801636, + "learning_rate": 7.651514645283116e-06, + "loss": 0.6755, + "step": 7539 + }, + { + "epoch": 0.5705421663955205, + "grad_norm": 2.8515806198120117, + "learning_rate": 7.649287894516406e-06, + "loss": 0.7672, + "step": 7540 + }, + { + "epoch": 0.570617835117854, + "grad_norm": 1.9835529327392578, + "learning_rate": 7.647061228335656e-06, + "loss": 0.763, + "step": 7541 + }, + { + "epoch": 0.5706935038401877, + "grad_norm": 1.8836452960968018, + "learning_rate": 7.644834646880308e-06, + "loss": 0.6668, + "step": 7542 + }, + { + "epoch": 0.5707691725625212, + "grad_norm": 2.047100067138672, + "learning_rate": 7.6426081502898e-06, + "loss": 0.6978, + "step": 7543 + }, + { + "epoch": 0.5708448412848549, + "grad_norm": 1.7520688772201538, + "learning_rate": 7.640381738703558e-06, + "loss": 0.5824, + "step": 7544 + }, + { + "epoch": 0.5709205100071886, + "grad_norm": 2.1768970489501953, + "learning_rate": 
7.638155412261011e-06, + "loss": 0.8075, + "step": 7545 + }, + { + "epoch": 0.5709961787295221, + "grad_norm": 2.6738734245300293, + "learning_rate": 7.635929171101575e-06, + "loss": 0.7113, + "step": 7546 + }, + { + "epoch": 0.5710718474518558, + "grad_norm": 1.7277650833129883, + "learning_rate": 7.633703015364664e-06, + "loss": 0.6632, + "step": 7547 + }, + { + "epoch": 0.5711475161741894, + "grad_norm": 1.8607732057571411, + "learning_rate": 7.631476945189694e-06, + "loss": 0.5316, + "step": 7548 + }, + { + "epoch": 0.571223184896523, + "grad_norm": 2.290065050125122, + "learning_rate": 7.629250960716061e-06, + "loss": 0.6583, + "step": 7549 + }, + { + "epoch": 0.5712988536188567, + "grad_norm": 2.1077466011047363, + "learning_rate": 7.62702506208317e-06, + "loss": 0.7311, + "step": 7550 + }, + { + "epoch": 0.5713745223411902, + "grad_norm": 2.3623740673065186, + "learning_rate": 7.6247992494304075e-06, + "loss": 0.6345, + "step": 7551 + }, + { + "epoch": 0.5714501910635239, + "grad_norm": 2.172419548034668, + "learning_rate": 7.622573522897162e-06, + "loss": 0.6144, + "step": 7552 + }, + { + "epoch": 0.5715258597858576, + "grad_norm": 1.9453599452972412, + "learning_rate": 7.620347882622821e-06, + "loss": 0.8414, + "step": 7553 + }, + { + "epoch": 0.5716015285081911, + "grad_norm": 2.7051191329956055, + "learning_rate": 7.6181223287467574e-06, + "loss": 0.6487, + "step": 7554 + }, + { + "epoch": 0.5716771972305248, + "grad_norm": 2.7036776542663574, + "learning_rate": 7.615896861408342e-06, + "loss": 0.6815, + "step": 7555 + }, + { + "epoch": 0.5717528659528583, + "grad_norm": 2.5882720947265625, + "learning_rate": 7.613671480746944e-06, + "loss": 0.735, + "step": 7556 + }, + { + "epoch": 0.571828534675192, + "grad_norm": 2.0548622608184814, + "learning_rate": 7.611446186901918e-06, + "loss": 0.7788, + "step": 7557 + }, + { + "epoch": 0.5719042033975257, + "grad_norm": 2.1789886951446533, + "learning_rate": 7.609220980012624e-06, + "loss": 0.672, + "step": 7558 + }, + { + "epoch": 0.5719798721198592, + "grad_norm": 1.767871618270874, + "learning_rate": 7.606995860218413e-06, + "loss": 0.6939, + "step": 7559 + }, + { + "epoch": 0.5720555408421929, + "grad_norm": 2.4886820316314697, + "learning_rate": 7.604770827658626e-06, + "loss": 0.7978, + "step": 7560 + }, + { + "epoch": 0.5721312095645265, + "grad_norm": 2.7979044914245605, + "learning_rate": 7.602545882472603e-06, + "loss": 0.7555, + "step": 7561 + }, + { + "epoch": 0.5722068782868601, + "grad_norm": 2.692528009414673, + "learning_rate": 7.6003210247996736e-06, + "loss": 0.7346, + "step": 7562 + }, + { + "epoch": 0.5722825470091938, + "grad_norm": 1.8994667530059814, + "learning_rate": 7.59809625477917e-06, + "loss": 0.7628, + "step": 7563 + }, + { + "epoch": 0.5723582157315273, + "grad_norm": 2.2061333656311035, + "learning_rate": 7.595871572550416e-06, + "loss": 0.7109, + "step": 7564 + }, + { + "epoch": 0.572433884453861, + "grad_norm": 2.585219621658325, + "learning_rate": 7.593646978252723e-06, + "loss": 0.663, + "step": 7565 + }, + { + "epoch": 0.5725095531761947, + "grad_norm": 2.168567180633545, + "learning_rate": 7.591422472025408e-06, + "loss": 0.6996, + "step": 7566 + }, + { + "epoch": 0.5725852218985282, + "grad_norm": 2.5026702880859375, + "learning_rate": 7.589198054007769e-06, + "loss": 0.5515, + "step": 7567 + }, + { + "epoch": 0.5726608906208619, + "grad_norm": 2.055335760116577, + "learning_rate": 7.5869737243391125e-06, + "loss": 0.6029, + "step": 7568 + }, + { + "epoch": 0.5727365593431955, + "grad_norm": 
1.7085371017456055, + "learning_rate": 7.584749483158733e-06, + "loss": 0.6204, + "step": 7569 + }, + { + "epoch": 0.5728122280655291, + "grad_norm": 2.084444522857666, + "learning_rate": 7.582525330605918e-06, + "loss": 0.7365, + "step": 7570 + }, + { + "epoch": 0.5728878967878628, + "grad_norm": 2.4518747329711914, + "learning_rate": 7.580301266819951e-06, + "loss": 0.7121, + "step": 7571 + }, + { + "epoch": 0.5729635655101963, + "grad_norm": 2.6568686962127686, + "learning_rate": 7.578077291940109e-06, + "loss": 0.79, + "step": 7572 + }, + { + "epoch": 0.57303923423253, + "grad_norm": 1.885642409324646, + "learning_rate": 7.575853406105669e-06, + "loss": 0.67, + "step": 7573 + }, + { + "epoch": 0.5731149029548636, + "grad_norm": 2.200899362564087, + "learning_rate": 7.573629609455893e-06, + "loss": 0.7487, + "step": 7574 + }, + { + "epoch": 0.5731905716771972, + "grad_norm": 1.83932626247406, + "learning_rate": 7.571405902130047e-06, + "loss": 0.8097, + "step": 7575 + }, + { + "epoch": 0.5732662403995309, + "grad_norm": 1.8516623973846436, + "learning_rate": 7.569182284267382e-06, + "loss": 0.7444, + "step": 7576 + }, + { + "epoch": 0.5733419091218644, + "grad_norm": 2.9464001655578613, + "learning_rate": 7.566958756007148e-06, + "loss": 0.7044, + "step": 7577 + }, + { + "epoch": 0.5734175778441981, + "grad_norm": 2.2606701850891113, + "learning_rate": 7.5647353174885956e-06, + "loss": 0.7526, + "step": 7578 + }, + { + "epoch": 0.5734932465665318, + "grad_norm": 1.8875621557235718, + "learning_rate": 7.5625119688509575e-06, + "loss": 0.5802, + "step": 7579 + }, + { + "epoch": 0.5735689152888653, + "grad_norm": 2.271878957748413, + "learning_rate": 7.560288710233472e-06, + "loss": 0.7184, + "step": 7580 + }, + { + "epoch": 0.573644584011199, + "grad_norm": 2.303473711013794, + "learning_rate": 7.558065541775362e-06, + "loss": 0.8383, + "step": 7581 + }, + { + "epoch": 0.5737202527335326, + "grad_norm": 1.9427469968795776, + "learning_rate": 7.555842463615853e-06, + "loss": 0.7299, + "step": 7582 + }, + { + "epoch": 0.5737959214558662, + "grad_norm": 1.7646313905715942, + "learning_rate": 7.553619475894155e-06, + "loss": 0.6953, + "step": 7583 + }, + { + "epoch": 0.5738715901781999, + "grad_norm": 2.1616227626800537, + "learning_rate": 7.551396578749487e-06, + "loss": 0.8009, + "step": 7584 + }, + { + "epoch": 0.5739472589005334, + "grad_norm": 1.9275579452514648, + "learning_rate": 7.5491737723210515e-06, + "loss": 0.7245, + "step": 7585 + }, + { + "epoch": 0.5740229276228671, + "grad_norm": 1.933892011642456, + "learning_rate": 7.546951056748047e-06, + "loss": 0.7036, + "step": 7586 + }, + { + "epoch": 0.5740985963452007, + "grad_norm": 1.7795796394348145, + "learning_rate": 7.544728432169666e-06, + "loss": 0.6322, + "step": 7587 + }, + { + "epoch": 0.5741742650675343, + "grad_norm": 1.7775262594223022, + "learning_rate": 7.542505898725095e-06, + "loss": 0.6999, + "step": 7588 + }, + { + "epoch": 0.574249933789868, + "grad_norm": 1.9181798696517944, + "learning_rate": 7.540283456553523e-06, + "loss": 0.5708, + "step": 7589 + }, + { + "epoch": 0.5743256025122015, + "grad_norm": 4.439153671264648, + "learning_rate": 7.538061105794121e-06, + "loss": 0.8357, + "step": 7590 + }, + { + "epoch": 0.5744012712345352, + "grad_norm": 1.863204836845398, + "learning_rate": 7.5358388465860625e-06, + "loss": 0.5036, + "step": 7591 + }, + { + "epoch": 0.5744769399568689, + "grad_norm": 2.4404690265655518, + "learning_rate": 7.533616679068508e-06, + "loss": 0.8833, + "step": 7592 + }, + { + "epoch": 
0.5745526086792024, + "grad_norm": 2.3636746406555176, + "learning_rate": 7.53139460338062e-06, + "loss": 0.6487, + "step": 7593 + }, + { + "epoch": 0.5746282774015361, + "grad_norm": 2.188185691833496, + "learning_rate": 7.5291726196615545e-06, + "loss": 0.6516, + "step": 7594 + }, + { + "epoch": 0.5747039461238697, + "grad_norm": 2.0827059745788574, + "learning_rate": 7.526950728050455e-06, + "loss": 0.6179, + "step": 7595 + }, + { + "epoch": 0.5747796148462033, + "grad_norm": 2.1536214351654053, + "learning_rate": 7.524728928686468e-06, + "loss": 0.5059, + "step": 7596 + }, + { + "epoch": 0.574855283568537, + "grad_norm": 1.9691141843795776, + "learning_rate": 7.522507221708724e-06, + "loss": 0.6105, + "step": 7597 + }, + { + "epoch": 0.5749309522908705, + "grad_norm": 2.0234973430633545, + "learning_rate": 7.520285607256354e-06, + "loss": 0.7116, + "step": 7598 + }, + { + "epoch": 0.5750066210132042, + "grad_norm": 2.0433642864227295, + "learning_rate": 7.51806408546849e-06, + "loss": 0.6751, + "step": 7599 + }, + { + "epoch": 0.5750822897355378, + "grad_norm": 2.2238693237304688, + "learning_rate": 7.515842656484246e-06, + "loss": 0.5849, + "step": 7600 + }, + { + "epoch": 0.5751579584578714, + "grad_norm": 2.1468698978424072, + "learning_rate": 7.513621320442734e-06, + "loss": 0.6573, + "step": 7601 + }, + { + "epoch": 0.5752336271802051, + "grad_norm": 1.8017033338546753, + "learning_rate": 7.5114000774830645e-06, + "loss": 0.7018, + "step": 7602 + }, + { + "epoch": 0.5753092959025387, + "grad_norm": 3.9429688453674316, + "learning_rate": 7.509178927744331e-06, + "loss": 0.676, + "step": 7603 + }, + { + "epoch": 0.5753849646248723, + "grad_norm": 2.290649175643921, + "learning_rate": 7.506957871365639e-06, + "loss": 0.6806, + "step": 7604 + }, + { + "epoch": 0.575460633347206, + "grad_norm": 2.842364549636841, + "learning_rate": 7.504736908486076e-06, + "loss": 0.8284, + "step": 7605 + }, + { + "epoch": 0.5755363020695395, + "grad_norm": 1.501198172569275, + "learning_rate": 7.502516039244721e-06, + "loss": 0.7676, + "step": 7606 + }, + { + "epoch": 0.5756119707918732, + "grad_norm": 1.9701095819473267, + "learning_rate": 7.500295263780658e-06, + "loss": 0.7604, + "step": 7607 + }, + { + "epoch": 0.5756876395142068, + "grad_norm": 2.247490644454956, + "learning_rate": 7.498074582232952e-06, + "loss": 0.895, + "step": 7608 + }, + { + "epoch": 0.5757633082365404, + "grad_norm": 1.9799500703811646, + "learning_rate": 7.4958539947406755e-06, + "loss": 0.7798, + "step": 7609 + }, + { + "epoch": 0.5758389769588741, + "grad_norm": 2.8931593894958496, + "learning_rate": 7.493633501442889e-06, + "loss": 0.6262, + "step": 7610 + }, + { + "epoch": 0.5759146456812076, + "grad_norm": 2.1218535900115967, + "learning_rate": 7.4914131024786425e-06, + "loss": 0.7037, + "step": 7611 + }, + { + "epoch": 0.5759903144035413, + "grad_norm": 2.6001503467559814, + "learning_rate": 7.4891927979869885e-06, + "loss": 0.6523, + "step": 7612 + }, + { + "epoch": 0.5760659831258749, + "grad_norm": 2.1940317153930664, + "learning_rate": 7.486972588106963e-06, + "loss": 0.6337, + "step": 7613 + }, + { + "epoch": 0.5761416518482085, + "grad_norm": 1.9558513164520264, + "learning_rate": 7.4847524729776135e-06, + "loss": 0.7347, + "step": 7614 + }, + { + "epoch": 0.5762173205705422, + "grad_norm": 1.9365533590316772, + "learning_rate": 7.4825324527379625e-06, + "loss": 0.7676, + "step": 7615 + }, + { + "epoch": 0.5762929892928758, + "grad_norm": 2.3588614463806152, + "learning_rate": 7.48031252752704e-06, + "loss": 
0.5735, + "step": 7616 + }, + { + "epoch": 0.5763686580152094, + "grad_norm": 1.7206127643585205, + "learning_rate": 7.4780926974838605e-06, + "loss": 0.6839, + "step": 7617 + }, + { + "epoch": 0.5764443267375431, + "grad_norm": 2.759472370147705, + "learning_rate": 7.4758729627474395e-06, + "loss": 0.6453, + "step": 7618 + }, + { + "epoch": 0.5765199954598766, + "grad_norm": 2.1076037883758545, + "learning_rate": 7.473653323456781e-06, + "loss": 0.6109, + "step": 7619 + }, + { + "epoch": 0.5765956641822103, + "grad_norm": 1.9081132411956787, + "learning_rate": 7.471433779750889e-06, + "loss": 0.7941, + "step": 7620 + }, + { + "epoch": 0.5766713329045439, + "grad_norm": 1.8242427110671997, + "learning_rate": 7.4692143317687595e-06, + "loss": 0.6116, + "step": 7621 + }, + { + "epoch": 0.5767470016268775, + "grad_norm": 2.058173179626465, + "learning_rate": 7.466994979649378e-06, + "loss": 0.6837, + "step": 7622 + }, + { + "epoch": 0.5768226703492112, + "grad_norm": 2.6424033641815186, + "learning_rate": 7.464775723531731e-06, + "loss": 0.7541, + "step": 7623 + }, + { + "epoch": 0.5768983390715448, + "grad_norm": 1.771959900856018, + "learning_rate": 7.46255656355479e-06, + "loss": 0.6146, + "step": 7624 + }, + { + "epoch": 0.5769740077938784, + "grad_norm": 2.1034512519836426, + "learning_rate": 7.460337499857531e-06, + "loss": 0.7532, + "step": 7625 + }, + { + "epoch": 0.577049676516212, + "grad_norm": 2.0289289951324463, + "learning_rate": 7.4581185325789204e-06, + "loss": 0.7371, + "step": 7626 + }, + { + "epoch": 0.5771253452385456, + "grad_norm": 2.0849831104278564, + "learning_rate": 7.455899661857912e-06, + "loss": 0.6457, + "step": 7627 + }, + { + "epoch": 0.5772010139608793, + "grad_norm": 1.9334015846252441, + "learning_rate": 7.453680887833464e-06, + "loss": 0.6986, + "step": 7628 + }, + { + "epoch": 0.5772766826832129, + "grad_norm": 2.172220468521118, + "learning_rate": 7.451462210644513e-06, + "loss": 0.6897, + "step": 7629 + }, + { + "epoch": 0.5773523514055465, + "grad_norm": 2.764195203781128, + "learning_rate": 7.449243630430013e-06, + "loss": 0.7162, + "step": 7630 + }, + { + "epoch": 0.5774280201278802, + "grad_norm": 1.864995002746582, + "learning_rate": 7.447025147328891e-06, + "loss": 0.7502, + "step": 7631 + }, + { + "epoch": 0.5775036888502137, + "grad_norm": 1.9428197145462036, + "learning_rate": 7.444806761480079e-06, + "loss": 0.7187, + "step": 7632 + }, + { + "epoch": 0.5775793575725474, + "grad_norm": 1.9613124132156372, + "learning_rate": 7.442588473022497e-06, + "loss": 0.8163, + "step": 7633 + }, + { + "epoch": 0.577655026294881, + "grad_norm": 1.6671011447906494, + "learning_rate": 7.440370282095059e-06, + "loss": 0.7783, + "step": 7634 + }, + { + "epoch": 0.5777306950172146, + "grad_norm": 1.9171720743179321, + "learning_rate": 7.438152188836682e-06, + "loss": 0.8034, + "step": 7635 + }, + { + "epoch": 0.5778063637395483, + "grad_norm": 2.3247179985046387, + "learning_rate": 7.435934193386265e-06, + "loss": 0.6332, + "step": 7636 + }, + { + "epoch": 0.5778820324618819, + "grad_norm": 2.2075064182281494, + "learning_rate": 7.433716295882709e-06, + "loss": 0.725, + "step": 7637 + }, + { + "epoch": 0.5779577011842155, + "grad_norm": 2.2133593559265137, + "learning_rate": 7.431498496464904e-06, + "loss": 0.8622, + "step": 7638 + }, + { + "epoch": 0.5780333699065491, + "grad_norm": 1.9004621505737305, + "learning_rate": 7.4292807952717325e-06, + "loss": 0.6223, + "step": 7639 + }, + { + "epoch": 0.5781090386288827, + "grad_norm": 2.1120524406433105, + 
"learning_rate": 7.427063192442083e-06, + "loss": 0.6237, + "step": 7640 + }, + { + "epoch": 0.5781847073512164, + "grad_norm": 1.8914631605148315, + "learning_rate": 7.424845688114822e-06, + "loss": 0.6204, + "step": 7641 + }, + { + "epoch": 0.57826037607355, + "grad_norm": 1.872819423675537, + "learning_rate": 7.42262828242882e-06, + "loss": 0.7482, + "step": 7642 + }, + { + "epoch": 0.5783360447958836, + "grad_norm": 1.7717469930648804, + "learning_rate": 7.420410975522935e-06, + "loss": 0.7326, + "step": 7643 + }, + { + "epoch": 0.5784117135182173, + "grad_norm": 2.0166468620300293, + "learning_rate": 7.418193767536022e-06, + "loss": 0.6824, + "step": 7644 + }, + { + "epoch": 0.5784873822405509, + "grad_norm": 2.0506820678710938, + "learning_rate": 7.4159766586069335e-06, + "loss": 0.602, + "step": 7645 + }, + { + "epoch": 0.5785630509628845, + "grad_norm": 1.8485995531082153, + "learning_rate": 7.413759648874512e-06, + "loss": 0.7065, + "step": 7646 + }, + { + "epoch": 0.5786387196852181, + "grad_norm": 3.743983030319214, + "learning_rate": 7.411542738477589e-06, + "loss": 0.777, + "step": 7647 + }, + { + "epoch": 0.5787143884075517, + "grad_norm": 2.7498581409454346, + "learning_rate": 7.409325927555001e-06, + "loss": 0.7437, + "step": 7648 + }, + { + "epoch": 0.5787900571298854, + "grad_norm": 1.9643309116363525, + "learning_rate": 7.4071092162455635e-06, + "loss": 0.7352, + "step": 7649 + }, + { + "epoch": 0.578865725852219, + "grad_norm": 1.9491885900497437, + "learning_rate": 7.4048926046881e-06, + "loss": 0.7209, + "step": 7650 + }, + { + "epoch": 0.5789413945745526, + "grad_norm": 1.8065403699874878, + "learning_rate": 7.402676093021424e-06, + "loss": 0.728, + "step": 7651 + }, + { + "epoch": 0.5790170632968862, + "grad_norm": 1.9861661195755005, + "learning_rate": 7.400459681384335e-06, + "loss": 0.5903, + "step": 7652 + }, + { + "epoch": 0.5790927320192198, + "grad_norm": 1.8976181745529175, + "learning_rate": 7.398243369915636e-06, + "loss": 0.7683, + "step": 7653 + }, + { + "epoch": 0.5791684007415535, + "grad_norm": 1.8493512868881226, + "learning_rate": 7.396027158754114e-06, + "loss": 0.5691, + "step": 7654 + }, + { + "epoch": 0.5792440694638871, + "grad_norm": 1.9040534496307373, + "learning_rate": 7.393811048038561e-06, + "loss": 0.7016, + "step": 7655 + }, + { + "epoch": 0.5793197381862207, + "grad_norm": 1.8322153091430664, + "learning_rate": 7.391595037907758e-06, + "loss": 0.725, + "step": 7656 + }, + { + "epoch": 0.5793954069085544, + "grad_norm": 2.335909128189087, + "learning_rate": 7.389379128500474e-06, + "loss": 0.7147, + "step": 7657 + }, + { + "epoch": 0.579471075630888, + "grad_norm": 2.1806111335754395, + "learning_rate": 7.3871633199554775e-06, + "loss": 0.5799, + "step": 7658 + }, + { + "epoch": 0.5795467443532216, + "grad_norm": 2.703801393508911, + "learning_rate": 7.384947612411532e-06, + "loss": 0.7216, + "step": 7659 + }, + { + "epoch": 0.5796224130755552, + "grad_norm": 2.5444979667663574, + "learning_rate": 7.3827320060073886e-06, + "loss": 0.692, + "step": 7660 + }, + { + "epoch": 0.5796980817978888, + "grad_norm": 2.3260116577148438, + "learning_rate": 7.380516500881799e-06, + "loss": 0.7942, + "step": 7661 + }, + { + "epoch": 0.5797737505202225, + "grad_norm": 2.3497567176818848, + "learning_rate": 7.378301097173506e-06, + "loss": 0.6695, + "step": 7662 + }, + { + "epoch": 0.5798494192425561, + "grad_norm": 2.0174307823181152, + "learning_rate": 7.376085795021241e-06, + "loss": 0.6379, + "step": 7663 + }, + { + "epoch": 0.5799250879648897, + 
"grad_norm": 3.6419637203216553, + "learning_rate": 7.373870594563739e-06, + "loss": 0.6858, + "step": 7664 + }, + { + "epoch": 0.5800007566872233, + "grad_norm": 2.2158446311950684, + "learning_rate": 7.3716554959397145e-06, + "loss": 0.7634, + "step": 7665 + }, + { + "epoch": 0.580076425409557, + "grad_norm": 1.666245937347412, + "learning_rate": 7.369440499287893e-06, + "loss": 0.5982, + "step": 7666 + }, + { + "epoch": 0.5801520941318906, + "grad_norm": 2.2408547401428223, + "learning_rate": 7.367225604746981e-06, + "loss": 0.6844, + "step": 7667 + }, + { + "epoch": 0.5802277628542242, + "grad_norm": 2.1953916549682617, + "learning_rate": 7.365010812455683e-06, + "loss": 0.803, + "step": 7668 + }, + { + "epoch": 0.5803034315765578, + "grad_norm": 2.5317487716674805, + "learning_rate": 7.362796122552698e-06, + "loss": 0.7437, + "step": 7669 + }, + { + "epoch": 0.5803791002988915, + "grad_norm": 2.728444814682007, + "learning_rate": 7.3605815351767105e-06, + "loss": 0.725, + "step": 7670 + }, + { + "epoch": 0.5804547690212251, + "grad_norm": 2.354583740234375, + "learning_rate": 7.358367050466411e-06, + "loss": 0.7462, + "step": 7671 + }, + { + "epoch": 0.5805304377435587, + "grad_norm": 1.8923773765563965, + "learning_rate": 7.356152668560478e-06, + "loss": 0.6499, + "step": 7672 + }, + { + "epoch": 0.5806061064658923, + "grad_norm": 2.219557523727417, + "learning_rate": 7.353938389597583e-06, + "loss": 0.6165, + "step": 7673 + }, + { + "epoch": 0.580681775188226, + "grad_norm": 2.9942805767059326, + "learning_rate": 7.351724213716388e-06, + "loss": 0.6961, + "step": 7674 + }, + { + "epoch": 0.5807574439105596, + "grad_norm": 1.9421963691711426, + "learning_rate": 7.349510141055552e-06, + "loss": 0.7431, + "step": 7675 + }, + { + "epoch": 0.5808331126328932, + "grad_norm": 2.3083291053771973, + "learning_rate": 7.347296171753734e-06, + "loss": 0.739, + "step": 7676 + }, + { + "epoch": 0.5809087813552268, + "grad_norm": 2.1545681953430176, + "learning_rate": 7.345082305949572e-06, + "loss": 0.6407, + "step": 7677 + }, + { + "epoch": 0.5809844500775604, + "grad_norm": 2.1147782802581787, + "learning_rate": 7.342868543781711e-06, + "loss": 0.721, + "step": 7678 + }, + { + "epoch": 0.581060118799894, + "grad_norm": 2.1288297176361084, + "learning_rate": 7.34065488538878e-06, + "loss": 0.6971, + "step": 7679 + }, + { + "epoch": 0.5811357875222277, + "grad_norm": 2.313871145248413, + "learning_rate": 7.338441330909405e-06, + "loss": 0.7317, + "step": 7680 + }, + { + "epoch": 0.5812114562445613, + "grad_norm": 2.2980892658233643, + "learning_rate": 7.336227880482211e-06, + "loss": 0.6622, + "step": 7681 + }, + { + "epoch": 0.5812871249668949, + "grad_norm": 1.893399715423584, + "learning_rate": 7.334014534245808e-06, + "loss": 0.6524, + "step": 7682 + }, + { + "epoch": 0.5813627936892286, + "grad_norm": 2.0858371257781982, + "learning_rate": 7.3318012923388046e-06, + "loss": 0.7039, + "step": 7683 + }, + { + "epoch": 0.5814384624115622, + "grad_norm": 1.9755072593688965, + "learning_rate": 7.329588154899797e-06, + "loss": 0.6282, + "step": 7684 + }, + { + "epoch": 0.5815141311338958, + "grad_norm": 1.9997296333312988, + "learning_rate": 7.327375122067382e-06, + "loss": 0.5055, + "step": 7685 + }, + { + "epoch": 0.5815897998562294, + "grad_norm": 2.181192636489868, + "learning_rate": 7.325162193980147e-06, + "loss": 0.7401, + "step": 7686 + }, + { + "epoch": 0.581665468578563, + "grad_norm": 1.5383789539337158, + "learning_rate": 7.322949370776675e-06, + "loss": 0.7502, + "step": 7687 + }, + { 
+ "epoch": 0.5817411373008967, + "grad_norm": 2.279628276824951, + "learning_rate": 7.320736652595537e-06, + "loss": 0.6221, + "step": 7688 + }, + { + "epoch": 0.5818168060232303, + "grad_norm": 2.0535850524902344, + "learning_rate": 7.3185240395753005e-06, + "loss": 0.853, + "step": 7689 + }, + { + "epoch": 0.5818924747455639, + "grad_norm": 1.900155782699585, + "learning_rate": 7.316311531854524e-06, + "loss": 0.8605, + "step": 7690 + }, + { + "epoch": 0.5819681434678975, + "grad_norm": 1.8378933668136597, + "learning_rate": 7.314099129571769e-06, + "loss": 0.4497, + "step": 7691 + }, + { + "epoch": 0.5820438121902312, + "grad_norm": 2.2572903633117676, + "learning_rate": 7.3118868328655795e-06, + "loss": 0.7395, + "step": 7692 + }, + { + "epoch": 0.5821194809125648, + "grad_norm": 1.9963607788085938, + "learning_rate": 7.309674641874496e-06, + "loss": 0.8227, + "step": 7693 + }, + { + "epoch": 0.5821951496348984, + "grad_norm": 2.2713754177093506, + "learning_rate": 7.307462556737054e-06, + "loss": 0.667, + "step": 7694 + }, + { + "epoch": 0.582270818357232, + "grad_norm": 1.7703185081481934, + "learning_rate": 7.30525057759178e-06, + "loss": 0.8367, + "step": 7695 + }, + { + "epoch": 0.5823464870795657, + "grad_norm": 2.0329811573028564, + "learning_rate": 7.3030387045771945e-06, + "loss": 0.7944, + "step": 7696 + }, + { + "epoch": 0.5824221558018993, + "grad_norm": 2.182164192199707, + "learning_rate": 7.300826937831816e-06, + "loss": 0.6249, + "step": 7697 + }, + { + "epoch": 0.5824978245242329, + "grad_norm": 1.8738006353378296, + "learning_rate": 7.298615277494151e-06, + "loss": 0.6997, + "step": 7698 + }, + { + "epoch": 0.5825734932465665, + "grad_norm": 2.295851469039917, + "learning_rate": 7.2964037237027004e-06, + "loss": 0.6997, + "step": 7699 + }, + { + "epoch": 0.5826491619689002, + "grad_norm": 2.639524221420288, + "learning_rate": 7.294192276595958e-06, + "loss": 0.8636, + "step": 7700 + }, + { + "epoch": 0.5827248306912338, + "grad_norm": 2.49822998046875, + "learning_rate": 7.2919809363124104e-06, + "loss": 0.5851, + "step": 7701 + }, + { + "epoch": 0.5828004994135674, + "grad_norm": 1.8222301006317139, + "learning_rate": 7.289769702990542e-06, + "loss": 0.7613, + "step": 7702 + }, + { + "epoch": 0.582876168135901, + "grad_norm": 1.6940174102783203, + "learning_rate": 7.28755857676883e-06, + "loss": 0.5668, + "step": 7703 + }, + { + "epoch": 0.5829518368582346, + "grad_norm": 2.8774144649505615, + "learning_rate": 7.285347557785736e-06, + "loss": 0.7722, + "step": 7704 + }, + { + "epoch": 0.5830275055805683, + "grad_norm": 2.2571170330047607, + "learning_rate": 7.283136646179724e-06, + "loss": 0.7295, + "step": 7705 + }, + { + "epoch": 0.5831031743029019, + "grad_norm": 2.2378146648406982, + "learning_rate": 7.2809258420892455e-06, + "loss": 0.7539, + "step": 7706 + }, + { + "epoch": 0.5831788430252355, + "grad_norm": 2.1192033290863037, + "learning_rate": 7.278715145652754e-06, + "loss": 0.7074, + "step": 7707 + }, + { + "epoch": 0.5832545117475691, + "grad_norm": 2.149138927459717, + "learning_rate": 7.276504557008687e-06, + "loss": 0.6908, + "step": 7708 + }, + { + "epoch": 0.5833301804699028, + "grad_norm": 2.1191375255584717, + "learning_rate": 7.274294076295479e-06, + "loss": 0.6748, + "step": 7709 + }, + { + "epoch": 0.5834058491922364, + "grad_norm": 1.9595415592193604, + "learning_rate": 7.27208370365156e-06, + "loss": 0.8031, + "step": 7710 + }, + { + "epoch": 0.58348151791457, + "grad_norm": 2.0136635303497314, + "learning_rate": 7.269873439215343e-06, + 
"loss": 0.535, + "step": 7711 + }, + { + "epoch": 0.5835571866369036, + "grad_norm": 6.647026062011719, + "learning_rate": 7.267663283125249e-06, + "loss": 0.5824, + "step": 7712 + }, + { + "epoch": 0.5836328553592373, + "grad_norm": 2.1714112758636475, + "learning_rate": 7.265453235519686e-06, + "loss": 0.6723, + "step": 7713 + }, + { + "epoch": 0.5837085240815709, + "grad_norm": 1.932857632637024, + "learning_rate": 7.26324329653705e-06, + "loss": 0.768, + "step": 7714 + }, + { + "epoch": 0.5837841928039045, + "grad_norm": 3.383786916732788, + "learning_rate": 7.261033466315737e-06, + "loss": 0.5688, + "step": 7715 + }, + { + "epoch": 0.5838598615262381, + "grad_norm": 2.061591863632202, + "learning_rate": 7.2588237449941274e-06, + "loss": 0.4932, + "step": 7716 + }, + { + "epoch": 0.5839355302485717, + "grad_norm": 1.7182115316390991, + "learning_rate": 7.256614132710612e-06, + "loss": 0.5914, + "step": 7717 + }, + { + "epoch": 0.5840111989709054, + "grad_norm": 2.315765142440796, + "learning_rate": 7.254404629603557e-06, + "loss": 0.7175, + "step": 7718 + }, + { + "epoch": 0.584086867693239, + "grad_norm": 2.1149778366088867, + "learning_rate": 7.252195235811331e-06, + "loss": 0.6716, + "step": 7719 + }, + { + "epoch": 0.5841625364155726, + "grad_norm": 2.2034709453582764, + "learning_rate": 7.2499859514722925e-06, + "loss": 0.718, + "step": 7720 + }, + { + "epoch": 0.5842382051379063, + "grad_norm": 1.7320899963378906, + "learning_rate": 7.24777677672479e-06, + "loss": 0.8193, + "step": 7721 + }, + { + "epoch": 0.5843138738602399, + "grad_norm": 2.224959373474121, + "learning_rate": 7.2455677117071785e-06, + "loss": 0.6078, + "step": 7722 + }, + { + "epoch": 0.5843895425825735, + "grad_norm": 1.8637721538543701, + "learning_rate": 7.243358756557788e-06, + "loss": 0.7856, + "step": 7723 + }, + { + "epoch": 0.5844652113049071, + "grad_norm": 1.866568922996521, + "learning_rate": 7.241149911414957e-06, + "loss": 0.6537, + "step": 7724 + }, + { + "epoch": 0.5845408800272407, + "grad_norm": 2.14298415184021, + "learning_rate": 7.238941176417005e-06, + "loss": 0.7189, + "step": 7725 + }, + { + "epoch": 0.5846165487495744, + "grad_norm": 2.2058675289154053, + "learning_rate": 7.236732551702251e-06, + "loss": 0.6751, + "step": 7726 + }, + { + "epoch": 0.584692217471908, + "grad_norm": 2.182589054107666, + "learning_rate": 7.23452403740901e-06, + "loss": 0.757, + "step": 7727 + }, + { + "epoch": 0.5847678861942416, + "grad_norm": 2.33272123336792, + "learning_rate": 7.232315633675584e-06, + "loss": 0.6802, + "step": 7728 + }, + { + "epoch": 0.5848435549165752, + "grad_norm": 1.9340473413467407, + "learning_rate": 7.230107340640272e-06, + "loss": 0.6658, + "step": 7729 + }, + { + "epoch": 0.5849192236389088, + "grad_norm": 6.095152854919434, + "learning_rate": 7.22789915844136e-06, + "loss": 0.6181, + "step": 7730 + }, + { + "epoch": 0.5849948923612425, + "grad_norm": 1.9372726678848267, + "learning_rate": 7.225691087217132e-06, + "loss": 0.6955, + "step": 7731 + }, + { + "epoch": 0.5850705610835761, + "grad_norm": 2.411663055419922, + "learning_rate": 7.22348312710587e-06, + "loss": 0.7125, + "step": 7732 + }, + { + "epoch": 0.5851462298059097, + "grad_norm": 2.0117082595825195, + "learning_rate": 7.221275278245842e-06, + "loss": 0.6744, + "step": 7733 + }, + { + "epoch": 0.5852218985282434, + "grad_norm": 2.462001085281372, + "learning_rate": 7.2190675407753075e-06, + "loss": 0.7146, + "step": 7734 + }, + { + "epoch": 0.585297567250577, + "grad_norm": 2.144810438156128, + "learning_rate": 
7.216859914832526e-06, + "loss": 0.6407, + "step": 7735 + }, + { + "epoch": 0.5853732359729106, + "grad_norm": 2.357346773147583, + "learning_rate": 7.2146524005557416e-06, + "loss": 0.6624, + "step": 7736 + }, + { + "epoch": 0.5854489046952442, + "grad_norm": 3.149186134338379, + "learning_rate": 7.212444998083196e-06, + "loss": 0.6403, + "step": 7737 + }, + { + "epoch": 0.5855245734175778, + "grad_norm": 1.9493343830108643, + "learning_rate": 7.210237707553132e-06, + "loss": 0.6004, + "step": 7738 + }, + { + "epoch": 0.5856002421399115, + "grad_norm": 2.3722169399261475, + "learning_rate": 7.208030529103768e-06, + "loss": 0.7029, + "step": 7739 + }, + { + "epoch": 0.5856759108622451, + "grad_norm": 1.8572362661361694, + "learning_rate": 7.205823462873331e-06, + "loss": 0.5583, + "step": 7740 + }, + { + "epoch": 0.5857515795845787, + "grad_norm": 19.258787155151367, + "learning_rate": 7.203616509000029e-06, + "loss": 0.7381, + "step": 7741 + }, + { + "epoch": 0.5858272483069124, + "grad_norm": 2.024709939956665, + "learning_rate": 7.201409667622069e-06, + "loss": 0.6997, + "step": 7742 + }, + { + "epoch": 0.5859029170292459, + "grad_norm": 1.7652946710586548, + "learning_rate": 7.199202938877658e-06, + "loss": 0.6909, + "step": 7743 + }, + { + "epoch": 0.5859785857515796, + "grad_norm": 1.8822402954101562, + "learning_rate": 7.196996322904982e-06, + "loss": 0.635, + "step": 7744 + }, + { + "epoch": 0.5860542544739132, + "grad_norm": 1.622046947479248, + "learning_rate": 7.194789819842228e-06, + "loss": 0.7197, + "step": 7745 + }, + { + "epoch": 0.5861299231962468, + "grad_norm": 3.179093837738037, + "learning_rate": 7.1925834298275735e-06, + "loss": 0.6946, + "step": 7746 + }, + { + "epoch": 0.5862055919185805, + "grad_norm": 2.236264944076538, + "learning_rate": 7.19037715299919e-06, + "loss": 0.7203, + "step": 7747 + }, + { + "epoch": 0.5862812606409141, + "grad_norm": 2.4791111946105957, + "learning_rate": 7.188170989495242e-06, + "loss": 0.5344, + "step": 7748 + }, + { + "epoch": 0.5863569293632477, + "grad_norm": 2.7363762855529785, + "learning_rate": 7.18596493945389e-06, + "loss": 0.6981, + "step": 7749 + }, + { + "epoch": 0.5864325980855813, + "grad_norm": 1.8051518201828003, + "learning_rate": 7.183759003013277e-06, + "loss": 0.6166, + "step": 7750 + }, + { + "epoch": 0.5865082668079149, + "grad_norm": 2.029879331588745, + "learning_rate": 7.181553180311554e-06, + "loss": 0.7514, + "step": 7751 + }, + { + "epoch": 0.5865839355302486, + "grad_norm": 2.204545736312866, + "learning_rate": 7.1793474714868465e-06, + "loss": 0.7361, + "step": 7752 + }, + { + "epoch": 0.5866596042525822, + "grad_norm": 2.728564739227295, + "learning_rate": 7.177141876677292e-06, + "loss": 0.637, + "step": 7753 + }, + { + "epoch": 0.5867352729749158, + "grad_norm": 2.3274734020233154, + "learning_rate": 7.174936396021011e-06, + "loss": 0.6613, + "step": 7754 + }, + { + "epoch": 0.5868109416972495, + "grad_norm": 2.8488948345184326, + "learning_rate": 7.172731029656113e-06, + "loss": 0.8433, + "step": 7755 + }, + { + "epoch": 0.586886610419583, + "grad_norm": 2.334925651550293, + "learning_rate": 7.1705257777207115e-06, + "loss": 0.7575, + "step": 7756 + }, + { + "epoch": 0.5869622791419167, + "grad_norm": 4.486547470092773, + "learning_rate": 7.168320640352898e-06, + "loss": 0.8143, + "step": 7757 + }, + { + "epoch": 0.5870379478642503, + "grad_norm": 1.958241581916809, + "learning_rate": 7.1661156176907716e-06, + "loss": 0.742, + "step": 7758 + }, + { + "epoch": 0.5871136165865839, + "grad_norm": 
2.2698402404785156, + "learning_rate": 7.163910709872421e-06, + "loss": 0.817, + "step": 7759 + }, + { + "epoch": 0.5871892853089176, + "grad_norm": 2.016049861907959, + "learning_rate": 7.1617059170359165e-06, + "loss": 0.6541, + "step": 7760 + }, + { + "epoch": 0.5872649540312512, + "grad_norm": 1.8849208354949951, + "learning_rate": 7.1595012393193346e-06, + "loss": 0.6868, + "step": 7761 + }, + { + "epoch": 0.5873406227535848, + "grad_norm": 2.1924757957458496, + "learning_rate": 7.157296676860735e-06, + "loss": 0.6863, + "step": 7762 + }, + { + "epoch": 0.5874162914759185, + "grad_norm": 2.1753952503204346, + "learning_rate": 7.155092229798181e-06, + "loss": 0.6678, + "step": 7763 + }, + { + "epoch": 0.587491960198252, + "grad_norm": 1.9696418046951294, + "learning_rate": 7.152887898269718e-06, + "loss": 0.706, + "step": 7764 + }, + { + "epoch": 0.5875676289205857, + "grad_norm": 2.0107688903808594, + "learning_rate": 7.15068368241339e-06, + "loss": 0.7368, + "step": 7765 + }, + { + "epoch": 0.5876432976429193, + "grad_norm": 1.9954001903533936, + "learning_rate": 7.14847958236723e-06, + "loss": 0.6301, + "step": 7766 + }, + { + "epoch": 0.5877189663652529, + "grad_norm": 1.949197769165039, + "learning_rate": 7.146275598269265e-06, + "loss": 0.5946, + "step": 7767 + }, + { + "epoch": 0.5877946350875866, + "grad_norm": 2.207111358642578, + "learning_rate": 7.144071730257521e-06, + "loss": 0.6304, + "step": 7768 + }, + { + "epoch": 0.5878703038099202, + "grad_norm": 1.9679584503173828, + "learning_rate": 7.141867978470007e-06, + "loss": 0.652, + "step": 7769 + }, + { + "epoch": 0.5879459725322538, + "grad_norm": 2.0223073959350586, + "learning_rate": 7.139664343044732e-06, + "loss": 0.6923, + "step": 7770 + }, + { + "epoch": 0.5880216412545874, + "grad_norm": 2.1896934509277344, + "learning_rate": 7.137460824119691e-06, + "loss": 0.7841, + "step": 7771 + }, + { + "epoch": 0.588097309976921, + "grad_norm": 2.157703161239624, + "learning_rate": 7.135257421832879e-06, + "loss": 0.6671, + "step": 7772 + }, + { + "epoch": 0.5881729786992547, + "grad_norm": 2.098355293273926, + "learning_rate": 7.133054136322274e-06, + "loss": 0.6282, + "step": 7773 + }, + { + "epoch": 0.5882486474215883, + "grad_norm": 1.8522675037384033, + "learning_rate": 7.130850967725861e-06, + "loss": 0.6418, + "step": 7774 + }, + { + "epoch": 0.5883243161439219, + "grad_norm": 1.7040518522262573, + "learning_rate": 7.128647916181605e-06, + "loss": 0.5497, + "step": 7775 + }, + { + "epoch": 0.5883999848662556, + "grad_norm": 2.2610528469085693, + "learning_rate": 7.126444981827471e-06, + "loss": 0.6669, + "step": 7776 + }, + { + "epoch": 0.5884756535885891, + "grad_norm": 2.119845390319824, + "learning_rate": 7.12424216480141e-06, + "loss": 0.7393, + "step": 7777 + }, + { + "epoch": 0.5885513223109228, + "grad_norm": 2.3598594665527344, + "learning_rate": 7.12203946524137e-06, + "loss": 0.6026, + "step": 7778 + }, + { + "epoch": 0.5886269910332564, + "grad_norm": 2.374350070953369, + "learning_rate": 7.119836883285297e-06, + "loss": 0.6199, + "step": 7779 + }, + { + "epoch": 0.58870265975559, + "grad_norm": 2.1946024894714355, + "learning_rate": 7.117634419071117e-06, + "loss": 0.6357, + "step": 7780 + }, + { + "epoch": 0.5887783284779237, + "grad_norm": 2.086524724960327, + "learning_rate": 7.115432072736759e-06, + "loss": 0.8439, + "step": 7781 + }, + { + "epoch": 0.5888539972002573, + "grad_norm": 2.3127012252807617, + "learning_rate": 7.1132298444201395e-06, + "loss": 0.7654, + "step": 7782 + }, + { + "epoch": 
0.5889296659225909, + "grad_norm": 2.065904378890991, + "learning_rate": 7.111027734259167e-06, + "loss": 0.7323, + "step": 7783 + }, + { + "epoch": 0.5890053346449245, + "grad_norm": 1.9671047925949097, + "learning_rate": 7.108825742391752e-06, + "loss": 0.7849, + "step": 7784 + }, + { + "epoch": 0.5890810033672581, + "grad_norm": 2.391326665878296, + "learning_rate": 7.106623868955784e-06, + "loss": 0.6453, + "step": 7785 + }, + { + "epoch": 0.5891566720895918, + "grad_norm": 2.2341959476470947, + "learning_rate": 7.104422114089155e-06, + "loss": 0.7335, + "step": 7786 + }, + { + "epoch": 0.5892323408119254, + "grad_norm": 2.1001060009002686, + "learning_rate": 7.1022204779297415e-06, + "loss": 0.752, + "step": 7787 + }, + { + "epoch": 0.589308009534259, + "grad_norm": 2.2669565677642822, + "learning_rate": 7.1000189606154185e-06, + "loss": 0.6614, + "step": 7788 + }, + { + "epoch": 0.5893836782565927, + "grad_norm": 2.1226632595062256, + "learning_rate": 7.097817562284056e-06, + "loss": 0.7003, + "step": 7789 + }, + { + "epoch": 0.5894593469789262, + "grad_norm": 1.9791269302368164, + "learning_rate": 7.095616283073511e-06, + "loss": 0.7109, + "step": 7790 + }, + { + "epoch": 0.5895350157012599, + "grad_norm": 2.323878765106201, + "learning_rate": 7.093415123121633e-06, + "loss": 0.7033, + "step": 7791 + }, + { + "epoch": 0.5896106844235935, + "grad_norm": 2.5213472843170166, + "learning_rate": 7.091214082566267e-06, + "loss": 0.7201, + "step": 7792 + }, + { + "epoch": 0.5896863531459271, + "grad_norm": 2.2447805404663086, + "learning_rate": 7.089013161545246e-06, + "loss": 0.7435, + "step": 7793 + }, + { + "epoch": 0.5897620218682608, + "grad_norm": 1.9220545291900635, + "learning_rate": 7.086812360196404e-06, + "loss": 0.7438, + "step": 7794 + }, + { + "epoch": 0.5898376905905944, + "grad_norm": 2.168332815170288, + "learning_rate": 7.084611678657562e-06, + "loss": 0.6878, + "step": 7795 + }, + { + "epoch": 0.589913359312928, + "grad_norm": 2.1411046981811523, + "learning_rate": 7.082411117066529e-06, + "loss": 0.7192, + "step": 7796 + }, + { + "epoch": 0.5899890280352617, + "grad_norm": 2.133474111557007, + "learning_rate": 7.080210675561116e-06, + "loss": 0.6435, + "step": 7797 + }, + { + "epoch": 0.5900646967575952, + "grad_norm": 2.075592041015625, + "learning_rate": 7.078010354279117e-06, + "loss": 0.7506, + "step": 7798 + }, + { + "epoch": 0.5901403654799289, + "grad_norm": 1.674235224723816, + "learning_rate": 7.075810153358327e-06, + "loss": 0.6847, + "step": 7799 + }, + { + "epoch": 0.5902160342022625, + "grad_norm": 2.099836587905884, + "learning_rate": 7.073610072936532e-06, + "loss": 0.778, + "step": 7800 + }, + { + "epoch": 0.5902917029245961, + "grad_norm": 1.963880181312561, + "learning_rate": 7.0714101131515015e-06, + "loss": 0.6492, + "step": 7801 + }, + { + "epoch": 0.5903673716469298, + "grad_norm": 2.0048913955688477, + "learning_rate": 7.069210274141011e-06, + "loss": 0.7348, + "step": 7802 + }, + { + "epoch": 0.5904430403692633, + "grad_norm": 1.986953616142273, + "learning_rate": 7.067010556042812e-06, + "loss": 0.7612, + "step": 7803 + }, + { + "epoch": 0.590518709091597, + "grad_norm": 2.7819113731384277, + "learning_rate": 7.064810958994668e-06, + "loss": 0.7183, + "step": 7804 + }, + { + "epoch": 0.5905943778139306, + "grad_norm": 2.236011266708374, + "learning_rate": 7.062611483134321e-06, + "loss": 0.8888, + "step": 7805 + }, + { + "epoch": 0.5906700465362642, + "grad_norm": 1.9459607601165771, + "learning_rate": 7.06041212859951e-06, + "loss": 0.7414, + 
"step": 7806 + }, + { + "epoch": 0.5907457152585979, + "grad_norm": 1.7577186822891235, + "learning_rate": 7.058212895527964e-06, + "loss": 0.7483, + "step": 7807 + }, + { + "epoch": 0.5908213839809315, + "grad_norm": 1.7476128339767456, + "learning_rate": 7.056013784057404e-06, + "loss": 0.6792, + "step": 7808 + }, + { + "epoch": 0.5908970527032651, + "grad_norm": 2.0670974254608154, + "learning_rate": 7.053814794325552e-06, + "loss": 0.743, + "step": 7809 + }, + { + "epoch": 0.5909727214255988, + "grad_norm": 1.867936372756958, + "learning_rate": 7.051615926470112e-06, + "loss": 0.7591, + "step": 7810 + }, + { + "epoch": 0.5910483901479323, + "grad_norm": 2.2760121822357178, + "learning_rate": 7.049417180628785e-06, + "loss": 0.6799, + "step": 7811 + }, + { + "epoch": 0.591124058870266, + "grad_norm": 2.5277442932128906, + "learning_rate": 7.047218556939262e-06, + "loss": 0.7732, + "step": 7812 + }, + { + "epoch": 0.5911997275925996, + "grad_norm": 2.176264524459839, + "learning_rate": 7.04502005553923e-06, + "loss": 0.6876, + "step": 7813 + }, + { + "epoch": 0.5912753963149332, + "grad_norm": 2.316378355026245, + "learning_rate": 7.042821676566363e-06, + "loss": 0.6702, + "step": 7814 + }, + { + "epoch": 0.5913510650372669, + "grad_norm": 2.1952383518218994, + "learning_rate": 7.040623420158334e-06, + "loss": 0.6593, + "step": 7815 + }, + { + "epoch": 0.5914267337596004, + "grad_norm": 2.2782609462738037, + "learning_rate": 7.038425286452806e-06, + "loss": 0.7548, + "step": 7816 + }, + { + "epoch": 0.5915024024819341, + "grad_norm": 1.863445520401001, + "learning_rate": 7.036227275587428e-06, + "loss": 0.5509, + "step": 7817 + }, + { + "epoch": 0.5915780712042678, + "grad_norm": 5.7330851554870605, + "learning_rate": 7.034029387699853e-06, + "loss": 0.6268, + "step": 7818 + }, + { + "epoch": 0.5916537399266013, + "grad_norm": 1.4934370517730713, + "learning_rate": 7.031831622927709e-06, + "loss": 0.6594, + "step": 7819 + }, + { + "epoch": 0.591729408648935, + "grad_norm": 1.9840810298919678, + "learning_rate": 7.0296339814086425e-06, + "loss": 0.7832, + "step": 7820 + }, + { + "epoch": 0.5918050773712686, + "grad_norm": 1.9863804578781128, + "learning_rate": 7.027436463280266e-06, + "loss": 0.6377, + "step": 7821 + }, + { + "epoch": 0.5918807460936022, + "grad_norm": 2.300100803375244, + "learning_rate": 7.0252390686802e-06, + "loss": 0.6932, + "step": 7822 + }, + { + "epoch": 0.5919564148159359, + "grad_norm": 1.7306914329528809, + "learning_rate": 7.023041797746048e-06, + "loss": 0.7395, + "step": 7823 + }, + { + "epoch": 0.5920320835382694, + "grad_norm": 2.2924342155456543, + "learning_rate": 7.020844650615412e-06, + "loss": 0.6836, + "step": 7824 + }, + { + "epoch": 0.5921077522606031, + "grad_norm": 2.0509254932403564, + "learning_rate": 7.018647627425889e-06, + "loss": 0.7414, + "step": 7825 + }, + { + "epoch": 0.5921834209829367, + "grad_norm": 1.9954980611801147, + "learning_rate": 7.016450728315059e-06, + "loss": 0.7232, + "step": 7826 + }, + { + "epoch": 0.5922590897052703, + "grad_norm": 2.126925468444824, + "learning_rate": 7.014253953420501e-06, + "loss": 0.8562, + "step": 7827 + }, + { + "epoch": 0.592334758427604, + "grad_norm": 2.0931129455566406, + "learning_rate": 7.0120573028797814e-06, + "loss": 0.5782, + "step": 7828 + }, + { + "epoch": 0.5924104271499375, + "grad_norm": 1.9808735847473145, + "learning_rate": 7.009860776830461e-06, + "loss": 0.6848, + "step": 7829 + }, + { + "epoch": 0.5924860958722712, + "grad_norm": 1.9941705465316772, + "learning_rate": 
7.007664375410099e-06, + "loss": 0.6859, + "step": 7830 + }, + { + "epoch": 0.5925617645946049, + "grad_norm": 1.9075549840927124, + "learning_rate": 7.005468098756237e-06, + "loss": 0.6063, + "step": 7831 + }, + { + "epoch": 0.5926374333169384, + "grad_norm": 2.4272186756134033, + "learning_rate": 7.003271947006415e-06, + "loss": 0.7188, + "step": 7832 + }, + { + "epoch": 0.5927131020392721, + "grad_norm": 2.166393518447876, + "learning_rate": 7.00107592029816e-06, + "loss": 0.7613, + "step": 7833 + }, + { + "epoch": 0.5927887707616057, + "grad_norm": 2.0019025802612305, + "learning_rate": 6.998880018768995e-06, + "loss": 0.5959, + "step": 7834 + }, + { + "epoch": 0.5928644394839393, + "grad_norm": 2.18587327003479, + "learning_rate": 6.996684242556438e-06, + "loss": 0.6934, + "step": 7835 + }, + { + "epoch": 0.592940108206273, + "grad_norm": 1.79190993309021, + "learning_rate": 6.9944885917979935e-06, + "loss": 0.5657, + "step": 7836 + }, + { + "epoch": 0.5930157769286065, + "grad_norm": 1.922041893005371, + "learning_rate": 6.992293066631159e-06, + "loss": 0.7778, + "step": 7837 + }, + { + "epoch": 0.5930914456509402, + "grad_norm": 2.1699047088623047, + "learning_rate": 6.990097667193427e-06, + "loss": 0.5798, + "step": 7838 + }, + { + "epoch": 0.5931671143732739, + "grad_norm": 1.9367480278015137, + "learning_rate": 6.987902393622278e-06, + "loss": 0.8402, + "step": 7839 + }, + { + "epoch": 0.5932427830956074, + "grad_norm": 2.1018497943878174, + "learning_rate": 6.985707246055189e-06, + "loss": 0.6521, + "step": 7840 + }, + { + "epoch": 0.5933184518179411, + "grad_norm": 1.8842015266418457, + "learning_rate": 6.983512224629631e-06, + "loss": 0.6046, + "step": 7841 + }, + { + "epoch": 0.5933941205402746, + "grad_norm": 2.317081928253174, + "learning_rate": 6.981317329483057e-06, + "loss": 0.6315, + "step": 7842 + }, + { + "epoch": 0.5934697892626083, + "grad_norm": 2.405687093734741, + "learning_rate": 6.979122560752923e-06, + "loss": 0.5215, + "step": 7843 + }, + { + "epoch": 0.593545457984942, + "grad_norm": 7.7125935554504395, + "learning_rate": 6.976927918576667e-06, + "loss": 0.6223, + "step": 7844 + }, + { + "epoch": 0.5936211267072755, + "grad_norm": 2.015296697616577, + "learning_rate": 6.974733403091729e-06, + "loss": 0.6515, + "step": 7845 + }, + { + "epoch": 0.5936967954296092, + "grad_norm": 1.864576816558838, + "learning_rate": 6.972539014435539e-06, + "loss": 0.74, + "step": 7846 + }, + { + "epoch": 0.5937724641519428, + "grad_norm": 2.105365514755249, + "learning_rate": 6.970344752745511e-06, + "loss": 0.6876, + "step": 7847 + }, + { + "epoch": 0.5938481328742764, + "grad_norm": 1.9904052019119263, + "learning_rate": 6.968150618159058e-06, + "loss": 0.7621, + "step": 7848 + }, + { + "epoch": 0.5939238015966101, + "grad_norm": 2.3770105838775635, + "learning_rate": 6.965956610813589e-06, + "loss": 0.5695, + "step": 7849 + }, + { + "epoch": 0.5939994703189436, + "grad_norm": 2.017836570739746, + "learning_rate": 6.963762730846492e-06, + "loss": 0.6737, + "step": 7850 + }, + { + "epoch": 0.5940751390412773, + "grad_norm": 1.9348866939544678, + "learning_rate": 6.96156897839516e-06, + "loss": 0.8016, + "step": 7851 + }, + { + "epoch": 0.594150807763611, + "grad_norm": 1.867281198501587, + "learning_rate": 6.959375353596973e-06, + "loss": 0.5993, + "step": 7852 + }, + { + "epoch": 0.5942264764859445, + "grad_norm": 2.012544870376587, + "learning_rate": 6.957181856589301e-06, + "loss": 0.5875, + "step": 7853 + }, + { + "epoch": 0.5943021452082782, + "grad_norm": 
2.0992937088012695, + "learning_rate": 6.9549884875095095e-06, + "loss": 0.5162, + "step": 7854 + }, + { + "epoch": 0.5943778139306117, + "grad_norm": 1.8907424211502075, + "learning_rate": 6.952795246494949e-06, + "loss": 0.7365, + "step": 7855 + }, + { + "epoch": 0.5944534826529454, + "grad_norm": 1.9334065914154053, + "learning_rate": 6.9506021336829745e-06, + "loss": 0.6326, + "step": 7856 + }, + { + "epoch": 0.5945291513752791, + "grad_norm": 2.2515668869018555, + "learning_rate": 6.948409149210924e-06, + "loss": 0.7034, + "step": 7857 + }, + { + "epoch": 0.5946048200976126, + "grad_norm": 2.1506471633911133, + "learning_rate": 6.946216293216127e-06, + "loss": 0.8012, + "step": 7858 + }, + { + "epoch": 0.5946804888199463, + "grad_norm": 1.9353808164596558, + "learning_rate": 6.944023565835911e-06, + "loss": 0.6319, + "step": 7859 + }, + { + "epoch": 0.59475615754228, + "grad_norm": 3.7930614948272705, + "learning_rate": 6.941830967207584e-06, + "loss": 0.6095, + "step": 7860 + }, + { + "epoch": 0.5948318262646135, + "grad_norm": 1.9903547763824463, + "learning_rate": 6.939638497468461e-06, + "loss": 0.7019, + "step": 7861 + }, + { + "epoch": 0.5949074949869472, + "grad_norm": 2.3188912868499756, + "learning_rate": 6.937446156755841e-06, + "loss": 0.8333, + "step": 7862 + }, + { + "epoch": 0.5949831637092807, + "grad_norm": 2.208460569381714, + "learning_rate": 6.935253945207013e-06, + "loss": 0.5642, + "step": 7863 + }, + { + "epoch": 0.5950588324316144, + "grad_norm": 2.100937604904175, + "learning_rate": 6.93306186295926e-06, + "loss": 0.6346, + "step": 7864 + }, + { + "epoch": 0.5951345011539481, + "grad_norm": 2.0526506900787354, + "learning_rate": 6.9308699101498565e-06, + "loss": 0.6951, + "step": 7865 + }, + { + "epoch": 0.5952101698762816, + "grad_norm": 2.3143303394317627, + "learning_rate": 6.928678086916076e-06, + "loss": 0.6788, + "step": 7866 + }, + { + "epoch": 0.5952858385986153, + "grad_norm": 2.076948404312134, + "learning_rate": 6.926486393395171e-06, + "loss": 0.7481, + "step": 7867 + }, + { + "epoch": 0.5953615073209488, + "grad_norm": 2.473203659057617, + "learning_rate": 6.9242948297243975e-06, + "loss": 0.5965, + "step": 7868 + }, + { + "epoch": 0.5954371760432825, + "grad_norm": 2.1721701622009277, + "learning_rate": 6.922103396040992e-06, + "loss": 0.768, + "step": 7869 + }, + { + "epoch": 0.5955128447656162, + "grad_norm": 1.6090091466903687, + "learning_rate": 6.919912092482192e-06, + "loss": 0.6591, + "step": 7870 + }, + { + "epoch": 0.5955885134879497, + "grad_norm": 1.9884438514709473, + "learning_rate": 6.917720919185227e-06, + "loss": 0.7708, + "step": 7871 + }, + { + "epoch": 0.5956641822102834, + "grad_norm": 2.0646588802337646, + "learning_rate": 6.9155298762873115e-06, + "loss": 0.6364, + "step": 7872 + }, + { + "epoch": 0.595739850932617, + "grad_norm": 2.5629851818084717, + "learning_rate": 6.913338963925659e-06, + "loss": 0.8015, + "step": 7873 + }, + { + "epoch": 0.5958155196549506, + "grad_norm": 2.499199151992798, + "learning_rate": 6.9111481822374685e-06, + "loss": 0.5926, + "step": 7874 + }, + { + "epoch": 0.5958911883772843, + "grad_norm": 2.0954859256744385, + "learning_rate": 6.908957531359932e-06, + "loss": 0.7402, + "step": 7875 + }, + { + "epoch": 0.5959668570996178, + "grad_norm": 1.77829110622406, + "learning_rate": 6.906767011430242e-06, + "loss": 0.6274, + "step": 7876 + }, + { + "epoch": 0.5960425258219515, + "grad_norm": 2.359452962875366, + "learning_rate": 6.904576622585572e-06, + "loss": 0.6625, + "step": 7877 + }, + { + 
"epoch": 0.5961181945442852, + "grad_norm": 2.3234505653381348, + "learning_rate": 6.9023863649630894e-06, + "loss": 0.8029, + "step": 7878 + }, + { + "epoch": 0.5961938632666187, + "grad_norm": 2.330986261367798, + "learning_rate": 6.90019623869996e-06, + "loss": 0.5878, + "step": 7879 + }, + { + "epoch": 0.5962695319889524, + "grad_norm": 2.1926472187042236, + "learning_rate": 6.898006243933329e-06, + "loss": 0.6217, + "step": 7880 + }, + { + "epoch": 0.5963452007112859, + "grad_norm": 1.9008252620697021, + "learning_rate": 6.8958163808003485e-06, + "loss": 0.6465, + "step": 7881 + }, + { + "epoch": 0.5964208694336196, + "grad_norm": 2.272428512573242, + "learning_rate": 6.893626649438154e-06, + "loss": 0.692, + "step": 7882 + }, + { + "epoch": 0.5964965381559533, + "grad_norm": 2.106896162033081, + "learning_rate": 6.891437049983869e-06, + "loss": 0.5936, + "step": 7883 + }, + { + "epoch": 0.5965722068782868, + "grad_norm": 2.124065399169922, + "learning_rate": 6.889247582574617e-06, + "loss": 0.6684, + "step": 7884 + }, + { + "epoch": 0.5966478756006205, + "grad_norm": 1.6563143730163574, + "learning_rate": 6.887058247347506e-06, + "loss": 0.6952, + "step": 7885 + }, + { + "epoch": 0.5967235443229542, + "grad_norm": 1.7926180362701416, + "learning_rate": 6.884869044439644e-06, + "loss": 0.7559, + "step": 7886 + }, + { + "epoch": 0.5967992130452877, + "grad_norm": 2.0105221271514893, + "learning_rate": 6.8826799739881235e-06, + "loss": 0.6041, + "step": 7887 + }, + { + "epoch": 0.5968748817676214, + "grad_norm": 2.2636537551879883, + "learning_rate": 6.88049103613003e-06, + "loss": 0.6275, + "step": 7888 + }, + { + "epoch": 0.5969505504899549, + "grad_norm": 2.092416524887085, + "learning_rate": 6.878302231002446e-06, + "loss": 0.7661, + "step": 7889 + }, + { + "epoch": 0.5970262192122886, + "grad_norm": 1.8432059288024902, + "learning_rate": 6.876113558742437e-06, + "loss": 0.6898, + "step": 7890 + }, + { + "epoch": 0.5971018879346223, + "grad_norm": 1.8818315267562866, + "learning_rate": 6.873925019487064e-06, + "loss": 0.5599, + "step": 7891 + }, + { + "epoch": 0.5971775566569558, + "grad_norm": 1.8227505683898926, + "learning_rate": 6.871736613373384e-06, + "loss": 0.6097, + "step": 7892 + }, + { + "epoch": 0.5972532253792895, + "grad_norm": 5.854292869567871, + "learning_rate": 6.869548340538444e-06, + "loss": 0.7175, + "step": 7893 + }, + { + "epoch": 0.597328894101623, + "grad_norm": 1.9756673574447632, + "learning_rate": 6.8673602011192746e-06, + "loss": 0.6218, + "step": 7894 + }, + { + "epoch": 0.5974045628239567, + "grad_norm": 2.129859209060669, + "learning_rate": 6.86517219525291e-06, + "loss": 0.7691, + "step": 7895 + }, + { + "epoch": 0.5974802315462904, + "grad_norm": 2.6184825897216797, + "learning_rate": 6.862984323076363e-06, + "loss": 0.784, + "step": 7896 + }, + { + "epoch": 0.5975559002686239, + "grad_norm": 2.6178689002990723, + "learning_rate": 6.860796584726652e-06, + "loss": 0.6983, + "step": 7897 + }, + { + "epoch": 0.5976315689909576, + "grad_norm": 1.5637413263320923, + "learning_rate": 6.858608980340779e-06, + "loss": 0.6212, + "step": 7898 + }, + { + "epoch": 0.5977072377132913, + "grad_norm": 1.7278257608413696, + "learning_rate": 6.856421510055736e-06, + "loss": 0.6918, + "step": 7899 + }, + { + "epoch": 0.5977829064356248, + "grad_norm": 2.5124382972717285, + "learning_rate": 6.8542341740085136e-06, + "loss": 0.772, + "step": 7900 + }, + { + "epoch": 0.5978585751579585, + "grad_norm": 2.5317604541778564, + "learning_rate": 6.8520469723360835e-06, + 
"loss": 0.5569, + "step": 7901 + }, + { + "epoch": 0.597934243880292, + "grad_norm": 2.7019991874694824, + "learning_rate": 6.849859905175421e-06, + "loss": 0.7898, + "step": 7902 + }, + { + "epoch": 0.5980099126026257, + "grad_norm": 1.7270742654800415, + "learning_rate": 6.847672972663488e-06, + "loss": 0.8241, + "step": 7903 + }, + { + "epoch": 0.5980855813249594, + "grad_norm": 2.8753139972686768, + "learning_rate": 6.845486174937233e-06, + "loss": 0.6372, + "step": 7904 + }, + { + "epoch": 0.5981612500472929, + "grad_norm": 2.0319454669952393, + "learning_rate": 6.843299512133604e-06, + "loss": 0.578, + "step": 7905 + }, + { + "epoch": 0.5982369187696266, + "grad_norm": 2.0023133754730225, + "learning_rate": 6.841112984389529e-06, + "loss": 0.6541, + "step": 7906 + }, + { + "epoch": 0.5983125874919601, + "grad_norm": 2.8424692153930664, + "learning_rate": 6.8389265918419485e-06, + "loss": 0.6049, + "step": 7907 + }, + { + "epoch": 0.5983882562142938, + "grad_norm": 2.44048810005188, + "learning_rate": 6.836740334627771e-06, + "loss": 0.6917, + "step": 7908 + }, + { + "epoch": 0.5984639249366275, + "grad_norm": 1.9694398641586304, + "learning_rate": 6.8345542128839146e-06, + "loss": 0.5634, + "step": 7909 + }, + { + "epoch": 0.598539593658961, + "grad_norm": 3.217658519744873, + "learning_rate": 6.832368226747273e-06, + "loss": 0.824, + "step": 7910 + }, + { + "epoch": 0.5986152623812947, + "grad_norm": 2.072033643722534, + "learning_rate": 6.830182376354744e-06, + "loss": 0.6921, + "step": 7911 + }, + { + "epoch": 0.5986909311036284, + "grad_norm": 2.394564628601074, + "learning_rate": 6.8279966618432155e-06, + "loss": 0.6915, + "step": 7912 + }, + { + "epoch": 0.5987665998259619, + "grad_norm": 1.8874036073684692, + "learning_rate": 6.825811083349559e-06, + "loss": 0.6492, + "step": 7913 + }, + { + "epoch": 0.5988422685482956, + "grad_norm": 1.7274051904678345, + "learning_rate": 6.8236256410106476e-06, + "loss": 0.6783, + "step": 7914 + }, + { + "epoch": 0.5989179372706291, + "grad_norm": 2.2389156818389893, + "learning_rate": 6.821440334963335e-06, + "loss": 0.5332, + "step": 7915 + }, + { + "epoch": 0.5989936059929628, + "grad_norm": 3.8984248638153076, + "learning_rate": 6.819255165344475e-06, + "loss": 0.6413, + "step": 7916 + }, + { + "epoch": 0.5990692747152965, + "grad_norm": 2.091085433959961, + "learning_rate": 6.817070132290911e-06, + "loss": 0.6986, + "step": 7917 + }, + { + "epoch": 0.59914494343763, + "grad_norm": 2.023153781890869, + "learning_rate": 6.814885235939475e-06, + "loss": 0.734, + "step": 7918 + }, + { + "epoch": 0.5992206121599637, + "grad_norm": 2.2519912719726562, + "learning_rate": 6.812700476426995e-06, + "loss": 0.6936, + "step": 7919 + }, + { + "epoch": 0.5992962808822972, + "grad_norm": 1.7816762924194336, + "learning_rate": 6.810515853890283e-06, + "loss": 0.7838, + "step": 7920 + }, + { + "epoch": 0.5993719496046309, + "grad_norm": 2.385741710662842, + "learning_rate": 6.808331368466149e-06, + "loss": 0.7145, + "step": 7921 + }, + { + "epoch": 0.5994476183269646, + "grad_norm": 2.6496787071228027, + "learning_rate": 6.806147020291395e-06, + "loss": 0.7415, + "step": 7922 + }, + { + "epoch": 0.5995232870492981, + "grad_norm": 2.55653977394104, + "learning_rate": 6.803962809502812e-06, + "loss": 0.7278, + "step": 7923 + }, + { + "epoch": 0.5995989557716318, + "grad_norm": 2.4421041011810303, + "learning_rate": 6.80177873623718e-06, + "loss": 0.7035, + "step": 7924 + }, + { + "epoch": 0.5996746244939655, + "grad_norm": 2.765859603881836, + 
"learning_rate": 6.799594800631275e-06, + "loss": 0.5941, + "step": 7925 + }, + { + "epoch": 0.599750293216299, + "grad_norm": 2.608785390853882, + "learning_rate": 6.797411002821856e-06, + "loss": 0.6181, + "step": 7926 + }, + { + "epoch": 0.5998259619386327, + "grad_norm": 2.0030951499938965, + "learning_rate": 6.795227342945686e-06, + "loss": 0.6672, + "step": 7927 + }, + { + "epoch": 0.5999016306609662, + "grad_norm": 1.9999091625213623, + "learning_rate": 6.7930438211395136e-06, + "loss": 0.7108, + "step": 7928 + }, + { + "epoch": 0.5999772993832999, + "grad_norm": 2.112736463546753, + "learning_rate": 6.7908604375400725e-06, + "loss": 0.5911, + "step": 7929 + }, + { + "epoch": 0.6000529681056336, + "grad_norm": 2.1307666301727295, + "learning_rate": 6.788677192284098e-06, + "loss": 0.5783, + "step": 7930 + }, + { + "epoch": 0.6001286368279671, + "grad_norm": 2.141432046890259, + "learning_rate": 6.7864940855083085e-06, + "loss": 0.6246, + "step": 7931 + }, + { + "epoch": 0.6002043055503008, + "grad_norm": 1.9539786577224731, + "learning_rate": 6.784311117349416e-06, + "loss": 0.79, + "step": 7932 + }, + { + "epoch": 0.6002799742726344, + "grad_norm": 2.6877975463867188, + "learning_rate": 6.782128287944133e-06, + "loss": 0.665, + "step": 7933 + }, + { + "epoch": 0.600355642994968, + "grad_norm": 2.274559259414673, + "learning_rate": 6.779945597429147e-06, + "loss": 0.7325, + "step": 7934 + }, + { + "epoch": 0.6004313117173017, + "grad_norm": 2.1375906467437744, + "learning_rate": 6.77776304594115e-06, + "loss": 0.8202, + "step": 7935 + }, + { + "epoch": 0.6005069804396352, + "grad_norm": 1.993807315826416, + "learning_rate": 6.775580633616818e-06, + "loss": 0.7036, + "step": 7936 + }, + { + "epoch": 0.6005826491619689, + "grad_norm": 2.2041423320770264, + "learning_rate": 6.773398360592818e-06, + "loss": 0.8387, + "step": 7937 + }, + { + "epoch": 0.6006583178843026, + "grad_norm": 2.223114490509033, + "learning_rate": 6.771216227005818e-06, + "loss": 0.6734, + "step": 7938 + }, + { + "epoch": 0.6007339866066361, + "grad_norm": 1.8410695791244507, + "learning_rate": 6.769034232992466e-06, + "loss": 0.5136, + "step": 7939 + }, + { + "epoch": 0.6008096553289698, + "grad_norm": 2.056565046310425, + "learning_rate": 6.766852378689406e-06, + "loss": 0.6536, + "step": 7940 + }, + { + "epoch": 0.6008853240513033, + "grad_norm": 2.2308459281921387, + "learning_rate": 6.764670664233275e-06, + "loss": 0.6653, + "step": 7941 + }, + { + "epoch": 0.600960992773637, + "grad_norm": 2.1193389892578125, + "learning_rate": 6.762489089760692e-06, + "loss": 0.6981, + "step": 7942 + }, + { + "epoch": 0.6010366614959707, + "grad_norm": 2.764047145843506, + "learning_rate": 6.760307655408282e-06, + "loss": 0.7005, + "step": 7943 + }, + { + "epoch": 0.6011123302183042, + "grad_norm": 2.1812756061553955, + "learning_rate": 6.758126361312653e-06, + "loss": 0.6901, + "step": 7944 + }, + { + "epoch": 0.6011879989406379, + "grad_norm": 2.3195836544036865, + "learning_rate": 6.7559452076104e-06, + "loss": 0.8043, + "step": 7945 + }, + { + "epoch": 0.6012636676629715, + "grad_norm": 2.256772994995117, + "learning_rate": 6.753764194438118e-06, + "loss": 0.7321, + "step": 7946 + }, + { + "epoch": 0.6013393363853051, + "grad_norm": 1.9437758922576904, + "learning_rate": 6.751583321932382e-06, + "loss": 0.7759, + "step": 7947 + }, + { + "epoch": 0.6014150051076388, + "grad_norm": 1.943305253982544, + "learning_rate": 6.749402590229775e-06, + "loss": 0.6389, + "step": 7948 + }, + { + "epoch": 0.6014906738299723, + 
"grad_norm": 2.6814630031585693, + "learning_rate": 6.747221999466858e-06, + "loss": 0.7856, + "step": 7949 + }, + { + "epoch": 0.601566342552306, + "grad_norm": 2.330061197280884, + "learning_rate": 6.745041549780184e-06, + "loss": 0.6706, + "step": 7950 + }, + { + "epoch": 0.6016420112746397, + "grad_norm": 2.5710747241973877, + "learning_rate": 6.742861241306301e-06, + "loss": 0.7252, + "step": 7951 + }, + { + "epoch": 0.6017176799969732, + "grad_norm": 2.0507359504699707, + "learning_rate": 6.7406810741817464e-06, + "loss": 0.7641, + "step": 7952 + }, + { + "epoch": 0.6017933487193069, + "grad_norm": 2.1858019828796387, + "learning_rate": 6.738501048543054e-06, + "loss": 0.6429, + "step": 7953 + }, + { + "epoch": 0.6018690174416405, + "grad_norm": 1.9703868627548218, + "learning_rate": 6.736321164526739e-06, + "loss": 0.6259, + "step": 7954 + }, + { + "epoch": 0.6019446861639741, + "grad_norm": 3.4809346199035645, + "learning_rate": 6.734141422269315e-06, + "loss": 0.6968, + "step": 7955 + }, + { + "epoch": 0.6020203548863078, + "grad_norm": 1.7865307331085205, + "learning_rate": 6.731961821907283e-06, + "loss": 0.6615, + "step": 7956 + }, + { + "epoch": 0.6020960236086413, + "grad_norm": 2.6982762813568115, + "learning_rate": 6.729782363577135e-06, + "loss": 0.7117, + "step": 7957 + }, + { + "epoch": 0.602171692330975, + "grad_norm": 2.4082143306732178, + "learning_rate": 6.727603047415362e-06, + "loss": 0.884, + "step": 7958 + }, + { + "epoch": 0.6022473610533086, + "grad_norm": 1.7152711153030396, + "learning_rate": 6.725423873558435e-06, + "loss": 0.5956, + "step": 7959 + }, + { + "epoch": 0.6023230297756422, + "grad_norm": 2.04510760307312, + "learning_rate": 6.723244842142823e-06, + "loss": 0.762, + "step": 7960 + }, + { + "epoch": 0.6023986984979759, + "grad_norm": 2.1493642330169678, + "learning_rate": 6.721065953304981e-06, + "loss": 0.6102, + "step": 7961 + }, + { + "epoch": 0.6024743672203094, + "grad_norm": 1.814997911453247, + "learning_rate": 6.718887207181358e-06, + "loss": 0.6394, + "step": 7962 + }, + { + "epoch": 0.6025500359426431, + "grad_norm": 2.8531596660614014, + "learning_rate": 6.716708603908399e-06, + "loss": 0.635, + "step": 7963 + }, + { + "epoch": 0.6026257046649768, + "grad_norm": 2.303659677505493, + "learning_rate": 6.714530143622533e-06, + "loss": 0.8212, + "step": 7964 + }, + { + "epoch": 0.6027013733873103, + "grad_norm": 2.530139684677124, + "learning_rate": 6.71235182646018e-06, + "loss": 0.6074, + "step": 7965 + }, + { + "epoch": 0.602777042109644, + "grad_norm": 2.0177268981933594, + "learning_rate": 6.710173652557756e-06, + "loss": 0.6609, + "step": 7966 + }, + { + "epoch": 0.6028527108319776, + "grad_norm": 2.0074338912963867, + "learning_rate": 6.707995622051663e-06, + "loss": 0.764, + "step": 7967 + }, + { + "epoch": 0.6029283795543112, + "grad_norm": 1.9502952098846436, + "learning_rate": 6.705817735078295e-06, + "loss": 0.7149, + "step": 7968 + }, + { + "epoch": 0.6030040482766449, + "grad_norm": 2.4225590229034424, + "learning_rate": 6.703639991774045e-06, + "loss": 0.8186, + "step": 7969 + }, + { + "epoch": 0.6030797169989784, + "grad_norm": 2.3697550296783447, + "learning_rate": 6.701462392275284e-06, + "loss": 0.7241, + "step": 7970 + }, + { + "epoch": 0.6031553857213121, + "grad_norm": 1.843167781829834, + "learning_rate": 6.699284936718385e-06, + "loss": 0.5219, + "step": 7971 + }, + { + "epoch": 0.6032310544436457, + "grad_norm": 2.5825045108795166, + "learning_rate": 6.697107625239701e-06, + "loss": 0.7933, + "step": 7972 + }, + { + 
"epoch": 0.6033067231659793, + "grad_norm": 1.4268264770507812, + "learning_rate": 6.694930457975585e-06, + "loss": 0.8802, + "step": 7973 + }, + { + "epoch": 0.603382391888313, + "grad_norm": 2.36474347114563, + "learning_rate": 6.692753435062383e-06, + "loss": 0.8809, + "step": 7974 + }, + { + "epoch": 0.6034580606106466, + "grad_norm": 1.9146884679794312, + "learning_rate": 6.6905765566364225e-06, + "loss": 0.6139, + "step": 7975 + }, + { + "epoch": 0.6035337293329802, + "grad_norm": 2.1779873371124268, + "learning_rate": 6.688399822834028e-06, + "loss": 0.6991, + "step": 7976 + }, + { + "epoch": 0.6036093980553139, + "grad_norm": 2.397181272506714, + "learning_rate": 6.686223233791513e-06, + "loss": 0.8585, + "step": 7977 + }, + { + "epoch": 0.6036850667776474, + "grad_norm": 2.285151481628418, + "learning_rate": 6.68404678964518e-06, + "loss": 0.7212, + "step": 7978 + }, + { + "epoch": 0.6037607354999811, + "grad_norm": 2.0508203506469727, + "learning_rate": 6.681870490531329e-06, + "loss": 0.7258, + "step": 7979 + }, + { + "epoch": 0.6038364042223147, + "grad_norm": 1.981296181678772, + "learning_rate": 6.679694336586247e-06, + "loss": 0.6721, + "step": 7980 + }, + { + "epoch": 0.6039120729446483, + "grad_norm": 2.3068652153015137, + "learning_rate": 6.6775183279462105e-06, + "loss": 0.5249, + "step": 7981 + }, + { + "epoch": 0.603987741666982, + "grad_norm": 2.042125940322876, + "learning_rate": 6.675342464747489e-06, + "loss": 0.7777, + "step": 7982 + }, + { + "epoch": 0.6040634103893155, + "grad_norm": 2.7529611587524414, + "learning_rate": 6.673166747126338e-06, + "loss": 0.7063, + "step": 7983 + }, + { + "epoch": 0.6041390791116492, + "grad_norm": 1.930591344833374, + "learning_rate": 6.670991175219012e-06, + "loss": 0.7152, + "step": 7984 + }, + { + "epoch": 0.6042147478339828, + "grad_norm": 1.793343186378479, + "learning_rate": 6.668815749161754e-06, + "loss": 0.5882, + "step": 7985 + }, + { + "epoch": 0.6042904165563164, + "grad_norm": 2.146716356277466, + "learning_rate": 6.666640469090792e-06, + "loss": 0.7066, + "step": 7986 + }, + { + "epoch": 0.6043660852786501, + "grad_norm": 2.544588327407837, + "learning_rate": 6.664465335142352e-06, + "loss": 0.7241, + "step": 7987 + }, + { + "epoch": 0.6044417540009837, + "grad_norm": 2.0014796257019043, + "learning_rate": 6.662290347452644e-06, + "loss": 0.6437, + "step": 7988 + }, + { + "epoch": 0.6045174227233173, + "grad_norm": 2.055544376373291, + "learning_rate": 6.660115506157876e-06, + "loss": 0.5982, + "step": 7989 + }, + { + "epoch": 0.604593091445651, + "grad_norm": 2.17427134513855, + "learning_rate": 6.6579408113942466e-06, + "loss": 0.6711, + "step": 7990 + }, + { + "epoch": 0.6046687601679845, + "grad_norm": 2.4635872840881348, + "learning_rate": 6.655766263297936e-06, + "loss": 0.7088, + "step": 7991 + }, + { + "epoch": 0.6047444288903182, + "grad_norm": 2.311051607131958, + "learning_rate": 6.653591862005126e-06, + "loss": 0.7915, + "step": 7992 + }, + { + "epoch": 0.6048200976126518, + "grad_norm": 2.1790106296539307, + "learning_rate": 6.65141760765198e-06, + "loss": 0.5936, + "step": 7993 + }, + { + "epoch": 0.6048957663349854, + "grad_norm": 2.363032341003418, + "learning_rate": 6.64924350037466e-06, + "loss": 0.7366, + "step": 7994 + }, + { + "epoch": 0.6049714350573191, + "grad_norm": 4.0425214767456055, + "learning_rate": 6.6470695403093156e-06, + "loss": 0.6819, + "step": 7995 + }, + { + "epoch": 0.6050471037796527, + "grad_norm": 1.7406569719314575, + "learning_rate": 6.6448957275920895e-06, + "loss": 
0.5655, + "step": 7996 + }, + { + "epoch": 0.6051227725019863, + "grad_norm": 2.071523666381836, + "learning_rate": 6.642722062359109e-06, + "loss": 0.6732, + "step": 7997 + }, + { + "epoch": 0.6051984412243199, + "grad_norm": 2.9325242042541504, + "learning_rate": 6.640548544746494e-06, + "loss": 0.7125, + "step": 7998 + }, + { + "epoch": 0.6052741099466535, + "grad_norm": 1.7183152437210083, + "learning_rate": 6.638375174890364e-06, + "loss": 0.6568, + "step": 7999 + }, + { + "epoch": 0.6053497786689872, + "grad_norm": 2.2659685611724854, + "learning_rate": 6.636201952926818e-06, + "loss": 0.6901, + "step": 8000 + }, + { + "epoch": 0.6054254473913208, + "grad_norm": 1.8365509510040283, + "learning_rate": 6.634028878991954e-06, + "loss": 0.6363, + "step": 8001 + }, + { + "epoch": 0.6055011161136544, + "grad_norm": 2.3198299407958984, + "learning_rate": 6.631855953221851e-06, + "loss": 0.748, + "step": 8002 + }, + { + "epoch": 0.6055767848359881, + "grad_norm": 2.4509475231170654, + "learning_rate": 6.629683175752586e-06, + "loss": 0.667, + "step": 8003 + }, + { + "epoch": 0.6056524535583216, + "grad_norm": 2.3595855236053467, + "learning_rate": 6.62751054672023e-06, + "loss": 0.693, + "step": 8004 + }, + { + "epoch": 0.6057281222806553, + "grad_norm": 2.1252949237823486, + "learning_rate": 6.625338066260836e-06, + "loss": 0.766, + "step": 8005 + }, + { + "epoch": 0.6058037910029889, + "grad_norm": 1.8091083765029907, + "learning_rate": 6.623165734510455e-06, + "loss": 0.7842, + "step": 8006 + }, + { + "epoch": 0.6058794597253225, + "grad_norm": 2.139522075653076, + "learning_rate": 6.620993551605123e-06, + "loss": 0.8644, + "step": 8007 + }, + { + "epoch": 0.6059551284476562, + "grad_norm": 2.6988987922668457, + "learning_rate": 6.618821517680869e-06, + "loss": 0.7944, + "step": 8008 + }, + { + "epoch": 0.6060307971699898, + "grad_norm": 1.9569789171218872, + "learning_rate": 6.616649632873708e-06, + "loss": 0.7234, + "step": 8009 + }, + { + "epoch": 0.6061064658923234, + "grad_norm": 2.4347798824310303, + "learning_rate": 6.614477897319661e-06, + "loss": 0.7029, + "step": 8010 + }, + { + "epoch": 0.6061821346146571, + "grad_norm": 2.1535308361053467, + "learning_rate": 6.612306311154722e-06, + "loss": 0.5297, + "step": 8011 + }, + { + "epoch": 0.6062578033369906, + "grad_norm": 2.128608465194702, + "learning_rate": 6.6101348745148865e-06, + "loss": 0.5245, + "step": 8012 + }, + { + "epoch": 0.6063334720593243, + "grad_norm": 2.0060644149780273, + "learning_rate": 6.607963587536134e-06, + "loss": 0.6084, + "step": 8013 + }, + { + "epoch": 0.6064091407816579, + "grad_norm": 2.5640623569488525, + "learning_rate": 6.605792450354436e-06, + "loss": 0.6693, + "step": 8014 + }, + { + "epoch": 0.6064848095039915, + "grad_norm": 2.0857560634613037, + "learning_rate": 6.603621463105762e-06, + "loss": 0.7536, + "step": 8015 + }, + { + "epoch": 0.6065604782263252, + "grad_norm": 2.7970449924468994, + "learning_rate": 6.601450625926061e-06, + "loss": 0.8408, + "step": 8016 + }, + { + "epoch": 0.6066361469486587, + "grad_norm": 2.1201984882354736, + "learning_rate": 6.599279938951282e-06, + "loss": 0.6348, + "step": 8017 + }, + { + "epoch": 0.6067118156709924, + "grad_norm": 2.2606847286224365, + "learning_rate": 6.597109402317356e-06, + "loss": 0.8402, + "step": 8018 + }, + { + "epoch": 0.606787484393326, + "grad_norm": 2.860076904296875, + "learning_rate": 6.594939016160209e-06, + "loss": 0.6749, + "step": 8019 + }, + { + "epoch": 0.6068631531156596, + "grad_norm": 2.2081804275512695, + 
"learning_rate": 6.592768780615764e-06, + "loss": 0.704, + "step": 8020 + }, + { + "epoch": 0.6069388218379933, + "grad_norm": 2.2822539806365967, + "learning_rate": 6.590598695819921e-06, + "loss": 0.6199, + "step": 8021 + }, + { + "epoch": 0.6070144905603269, + "grad_norm": 2.0878994464874268, + "learning_rate": 6.588428761908583e-06, + "loss": 0.6372, + "step": 8022 + }, + { + "epoch": 0.6070901592826605, + "grad_norm": 2.373242139816284, + "learning_rate": 6.586258979017634e-06, + "loss": 0.7862, + "step": 8023 + }, + { + "epoch": 0.6071658280049942, + "grad_norm": 1.9555684328079224, + "learning_rate": 6.584089347282954e-06, + "loss": 0.676, + "step": 8024 + }, + { + "epoch": 0.6072414967273277, + "grad_norm": 2.07110595703125, + "learning_rate": 6.581919866840413e-06, + "loss": 0.5871, + "step": 8025 + }, + { + "epoch": 0.6073171654496614, + "grad_norm": 3.0451760292053223, + "learning_rate": 6.579750537825874e-06, + "loss": 0.6727, + "step": 8026 + }, + { + "epoch": 0.607392834171995, + "grad_norm": 2.067446708679199, + "learning_rate": 6.577581360375182e-06, + "loss": 0.726, + "step": 8027 + }, + { + "epoch": 0.6074685028943286, + "grad_norm": 1.995710849761963, + "learning_rate": 6.575412334624183e-06, + "loss": 0.79, + "step": 8028 + }, + { + "epoch": 0.6075441716166623, + "grad_norm": 2.084068536758423, + "learning_rate": 6.573243460708701e-06, + "loss": 0.7256, + "step": 8029 + }, + { + "epoch": 0.6076198403389959, + "grad_norm": 2.150193929672241, + "learning_rate": 6.571074738764565e-06, + "loss": 0.7144, + "step": 8030 + }, + { + "epoch": 0.6076955090613295, + "grad_norm": 1.506453514099121, + "learning_rate": 6.568906168927585e-06, + "loss": 0.8205, + "step": 8031 + }, + { + "epoch": 0.6077711777836631, + "grad_norm": 2.1317901611328125, + "learning_rate": 6.5667377513335645e-06, + "loss": 0.7269, + "step": 8032 + }, + { + "epoch": 0.6078468465059967, + "grad_norm": 2.2331316471099854, + "learning_rate": 6.564569486118297e-06, + "loss": 0.8241, + "step": 8033 + }, + { + "epoch": 0.6079225152283304, + "grad_norm": 2.03304386138916, + "learning_rate": 6.562401373417562e-06, + "loss": 0.6857, + "step": 8034 + }, + { + "epoch": 0.607998183950664, + "grad_norm": 1.8887367248535156, + "learning_rate": 6.560233413367139e-06, + "loss": 0.6265, + "step": 8035 + }, + { + "epoch": 0.6080738526729976, + "grad_norm": 1.7368558645248413, + "learning_rate": 6.558065606102792e-06, + "loss": 0.6891, + "step": 8036 + }, + { + "epoch": 0.6081495213953313, + "grad_norm": 2.355769395828247, + "learning_rate": 6.555897951760274e-06, + "loss": 0.7196, + "step": 8037 + }, + { + "epoch": 0.6082251901176648, + "grad_norm": 2.221735715866089, + "learning_rate": 6.553730450475333e-06, + "loss": 0.6272, + "step": 8038 + }, + { + "epoch": 0.6083008588399985, + "grad_norm": 2.569798231124878, + "learning_rate": 6.551563102383697e-06, + "loss": 0.7951, + "step": 8039 + }, + { + "epoch": 0.6083765275623321, + "grad_norm": 1.8355985879898071, + "learning_rate": 6.5493959076211055e-06, + "loss": 0.61, + "step": 8040 + }, + { + "epoch": 0.6084521962846657, + "grad_norm": 2.0829899311065674, + "learning_rate": 6.547228866323265e-06, + "loss": 0.7505, + "step": 8041 + }, + { + "epoch": 0.6085278650069994, + "grad_norm": 1.4479275941848755, + "learning_rate": 6.54506197862589e-06, + "loss": 0.827, + "step": 8042 + }, + { + "epoch": 0.608603533729333, + "grad_norm": 1.3985527753829956, + "learning_rate": 6.542895244664671e-06, + "loss": 0.9008, + "step": 8043 + }, + { + "epoch": 0.6086792024516666, + "grad_norm": 
1.865014672279358, + "learning_rate": 6.540728664575301e-06, + "loss": 0.7863, + "step": 8044 + }, + { + "epoch": 0.6087548711740002, + "grad_norm": 1.861416220664978, + "learning_rate": 6.538562238493453e-06, + "loss": 0.6838, + "step": 8045 + }, + { + "epoch": 0.6088305398963338, + "grad_norm": 2.162048578262329, + "learning_rate": 6.5363959665547996e-06, + "loss": 0.8778, + "step": 8046 + }, + { + "epoch": 0.6089062086186675, + "grad_norm": 2.3792169094085693, + "learning_rate": 6.534229848895002e-06, + "loss": 0.6775, + "step": 8047 + }, + { + "epoch": 0.6089818773410011, + "grad_norm": 1.898842692375183, + "learning_rate": 6.532063885649705e-06, + "loss": 0.5081, + "step": 8048 + }, + { + "epoch": 0.6090575460633347, + "grad_norm": 1.847794532775879, + "learning_rate": 6.52989807695455e-06, + "loss": 0.7093, + "step": 8049 + }, + { + "epoch": 0.6091332147856684, + "grad_norm": 1.8337700366973877, + "learning_rate": 6.527732422945164e-06, + "loss": 0.7425, + "step": 8050 + }, + { + "epoch": 0.609208883508002, + "grad_norm": 2.419373035430908, + "learning_rate": 6.525566923757172e-06, + "loss": 0.7282, + "step": 8051 + }, + { + "epoch": 0.6092845522303356, + "grad_norm": 2.604099750518799, + "learning_rate": 6.5234015795261845e-06, + "loss": 0.7182, + "step": 8052 + }, + { + "epoch": 0.6093602209526692, + "grad_norm": 1.8286306858062744, + "learning_rate": 6.5212363903877975e-06, + "loss": 0.6825, + "step": 8053 + }, + { + "epoch": 0.6094358896750028, + "grad_norm": 2.197711229324341, + "learning_rate": 6.519071356477606e-06, + "loss": 0.6857, + "step": 8054 + }, + { + "epoch": 0.6095115583973365, + "grad_norm": 2.546816110610962, + "learning_rate": 6.516906477931188e-06, + "loss": 0.6036, + "step": 8055 + }, + { + "epoch": 0.6095872271196701, + "grad_norm": 1.9592362642288208, + "learning_rate": 6.514741754884122e-06, + "loss": 0.8768, + "step": 8056 + }, + { + "epoch": 0.6096628958420037, + "grad_norm": 2.2540531158447266, + "learning_rate": 6.512577187471963e-06, + "loss": 0.8306, + "step": 8057 + }, + { + "epoch": 0.6097385645643373, + "grad_norm": 2.1701200008392334, + "learning_rate": 6.510412775830269e-06, + "loss": 0.7949, + "step": 8058 + }, + { + "epoch": 0.609814233286671, + "grad_norm": 2.3326847553253174, + "learning_rate": 6.508248520094577e-06, + "loss": 0.8873, + "step": 8059 + }, + { + "epoch": 0.6098899020090046, + "grad_norm": 2.0343968868255615, + "learning_rate": 6.506084420400419e-06, + "loss": 0.6667, + "step": 8060 + }, + { + "epoch": 0.6099655707313382, + "grad_norm": 2.17753529548645, + "learning_rate": 6.503920476883326e-06, + "loss": 0.7618, + "step": 8061 + }, + { + "epoch": 0.6100412394536718, + "grad_norm": 2.2259414196014404, + "learning_rate": 6.501756689678802e-06, + "loss": 0.629, + "step": 8062 + }, + { + "epoch": 0.6101169081760055, + "grad_norm": 2.3345701694488525, + "learning_rate": 6.4995930589223575e-06, + "loss": 0.74, + "step": 8063 + }, + { + "epoch": 0.6101925768983391, + "grad_norm": 2.023045301437378, + "learning_rate": 6.497429584749482e-06, + "loss": 0.8297, + "step": 8064 + }, + { + "epoch": 0.6102682456206727, + "grad_norm": 2.1104860305786133, + "learning_rate": 6.495266267295658e-06, + "loss": 0.8041, + "step": 8065 + }, + { + "epoch": 0.6103439143430063, + "grad_norm": 2.1051814556121826, + "learning_rate": 6.493103106696364e-06, + "loss": 0.7396, + "step": 8066 + }, + { + "epoch": 0.6104195830653399, + "grad_norm": 1.9070154428482056, + "learning_rate": 6.490940103087062e-06, + "loss": 0.7472, + "step": 8067 + }, + { + "epoch": 
0.6104952517876736, + "grad_norm": 2.5003867149353027, + "learning_rate": 6.488777256603204e-06, + "loss": 0.8995, + "step": 8068 + }, + { + "epoch": 0.6105709205100072, + "grad_norm": 2.510374069213867, + "learning_rate": 6.486614567380239e-06, + "loss": 0.7385, + "step": 8069 + }, + { + "epoch": 0.6106465892323408, + "grad_norm": 1.955937385559082, + "learning_rate": 6.484452035553597e-06, + "loss": 0.728, + "step": 8070 + }, + { + "epoch": 0.6107222579546744, + "grad_norm": 1.8284022808074951, + "learning_rate": 6.482289661258704e-06, + "loss": 0.6269, + "step": 8071 + }, + { + "epoch": 0.610797926677008, + "grad_norm": 1.7981479167938232, + "learning_rate": 6.4801274446309794e-06, + "loss": 0.7175, + "step": 8072 + }, + { + "epoch": 0.6108735953993417, + "grad_norm": 1.8497958183288574, + "learning_rate": 6.477965385805822e-06, + "loss": 0.7925, + "step": 8073 + }, + { + "epoch": 0.6109492641216753, + "grad_norm": 1.7783727645874023, + "learning_rate": 6.475803484918631e-06, + "loss": 0.6708, + "step": 8074 + }, + { + "epoch": 0.6110249328440089, + "grad_norm": 1.979129433631897, + "learning_rate": 6.473641742104787e-06, + "loss": 0.5945, + "step": 8075 + }, + { + "epoch": 0.6111006015663426, + "grad_norm": 1.9033944606781006, + "learning_rate": 6.4714801574996695e-06, + "loss": 0.6975, + "step": 8076 + }, + { + "epoch": 0.6111762702886762, + "grad_norm": 1.668627381324768, + "learning_rate": 6.469318731238645e-06, + "loss": 0.6458, + "step": 8077 + }, + { + "epoch": 0.6112519390110098, + "grad_norm": 2.2541165351867676, + "learning_rate": 6.467157463457064e-06, + "loss": 0.6398, + "step": 8078 + }, + { + "epoch": 0.6113276077333434, + "grad_norm": 2.112131118774414, + "learning_rate": 6.464996354290277e-06, + "loss": 0.7406, + "step": 8079 + }, + { + "epoch": 0.611403276455677, + "grad_norm": 1.7680984735488892, + "learning_rate": 6.462835403873615e-06, + "loss": 0.7407, + "step": 8080 + }, + { + "epoch": 0.6114789451780107, + "grad_norm": 1.924974799156189, + "learning_rate": 6.460674612342407e-06, + "loss": 0.7928, + "step": 8081 + }, + { + "epoch": 0.6115546139003443, + "grad_norm": 1.6907529830932617, + "learning_rate": 6.458513979831969e-06, + "loss": 0.7178, + "step": 8082 + }, + { + "epoch": 0.6116302826226779, + "grad_norm": 2.3238351345062256, + "learning_rate": 6.456353506477607e-06, + "loss": 0.7008, + "step": 8083 + }, + { + "epoch": 0.6117059513450115, + "grad_norm": 1.6362533569335938, + "learning_rate": 6.454193192414613e-06, + "loss": 0.6097, + "step": 8084 + }, + { + "epoch": 0.6117816200673452, + "grad_norm": 2.3989768028259277, + "learning_rate": 6.452033037778277e-06, + "loss": 0.7459, + "step": 8085 + }, + { + "epoch": 0.6118572887896788, + "grad_norm": 2.1705870628356934, + "learning_rate": 6.449873042703871e-06, + "loss": 0.6697, + "step": 8086 + }, + { + "epoch": 0.6119329575120124, + "grad_norm": 2.115978240966797, + "learning_rate": 6.4477132073266645e-06, + "loss": 0.6262, + "step": 8087 + }, + { + "epoch": 0.612008626234346, + "grad_norm": 1.8548389673233032, + "learning_rate": 6.445553531781915e-06, + "loss": 0.6613, + "step": 8088 + }, + { + "epoch": 0.6120842949566797, + "grad_norm": 1.7452164888381958, + "learning_rate": 6.443394016204861e-06, + "loss": 0.6711, + "step": 8089 + }, + { + "epoch": 0.6121599636790133, + "grad_norm": 1.9946900606155396, + "learning_rate": 6.441234660730747e-06, + "loss": 0.6328, + "step": 8090 + }, + { + "epoch": 0.6122356324013469, + "grad_norm": 2.100299596786499, + "learning_rate": 6.43907546549479e-06, + "loss": 0.7854, 
+ "step": 8091 + }, + { + "epoch": 0.6123113011236805, + "grad_norm": 2.174600124359131, + "learning_rate": 6.4369164306322125e-06, + "loss": 0.7751, + "step": 8092 + }, + { + "epoch": 0.6123869698460142, + "grad_norm": 1.753521203994751, + "learning_rate": 6.434757556278219e-06, + "loss": 0.5145, + "step": 8093 + }, + { + "epoch": 0.6124626385683478, + "grad_norm": 1.6871925592422485, + "learning_rate": 6.432598842568003e-06, + "loss": 0.6158, + "step": 8094 + }, + { + "epoch": 0.6125383072906814, + "grad_norm": 1.852492332458496, + "learning_rate": 6.430440289636754e-06, + "loss": 0.613, + "step": 8095 + }, + { + "epoch": 0.612613976013015, + "grad_norm": 1.7599942684173584, + "learning_rate": 6.428281897619638e-06, + "loss": 0.8107, + "step": 8096 + }, + { + "epoch": 0.6126896447353486, + "grad_norm": 2.176158905029297, + "learning_rate": 6.4261236666518345e-06, + "loss": 0.6481, + "step": 8097 + }, + { + "epoch": 0.6127653134576823, + "grad_norm": 2.7235372066497803, + "learning_rate": 6.423965596868489e-06, + "loss": 0.7695, + "step": 8098 + }, + { + "epoch": 0.6128409821800159, + "grad_norm": 1.6822925806045532, + "learning_rate": 6.421807688404753e-06, + "loss": 0.533, + "step": 8099 + }, + { + "epoch": 0.6129166509023495, + "grad_norm": 1.8413187265396118, + "learning_rate": 6.419649941395756e-06, + "loss": 0.6069, + "step": 8100 + }, + { + "epoch": 0.6129923196246831, + "grad_norm": 1.9572733640670776, + "learning_rate": 6.417492355976624e-06, + "loss": 0.7027, + "step": 8101 + }, + { + "epoch": 0.6130679883470168, + "grad_norm": 2.1810455322265625, + "learning_rate": 6.4153349322824765e-06, + "loss": 0.6632, + "step": 8102 + }, + { + "epoch": 0.6131436570693504, + "grad_norm": 2.011125326156616, + "learning_rate": 6.413177670448413e-06, + "loss": 0.7041, + "step": 8103 + }, + { + "epoch": 0.613219325791684, + "grad_norm": 2.0759332180023193, + "learning_rate": 6.411020570609533e-06, + "loss": 0.7887, + "step": 8104 + }, + { + "epoch": 0.6132949945140176, + "grad_norm": 1.7652744054794312, + "learning_rate": 6.408863632900918e-06, + "loss": 0.6336, + "step": 8105 + }, + { + "epoch": 0.6133706632363513, + "grad_norm": 1.9956303834915161, + "learning_rate": 6.406706857457639e-06, + "loss": 0.7777, + "step": 8106 + }, + { + "epoch": 0.6134463319586849, + "grad_norm": 2.095097780227661, + "learning_rate": 6.40455024441477e-06, + "loss": 0.6843, + "step": 8107 + }, + { + "epoch": 0.6135220006810185, + "grad_norm": 2.287135601043701, + "learning_rate": 6.402393793907355e-06, + "loss": 0.7009, + "step": 8108 + }, + { + "epoch": 0.6135976694033521, + "grad_norm": 1.8834456205368042, + "learning_rate": 6.4002375060704465e-06, + "loss": 0.6393, + "step": 8109 + }, + { + "epoch": 0.6136733381256857, + "grad_norm": 4.113138198852539, + "learning_rate": 6.398081381039072e-06, + "loss": 0.7464, + "step": 8110 + }, + { + "epoch": 0.6137490068480194, + "grad_norm": 1.7231311798095703, + "learning_rate": 6.395925418948255e-06, + "loss": 0.6792, + "step": 8111 + }, + { + "epoch": 0.613824675570353, + "grad_norm": 3.072347640991211, + "learning_rate": 6.3937696199330116e-06, + "loss": 0.5752, + "step": 8112 + }, + { + "epoch": 0.6139003442926866, + "grad_norm": 1.6998788118362427, + "learning_rate": 6.3916139841283465e-06, + "loss": 0.6967, + "step": 8113 + }, + { + "epoch": 0.6139760130150202, + "grad_norm": 2.079223394393921, + "learning_rate": 6.3894585116692496e-06, + "loss": 0.6728, + "step": 8114 + }, + { + "epoch": 0.6140516817373539, + "grad_norm": 2.595216751098633, + "learning_rate": 
6.387303202690705e-06, + "loss": 0.6351, + "step": 8115 + }, + { + "epoch": 0.6141273504596875, + "grad_norm": 2.1260061264038086, + "learning_rate": 6.385148057327681e-06, + "loss": 0.7492, + "step": 8116 + }, + { + "epoch": 0.6142030191820211, + "grad_norm": 1.9894077777862549, + "learning_rate": 6.382993075715144e-06, + "loss": 0.8202, + "step": 8117 + }, + { + "epoch": 0.6142786879043547, + "grad_norm": 3.4752848148345947, + "learning_rate": 6.380838257988048e-06, + "loss": 0.6512, + "step": 8118 + }, + { + "epoch": 0.6143543566266884, + "grad_norm": 2.0392558574676514, + "learning_rate": 6.378683604281329e-06, + "loss": 0.6064, + "step": 8119 + }, + { + "epoch": 0.614430025349022, + "grad_norm": 1.8147621154785156, + "learning_rate": 6.376529114729924e-06, + "loss": 0.6444, + "step": 8120 + }, + { + "epoch": 0.6145056940713556, + "grad_norm": 2.527744770050049, + "learning_rate": 6.374374789468749e-06, + "loss": 0.6505, + "step": 8121 + }, + { + "epoch": 0.6145813627936892, + "grad_norm": 2.176684856414795, + "learning_rate": 6.372220628632714e-06, + "loss": 0.5707, + "step": 8122 + }, + { + "epoch": 0.6146570315160228, + "grad_norm": 2.0857019424438477, + "learning_rate": 6.3700666323567265e-06, + "loss": 0.7341, + "step": 8123 + }, + { + "epoch": 0.6147327002383565, + "grad_norm": 2.4121081829071045, + "learning_rate": 6.3679128007756724e-06, + "loss": 0.7551, + "step": 8124 + }, + { + "epoch": 0.6148083689606901, + "grad_norm": 2.668250560760498, + "learning_rate": 6.365759134024433e-06, + "loss": 0.7629, + "step": 8125 + }, + { + "epoch": 0.6148840376830237, + "grad_norm": 1.942336916923523, + "learning_rate": 6.363605632237874e-06, + "loss": 0.6856, + "step": 8126 + }, + { + "epoch": 0.6149597064053574, + "grad_norm": 2.963472843170166, + "learning_rate": 6.361452295550856e-06, + "loss": 0.684, + "step": 8127 + }, + { + "epoch": 0.615035375127691, + "grad_norm": 2.0592167377471924, + "learning_rate": 6.359299124098231e-06, + "loss": 0.7157, + "step": 8128 + }, + { + "epoch": 0.6151110438500246, + "grad_norm": 2.172013282775879, + "learning_rate": 6.3571461180148395e-06, + "loss": 0.6001, + "step": 8129 + }, + { + "epoch": 0.6151867125723582, + "grad_norm": 2.4280693531036377, + "learning_rate": 6.354993277435503e-06, + "loss": 0.5956, + "step": 8130 + }, + { + "epoch": 0.6152623812946918, + "grad_norm": 1.9213684797286987, + "learning_rate": 6.352840602495044e-06, + "loss": 0.59, + "step": 8131 + }, + { + "epoch": 0.6153380500170255, + "grad_norm": 2.0935068130493164, + "learning_rate": 6.350688093328266e-06, + "loss": 0.6688, + "step": 8132 + }, + { + "epoch": 0.6154137187393591, + "grad_norm": 3.5406651496887207, + "learning_rate": 6.348535750069969e-06, + "loss": 0.8106, + "step": 8133 + }, + { + "epoch": 0.6154893874616927, + "grad_norm": 2.5586190223693848, + "learning_rate": 6.346383572854942e-06, + "loss": 0.6554, + "step": 8134 + }, + { + "epoch": 0.6155650561840263, + "grad_norm": 5.290948390960693, + "learning_rate": 6.344231561817956e-06, + "loss": 0.5677, + "step": 8135 + }, + { + "epoch": 0.6156407249063599, + "grad_norm": 9.015077590942383, + "learning_rate": 6.342079717093782e-06, + "loss": 0.7897, + "step": 8136 + }, + { + "epoch": 0.6157163936286936, + "grad_norm": 8.069103240966797, + "learning_rate": 6.339928038817168e-06, + "loss": 0.741, + "step": 8137 + }, + { + "epoch": 0.6157920623510272, + "grad_norm": 58.060359954833984, + "learning_rate": 6.337776527122865e-06, + "loss": 0.7947, + "step": 8138 + }, + { + "epoch": 0.6158677310733608, + "grad_norm": 
12.597784996032715, + "learning_rate": 6.335625182145611e-06, + "loss": 0.6853, + "step": 8139 + }, + { + "epoch": 0.6159433997956945, + "grad_norm": 2.0326411724090576, + "learning_rate": 6.333474004020123e-06, + "loss": 0.672, + "step": 8140 + }, + { + "epoch": 0.6160190685180281, + "grad_norm": 2.0402638912200928, + "learning_rate": 6.331322992881118e-06, + "loss": 0.7124, + "step": 8141 + }, + { + "epoch": 0.6160947372403617, + "grad_norm": 4.363745212554932, + "learning_rate": 6.329172148863294e-06, + "loss": 0.8492, + "step": 8142 + }, + { + "epoch": 0.6161704059626953, + "grad_norm": 4.418940544128418, + "learning_rate": 6.327021472101355e-06, + "loss": 0.6559, + "step": 8143 + }, + { + "epoch": 0.6162460746850289, + "grad_norm": 2.802222728729248, + "learning_rate": 6.3248709627299735e-06, + "loss": 0.7396, + "step": 8144 + }, + { + "epoch": 0.6163217434073626, + "grad_norm": 2.031181573867798, + "learning_rate": 6.322720620883827e-06, + "loss": 0.6788, + "step": 8145 + }, + { + "epoch": 0.6163974121296962, + "grad_norm": 2.4163577556610107, + "learning_rate": 6.320570446697574e-06, + "loss": 0.8298, + "step": 8146 + }, + { + "epoch": 0.6164730808520298, + "grad_norm": 2.520803928375244, + "learning_rate": 6.318420440305863e-06, + "loss": 0.8713, + "step": 8147 + }, + { + "epoch": 0.6165487495743635, + "grad_norm": 2.569690227508545, + "learning_rate": 6.316270601843342e-06, + "loss": 0.5737, + "step": 8148 + }, + { + "epoch": 0.616624418296697, + "grad_norm": 2.0397837162017822, + "learning_rate": 6.314120931444631e-06, + "loss": 0.6953, + "step": 8149 + }, + { + "epoch": 0.6167000870190307, + "grad_norm": 2.84653639793396, + "learning_rate": 6.31197142924436e-06, + "loss": 0.7573, + "step": 8150 + }, + { + "epoch": 0.6167757557413643, + "grad_norm": 2.532266616821289, + "learning_rate": 6.30982209537713e-06, + "loss": 0.5795, + "step": 8151 + }, + { + "epoch": 0.6168514244636979, + "grad_norm": 2.0461106300354004, + "learning_rate": 6.307672929977539e-06, + "loss": 0.7261, + "step": 8152 + }, + { + "epoch": 0.6169270931860316, + "grad_norm": 2.5015604496002197, + "learning_rate": 6.3055239331801795e-06, + "loss": 0.6285, + "step": 8153 + }, + { + "epoch": 0.6170027619083652, + "grad_norm": 3.2419116497039795, + "learning_rate": 6.303375105119626e-06, + "loss": 0.618, + "step": 8154 + }, + { + "epoch": 0.6170784306306988, + "grad_norm": 2.273522138595581, + "learning_rate": 6.301226445930447e-06, + "loss": 0.6384, + "step": 8155 + }, + { + "epoch": 0.6171540993530324, + "grad_norm": 2.353877544403076, + "learning_rate": 6.299077955747195e-06, + "loss": 0.6086, + "step": 8156 + }, + { + "epoch": 0.617229768075366, + "grad_norm": 2.2574803829193115, + "learning_rate": 6.296929634704415e-06, + "loss": 0.8628, + "step": 8157 + }, + { + "epoch": 0.6173054367976997, + "grad_norm": 2.2511677742004395, + "learning_rate": 6.294781482936646e-06, + "loss": 0.8214, + "step": 8158 + }, + { + "epoch": 0.6173811055200333, + "grad_norm": 2.1180849075317383, + "learning_rate": 6.292633500578412e-06, + "loss": 0.6905, + "step": 8159 + }, + { + "epoch": 0.6174567742423669, + "grad_norm": 2.302046775817871, + "learning_rate": 6.290485687764223e-06, + "loss": 0.807, + "step": 8160 + }, + { + "epoch": 0.6175324429647006, + "grad_norm": 2.0591933727264404, + "learning_rate": 6.2883380446285865e-06, + "loss": 0.7709, + "step": 8161 + }, + { + "epoch": 0.6176081116870341, + "grad_norm": 2.0985355377197266, + "learning_rate": 6.28619057130599e-06, + "loss": 0.8037, + "step": 8162 + }, + { + "epoch": 
0.6176837804093678, + "grad_norm": 1.847230315208435, + "learning_rate": 6.284043267930915e-06, + "loss": 0.6209, + "step": 8163 + }, + { + "epoch": 0.6177594491317014, + "grad_norm": 2.1725268363952637, + "learning_rate": 6.28189613463784e-06, + "loss": 0.7832, + "step": 8164 + }, + { + "epoch": 0.617835117854035, + "grad_norm": 1.3901453018188477, + "learning_rate": 6.279749171561218e-06, + "loss": 0.772, + "step": 8165 + }, + { + "epoch": 0.6179107865763687, + "grad_norm": 2.406770944595337, + "learning_rate": 6.277602378835502e-06, + "loss": 0.6068, + "step": 8166 + }, + { + "epoch": 0.6179864552987023, + "grad_norm": 2.2921957969665527, + "learning_rate": 6.275455756595129e-06, + "loss": 0.7577, + "step": 8167 + }, + { + "epoch": 0.6180621240210359, + "grad_norm": 2.531522274017334, + "learning_rate": 6.273309304974528e-06, + "loss": 0.5886, + "step": 8168 + }, + { + "epoch": 0.6181377927433696, + "grad_norm": 2.0290911197662354, + "learning_rate": 6.27116302410812e-06, + "loss": 0.7469, + "step": 8169 + }, + { + "epoch": 0.6182134614657031, + "grad_norm": 2.061554431915283, + "learning_rate": 6.269016914130309e-06, + "loss": 0.5977, + "step": 8170 + }, + { + "epoch": 0.6182891301880368, + "grad_norm": 4.355343818664551, + "learning_rate": 6.266870975175491e-06, + "loss": 0.6287, + "step": 8171 + }, + { + "epoch": 0.6183647989103704, + "grad_norm": 2.044095754623413, + "learning_rate": 6.264725207378055e-06, + "loss": 0.8558, + "step": 8172 + }, + { + "epoch": 0.618440467632704, + "grad_norm": 1.840848684310913, + "learning_rate": 6.262579610872368e-06, + "loss": 0.7437, + "step": 8173 + }, + { + "epoch": 0.6185161363550377, + "grad_norm": 5.799072742462158, + "learning_rate": 6.260434185792803e-06, + "loss": 0.7079, + "step": 8174 + }, + { + "epoch": 0.6185918050773712, + "grad_norm": 1.9204273223876953, + "learning_rate": 6.258288932273713e-06, + "loss": 0.6698, + "step": 8175 + }, + { + "epoch": 0.6186674737997049, + "grad_norm": 2.231785774230957, + "learning_rate": 6.2561438504494346e-06, + "loss": 0.7382, + "step": 8176 + }, + { + "epoch": 0.6187431425220385, + "grad_norm": 1.8387155532836914, + "learning_rate": 6.253998940454305e-06, + "loss": 0.5977, + "step": 8177 + }, + { + "epoch": 0.6188188112443721, + "grad_norm": 2.453481435775757, + "learning_rate": 6.25185420242264e-06, + "loss": 0.7128, + "step": 8178 + }, + { + "epoch": 0.6188944799667058, + "grad_norm": 2.066225528717041, + "learning_rate": 6.249709636488755e-06, + "loss": 0.5713, + "step": 8179 + }, + { + "epoch": 0.6189701486890394, + "grad_norm": 1.8709297180175781, + "learning_rate": 6.2475652427869495e-06, + "loss": 0.7591, + "step": 8180 + }, + { + "epoch": 0.619045817411373, + "grad_norm": 2.087465763092041, + "learning_rate": 6.2454210214515095e-06, + "loss": 0.7528, + "step": 8181 + }, + { + "epoch": 0.6191214861337067, + "grad_norm": 2.2975540161132812, + "learning_rate": 6.243276972616716e-06, + "loss": 0.7861, + "step": 8182 + }, + { + "epoch": 0.6191971548560402, + "grad_norm": 1.8741811513900757, + "learning_rate": 6.241133096416832e-06, + "loss": 0.629, + "step": 8183 + }, + { + "epoch": 0.6192728235783739, + "grad_norm": 1.919198751449585, + "learning_rate": 6.238989392986118e-06, + "loss": 0.7016, + "step": 8184 + }, + { + "epoch": 0.6193484923007075, + "grad_norm": 2.3332748413085938, + "learning_rate": 6.236845862458818e-06, + "loss": 0.59, + "step": 8185 + }, + { + "epoch": 0.6194241610230411, + "grad_norm": 2.41435170173645, + "learning_rate": 6.2347025049691696e-06, + "loss": 0.7076, + "step": 
8186 + }, + { + "epoch": 0.6194998297453748, + "grad_norm": 2.0539135932922363, + "learning_rate": 6.232559320651392e-06, + "loss": 0.6218, + "step": 8187 + }, + { + "epoch": 0.6195754984677083, + "grad_norm": 2.1097443103790283, + "learning_rate": 6.2304163096397e-06, + "loss": 0.7403, + "step": 8188 + }, + { + "epoch": 0.619651167190042, + "grad_norm": 2.184004306793213, + "learning_rate": 6.2282734720683e-06, + "loss": 0.5566, + "step": 8189 + }, + { + "epoch": 0.6197268359123757, + "grad_norm": 2.4492106437683105, + "learning_rate": 6.226130808071377e-06, + "loss": 0.6588, + "step": 8190 + }, + { + "epoch": 0.6198025046347092, + "grad_norm": 2.1081714630126953, + "learning_rate": 6.2239883177831174e-06, + "loss": 0.6285, + "step": 8191 + }, + { + "epoch": 0.6198781733570429, + "grad_norm": 2.0451788902282715, + "learning_rate": 6.221846001337686e-06, + "loss": 0.7617, + "step": 8192 + }, + { + "epoch": 0.6199538420793765, + "grad_norm": 2.1223862171173096, + "learning_rate": 6.219703858869242e-06, + "loss": 0.734, + "step": 8193 + }, + { + "epoch": 0.6200295108017101, + "grad_norm": 1.9312350749969482, + "learning_rate": 6.217561890511939e-06, + "loss": 0.5471, + "step": 8194 + }, + { + "epoch": 0.6201051795240438, + "grad_norm": 2.3495125770568848, + "learning_rate": 6.215420096399907e-06, + "loss": 0.7066, + "step": 8195 + }, + { + "epoch": 0.6201808482463773, + "grad_norm": 2.163055181503296, + "learning_rate": 6.213278476667278e-06, + "loss": 0.7731, + "step": 8196 + }, + { + "epoch": 0.620256516968711, + "grad_norm": 2.2514150142669678, + "learning_rate": 6.211137031448162e-06, + "loss": 0.6057, + "step": 8197 + }, + { + "epoch": 0.6203321856910446, + "grad_norm": 1.9421865940093994, + "learning_rate": 6.2089957608766664e-06, + "loss": 0.6273, + "step": 8198 + }, + { + "epoch": 0.6204078544133782, + "grad_norm": 1.938656210899353, + "learning_rate": 6.2068546650868785e-06, + "loss": 0.8241, + "step": 8199 + }, + { + "epoch": 0.6204835231357119, + "grad_norm": 2.263339042663574, + "learning_rate": 6.204713744212891e-06, + "loss": 0.7253, + "step": 8200 + }, + { + "epoch": 0.6205591918580454, + "grad_norm": 1.7081513404846191, + "learning_rate": 6.202572998388768e-06, + "loss": 0.5888, + "step": 8201 + }, + { + "epoch": 0.6206348605803791, + "grad_norm": 2.124990940093994, + "learning_rate": 6.200432427748574e-06, + "loss": 0.7011, + "step": 8202 + }, + { + "epoch": 0.6207105293027128, + "grad_norm": 2.4887731075286865, + "learning_rate": 6.198292032426354e-06, + "loss": 0.7463, + "step": 8203 + }, + { + "epoch": 0.6207861980250463, + "grad_norm": 2.28210186958313, + "learning_rate": 6.1961518125561485e-06, + "loss": 0.7691, + "step": 8204 + }, + { + "epoch": 0.62086186674738, + "grad_norm": 1.8958467245101929, + "learning_rate": 6.194011768271986e-06, + "loss": 0.7223, + "step": 8205 + }, + { + "epoch": 0.6209375354697136, + "grad_norm": 1.9587349891662598, + "learning_rate": 6.191871899707883e-06, + "loss": 0.769, + "step": 8206 + }, + { + "epoch": 0.6210132041920472, + "grad_norm": 2.0116994380950928, + "learning_rate": 6.189732206997845e-06, + "loss": 0.5752, + "step": 8207 + }, + { + "epoch": 0.6210888729143809, + "grad_norm": 2.030748128890991, + "learning_rate": 6.187592690275864e-06, + "loss": 0.6877, + "step": 8208 + }, + { + "epoch": 0.6211645416367144, + "grad_norm": 1.9845973253250122, + "learning_rate": 6.185453349675923e-06, + "loss": 0.6563, + "step": 8209 + }, + { + "epoch": 0.6212402103590481, + "grad_norm": 2.015986680984497, + "learning_rate": 
6.1833141853319995e-06, + "loss": 0.6908, + "step": 8210 + }, + { + "epoch": 0.6213158790813818, + "grad_norm": 2.061414957046509, + "learning_rate": 6.181175197378053e-06, + "loss": 0.7801, + "step": 8211 + }, + { + "epoch": 0.6213915478037153, + "grad_norm": 1.997130274772644, + "learning_rate": 6.179036385948032e-06, + "loss": 0.6684, + "step": 8212 + }, + { + "epoch": 0.621467216526049, + "grad_norm": 2.0803966522216797, + "learning_rate": 6.1768977511758755e-06, + "loss": 0.6245, + "step": 8213 + }, + { + "epoch": 0.6215428852483825, + "grad_norm": 2.1265552043914795, + "learning_rate": 6.174759293195511e-06, + "loss": 0.6819, + "step": 8214 + }, + { + "epoch": 0.6216185539707162, + "grad_norm": 1.9517875909805298, + "learning_rate": 6.1726210121408594e-06, + "loss": 0.7735, + "step": 8215 + }, + { + "epoch": 0.6216942226930499, + "grad_norm": 2.6221845149993896, + "learning_rate": 6.170482908145827e-06, + "loss": 0.7706, + "step": 8216 + }, + { + "epoch": 0.6217698914153834, + "grad_norm": 2.260093927383423, + "learning_rate": 6.168344981344304e-06, + "loss": 0.6144, + "step": 8217 + }, + { + "epoch": 0.6218455601377171, + "grad_norm": 2.432312488555908, + "learning_rate": 6.166207231870179e-06, + "loss": 0.8123, + "step": 8218 + }, + { + "epoch": 0.6219212288600507, + "grad_norm": 1.845873236656189, + "learning_rate": 6.16406965985732e-06, + "loss": 0.7548, + "step": 8219 + }, + { + "epoch": 0.6219968975823843, + "grad_norm": 2.7891929149627686, + "learning_rate": 6.161932265439592e-06, + "loss": 0.6276, + "step": 8220 + }, + { + "epoch": 0.622072566304718, + "grad_norm": 1.9426125288009644, + "learning_rate": 6.159795048750848e-06, + "loss": 0.778, + "step": 8221 + }, + { + "epoch": 0.6221482350270515, + "grad_norm": 2.255960702896118, + "learning_rate": 6.157658009924922e-06, + "loss": 0.7768, + "step": 8222 + }, + { + "epoch": 0.6222239037493852, + "grad_norm": 2.089102029800415, + "learning_rate": 6.155521149095647e-06, + "loss": 0.7533, + "step": 8223 + }, + { + "epoch": 0.6222995724717189, + "grad_norm": 2.0780301094055176, + "learning_rate": 6.153384466396833e-06, + "loss": 0.8131, + "step": 8224 + }, + { + "epoch": 0.6223752411940524, + "grad_norm": 2.264507532119751, + "learning_rate": 6.151247961962294e-06, + "loss": 0.5875, + "step": 8225 + }, + { + "epoch": 0.6224509099163861, + "grad_norm": 1.9472380876541138, + "learning_rate": 6.1491116359258215e-06, + "loss": 0.6929, + "step": 8226 + }, + { + "epoch": 0.6225265786387196, + "grad_norm": 1.84212327003479, + "learning_rate": 6.146975488421199e-06, + "loss": 0.5678, + "step": 8227 + }, + { + "epoch": 0.6226022473610533, + "grad_norm": 1.9093300104141235, + "learning_rate": 6.144839519582201e-06, + "loss": 0.739, + "step": 8228 + }, + { + "epoch": 0.622677916083387, + "grad_norm": 1.707137942314148, + "learning_rate": 6.142703729542581e-06, + "loss": 0.6922, + "step": 8229 + }, + { + "epoch": 0.6227535848057205, + "grad_norm": 1.5876680612564087, + "learning_rate": 6.1405681184361e-06, + "loss": 0.7791, + "step": 8230 + }, + { + "epoch": 0.6228292535280542, + "grad_norm": 2.1227242946624756, + "learning_rate": 6.138432686396492e-06, + "loss": 0.8995, + "step": 8231 + }, + { + "epoch": 0.6229049222503878, + "grad_norm": 1.9821844100952148, + "learning_rate": 6.1362974335574835e-06, + "loss": 0.8453, + "step": 8232 + }, + { + "epoch": 0.6229805909727214, + "grad_norm": 2.247864007949829, + "learning_rate": 6.134162360052793e-06, + "loss": 0.7166, + "step": 8233 + }, + { + "epoch": 0.6230562596950551, + "grad_norm": 
2.540407657623291, + "learning_rate": 6.132027466016122e-06, + "loss": 0.7784, + "step": 8234 + }, + { + "epoch": 0.6231319284173886, + "grad_norm": 2.323075294494629, + "learning_rate": 6.129892751581171e-06, + "loss": 0.6891, + "step": 8235 + }, + { + "epoch": 0.6232075971397223, + "grad_norm": 1.875849723815918, + "learning_rate": 6.1277582168816165e-06, + "loss": 0.6888, + "step": 8236 + }, + { + "epoch": 0.623283265862056, + "grad_norm": 2.02666974067688, + "learning_rate": 6.125623862051135e-06, + "loss": 0.7028, + "step": 8237 + }, + { + "epoch": 0.6233589345843895, + "grad_norm": 1.8737157583236694, + "learning_rate": 6.1234896872233815e-06, + "loss": 0.9008, + "step": 8238 + }, + { + "epoch": 0.6234346033067232, + "grad_norm": 1.9101803302764893, + "learning_rate": 6.1213556925320105e-06, + "loss": 0.6532, + "step": 8239 + }, + { + "epoch": 0.6235102720290568, + "grad_norm": 2.0796594619750977, + "learning_rate": 6.119221878110652e-06, + "loss": 0.5757, + "step": 8240 + }, + { + "epoch": 0.6235859407513904, + "grad_norm": 2.185795307159424, + "learning_rate": 6.1170882440929385e-06, + "loss": 0.6812, + "step": 8241 + }, + { + "epoch": 0.6236616094737241, + "grad_norm": 2.2780838012695312, + "learning_rate": 6.114954790612487e-06, + "loss": 0.7021, + "step": 8242 + }, + { + "epoch": 0.6237372781960576, + "grad_norm": 2.1631765365600586, + "learning_rate": 6.112821517802896e-06, + "loss": 0.6584, + "step": 8243 + }, + { + "epoch": 0.6238129469183913, + "grad_norm": 2.143260955810547, + "learning_rate": 6.11068842579776e-06, + "loss": 0.7908, + "step": 8244 + }, + { + "epoch": 0.623888615640725, + "grad_norm": 2.4261891841888428, + "learning_rate": 6.108555514730655e-06, + "loss": 0.704, + "step": 8245 + }, + { + "epoch": 0.6239642843630585, + "grad_norm": 2.189960241317749, + "learning_rate": 6.106422784735162e-06, + "loss": 0.7673, + "step": 8246 + }, + { + "epoch": 0.6240399530853922, + "grad_norm": 2.8708183765411377, + "learning_rate": 6.104290235944831e-06, + "loss": 0.6039, + "step": 8247 + }, + { + "epoch": 0.6241156218077257, + "grad_norm": 1.9821953773498535, + "learning_rate": 6.1021578684932136e-06, + "loss": 0.6746, + "step": 8248 + }, + { + "epoch": 0.6241912905300594, + "grad_norm": 2.20764422416687, + "learning_rate": 6.1000256825138405e-06, + "loss": 0.6322, + "step": 8249 + }, + { + "epoch": 0.6242669592523931, + "grad_norm": 2.8695743083953857, + "learning_rate": 6.097893678140237e-06, + "loss": 0.7263, + "step": 8250 + }, + { + "epoch": 0.6243426279747266, + "grad_norm": 2.6145105361938477, + "learning_rate": 6.095761855505921e-06, + "loss": 0.7057, + "step": 8251 + }, + { + "epoch": 0.6244182966970603, + "grad_norm": 2.075453758239746, + "learning_rate": 6.093630214744391e-06, + "loss": 0.8061, + "step": 8252 + }, + { + "epoch": 0.624493965419394, + "grad_norm": 2.094466209411621, + "learning_rate": 6.091498755989139e-06, + "loss": 0.613, + "step": 8253 + }, + { + "epoch": 0.6245696341417275, + "grad_norm": 1.5534762144088745, + "learning_rate": 6.089367479373639e-06, + "loss": 0.7078, + "step": 8254 + }, + { + "epoch": 0.6246453028640612, + "grad_norm": 2.0539896488189697, + "learning_rate": 6.087236385031361e-06, + "loss": 0.7436, + "step": 8255 + }, + { + "epoch": 0.6247209715863947, + "grad_norm": 1.873143196105957, + "learning_rate": 6.085105473095764e-06, + "loss": 0.7195, + "step": 8256 + }, + { + "epoch": 0.6247966403087284, + "grad_norm": 2.717400312423706, + "learning_rate": 6.082974743700289e-06, + "loss": 0.726, + "step": 8257 + }, + { + "epoch": 
0.6248723090310621, + "grad_norm": 2.0352890491485596, + "learning_rate": 6.0808441969783714e-06, + "loss": 0.727, + "step": 8258 + }, + { + "epoch": 0.6249479777533956, + "grad_norm": 2.217931032180786, + "learning_rate": 6.078713833063431e-06, + "loss": 0.7579, + "step": 8259 + }, + { + "epoch": 0.6250236464757293, + "grad_norm": 1.9803812503814697, + "learning_rate": 6.0765836520888774e-06, + "loss": 0.7234, + "step": 8260 + }, + { + "epoch": 0.6250993151980628, + "grad_norm": 2.017169237136841, + "learning_rate": 6.074453654188113e-06, + "loss": 0.7283, + "step": 8261 + }, + { + "epoch": 0.6251749839203965, + "grad_norm": 2.0302610397338867, + "learning_rate": 6.072323839494523e-06, + "loss": 0.7154, + "step": 8262 + }, + { + "epoch": 0.6252506526427302, + "grad_norm": 1.8315473794937134, + "learning_rate": 6.070194208141484e-06, + "loss": 0.642, + "step": 8263 + }, + { + "epoch": 0.6253263213650637, + "grad_norm": 2.1627883911132812, + "learning_rate": 6.0680647602623605e-06, + "loss": 0.7464, + "step": 8264 + }, + { + "epoch": 0.6254019900873974, + "grad_norm": 1.989489197731018, + "learning_rate": 6.065935495990501e-06, + "loss": 0.5703, + "step": 8265 + }, + { + "epoch": 0.625477658809731, + "grad_norm": 1.6930984258651733, + "learning_rate": 6.063806415459253e-06, + "loss": 0.5945, + "step": 8266 + }, + { + "epoch": 0.6255533275320646, + "grad_norm": 1.9315162897109985, + "learning_rate": 6.0616775188019444e-06, + "loss": 0.6163, + "step": 8267 + }, + { + "epoch": 0.6256289962543983, + "grad_norm": 1.9883739948272705, + "learning_rate": 6.059548806151893e-06, + "loss": 0.6101, + "step": 8268 + }, + { + "epoch": 0.6257046649767318, + "grad_norm": 2.2060041427612305, + "learning_rate": 6.057420277642407e-06, + "loss": 0.6781, + "step": 8269 + }, + { + "epoch": 0.6257803336990655, + "grad_norm": 2.501366138458252, + "learning_rate": 6.055291933406778e-06, + "loss": 0.6316, + "step": 8270 + }, + { + "epoch": 0.6258560024213992, + "grad_norm": 1.9674981832504272, + "learning_rate": 6.053163773578293e-06, + "loss": 0.8069, + "step": 8271 + }, + { + "epoch": 0.6259316711437327, + "grad_norm": 2.0307114124298096, + "learning_rate": 6.051035798290226e-06, + "loss": 0.5404, + "step": 8272 + }, + { + "epoch": 0.6260073398660664, + "grad_norm": 1.978594183921814, + "learning_rate": 6.048908007675834e-06, + "loss": 0.6138, + "step": 8273 + }, + { + "epoch": 0.6260830085883999, + "grad_norm": 3.162856340408325, + "learning_rate": 6.046780401868367e-06, + "loss": 0.705, + "step": 8274 + }, + { + "epoch": 0.6261586773107336, + "grad_norm": 2.049156427383423, + "learning_rate": 6.044652981001066e-06, + "loss": 0.7661, + "step": 8275 + }, + { + "epoch": 0.6262343460330673, + "grad_norm": 2.057219982147217, + "learning_rate": 6.042525745207149e-06, + "loss": 0.6902, + "step": 8276 + }, + { + "epoch": 0.6263100147554008, + "grad_norm": 2.105268716812134, + "learning_rate": 6.040398694619838e-06, + "loss": 0.6733, + "step": 8277 + }, + { + "epoch": 0.6263856834777345, + "grad_norm": 2.046825647354126, + "learning_rate": 6.038271829372335e-06, + "loss": 0.6682, + "step": 8278 + }, + { + "epoch": 0.6264613522000682, + "grad_norm": 2.0353481769561768, + "learning_rate": 6.036145149597828e-06, + "loss": 0.7527, + "step": 8279 + }, + { + "epoch": 0.6265370209224017, + "grad_norm": 1.9645310640335083, + "learning_rate": 6.034018655429499e-06, + "loss": 0.6571, + "step": 8280 + }, + { + "epoch": 0.6266126896447354, + "grad_norm": 1.8533154726028442, + "learning_rate": 6.031892347000512e-06, + "loss": 0.6641, 
+ "step": 8281 + }, + { + "epoch": 0.6266883583670689, + "grad_norm": 2.540891170501709, + "learning_rate": 6.029766224444028e-06, + "loss": 0.7443, + "step": 8282 + }, + { + "epoch": 0.6267640270894026, + "grad_norm": 2.1608104705810547, + "learning_rate": 6.027640287893191e-06, + "loss": 0.6818, + "step": 8283 + }, + { + "epoch": 0.6268396958117363, + "grad_norm": 2.1452369689941406, + "learning_rate": 6.0255145374811315e-06, + "loss": 0.7741, + "step": 8284 + }, + { + "epoch": 0.6269153645340698, + "grad_norm": 1.848710060119629, + "learning_rate": 6.023388973340974e-06, + "loss": 0.6587, + "step": 8285 + }, + { + "epoch": 0.6269910332564035, + "grad_norm": 2.499025821685791, + "learning_rate": 6.021263595605825e-06, + "loss": 0.7964, + "step": 8286 + }, + { + "epoch": 0.627066701978737, + "grad_norm": 2.207012414932251, + "learning_rate": 6.019138404408783e-06, + "loss": 0.6924, + "step": 8287 + }, + { + "epoch": 0.6271423707010707, + "grad_norm": 2.040011405944824, + "learning_rate": 6.017013399882936e-06, + "loss": 0.6481, + "step": 8288 + }, + { + "epoch": 0.6272180394234044, + "grad_norm": 1.5633232593536377, + "learning_rate": 6.014888582161361e-06, + "loss": 0.6835, + "step": 8289 + }, + { + "epoch": 0.6272937081457379, + "grad_norm": 2.0602505207061768, + "learning_rate": 6.012763951377116e-06, + "loss": 0.8177, + "step": 8290 + }, + { + "epoch": 0.6273693768680716, + "grad_norm": 1.8293778896331787, + "learning_rate": 6.010639507663251e-06, + "loss": 0.6229, + "step": 8291 + }, + { + "epoch": 0.6274450455904053, + "grad_norm": 2.240879535675049, + "learning_rate": 6.008515251152815e-06, + "loss": 0.6448, + "step": 8292 + }, + { + "epoch": 0.6275207143127388, + "grad_norm": 1.7623291015625, + "learning_rate": 6.006391181978825e-06, + "loss": 0.6318, + "step": 8293 + }, + { + "epoch": 0.6275963830350725, + "grad_norm": 2.637432098388672, + "learning_rate": 6.004267300274305e-06, + "loss": 0.632, + "step": 8294 + }, + { + "epoch": 0.627672051757406, + "grad_norm": 2.096395492553711, + "learning_rate": 6.002143606172254e-06, + "loss": 0.798, + "step": 8295 + }, + { + "epoch": 0.6277477204797397, + "grad_norm": 2.2483010292053223, + "learning_rate": 6.000020099805665e-06, + "loss": 0.6926, + "step": 8296 + }, + { + "epoch": 0.6278233892020734, + "grad_norm": 2.0288619995117188, + "learning_rate": 5.997896781307524e-06, + "loss": 0.7208, + "step": 8297 + }, + { + "epoch": 0.6278990579244069, + "grad_norm": 1.9230502843856812, + "learning_rate": 5.995773650810794e-06, + "loss": 0.6955, + "step": 8298 + }, + { + "epoch": 0.6279747266467406, + "grad_norm": 1.8316569328308105, + "learning_rate": 5.993650708448437e-06, + "loss": 0.5128, + "step": 8299 + }, + { + "epoch": 0.6280503953690741, + "grad_norm": 2.2068562507629395, + "learning_rate": 5.991527954353395e-06, + "loss": 0.765, + "step": 8300 + }, + { + "epoch": 0.6281260640914078, + "grad_norm": 2.418468952178955, + "learning_rate": 5.9894053886586006e-06, + "loss": 0.7241, + "step": 8301 + }, + { + "epoch": 0.6282017328137415, + "grad_norm": 1.7980351448059082, + "learning_rate": 5.987283011496981e-06, + "loss": 0.6045, + "step": 8302 + }, + { + "epoch": 0.628277401536075, + "grad_norm": 2.48748779296875, + "learning_rate": 5.985160823001445e-06, + "loss": 0.7215, + "step": 8303 + }, + { + "epoch": 0.6283530702584087, + "grad_norm": 1.8230119943618774, + "learning_rate": 5.983038823304886e-06, + "loss": 0.705, + "step": 8304 + }, + { + "epoch": 0.6284287389807424, + "grad_norm": 2.0660839080810547, + "learning_rate": 
5.980917012540198e-06, + "loss": 0.7044, + "step": 8305 + }, + { + "epoch": 0.6285044077030759, + "grad_norm": 1.6837860345840454, + "learning_rate": 5.978795390840247e-06, + "loss": 0.6187, + "step": 8306 + }, + { + "epoch": 0.6285800764254096, + "grad_norm": 1.9054850339889526, + "learning_rate": 5.976673958337902e-06, + "loss": 0.6454, + "step": 8307 + }, + { + "epoch": 0.6286557451477431, + "grad_norm": 2.1775870323181152, + "learning_rate": 5.974552715166014e-06, + "loss": 0.6592, + "step": 8308 + }, + { + "epoch": 0.6287314138700768, + "grad_norm": 1.7539699077606201, + "learning_rate": 5.97243166145742e-06, + "loss": 0.6329, + "step": 8309 + }, + { + "epoch": 0.6288070825924105, + "grad_norm": 7.603641033172607, + "learning_rate": 5.970310797344949e-06, + "loss": 0.6322, + "step": 8310 + }, + { + "epoch": 0.628882751314744, + "grad_norm": 2.0297069549560547, + "learning_rate": 5.968190122961411e-06, + "loss": 0.7299, + "step": 8311 + }, + { + "epoch": 0.6289584200370777, + "grad_norm": 2.0094974040985107, + "learning_rate": 5.966069638439615e-06, + "loss": 0.5245, + "step": 8312 + }, + { + "epoch": 0.6290340887594112, + "grad_norm": 1.9231394529342651, + "learning_rate": 5.963949343912353e-06, + "loss": 0.6593, + "step": 8313 + }, + { + "epoch": 0.6291097574817449, + "grad_norm": 25.850337982177734, + "learning_rate": 5.961829239512402e-06, + "loss": 0.5678, + "step": 8314 + }, + { + "epoch": 0.6291854262040786, + "grad_norm": 3.02929425239563, + "learning_rate": 5.959709325372531e-06, + "loss": 0.5795, + "step": 8315 + }, + { + "epoch": 0.6292610949264121, + "grad_norm": 1.7381452322006226, + "learning_rate": 5.957589601625495e-06, + "loss": 0.6775, + "step": 8316 + }, + { + "epoch": 0.6293367636487458, + "grad_norm": 2.252568483352661, + "learning_rate": 5.955470068404037e-06, + "loss": 0.6707, + "step": 8317 + }, + { + "epoch": 0.6294124323710795, + "grad_norm": 1.7029986381530762, + "learning_rate": 5.953350725840891e-06, + "loss": 0.6799, + "step": 8318 + }, + { + "epoch": 0.629488101093413, + "grad_norm": 1.7706902027130127, + "learning_rate": 5.9512315740687785e-06, + "loss": 0.6089, + "step": 8319 + }, + { + "epoch": 0.6295637698157467, + "grad_norm": 2.4172465801239014, + "learning_rate": 5.949112613220405e-06, + "loss": 0.8089, + "step": 8320 + }, + { + "epoch": 0.6296394385380802, + "grad_norm": 2.203758955001831, + "learning_rate": 5.946993843428469e-06, + "loss": 0.6302, + "step": 8321 + }, + { + "epoch": 0.6297151072604139, + "grad_norm": 2.0722339153289795, + "learning_rate": 5.944875264825648e-06, + "loss": 0.613, + "step": 8322 + }, + { + "epoch": 0.6297907759827476, + "grad_norm": 2.695570707321167, + "learning_rate": 5.942756877544623e-06, + "loss": 0.7249, + "step": 8323 + }, + { + "epoch": 0.6298664447050811, + "grad_norm": 2.075514554977417, + "learning_rate": 5.940638681718052e-06, + "loss": 0.5446, + "step": 8324 + }, + { + "epoch": 0.6299421134274148, + "grad_norm": 1.8209716081619263, + "learning_rate": 5.938520677478581e-06, + "loss": 0.719, + "step": 8325 + }, + { + "epoch": 0.6300177821497484, + "grad_norm": 2.407820224761963, + "learning_rate": 5.936402864958848e-06, + "loss": 0.7208, + "step": 8326 + }, + { + "epoch": 0.630093450872082, + "grad_norm": 2.0046629905700684, + "learning_rate": 5.934285244291473e-06, + "loss": 0.7994, + "step": 8327 + }, + { + "epoch": 0.6301691195944157, + "grad_norm": 2.1427841186523438, + "learning_rate": 5.932167815609073e-06, + "loss": 0.7415, + "step": 8328 + }, + { + "epoch": 0.6302447883167492, + "grad_norm": 
2.2254834175109863, + "learning_rate": 5.930050579044249e-06, + "loss": 0.6476, + "step": 8329 + }, + { + "epoch": 0.6303204570390829, + "grad_norm": 2.195834159851074, + "learning_rate": 5.927933534729585e-06, + "loss": 0.5771, + "step": 8330 + }, + { + "epoch": 0.6303961257614166, + "grad_norm": 1.9512183666229248, + "learning_rate": 5.925816682797663e-06, + "loss": 0.7415, + "step": 8331 + }, + { + "epoch": 0.6304717944837501, + "grad_norm": 2.1546523571014404, + "learning_rate": 5.9237000233810356e-06, + "loss": 0.6585, + "step": 8332 + }, + { + "epoch": 0.6305474632060838, + "grad_norm": 1.8281909227371216, + "learning_rate": 5.9215835566122696e-06, + "loss": 0.7334, + "step": 8333 + }, + { + "epoch": 0.6306231319284173, + "grad_norm": 2.1117517948150635, + "learning_rate": 5.919467282623896e-06, + "loss": 0.5892, + "step": 8334 + }, + { + "epoch": 0.630698800650751, + "grad_norm": 2.3042280673980713, + "learning_rate": 5.917351201548447e-06, + "loss": 0.6998, + "step": 8335 + }, + { + "epoch": 0.6307744693730847, + "grad_norm": 2.325014114379883, + "learning_rate": 5.9152353135184335e-06, + "loss": 0.6966, + "step": 8336 + }, + { + "epoch": 0.6308501380954182, + "grad_norm": 1.9471384286880493, + "learning_rate": 5.913119618666361e-06, + "loss": 0.6993, + "step": 8337 + }, + { + "epoch": 0.6309258068177519, + "grad_norm": 2.687548875808716, + "learning_rate": 5.911004117124724e-06, + "loss": 0.7726, + "step": 8338 + }, + { + "epoch": 0.6310014755400855, + "grad_norm": 1.9551631212234497, + "learning_rate": 5.908888809026001e-06, + "loss": 0.7378, + "step": 8339 + }, + { + "epoch": 0.6310771442624191, + "grad_norm": 2.2991223335266113, + "learning_rate": 5.9067736945026594e-06, + "loss": 0.7404, + "step": 8340 + }, + { + "epoch": 0.6311528129847528, + "grad_norm": 2.1046478748321533, + "learning_rate": 5.904658773687153e-06, + "loss": 0.7065, + "step": 8341 + }, + { + "epoch": 0.6312284817070863, + "grad_norm": 2.240189790725708, + "learning_rate": 5.902544046711922e-06, + "loss": 0.6193, + "step": 8342 + }, + { + "epoch": 0.63130415042942, + "grad_norm": 1.9708818197250366, + "learning_rate": 5.9004295137094054e-06, + "loss": 0.6508, + "step": 8343 + }, + { + "epoch": 0.6313798191517537, + "grad_norm": 2.299236297607422, + "learning_rate": 5.898315174812016e-06, + "loss": 0.842, + "step": 8344 + }, + { + "epoch": 0.6314554878740872, + "grad_norm": 2.2343852519989014, + "learning_rate": 5.896201030152164e-06, + "loss": 0.7665, + "step": 8345 + }, + { + "epoch": 0.6315311565964209, + "grad_norm": 2.4583773612976074, + "learning_rate": 5.894087079862241e-06, + "loss": 0.6585, + "step": 8346 + }, + { + "epoch": 0.6316068253187545, + "grad_norm": 2.2705533504486084, + "learning_rate": 5.89197332407463e-06, + "loss": 0.5592, + "step": 8347 + }, + { + "epoch": 0.6316824940410881, + "grad_norm": 1.823162317276001, + "learning_rate": 5.889859762921702e-06, + "loss": 0.5589, + "step": 8348 + }, + { + "epoch": 0.6317581627634218, + "grad_norm": 2.318509340286255, + "learning_rate": 5.8877463965358175e-06, + "loss": 0.6892, + "step": 8349 + }, + { + "epoch": 0.6318338314857553, + "grad_norm": 2.149641275405884, + "learning_rate": 5.885633225049318e-06, + "loss": 0.789, + "step": 8350 + }, + { + "epoch": 0.631909500208089, + "grad_norm": 2.2734084129333496, + "learning_rate": 5.883520248594542e-06, + "loss": 0.7964, + "step": 8351 + }, + { + "epoch": 0.6319851689304226, + "grad_norm": 1.703250527381897, + "learning_rate": 5.881407467303804e-06, + "loss": 0.6932, + "step": 8352 + }, + { + "epoch": 
0.6320608376527562, + "grad_norm": 1.407254934310913, + "learning_rate": 5.879294881309418e-06, + "loss": 0.9285, + "step": 8353 + }, + { + "epoch": 0.6321365063750899, + "grad_norm": 1.6375536918640137, + "learning_rate": 5.877182490743683e-06, + "loss": 0.5346, + "step": 8354 + }, + { + "epoch": 0.6322121750974234, + "grad_norm": 2.1857407093048096, + "learning_rate": 5.875070295738878e-06, + "loss": 0.675, + "step": 8355 + }, + { + "epoch": 0.6322878438197571, + "grad_norm": 2.0227956771850586, + "learning_rate": 5.872958296427281e-06, + "loss": 0.6196, + "step": 8356 + }, + { + "epoch": 0.6323635125420908, + "grad_norm": 1.9548943042755127, + "learning_rate": 5.870846492941147e-06, + "loss": 0.8473, + "step": 8357 + }, + { + "epoch": 0.6324391812644243, + "grad_norm": 1.8974775075912476, + "learning_rate": 5.868734885412725e-06, + "loss": 0.7865, + "step": 8358 + }, + { + "epoch": 0.632514849986758, + "grad_norm": 1.963448405265808, + "learning_rate": 5.866623473974256e-06, + "loss": 0.6768, + "step": 8359 + }, + { + "epoch": 0.6325905187090916, + "grad_norm": 2.001469135284424, + "learning_rate": 5.864512258757957e-06, + "loss": 0.7864, + "step": 8360 + }, + { + "epoch": 0.6326661874314252, + "grad_norm": 1.9863218069076538, + "learning_rate": 5.862401239896045e-06, + "loss": 0.6782, + "step": 8361 + }, + { + "epoch": 0.6327418561537589, + "grad_norm": 2.1429598331451416, + "learning_rate": 5.8602904175207126e-06, + "loss": 0.8312, + "step": 8362 + }, + { + "epoch": 0.6328175248760924, + "grad_norm": 1.959979772567749, + "learning_rate": 5.858179791764148e-06, + "loss": 0.7098, + "step": 8363 + }, + { + "epoch": 0.6328931935984261, + "grad_norm": 2.1421241760253906, + "learning_rate": 5.856069362758528e-06, + "loss": 0.7651, + "step": 8364 + }, + { + "epoch": 0.6329688623207597, + "grad_norm": 2.1992077827453613, + "learning_rate": 5.853959130636017e-06, + "loss": 0.6911, + "step": 8365 + }, + { + "epoch": 0.6330445310430933, + "grad_norm": 1.951259970664978, + "learning_rate": 5.8518490955287564e-06, + "loss": 0.7088, + "step": 8366 + }, + { + "epoch": 0.633120199765427, + "grad_norm": 2.2392029762268066, + "learning_rate": 5.849739257568891e-06, + "loss": 0.6065, + "step": 8367 + }, + { + "epoch": 0.6331958684877605, + "grad_norm": 1.755878210067749, + "learning_rate": 5.847629616888538e-06, + "loss": 0.5707, + "step": 8368 + }, + { + "epoch": 0.6332715372100942, + "grad_norm": 1.9123753309249878, + "learning_rate": 5.845520173619817e-06, + "loss": 0.6594, + "step": 8369 + }, + { + "epoch": 0.6333472059324279, + "grad_norm": 2.4851553440093994, + "learning_rate": 5.843410927894827e-06, + "loss": 0.7454, + "step": 8370 + }, + { + "epoch": 0.6334228746547614, + "grad_norm": 2.0496127605438232, + "learning_rate": 5.841301879845653e-06, + "loss": 0.6757, + "step": 8371 + }, + { + "epoch": 0.6334985433770951, + "grad_norm": 2.043626070022583, + "learning_rate": 5.839193029604373e-06, + "loss": 0.7011, + "step": 8372 + }, + { + "epoch": 0.6335742120994287, + "grad_norm": 2.5057694911956787, + "learning_rate": 5.837084377303045e-06, + "loss": 0.6438, + "step": 8373 + }, + { + "epoch": 0.6336498808217623, + "grad_norm": 1.9614980220794678, + "learning_rate": 5.834975923073727e-06, + "loss": 0.4584, + "step": 8374 + }, + { + "epoch": 0.633725549544096, + "grad_norm": 1.787739872932434, + "learning_rate": 5.832867667048453e-06, + "loss": 0.6868, + "step": 8375 + }, + { + "epoch": 0.6338012182664295, + "grad_norm": 2.6709229946136475, + "learning_rate": 5.830759609359248e-06, + "loss": 0.7381, 
+ "step": 8376 + }, + { + "epoch": 0.6338768869887632, + "grad_norm": 2.4086039066314697, + "learning_rate": 5.828651750138128e-06, + "loss": 0.6239, + "step": 8377 + }, + { + "epoch": 0.6339525557110968, + "grad_norm": 2.210710287094116, + "learning_rate": 5.82654408951709e-06, + "loss": 0.7521, + "step": 8378 + }, + { + "epoch": 0.6340282244334304, + "grad_norm": 2.0984058380126953, + "learning_rate": 5.82443662762813e-06, + "loss": 0.6862, + "step": 8379 + }, + { + "epoch": 0.6341038931557641, + "grad_norm": 2.363299608230591, + "learning_rate": 5.8223293646032166e-06, + "loss": 0.6896, + "step": 8380 + }, + { + "epoch": 0.6341795618780977, + "grad_norm": 1.9636515378952026, + "learning_rate": 5.820222300574318e-06, + "loss": 0.7168, + "step": 8381 + }, + { + "epoch": 0.6342552306004313, + "grad_norm": 2.6832809448242188, + "learning_rate": 5.8181154356733815e-06, + "loss": 0.7862, + "step": 8382 + }, + { + "epoch": 0.634330899322765, + "grad_norm": 1.9470926523208618, + "learning_rate": 5.816008770032347e-06, + "loss": 0.7394, + "step": 8383 + }, + { + "epoch": 0.6344065680450985, + "grad_norm": 1.952431321144104, + "learning_rate": 5.8139023037831446e-06, + "loss": 0.7354, + "step": 8384 + }, + { + "epoch": 0.6344822367674322, + "grad_norm": 2.5503687858581543, + "learning_rate": 5.8117960370576845e-06, + "loss": 0.7003, + "step": 8385 + }, + { + "epoch": 0.6345579054897658, + "grad_norm": 2.3349881172180176, + "learning_rate": 5.809689969987869e-06, + "loss": 0.6624, + "step": 8386 + }, + { + "epoch": 0.6346335742120994, + "grad_norm": 2.486293315887451, + "learning_rate": 5.807584102705585e-06, + "loss": 0.7769, + "step": 8387 + }, + { + "epoch": 0.6347092429344331, + "grad_norm": 2.2371928691864014, + "learning_rate": 5.805478435342707e-06, + "loss": 0.7067, + "step": 8388 + }, + { + "epoch": 0.6347849116567666, + "grad_norm": 2.085529088973999, + "learning_rate": 5.803372968031108e-06, + "loss": 0.7602, + "step": 8389 + }, + { + "epoch": 0.6348605803791003, + "grad_norm": 2.1521055698394775, + "learning_rate": 5.80126770090263e-06, + "loss": 0.9044, + "step": 8390 + }, + { + "epoch": 0.6349362491014339, + "grad_norm": 2.458247661590576, + "learning_rate": 5.799162634089113e-06, + "loss": 0.6094, + "step": 8391 + }, + { + "epoch": 0.6350119178237675, + "grad_norm": 1.8370447158813477, + "learning_rate": 5.7970577677223876e-06, + "loss": 0.6742, + "step": 8392 + }, + { + "epoch": 0.6350875865461012, + "grad_norm": 2.2332441806793213, + "learning_rate": 5.794953101934262e-06, + "loss": 0.6287, + "step": 8393 + }, + { + "epoch": 0.6351632552684348, + "grad_norm": 2.1860263347625732, + "learning_rate": 5.792848636856537e-06, + "loss": 0.674, + "step": 8394 + }, + { + "epoch": 0.6352389239907684, + "grad_norm": 2.5469307899475098, + "learning_rate": 5.790744372621009e-06, + "loss": 0.7648, + "step": 8395 + }, + { + "epoch": 0.6353145927131021, + "grad_norm": 2.202962636947632, + "learning_rate": 5.788640309359445e-06, + "loss": 0.8781, + "step": 8396 + }, + { + "epoch": 0.6353902614354356, + "grad_norm": 1.9034675359725952, + "learning_rate": 5.786536447203615e-06, + "loss": 0.4877, + "step": 8397 + }, + { + "epoch": 0.6354659301577693, + "grad_norm": 1.8396954536437988, + "learning_rate": 5.784432786285264e-06, + "loss": 0.6477, + "step": 8398 + }, + { + "epoch": 0.6355415988801029, + "grad_norm": 2.2514522075653076, + "learning_rate": 5.78232932673613e-06, + "loss": 0.6755, + "step": 8399 + }, + { + "epoch": 0.6356172676024365, + "grad_norm": 2.3135526180267334, + "learning_rate": 
5.780226068687944e-06, + "loss": 0.6906, + "step": 8400 + }, + { + "epoch": 0.6356929363247702, + "grad_norm": 1.9569233655929565, + "learning_rate": 5.778123012272415e-06, + "loss": 0.7055, + "step": 8401 + }, + { + "epoch": 0.6357686050471038, + "grad_norm": 2.095384359359741, + "learning_rate": 5.776020157621244e-06, + "loss": 0.8051, + "step": 8402 + }, + { + "epoch": 0.6358442737694374, + "grad_norm": 2.363507032394409, + "learning_rate": 5.773917504866118e-06, + "loss": 0.6488, + "step": 8403 + }, + { + "epoch": 0.635919942491771, + "grad_norm": 1.7671669721603394, + "learning_rate": 5.77181505413871e-06, + "loss": 0.6069, + "step": 8404 + }, + { + "epoch": 0.6359956112141046, + "grad_norm": 2.2608208656311035, + "learning_rate": 5.7697128055706865e-06, + "loss": 0.7638, + "step": 8405 + }, + { + "epoch": 0.6360712799364383, + "grad_norm": 2.832077741622925, + "learning_rate": 5.767610759293697e-06, + "loss": 0.5604, + "step": 8406 + }, + { + "epoch": 0.6361469486587719, + "grad_norm": 1.9445099830627441, + "learning_rate": 5.765508915439374e-06, + "loss": 0.5337, + "step": 8407 + }, + { + "epoch": 0.6362226173811055, + "grad_norm": 2.1563804149627686, + "learning_rate": 5.763407274139347e-06, + "loss": 0.7573, + "step": 8408 + }, + { + "epoch": 0.6362982861034392, + "grad_norm": 2.05100417137146, + "learning_rate": 5.761305835525221e-06, + "loss": 0.5423, + "step": 8409 + }, + { + "epoch": 0.6363739548257727, + "grad_norm": 2.729825258255005, + "learning_rate": 5.7592045997286e-06, + "loss": 0.6868, + "step": 8410 + }, + { + "epoch": 0.6364496235481064, + "grad_norm": 2.297889232635498, + "learning_rate": 5.757103566881071e-06, + "loss": 0.8106, + "step": 8411 + }, + { + "epoch": 0.63652529227044, + "grad_norm": 2.0106875896453857, + "learning_rate": 5.755002737114204e-06, + "loss": 0.6299, + "step": 8412 + }, + { + "epoch": 0.6366009609927736, + "grad_norm": 2.3025240898132324, + "learning_rate": 5.752902110559564e-06, + "loss": 0.6633, + "step": 8413 + }, + { + "epoch": 0.6366766297151073, + "grad_norm": 2.3724968433380127, + "learning_rate": 5.75080168734869e-06, + "loss": 0.6491, + "step": 8414 + }, + { + "epoch": 0.6367522984374409, + "grad_norm": 2.080514430999756, + "learning_rate": 5.748701467613128e-06, + "loss": 0.5447, + "step": 8415 + }, + { + "epoch": 0.6368279671597745, + "grad_norm": 1.9041091203689575, + "learning_rate": 5.746601451484396e-06, + "loss": 0.7418, + "step": 8416 + }, + { + "epoch": 0.6369036358821081, + "grad_norm": 2.2937114238739014, + "learning_rate": 5.744501639094003e-06, + "loss": 0.9319, + "step": 8417 + }, + { + "epoch": 0.6369793046044417, + "grad_norm": 1.9907230138778687, + "learning_rate": 5.742402030573449e-06, + "loss": 0.7111, + "step": 8418 + }, + { + "epoch": 0.6370549733267754, + "grad_norm": 1.8847614526748657, + "learning_rate": 5.74030262605421e-06, + "loss": 0.6703, + "step": 8419 + }, + { + "epoch": 0.637130642049109, + "grad_norm": 1.972623586654663, + "learning_rate": 5.73820342566777e-06, + "loss": 0.763, + "step": 8420 + }, + { + "epoch": 0.6372063107714426, + "grad_norm": 2.680828332901001, + "learning_rate": 5.736104429545579e-06, + "loss": 0.7193, + "step": 8421 + }, + { + "epoch": 0.6372819794937763, + "grad_norm": 2.13775372505188, + "learning_rate": 5.7340056378190865e-06, + "loss": 0.6998, + "step": 8422 + }, + { + "epoch": 0.6373576482161099, + "grad_norm": 2.1243772506713867, + "learning_rate": 5.731907050619723e-06, + "loss": 0.6792, + "step": 8423 + }, + { + "epoch": 0.6374333169384435, + "grad_norm": 
1.8973451852798462, + "learning_rate": 5.72980866807891e-06, + "loss": 0.7644, + "step": 8424 + }, + { + "epoch": 0.6375089856607771, + "grad_norm": 1.803795576095581, + "learning_rate": 5.7277104903280575e-06, + "loss": 0.6412, + "step": 8425 + }, + { + "epoch": 0.6375846543831107, + "grad_norm": 1.9631472826004028, + "learning_rate": 5.725612517498555e-06, + "loss": 0.6546, + "step": 8426 + }, + { + "epoch": 0.6376603231054444, + "grad_norm": 2.7881197929382324, + "learning_rate": 5.723514749721792e-06, + "loss": 0.7727, + "step": 8427 + }, + { + "epoch": 0.637735991827778, + "grad_norm": 2.023376941680908, + "learning_rate": 5.721417187129128e-06, + "loss": 0.7832, + "step": 8428 + }, + { + "epoch": 0.6378116605501116, + "grad_norm": 2.40487003326416, + "learning_rate": 5.719319829851925e-06, + "loss": 0.8232, + "step": 8429 + }, + { + "epoch": 0.6378873292724452, + "grad_norm": 2.0782392024993896, + "learning_rate": 5.717222678021528e-06, + "loss": 0.5967, + "step": 8430 + }, + { + "epoch": 0.6379629979947788, + "grad_norm": 2.3502237796783447, + "learning_rate": 5.715125731769261e-06, + "loss": 0.8443, + "step": 8431 + }, + { + "epoch": 0.6380386667171125, + "grad_norm": 4.884004592895508, + "learning_rate": 5.713028991226448e-06, + "loss": 0.7055, + "step": 8432 + }, + { + "epoch": 0.6381143354394461, + "grad_norm": 2.4166226387023926, + "learning_rate": 5.71093245652439e-06, + "loss": 0.6572, + "step": 8433 + }, + { + "epoch": 0.6381900041617797, + "grad_norm": 1.7382185459136963, + "learning_rate": 5.708836127794382e-06, + "loss": 0.4822, + "step": 8434 + }, + { + "epoch": 0.6382656728841134, + "grad_norm": 1.87117338180542, + "learning_rate": 5.706740005167694e-06, + "loss": 0.7563, + "step": 8435 + }, + { + "epoch": 0.638341341606447, + "grad_norm": 2.3526854515075684, + "learning_rate": 5.704644088775605e-06, + "loss": 0.6714, + "step": 8436 + }, + { + "epoch": 0.6384170103287806, + "grad_norm": 1.952702283859253, + "learning_rate": 5.702548378749359e-06, + "loss": 0.6943, + "step": 8437 + }, + { + "epoch": 0.6384926790511142, + "grad_norm": 4.3784356117248535, + "learning_rate": 5.7004528752202e-06, + "loss": 0.5664, + "step": 8438 + }, + { + "epoch": 0.6385683477734478, + "grad_norm": 2.005580425262451, + "learning_rate": 5.698357578319353e-06, + "loss": 0.4389, + "step": 8439 + }, + { + "epoch": 0.6386440164957815, + "grad_norm": 2.7630388736724854, + "learning_rate": 5.696262488178031e-06, + "loss": 0.5916, + "step": 8440 + }, + { + "epoch": 0.6387196852181151, + "grad_norm": 2.2635788917541504, + "learning_rate": 5.694167604927441e-06, + "loss": 0.8042, + "step": 8441 + }, + { + "epoch": 0.6387953539404487, + "grad_norm": 2.6450791358947754, + "learning_rate": 5.692072928698768e-06, + "loss": 0.5544, + "step": 8442 + }, + { + "epoch": 0.6388710226627823, + "grad_norm": 2.0859289169311523, + "learning_rate": 5.689978459623186e-06, + "loss": 0.798, + "step": 8443 + }, + { + "epoch": 0.638946691385116, + "grad_norm": 2.267434597015381, + "learning_rate": 5.6878841978318596e-06, + "loss": 0.789, + "step": 8444 + }, + { + "epoch": 0.6390223601074496, + "grad_norm": 2.525017738342285, + "learning_rate": 5.6857901434559335e-06, + "loss": 0.6433, + "step": 8445 + }, + { + "epoch": 0.6390980288297832, + "grad_norm": 3.037821054458618, + "learning_rate": 5.683696296626554e-06, + "loss": 0.7781, + "step": 8446 + }, + { + "epoch": 0.6391736975521168, + "grad_norm": 2.607813835144043, + "learning_rate": 5.681602657474835e-06, + "loss": 0.7136, + "step": 8447 + }, + { + "epoch": 
0.6392493662744505, + "grad_norm": 2.1024436950683594, + "learning_rate": 5.679509226131894e-06, + "loss": 0.6246, + "step": 8448 + }, + { + "epoch": 0.6393250349967841, + "grad_norm": 2.2261359691619873, + "learning_rate": 5.677416002728822e-06, + "loss": 0.847, + "step": 8449 + }, + { + "epoch": 0.6394007037191177, + "grad_norm": 2.5437755584716797, + "learning_rate": 5.675322987396705e-06, + "loss": 0.5989, + "step": 8450 + }, + { + "epoch": 0.6394763724414513, + "grad_norm": 2.4765031337738037, + "learning_rate": 5.673230180266618e-06, + "loss": 0.6378, + "step": 8451 + }, + { + "epoch": 0.639552041163785, + "grad_norm": 2.359419584274292, + "learning_rate": 5.6711375814696184e-06, + "loss": 0.7394, + "step": 8452 + }, + { + "epoch": 0.6396277098861186, + "grad_norm": 1.8588393926620483, + "learning_rate": 5.66904519113675e-06, + "loss": 0.6876, + "step": 8453 + }, + { + "epoch": 0.6397033786084522, + "grad_norm": 2.5035762786865234, + "learning_rate": 5.666953009399045e-06, + "loss": 0.8412, + "step": 8454 + }, + { + "epoch": 0.6397790473307858, + "grad_norm": 2.0973055362701416, + "learning_rate": 5.6648610363875196e-06, + "loss": 0.6031, + "step": 8455 + }, + { + "epoch": 0.6398547160531194, + "grad_norm": 1.9568322896957397, + "learning_rate": 5.662769272233186e-06, + "loss": 0.6001, + "step": 8456 + }, + { + "epoch": 0.639930384775453, + "grad_norm": 1.8160796165466309, + "learning_rate": 5.660677717067035e-06, + "loss": 0.6849, + "step": 8457 + }, + { + "epoch": 0.6400060534977867, + "grad_norm": 2.6608216762542725, + "learning_rate": 5.658586371020046e-06, + "loss": 0.8533, + "step": 8458 + }, + { + "epoch": 0.6400817222201203, + "grad_norm": 1.9408966302871704, + "learning_rate": 5.6564952342231875e-06, + "loss": 0.584, + "step": 8459 + }, + { + "epoch": 0.6401573909424539, + "grad_norm": 2.22806978225708, + "learning_rate": 5.654404306807407e-06, + "loss": 0.6623, + "step": 8460 + }, + { + "epoch": 0.6402330596647876, + "grad_norm": 1.8229750394821167, + "learning_rate": 5.652313588903652e-06, + "loss": 0.6162, + "step": 8461 + }, + { + "epoch": 0.6403087283871212, + "grad_norm": 2.114300489425659, + "learning_rate": 5.650223080642849e-06, + "loss": 0.6585, + "step": 8462 + }, + { + "epoch": 0.6403843971094548, + "grad_norm": 2.9402008056640625, + "learning_rate": 5.648132782155911e-06, + "loss": 0.7531, + "step": 8463 + }, + { + "epoch": 0.6404600658317884, + "grad_norm": 2.2352404594421387, + "learning_rate": 5.646042693573738e-06, + "loss": 0.7114, + "step": 8464 + }, + { + "epoch": 0.640535734554122, + "grad_norm": 1.94172203540802, + "learning_rate": 5.643952815027218e-06, + "loss": 0.6461, + "step": 8465 + }, + { + "epoch": 0.6406114032764557, + "grad_norm": 2.5948123931884766, + "learning_rate": 5.6418631466472315e-06, + "loss": 0.714, + "step": 8466 + }, + { + "epoch": 0.6406870719987893, + "grad_norm": 1.9451491832733154, + "learning_rate": 5.639773688564634e-06, + "loss": 0.664, + "step": 8467 + }, + { + "epoch": 0.6407627407211229, + "grad_norm": 2.0619945526123047, + "learning_rate": 5.637684440910279e-06, + "loss": 0.7656, + "step": 8468 + }, + { + "epoch": 0.6408384094434566, + "grad_norm": 1.8974416255950928, + "learning_rate": 5.635595403814996e-06, + "loss": 0.6097, + "step": 8469 + }, + { + "epoch": 0.6409140781657902, + "grad_norm": 3.0312063694000244, + "learning_rate": 5.633506577409614e-06, + "loss": 0.7523, + "step": 8470 + }, + { + "epoch": 0.6409897468881238, + "grad_norm": 2.127302885055542, + "learning_rate": 5.631417961824933e-06, + "loss": 0.7443, 
+ "step": 8471 + }, + { + "epoch": 0.6410654156104574, + "grad_norm": 2.226922035217285, + "learning_rate": 5.629329557191758e-06, + "loss": 0.6491, + "step": 8472 + }, + { + "epoch": 0.641141084332791, + "grad_norm": 1.9703574180603027, + "learning_rate": 5.627241363640871e-06, + "loss": 0.5922, + "step": 8473 + }, + { + "epoch": 0.6412167530551247, + "grad_norm": 2.090078353881836, + "learning_rate": 5.6251533813030355e-06, + "loss": 0.6001, + "step": 8474 + }, + { + "epoch": 0.6412924217774583, + "grad_norm": 2.1429638862609863, + "learning_rate": 5.623065610309013e-06, + "loss": 0.6735, + "step": 8475 + }, + { + "epoch": 0.6413680904997919, + "grad_norm": 1.9501971006393433, + "learning_rate": 5.6209780507895404e-06, + "loss": 0.6364, + "step": 8476 + }, + { + "epoch": 0.6414437592221255, + "grad_norm": 1.956324577331543, + "learning_rate": 5.618890702875353e-06, + "loss": 0.7588, + "step": 8477 + }, + { + "epoch": 0.6415194279444592, + "grad_norm": 2.1515815258026123, + "learning_rate": 5.616803566697168e-06, + "loss": 0.7206, + "step": 8478 + }, + { + "epoch": 0.6415950966667928, + "grad_norm": 2.6640117168426514, + "learning_rate": 5.614716642385684e-06, + "loss": 0.6887, + "step": 8479 + }, + { + "epoch": 0.6416707653891264, + "grad_norm": 2.3514342308044434, + "learning_rate": 5.612629930071594e-06, + "loss": 0.5355, + "step": 8480 + }, + { + "epoch": 0.64174643411146, + "grad_norm": 1.976243019104004, + "learning_rate": 5.610543429885571e-06, + "loss": 0.7735, + "step": 8481 + }, + { + "epoch": 0.6418221028337937, + "grad_norm": 2.3452563285827637, + "learning_rate": 5.608457141958285e-06, + "loss": 0.6755, + "step": 8482 + }, + { + "epoch": 0.6418977715561273, + "grad_norm": 2.0472850799560547, + "learning_rate": 5.60637106642038e-06, + "loss": 0.6521, + "step": 8483 + }, + { + "epoch": 0.6419734402784609, + "grad_norm": 2.2305705547332764, + "learning_rate": 5.6042852034024995e-06, + "loss": 0.8029, + "step": 8484 + }, + { + "epoch": 0.6420491090007945, + "grad_norm": 2.1921286582946777, + "learning_rate": 5.602199553035258e-06, + "loss": 0.6344, + "step": 8485 + }, + { + "epoch": 0.6421247777231281, + "grad_norm": 2.1539828777313232, + "learning_rate": 5.600114115449269e-06, + "loss": 0.7381, + "step": 8486 + }, + { + "epoch": 0.6422004464454618, + "grad_norm": 1.9343609809875488, + "learning_rate": 5.598028890775135e-06, + "loss": 0.5944, + "step": 8487 + }, + { + "epoch": 0.6422761151677954, + "grad_norm": 1.9569308757781982, + "learning_rate": 5.595943879143434e-06, + "loss": 0.6096, + "step": 8488 + }, + { + "epoch": 0.642351783890129, + "grad_norm": 2.5590083599090576, + "learning_rate": 5.593859080684738e-06, + "loss": 0.6341, + "step": 8489 + }, + { + "epoch": 0.6424274526124626, + "grad_norm": 2.0084571838378906, + "learning_rate": 5.591774495529602e-06, + "loss": 0.5779, + "step": 8490 + }, + { + "epoch": 0.6425031213347963, + "grad_norm": 2.7293450832366943, + "learning_rate": 5.589690123808568e-06, + "loss": 0.7232, + "step": 8491 + }, + { + "epoch": 0.6425787900571299, + "grad_norm": 2.072709321975708, + "learning_rate": 5.587605965652173e-06, + "loss": 0.7979, + "step": 8492 + }, + { + "epoch": 0.6426544587794635, + "grad_norm": 1.9151296615600586, + "learning_rate": 5.585522021190928e-06, + "loss": 0.7374, + "step": 8493 + }, + { + "epoch": 0.6427301275017971, + "grad_norm": 3.357416868209839, + "learning_rate": 5.583438290555337e-06, + "loss": 0.6413, + "step": 8494 + }, + { + "epoch": 0.6428057962241308, + "grad_norm": 2.125547409057617, + "learning_rate": 
5.581354773875893e-06, + "loss": 0.6862, + "step": 8495 + }, + { + "epoch": 0.6428814649464644, + "grad_norm": 2.053462266921997, + "learning_rate": 5.579271471283065e-06, + "loss": 0.7123, + "step": 8496 + }, + { + "epoch": 0.642957133668798, + "grad_norm": 2.2528815269470215, + "learning_rate": 5.577188382907326e-06, + "loss": 0.6743, + "step": 8497 + }, + { + "epoch": 0.6430328023911316, + "grad_norm": 2.1880528926849365, + "learning_rate": 5.575105508879122e-06, + "loss": 0.6552, + "step": 8498 + }, + { + "epoch": 0.6431084711134653, + "grad_norm": 2.2088634967803955, + "learning_rate": 5.573022849328886e-06, + "loss": 0.8085, + "step": 8499 + }, + { + "epoch": 0.6431841398357989, + "grad_norm": 2.240103244781494, + "learning_rate": 5.570940404387046e-06, + "loss": 0.6571, + "step": 8500 + }, + { + "epoch": 0.6432598085581325, + "grad_norm": 2.2424118518829346, + "learning_rate": 5.568858174184005e-06, + "loss": 0.5845, + "step": 8501 + }, + { + "epoch": 0.6433354772804661, + "grad_norm": 2.1710944175720215, + "learning_rate": 5.566776158850164e-06, + "loss": 0.7788, + "step": 8502 + }, + { + "epoch": 0.6434111460027997, + "grad_norm": 2.4002442359924316, + "learning_rate": 5.564694358515907e-06, + "loss": 0.5717, + "step": 8503 + }, + { + "epoch": 0.6434868147251334, + "grad_norm": 1.750193476676941, + "learning_rate": 5.5626127733115976e-06, + "loss": 0.8584, + "step": 8504 + }, + { + "epoch": 0.643562483447467, + "grad_norm": 2.198309898376465, + "learning_rate": 5.560531403367596e-06, + "loss": 0.8146, + "step": 8505 + }, + { + "epoch": 0.6436381521698006, + "grad_norm": 1.8954391479492188, + "learning_rate": 5.55845024881424e-06, + "loss": 0.6983, + "step": 8506 + }, + { + "epoch": 0.6437138208921342, + "grad_norm": 2.1946661472320557, + "learning_rate": 5.556369309781862e-06, + "loss": 0.729, + "step": 8507 + }, + { + "epoch": 0.6437894896144679, + "grad_norm": 2.274904727935791, + "learning_rate": 5.5542885864007756e-06, + "loss": 0.6662, + "step": 8508 + }, + { + "epoch": 0.6438651583368015, + "grad_norm": 2.831035852432251, + "learning_rate": 5.5522080788012845e-06, + "loss": 0.8062, + "step": 8509 + }, + { + "epoch": 0.6439408270591351, + "grad_norm": 2.2157704830169678, + "learning_rate": 5.550127787113674e-06, + "loss": 0.4898, + "step": 8510 + }, + { + "epoch": 0.6440164957814687, + "grad_norm": 2.1222331523895264, + "learning_rate": 5.548047711468221e-06, + "loss": 0.7425, + "step": 8511 + }, + { + "epoch": 0.6440921645038024, + "grad_norm": 2.3518216609954834, + "learning_rate": 5.545967851995182e-06, + "loss": 0.9132, + "step": 8512 + }, + { + "epoch": 0.644167833226136, + "grad_norm": 2.1871204376220703, + "learning_rate": 5.543888208824809e-06, + "loss": 0.697, + "step": 8513 + }, + { + "epoch": 0.6442435019484696, + "grad_norm": 1.9656989574432373, + "learning_rate": 5.541808782087337e-06, + "loss": 0.7371, + "step": 8514 + }, + { + "epoch": 0.6443191706708032, + "grad_norm": 2.232603073120117, + "learning_rate": 5.539729571912982e-06, + "loss": 0.689, + "step": 8515 + }, + { + "epoch": 0.6443948393931368, + "grad_norm": 3.9067957401275635, + "learning_rate": 5.537650578431956e-06, + "loss": 0.7104, + "step": 8516 + }, + { + "epoch": 0.6444705081154705, + "grad_norm": 2.082522392272949, + "learning_rate": 5.5355718017744444e-06, + "loss": 0.7162, + "step": 8517 + }, + { + "epoch": 0.6445461768378041, + "grad_norm": 2.175347328186035, + "learning_rate": 5.533493242070634e-06, + "loss": 0.7402, + "step": 8518 + }, + { + "epoch": 0.6446218455601377, + "grad_norm": 
1.8790175914764404, + "learning_rate": 5.531414899450692e-06, + "loss": 0.6263, + "step": 8519 + }, + { + "epoch": 0.6446975142824714, + "grad_norm": 2.5705785751342773, + "learning_rate": 5.529336774044764e-06, + "loss": 0.694, + "step": 8520 + }, + { + "epoch": 0.644773183004805, + "grad_norm": 1.955169916152954, + "learning_rate": 5.527258865982995e-06, + "loss": 0.8155, + "step": 8521 + }, + { + "epoch": 0.6448488517271386, + "grad_norm": 1.9175649881362915, + "learning_rate": 5.525181175395503e-06, + "loss": 0.8298, + "step": 8522 + }, + { + "epoch": 0.6449245204494722, + "grad_norm": 2.2717673778533936, + "learning_rate": 5.523103702412411e-06, + "loss": 0.7048, + "step": 8523 + }, + { + "epoch": 0.6450001891718058, + "grad_norm": 2.508749485015869, + "learning_rate": 5.521026447163807e-06, + "loss": 0.7999, + "step": 8524 + }, + { + "epoch": 0.6450758578941395, + "grad_norm": 1.875464677810669, + "learning_rate": 5.5189494097797825e-06, + "loss": 0.6481, + "step": 8525 + }, + { + "epoch": 0.6451515266164731, + "grad_norm": 2.2824044227600098, + "learning_rate": 5.516872590390402e-06, + "loss": 0.6753, + "step": 8526 + }, + { + "epoch": 0.6452271953388067, + "grad_norm": 2.4387025833129883, + "learning_rate": 5.514795989125725e-06, + "loss": 0.7616, + "step": 8527 + }, + { + "epoch": 0.6453028640611403, + "grad_norm": 1.9747780561447144, + "learning_rate": 5.512719606115799e-06, + "loss": 0.7541, + "step": 8528 + }, + { + "epoch": 0.6453785327834739, + "grad_norm": 2.339794158935547, + "learning_rate": 5.510643441490649e-06, + "loss": 0.7607, + "step": 8529 + }, + { + "epoch": 0.6454542015058076, + "grad_norm": 2.1262335777282715, + "learning_rate": 5.5085674953802945e-06, + "loss": 0.5666, + "step": 8530 + }, + { + "epoch": 0.6455298702281412, + "grad_norm": 1.9069366455078125, + "learning_rate": 5.506491767914734e-06, + "loss": 0.6467, + "step": 8531 + }, + { + "epoch": 0.6456055389504748, + "grad_norm": 2.164801597595215, + "learning_rate": 5.504416259223956e-06, + "loss": 0.6073, + "step": 8532 + }, + { + "epoch": 0.6456812076728085, + "grad_norm": 3.146503210067749, + "learning_rate": 5.502340969437942e-06, + "loss": 0.6192, + "step": 8533 + }, + { + "epoch": 0.6457568763951421, + "grad_norm": 2.1276612281799316, + "learning_rate": 5.5002658986866475e-06, + "loss": 0.6759, + "step": 8534 + }, + { + "epoch": 0.6458325451174757, + "grad_norm": 1.8618454933166504, + "learning_rate": 5.498191047100023e-06, + "loss": 0.5535, + "step": 8535 + }, + { + "epoch": 0.6459082138398093, + "grad_norm": 1.86492121219635, + "learning_rate": 5.496116414808002e-06, + "loss": 0.6922, + "step": 8536 + }, + { + "epoch": 0.6459838825621429, + "grad_norm": 1.8783886432647705, + "learning_rate": 5.4940420019405e-06, + "loss": 0.6147, + "step": 8537 + }, + { + "epoch": 0.6460595512844766, + "grad_norm": 2.076680898666382, + "learning_rate": 5.49196780862743e-06, + "loss": 0.7457, + "step": 8538 + }, + { + "epoch": 0.6461352200068102, + "grad_norm": 1.9781914949417114, + "learning_rate": 5.489893834998683e-06, + "loss": 0.7596, + "step": 8539 + }, + { + "epoch": 0.6462108887291438, + "grad_norm": 3.7510299682617188, + "learning_rate": 5.487820081184136e-06, + "loss": 0.6868, + "step": 8540 + }, + { + "epoch": 0.6462865574514775, + "grad_norm": 2.0768771171569824, + "learning_rate": 5.485746547313658e-06, + "loss": 0.606, + "step": 8541 + }, + { + "epoch": 0.646362226173811, + "grad_norm": 1.8995541334152222, + "learning_rate": 5.483673233517094e-06, + "loss": 0.7812, + "step": 8542 + }, + { + "epoch": 
0.6464378948961447, + "grad_norm": 2.8474512100219727, + "learning_rate": 5.4816001399242865e-06, + "loss": 0.6534, + "step": 8543 + }, + { + "epoch": 0.6465135636184783, + "grad_norm": 2.1161701679229736, + "learning_rate": 5.479527266665059e-06, + "loss": 0.6513, + "step": 8544 + }, + { + "epoch": 0.6465892323408119, + "grad_norm": 1.9404470920562744, + "learning_rate": 5.477454613869219e-06, + "loss": 0.5536, + "step": 8545 + }, + { + "epoch": 0.6466649010631456, + "grad_norm": 2.743656873703003, + "learning_rate": 5.475382181666567e-06, + "loss": 0.6976, + "step": 8546 + }, + { + "epoch": 0.6467405697854792, + "grad_norm": 1.7919031381607056, + "learning_rate": 5.473309970186882e-06, + "loss": 0.6573, + "step": 8547 + }, + { + "epoch": 0.6468162385078128, + "grad_norm": 2.278919219970703, + "learning_rate": 5.4712379795599295e-06, + "loss": 0.7562, + "step": 8548 + }, + { + "epoch": 0.6468919072301464, + "grad_norm": 2.6098785400390625, + "learning_rate": 5.469166209915472e-06, + "loss": 0.7391, + "step": 8549 + }, + { + "epoch": 0.64696757595248, + "grad_norm": 2.1722793579101562, + "learning_rate": 5.4670946613832466e-06, + "loss": 0.839, + "step": 8550 + }, + { + "epoch": 0.6470432446748137, + "grad_norm": 2.256061553955078, + "learning_rate": 5.465023334092981e-06, + "loss": 0.6023, + "step": 8551 + }, + { + "epoch": 0.6471189133971473, + "grad_norm": 2.3544907569885254, + "learning_rate": 5.4629522281743846e-06, + "loss": 0.6151, + "step": 8552 + }, + { + "epoch": 0.6471945821194809, + "grad_norm": 1.9995858669281006, + "learning_rate": 5.46088134375716e-06, + "loss": 0.6645, + "step": 8553 + }, + { + "epoch": 0.6472702508418146, + "grad_norm": 2.3130130767822266, + "learning_rate": 5.4588106809709945e-06, + "loss": 0.6585, + "step": 8554 + }, + { + "epoch": 0.6473459195641481, + "grad_norm": 2.5768182277679443, + "learning_rate": 5.456740239945559e-06, + "loss": 0.8411, + "step": 8555 + }, + { + "epoch": 0.6474215882864818, + "grad_norm": 2.34093976020813, + "learning_rate": 5.454670020810507e-06, + "loss": 0.7124, + "step": 8556 + }, + { + "epoch": 0.6474972570088154, + "grad_norm": 3.165555477142334, + "learning_rate": 5.452600023695488e-06, + "loss": 0.6142, + "step": 8557 + }, + { + "epoch": 0.647572925731149, + "grad_norm": 2.1847751140594482, + "learning_rate": 5.450530248730125e-06, + "loss": 0.6331, + "step": 8558 + }, + { + "epoch": 0.6476485944534827, + "grad_norm": 2.339569568634033, + "learning_rate": 5.448460696044041e-06, + "loss": 0.6593, + "step": 8559 + }, + { + "epoch": 0.6477242631758163, + "grad_norm": 2.989414691925049, + "learning_rate": 5.446391365766837e-06, + "loss": 0.6377, + "step": 8560 + }, + { + "epoch": 0.6477999318981499, + "grad_norm": 2.329726457595825, + "learning_rate": 5.444322258028096e-06, + "loss": 0.5623, + "step": 8561 + }, + { + "epoch": 0.6478756006204835, + "grad_norm": 2.963811159133911, + "learning_rate": 5.442253372957399e-06, + "loss": 0.816, + "step": 8562 + }, + { + "epoch": 0.6479512693428171, + "grad_norm": 2.018897533416748, + "learning_rate": 5.440184710684299e-06, + "loss": 0.742, + "step": 8563 + }, + { + "epoch": 0.6480269380651508, + "grad_norm": 2.518212080001831, + "learning_rate": 5.438116271338347e-06, + "loss": 0.8152, + "step": 8564 + }, + { + "epoch": 0.6481026067874844, + "grad_norm": 1.4820868968963623, + "learning_rate": 5.436048055049079e-06, + "loss": 0.5836, + "step": 8565 + }, + { + "epoch": 0.648178275509818, + "grad_norm": 2.1838934421539307, + "learning_rate": 5.433980061946006e-06, + "loss": 0.6756, + 
"step": 8566 + }, + { + "epoch": 0.6482539442321517, + "grad_norm": 2.867804527282715, + "learning_rate": 5.4319122921586354e-06, + "loss": 0.6669, + "step": 8567 + }, + { + "epoch": 0.6483296129544852, + "grad_norm": 2.323784112930298, + "learning_rate": 5.429844745816454e-06, + "loss": 0.7107, + "step": 8568 + }, + { + "epoch": 0.6484052816768189, + "grad_norm": 2.1240692138671875, + "learning_rate": 5.427777423048948e-06, + "loss": 0.7573, + "step": 8569 + }, + { + "epoch": 0.6484809503991525, + "grad_norm": 2.3311257362365723, + "learning_rate": 5.425710323985571e-06, + "loss": 0.6085, + "step": 8570 + }, + { + "epoch": 0.6485566191214861, + "grad_norm": 2.233501672744751, + "learning_rate": 5.423643448755776e-06, + "loss": 0.7538, + "step": 8571 + }, + { + "epoch": 0.6486322878438198, + "grad_norm": 2.3207247257232666, + "learning_rate": 5.421576797488994e-06, + "loss": 0.678, + "step": 8572 + }, + { + "epoch": 0.6487079565661534, + "grad_norm": 2.2097394466400146, + "learning_rate": 5.4195103703146445e-06, + "loss": 0.7674, + "step": 8573 + }, + { + "epoch": 0.648783625288487, + "grad_norm": 3.0667641162872314, + "learning_rate": 5.41744416736214e-06, + "loss": 0.6953, + "step": 8574 + }, + { + "epoch": 0.6488592940108207, + "grad_norm": 2.093312978744507, + "learning_rate": 5.4153781887608684e-06, + "loss": 0.7736, + "step": 8575 + }, + { + "epoch": 0.6489349627331542, + "grad_norm": 1.9957607984542847, + "learning_rate": 5.41331243464021e-06, + "loss": 0.8296, + "step": 8576 + }, + { + "epoch": 0.6490106314554879, + "grad_norm": 1.9288572072982788, + "learning_rate": 5.411246905129525e-06, + "loss": 0.6244, + "step": 8577 + }, + { + "epoch": 0.6490863001778215, + "grad_norm": 2.4149534702301025, + "learning_rate": 5.409181600358165e-06, + "loss": 0.7823, + "step": 8578 + }, + { + "epoch": 0.6491619689001551, + "grad_norm": 2.701127052307129, + "learning_rate": 5.407116520455471e-06, + "loss": 0.661, + "step": 8579 + }, + { + "epoch": 0.6492376376224888, + "grad_norm": 2.4020912647247314, + "learning_rate": 5.405051665550759e-06, + "loss": 0.7354, + "step": 8580 + }, + { + "epoch": 0.6493133063448223, + "grad_norm": 2.4615938663482666, + "learning_rate": 5.4029870357733405e-06, + "loss": 0.6902, + "step": 8581 + }, + { + "epoch": 0.649388975067156, + "grad_norm": 2.4611809253692627, + "learning_rate": 5.400922631252509e-06, + "loss": 0.5852, + "step": 8582 + }, + { + "epoch": 0.6494646437894896, + "grad_norm": 2.5221340656280518, + "learning_rate": 5.398858452117539e-06, + "loss": 0.6872, + "step": 8583 + }, + { + "epoch": 0.6495403125118232, + "grad_norm": 1.908057451248169, + "learning_rate": 5.396794498497703e-06, + "loss": 0.6117, + "step": 8584 + }, + { + "epoch": 0.6496159812341569, + "grad_norm": 1.944278597831726, + "learning_rate": 5.3947307705222515e-06, + "loss": 0.6342, + "step": 8585 + }, + { + "epoch": 0.6496916499564905, + "grad_norm": 1.9990040063858032, + "learning_rate": 5.392667268320418e-06, + "loss": 0.6878, + "step": 8586 + }, + { + "epoch": 0.6497673186788241, + "grad_norm": 2.623224973678589, + "learning_rate": 5.390603992021429e-06, + "loss": 0.6674, + "step": 8587 + }, + { + "epoch": 0.6498429874011578, + "grad_norm": 2.0191500186920166, + "learning_rate": 5.38854094175449e-06, + "loss": 0.6673, + "step": 8588 + }, + { + "epoch": 0.6499186561234913, + "grad_norm": 2.417546510696411, + "learning_rate": 5.386478117648798e-06, + "loss": 0.5823, + "step": 8589 + }, + { + "epoch": 0.649994324845825, + "grad_norm": 1.9548628330230713, + "learning_rate": 
5.384415519833536e-06, + "loss": 0.7199, + "step": 8590 + }, + { + "epoch": 0.6500699935681586, + "grad_norm": 2.064302444458008, + "learning_rate": 5.382353148437866e-06, + "loss": 0.6522, + "step": 8591 + }, + { + "epoch": 0.6501456622904922, + "grad_norm": 2.792672872543335, + "learning_rate": 5.380291003590946e-06, + "loss": 0.5625, + "step": 8592 + }, + { + "epoch": 0.6502213310128259, + "grad_norm": 2.0606002807617188, + "learning_rate": 5.378229085421909e-06, + "loss": 0.7184, + "step": 8593 + }, + { + "epoch": 0.6502969997351594, + "grad_norm": 2.4181129932403564, + "learning_rate": 5.376167394059879e-06, + "loss": 0.7157, + "step": 8594 + }, + { + "epoch": 0.6503726684574931, + "grad_norm": 2.101733922958374, + "learning_rate": 5.374105929633969e-06, + "loss": 0.6396, + "step": 8595 + }, + { + "epoch": 0.6504483371798268, + "grad_norm": 2.896683931350708, + "learning_rate": 5.372044692273275e-06, + "loss": 0.749, + "step": 8596 + }, + { + "epoch": 0.6505240059021603, + "grad_norm": 2.5679473876953125, + "learning_rate": 5.369983682106875e-06, + "loss": 0.6503, + "step": 8597 + }, + { + "epoch": 0.650599674624494, + "grad_norm": 2.505889892578125, + "learning_rate": 5.36792289926384e-06, + "loss": 0.7238, + "step": 8598 + }, + { + "epoch": 0.6506753433468276, + "grad_norm": 2.405747413635254, + "learning_rate": 5.3658623438732165e-06, + "loss": 0.7156, + "step": 8599 + }, + { + "epoch": 0.6507510120691612, + "grad_norm": 2.227442741394043, + "learning_rate": 5.363802016064049e-06, + "loss": 0.5993, + "step": 8600 + }, + { + "epoch": 0.6508266807914949, + "grad_norm": 2.0744035243988037, + "learning_rate": 5.361741915965362e-06, + "loss": 0.7058, + "step": 8601 + }, + { + "epoch": 0.6509023495138284, + "grad_norm": 2.3363168239593506, + "learning_rate": 5.359682043706162e-06, + "loss": 0.5921, + "step": 8602 + }, + { + "epoch": 0.6509780182361621, + "grad_norm": 2.573744058609009, + "learning_rate": 5.357622399415448e-06, + "loss": 0.7344, + "step": 8603 + }, + { + "epoch": 0.6510536869584957, + "grad_norm": 2.494749069213867, + "learning_rate": 5.355562983222197e-06, + "loss": 0.7317, + "step": 8604 + }, + { + "epoch": 0.6511293556808293, + "grad_norm": 2.3259880542755127, + "learning_rate": 5.353503795255382e-06, + "loss": 0.695, + "step": 8605 + }, + { + "epoch": 0.651205024403163, + "grad_norm": 2.2108333110809326, + "learning_rate": 5.3514448356439545e-06, + "loss": 0.5293, + "step": 8606 + }, + { + "epoch": 0.6512806931254965, + "grad_norm": 2.3737170696258545, + "learning_rate": 5.3493861045168514e-06, + "loss": 0.5965, + "step": 8607 + }, + { + "epoch": 0.6513563618478302, + "grad_norm": 2.9041547775268555, + "learning_rate": 5.347327602002999e-06, + "loss": 0.7724, + "step": 8608 + }, + { + "epoch": 0.6514320305701639, + "grad_norm": 2.1703732013702393, + "learning_rate": 5.345269328231303e-06, + "loss": 0.5928, + "step": 8609 + }, + { + "epoch": 0.6515076992924974, + "grad_norm": 2.191357374191284, + "learning_rate": 5.343211283330663e-06, + "loss": 0.8274, + "step": 8610 + }, + { + "epoch": 0.6515833680148311, + "grad_norm": 2.2903363704681396, + "learning_rate": 5.341153467429962e-06, + "loss": 0.6822, + "step": 8611 + }, + { + "epoch": 0.6516590367371647, + "grad_norm": 2.7158641815185547, + "learning_rate": 5.339095880658066e-06, + "loss": 0.7942, + "step": 8612 + }, + { + "epoch": 0.6517347054594983, + "grad_norm": 1.8598185777664185, + "learning_rate": 5.3370385231438245e-06, + "loss": 0.7329, + "step": 8613 + }, + { + "epoch": 0.651810374181832, + "grad_norm": 
2.0901763439178467, + "learning_rate": 5.334981395016076e-06, + "loss": 0.7677, + "step": 8614 + }, + { + "epoch": 0.6518860429041655, + "grad_norm": 2.2997934818267822, + "learning_rate": 5.33292449640365e-06, + "loss": 0.6171, + "step": 8615 + }, + { + "epoch": 0.6519617116264992, + "grad_norm": 2.0025293827056885, + "learning_rate": 5.330867827435353e-06, + "loss": 0.713, + "step": 8616 + }, + { + "epoch": 0.6520373803488329, + "grad_norm": 2.187777042388916, + "learning_rate": 5.328811388239981e-06, + "loss": 0.5571, + "step": 8617 + }, + { + "epoch": 0.6521130490711664, + "grad_norm": 2.3849644660949707, + "learning_rate": 5.326755178946312e-06, + "loss": 0.6938, + "step": 8618 + }, + { + "epoch": 0.6521887177935001, + "grad_norm": 2.3532207012176514, + "learning_rate": 5.324699199683113e-06, + "loss": 0.7331, + "step": 8619 + }, + { + "epoch": 0.6522643865158336, + "grad_norm": 2.0683131217956543, + "learning_rate": 5.3226434505791405e-06, + "loss": 0.6739, + "step": 8620 + }, + { + "epoch": 0.6523400552381673, + "grad_norm": 1.9506670236587524, + "learning_rate": 5.320587931763127e-06, + "loss": 0.6907, + "step": 8621 + }, + { + "epoch": 0.652415723960501, + "grad_norm": 1.8213778734207153, + "learning_rate": 5.3185326433638e-06, + "loss": 0.7844, + "step": 8622 + }, + { + "epoch": 0.6524913926828345, + "grad_norm": 2.387977361679077, + "learning_rate": 5.316477585509865e-06, + "loss": 0.6507, + "step": 8623 + }, + { + "epoch": 0.6525670614051682, + "grad_norm": 2.132040500640869, + "learning_rate": 5.3144227583300185e-06, + "loss": 0.6664, + "step": 8624 + }, + { + "epoch": 0.6526427301275018, + "grad_norm": 2.443291187286377, + "learning_rate": 5.312368161952933e-06, + "loss": 0.5767, + "step": 8625 + }, + { + "epoch": 0.6527183988498354, + "grad_norm": 2.27197265625, + "learning_rate": 5.310313796507288e-06, + "loss": 0.6735, + "step": 8626 + }, + { + "epoch": 0.6527940675721691, + "grad_norm": 2.6916403770446777, + "learning_rate": 5.308259662121724e-06, + "loss": 0.6195, + "step": 8627 + }, + { + "epoch": 0.6528697362945026, + "grad_norm": 2.7389891147613525, + "learning_rate": 5.306205758924883e-06, + "loss": 0.7494, + "step": 8628 + }, + { + "epoch": 0.6529454050168363, + "grad_norm": 2.406222105026245, + "learning_rate": 5.304152087045383e-06, + "loss": 0.6614, + "step": 8629 + }, + { + "epoch": 0.65302107373917, + "grad_norm": 2.205073595046997, + "learning_rate": 5.3020986466118305e-06, + "loss": 0.7074, + "step": 8630 + }, + { + "epoch": 0.6530967424615035, + "grad_norm": 2.350299119949341, + "learning_rate": 5.3000454377528256e-06, + "loss": 0.8051, + "step": 8631 + }, + { + "epoch": 0.6531724111838372, + "grad_norm": 2.4323620796203613, + "learning_rate": 5.297992460596941e-06, + "loss": 0.6793, + "step": 8632 + }, + { + "epoch": 0.6532480799061707, + "grad_norm": 2.1624834537506104, + "learning_rate": 5.295939715272742e-06, + "loss": 0.7988, + "step": 8633 + }, + { + "epoch": 0.6533237486285044, + "grad_norm": 2.935800075531006, + "learning_rate": 5.293887201908778e-06, + "loss": 0.7486, + "step": 8634 + }, + { + "epoch": 0.6533994173508381, + "grad_norm": 2.3798375129699707, + "learning_rate": 5.291834920633583e-06, + "loss": 0.5782, + "step": 8635 + }, + { + "epoch": 0.6534750860731716, + "grad_norm": 2.4022819995880127, + "learning_rate": 5.289782871575682e-06, + "loss": 0.666, + "step": 8636 + }, + { + "epoch": 0.6535507547955053, + "grad_norm": 1.8358285427093506, + "learning_rate": 5.287731054863575e-06, + "loss": 0.7288, + "step": 8637 + }, + { + "epoch": 
0.653626423517839, + "grad_norm": 2.1910176277160645, + "learning_rate": 5.28567947062576e-06, + "loss": 0.8125, + "step": 8638 + }, + { + "epoch": 0.6537020922401725, + "grad_norm": 2.170234203338623, + "learning_rate": 5.283628118990708e-06, + "loss": 0.6749, + "step": 8639 + }, + { + "epoch": 0.6537777609625062, + "grad_norm": 2.4804930686950684, + "learning_rate": 5.281577000086881e-06, + "loss": 0.6696, + "step": 8640 + }, + { + "epoch": 0.6538534296848397, + "grad_norm": 2.3019261360168457, + "learning_rate": 5.279526114042731e-06, + "loss": 0.5192, + "step": 8641 + }, + { + "epoch": 0.6539290984071734, + "grad_norm": 2.1788456439971924, + "learning_rate": 5.27747546098669e-06, + "loss": 0.7353, + "step": 8642 + }, + { + "epoch": 0.6540047671295071, + "grad_norm": 2.3936922550201416, + "learning_rate": 5.2754250410471755e-06, + "loss": 0.5753, + "step": 8643 + }, + { + "epoch": 0.6540804358518406, + "grad_norm": 2.109896421432495, + "learning_rate": 5.2733748543525925e-06, + "loss": 0.572, + "step": 8644 + }, + { + "epoch": 0.6541561045741743, + "grad_norm": 2.3399693965911865, + "learning_rate": 5.271324901031326e-06, + "loss": 0.6521, + "step": 8645 + }, + { + "epoch": 0.6542317732965078, + "grad_norm": 2.2131054401397705, + "learning_rate": 5.2692751812117576e-06, + "loss": 0.6941, + "step": 8646 + }, + { + "epoch": 0.6543074420188415, + "grad_norm": 2.646402597427368, + "learning_rate": 5.267225695022244e-06, + "loss": 0.802, + "step": 8647 + }, + { + "epoch": 0.6543831107411752, + "grad_norm": 1.9599591493606567, + "learning_rate": 5.26517644259113e-06, + "loss": 0.8231, + "step": 8648 + }, + { + "epoch": 0.6544587794635087, + "grad_norm": 2.2101340293884277, + "learning_rate": 5.263127424046747e-06, + "loss": 0.7479, + "step": 8649 + }, + { + "epoch": 0.6545344481858424, + "grad_norm": 2.379575252532959, + "learning_rate": 5.26107863951741e-06, + "loss": 0.6335, + "step": 8650 + }, + { + "epoch": 0.654610116908176, + "grad_norm": 2.6849305629730225, + "learning_rate": 5.259030089131421e-06, + "loss": 0.7674, + "step": 8651 + }, + { + "epoch": 0.6546857856305096, + "grad_norm": 2.3194165229797363, + "learning_rate": 5.256981773017071e-06, + "loss": 0.7081, + "step": 8652 + }, + { + "epoch": 0.6547614543528433, + "grad_norm": 5.000668048858643, + "learning_rate": 5.254933691302628e-06, + "loss": 0.737, + "step": 8653 + }, + { + "epoch": 0.6548371230751768, + "grad_norm": 2.265462875366211, + "learning_rate": 5.252885844116347e-06, + "loss": 0.5589, + "step": 8654 + }, + { + "epoch": 0.6549127917975105, + "grad_norm": 2.609497308731079, + "learning_rate": 5.250838231586477e-06, + "loss": 0.7, + "step": 8655 + }, + { + "epoch": 0.6549884605198442, + "grad_norm": 2.6671085357666016, + "learning_rate": 5.248790853841241e-06, + "loss": 0.6093, + "step": 8656 + }, + { + "epoch": 0.6550641292421777, + "grad_norm": 2.3832755088806152, + "learning_rate": 5.2467437110088565e-06, + "loss": 0.7668, + "step": 8657 + }, + { + "epoch": 0.6551397979645114, + "grad_norm": 2.164400815963745, + "learning_rate": 5.24469680321752e-06, + "loss": 0.6777, + "step": 8658 + }, + { + "epoch": 0.6552154666868449, + "grad_norm": 1.7403801679611206, + "learning_rate": 5.242650130595418e-06, + "loss": 0.6456, + "step": 8659 + }, + { + "epoch": 0.6552911354091786, + "grad_norm": 2.5465121269226074, + "learning_rate": 5.240603693270712e-06, + "loss": 0.7281, + "step": 8660 + }, + { + "epoch": 0.6553668041315123, + "grad_norm": 2.5230278968811035, + "learning_rate": 5.238557491371566e-06, + "loss": 0.7588, + 
"step": 8661 + }, + { + "epoch": 0.6554424728538458, + "grad_norm": 1.767684817314148, + "learning_rate": 5.236511525026118e-06, + "loss": 0.824, + "step": 8662 + }, + { + "epoch": 0.6555181415761795, + "grad_norm": 2.1746935844421387, + "learning_rate": 5.2344657943624876e-06, + "loss": 0.712, + "step": 8663 + }, + { + "epoch": 0.6555938102985132, + "grad_norm": 2.5205862522125244, + "learning_rate": 5.232420299508789e-06, + "loss": 0.7244, + "step": 8664 + }, + { + "epoch": 0.6556694790208467, + "grad_norm": 2.556107759475708, + "learning_rate": 5.230375040593117e-06, + "loss": 0.7721, + "step": 8665 + }, + { + "epoch": 0.6557451477431804, + "grad_norm": 2.134599208831787, + "learning_rate": 5.228330017743552e-06, + "loss": 0.7349, + "step": 8666 + }, + { + "epoch": 0.6558208164655139, + "grad_norm": 2.397552251815796, + "learning_rate": 5.226285231088161e-06, + "loss": 0.6472, + "step": 8667 + }, + { + "epoch": 0.6558964851878476, + "grad_norm": 2.927499532699585, + "learning_rate": 5.224240680754993e-06, + "loss": 0.5487, + "step": 8668 + }, + { + "epoch": 0.6559721539101813, + "grad_norm": 1.7893126010894775, + "learning_rate": 5.222196366872091e-06, + "loss": 0.6124, + "step": 8669 + }, + { + "epoch": 0.6560478226325148, + "grad_norm": 1.9689534902572632, + "learning_rate": 5.220152289567468e-06, + "loss": 0.7077, + "step": 8670 + }, + { + "epoch": 0.6561234913548485, + "grad_norm": 2.465264081954956, + "learning_rate": 5.218108448969129e-06, + "loss": 0.6206, + "step": 8671 + }, + { + "epoch": 0.656199160077182, + "grad_norm": 2.3276827335357666, + "learning_rate": 5.216064845205075e-06, + "loss": 0.8604, + "step": 8672 + }, + { + "epoch": 0.6562748287995157, + "grad_norm": 3.2658963203430176, + "learning_rate": 5.214021478403283e-06, + "loss": 0.6325, + "step": 8673 + }, + { + "epoch": 0.6563504975218494, + "grad_norm": 2.2174007892608643, + "learning_rate": 5.211978348691708e-06, + "loss": 0.7559, + "step": 8674 + }, + { + "epoch": 0.6564261662441829, + "grad_norm": 2.120908260345459, + "learning_rate": 5.209935456198301e-06, + "loss": 0.7938, + "step": 8675 + }, + { + "epoch": 0.6565018349665166, + "grad_norm": 2.1590099334716797, + "learning_rate": 5.207892801050993e-06, + "loss": 0.6541, + "step": 8676 + }, + { + "epoch": 0.6565775036888503, + "grad_norm": 2.1857635974884033, + "learning_rate": 5.205850383377704e-06, + "loss": 0.6656, + "step": 8677 + }, + { + "epoch": 0.6566531724111838, + "grad_norm": 2.734827995300293, + "learning_rate": 5.2038082033063365e-06, + "loss": 0.7115, + "step": 8678 + }, + { + "epoch": 0.6567288411335175, + "grad_norm": 1.769230604171753, + "learning_rate": 5.201766260964777e-06, + "loss": 0.585, + "step": 8679 + }, + { + "epoch": 0.656804509855851, + "grad_norm": 1.630325198173523, + "learning_rate": 5.199724556480902e-06, + "loss": 0.7843, + "step": 8680 + }, + { + "epoch": 0.6568801785781847, + "grad_norm": 2.335381269454956, + "learning_rate": 5.19768308998256e-06, + "loss": 0.5799, + "step": 8681 + }, + { + "epoch": 0.6569558473005184, + "grad_norm": 2.884042739868164, + "learning_rate": 5.1956418615976054e-06, + "loss": 0.6448, + "step": 8682 + }, + { + "epoch": 0.6570315160228519, + "grad_norm": 2.7924041748046875, + "learning_rate": 5.193600871453866e-06, + "loss": 0.8123, + "step": 8683 + }, + { + "epoch": 0.6571071847451856, + "grad_norm": 2.314347267150879, + "learning_rate": 5.191560119679147e-06, + "loss": 0.7676, + "step": 8684 + }, + { + "epoch": 0.6571828534675191, + "grad_norm": 2.933027744293213, + "learning_rate": 
5.189519606401252e-06, + "loss": 0.7057, + "step": 8685 + }, + { + "epoch": 0.6572585221898528, + "grad_norm": 2.0092270374298096, + "learning_rate": 5.18747933174796e-06, + "loss": 0.7644, + "step": 8686 + }, + { + "epoch": 0.6573341909121865, + "grad_norm": 3.9684855937957764, + "learning_rate": 5.18543929584705e-06, + "loss": 0.6298, + "step": 8687 + }, + { + "epoch": 0.65740985963452, + "grad_norm": 3.053493022918701, + "learning_rate": 5.183399498826266e-06, + "loss": 0.7078, + "step": 8688 + }, + { + "epoch": 0.6574855283568537, + "grad_norm": 2.5908734798431396, + "learning_rate": 5.18135994081335e-06, + "loss": 0.6395, + "step": 8689 + }, + { + "epoch": 0.6575611970791874, + "grad_norm": 2.060353994369507, + "learning_rate": 5.179320621936025e-06, + "loss": 0.5517, + "step": 8690 + }, + { + "epoch": 0.6576368658015209, + "grad_norm": 2.310406446456909, + "learning_rate": 5.177281542322e-06, + "loss": 0.7194, + "step": 8691 + }, + { + "epoch": 0.6577125345238546, + "grad_norm": 2.261384963989258, + "learning_rate": 5.175242702098969e-06, + "loss": 0.5707, + "step": 8692 + }, + { + "epoch": 0.6577882032461881, + "grad_norm": 2.6075448989868164, + "learning_rate": 5.173204101394612e-06, + "loss": 0.6743, + "step": 8693 + }, + { + "epoch": 0.6578638719685218, + "grad_norm": 2.4900269508361816, + "learning_rate": 5.1711657403365935e-06, + "loss": 0.7795, + "step": 8694 + }, + { + "epoch": 0.6579395406908555, + "grad_norm": 3.0863840579986572, + "learning_rate": 5.169127619052558e-06, + "loss": 0.6771, + "step": 8695 + }, + { + "epoch": 0.658015209413189, + "grad_norm": 2.687215566635132, + "learning_rate": 5.167089737670137e-06, + "loss": 0.7066, + "step": 8696 + }, + { + "epoch": 0.6580908781355227, + "grad_norm": 2.019657850265503, + "learning_rate": 5.16505209631696e-06, + "loss": 0.7545, + "step": 8697 + }, + { + "epoch": 0.6581665468578564, + "grad_norm": 2.119903564453125, + "learning_rate": 5.163014695120623e-06, + "loss": 0.8052, + "step": 8698 + }, + { + "epoch": 0.6582422155801899, + "grad_norm": 8.88284683227539, + "learning_rate": 5.160977534208716e-06, + "loss": 0.8043, + "step": 8699 + }, + { + "epoch": 0.6583178843025236, + "grad_norm": 1.7967544794082642, + "learning_rate": 5.158940613708812e-06, + "loss": 0.7364, + "step": 8700 + }, + { + "epoch": 0.6583935530248571, + "grad_norm": 2.3100337982177734, + "learning_rate": 5.15690393374847e-06, + "loss": 0.6598, + "step": 8701 + }, + { + "epoch": 0.6584692217471908, + "grad_norm": 2.662794589996338, + "learning_rate": 5.154867494455234e-06, + "loss": 0.5787, + "step": 8702 + }, + { + "epoch": 0.6585448904695245, + "grad_norm": 2.560645341873169, + "learning_rate": 5.152831295956632e-06, + "loss": 0.6013, + "step": 8703 + }, + { + "epoch": 0.658620559191858, + "grad_norm": 2.0240793228149414, + "learning_rate": 5.150795338380178e-06, + "loss": 0.7068, + "step": 8704 + }, + { + "epoch": 0.6586962279141917, + "grad_norm": 2.2594637870788574, + "learning_rate": 5.1487596218533735e-06, + "loss": 0.656, + "step": 8705 + }, + { + "epoch": 0.6587718966365252, + "grad_norm": 3.0724432468414307, + "learning_rate": 5.146724146503693e-06, + "loss": 0.8438, + "step": 8706 + }, + { + "epoch": 0.6588475653588589, + "grad_norm": 2.630356788635254, + "learning_rate": 5.144688912458607e-06, + "loss": 0.5843, + "step": 8707 + }, + { + "epoch": 0.6589232340811926, + "grad_norm": 2.645566701889038, + "learning_rate": 5.142653919845578e-06, + "loss": 0.6826, + "step": 8708 + }, + { + "epoch": 0.6589989028035261, + "grad_norm": 
2.8515145778656006, + "learning_rate": 5.140619168792033e-06, + "loss": 0.6833, + "step": 8709 + }, + { + "epoch": 0.6590745715258598, + "grad_norm": 3.5823323726654053, + "learning_rate": 5.138584659425398e-06, + "loss": 0.679, + "step": 8710 + }, + { + "epoch": 0.6591502402481935, + "grad_norm": 2.3689839839935303, + "learning_rate": 5.136550391873082e-06, + "loss": 0.6783, + "step": 8711 + }, + { + "epoch": 0.659225908970527, + "grad_norm": 2.3048954010009766, + "learning_rate": 5.134516366262475e-06, + "loss": 0.8017, + "step": 8712 + }, + { + "epoch": 0.6593015776928607, + "grad_norm": 3.267667531967163, + "learning_rate": 5.1324825827209564e-06, + "loss": 0.7462, + "step": 8713 + }, + { + "epoch": 0.6593772464151942, + "grad_norm": 2.637197732925415, + "learning_rate": 5.130449041375887e-06, + "loss": 0.487, + "step": 8714 + }, + { + "epoch": 0.6594529151375279, + "grad_norm": 2.3977911472320557, + "learning_rate": 5.128415742354615e-06, + "loss": 0.6639, + "step": 8715 + }, + { + "epoch": 0.6595285838598616, + "grad_norm": 2.283331871032715, + "learning_rate": 5.126382685784475e-06, + "loss": 0.7063, + "step": 8716 + }, + { + "epoch": 0.6596042525821951, + "grad_norm": 2.826462507247925, + "learning_rate": 5.124349871792772e-06, + "loss": 0.6383, + "step": 8717 + }, + { + "epoch": 0.6596799213045288, + "grad_norm": 3.0163345336914062, + "learning_rate": 5.122317300506819e-06, + "loss": 0.6715, + "step": 8718 + }, + { + "epoch": 0.6597555900268623, + "grad_norm": 2.01212215423584, + "learning_rate": 5.1202849720539035e-06, + "loss": 0.698, + "step": 8719 + }, + { + "epoch": 0.659831258749196, + "grad_norm": 2.5961616039276123, + "learning_rate": 5.118252886561287e-06, + "loss": 0.6204, + "step": 8720 + }, + { + "epoch": 0.6599069274715297, + "grad_norm": 2.244570255279541, + "learning_rate": 5.11622104415623e-06, + "loss": 0.6391, + "step": 8721 + }, + { + "epoch": 0.6599825961938632, + "grad_norm": 2.480987787246704, + "learning_rate": 5.114189444965974e-06, + "loss": 0.869, + "step": 8722 + }, + { + "epoch": 0.6600582649161969, + "grad_norm": 2.770308256149292, + "learning_rate": 5.112158089117742e-06, + "loss": 0.7924, + "step": 8723 + }, + { + "epoch": 0.6601339336385306, + "grad_norm": 2.041288375854492, + "learning_rate": 5.110126976738745e-06, + "loss": 0.715, + "step": 8724 + }, + { + "epoch": 0.6602096023608641, + "grad_norm": 2.9713993072509766, + "learning_rate": 5.108096107956178e-06, + "loss": 0.6904, + "step": 8725 + }, + { + "epoch": 0.6602852710831978, + "grad_norm": 2.7926697731018066, + "learning_rate": 5.106065482897225e-06, + "loss": 0.6128, + "step": 8726 + }, + { + "epoch": 0.6603609398055313, + "grad_norm": 2.216324806213379, + "learning_rate": 5.104035101689038e-06, + "loss": 0.7352, + "step": 8727 + }, + { + "epoch": 0.660436608527865, + "grad_norm": 2.27839732170105, + "learning_rate": 5.1020049644587795e-06, + "loss": 0.6477, + "step": 8728 + }, + { + "epoch": 0.6605122772501987, + "grad_norm": 1.9538133144378662, + "learning_rate": 5.0999750713335745e-06, + "loss": 0.6509, + "step": 8729 + }, + { + "epoch": 0.6605879459725322, + "grad_norm": 2.1575965881347656, + "learning_rate": 5.097945422440551e-06, + "loss": 0.7347, + "step": 8730 + }, + { + "epoch": 0.6606636146948659, + "grad_norm": 2.204331398010254, + "learning_rate": 5.095916017906802e-06, + "loss": 0.7395, + "step": 8731 + }, + { + "epoch": 0.6607392834171995, + "grad_norm": 3.217972993850708, + "learning_rate": 5.093886857859415e-06, + "loss": 0.5874, + "step": 8732 + }, + { + "epoch": 
0.6608149521395331, + "grad_norm": 2.131350040435791, + "learning_rate": 5.0918579424254736e-06, + "loss": 0.7008, + "step": 8733 + }, + { + "epoch": 0.6608906208618668, + "grad_norm": 2.436474323272705, + "learning_rate": 5.089829271732025e-06, + "loss": 0.6264, + "step": 8734 + }, + { + "epoch": 0.6609662895842003, + "grad_norm": 1.924094796180725, + "learning_rate": 5.087800845906116e-06, + "loss": 0.5002, + "step": 8735 + }, + { + "epoch": 0.661041958306534, + "grad_norm": 2.152076005935669, + "learning_rate": 5.085772665074771e-06, + "loss": 0.7399, + "step": 8736 + }, + { + "epoch": 0.6611176270288677, + "grad_norm": 2.1113076210021973, + "learning_rate": 5.083744729365001e-06, + "loss": 0.7238, + "step": 8737 + }, + { + "epoch": 0.6611932957512012, + "grad_norm": 3.3278396129608154, + "learning_rate": 5.081717038903803e-06, + "loss": 0.664, + "step": 8738 + }, + { + "epoch": 0.6612689644735349, + "grad_norm": 2.1845004558563232, + "learning_rate": 5.079689593818156e-06, + "loss": 0.6055, + "step": 8739 + }, + { + "epoch": 0.6613446331958684, + "grad_norm": 2.576305627822876, + "learning_rate": 5.0776623942350324e-06, + "loss": 0.6543, + "step": 8740 + }, + { + "epoch": 0.6614203019182021, + "grad_norm": 2.260627269744873, + "learning_rate": 5.075635440281372e-06, + "loss": 0.7901, + "step": 8741 + }, + { + "epoch": 0.6614959706405358, + "grad_norm": 2.078648805618286, + "learning_rate": 5.073608732084113e-06, + "loss": 0.6235, + "step": 8742 + }, + { + "epoch": 0.6615716393628693, + "grad_norm": 4.44390869140625, + "learning_rate": 5.0715822697701704e-06, + "loss": 0.7602, + "step": 8743 + }, + { + "epoch": 0.661647308085203, + "grad_norm": 2.401336193084717, + "learning_rate": 5.06955605346646e-06, + "loss": 0.5624, + "step": 8744 + }, + { + "epoch": 0.6617229768075366, + "grad_norm": 2.3435049057006836, + "learning_rate": 5.067530083299858e-06, + "loss": 0.6278, + "step": 8745 + }, + { + "epoch": 0.6617986455298702, + "grad_norm": 3.2491648197174072, + "learning_rate": 5.065504359397241e-06, + "loss": 0.6314, + "step": 8746 + }, + { + "epoch": 0.6618743142522039, + "grad_norm": 1.942625641822815, + "learning_rate": 5.063478881885468e-06, + "loss": 0.8968, + "step": 8747 + }, + { + "epoch": 0.6619499829745374, + "grad_norm": 3.2278025150299072, + "learning_rate": 5.0614536508913785e-06, + "loss": 0.6857, + "step": 8748 + }, + { + "epoch": 0.6620256516968711, + "grad_norm": 2.8973255157470703, + "learning_rate": 5.059428666541801e-06, + "loss": 0.7619, + "step": 8749 + }, + { + "epoch": 0.6621013204192048, + "grad_norm": 1.850770354270935, + "learning_rate": 5.057403928963545e-06, + "loss": 0.591, + "step": 8750 + }, + { + "epoch": 0.6621769891415383, + "grad_norm": 1.6158236265182495, + "learning_rate": 5.055379438283411e-06, + "loss": 0.5541, + "step": 8751 + }, + { + "epoch": 0.662252657863872, + "grad_norm": 2.043518304824829, + "learning_rate": 5.053355194628172e-06, + "loss": 0.7462, + "step": 8752 + }, + { + "epoch": 0.6623283265862056, + "grad_norm": 1.9455914497375488, + "learning_rate": 5.05133119812459e-06, + "loss": 0.7079, + "step": 8753 + }, + { + "epoch": 0.6624039953085392, + "grad_norm": 2.781599283218384, + "learning_rate": 5.0493074488994296e-06, + "loss": 0.628, + "step": 8754 + }, + { + "epoch": 0.6624796640308729, + "grad_norm": 1.801193356513977, + "learning_rate": 5.04728394707941e-06, + "loss": 0.6794, + "step": 8755 + }, + { + "epoch": 0.6625553327532064, + "grad_norm": 2.097200632095337, + "learning_rate": 5.045260692791256e-06, + "loss": 0.6512, + "step": 
8756 + }, + { + "epoch": 0.6626310014755401, + "grad_norm": 2.374746322631836, + "learning_rate": 5.0432376861616655e-06, + "loss": 0.7123, + "step": 8757 + }, + { + "epoch": 0.6627066701978737, + "grad_norm": 2.2179994583129883, + "learning_rate": 5.0412149273173305e-06, + "loss": 0.5607, + "step": 8758 + }, + { + "epoch": 0.6627823389202073, + "grad_norm": 1.721039056777954, + "learning_rate": 5.039192416384922e-06, + "loss": 0.7004, + "step": 8759 + }, + { + "epoch": 0.662858007642541, + "grad_norm": 2.0622527599334717, + "learning_rate": 5.037170153491093e-06, + "loss": 0.5236, + "step": 8760 + }, + { + "epoch": 0.6629336763648745, + "grad_norm": 2.1661341190338135, + "learning_rate": 5.035148138762487e-06, + "loss": 0.6125, + "step": 8761 + }, + { + "epoch": 0.6630093450872082, + "grad_norm": 2.070807695388794, + "learning_rate": 5.033126372325733e-06, + "loss": 0.7534, + "step": 8762 + }, + { + "epoch": 0.6630850138095419, + "grad_norm": 1.9332561492919922, + "learning_rate": 5.031104854307428e-06, + "loss": 0.6172, + "step": 8763 + }, + { + "epoch": 0.6631606825318754, + "grad_norm": 2.211865186691284, + "learning_rate": 5.029083584834179e-06, + "loss": 0.8465, + "step": 8764 + }, + { + "epoch": 0.6632363512542091, + "grad_norm": 6.03785514831543, + "learning_rate": 5.027062564032561e-06, + "loss": 0.6893, + "step": 8765 + }, + { + "epoch": 0.6633120199765427, + "grad_norm": 1.8481940031051636, + "learning_rate": 5.025041792029133e-06, + "loss": 0.722, + "step": 8766 + }, + { + "epoch": 0.6633876886988763, + "grad_norm": 1.6104676723480225, + "learning_rate": 5.023021268950444e-06, + "loss": 0.6848, + "step": 8767 + }, + { + "epoch": 0.66346335742121, + "grad_norm": 2.4193434715270996, + "learning_rate": 5.021000994923026e-06, + "loss": 0.7977, + "step": 8768 + }, + { + "epoch": 0.6635390261435435, + "grad_norm": 2.3450634479522705, + "learning_rate": 5.018980970073395e-06, + "loss": 0.6101, + "step": 8769 + }, + { + "epoch": 0.6636146948658772, + "grad_norm": 1.899941325187683, + "learning_rate": 5.016961194528053e-06, + "loss": 0.5525, + "step": 8770 + }, + { + "epoch": 0.6636903635882108, + "grad_norm": 1.9693715572357178, + "learning_rate": 5.014941668413483e-06, + "loss": 0.8298, + "step": 8771 + }, + { + "epoch": 0.6637660323105444, + "grad_norm": 2.609485626220703, + "learning_rate": 5.012922391856156e-06, + "loss": 0.6256, + "step": 8772 + }, + { + "epoch": 0.6638417010328781, + "grad_norm": 2.0286991596221924, + "learning_rate": 5.010903364982523e-06, + "loss": 0.8595, + "step": 8773 + }, + { + "epoch": 0.6639173697552117, + "grad_norm": 1.6877880096435547, + "learning_rate": 5.008884587919025e-06, + "loss": 0.6734, + "step": 8774 + }, + { + "epoch": 0.6639930384775453, + "grad_norm": 2.174236297607422, + "learning_rate": 5.006866060792081e-06, + "loss": 0.6317, + "step": 8775 + }, + { + "epoch": 0.664068707199879, + "grad_norm": 1.8174835443496704, + "learning_rate": 5.004847783728106e-06, + "loss": 0.6284, + "step": 8776 + }, + { + "epoch": 0.6641443759222125, + "grad_norm": 2.0437092781066895, + "learning_rate": 5.002829756853479e-06, + "loss": 0.7169, + "step": 8777 + }, + { + "epoch": 0.6642200446445462, + "grad_norm": 2.2291433811187744, + "learning_rate": 5.000811980294578e-06, + "loss": 0.6002, + "step": 8778 + }, + { + "epoch": 0.6642957133668798, + "grad_norm": 2.1589958667755127, + "learning_rate": 4.998794454177773e-06, + "loss": 0.7183, + "step": 8779 + }, + { + "epoch": 0.6643713820892134, + "grad_norm": 1.941250205039978, + "learning_rate": 
4.996777178629397e-06, + "loss": 0.6259, + "step": 8780 + }, + { + "epoch": 0.6644470508115471, + "grad_norm": 1.7236140966415405, + "learning_rate": 4.994760153775782e-06, + "loss": 0.6895, + "step": 8781 + }, + { + "epoch": 0.6645227195338806, + "grad_norm": 2.2558975219726562, + "learning_rate": 4.992743379743242e-06, + "loss": 0.6669, + "step": 8782 + }, + { + "epoch": 0.6645983882562143, + "grad_norm": 1.9559904336929321, + "learning_rate": 4.990726856658075e-06, + "loss": 0.6926, + "step": 8783 + }, + { + "epoch": 0.6646740569785479, + "grad_norm": 2.2078473567962646, + "learning_rate": 4.988710584646552e-06, + "loss": 0.7465, + "step": 8784 + }, + { + "epoch": 0.6647497257008815, + "grad_norm": 2.152083396911621, + "learning_rate": 4.986694563834951e-06, + "loss": 0.7085, + "step": 8785 + }, + { + "epoch": 0.6648253944232152, + "grad_norm": 2.4791300296783447, + "learning_rate": 4.98467879434952e-06, + "loss": 0.6156, + "step": 8786 + }, + { + "epoch": 0.6649010631455488, + "grad_norm": 2.4543418884277344, + "learning_rate": 4.982663276316487e-06, + "loss": 0.6784, + "step": 8787 + }, + { + "epoch": 0.6649767318678824, + "grad_norm": 9.999984741210938, + "learning_rate": 4.980648009862073e-06, + "loss": 0.6158, + "step": 8788 + }, + { + "epoch": 0.6650524005902161, + "grad_norm": 2.0445075035095215, + "learning_rate": 4.978632995112476e-06, + "loss": 0.765, + "step": 8789 + }, + { + "epoch": 0.6651280693125496, + "grad_norm": 2.165191411972046, + "learning_rate": 4.976618232193895e-06, + "loss": 0.6143, + "step": 8790 + }, + { + "epoch": 0.6652037380348833, + "grad_norm": 1.9150837659835815, + "learning_rate": 4.974603721232492e-06, + "loss": 0.5768, + "step": 8791 + }, + { + "epoch": 0.6652794067572169, + "grad_norm": 2.5317835807800293, + "learning_rate": 4.972589462354423e-06, + "loss": 0.7633, + "step": 8792 + }, + { + "epoch": 0.6653550754795505, + "grad_norm": 2.1504557132720947, + "learning_rate": 4.970575455685826e-06, + "loss": 0.6546, + "step": 8793 + }, + { + "epoch": 0.6654307442018842, + "grad_norm": 1.821834683418274, + "learning_rate": 4.968561701352829e-06, + "loss": 0.7508, + "step": 8794 + }, + { + "epoch": 0.6655064129242177, + "grad_norm": 2.307339668273926, + "learning_rate": 4.966548199481536e-06, + "loss": 0.5994, + "step": 8795 + }, + { + "epoch": 0.6655820816465514, + "grad_norm": 2.067732334136963, + "learning_rate": 4.964534950198041e-06, + "loss": 0.6737, + "step": 8796 + }, + { + "epoch": 0.665657750368885, + "grad_norm": 2.2294864654541016, + "learning_rate": 4.962521953628425e-06, + "loss": 0.6829, + "step": 8797 + }, + { + "epoch": 0.6657334190912186, + "grad_norm": 2.271959066390991, + "learning_rate": 4.960509209898737e-06, + "loss": 0.6612, + "step": 8798 + }, + { + "epoch": 0.6658090878135523, + "grad_norm": 2.435523271560669, + "learning_rate": 4.958496719135024e-06, + "loss": 0.7108, + "step": 8799 + }, + { + "epoch": 0.6658847565358859, + "grad_norm": 2.1555373668670654, + "learning_rate": 4.956484481463328e-06, + "loss": 0.741, + "step": 8800 + }, + { + "epoch": 0.6659604252582195, + "grad_norm": 2.427854061126709, + "learning_rate": 4.954472497009647e-06, + "loss": 0.9206, + "step": 8801 + }, + { + "epoch": 0.6660360939805532, + "grad_norm": 2.376939296722412, + "learning_rate": 4.952460765899982e-06, + "loss": 0.677, + "step": 8802 + }, + { + "epoch": 0.6661117627028867, + "grad_norm": 2.0379579067230225, + "learning_rate": 4.950449288260316e-06, + "loss": 0.6682, + "step": 8803 + }, + { + "epoch": 0.6661874314252204, + "grad_norm": 
1.950374722480774, + "learning_rate": 4.948438064216615e-06, + "loss": 0.5976, + "step": 8804 + }, + { + "epoch": 0.666263100147554, + "grad_norm": 1.9691082239151, + "learning_rate": 4.946427093894825e-06, + "loss": 0.6487, + "step": 8805 + }, + { + "epoch": 0.6663387688698876, + "grad_norm": 2.2391226291656494, + "learning_rate": 4.944416377420881e-06, + "loss": 0.7114, + "step": 8806 + }, + { + "epoch": 0.6664144375922213, + "grad_norm": 1.9541027545928955, + "learning_rate": 4.942405914920701e-06, + "loss": 0.5922, + "step": 8807 + }, + { + "epoch": 0.6664901063145549, + "grad_norm": 2.247073173522949, + "learning_rate": 4.94039570652019e-06, + "loss": 0.7678, + "step": 8808 + }, + { + "epoch": 0.6665657750368885, + "grad_norm": 2.149578094482422, + "learning_rate": 4.938385752345224e-06, + "loss": 0.7262, + "step": 8809 + }, + { + "epoch": 0.6666414437592221, + "grad_norm": 2.3252670764923096, + "learning_rate": 4.936376052521682e-06, + "loss": 0.6556, + "step": 8810 + }, + { + "epoch": 0.6667171124815557, + "grad_norm": 1.9192390441894531, + "learning_rate": 4.934366607175419e-06, + "loss": 0.6206, + "step": 8811 + }, + { + "epoch": 0.6667927812038894, + "grad_norm": 2.2456088066101074, + "learning_rate": 4.932357416432264e-06, + "loss": 0.6447, + "step": 8812 + }, + { + "epoch": 0.666868449926223, + "grad_norm": 2.371447801589966, + "learning_rate": 4.930348480418045e-06, + "loss": 0.6276, + "step": 8813 + }, + { + "epoch": 0.6669441186485566, + "grad_norm": 2.4890329837799072, + "learning_rate": 4.928339799258567e-06, + "loss": 0.5793, + "step": 8814 + }, + { + "epoch": 0.6670197873708903, + "grad_norm": 2.112485885620117, + "learning_rate": 4.926331373079619e-06, + "loss": 0.6851, + "step": 8815 + }, + { + "epoch": 0.6670954560932238, + "grad_norm": 2.055906295776367, + "learning_rate": 4.9243232020069775e-06, + "loss": 0.4433, + "step": 8816 + }, + { + "epoch": 0.6671711248155575, + "grad_norm": 2.641813278198242, + "learning_rate": 4.9223152861664e-06, + "loss": 0.7469, + "step": 8817 + }, + { + "epoch": 0.6672467935378911, + "grad_norm": 2.035452127456665, + "learning_rate": 4.920307625683626e-06, + "loss": 0.5807, + "step": 8818 + }, + { + "epoch": 0.6673224622602247, + "grad_norm": 2.3828279972076416, + "learning_rate": 4.9183002206843894e-06, + "loss": 0.8857, + "step": 8819 + }, + { + "epoch": 0.6673981309825584, + "grad_norm": 2.7702672481536865, + "learning_rate": 4.916293071294386e-06, + "loss": 0.5946, + "step": 8820 + }, + { + "epoch": 0.667473799704892, + "grad_norm": 2.4469079971313477, + "learning_rate": 4.914286177639324e-06, + "loss": 0.6725, + "step": 8821 + }, + { + "epoch": 0.6675494684272256, + "grad_norm": 2.4644572734832764, + "learning_rate": 4.912279539844879e-06, + "loss": 0.7755, + "step": 8822 + }, + { + "epoch": 0.6676251371495592, + "grad_norm": 2.4523849487304688, + "learning_rate": 4.9102731580367075e-06, + "loss": 0.7279, + "step": 8823 + }, + { + "epoch": 0.6677008058718928, + "grad_norm": 2.323460340499878, + "learning_rate": 4.908267032340458e-06, + "loss": 0.594, + "step": 8824 + }, + { + "epoch": 0.6677764745942265, + "grad_norm": 2.83245849609375, + "learning_rate": 4.906261162881761e-06, + "loss": 0.7527, + "step": 8825 + }, + { + "epoch": 0.6678521433165601, + "grad_norm": 3.085604429244995, + "learning_rate": 4.9042555497862314e-06, + "loss": 0.659, + "step": 8826 + }, + { + "epoch": 0.6679278120388937, + "grad_norm": 2.575090169906616, + "learning_rate": 4.902250193179466e-06, + "loss": 0.6034, + "step": 8827 + }, + { + "epoch": 
0.6680034807612274, + "grad_norm": 2.862489700317383, + "learning_rate": 4.900245093187049e-06, + "loss": 0.7112, + "step": 8828 + }, + { + "epoch": 0.668079149483561, + "grad_norm": 2.3818016052246094, + "learning_rate": 4.898240249934546e-06, + "loss": 0.7612, + "step": 8829 + }, + { + "epoch": 0.6681548182058946, + "grad_norm": 2.6290574073791504, + "learning_rate": 4.896235663547498e-06, + "loss": 0.6276, + "step": 8830 + }, + { + "epoch": 0.6682304869282282, + "grad_norm": 1.9389046430587769, + "learning_rate": 4.89423133415145e-06, + "loss": 0.5628, + "step": 8831 + }, + { + "epoch": 0.6683061556505618, + "grad_norm": 2.1501002311706543, + "learning_rate": 4.8922272618719154e-06, + "loss": 0.776, + "step": 8832 + }, + { + "epoch": 0.6683818243728955, + "grad_norm": 1.9380501508712769, + "learning_rate": 4.8902234468344e-06, + "loss": 0.5632, + "step": 8833 + }, + { + "epoch": 0.6684574930952291, + "grad_norm": 1.943785309791565, + "learning_rate": 4.888219889164381e-06, + "loss": 0.6471, + "step": 8834 + }, + { + "epoch": 0.6685331618175627, + "grad_norm": 2.238030433654785, + "learning_rate": 4.886216588987328e-06, + "loss": 0.5461, + "step": 8835 + }, + { + "epoch": 0.6686088305398963, + "grad_norm": 2.7209441661834717, + "learning_rate": 4.884213546428706e-06, + "loss": 0.6858, + "step": 8836 + }, + { + "epoch": 0.66868449926223, + "grad_norm": 1.9361830949783325, + "learning_rate": 4.882210761613938e-06, + "loss": 0.627, + "step": 8837 + }, + { + "epoch": 0.6687601679845636, + "grad_norm": 2.493215799331665, + "learning_rate": 4.880208234668452e-06, + "loss": 0.6585, + "step": 8838 + }, + { + "epoch": 0.6688358367068972, + "grad_norm": 2.2090611457824707, + "learning_rate": 4.878205965717652e-06, + "loss": 0.7604, + "step": 8839 + }, + { + "epoch": 0.6689115054292308, + "grad_norm": 1.8827425241470337, + "learning_rate": 4.8762039548869245e-06, + "loss": 0.6121, + "step": 8840 + }, + { + "epoch": 0.6689871741515645, + "grad_norm": 1.9385136365890503, + "learning_rate": 4.8742022023016445e-06, + "loss": 0.7205, + "step": 8841 + }, + { + "epoch": 0.6690628428738981, + "grad_norm": 2.260593891143799, + "learning_rate": 4.8722007080871675e-06, + "loss": 0.626, + "step": 8842 + }, + { + "epoch": 0.6691385115962317, + "grad_norm": 1.950851321220398, + "learning_rate": 4.870199472368835e-06, + "loss": 0.665, + "step": 8843 + }, + { + "epoch": 0.6692141803185653, + "grad_norm": 3.029724597930908, + "learning_rate": 4.868198495271966e-06, + "loss": 0.7195, + "step": 8844 + }, + { + "epoch": 0.6692898490408989, + "grad_norm": 2.3940155506134033, + "learning_rate": 4.866197776921867e-06, + "loss": 0.5533, + "step": 8845 + }, + { + "epoch": 0.6693655177632326, + "grad_norm": 1.612696647644043, + "learning_rate": 4.864197317443839e-06, + "loss": 0.7753, + "step": 8846 + }, + { + "epoch": 0.6694411864855662, + "grad_norm": 1.8332983255386353, + "learning_rate": 4.8621971169631535e-06, + "loss": 0.6191, + "step": 8847 + }, + { + "epoch": 0.6695168552078998, + "grad_norm": 1.9380171298980713, + "learning_rate": 4.8601971756050645e-06, + "loss": 0.6095, + "step": 8848 + }, + { + "epoch": 0.6695925239302334, + "grad_norm": 2.1490824222564697, + "learning_rate": 4.858197493494819e-06, + "loss": 0.7483, + "step": 8849 + }, + { + "epoch": 0.669668192652567, + "grad_norm": 2.265101909637451, + "learning_rate": 4.8561980707576415e-06, + "loss": 0.6927, + "step": 8850 + }, + { + "epoch": 0.6697438613749007, + "grad_norm": 1.6495442390441895, + "learning_rate": 4.8541989075187446e-06, + "loss": 0.5957, + 
"step": 8851 + }, + { + "epoch": 0.6698195300972343, + "grad_norm": 2.1201844215393066, + "learning_rate": 4.852200003903321e-06, + "loss": 0.662, + "step": 8852 + }, + { + "epoch": 0.6698951988195679, + "grad_norm": 2.0578794479370117, + "learning_rate": 4.850201360036548e-06, + "loss": 0.7122, + "step": 8853 + }, + { + "epoch": 0.6699708675419016, + "grad_norm": 2.0692341327667236, + "learning_rate": 4.848202976043593e-06, + "loss": 0.7108, + "step": 8854 + }, + { + "epoch": 0.6700465362642352, + "grad_norm": 2.5348756313323975, + "learning_rate": 4.846204852049588e-06, + "loss": 0.6537, + "step": 8855 + }, + { + "epoch": 0.6701222049865688, + "grad_norm": 2.185525417327881, + "learning_rate": 4.844206988179674e-06, + "loss": 0.6937, + "step": 8856 + }, + { + "epoch": 0.6701978737089024, + "grad_norm": 2.0344419479370117, + "learning_rate": 4.842209384558962e-06, + "loss": 0.535, + "step": 8857 + }, + { + "epoch": 0.670273542431236, + "grad_norm": 2.430760145187378, + "learning_rate": 4.840212041312545e-06, + "loss": 0.8704, + "step": 8858 + }, + { + "epoch": 0.6703492111535697, + "grad_norm": 2.276468276977539, + "learning_rate": 4.838214958565503e-06, + "loss": 0.8045, + "step": 8859 + }, + { + "epoch": 0.6704248798759033, + "grad_norm": 2.0148627758026123, + "learning_rate": 4.836218136442902e-06, + "loss": 0.6484, + "step": 8860 + }, + { + "epoch": 0.6705005485982369, + "grad_norm": 1.9523584842681885, + "learning_rate": 4.834221575069788e-06, + "loss": 0.7177, + "step": 8861 + }, + { + "epoch": 0.6705762173205705, + "grad_norm": 1.8609076738357544, + "learning_rate": 4.8322252745711925e-06, + "loss": 0.8153, + "step": 8862 + }, + { + "epoch": 0.6706518860429042, + "grad_norm": 2.0499250888824463, + "learning_rate": 4.83022923507213e-06, + "loss": 0.7258, + "step": 8863 + }, + { + "epoch": 0.6707275547652378, + "grad_norm": 2.329328775405884, + "learning_rate": 4.8282334566976e-06, + "loss": 0.7621, + "step": 8864 + }, + { + "epoch": 0.6708032234875714, + "grad_norm": 2.1345584392547607, + "learning_rate": 4.8262379395725885e-06, + "loss": 0.564, + "step": 8865 + }, + { + "epoch": 0.670878892209905, + "grad_norm": 1.9283918142318726, + "learning_rate": 4.82424268382205e-06, + "loss": 0.6522, + "step": 8866 + }, + { + "epoch": 0.6709545609322387, + "grad_norm": 2.036198377609253, + "learning_rate": 4.822247689570943e-06, + "loss": 0.7945, + "step": 8867 + }, + { + "epoch": 0.6710302296545723, + "grad_norm": 2.039332389831543, + "learning_rate": 4.8202529569442015e-06, + "loss": 0.6903, + "step": 8868 + }, + { + "epoch": 0.6711058983769059, + "grad_norm": 2.211557149887085, + "learning_rate": 4.818258486066736e-06, + "loss": 0.6866, + "step": 8869 + }, + { + "epoch": 0.6711815670992395, + "grad_norm": 3.0144009590148926, + "learning_rate": 4.816264277063449e-06, + "loss": 0.6603, + "step": 8870 + }, + { + "epoch": 0.6712572358215732, + "grad_norm": 2.047494649887085, + "learning_rate": 4.814270330059226e-06, + "loss": 0.6691, + "step": 8871 + }, + { + "epoch": 0.6713329045439068, + "grad_norm": 2.4948368072509766, + "learning_rate": 4.812276645178932e-06, + "loss": 0.6757, + "step": 8872 + }, + { + "epoch": 0.6714085732662404, + "grad_norm": 2.7353858947753906, + "learning_rate": 4.8102832225474194e-06, + "loss": 0.6022, + "step": 8873 + }, + { + "epoch": 0.671484241988574, + "grad_norm": 2.2280282974243164, + "learning_rate": 4.8082900622895226e-06, + "loss": 0.585, + "step": 8874 + }, + { + "epoch": 0.6715599107109076, + "grad_norm": 1.8477840423583984, + "learning_rate": 
4.806297164530059e-06, + "loss": 0.8268, + "step": 8875 + }, + { + "epoch": 0.6716355794332413, + "grad_norm": 2.323336601257324, + "learning_rate": 4.804304529393834e-06, + "loss": 0.6874, + "step": 8876 + }, + { + "epoch": 0.6717112481555749, + "grad_norm": 2.1553761959075928, + "learning_rate": 4.8023121570056265e-06, + "loss": 0.6259, + "step": 8877 + }, + { + "epoch": 0.6717869168779085, + "grad_norm": 2.515099287033081, + "learning_rate": 4.800320047490211e-06, + "loss": 0.8223, + "step": 8878 + }, + { + "epoch": 0.6718625856002421, + "grad_norm": 2.2152724266052246, + "learning_rate": 4.798328200972339e-06, + "loss": 0.5351, + "step": 8879 + }, + { + "epoch": 0.6719382543225758, + "grad_norm": 2.307845115661621, + "learning_rate": 4.7963366175767425e-06, + "loss": 0.6745, + "step": 8880 + }, + { + "epoch": 0.6720139230449094, + "grad_norm": 2.3008527755737305, + "learning_rate": 4.79434529742814e-06, + "loss": 0.7065, + "step": 8881 + }, + { + "epoch": 0.672089591767243, + "grad_norm": 2.3067424297332764, + "learning_rate": 4.792354240651245e-06, + "loss": 0.6639, + "step": 8882 + }, + { + "epoch": 0.6721652604895766, + "grad_norm": 2.0410807132720947, + "learning_rate": 4.790363447370733e-06, + "loss": 0.6769, + "step": 8883 + }, + { + "epoch": 0.6722409292119103, + "grad_norm": 1.8702601194381714, + "learning_rate": 4.788372917711276e-06, + "loss": 0.7758, + "step": 8884 + }, + { + "epoch": 0.6723165979342439, + "grad_norm": 1.8784395456314087, + "learning_rate": 4.78638265179753e-06, + "loss": 0.7677, + "step": 8885 + }, + { + "epoch": 0.6723922666565775, + "grad_norm": 2.5597028732299805, + "learning_rate": 4.784392649754131e-06, + "loss": 0.7775, + "step": 8886 + }, + { + "epoch": 0.6724679353789111, + "grad_norm": 2.01203989982605, + "learning_rate": 4.782402911705699e-06, + "loss": 0.8557, + "step": 8887 + }, + { + "epoch": 0.6725436041012447, + "grad_norm": 2.301677942276001, + "learning_rate": 4.780413437776838e-06, + "loss": 0.7891, + "step": 8888 + }, + { + "epoch": 0.6726192728235784, + "grad_norm": 1.7823597192764282, + "learning_rate": 4.778424228092136e-06, + "loss": 0.5104, + "step": 8889 + }, + { + "epoch": 0.672694941545912, + "grad_norm": 1.991434097290039, + "learning_rate": 4.776435282776166e-06, + "loss": 0.7134, + "step": 8890 + }, + { + "epoch": 0.6727706102682456, + "grad_norm": 2.2954578399658203, + "learning_rate": 4.774446601953472e-06, + "loss": 0.7555, + "step": 8891 + }, + { + "epoch": 0.6728462789905792, + "grad_norm": 2.434096097946167, + "learning_rate": 4.772458185748603e-06, + "loss": 0.5947, + "step": 8892 + }, + { + "epoch": 0.6729219477129129, + "grad_norm": 2.205364942550659, + "learning_rate": 4.770470034286079e-06, + "loss": 0.601, + "step": 8893 + }, + { + "epoch": 0.6729976164352465, + "grad_norm": 2.9410433769226074, + "learning_rate": 4.768482147690398e-06, + "loss": 0.8037, + "step": 8894 + }, + { + "epoch": 0.6730732851575801, + "grad_norm": 2.166212320327759, + "learning_rate": 4.766494526086052e-06, + "loss": 0.7444, + "step": 8895 + }, + { + "epoch": 0.6731489538799137, + "grad_norm": 1.7446627616882324, + "learning_rate": 4.76450716959751e-06, + "loss": 0.7428, + "step": 8896 + }, + { + "epoch": 0.6732246226022474, + "grad_norm": 2.243112802505493, + "learning_rate": 4.762520078349229e-06, + "loss": 0.5938, + "step": 8897 + }, + { + "epoch": 0.673300291324581, + "grad_norm": 2.0831973552703857, + "learning_rate": 4.760533252465647e-06, + "loss": 0.683, + "step": 8898 + }, + { + "epoch": 0.6733759600469146, + "grad_norm": 
2.1453609466552734, + "learning_rate": 4.7585466920711845e-06, + "loss": 0.6909, + "step": 8899 + }, + { + "epoch": 0.6734516287692482, + "grad_norm": 2.366060733795166, + "learning_rate": 4.756560397290251e-06, + "loss": 0.7826, + "step": 8900 + }, + { + "epoch": 0.6735272974915818, + "grad_norm": 2.3025095462799072, + "learning_rate": 4.754574368247225e-06, + "loss": 0.6098, + "step": 8901 + }, + { + "epoch": 0.6736029662139155, + "grad_norm": 1.991722822189331, + "learning_rate": 4.752588605066481e-06, + "loss": 0.7002, + "step": 8902 + }, + { + "epoch": 0.6736786349362491, + "grad_norm": 2.3091063499450684, + "learning_rate": 4.75060310787238e-06, + "loss": 0.8689, + "step": 8903 + }, + { + "epoch": 0.6737543036585827, + "grad_norm": 2.582026243209839, + "learning_rate": 4.748617876789259e-06, + "loss": 0.759, + "step": 8904 + }, + { + "epoch": 0.6738299723809164, + "grad_norm": 2.0004804134368896, + "learning_rate": 4.746632911941435e-06, + "loss": 0.7951, + "step": 8905 + }, + { + "epoch": 0.67390564110325, + "grad_norm": 2.1170551776885986, + "learning_rate": 4.744648213453215e-06, + "loss": 0.6839, + "step": 8906 + }, + { + "epoch": 0.6739813098255836, + "grad_norm": 2.757847785949707, + "learning_rate": 4.742663781448887e-06, + "loss": 0.6751, + "step": 8907 + }, + { + "epoch": 0.6740569785479172, + "grad_norm": 1.556501865386963, + "learning_rate": 4.740679616052722e-06, + "loss": 0.7912, + "step": 8908 + }, + { + "epoch": 0.6741326472702508, + "grad_norm": 2.0596907138824463, + "learning_rate": 4.7386957173889775e-06, + "loss": 0.7228, + "step": 8909 + }, + { + "epoch": 0.6742083159925845, + "grad_norm": 2.1540966033935547, + "learning_rate": 4.736712085581889e-06, + "loss": 0.7179, + "step": 8910 + }, + { + "epoch": 0.6742839847149181, + "grad_norm": 2.1931824684143066, + "learning_rate": 4.734728720755683e-06, + "loss": 0.717, + "step": 8911 + }, + { + "epoch": 0.6743596534372517, + "grad_norm": 2.3688266277313232, + "learning_rate": 4.732745623034552e-06, + "loss": 0.6503, + "step": 8912 + }, + { + "epoch": 0.6744353221595853, + "grad_norm": 2.4349288940429688, + "learning_rate": 4.730762792542696e-06, + "loss": 0.5946, + "step": 8913 + }, + { + "epoch": 0.6745109908819189, + "grad_norm": 2.7900352478027344, + "learning_rate": 4.728780229404286e-06, + "loss": 0.6437, + "step": 8914 + }, + { + "epoch": 0.6745866596042526, + "grad_norm": 3.1007180213928223, + "learning_rate": 4.726797933743469e-06, + "loss": 0.7885, + "step": 8915 + }, + { + "epoch": 0.6746623283265862, + "grad_norm": 1.8395260572433472, + "learning_rate": 4.724815905684387e-06, + "loss": 0.5847, + "step": 8916 + }, + { + "epoch": 0.6747379970489198, + "grad_norm": 2.2980258464813232, + "learning_rate": 4.722834145351159e-06, + "loss": 0.6564, + "step": 8917 + }, + { + "epoch": 0.6748136657712535, + "grad_norm": 2.2813050746917725, + "learning_rate": 4.7208526528678934e-06, + "loss": 0.6424, + "step": 8918 + }, + { + "epoch": 0.6748893344935871, + "grad_norm": 3.0813608169555664, + "learning_rate": 4.7188714283586735e-06, + "loss": 0.6765, + "step": 8919 + }, + { + "epoch": 0.6749650032159207, + "grad_norm": 3.065865993499756, + "learning_rate": 4.716890471947572e-06, + "loss": 0.6069, + "step": 8920 + }, + { + "epoch": 0.6750406719382543, + "grad_norm": 2.9707062244415283, + "learning_rate": 4.7149097837586425e-06, + "loss": 0.7634, + "step": 8921 + }, + { + "epoch": 0.6751163406605879, + "grad_norm": 2.9453794956207275, + "learning_rate": 4.712929363915923e-06, + "loss": 0.7505, + "step": 8922 + }, + { + 
"epoch": 0.6751920093829216, + "grad_norm": 1.7816321849822998, + "learning_rate": 4.710949212543431e-06, + "loss": 0.6617, + "step": 8923 + }, + { + "epoch": 0.6752676781052552, + "grad_norm": 1.8292231559753418, + "learning_rate": 4.7089693297651725e-06, + "loss": 0.7435, + "step": 8924 + }, + { + "epoch": 0.6753433468275888, + "grad_norm": 2.4246504306793213, + "learning_rate": 4.706989715705137e-06, + "loss": 0.5491, + "step": 8925 + }, + { + "epoch": 0.6754190155499225, + "grad_norm": 2.1744165420532227, + "learning_rate": 4.705010370487287e-06, + "loss": 0.7401, + "step": 8926 + }, + { + "epoch": 0.6754946842722561, + "grad_norm": 1.9293251037597656, + "learning_rate": 4.703031294235576e-06, + "loss": 0.6611, + "step": 8927 + }, + { + "epoch": 0.6755703529945897, + "grad_norm": 2.0167737007141113, + "learning_rate": 4.701052487073951e-06, + "loss": 0.6623, + "step": 8928 + }, + { + "epoch": 0.6756460217169233, + "grad_norm": 2.362187147140503, + "learning_rate": 4.69907394912632e-06, + "loss": 0.8366, + "step": 8929 + }, + { + "epoch": 0.6757216904392569, + "grad_norm": 2.09653377532959, + "learning_rate": 4.697095680516588e-06, + "loss": 0.757, + "step": 8930 + }, + { + "epoch": 0.6757973591615906, + "grad_norm": 2.5158259868621826, + "learning_rate": 4.695117681368643e-06, + "loss": 0.6652, + "step": 8931 + }, + { + "epoch": 0.6758730278839242, + "grad_norm": 2.2666311264038086, + "learning_rate": 4.693139951806352e-06, + "loss": 0.728, + "step": 8932 + }, + { + "epoch": 0.6759486966062578, + "grad_norm": 2.03359055519104, + "learning_rate": 4.691162491953568e-06, + "loss": 0.772, + "step": 8933 + }, + { + "epoch": 0.6760243653285914, + "grad_norm": 2.2918758392333984, + "learning_rate": 4.689185301934124e-06, + "loss": 0.7318, + "step": 8934 + }, + { + "epoch": 0.676100034050925, + "grad_norm": 2.3821206092834473, + "learning_rate": 4.6872083818718404e-06, + "loss": 0.678, + "step": 8935 + }, + { + "epoch": 0.6761757027732587, + "grad_norm": 1.8658883571624756, + "learning_rate": 4.685231731890521e-06, + "loss": 0.7425, + "step": 8936 + }, + { + "epoch": 0.6762513714955923, + "grad_norm": 1.821655511856079, + "learning_rate": 4.6832553521139415e-06, + "loss": 0.6313, + "step": 8937 + }, + { + "epoch": 0.6763270402179259, + "grad_norm": 2.3317677974700928, + "learning_rate": 4.6812792426658715e-06, + "loss": 0.8466, + "step": 8938 + }, + { + "epoch": 0.6764027089402596, + "grad_norm": 2.308093547821045, + "learning_rate": 4.679303403670069e-06, + "loss": 0.7643, + "step": 8939 + }, + { + "epoch": 0.6764783776625932, + "grad_norm": 2.0288562774658203, + "learning_rate": 4.67732783525026e-06, + "loss": 0.6358, + "step": 8940 + }, + { + "epoch": 0.6765540463849268, + "grad_norm": 2.001481771469116, + "learning_rate": 4.675352537530162e-06, + "loss": 0.5445, + "step": 8941 + }, + { + "epoch": 0.6766297151072604, + "grad_norm": 2.197216749191284, + "learning_rate": 4.673377510633478e-06, + "loss": 0.7168, + "step": 8942 + }, + { + "epoch": 0.676705383829594, + "grad_norm": 3.374070882797241, + "learning_rate": 4.671402754683887e-06, + "loss": 0.7088, + "step": 8943 + }, + { + "epoch": 0.6767810525519277, + "grad_norm": 2.1551625728607178, + "learning_rate": 4.669428269805055e-06, + "loss": 0.7868, + "step": 8944 + }, + { + "epoch": 0.6768567212742613, + "grad_norm": 2.4078245162963867, + "learning_rate": 4.6674540561206336e-06, + "loss": 0.8625, + "step": 8945 + }, + { + "epoch": 0.6769323899965949, + "grad_norm": 2.43843674659729, + "learning_rate": 4.665480113754253e-06, + "loss": 
0.6372, + "step": 8946 + }, + { + "epoch": 0.6770080587189286, + "grad_norm": 6.045653343200684, + "learning_rate": 4.663506442829526e-06, + "loss": 0.7967, + "step": 8947 + }, + { + "epoch": 0.6770837274412621, + "grad_norm": 1.9134116172790527, + "learning_rate": 4.661533043470047e-06, + "loss": 0.675, + "step": 8948 + }, + { + "epoch": 0.6771593961635958, + "grad_norm": 2.1514625549316406, + "learning_rate": 4.659559915799406e-06, + "loss": 0.7456, + "step": 8949 + }, + { + "epoch": 0.6772350648859294, + "grad_norm": 2.0664405822753906, + "learning_rate": 4.657587059941163e-06, + "loss": 0.6689, + "step": 8950 + }, + { + "epoch": 0.677310733608263, + "grad_norm": 2.511876344680786, + "learning_rate": 4.655614476018862e-06, + "loss": 0.7499, + "step": 8951 + }, + { + "epoch": 0.6773864023305967, + "grad_norm": 2.2233481407165527, + "learning_rate": 4.653642164156032e-06, + "loss": 0.695, + "step": 8952 + }, + { + "epoch": 0.6774620710529303, + "grad_norm": 4.162423610687256, + "learning_rate": 4.651670124476189e-06, + "loss": 0.5902, + "step": 8953 + }, + { + "epoch": 0.6775377397752639, + "grad_norm": 2.2912869453430176, + "learning_rate": 4.649698357102826e-06, + "loss": 0.9048, + "step": 8954 + }, + { + "epoch": 0.6776134084975975, + "grad_norm": 2.0150766372680664, + "learning_rate": 4.647726862159423e-06, + "loss": 0.6542, + "step": 8955 + }, + { + "epoch": 0.6776890772199311, + "grad_norm": 2.4221763610839844, + "learning_rate": 4.6457556397694415e-06, + "loss": 0.7111, + "step": 8956 + }, + { + "epoch": 0.6777647459422648, + "grad_norm": 2.3624653816223145, + "learning_rate": 4.643784690056328e-06, + "loss": 0.6624, + "step": 8957 + }, + { + "epoch": 0.6778404146645984, + "grad_norm": 2.5852222442626953, + "learning_rate": 4.641814013143499e-06, + "loss": 0.7025, + "step": 8958 + }, + { + "epoch": 0.677916083386932, + "grad_norm": 2.3841378688812256, + "learning_rate": 4.639843609154379e-06, + "loss": 0.6945, + "step": 8959 + }, + { + "epoch": 0.6779917521092657, + "grad_norm": 2.0488786697387695, + "learning_rate": 4.637873478212354e-06, + "loss": 0.7785, + "step": 8960 + }, + { + "epoch": 0.6780674208315992, + "grad_norm": 2.140420436859131, + "learning_rate": 4.6359036204408e-06, + "loss": 0.5558, + "step": 8961 + }, + { + "epoch": 0.6781430895539329, + "grad_norm": 2.20794939994812, + "learning_rate": 4.633934035963076e-06, + "loss": 0.7389, + "step": 8962 + }, + { + "epoch": 0.6782187582762665, + "grad_norm": 2.0823874473571777, + "learning_rate": 4.631964724902521e-06, + "loss": 0.5781, + "step": 8963 + }, + { + "epoch": 0.6782944269986001, + "grad_norm": 2.717106580734253, + "learning_rate": 4.629995687382469e-06, + "loss": 0.7518, + "step": 8964 + }, + { + "epoch": 0.6783700957209338, + "grad_norm": 1.9206687211990356, + "learning_rate": 4.6280269235262175e-06, + "loss": 0.6779, + "step": 8965 + }, + { + "epoch": 0.6784457644432674, + "grad_norm": 2.1041131019592285, + "learning_rate": 4.626058433457062e-06, + "loss": 0.6477, + "step": 8966 + }, + { + "epoch": 0.678521433165601, + "grad_norm": 1.8171806335449219, + "learning_rate": 4.624090217298274e-06, + "loss": 0.7458, + "step": 8967 + }, + { + "epoch": 0.6785971018879347, + "grad_norm": 2.141724109649658, + "learning_rate": 4.62212227517311e-06, + "loss": 0.599, + "step": 8968 + }, + { + "epoch": 0.6786727706102682, + "grad_norm": 2.140650510787964, + "learning_rate": 4.620154607204809e-06, + "loss": 0.6146, + "step": 8969 + }, + { + "epoch": 0.6787484393326019, + "grad_norm": 1.9559601545333862, + "learning_rate": 
4.618187213516592e-06, + "loss": 0.7644, + "step": 8970 + }, + { + "epoch": 0.6788241080549355, + "grad_norm": 2.512819528579712, + "learning_rate": 4.616220094231669e-06, + "loss": 0.6922, + "step": 8971 + }, + { + "epoch": 0.6788997767772691, + "grad_norm": 1.995936393737793, + "learning_rate": 4.614253249473218e-06, + "loss": 0.4985, + "step": 8972 + }, + { + "epoch": 0.6789754454996028, + "grad_norm": 1.9954118728637695, + "learning_rate": 4.612286679364414e-06, + "loss": 0.6756, + "step": 8973 + }, + { + "epoch": 0.6790511142219363, + "grad_norm": 2.6867809295654297, + "learning_rate": 4.610320384028409e-06, + "loss": 0.6245, + "step": 8974 + }, + { + "epoch": 0.67912678294427, + "grad_norm": 2.2337684631347656, + "learning_rate": 4.60835436358834e-06, + "loss": 0.7234, + "step": 8975 + }, + { + "epoch": 0.6792024516666036, + "grad_norm": 2.338660478591919, + "learning_rate": 4.606388618167325e-06, + "loss": 0.6593, + "step": 8976 + }, + { + "epoch": 0.6792781203889372, + "grad_norm": 2.2352778911590576, + "learning_rate": 4.604423147888467e-06, + "loss": 0.6932, + "step": 8977 + }, + { + "epoch": 0.6793537891112709, + "grad_norm": 1.7522343397140503, + "learning_rate": 4.6024579528748465e-06, + "loss": 0.706, + "step": 8978 + }, + { + "epoch": 0.6794294578336045, + "grad_norm": 2.317509412765503, + "learning_rate": 4.600493033249532e-06, + "loss": 0.6231, + "step": 8979 + }, + { + "epoch": 0.6795051265559381, + "grad_norm": 1.9539676904678345, + "learning_rate": 4.598528389135574e-06, + "loss": 0.7355, + "step": 8980 + }, + { + "epoch": 0.6795807952782718, + "grad_norm": 1.9069515466690063, + "learning_rate": 4.5965640206560055e-06, + "loss": 0.6524, + "step": 8981 + }, + { + "epoch": 0.6796564640006053, + "grad_norm": 2.012300729751587, + "learning_rate": 4.594599927933843e-06, + "loss": 0.6192, + "step": 8982 + }, + { + "epoch": 0.679732132722939, + "grad_norm": 2.2685201168060303, + "learning_rate": 4.59263611109208e-06, + "loss": 0.7487, + "step": 8983 + }, + { + "epoch": 0.6798078014452726, + "grad_norm": 2.114442825317383, + "learning_rate": 4.5906725702536925e-06, + "loss": 0.6905, + "step": 8984 + }, + { + "epoch": 0.6798834701676062, + "grad_norm": 1.9639710187911987, + "learning_rate": 4.588709305541659e-06, + "loss": 0.6025, + "step": 8985 + }, + { + "epoch": 0.6799591388899399, + "grad_norm": 1.5509191751480103, + "learning_rate": 4.586746317078913e-06, + "loss": 0.8064, + "step": 8986 + }, + { + "epoch": 0.6800348076122734, + "grad_norm": 2.1551706790924072, + "learning_rate": 4.584783604988387e-06, + "loss": 0.6756, + "step": 8987 + }, + { + "epoch": 0.6801104763346071, + "grad_norm": 2.119821548461914, + "learning_rate": 4.5828211693929915e-06, + "loss": 0.8007, + "step": 8988 + }, + { + "epoch": 0.6801861450569407, + "grad_norm": 2.0947601795196533, + "learning_rate": 4.580859010415622e-06, + "loss": 0.6009, + "step": 8989 + }, + { + "epoch": 0.6802618137792743, + "grad_norm": 1.9060765504837036, + "learning_rate": 4.5788971281791535e-06, + "loss": 0.6957, + "step": 8990 + }, + { + "epoch": 0.680337482501608, + "grad_norm": 2.3293838500976562, + "learning_rate": 4.576935522806447e-06, + "loss": 0.552, + "step": 8991 + }, + { + "epoch": 0.6804131512239416, + "grad_norm": 2.8745076656341553, + "learning_rate": 4.574974194420344e-06, + "loss": 0.6307, + "step": 8992 + }, + { + "epoch": 0.6804888199462752, + "grad_norm": 2.3362932205200195, + "learning_rate": 4.573013143143672e-06, + "loss": 0.6628, + "step": 8993 + }, + { + "epoch": 0.6805644886686089, + "grad_norm": 
2.5250635147094727, + "learning_rate": 4.5710523690992296e-06, + "loss": 0.6863, + "step": 8994 + }, + { + "epoch": 0.6806401573909424, + "grad_norm": 2.553046226501465, + "learning_rate": 4.569091872409816e-06, + "loss": 0.7694, + "step": 8995 + }, + { + "epoch": 0.6807158261132761, + "grad_norm": 2.729386806488037, + "learning_rate": 4.567131653198204e-06, + "loss": 0.7139, + "step": 8996 + }, + { + "epoch": 0.6807914948356097, + "grad_norm": 1.8493585586547852, + "learning_rate": 4.5651717115871415e-06, + "loss": 0.6534, + "step": 8997 + }, + { + "epoch": 0.6808671635579433, + "grad_norm": 2.3107059001922607, + "learning_rate": 4.563212047699371e-06, + "loss": 0.6176, + "step": 8998 + }, + { + "epoch": 0.680942832280277, + "grad_norm": 1.6899579763412476, + "learning_rate": 4.561252661657613e-06, + "loss": 0.6259, + "step": 8999 + }, + { + "epoch": 0.6810185010026105, + "grad_norm": 2.0711593627929688, + "learning_rate": 4.559293553584569e-06, + "loss": 0.6786, + "step": 9000 + }, + { + "epoch": 0.6810941697249442, + "grad_norm": 1.8845760822296143, + "learning_rate": 4.557334723602927e-06, + "loss": 0.6397, + "step": 9001 + }, + { + "epoch": 0.6811698384472779, + "grad_norm": 2.040178060531616, + "learning_rate": 4.555376171835352e-06, + "loss": 0.5297, + "step": 9002 + }, + { + "epoch": 0.6812455071696114, + "grad_norm": 2.0751795768737793, + "learning_rate": 4.5534178984045e-06, + "loss": 0.6987, + "step": 9003 + }, + { + "epoch": 0.6813211758919451, + "grad_norm": 2.1096200942993164, + "learning_rate": 4.551459903432997e-06, + "loss": 0.7418, + "step": 9004 + }, + { + "epoch": 0.6813968446142787, + "grad_norm": 2.9717164039611816, + "learning_rate": 4.549502187043465e-06, + "loss": 0.6486, + "step": 9005 + }, + { + "epoch": 0.6814725133366123, + "grad_norm": 1.7609609365463257, + "learning_rate": 4.5475447493585004e-06, + "loss": 0.7631, + "step": 9006 + }, + { + "epoch": 0.681548182058946, + "grad_norm": 2.13960599899292, + "learning_rate": 4.545587590500689e-06, + "loss": 0.7899, + "step": 9007 + }, + { + "epoch": 0.6816238507812795, + "grad_norm": 1.9353188276290894, + "learning_rate": 4.543630710592585e-06, + "loss": 0.6638, + "step": 9008 + }, + { + "epoch": 0.6816995195036132, + "grad_norm": 1.8156839609146118, + "learning_rate": 4.5416741097567385e-06, + "loss": 0.7883, + "step": 9009 + }, + { + "epoch": 0.6817751882259468, + "grad_norm": 1.8130958080291748, + "learning_rate": 4.539717788115684e-06, + "loss": 0.618, + "step": 9010 + }, + { + "epoch": 0.6818508569482804, + "grad_norm": 1.71598219871521, + "learning_rate": 4.537761745791925e-06, + "loss": 0.627, + "step": 9011 + }, + { + "epoch": 0.6819265256706141, + "grad_norm": 2.342985153198242, + "learning_rate": 4.535805982907958e-06, + "loss": 0.7325, + "step": 9012 + }, + { + "epoch": 0.6820021943929476, + "grad_norm": 1.825118064880371, + "learning_rate": 4.53385049958626e-06, + "loss": 0.6055, + "step": 9013 + }, + { + "epoch": 0.6820778631152813, + "grad_norm": 1.9702906608581543, + "learning_rate": 4.531895295949292e-06, + "loss": 0.7337, + "step": 9014 + }, + { + "epoch": 0.682153531837615, + "grad_norm": 2.108592987060547, + "learning_rate": 4.529940372119486e-06, + "loss": 0.635, + "step": 9015 + }, + { + "epoch": 0.6822292005599485, + "grad_norm": 1.7487767934799194, + "learning_rate": 4.5279857282192735e-06, + "loss": 0.6604, + "step": 9016 + }, + { + "epoch": 0.6823048692822822, + "grad_norm": 2.037961721420288, + "learning_rate": 4.5260313643710625e-06, + "loss": 0.6136, + "step": 9017 + }, + { + "epoch": 
0.6823805380046158, + "grad_norm": 2.4088294506073, + "learning_rate": 4.524077280697237e-06, + "loss": 0.6478, + "step": 9018 + }, + { + "epoch": 0.6824562067269494, + "grad_norm": 2.065066337585449, + "learning_rate": 4.522123477320167e-06, + "loss": 0.6117, + "step": 9019 + }, + { + "epoch": 0.6825318754492831, + "grad_norm": 3.416294813156128, + "learning_rate": 4.520169954362204e-06, + "loss": 0.76, + "step": 9020 + }, + { + "epoch": 0.6826075441716166, + "grad_norm": 2.648378610610962, + "learning_rate": 4.518216711945697e-06, + "loss": 0.7329, + "step": 9021 + }, + { + "epoch": 0.6826832128939503, + "grad_norm": 2.742499589920044, + "learning_rate": 4.516263750192951e-06, + "loss": 0.8434, + "step": 9022 + }, + { + "epoch": 0.682758881616284, + "grad_norm": 1.814097285270691, + "learning_rate": 4.514311069226272e-06, + "loss": 0.7289, + "step": 9023 + }, + { + "epoch": 0.6828345503386175, + "grad_norm": 1.8923470973968506, + "learning_rate": 4.5123586691679405e-06, + "loss": 0.7187, + "step": 9024 + }, + { + "epoch": 0.6829102190609512, + "grad_norm": 2.0584118366241455, + "learning_rate": 4.510406550140226e-06, + "loss": 0.7942, + "step": 9025 + }, + { + "epoch": 0.6829858877832847, + "grad_norm": 2.10097074508667, + "learning_rate": 4.508454712265373e-06, + "loss": 0.6773, + "step": 9026 + }, + { + "epoch": 0.6830615565056184, + "grad_norm": 2.169400453567505, + "learning_rate": 4.506503155665613e-06, + "loss": 0.492, + "step": 9027 + }, + { + "epoch": 0.6831372252279521, + "grad_norm": 2.034270763397217, + "learning_rate": 4.5045518804631635e-06, + "loss": 0.6998, + "step": 9028 + }, + { + "epoch": 0.6832128939502856, + "grad_norm": 3.989406108856201, + "learning_rate": 4.502600886780212e-06, + "loss": 0.6633, + "step": 9029 + }, + { + "epoch": 0.6832885626726193, + "grad_norm": 2.1138885021209717, + "learning_rate": 4.500650174738935e-06, + "loss": 0.674, + "step": 9030 + }, + { + "epoch": 0.683364231394953, + "grad_norm": 1.778512954711914, + "learning_rate": 4.498699744461504e-06, + "loss": 0.6803, + "step": 9031 + }, + { + "epoch": 0.6834399001172865, + "grad_norm": 2.0274288654327393, + "learning_rate": 4.496749596070052e-06, + "loss": 0.7654, + "step": 9032 + }, + { + "epoch": 0.6835155688396202, + "grad_norm": 1.8345074653625488, + "learning_rate": 4.494799729686703e-06, + "loss": 0.7808, + "step": 9033 + }, + { + "epoch": 0.6835912375619537, + "grad_norm": 1.9967671632766724, + "learning_rate": 4.492850145433567e-06, + "loss": 0.5957, + "step": 9034 + }, + { + "epoch": 0.6836669062842874, + "grad_norm": 2.4879066944122314, + "learning_rate": 4.490900843432734e-06, + "loss": 0.6603, + "step": 9035 + }, + { + "epoch": 0.6837425750066211, + "grad_norm": 1.9220361709594727, + "learning_rate": 4.488951823806274e-06, + "loss": 0.6626, + "step": 9036 + }, + { + "epoch": 0.6838182437289546, + "grad_norm": 2.3953804969787598, + "learning_rate": 4.487003086676241e-06, + "loss": 0.7222, + "step": 9037 + }, + { + "epoch": 0.6838939124512883, + "grad_norm": 4.208496570587158, + "learning_rate": 4.485054632164672e-06, + "loss": 0.6668, + "step": 9038 + }, + { + "epoch": 0.6839695811736218, + "grad_norm": 1.9624886512756348, + "learning_rate": 4.483106460393587e-06, + "loss": 0.6975, + "step": 9039 + }, + { + "epoch": 0.6840452498959555, + "grad_norm": 1.8978896141052246, + "learning_rate": 4.481158571484981e-06, + "loss": 0.5826, + "step": 9040 + }, + { + "epoch": 0.6841209186182892, + "grad_norm": 2.2070109844207764, + "learning_rate": 4.479210965560841e-06, + "loss": 0.743, + "step": 
9041 + }, + { + "epoch": 0.6841965873406227, + "grad_norm": 2.3264880180358887, + "learning_rate": 4.477263642743137e-06, + "loss": 0.7172, + "step": 9042 + }, + { + "epoch": 0.6842722560629564, + "grad_norm": 2.088231086730957, + "learning_rate": 4.475316603153809e-06, + "loss": 0.6427, + "step": 9043 + }, + { + "epoch": 0.68434792478529, + "grad_norm": 2.185490131378174, + "learning_rate": 4.47336984691479e-06, + "loss": 0.7606, + "step": 9044 + }, + { + "epoch": 0.6844235935076236, + "grad_norm": 2.1886022090911865, + "learning_rate": 4.4714233741479914e-06, + "loss": 0.5061, + "step": 9045 + }, + { + "epoch": 0.6844992622299573, + "grad_norm": 2.4206368923187256, + "learning_rate": 4.46947718497531e-06, + "loss": 0.7002, + "step": 9046 + }, + { + "epoch": 0.6845749309522908, + "grad_norm": 2.282135009765625, + "learning_rate": 4.467531279518619e-06, + "loss": 0.7932, + "step": 9047 + }, + { + "epoch": 0.6846505996746245, + "grad_norm": 2.5050301551818848, + "learning_rate": 4.465585657899779e-06, + "loss": 0.5978, + "step": 9048 + }, + { + "epoch": 0.6847262683969582, + "grad_norm": 2.1457459926605225, + "learning_rate": 4.463640320240636e-06, + "loss": 0.6758, + "step": 9049 + }, + { + "epoch": 0.6848019371192917, + "grad_norm": 1.7627110481262207, + "learning_rate": 4.4616952666630036e-06, + "loss": 0.7674, + "step": 9050 + }, + { + "epoch": 0.6848776058416254, + "grad_norm": 2.0747087001800537, + "learning_rate": 4.4597504972886895e-06, + "loss": 0.6957, + "step": 9051 + }, + { + "epoch": 0.6849532745639589, + "grad_norm": 1.8389160633087158, + "learning_rate": 4.457806012239488e-06, + "loss": 0.6881, + "step": 9052 + }, + { + "epoch": 0.6850289432862926, + "grad_norm": 3.18410325050354, + "learning_rate": 4.455861811637168e-06, + "loss": 0.7342, + "step": 9053 + }, + { + "epoch": 0.6851046120086263, + "grad_norm": 1.8247333765029907, + "learning_rate": 4.453917895603476e-06, + "loss": 0.5705, + "step": 9054 + }, + { + "epoch": 0.6851802807309598, + "grad_norm": 1.9525268077850342, + "learning_rate": 4.451974264260148e-06, + "loss": 0.7596, + "step": 9055 + }, + { + "epoch": 0.6852559494532935, + "grad_norm": 1.9142605066299438, + "learning_rate": 4.450030917728903e-06, + "loss": 0.6349, + "step": 9056 + }, + { + "epoch": 0.6853316181756272, + "grad_norm": 1.9247804880142212, + "learning_rate": 4.448087856131438e-06, + "loss": 0.5992, + "step": 9057 + }, + { + "epoch": 0.6854072868979607, + "grad_norm": 2.2341079711914062, + "learning_rate": 4.446145079589434e-06, + "loss": 0.8441, + "step": 9058 + }, + { + "epoch": 0.6854829556202944, + "grad_norm": 2.2383766174316406, + "learning_rate": 4.444202588224554e-06, + "loss": 0.5673, + "step": 9059 + }, + { + "epoch": 0.6855586243426279, + "grad_norm": 1.9901219606399536, + "learning_rate": 4.442260382158447e-06, + "loss": 0.7313, + "step": 9060 + }, + { + "epoch": 0.6856342930649616, + "grad_norm": 2.1193864345550537, + "learning_rate": 4.440318461512729e-06, + "loss": 0.7233, + "step": 9061 + }, + { + "epoch": 0.6857099617872953, + "grad_norm": 2.3127729892730713, + "learning_rate": 4.438376826409021e-06, + "loss": 0.656, + "step": 9062 + }, + { + "epoch": 0.6857856305096288, + "grad_norm": 1.5982202291488647, + "learning_rate": 4.4364354769689125e-06, + "loss": 0.7635, + "step": 9063 + }, + { + "epoch": 0.6858612992319625, + "grad_norm": 1.8591372966766357, + "learning_rate": 4.434494413313972e-06, + "loss": 0.6837, + "step": 9064 + }, + { + "epoch": 0.685936967954296, + "grad_norm": 2.3256266117095947, + "learning_rate": 
4.432553635565758e-06, + "loss": 0.8136, + "step": 9065 + }, + { + "epoch": 0.6860126366766297, + "grad_norm": 1.8938707113265991, + "learning_rate": 4.430613143845805e-06, + "loss": 0.7994, + "step": 9066 + }, + { + "epoch": 0.6860883053989634, + "grad_norm": 1.8198975324630737, + "learning_rate": 4.428672938275642e-06, + "loss": 0.7606, + "step": 9067 + }, + { + "epoch": 0.6861639741212969, + "grad_norm": 2.892153739929199, + "learning_rate": 4.4267330189767624e-06, + "loss": 0.714, + "step": 9068 + }, + { + "epoch": 0.6862396428436306, + "grad_norm": 1.9913108348846436, + "learning_rate": 4.424793386070653e-06, + "loss": 0.5629, + "step": 9069 + }, + { + "epoch": 0.6863153115659643, + "grad_norm": 2.234666347503662, + "learning_rate": 4.4228540396787795e-06, + "loss": 0.786, + "step": 9070 + }, + { + "epoch": 0.6863909802882978, + "grad_norm": 2.110463857650757, + "learning_rate": 4.4209149799225905e-06, + "loss": 0.657, + "step": 9071 + }, + { + "epoch": 0.6864666490106315, + "grad_norm": 2.373729705810547, + "learning_rate": 4.418976206923516e-06, + "loss": 0.672, + "step": 9072 + }, + { + "epoch": 0.686542317732965, + "grad_norm": 1.475502371788025, + "learning_rate": 4.4170377208029684e-06, + "loss": 0.8645, + "step": 9073 + }, + { + "epoch": 0.6866179864552987, + "grad_norm": 2.1476123332977295, + "learning_rate": 4.415099521682345e-06, + "loss": 0.7975, + "step": 9074 + }, + { + "epoch": 0.6866936551776324, + "grad_norm": 2.0400941371917725, + "learning_rate": 4.4131616096830155e-06, + "loss": 0.7912, + "step": 9075 + }, + { + "epoch": 0.6867693238999659, + "grad_norm": 1.941752314567566, + "learning_rate": 4.411223984926338e-06, + "loss": 0.6795, + "step": 9076 + }, + { + "epoch": 0.6868449926222996, + "grad_norm": 2.0431621074676514, + "learning_rate": 4.409286647533664e-06, + "loss": 0.7254, + "step": 9077 + }, + { + "epoch": 0.6869206613446331, + "grad_norm": 1.969773530960083, + "learning_rate": 4.407349597626304e-06, + "loss": 0.7392, + "step": 9078 + }, + { + "epoch": 0.6869963300669668, + "grad_norm": 2.117180109024048, + "learning_rate": 4.4054128353255676e-06, + "loss": 0.7551, + "step": 9079 + }, + { + "epoch": 0.6870719987893005, + "grad_norm": 2.118661403656006, + "learning_rate": 4.403476360752739e-06, + "loss": 0.7525, + "step": 9080 + }, + { + "epoch": 0.687147667511634, + "grad_norm": 1.820186972618103, + "learning_rate": 4.401540174029088e-06, + "loss": 0.5936, + "step": 9081 + }, + { + "epoch": 0.6872233362339677, + "grad_norm": 2.435248851776123, + "learning_rate": 4.399604275275865e-06, + "loss": 0.6549, + "step": 9082 + }, + { + "epoch": 0.6872990049563014, + "grad_norm": 2.5884740352630615, + "learning_rate": 4.397668664614301e-06, + "loss": 0.6127, + "step": 9083 + }, + { + "epoch": 0.6873746736786349, + "grad_norm": 2.3123669624328613, + "learning_rate": 4.395733342165612e-06, + "loss": 0.5723, + "step": 9084 + }, + { + "epoch": 0.6874503424009686, + "grad_norm": 2.0741138458251953, + "learning_rate": 4.393798308050996e-06, + "loss": 0.6145, + "step": 9085 + }, + { + "epoch": 0.6875260111233021, + "grad_norm": 2.0065953731536865, + "learning_rate": 4.3918635623916214e-06, + "loss": 0.7152, + "step": 9086 + }, + { + "epoch": 0.6876016798456358, + "grad_norm": 2.173222064971924, + "learning_rate": 4.389929105308658e-06, + "loss": 0.7504, + "step": 9087 + }, + { + "epoch": 0.6876773485679695, + "grad_norm": 2.4735090732574463, + "learning_rate": 4.3879949369232486e-06, + "loss": 0.68, + "step": 9088 + }, + { + "epoch": 0.687753017290303, + "grad_norm": 
2.0955440998077393, + "learning_rate": 4.38606105735651e-06, + "loss": 0.6141, + "step": 9089 + }, + { + "epoch": 0.6878286860126367, + "grad_norm": 1.9914036989212036, + "learning_rate": 4.3841274667295524e-06, + "loss": 0.6674, + "step": 9090 + }, + { + "epoch": 0.6879043547349702, + "grad_norm": 1.948864221572876, + "learning_rate": 4.3821941651634605e-06, + "loss": 0.5495, + "step": 9091 + }, + { + "epoch": 0.6879800234573039, + "grad_norm": 2.403665542602539, + "learning_rate": 4.380261152779307e-06, + "loss": 0.6433, + "step": 9092 + }, + { + "epoch": 0.6880556921796376, + "grad_norm": 1.9592474699020386, + "learning_rate": 4.378328429698142e-06, + "loss": 0.6297, + "step": 9093 + }, + { + "epoch": 0.6881313609019711, + "grad_norm": 1.8239825963974, + "learning_rate": 4.3763959960409985e-06, + "loss": 0.6765, + "step": 9094 + }, + { + "epoch": 0.6882070296243048, + "grad_norm": 1.8471215963363647, + "learning_rate": 4.3744638519288915e-06, + "loss": 0.7255, + "step": 9095 + }, + { + "epoch": 0.6882826983466385, + "grad_norm": 1.9686150550842285, + "learning_rate": 4.372531997482823e-06, + "loss": 0.6088, + "step": 9096 + }, + { + "epoch": 0.688358367068972, + "grad_norm": 2.405791997909546, + "learning_rate": 4.370600432823762e-06, + "loss": 0.8179, + "step": 9097 + }, + { + "epoch": 0.6884340357913057, + "grad_norm": 2.5450029373168945, + "learning_rate": 4.368669158072678e-06, + "loss": 0.5827, + "step": 9098 + }, + { + "epoch": 0.6885097045136392, + "grad_norm": 2.776323080062866, + "learning_rate": 4.366738173350514e-06, + "loss": 0.7119, + "step": 9099 + }, + { + "epoch": 0.6885853732359729, + "grad_norm": 2.0069100856781006, + "learning_rate": 4.364807478778188e-06, + "loss": 0.7055, + "step": 9100 + }, + { + "epoch": 0.6886610419583066, + "grad_norm": 1.881259799003601, + "learning_rate": 4.362877074476611e-06, + "loss": 0.6521, + "step": 9101 + }, + { + "epoch": 0.6887367106806401, + "grad_norm": 2.377471923828125, + "learning_rate": 4.3609469605666686e-06, + "loss": 0.6983, + "step": 9102 + }, + { + "epoch": 0.6888123794029738, + "grad_norm": 3.351958751678467, + "learning_rate": 4.359017137169231e-06, + "loss": 0.7147, + "step": 9103 + }, + { + "epoch": 0.6888880481253074, + "grad_norm": 2.346928119659424, + "learning_rate": 4.3570876044051525e-06, + "loss": 0.6571, + "step": 9104 + }, + { + "epoch": 0.688963716847641, + "grad_norm": 1.908215880393982, + "learning_rate": 4.355158362395264e-06, + "loss": 0.7263, + "step": 9105 + }, + { + "epoch": 0.6890393855699747, + "grad_norm": 2.136303186416626, + "learning_rate": 4.353229411260387e-06, + "loss": 0.732, + "step": 9106 + }, + { + "epoch": 0.6891150542923082, + "grad_norm": 2.8035433292388916, + "learning_rate": 4.351300751121307e-06, + "loss": 0.6906, + "step": 9107 + }, + { + "epoch": 0.6891907230146419, + "grad_norm": 1.8425335884094238, + "learning_rate": 4.3493723820988125e-06, + "loss": 0.6579, + "step": 9108 + }, + { + "epoch": 0.6892663917369756, + "grad_norm": 1.5369822978973389, + "learning_rate": 4.347444304313661e-06, + "loss": 0.5209, + "step": 9109 + }, + { + "epoch": 0.6893420604593091, + "grad_norm": 1.7258330583572388, + "learning_rate": 4.345516517886599e-06, + "loss": 0.7984, + "step": 9110 + }, + { + "epoch": 0.6894177291816428, + "grad_norm": 2.101191759109497, + "learning_rate": 4.343589022938344e-06, + "loss": 0.6611, + "step": 9111 + }, + { + "epoch": 0.6894933979039763, + "grad_norm": 1.9967455863952637, + "learning_rate": 4.341661819589601e-06, + "loss": 0.7261, + "step": 9112 + }, + { + "epoch": 
0.68956906662631, + "grad_norm": 1.7132256031036377, + "learning_rate": 4.339734907961069e-06, + "loss": 0.6795, + "step": 9113 + }, + { + "epoch": 0.6896447353486437, + "grad_norm": 1.7403509616851807, + "learning_rate": 4.337808288173407e-06, + "loss": 0.6901, + "step": 9114 + }, + { + "epoch": 0.6897204040709772, + "grad_norm": 2.2544641494750977, + "learning_rate": 4.335881960347269e-06, + "loss": 0.5939, + "step": 9115 + }, + { + "epoch": 0.6897960727933109, + "grad_norm": 2.149181842803955, + "learning_rate": 4.333955924603288e-06, + "loss": 0.8534, + "step": 9116 + }, + { + "epoch": 0.6898717415156445, + "grad_norm": 1.9814257621765137, + "learning_rate": 4.332030181062079e-06, + "loss": 0.7261, + "step": 9117 + }, + { + "epoch": 0.6899474102379781, + "grad_norm": 2.3098654747009277, + "learning_rate": 4.3301047298442385e-06, + "loss": 0.7357, + "step": 9118 + }, + { + "epoch": 0.6900230789603118, + "grad_norm": 1.9268317222595215, + "learning_rate": 4.3281795710703436e-06, + "loss": 0.6553, + "step": 9119 + }, + { + "epoch": 0.6900987476826453, + "grad_norm": 1.951817274093628, + "learning_rate": 4.326254704860959e-06, + "loss": 0.7475, + "step": 9120 + }, + { + "epoch": 0.690174416404979, + "grad_norm": 2.312189817428589, + "learning_rate": 4.324330131336617e-06, + "loss": 0.5805, + "step": 9121 + }, + { + "epoch": 0.6902500851273127, + "grad_norm": 1.6239818334579468, + "learning_rate": 4.322405850617842e-06, + "loss": 0.8134, + "step": 9122 + }, + { + "epoch": 0.6903257538496462, + "grad_norm": 2.053333044052124, + "learning_rate": 4.320481862825146e-06, + "loss": 0.6272, + "step": 9123 + }, + { + "epoch": 0.6904014225719799, + "grad_norm": 2.048210859298706, + "learning_rate": 4.318558168079012e-06, + "loss": 0.6803, + "step": 9124 + }, + { + "epoch": 0.6904770912943134, + "grad_norm": 2.4473769664764404, + "learning_rate": 4.316634766499906e-06, + "loss": 0.5872, + "step": 9125 + }, + { + "epoch": 0.6905527600166471, + "grad_norm": 2.2394003868103027, + "learning_rate": 4.314711658208278e-06, + "loss": 0.7076, + "step": 9126 + }, + { + "epoch": 0.6906284287389808, + "grad_norm": 1.7375953197479248, + "learning_rate": 4.31278884332456e-06, + "loss": 0.672, + "step": 9127 + }, + { + "epoch": 0.6907040974613143, + "grad_norm": 1.8307822942733765, + "learning_rate": 4.3108663219691656e-06, + "loss": 0.8224, + "step": 9128 + }, + { + "epoch": 0.690779766183648, + "grad_norm": 1.7547332048416138, + "learning_rate": 4.308944094262488e-06, + "loss": 0.6712, + "step": 9129 + }, + { + "epoch": 0.6908554349059816, + "grad_norm": 2.497670888900757, + "learning_rate": 4.3070221603249036e-06, + "loss": 0.753, + "step": 9130 + }, + { + "epoch": 0.6909311036283152, + "grad_norm": 1.989039421081543, + "learning_rate": 4.305100520276775e-06, + "loss": 0.7124, + "step": 9131 + }, + { + "epoch": 0.6910067723506489, + "grad_norm": 1.7251970767974854, + "learning_rate": 4.303179174238433e-06, + "loss": 0.6056, + "step": 9132 + }, + { + "epoch": 0.6910824410729824, + "grad_norm": 2.441929578781128, + "learning_rate": 4.301258122330198e-06, + "loss": 0.6691, + "step": 9133 + }, + { + "epoch": 0.6911581097953161, + "grad_norm": 2.3875224590301514, + "learning_rate": 4.299337364672385e-06, + "loss": 0.7165, + "step": 9134 + }, + { + "epoch": 0.6912337785176498, + "grad_norm": 1.9688384532928467, + "learning_rate": 4.297416901385267e-06, + "loss": 0.7152, + "step": 9135 + }, + { + "epoch": 0.6913094472399833, + "grad_norm": 2.073638677597046, + "learning_rate": 4.295496732589112e-06, + "loss": 0.6398, + 
"step": 9136 + }, + { + "epoch": 0.691385115962317, + "grad_norm": 2.102508068084717, + "learning_rate": 4.293576858404167e-06, + "loss": 0.6271, + "step": 9137 + }, + { + "epoch": 0.6914607846846506, + "grad_norm": 2.1116816997528076, + "learning_rate": 4.2916572789506625e-06, + "loss": 0.5456, + "step": 9138 + }, + { + "epoch": 0.6915364534069842, + "grad_norm": 2.1573615074157715, + "learning_rate": 4.2897379943488075e-06, + "loss": 0.8615, + "step": 9139 + }, + { + "epoch": 0.6916121221293179, + "grad_norm": 1.9969463348388672, + "learning_rate": 4.2878190047187944e-06, + "loss": 0.6516, + "step": 9140 + }, + { + "epoch": 0.6916877908516514, + "grad_norm": 1.9518414735794067, + "learning_rate": 4.285900310180796e-06, + "loss": 0.7213, + "step": 9141 + }, + { + "epoch": 0.6917634595739851, + "grad_norm": 1.909786581993103, + "learning_rate": 4.283981910854971e-06, + "loss": 0.6096, + "step": 9142 + }, + { + "epoch": 0.6918391282963187, + "grad_norm": 2.240398406982422, + "learning_rate": 4.2820638068614455e-06, + "loss": 0.6252, + "step": 9143 + }, + { + "epoch": 0.6919147970186523, + "grad_norm": 2.16809344291687, + "learning_rate": 4.280145998320347e-06, + "loss": 0.5408, + "step": 9144 + }, + { + "epoch": 0.691990465740986, + "grad_norm": 2.0366976261138916, + "learning_rate": 4.278228485351776e-06, + "loss": 0.757, + "step": 9145 + }, + { + "epoch": 0.6920661344633195, + "grad_norm": 1.8961420059204102, + "learning_rate": 4.276311268075806e-06, + "loss": 0.7155, + "step": 9146 + }, + { + "epoch": 0.6921418031856532, + "grad_norm": 1.6012108325958252, + "learning_rate": 4.274394346612502e-06, + "loss": 0.6259, + "step": 9147 + }, + { + "epoch": 0.6922174719079869, + "grad_norm": 2.297229290008545, + "learning_rate": 4.272477721081908e-06, + "loss": 0.6098, + "step": 9148 + }, + { + "epoch": 0.6922931406303204, + "grad_norm": 1.7989269495010376, + "learning_rate": 4.270561391604051e-06, + "loss": 0.6358, + "step": 9149 + }, + { + "epoch": 0.6923688093526541, + "grad_norm": 2.2176966667175293, + "learning_rate": 4.268645358298935e-06, + "loss": 0.668, + "step": 9150 + }, + { + "epoch": 0.6924444780749877, + "grad_norm": 2.412024974822998, + "learning_rate": 4.266729621286552e-06, + "loss": 0.7053, + "step": 9151 + }, + { + "epoch": 0.6925201467973213, + "grad_norm": 2.2924671173095703, + "learning_rate": 4.2648141806868705e-06, + "loss": 0.6765, + "step": 9152 + }, + { + "epoch": 0.692595815519655, + "grad_norm": 3.1433582305908203, + "learning_rate": 4.262899036619835e-06, + "loss": 0.7652, + "step": 9153 + }, + { + "epoch": 0.6926714842419885, + "grad_norm": 2.3667728900909424, + "learning_rate": 4.2609841892053865e-06, + "loss": 0.7888, + "step": 9154 + }, + { + "epoch": 0.6927471529643222, + "grad_norm": 2.2989847660064697, + "learning_rate": 4.259069638563436e-06, + "loss": 0.8005, + "step": 9155 + }, + { + "epoch": 0.6928228216866558, + "grad_norm": 2.9312691688537598, + "learning_rate": 4.257155384813883e-06, + "loss": 0.6623, + "step": 9156 + }, + { + "epoch": 0.6928984904089894, + "grad_norm": 1.8132741451263428, + "learning_rate": 4.255241428076595e-06, + "loss": 0.7629, + "step": 9157 + }, + { + "epoch": 0.6929741591313231, + "grad_norm": 2.117753505706787, + "learning_rate": 4.253327768471433e-06, + "loss": 0.827, + "step": 9158 + }, + { + "epoch": 0.6930498278536567, + "grad_norm": 2.0779871940612793, + "learning_rate": 4.2514144061182446e-06, + "loss": 0.6522, + "step": 9159 + }, + { + "epoch": 0.6931254965759903, + "grad_norm": 2.123748302459717, + "learning_rate": 
4.249501341136843e-06, + "loss": 0.7253, + "step": 9160 + }, + { + "epoch": 0.693201165298324, + "grad_norm": 2.4232475757598877, + "learning_rate": 4.24758857364703e-06, + "loss": 0.6461, + "step": 9161 + }, + { + "epoch": 0.6932768340206575, + "grad_norm": 2.0096006393432617, + "learning_rate": 4.2456761037685936e-06, + "loss": 0.6071, + "step": 9162 + }, + { + "epoch": 0.6933525027429912, + "grad_norm": 3.09159255027771, + "learning_rate": 4.243763931621296e-06, + "loss": 0.5963, + "step": 9163 + }, + { + "epoch": 0.6934281714653248, + "grad_norm": 2.0967772006988525, + "learning_rate": 4.241852057324885e-06, + "loss": 0.6053, + "step": 9164 + }, + { + "epoch": 0.6935038401876584, + "grad_norm": 2.970052719116211, + "learning_rate": 4.239940480999087e-06, + "loss": 0.6578, + "step": 9165 + }, + { + "epoch": 0.6935795089099921, + "grad_norm": 2.1014604568481445, + "learning_rate": 4.238029202763617e-06, + "loss": 0.5055, + "step": 9166 + }, + { + "epoch": 0.6936551776323256, + "grad_norm": 2.0818746089935303, + "learning_rate": 4.2361182227381556e-06, + "loss": 0.7098, + "step": 9167 + }, + { + "epoch": 0.6937308463546593, + "grad_norm": 2.38274884223938, + "learning_rate": 4.23420754104238e-06, + "loss": 0.6155, + "step": 9168 + }, + { + "epoch": 0.693806515076993, + "grad_norm": 1.9527738094329834, + "learning_rate": 4.232297157795939e-06, + "loss": 0.6962, + "step": 9169 + }, + { + "epoch": 0.6938821837993265, + "grad_norm": 2.1649208068847656, + "learning_rate": 4.230387073118477e-06, + "loss": 0.7543, + "step": 9170 + }, + { + "epoch": 0.6939578525216602, + "grad_norm": 3.0182876586914062, + "learning_rate": 4.228477287129601e-06, + "loss": 0.9058, + "step": 9171 + }, + { + "epoch": 0.6940335212439938, + "grad_norm": 2.083481788635254, + "learning_rate": 4.226567799948909e-06, + "loss": 0.6472, + "step": 9172 + }, + { + "epoch": 0.6941091899663274, + "grad_norm": 2.3242437839508057, + "learning_rate": 4.224658611695981e-06, + "loss": 0.632, + "step": 9173 + }, + { + "epoch": 0.6941848586886611, + "grad_norm": 2.4421794414520264, + "learning_rate": 4.222749722490377e-06, + "loss": 0.6659, + "step": 9174 + }, + { + "epoch": 0.6942605274109946, + "grad_norm": 2.1332194805145264, + "learning_rate": 4.220841132451636e-06, + "loss": 0.6021, + "step": 9175 + }, + { + "epoch": 0.6943361961333283, + "grad_norm": 2.1453001499176025, + "learning_rate": 4.218932841699281e-06, + "loss": 0.7119, + "step": 9176 + }, + { + "epoch": 0.6944118648556619, + "grad_norm": 2.6229517459869385, + "learning_rate": 4.2170248503528195e-06, + "loss": 0.6428, + "step": 9177 + }, + { + "epoch": 0.6944875335779955, + "grad_norm": 1.7247917652130127, + "learning_rate": 4.215117158531727e-06, + "loss": 0.7384, + "step": 9178 + }, + { + "epoch": 0.6945632023003292, + "grad_norm": 1.925907850265503, + "learning_rate": 4.213209766355471e-06, + "loss": 0.6112, + "step": 9179 + }, + { + "epoch": 0.6946388710226628, + "grad_norm": 2.0717689990997314, + "learning_rate": 4.21130267394351e-06, + "loss": 0.7033, + "step": 9180 + }, + { + "epoch": 0.6947145397449964, + "grad_norm": 1.7757562398910522, + "learning_rate": 4.209395881415259e-06, + "loss": 0.5708, + "step": 9181 + }, + { + "epoch": 0.6947902084673301, + "grad_norm": 2.3035130500793457, + "learning_rate": 4.207489388890133e-06, + "loss": 0.6394, + "step": 9182 + }, + { + "epoch": 0.6948658771896636, + "grad_norm": 2.059298515319824, + "learning_rate": 4.205583196487524e-06, + "loss": 0.7353, + "step": 9183 + }, + { + "epoch": 0.6949415459119973, + "grad_norm": 
1.9274907112121582, + "learning_rate": 4.2036773043268005e-06, + "loss": 0.7174, + "step": 9184 + }, + { + "epoch": 0.6950172146343309, + "grad_norm": 2.086277961730957, + "learning_rate": 4.201771712527318e-06, + "loss": 0.6836, + "step": 9185 + }, + { + "epoch": 0.6950928833566645, + "grad_norm": 2.223573684692383, + "learning_rate": 4.19986642120841e-06, + "loss": 0.6575, + "step": 9186 + }, + { + "epoch": 0.6951685520789982, + "grad_norm": 3.5324819087982178, + "learning_rate": 4.197961430489393e-06, + "loss": 0.7639, + "step": 9187 + }, + { + "epoch": 0.6952442208013317, + "grad_norm": 3.183359146118164, + "learning_rate": 4.1960567404895644e-06, + "loss": 0.6029, + "step": 9188 + }, + { + "epoch": 0.6953198895236654, + "grad_norm": 1.7968335151672363, + "learning_rate": 4.194152351328196e-06, + "loss": 0.8075, + "step": 9189 + }, + { + "epoch": 0.695395558245999, + "grad_norm": 1.813084602355957, + "learning_rate": 4.192248263124554e-06, + "loss": 0.7122, + "step": 9190 + }, + { + "epoch": 0.6954712269683326, + "grad_norm": 2.4125254154205322, + "learning_rate": 4.1903444759978785e-06, + "loss": 0.7781, + "step": 9191 + }, + { + "epoch": 0.6955468956906663, + "grad_norm": 2.2479751110076904, + "learning_rate": 4.188440990067385e-06, + "loss": 0.671, + "step": 9192 + }, + { + "epoch": 0.6956225644129999, + "grad_norm": 2.125385284423828, + "learning_rate": 4.18653780545228e-06, + "loss": 0.6195, + "step": 9193 + }, + { + "epoch": 0.6956982331353335, + "grad_norm": 2.0010180473327637, + "learning_rate": 4.184634922271746e-06, + "loss": 0.712, + "step": 9194 + }, + { + "epoch": 0.6957739018576672, + "grad_norm": 2.053400754928589, + "learning_rate": 4.182732340644948e-06, + "loss": 0.538, + "step": 9195 + }, + { + "epoch": 0.6958495705800007, + "grad_norm": 2.342834949493408, + "learning_rate": 4.180830060691031e-06, + "loss": 0.6268, + "step": 9196 + }, + { + "epoch": 0.6959252393023344, + "grad_norm": 2.2274081707000732, + "learning_rate": 4.178928082529123e-06, + "loss": 0.7135, + "step": 9197 + }, + { + "epoch": 0.696000908024668, + "grad_norm": 2.246617317199707, + "learning_rate": 4.177026406278332e-06, + "loss": 0.684, + "step": 9198 + }, + { + "epoch": 0.6960765767470016, + "grad_norm": 2.258347988128662, + "learning_rate": 4.1751250320577475e-06, + "loss": 0.7247, + "step": 9199 + }, + { + "epoch": 0.6961522454693353, + "grad_norm": 1.881675362586975, + "learning_rate": 4.173223959986437e-06, + "loss": 0.733, + "step": 9200 + }, + { + "epoch": 0.6962279141916689, + "grad_norm": 3.7385098934173584, + "learning_rate": 4.171323190183455e-06, + "loss": 0.585, + "step": 9201 + }, + { + "epoch": 0.6963035829140025, + "grad_norm": 2.185427665710449, + "learning_rate": 4.1694227227678365e-06, + "loss": 0.6836, + "step": 9202 + }, + { + "epoch": 0.6963792516363361, + "grad_norm": 2.1207618713378906, + "learning_rate": 4.167522557858588e-06, + "loss": 0.6311, + "step": 9203 + }, + { + "epoch": 0.6964549203586697, + "grad_norm": 1.9281233549118042, + "learning_rate": 4.165622695574704e-06, + "loss": 0.8161, + "step": 9204 + }, + { + "epoch": 0.6965305890810034, + "grad_norm": 2.0313522815704346, + "learning_rate": 4.163723136035168e-06, + "loss": 0.5862, + "step": 9205 + }, + { + "epoch": 0.696606257803337, + "grad_norm": 2.408750534057617, + "learning_rate": 4.161823879358929e-06, + "loss": 0.6848, + "step": 9206 + }, + { + "epoch": 0.6966819265256706, + "grad_norm": 1.5770151615142822, + "learning_rate": 4.159924925664927e-06, + "loss": 0.5911, + "step": 9207 + }, + { + "epoch": 
0.6967575952480043, + "grad_norm": 2.710477113723755, + "learning_rate": 4.158026275072082e-06, + "loss": 0.7804, + "step": 9208 + }, + { + "epoch": 0.6968332639703378, + "grad_norm": 1.9035274982452393, + "learning_rate": 4.156127927699294e-06, + "loss": 0.6249, + "step": 9209 + }, + { + "epoch": 0.6969089326926715, + "grad_norm": 1.7576590776443481, + "learning_rate": 4.154229883665437e-06, + "loss": 0.5942, + "step": 9210 + }, + { + "epoch": 0.6969846014150051, + "grad_norm": 1.8701988458633423, + "learning_rate": 4.152332143089381e-06, + "loss": 0.601, + "step": 9211 + }, + { + "epoch": 0.6970602701373387, + "grad_norm": 2.1016006469726562, + "learning_rate": 4.150434706089965e-06, + "loss": 0.643, + "step": 9212 + }, + { + "epoch": 0.6971359388596724, + "grad_norm": 2.1549484729766846, + "learning_rate": 4.148537572786016e-06, + "loss": 0.734, + "step": 9213 + }, + { + "epoch": 0.697211607582006, + "grad_norm": 2.0905442237854004, + "learning_rate": 4.146640743296333e-06, + "loss": 0.6454, + "step": 9214 + }, + { + "epoch": 0.6972872763043396, + "grad_norm": 1.8356784582138062, + "learning_rate": 4.144744217739701e-06, + "loss": 0.6778, + "step": 9215 + }, + { + "epoch": 0.6973629450266732, + "grad_norm": 2.0387625694274902, + "learning_rate": 4.142847996234896e-06, + "loss": 0.6613, + "step": 9216 + }, + { + "epoch": 0.6974386137490068, + "grad_norm": 1.5910100936889648, + "learning_rate": 4.140952078900658e-06, + "loss": 0.5748, + "step": 9217 + }, + { + "epoch": 0.6975142824713405, + "grad_norm": 1.9364255666732788, + "learning_rate": 4.139056465855714e-06, + "loss": 0.6595, + "step": 9218 + }, + { + "epoch": 0.6975899511936741, + "grad_norm": 1.8187669515609741, + "learning_rate": 4.137161157218779e-06, + "loss": 0.5342, + "step": 9219 + }, + { + "epoch": 0.6976656199160077, + "grad_norm": 1.753604769706726, + "learning_rate": 4.135266153108539e-06, + "loss": 0.6342, + "step": 9220 + }, + { + "epoch": 0.6977412886383414, + "grad_norm": 2.2299137115478516, + "learning_rate": 4.133371453643668e-06, + "loss": 0.6365, + "step": 9221 + }, + { + "epoch": 0.697816957360675, + "grad_norm": 2.1055357456207275, + "learning_rate": 4.131477058942816e-06, + "loss": 0.8319, + "step": 9222 + }, + { + "epoch": 0.6978926260830086, + "grad_norm": 3.087947368621826, + "learning_rate": 4.12958296912462e-06, + "loss": 0.6226, + "step": 9223 + }, + { + "epoch": 0.6979682948053422, + "grad_norm": 1.8325108289718628, + "learning_rate": 4.127689184307691e-06, + "loss": 0.6056, + "step": 9224 + }, + { + "epoch": 0.6980439635276758, + "grad_norm": 2.2946414947509766, + "learning_rate": 4.1257957046106185e-06, + "loss": 0.6827, + "step": 9225 + }, + { + "epoch": 0.6981196322500095, + "grad_norm": 2.5224714279174805, + "learning_rate": 4.1239025301519875e-06, + "loss": 0.7167, + "step": 9226 + }, + { + "epoch": 0.6981953009723431, + "grad_norm": 1.8570104837417603, + "learning_rate": 4.122009661050355e-06, + "loss": 0.6781, + "step": 9227 + }, + { + "epoch": 0.6982709696946767, + "grad_norm": 2.250521183013916, + "learning_rate": 4.120117097424252e-06, + "loss": 0.6597, + "step": 9228 + }, + { + "epoch": 0.6983466384170103, + "grad_norm": 2.871605157852173, + "learning_rate": 4.118224839392199e-06, + "loss": 0.6728, + "step": 9229 + }, + { + "epoch": 0.698422307139344, + "grad_norm": 2.111581325531006, + "learning_rate": 4.116332887072697e-06, + "loss": 0.7846, + "step": 9230 + }, + { + "epoch": 0.6984979758616776, + "grad_norm": 2.0724074840545654, + "learning_rate": 4.1144412405842245e-06, + "loss": 0.6152, 
+ "step": 9231 + }, + { + "epoch": 0.6985736445840112, + "grad_norm": 1.961753487586975, + "learning_rate": 4.112549900045244e-06, + "loss": 0.7844, + "step": 9232 + }, + { + "epoch": 0.6986493133063448, + "grad_norm": 1.6883488893508911, + "learning_rate": 4.1106588655741965e-06, + "loss": 0.7343, + "step": 9233 + }, + { + "epoch": 0.6987249820286785, + "grad_norm": 1.871146559715271, + "learning_rate": 4.108768137289507e-06, + "loss": 0.5357, + "step": 9234 + }, + { + "epoch": 0.698800650751012, + "grad_norm": 2.21620774269104, + "learning_rate": 4.106877715309572e-06, + "loss": 0.6735, + "step": 9235 + }, + { + "epoch": 0.6988763194733457, + "grad_norm": 1.7150919437408447, + "learning_rate": 4.104987599752783e-06, + "loss": 0.5634, + "step": 9236 + }, + { + "epoch": 0.6989519881956793, + "grad_norm": 2.08954119682312, + "learning_rate": 4.103097790737507e-06, + "loss": 0.6117, + "step": 9237 + }, + { + "epoch": 0.6990276569180129, + "grad_norm": 2.288553237915039, + "learning_rate": 4.101208288382082e-06, + "loss": 0.7672, + "step": 9238 + }, + { + "epoch": 0.6991033256403466, + "grad_norm": 1.9526705741882324, + "learning_rate": 4.099319092804839e-06, + "loss": 0.7364, + "step": 9239 + }, + { + "epoch": 0.6991789943626802, + "grad_norm": 2.6329140663146973, + "learning_rate": 4.097430204124082e-06, + "loss": 0.7055, + "step": 9240 + }, + { + "epoch": 0.6992546630850138, + "grad_norm": 2.0371086597442627, + "learning_rate": 4.095541622458108e-06, + "loss": 0.6239, + "step": 9241 + }, + { + "epoch": 0.6993303318073474, + "grad_norm": 1.7768152952194214, + "learning_rate": 4.093653347925178e-06, + "loss": 0.651, + "step": 9242 + }, + { + "epoch": 0.699406000529681, + "grad_norm": 1.7975434064865112, + "learning_rate": 4.091765380643544e-06, + "loss": 0.6697, + "step": 9243 + }, + { + "epoch": 0.6994816692520147, + "grad_norm": 1.8695652484893799, + "learning_rate": 4.089877720731438e-06, + "loss": 0.768, + "step": 9244 + }, + { + "epoch": 0.6995573379743483, + "grad_norm": 2.318563222885132, + "learning_rate": 4.087990368307073e-06, + "loss": 0.6121, + "step": 9245 + }, + { + "epoch": 0.6996330066966819, + "grad_norm": 1.9575533866882324, + "learning_rate": 4.0861033234886305e-06, + "loss": 0.7827, + "step": 9246 + }, + { + "epoch": 0.6997086754190156, + "grad_norm": 2.9256958961486816, + "learning_rate": 4.084216586394297e-06, + "loss": 0.5644, + "step": 9247 + }, + { + "epoch": 0.6997843441413492, + "grad_norm": 2.3860838413238525, + "learning_rate": 4.082330157142222e-06, + "loss": 0.8408, + "step": 9248 + }, + { + "epoch": 0.6998600128636828, + "grad_norm": 1.9278440475463867, + "learning_rate": 4.080444035850536e-06, + "loss": 0.6105, + "step": 9249 + }, + { + "epoch": 0.6999356815860164, + "grad_norm": 2.849597215652466, + "learning_rate": 4.0785582226373555e-06, + "loss": 0.6557, + "step": 9250 + }, + { + "epoch": 0.70001135030835, + "grad_norm": 2.0541129112243652, + "learning_rate": 4.076672717620778e-06, + "loss": 0.6789, + "step": 9251 + }, + { + "epoch": 0.7000870190306837, + "grad_norm": 1.8564491271972656, + "learning_rate": 4.074787520918878e-06, + "loss": 0.6847, + "step": 9252 + }, + { + "epoch": 0.7001626877530173, + "grad_norm": 1.949328899383545, + "learning_rate": 4.072902632649714e-06, + "loss": 0.7276, + "step": 9253 + }, + { + "epoch": 0.7002383564753509, + "grad_norm": 1.7822113037109375, + "learning_rate": 4.071018052931323e-06, + "loss": 0.698, + "step": 9254 + }, + { + "epoch": 0.7003140251976845, + "grad_norm": 2.231820821762085, + "learning_rate": 
4.069133781881727e-06, + "loss": 0.601, + "step": 9255 + }, + { + "epoch": 0.7003896939200182, + "grad_norm": 2.3078548908233643, + "learning_rate": 4.067249819618916e-06, + "loss": 0.646, + "step": 9256 + }, + { + "epoch": 0.7004653626423518, + "grad_norm": 2.0806334018707275, + "learning_rate": 4.065366166260878e-06, + "loss": 0.6797, + "step": 9257 + }, + { + "epoch": 0.7005410313646854, + "grad_norm": 1.993569254875183, + "learning_rate": 4.063482821925572e-06, + "loss": 0.6611, + "step": 9258 + }, + { + "epoch": 0.700616700087019, + "grad_norm": 2.1819117069244385, + "learning_rate": 4.061599786730941e-06, + "loss": 0.6336, + "step": 9259 + }, + { + "epoch": 0.7006923688093527, + "grad_norm": 2.2110774517059326, + "learning_rate": 4.059717060794902e-06, + "loss": 0.5346, + "step": 9260 + }, + { + "epoch": 0.7007680375316863, + "grad_norm": 2.1333065032958984, + "learning_rate": 4.057834644235355e-06, + "loss": 0.7319, + "step": 9261 + }, + { + "epoch": 0.7008437062540199, + "grad_norm": 1.8538813591003418, + "learning_rate": 4.055952537170195e-06, + "loss": 0.6036, + "step": 9262 + }, + { + "epoch": 0.7009193749763535, + "grad_norm": 2.097684144973755, + "learning_rate": 4.054070739717274e-06, + "loss": 0.7522, + "step": 9263 + }, + { + "epoch": 0.7009950436986871, + "grad_norm": 1.6460198163986206, + "learning_rate": 4.05218925199444e-06, + "loss": 0.533, + "step": 9264 + }, + { + "epoch": 0.7010707124210208, + "grad_norm": 4.016617774963379, + "learning_rate": 4.05030807411952e-06, + "loss": 0.5638, + "step": 9265 + }, + { + "epoch": 0.7011463811433544, + "grad_norm": 1.8979525566101074, + "learning_rate": 4.048427206210316e-06, + "loss": 0.6731, + "step": 9266 + }, + { + "epoch": 0.701222049865688, + "grad_norm": 2.001255989074707, + "learning_rate": 4.046546648384616e-06, + "loss": 0.5913, + "step": 9267 + }, + { + "epoch": 0.7012977185880216, + "grad_norm": 2.3916757106781006, + "learning_rate": 4.044666400760186e-06, + "loss": 0.5653, + "step": 9268 + }, + { + "epoch": 0.7013733873103553, + "grad_norm": 1.7634689807891846, + "learning_rate": 4.042786463454778e-06, + "loss": 0.6478, + "step": 9269 + }, + { + "epoch": 0.7014490560326889, + "grad_norm": 2.492938280105591, + "learning_rate": 4.04090683658611e-06, + "loss": 0.6283, + "step": 9270 + }, + { + "epoch": 0.7015247247550225, + "grad_norm": 1.7969627380371094, + "learning_rate": 4.039027520271894e-06, + "loss": 0.6721, + "step": 9271 + }, + { + "epoch": 0.7016003934773561, + "grad_norm": 1.9584459066390991, + "learning_rate": 4.037148514629823e-06, + "loss": 0.5983, + "step": 9272 + }, + { + "epoch": 0.7016760621996898, + "grad_norm": 2.0558922290802, + "learning_rate": 4.035269819777567e-06, + "loss": 0.7428, + "step": 9273 + }, + { + "epoch": 0.7017517309220234, + "grad_norm": 1.8677301406860352, + "learning_rate": 4.03339143583277e-06, + "loss": 0.8003, + "step": 9274 + }, + { + "epoch": 0.701827399644357, + "grad_norm": 2.25616192817688, + "learning_rate": 4.0315133629130645e-06, + "loss": 0.6491, + "step": 9275 + }, + { + "epoch": 0.7019030683666906, + "grad_norm": 2.1559536457061768, + "learning_rate": 4.029635601136063e-06, + "loss": 0.6757, + "step": 9276 + }, + { + "epoch": 0.7019787370890243, + "grad_norm": 1.8619732856750488, + "learning_rate": 4.027758150619356e-06, + "loss": 0.5132, + "step": 9277 + }, + { + "epoch": 0.7020544058113579, + "grad_norm": 1.7390613555908203, + "learning_rate": 4.0258810114805156e-06, + "loss": 0.6006, + "step": 9278 + }, + { + "epoch": 0.7021300745336915, + "grad_norm": 
2.2929675579071045, + "learning_rate": 4.024004183837095e-06, + "loss": 0.605, + "step": 9279 + }, + { + "epoch": 0.7022057432560251, + "grad_norm": 2.116135835647583, + "learning_rate": 4.022127667806629e-06, + "loss": 0.8126, + "step": 9280 + }, + { + "epoch": 0.7022814119783587, + "grad_norm": 2.891150951385498, + "learning_rate": 4.020251463506623e-06, + "loss": 0.7529, + "step": 9281 + }, + { + "epoch": 0.7023570807006924, + "grad_norm": 2.0369067192077637, + "learning_rate": 4.01837557105458e-06, + "loss": 0.7471, + "step": 9282 + }, + { + "epoch": 0.702432749423026, + "grad_norm": 2.0991392135620117, + "learning_rate": 4.016499990567975e-06, + "loss": 0.8235, + "step": 9283 + }, + { + "epoch": 0.7025084181453596, + "grad_norm": 2.3228790760040283, + "learning_rate": 4.014624722164255e-06, + "loss": 0.5768, + "step": 9284 + }, + { + "epoch": 0.7025840868676932, + "grad_norm": 2.1848552227020264, + "learning_rate": 4.01274976596086e-06, + "loss": 0.6158, + "step": 9285 + }, + { + "epoch": 0.7026597555900269, + "grad_norm": 2.320488452911377, + "learning_rate": 4.0108751220752065e-06, + "loss": 0.7085, + "step": 9286 + }, + { + "epoch": 0.7027354243123605, + "grad_norm": 3.8415627479553223, + "learning_rate": 4.009000790624687e-06, + "loss": 0.7112, + "step": 9287 + }, + { + "epoch": 0.7028110930346941, + "grad_norm": 2.0608065128326416, + "learning_rate": 4.007126771726684e-06, + "loss": 0.6086, + "step": 9288 + }, + { + "epoch": 0.7028867617570277, + "grad_norm": 1.4849302768707275, + "learning_rate": 4.005253065498549e-06, + "loss": 0.6349, + "step": 9289 + }, + { + "epoch": 0.7029624304793614, + "grad_norm": 3.0800068378448486, + "learning_rate": 4.003379672057622e-06, + "loss": 0.7289, + "step": 9290 + }, + { + "epoch": 0.703038099201695, + "grad_norm": 2.145936965942383, + "learning_rate": 4.001506591521226e-06, + "loss": 0.7398, + "step": 9291 + }, + { + "epoch": 0.7031137679240286, + "grad_norm": 1.8864727020263672, + "learning_rate": 3.999633824006647e-06, + "loss": 0.6183, + "step": 9292 + }, + { + "epoch": 0.7031894366463622, + "grad_norm": 1.6764299869537354, + "learning_rate": 3.997761369631174e-06, + "loss": 0.6655, + "step": 9293 + }, + { + "epoch": 0.7032651053686958, + "grad_norm": 1.7775150537490845, + "learning_rate": 3.9958892285120674e-06, + "loss": 0.5845, + "step": 9294 + }, + { + "epoch": 0.7033407740910295, + "grad_norm": 1.978812575340271, + "learning_rate": 3.994017400766558e-06, + "loss": 0.7968, + "step": 9295 + }, + { + "epoch": 0.7034164428133631, + "grad_norm": 2.058699369430542, + "learning_rate": 3.992145886511871e-06, + "loss": 0.6392, + "step": 9296 + }, + { + "epoch": 0.7034921115356967, + "grad_norm": 1.882474422454834, + "learning_rate": 3.990274685865206e-06, + "loss": 0.5851, + "step": 9297 + }, + { + "epoch": 0.7035677802580304, + "grad_norm": 1.9979125261306763, + "learning_rate": 3.988403798943743e-06, + "loss": 0.8891, + "step": 9298 + }, + { + "epoch": 0.703643448980364, + "grad_norm": 1.9912859201431274, + "learning_rate": 3.986533225864645e-06, + "loss": 0.6231, + "step": 9299 + }, + { + "epoch": 0.7037191177026976, + "grad_norm": 2.228140115737915, + "learning_rate": 3.984662966745051e-06, + "loss": 0.8467, + "step": 9300 + }, + { + "epoch": 0.7037947864250312, + "grad_norm": 1.6897190809249878, + "learning_rate": 3.982793021702084e-06, + "loss": 0.7131, + "step": 9301 + }, + { + "epoch": 0.7038704551473648, + "grad_norm": 2.1993837356567383, + "learning_rate": 3.980923390852844e-06, + "loss": 0.7983, + "step": 9302 + }, + { + "epoch": 
0.7039461238696985, + "grad_norm": 2.0132064819335938, + "learning_rate": 3.979054074314417e-06, + "loss": 0.7792, + "step": 9303 + }, + { + "epoch": 0.7040217925920321, + "grad_norm": 2.138044595718384, + "learning_rate": 3.977185072203862e-06, + "loss": 0.692, + "step": 9304 + }, + { + "epoch": 0.7040974613143657, + "grad_norm": 1.7573822736740112, + "learning_rate": 3.975316384638228e-06, + "loss": 0.5628, + "step": 9305 + }, + { + "epoch": 0.7041731300366993, + "grad_norm": 1.9482179880142212, + "learning_rate": 3.97344801173453e-06, + "loss": 0.4971, + "step": 9306 + }, + { + "epoch": 0.7042487987590329, + "grad_norm": 2.954737424850464, + "learning_rate": 3.971579953609772e-06, + "loss": 0.8212, + "step": 9307 + }, + { + "epoch": 0.7043244674813666, + "grad_norm": 1.92641019821167, + "learning_rate": 3.9697122103809475e-06, + "loss": 0.8241, + "step": 9308 + }, + { + "epoch": 0.7044001362037002, + "grad_norm": 2.1319527626037598, + "learning_rate": 3.967844782165012e-06, + "loss": 0.6169, + "step": 9309 + }, + { + "epoch": 0.7044758049260338, + "grad_norm": 1.711698293685913, + "learning_rate": 3.9659776690789104e-06, + "loss": 0.6958, + "step": 9310 + }, + { + "epoch": 0.7045514736483675, + "grad_norm": 2.079453468322754, + "learning_rate": 3.9641108712395714e-06, + "loss": 0.6348, + "step": 9311 + }, + { + "epoch": 0.7046271423707011, + "grad_norm": 3.2162301540374756, + "learning_rate": 3.962244388763896e-06, + "loss": 0.6935, + "step": 9312 + }, + { + "epoch": 0.7047028110930347, + "grad_norm": 1.953348159790039, + "learning_rate": 3.960378221768772e-06, + "loss": 0.586, + "step": 9313 + }, + { + "epoch": 0.7047784798153683, + "grad_norm": 2.384439468383789, + "learning_rate": 3.958512370371063e-06, + "loss": 0.6983, + "step": 9314 + }, + { + "epoch": 0.7048541485377019, + "grad_norm": 1.7614166736602783, + "learning_rate": 3.956646834687616e-06, + "loss": 0.7052, + "step": 9315 + }, + { + "epoch": 0.7049298172600356, + "grad_norm": 5.346411228179932, + "learning_rate": 3.95478161483526e-06, + "loss": 0.7654, + "step": 9316 + }, + { + "epoch": 0.7050054859823692, + "grad_norm": 2.2105648517608643, + "learning_rate": 3.9529167109307915e-06, + "loss": 0.7921, + "step": 9317 + }, + { + "epoch": 0.7050811547047028, + "grad_norm": 2.0886669158935547, + "learning_rate": 3.951052123091005e-06, + "loss": 0.6371, + "step": 9318 + }, + { + "epoch": 0.7051568234270364, + "grad_norm": 1.895363450050354, + "learning_rate": 3.949187851432667e-06, + "loss": 0.6704, + "step": 9319 + }, + { + "epoch": 0.70523249214937, + "grad_norm": 3.5013060569763184, + "learning_rate": 3.947323896072521e-06, + "loss": 0.6872, + "step": 9320 + }, + { + "epoch": 0.7053081608717037, + "grad_norm": 1.9184173345565796, + "learning_rate": 3.945460257127294e-06, + "loss": 0.665, + "step": 9321 + }, + { + "epoch": 0.7053838295940373, + "grad_norm": 1.5872775316238403, + "learning_rate": 3.943596934713695e-06, + "loss": 0.8925, + "step": 9322 + }, + { + "epoch": 0.7054594983163709, + "grad_norm": 2.0756430625915527, + "learning_rate": 3.9417339289484085e-06, + "loss": 0.7668, + "step": 9323 + }, + { + "epoch": 0.7055351670387046, + "grad_norm": 2.210831642150879, + "learning_rate": 3.939871239948105e-06, + "loss": 0.697, + "step": 9324 + }, + { + "epoch": 0.7056108357610382, + "grad_norm": 1.8079180717468262, + "learning_rate": 3.93800886782943e-06, + "loss": 0.749, + "step": 9325 + }, + { + "epoch": 0.7056865044833718, + "grad_norm": 2.1431519985198975, + "learning_rate": 3.936146812709017e-06, + "loss": 0.7066, + 
"step": 9326 + }, + { + "epoch": 0.7057621732057054, + "grad_norm": 1.8611176013946533, + "learning_rate": 3.934285074703465e-06, + "loss": 0.6619, + "step": 9327 + }, + { + "epoch": 0.705837841928039, + "grad_norm": 6.968406677246094, + "learning_rate": 3.932423653929362e-06, + "loss": 0.5841, + "step": 9328 + }, + { + "epoch": 0.7059135106503727, + "grad_norm": 2.0079822540283203, + "learning_rate": 3.930562550503284e-06, + "loss": 0.568, + "step": 9329 + }, + { + "epoch": 0.7059891793727063, + "grad_norm": 2.0030272006988525, + "learning_rate": 3.92870176454178e-06, + "loss": 0.6616, + "step": 9330 + }, + { + "epoch": 0.7060648480950399, + "grad_norm": 2.209120273590088, + "learning_rate": 3.926841296161369e-06, + "loss": 0.6471, + "step": 9331 + }, + { + "epoch": 0.7061405168173736, + "grad_norm": 2.2087392807006836, + "learning_rate": 3.924981145478567e-06, + "loss": 0.6976, + "step": 9332 + }, + { + "epoch": 0.7062161855397071, + "grad_norm": 2.025752067565918, + "learning_rate": 3.923121312609859e-06, + "loss": 0.6718, + "step": 9333 + }, + { + "epoch": 0.7062918542620408, + "grad_norm": 2.695591926574707, + "learning_rate": 3.921261797671714e-06, + "loss": 0.7283, + "step": 9334 + }, + { + "epoch": 0.7063675229843744, + "grad_norm": 2.5060596466064453, + "learning_rate": 3.9194026007805834e-06, + "loss": 0.7901, + "step": 9335 + }, + { + "epoch": 0.706443191706708, + "grad_norm": 1.7619905471801758, + "learning_rate": 3.917543722052894e-06, + "loss": 0.5261, + "step": 9336 + }, + { + "epoch": 0.7065188604290417, + "grad_norm": 2.090834379196167, + "learning_rate": 3.915685161605058e-06, + "loss": 0.6204, + "step": 9337 + }, + { + "epoch": 0.7065945291513753, + "grad_norm": 1.739188313484192, + "learning_rate": 3.913826919553457e-06, + "loss": 0.6318, + "step": 9338 + }, + { + "epoch": 0.7066701978737089, + "grad_norm": 2.2699122428894043, + "learning_rate": 3.911968996014467e-06, + "loss": 0.727, + "step": 9339 + }, + { + "epoch": 0.7067458665960425, + "grad_norm": 1.8585529327392578, + "learning_rate": 3.910111391104438e-06, + "loss": 0.6699, + "step": 9340 + }, + { + "epoch": 0.7068215353183761, + "grad_norm": 2.3249197006225586, + "learning_rate": 3.908254104939695e-06, + "loss": 0.5889, + "step": 9341 + }, + { + "epoch": 0.7068972040407098, + "grad_norm": 1.8129733800888062, + "learning_rate": 3.906397137636547e-06, + "loss": 0.6422, + "step": 9342 + }, + { + "epoch": 0.7069728727630434, + "grad_norm": 1.9523649215698242, + "learning_rate": 3.9045404893112815e-06, + "loss": 0.6836, + "step": 9343 + }, + { + "epoch": 0.707048541485377, + "grad_norm": 1.462280035018921, + "learning_rate": 3.902684160080179e-06, + "loss": 0.5477, + "step": 9344 + }, + { + "epoch": 0.7071242102077107, + "grad_norm": 2.921168327331543, + "learning_rate": 3.900828150059477e-06, + "loss": 0.5474, + "step": 9345 + }, + { + "epoch": 0.7071998789300442, + "grad_norm": 1.826545238494873, + "learning_rate": 3.898972459365409e-06, + "loss": 0.7442, + "step": 9346 + }, + { + "epoch": 0.7072755476523779, + "grad_norm": 1.8657852411270142, + "learning_rate": 3.897117088114185e-06, + "loss": 0.6929, + "step": 9347 + }, + { + "epoch": 0.7073512163747115, + "grad_norm": 2.4558889865875244, + "learning_rate": 3.895262036421993e-06, + "loss": 0.842, + "step": 9348 + }, + { + "epoch": 0.7074268850970451, + "grad_norm": 1.9185665845870972, + "learning_rate": 3.893407304405003e-06, + "loss": 0.6258, + "step": 9349 + }, + { + "epoch": 0.7075025538193788, + "grad_norm": 2.26891827583313, + "learning_rate": 
3.891552892179365e-06, + "loss": 0.7757, + "step": 9350 + }, + { + "epoch": 0.7075782225417124, + "grad_norm": 2.160792589187622, + "learning_rate": 3.88969879986121e-06, + "loss": 0.6637, + "step": 9351 + }, + { + "epoch": 0.707653891264046, + "grad_norm": 2.357847213745117, + "learning_rate": 3.887845027566642e-06, + "loss": 0.6997, + "step": 9352 + }, + { + "epoch": 0.7077295599863797, + "grad_norm": 1.9302562475204468, + "learning_rate": 3.8859915754117505e-06, + "loss": 0.8658, + "step": 9353 + }, + { + "epoch": 0.7078052287087132, + "grad_norm": 2.308100700378418, + "learning_rate": 3.884138443512612e-06, + "loss": 0.6944, + "step": 9354 + }, + { + "epoch": 0.7078808974310469, + "grad_norm": 2.170538902282715, + "learning_rate": 3.882285631985269e-06, + "loss": 0.6771, + "step": 9355 + }, + { + "epoch": 0.7079565661533805, + "grad_norm": 1.9224172830581665, + "learning_rate": 3.880433140945753e-06, + "loss": 0.592, + "step": 9356 + }, + { + "epoch": 0.7080322348757141, + "grad_norm": 1.849142074584961, + "learning_rate": 3.878580970510071e-06, + "loss": 0.5242, + "step": 9357 + }, + { + "epoch": 0.7081079035980478, + "grad_norm": 1.9761168956756592, + "learning_rate": 3.876729120794215e-06, + "loss": 0.706, + "step": 9358 + }, + { + "epoch": 0.7081835723203813, + "grad_norm": 2.217850685119629, + "learning_rate": 3.87487759191415e-06, + "loss": 0.6843, + "step": 9359 + }, + { + "epoch": 0.708259241042715, + "grad_norm": 2.1054461002349854, + "learning_rate": 3.873026383985828e-06, + "loss": 0.7485, + "step": 9360 + }, + { + "epoch": 0.7083349097650486, + "grad_norm": 1.964118480682373, + "learning_rate": 3.871175497125176e-06, + "loss": 0.5514, + "step": 9361 + }, + { + "epoch": 0.7084105784873822, + "grad_norm": 1.676416039466858, + "learning_rate": 3.869324931448107e-06, + "loss": 0.587, + "step": 9362 + }, + { + "epoch": 0.7084862472097159, + "grad_norm": 1.9723479747772217, + "learning_rate": 3.867474687070502e-06, + "loss": 0.697, + "step": 9363 + }, + { + "epoch": 0.7085619159320495, + "grad_norm": 2.1898975372314453, + "learning_rate": 3.865624764108229e-06, + "loss": 0.6789, + "step": 9364 + }, + { + "epoch": 0.7086375846543831, + "grad_norm": 2.4142651557922363, + "learning_rate": 3.863775162677147e-06, + "loss": 0.7163, + "step": 9365 + }, + { + "epoch": 0.7087132533767168, + "grad_norm": 2.3115365505218506, + "learning_rate": 3.8619258828930725e-06, + "loss": 0.5373, + "step": 9366 + }, + { + "epoch": 0.7087889220990503, + "grad_norm": 2.0673863887786865, + "learning_rate": 3.860076924871818e-06, + "loss": 0.5509, + "step": 9367 + }, + { + "epoch": 0.708864590821384, + "grad_norm": 2.3030877113342285, + "learning_rate": 3.8582282887291724e-06, + "loss": 0.754, + "step": 9368 + }, + { + "epoch": 0.7089402595437176, + "grad_norm": 2.19999361038208, + "learning_rate": 3.856379974580901e-06, + "loss": 0.6528, + "step": 9369 + }, + { + "epoch": 0.7090159282660512, + "grad_norm": 2.0521130561828613, + "learning_rate": 3.854531982542751e-06, + "loss": 0.7486, + "step": 9370 + }, + { + "epoch": 0.7090915969883849, + "grad_norm": 1.9347343444824219, + "learning_rate": 3.852684312730452e-06, + "loss": 0.7396, + "step": 9371 + }, + { + "epoch": 0.7091672657107184, + "grad_norm": 5.190321922302246, + "learning_rate": 3.850836965259713e-06, + "loss": 0.6111, + "step": 9372 + }, + { + "epoch": 0.7092429344330521, + "grad_norm": 1.797953724861145, + "learning_rate": 3.848989940246214e-06, + "loss": 0.6532, + "step": 9373 + }, + { + "epoch": 0.7093186031553858, + "grad_norm": 
2.0552330017089844, + "learning_rate": 3.847143237805622e-06, + "loss": 0.6478, + "step": 9374 + }, + { + "epoch": 0.7093942718777193, + "grad_norm": 2.1826653480529785, + "learning_rate": 3.845296858053591e-06, + "loss": 0.674, + "step": 9375 + }, + { + "epoch": 0.709469940600053, + "grad_norm": 2.6305980682373047, + "learning_rate": 3.8434508011057456e-06, + "loss": 0.7174, + "step": 9376 + }, + { + "epoch": 0.7095456093223866, + "grad_norm": 2.358619451522827, + "learning_rate": 3.841605067077686e-06, + "loss": 0.7223, + "step": 9377 + }, + { + "epoch": 0.7096212780447202, + "grad_norm": 1.9301419258117676, + "learning_rate": 3.839759656085001e-06, + "loss": 0.6305, + "step": 9378 + }, + { + "epoch": 0.7096969467670539, + "grad_norm": 1.9100176095962524, + "learning_rate": 3.8379145682432565e-06, + "loss": 0.6708, + "step": 9379 + }, + { + "epoch": 0.7097726154893874, + "grad_norm": 2.4053359031677246, + "learning_rate": 3.836069803667998e-06, + "loss": 0.6106, + "step": 9380 + }, + { + "epoch": 0.7098482842117211, + "grad_norm": 4.036832332611084, + "learning_rate": 3.834225362474753e-06, + "loss": 0.6419, + "step": 9381 + }, + { + "epoch": 0.7099239529340547, + "grad_norm": 2.3508119583129883, + "learning_rate": 3.8323812447790205e-06, + "loss": 0.8037, + "step": 9382 + }, + { + "epoch": 0.7099996216563883, + "grad_norm": 2.4832112789154053, + "learning_rate": 3.830537450696293e-06, + "loss": 0.6607, + "step": 9383 + }, + { + "epoch": 0.710075290378722, + "grad_norm": 1.9314616918563843, + "learning_rate": 3.828693980342024e-06, + "loss": 0.7162, + "step": 9384 + }, + { + "epoch": 0.7101509591010555, + "grad_norm": 2.2306244373321533, + "learning_rate": 3.826850833831668e-06, + "loss": 0.7208, + "step": 9385 + }, + { + "epoch": 0.7102266278233892, + "grad_norm": 2.328071355819702, + "learning_rate": 3.825008011280648e-06, + "loss": 0.6431, + "step": 9386 + }, + { + "epoch": 0.7103022965457229, + "grad_norm": 1.6222195625305176, + "learning_rate": 3.823165512804361e-06, + "loss": 0.6632, + "step": 9387 + }, + { + "epoch": 0.7103779652680564, + "grad_norm": 2.752122640609741, + "learning_rate": 3.821323338518193e-06, + "loss": 0.7188, + "step": 9388 + }, + { + "epoch": 0.7104536339903901, + "grad_norm": 1.881763219833374, + "learning_rate": 3.819481488537504e-06, + "loss": 0.6389, + "step": 9389 + }, + { + "epoch": 0.7105293027127237, + "grad_norm": 1.7420378923416138, + "learning_rate": 3.817639962977646e-06, + "loss": 0.7323, + "step": 9390 + }, + { + "epoch": 0.7106049714350573, + "grad_norm": 2.1217947006225586, + "learning_rate": 3.815798761953933e-06, + "loss": 0.7075, + "step": 9391 + }, + { + "epoch": 0.710680640157391, + "grad_norm": 2.142434597015381, + "learning_rate": 3.813957885581669e-06, + "loss": 0.7825, + "step": 9392 + }, + { + "epoch": 0.7107563088797245, + "grad_norm": 2.093893051147461, + "learning_rate": 3.8121173339761356e-06, + "loss": 0.6228, + "step": 9393 + }, + { + "epoch": 0.7108319776020582, + "grad_norm": 2.101154088973999, + "learning_rate": 3.8102771072525944e-06, + "loss": 0.8039, + "step": 9394 + }, + { + "epoch": 0.7109076463243919, + "grad_norm": 1.9551880359649658, + "learning_rate": 3.8084372055262866e-06, + "loss": 0.7045, + "step": 9395 + }, + { + "epoch": 0.7109833150467254, + "grad_norm": 1.807440996170044, + "learning_rate": 3.8065976289124328e-06, + "loss": 0.6904, + "step": 9396 + }, + { + "epoch": 0.7110589837690591, + "grad_norm": 1.9995956420898438, + "learning_rate": 3.8047583775262367e-06, + "loss": 0.7355, + "step": 9397 + }, + { + 
"epoch": 0.7111346524913927, + "grad_norm": 2.0244057178497314, + "learning_rate": 3.80291945148287e-06, + "loss": 0.6372, + "step": 9398 + }, + { + "epoch": 0.7112103212137263, + "grad_norm": 2.710584878921509, + "learning_rate": 3.801080850897497e-06, + "loss": 0.8311, + "step": 9399 + }, + { + "epoch": 0.71128598993606, + "grad_norm": 1.9999345541000366, + "learning_rate": 3.7992425758852565e-06, + "loss": 0.6724, + "step": 9400 + }, + { + "epoch": 0.7113616586583935, + "grad_norm": 2.0285205841064453, + "learning_rate": 3.7974046265612676e-06, + "loss": 0.6539, + "step": 9401 + }, + { + "epoch": 0.7114373273807272, + "grad_norm": 1.9455727338790894, + "learning_rate": 3.795567003040628e-06, + "loss": 0.472, + "step": 9402 + }, + { + "epoch": 0.7115129961030608, + "grad_norm": 2.234898090362549, + "learning_rate": 3.7937297054384152e-06, + "loss": 0.679, + "step": 9403 + }, + { + "epoch": 0.7115886648253944, + "grad_norm": 2.2903592586517334, + "learning_rate": 3.791892733869688e-06, + "loss": 0.5767, + "step": 9404 + }, + { + "epoch": 0.7116643335477281, + "grad_norm": 2.1868555545806885, + "learning_rate": 3.790056088449483e-06, + "loss": 0.6769, + "step": 9405 + }, + { + "epoch": 0.7117400022700616, + "grad_norm": 2.426342725753784, + "learning_rate": 3.7882197692928168e-06, + "loss": 0.6697, + "step": 9406 + }, + { + "epoch": 0.7118156709923953, + "grad_norm": 3.310873031616211, + "learning_rate": 3.786383776514685e-06, + "loss": 0.6237, + "step": 9407 + }, + { + "epoch": 0.711891339714729, + "grad_norm": 2.320218324661255, + "learning_rate": 3.784548110230068e-06, + "loss": 0.647, + "step": 9408 + }, + { + "epoch": 0.7119670084370625, + "grad_norm": 1.9522889852523804, + "learning_rate": 3.7827127705539136e-06, + "loss": 0.7945, + "step": 9409 + }, + { + "epoch": 0.7120426771593962, + "grad_norm": 1.851012110710144, + "learning_rate": 3.7808777576011564e-06, + "loss": 0.5536, + "step": 9410 + }, + { + "epoch": 0.7121183458817298, + "grad_norm": 1.8848350048065186, + "learning_rate": 3.7790430714867223e-06, + "loss": 0.6775, + "step": 9411 + }, + { + "epoch": 0.7121940146040634, + "grad_norm": 1.8779957294464111, + "learning_rate": 3.777208712325493e-06, + "loss": 0.7499, + "step": 9412 + }, + { + "epoch": 0.7122696833263971, + "grad_norm": 1.9439111948013306, + "learning_rate": 3.775374680232348e-06, + "loss": 0.6631, + "step": 9413 + }, + { + "epoch": 0.7123453520487306, + "grad_norm": 2.2685723304748535, + "learning_rate": 3.773540975322138e-06, + "loss": 0.6203, + "step": 9414 + }, + { + "epoch": 0.7124210207710643, + "grad_norm": 2.156620979309082, + "learning_rate": 3.7717075977096973e-06, + "loss": 0.6294, + "step": 9415 + }, + { + "epoch": 0.712496689493398, + "grad_norm": 2.185917854309082, + "learning_rate": 3.7698745475098365e-06, + "loss": 0.7808, + "step": 9416 + }, + { + "epoch": 0.7125723582157315, + "grad_norm": 1.884772777557373, + "learning_rate": 3.768041824837349e-06, + "loss": 0.6747, + "step": 9417 + }, + { + "epoch": 0.7126480269380652, + "grad_norm": 1.9386128187179565, + "learning_rate": 3.766209429807004e-06, + "loss": 0.656, + "step": 9418 + }, + { + "epoch": 0.7127236956603987, + "grad_norm": 1.856753945350647, + "learning_rate": 3.764377362533556e-06, + "loss": 0.4882, + "step": 9419 + }, + { + "epoch": 0.7127993643827324, + "grad_norm": 2.4401142597198486, + "learning_rate": 3.762545623131724e-06, + "loss": 0.654, + "step": 9420 + }, + { + "epoch": 0.7128750331050661, + "grad_norm": 1.9322893619537354, + "learning_rate": 3.7607142117162297e-06, + 
"loss": 0.7618, + "step": 9421 + }, + { + "epoch": 0.7129507018273996, + "grad_norm": 2.7075653076171875, + "learning_rate": 3.7588831284017608e-06, + "loss": 0.6788, + "step": 9422 + }, + { + "epoch": 0.7130263705497333, + "grad_norm": 1.963377833366394, + "learning_rate": 3.757052373302978e-06, + "loss": 0.6509, + "step": 9423 + }, + { + "epoch": 0.713102039272067, + "grad_norm": 1.9781228303909302, + "learning_rate": 3.7552219465345335e-06, + "loss": 0.5682, + "step": 9424 + }, + { + "epoch": 0.7131777079944005, + "grad_norm": 1.9401733875274658, + "learning_rate": 3.7533918482110544e-06, + "loss": 0.5823, + "step": 9425 + }, + { + "epoch": 0.7132533767167342, + "grad_norm": 1.7262191772460938, + "learning_rate": 3.7515620784471475e-06, + "loss": 0.6593, + "step": 9426 + }, + { + "epoch": 0.7133290454390677, + "grad_norm": 2.252978801727295, + "learning_rate": 3.7497326373573983e-06, + "loss": 0.7555, + "step": 9427 + }, + { + "epoch": 0.7134047141614014, + "grad_norm": 2.0870866775512695, + "learning_rate": 3.747903525056374e-06, + "loss": 0.6717, + "step": 9428 + }, + { + "epoch": 0.713480382883735, + "grad_norm": 2.1834726333618164, + "learning_rate": 3.746074741658621e-06, + "loss": 0.6464, + "step": 9429 + }, + { + "epoch": 0.7135560516060686, + "grad_norm": 2.443652629852295, + "learning_rate": 3.744246287278654e-06, + "loss": 0.5819, + "step": 9430 + }, + { + "epoch": 0.7136317203284023, + "grad_norm": 1.8211729526519775, + "learning_rate": 3.742418162030987e-06, + "loss": 0.6914, + "step": 9431 + }, + { + "epoch": 0.7137073890507358, + "grad_norm": 3.0865557193756104, + "learning_rate": 3.740590366030099e-06, + "loss": 0.7489, + "step": 9432 + }, + { + "epoch": 0.7137830577730695, + "grad_norm": 1.6434742212295532, + "learning_rate": 3.738762899390458e-06, + "loss": 0.6222, + "step": 9433 + }, + { + "epoch": 0.7138587264954032, + "grad_norm": 1.8651151657104492, + "learning_rate": 3.736935762226497e-06, + "loss": 0.5819, + "step": 9434 + }, + { + "epoch": 0.7139343952177367, + "grad_norm": 2.248553514480591, + "learning_rate": 3.7351089546526386e-06, + "loss": 0.8123, + "step": 9435 + }, + { + "epoch": 0.7140100639400704, + "grad_norm": 2.241312265396118, + "learning_rate": 3.7332824767832927e-06, + "loss": 0.8631, + "step": 9436 + }, + { + "epoch": 0.714085732662404, + "grad_norm": 2.0975382328033447, + "learning_rate": 3.731456328732829e-06, + "loss": 0.6104, + "step": 9437 + }, + { + "epoch": 0.7141614013847376, + "grad_norm": 1.8900827169418335, + "learning_rate": 3.729630510615611e-06, + "loss": 0.8335, + "step": 9438 + }, + { + "epoch": 0.7142370701070713, + "grad_norm": 2.051609992980957, + "learning_rate": 3.7278050225459774e-06, + "loss": 0.6891, + "step": 9439 + }, + { + "epoch": 0.7143127388294048, + "grad_norm": 2.2935402393341064, + "learning_rate": 3.7259798646382476e-06, + "loss": 0.5638, + "step": 9440 + }, + { + "epoch": 0.7143884075517385, + "grad_norm": 2.171649217605591, + "learning_rate": 3.724155037006711e-06, + "loss": 0.7834, + "step": 9441 + }, + { + "epoch": 0.7144640762740722, + "grad_norm": 2.2171740531921387, + "learning_rate": 3.7223305397656537e-06, + "loss": 0.5864, + "step": 9442 + }, + { + "epoch": 0.7145397449964057, + "grad_norm": 2.0101137161254883, + "learning_rate": 3.7205063730293306e-06, + "loss": 0.5468, + "step": 9443 + }, + { + "epoch": 0.7146154137187394, + "grad_norm": 1.868871808052063, + "learning_rate": 3.718682536911972e-06, + "loss": 0.6371, + "step": 9444 + }, + { + "epoch": 0.7146910824410729, + "grad_norm": 
1.9993494749069214, + "learning_rate": 3.716859031527794e-06, + "loss": 0.8044, + "step": 9445 + }, + { + "epoch": 0.7147667511634066, + "grad_norm": 2.727073907852173, + "learning_rate": 3.715035856990989e-06, + "loss": 0.6426, + "step": 9446 + }, + { + "epoch": 0.7148424198857403, + "grad_norm": 1.9772788286209106, + "learning_rate": 3.7132130134157373e-06, + "loss": 0.6816, + "step": 9447 + }, + { + "epoch": 0.7149180886080738, + "grad_norm": 1.7466607093811035, + "learning_rate": 3.7113905009161843e-06, + "loss": 0.5874, + "step": 9448 + }, + { + "epoch": 0.7149937573304075, + "grad_norm": 2.1143674850463867, + "learning_rate": 3.7095683196064624e-06, + "loss": 0.7799, + "step": 9449 + }, + { + "epoch": 0.7150694260527412, + "grad_norm": 2.3886454105377197, + "learning_rate": 3.707746469600685e-06, + "loss": 0.5778, + "step": 9450 + }, + { + "epoch": 0.7151450947750747, + "grad_norm": 1.7100965976715088, + "learning_rate": 3.7059249510129392e-06, + "loss": 0.6333, + "step": 9451 + }, + { + "epoch": 0.7152207634974084, + "grad_norm": 2.1997292041778564, + "learning_rate": 3.7041037639572976e-06, + "loss": 0.698, + "step": 9452 + }, + { + "epoch": 0.7152964322197419, + "grad_norm": 2.3083553314208984, + "learning_rate": 3.7022829085478066e-06, + "loss": 0.6241, + "step": 9453 + }, + { + "epoch": 0.7153721009420756, + "grad_norm": 2.3041303157806396, + "learning_rate": 3.7004623848984977e-06, + "loss": 0.6932, + "step": 9454 + }, + { + "epoch": 0.7154477696644093, + "grad_norm": 1.8793962001800537, + "learning_rate": 3.698642193123373e-06, + "loss": 0.5792, + "step": 9455 + }, + { + "epoch": 0.7155234383867428, + "grad_norm": 2.057422161102295, + "learning_rate": 3.696822333336416e-06, + "loss": 0.6926, + "step": 9456 + }, + { + "epoch": 0.7155991071090765, + "grad_norm": 1.9454712867736816, + "learning_rate": 3.695002805651605e-06, + "loss": 0.6216, + "step": 9457 + }, + { + "epoch": 0.71567477583141, + "grad_norm": 2.082719326019287, + "learning_rate": 3.693183610182873e-06, + "loss": 0.6695, + "step": 9458 + }, + { + "epoch": 0.7157504445537437, + "grad_norm": 3.1850838661193848, + "learning_rate": 3.691364747044147e-06, + "loss": 0.7003, + "step": 9459 + }, + { + "epoch": 0.7158261132760774, + "grad_norm": 2.776329517364502, + "learning_rate": 3.6895462163493316e-06, + "loss": 0.6318, + "step": 9460 + }, + { + "epoch": 0.7159017819984109, + "grad_norm": 1.9540523290634155, + "learning_rate": 3.6877280182123084e-06, + "loss": 0.7524, + "step": 9461 + }, + { + "epoch": 0.7159774507207446, + "grad_norm": 2.246755599975586, + "learning_rate": 3.6859101527469375e-06, + "loss": 0.6448, + "step": 9462 + }, + { + "epoch": 0.7160531194430783, + "grad_norm": 1.626090168952942, + "learning_rate": 3.684092620067062e-06, + "loss": 0.6897, + "step": 9463 + }, + { + "epoch": 0.7161287881654118, + "grad_norm": 2.5425055027008057, + "learning_rate": 3.6822754202864992e-06, + "loss": 0.7054, + "step": 9464 + }, + { + "epoch": 0.7162044568877455, + "grad_norm": 2.3118436336517334, + "learning_rate": 3.680458553519053e-06, + "loss": 0.6208, + "step": 9465 + }, + { + "epoch": 0.716280125610079, + "grad_norm": 2.1892473697662354, + "learning_rate": 3.678642019878491e-06, + "loss": 0.6901, + "step": 9466 + }, + { + "epoch": 0.7163557943324127, + "grad_norm": 1.977307915687561, + "learning_rate": 3.676825819478581e-06, + "loss": 0.7049, + "step": 9467 + }, + { + "epoch": 0.7164314630547464, + "grad_norm": 2.380528450012207, + "learning_rate": 3.6750099524330575e-06, + "loss": 0.6314, + "step": 9468 + }, + { 
+ "epoch": 0.7165071317770799, + "grad_norm": 2.11545729637146, + "learning_rate": 3.6731944188556317e-06, + "loss": 0.7005, + "step": 9469 + }, + { + "epoch": 0.7165828004994136, + "grad_norm": 2.1556060314178467, + "learning_rate": 3.6713792188599997e-06, + "loss": 0.6029, + "step": 9470 + }, + { + "epoch": 0.7166584692217471, + "grad_norm": 1.7340325117111206, + "learning_rate": 3.669564352559837e-06, + "loss": 0.6378, + "step": 9471 + }, + { + "epoch": 0.7167341379440808, + "grad_norm": 1.669938325881958, + "learning_rate": 3.6677498200687934e-06, + "loss": 0.709, + "step": 9472 + }, + { + "epoch": 0.7168098066664145, + "grad_norm": 2.10733699798584, + "learning_rate": 3.6659356215005036e-06, + "loss": 0.728, + "step": 9473 + }, + { + "epoch": 0.716885475388748, + "grad_norm": 1.7881940603256226, + "learning_rate": 3.6641217569685783e-06, + "loss": 0.5225, + "step": 9474 + }, + { + "epoch": 0.7169611441110817, + "grad_norm": 1.5777084827423096, + "learning_rate": 3.6623082265866098e-06, + "loss": 0.5302, + "step": 9475 + }, + { + "epoch": 0.7170368128334154, + "grad_norm": 2.004207134246826, + "learning_rate": 3.66049503046816e-06, + "loss": 0.8483, + "step": 9476 + }, + { + "epoch": 0.7171124815557489, + "grad_norm": 1.8468573093414307, + "learning_rate": 3.658682168726779e-06, + "loss": 0.7662, + "step": 9477 + }, + { + "epoch": 0.7171881502780826, + "grad_norm": 1.8332854509353638, + "learning_rate": 3.6568696414760007e-06, + "loss": 0.5098, + "step": 9478 + }, + { + "epoch": 0.7172638190004161, + "grad_norm": 1.9783474206924438, + "learning_rate": 3.6550574488293284e-06, + "loss": 0.5637, + "step": 9479 + }, + { + "epoch": 0.7173394877227498, + "grad_norm": 1.8308309316635132, + "learning_rate": 3.6532455909002453e-06, + "loss": 0.7741, + "step": 9480 + }, + { + "epoch": 0.7174151564450835, + "grad_norm": 2.1236815452575684, + "learning_rate": 3.6514340678022155e-06, + "loss": 0.6946, + "step": 9481 + }, + { + "epoch": 0.717490825167417, + "grad_norm": 2.0192248821258545, + "learning_rate": 3.649622879648684e-06, + "loss": 0.6707, + "step": 9482 + }, + { + "epoch": 0.7175664938897507, + "grad_norm": 2.7706687450408936, + "learning_rate": 3.647812026553073e-06, + "loss": 0.6171, + "step": 9483 + }, + { + "epoch": 0.7176421626120842, + "grad_norm": 2.084230661392212, + "learning_rate": 3.6460015086287838e-06, + "loss": 0.6501, + "step": 9484 + }, + { + "epoch": 0.7177178313344179, + "grad_norm": 2.3443830013275146, + "learning_rate": 3.6441913259891964e-06, + "loss": 0.6697, + "step": 9485 + }, + { + "epoch": 0.7177935000567516, + "grad_norm": 2.1774091720581055, + "learning_rate": 3.6423814787476756e-06, + "loss": 0.6251, + "step": 9486 + }, + { + "epoch": 0.7178691687790851, + "grad_norm": 1.484096646308899, + "learning_rate": 3.640571967017548e-06, + "loss": 0.6349, + "step": 9487 + }, + { + "epoch": 0.7179448375014188, + "grad_norm": 1.9635212421417236, + "learning_rate": 3.638762790912142e-06, + "loss": 0.5859, + "step": 9488 + }, + { + "epoch": 0.7180205062237525, + "grad_norm": 1.664218544960022, + "learning_rate": 3.636953950544753e-06, + "loss": 0.7502, + "step": 9489 + }, + { + "epoch": 0.718096174946086, + "grad_norm": 1.7915072441101074, + "learning_rate": 3.635145446028651e-06, + "loss": 0.6983, + "step": 9490 + }, + { + "epoch": 0.7181718436684197, + "grad_norm": 2.0278759002685547, + "learning_rate": 3.6333372774770926e-06, + "loss": 0.6947, + "step": 9491 + }, + { + "epoch": 0.7182475123907532, + "grad_norm": 1.7070521116256714, + "learning_rate": 
3.631529445003309e-06, + "loss": 0.6624, + "step": 9492 + }, + { + "epoch": 0.7183231811130869, + "grad_norm": 1.7862852811813354, + "learning_rate": 3.629721948720522e-06, + "loss": 0.6233, + "step": 9493 + }, + { + "epoch": 0.7183988498354206, + "grad_norm": 1.671974778175354, + "learning_rate": 3.6279147887419135e-06, + "loss": 0.7435, + "step": 9494 + }, + { + "epoch": 0.7184745185577541, + "grad_norm": 2.4894473552703857, + "learning_rate": 3.6261079651806546e-06, + "loss": 0.6102, + "step": 9495 + }, + { + "epoch": 0.7185501872800878, + "grad_norm": 2.10819673538208, + "learning_rate": 3.624301478149897e-06, + "loss": 0.6282, + "step": 9496 + }, + { + "epoch": 0.7186258560024213, + "grad_norm": 2.059037923812866, + "learning_rate": 3.6224953277627686e-06, + "loss": 0.6832, + "step": 9497 + }, + { + "epoch": 0.718701524724755, + "grad_norm": 2.2662110328674316, + "learning_rate": 3.620689514132375e-06, + "loss": 0.6392, + "step": 9498 + }, + { + "epoch": 0.7187771934470887, + "grad_norm": 1.8977314233779907, + "learning_rate": 3.6188840373718028e-06, + "loss": 0.6685, + "step": 9499 + }, + { + "epoch": 0.7188528621694222, + "grad_norm": 1.8836039304733276, + "learning_rate": 3.617078897594121e-06, + "loss": 0.7153, + "step": 9500 + }, + { + "epoch": 0.7189285308917559, + "grad_norm": 1.4642912149429321, + "learning_rate": 3.6152740949123648e-06, + "loss": 0.5909, + "step": 9501 + }, + { + "epoch": 0.7190041996140896, + "grad_norm": 1.8887202739715576, + "learning_rate": 3.6134696294395585e-06, + "loss": 0.726, + "step": 9502 + }, + { + "epoch": 0.7190798683364231, + "grad_norm": 2.040818214416504, + "learning_rate": 3.6116655012887122e-06, + "loss": 0.6889, + "step": 9503 + }, + { + "epoch": 0.7191555370587568, + "grad_norm": 1.7351603507995605, + "learning_rate": 3.6098617105727973e-06, + "loss": 0.7366, + "step": 9504 + }, + { + "epoch": 0.7192312057810903, + "grad_norm": 1.4561235904693604, + "learning_rate": 3.608058257404776e-06, + "loss": 0.6087, + "step": 9505 + }, + { + "epoch": 0.719306874503424, + "grad_norm": 3.0228309631347656, + "learning_rate": 3.606255141897586e-06, + "loss": 0.8051, + "step": 9506 + }, + { + "epoch": 0.7193825432257577, + "grad_norm": 1.730713129043579, + "learning_rate": 3.6044523641641448e-06, + "loss": 0.685, + "step": 9507 + }, + { + "epoch": 0.7194582119480912, + "grad_norm": 2.0511322021484375, + "learning_rate": 3.6026499243173475e-06, + "loss": 0.7083, + "step": 9508 + }, + { + "epoch": 0.7195338806704249, + "grad_norm": 2.072368860244751, + "learning_rate": 3.6008478224700685e-06, + "loss": 0.6813, + "step": 9509 + }, + { + "epoch": 0.7196095493927585, + "grad_norm": 1.9627420902252197, + "learning_rate": 3.5990460587351625e-06, + "loss": 0.7139, + "step": 9510 + }, + { + "epoch": 0.7196852181150921, + "grad_norm": 2.194145441055298, + "learning_rate": 3.5972446332254646e-06, + "loss": 0.5347, + "step": 9511 + }, + { + "epoch": 0.7197608868374258, + "grad_norm": 1.782472848892212, + "learning_rate": 3.595443546053776e-06, + "loss": 0.6956, + "step": 9512 + }, + { + "epoch": 0.7198365555597593, + "grad_norm": 1.7157336473464966, + "learning_rate": 3.5936427973328957e-06, + "loss": 0.7086, + "step": 9513 + }, + { + "epoch": 0.719912224282093, + "grad_norm": 2.2964377403259277, + "learning_rate": 3.591842387175593e-06, + "loss": 0.7328, + "step": 9514 + }, + { + "epoch": 0.7199878930044267, + "grad_norm": 1.782142996788025, + "learning_rate": 3.590042315694609e-06, + "loss": 0.7607, + "step": 9515 + }, + { + "epoch": 0.7200635617267602, + 
"grad_norm": 2.2905352115631104, + "learning_rate": 3.588242583002674e-06, + "loss": 0.6966, + "step": 9516 + }, + { + "epoch": 0.7201392304490939, + "grad_norm": 1.8695130348205566, + "learning_rate": 3.5864431892124913e-06, + "loss": 0.6522, + "step": 9517 + }, + { + "epoch": 0.7202148991714274, + "grad_norm": 1.8097631931304932, + "learning_rate": 3.5846441344367456e-06, + "loss": 0.6304, + "step": 9518 + }, + { + "epoch": 0.7202905678937611, + "grad_norm": 4.14026403427124, + "learning_rate": 3.5828454187881e-06, + "loss": 0.5817, + "step": 9519 + }, + { + "epoch": 0.7203662366160948, + "grad_norm": 1.8867014646530151, + "learning_rate": 3.581047042379195e-06, + "loss": 0.5896, + "step": 9520 + }, + { + "epoch": 0.7204419053384283, + "grad_norm": 2.1675286293029785, + "learning_rate": 3.579249005322652e-06, + "loss": 0.7204, + "step": 9521 + }, + { + "epoch": 0.720517574060762, + "grad_norm": 2.6543667316436768, + "learning_rate": 3.577451307731071e-06, + "loss": 0.7004, + "step": 9522 + }, + { + "epoch": 0.7205932427830956, + "grad_norm": 2.085458517074585, + "learning_rate": 3.575653949717022e-06, + "loss": 0.645, + "step": 9523 + }, + { + "epoch": 0.7206689115054292, + "grad_norm": 2.0255820751190186, + "learning_rate": 3.5738569313930702e-06, + "loss": 0.6807, + "step": 9524 + }, + { + "epoch": 0.7207445802277629, + "grad_norm": 2.1324872970581055, + "learning_rate": 3.572060252871752e-06, + "loss": 0.6868, + "step": 9525 + }, + { + "epoch": 0.7208202489500964, + "grad_norm": 1.980837345123291, + "learning_rate": 3.570263914265572e-06, + "loss": 0.5308, + "step": 9526 + }, + { + "epoch": 0.7208959176724301, + "grad_norm": 2.0542397499084473, + "learning_rate": 3.5684679156870284e-06, + "loss": 0.6264, + "step": 9527 + }, + { + "epoch": 0.7209715863947638, + "grad_norm": 2.3621749877929688, + "learning_rate": 3.5666722572485916e-06, + "loss": 0.7101, + "step": 9528 + }, + { + "epoch": 0.7210472551170973, + "grad_norm": 2.299753189086914, + "learning_rate": 3.564876939062711e-06, + "loss": 0.6421, + "step": 9529 + }, + { + "epoch": 0.721122923839431, + "grad_norm": 1.9980701208114624, + "learning_rate": 3.5630819612418172e-06, + "loss": 0.7892, + "step": 9530 + }, + { + "epoch": 0.7211985925617646, + "grad_norm": 1.7415344715118408, + "learning_rate": 3.5612873238983153e-06, + "loss": 0.6191, + "step": 9531 + }, + { + "epoch": 0.7212742612840982, + "grad_norm": 2.2024734020233154, + "learning_rate": 3.5594930271445946e-06, + "loss": 0.6404, + "step": 9532 + }, + { + "epoch": 0.7213499300064319, + "grad_norm": 2.5525379180908203, + "learning_rate": 3.557699071093012e-06, + "loss": 0.7167, + "step": 9533 + }, + { + "epoch": 0.7214255987287654, + "grad_norm": 2.1622018814086914, + "learning_rate": 3.5559054558559193e-06, + "loss": 0.6134, + "step": 9534 + }, + { + "epoch": 0.7215012674510991, + "grad_norm": 2.7653920650482178, + "learning_rate": 3.5541121815456345e-06, + "loss": 0.6996, + "step": 9535 + }, + { + "epoch": 0.7215769361734327, + "grad_norm": 2.7878835201263428, + "learning_rate": 3.5523192482744618e-06, + "loss": 0.6121, + "step": 9536 + }, + { + "epoch": 0.7216526048957663, + "grad_norm": 1.8439620733261108, + "learning_rate": 3.5505266561546753e-06, + "loss": 0.6619, + "step": 9537 + }, + { + "epoch": 0.7217282736181, + "grad_norm": 2.6026670932769775, + "learning_rate": 3.5487344052985323e-06, + "loss": 0.7525, + "step": 9538 + }, + { + "epoch": 0.7218039423404335, + "grad_norm": 2.118645191192627, + "learning_rate": 3.5469424958182783e-06, + "loss": 0.7478, + "step": 
9539 + }, + { + "epoch": 0.7218796110627672, + "grad_norm": 1.8370145559310913, + "learning_rate": 3.5451509278261196e-06, + "loss": 0.7044, + "step": 9540 + }, + { + "epoch": 0.7219552797851009, + "grad_norm": 2.0463836193084717, + "learning_rate": 3.543359701434254e-06, + "loss": 0.6612, + "step": 9541 + }, + { + "epoch": 0.7220309485074344, + "grad_norm": 1.8631353378295898, + "learning_rate": 3.5415688167548513e-06, + "loss": 0.7544, + "step": 9542 + }, + { + "epoch": 0.7221066172297681, + "grad_norm": 2.5322537422180176, + "learning_rate": 3.5397782739000647e-06, + "loss": 0.7171, + "step": 9543 + }, + { + "epoch": 0.7221822859521017, + "grad_norm": 1.8810664415359497, + "learning_rate": 3.5379880729820227e-06, + "loss": 0.5414, + "step": 9544 + }, + { + "epoch": 0.7222579546744353, + "grad_norm": 2.0911777019500732, + "learning_rate": 3.536198214112834e-06, + "loss": 0.6315, + "step": 9545 + }, + { + "epoch": 0.722333623396769, + "grad_norm": 2.1385138034820557, + "learning_rate": 3.534408697404588e-06, + "loss": 0.7681, + "step": 9546 + }, + { + "epoch": 0.7224092921191025, + "grad_norm": 2.2795066833496094, + "learning_rate": 3.5326195229693447e-06, + "loss": 0.7531, + "step": 9547 + }, + { + "epoch": 0.7224849608414362, + "grad_norm": 2.068978786468506, + "learning_rate": 3.5308306909191467e-06, + "loss": 0.6324, + "step": 9548 + }, + { + "epoch": 0.7225606295637698, + "grad_norm": 2.281078577041626, + "learning_rate": 3.5290422013660234e-06, + "loss": 0.751, + "step": 9549 + }, + { + "epoch": 0.7226362982861034, + "grad_norm": 2.842747449874878, + "learning_rate": 3.5272540544219766e-06, + "loss": 0.5912, + "step": 9550 + }, + { + "epoch": 0.7227119670084371, + "grad_norm": 1.6664164066314697, + "learning_rate": 3.5254662501989788e-06, + "loss": 0.5474, + "step": 9551 + }, + { + "epoch": 0.7227876357307707, + "grad_norm": 2.199381113052368, + "learning_rate": 3.5236787888089905e-06, + "loss": 0.6749, + "step": 9552 + }, + { + "epoch": 0.7228633044531043, + "grad_norm": 1.8498841524124146, + "learning_rate": 3.5218916703639495e-06, + "loss": 0.6779, + "step": 9553 + }, + { + "epoch": 0.722938973175438, + "grad_norm": 2.030409336090088, + "learning_rate": 3.5201048949757702e-06, + "loss": 0.7235, + "step": 9554 + }, + { + "epoch": 0.7230146418977715, + "grad_norm": 2.3380861282348633, + "learning_rate": 3.5183184627563463e-06, + "loss": 0.6836, + "step": 9555 + }, + { + "epoch": 0.7230903106201052, + "grad_norm": 2.5878119468688965, + "learning_rate": 3.5165323738175504e-06, + "loss": 0.6228, + "step": 9556 + }, + { + "epoch": 0.7231659793424388, + "grad_norm": 2.390239953994751, + "learning_rate": 3.514746628271236e-06, + "loss": 0.4656, + "step": 9557 + }, + { + "epoch": 0.7232416480647724, + "grad_norm": 2.0692977905273438, + "learning_rate": 3.512961226229227e-06, + "loss": 0.6703, + "step": 9558 + }, + { + "epoch": 0.7233173167871061, + "grad_norm": 2.3725948333740234, + "learning_rate": 3.511176167803329e-06, + "loss": 0.665, + "step": 9559 + }, + { + "epoch": 0.7233929855094396, + "grad_norm": 2.082200288772583, + "learning_rate": 3.509391453105339e-06, + "loss": 0.6218, + "step": 9560 + }, + { + "epoch": 0.7234686542317733, + "grad_norm": 1.8738415241241455, + "learning_rate": 3.5076070822470115e-06, + "loss": 0.7547, + "step": 9561 + }, + { + "epoch": 0.7235443229541069, + "grad_norm": 2.0773375034332275, + "learning_rate": 3.5058230553400937e-06, + "loss": 0.6218, + "step": 9562 + }, + { + "epoch": 0.7236199916764405, + "grad_norm": 2.684323310852051, + "learning_rate": 
3.504039372496306e-06, + "loss": 0.6819, + "step": 9563 + }, + { + "epoch": 0.7236956603987742, + "grad_norm": 2.242973804473877, + "learning_rate": 3.502256033827349e-06, + "loss": 0.7311, + "step": 9564 + }, + { + "epoch": 0.7237713291211078, + "grad_norm": 1.914873480796814, + "learning_rate": 3.5004730394449014e-06, + "loss": 0.6217, + "step": 9565 + }, + { + "epoch": 0.7238469978434414, + "grad_norm": 3.037616729736328, + "learning_rate": 3.498690389460619e-06, + "loss": 0.7967, + "step": 9566 + }, + { + "epoch": 0.7239226665657751, + "grad_norm": 1.9221965074539185, + "learning_rate": 3.4969080839861388e-06, + "loss": 0.6185, + "step": 9567 + }, + { + "epoch": 0.7239983352881086, + "grad_norm": 1.7986969947814941, + "learning_rate": 3.495126123133075e-06, + "loss": 0.5751, + "step": 9568 + }, + { + "epoch": 0.7240740040104423, + "grad_norm": 2.0456697940826416, + "learning_rate": 3.4933445070130137e-06, + "loss": 0.7111, + "step": 9569 + }, + { + "epoch": 0.7241496727327759, + "grad_norm": 2.568084716796875, + "learning_rate": 3.4915632357375322e-06, + "loss": 0.7023, + "step": 9570 + }, + { + "epoch": 0.7242253414551095, + "grad_norm": 1.8491854667663574, + "learning_rate": 3.489782309418181e-06, + "loss": 0.5003, + "step": 9571 + }, + { + "epoch": 0.7243010101774432, + "grad_norm": 2.522088050842285, + "learning_rate": 3.4880017281664807e-06, + "loss": 0.6625, + "step": 9572 + }, + { + "epoch": 0.7243766788997767, + "grad_norm": 2.015510082244873, + "learning_rate": 3.4862214920939396e-06, + "loss": 0.6182, + "step": 9573 + }, + { + "epoch": 0.7244523476221104, + "grad_norm": 1.839280366897583, + "learning_rate": 3.4844416013120436e-06, + "loss": 0.6601, + "step": 9574 + }, + { + "epoch": 0.724528016344444, + "grad_norm": 1.9909266233444214, + "learning_rate": 3.4826620559322523e-06, + "loss": 0.7079, + "step": 9575 + }, + { + "epoch": 0.7246036850667776, + "grad_norm": 2.2563157081604004, + "learning_rate": 3.480882856066009e-06, + "loss": 0.5589, + "step": 9576 + }, + { + "epoch": 0.7246793537891113, + "grad_norm": 2.3766355514526367, + "learning_rate": 3.4791040018247334e-06, + "loss": 0.6712, + "step": 9577 + }, + { + "epoch": 0.7247550225114449, + "grad_norm": 2.32324481010437, + "learning_rate": 3.477325493319824e-06, + "loss": 0.7717, + "step": 9578 + }, + { + "epoch": 0.7248306912337785, + "grad_norm": 1.881474256515503, + "learning_rate": 3.4755473306626482e-06, + "loss": 0.6536, + "step": 9579 + }, + { + "epoch": 0.7249063599561122, + "grad_norm": 2.3308231830596924, + "learning_rate": 3.4737695139645697e-06, + "loss": 0.6384, + "step": 9580 + }, + { + "epoch": 0.7249820286784457, + "grad_norm": 2.173731803894043, + "learning_rate": 3.471992043336919e-06, + "loss": 0.6587, + "step": 9581 + }, + { + "epoch": 0.7250576974007794, + "grad_norm": 2.35199236869812, + "learning_rate": 3.4702149188910087e-06, + "loss": 0.6212, + "step": 9582 + }, + { + "epoch": 0.725133366123113, + "grad_norm": 2.924612522125244, + "learning_rate": 3.468438140738123e-06, + "loss": 0.7118, + "step": 9583 + }, + { + "epoch": 0.7252090348454466, + "grad_norm": 2.093873977661133, + "learning_rate": 3.46666170898953e-06, + "loss": 0.6152, + "step": 9584 + }, + { + "epoch": 0.7252847035677803, + "grad_norm": 2.7379560470581055, + "learning_rate": 3.4648856237564827e-06, + "loss": 0.7422, + "step": 9585 + }, + { + "epoch": 0.7253603722901139, + "grad_norm": 2.0272998809814453, + "learning_rate": 3.463109885150198e-06, + "loss": 0.6245, + "step": 9586 + }, + { + "epoch": 0.7254360410124475, + 
"grad_norm": 2.0299673080444336, + "learning_rate": 3.4613344932818797e-06, + "loss": 0.7292, + "step": 9587 + }, + { + "epoch": 0.7255117097347811, + "grad_norm": 2.6118695735931396, + "learning_rate": 3.459559448262711e-06, + "loss": 0.6669, + "step": 9588 + }, + { + "epoch": 0.7255873784571147, + "grad_norm": 1.9721378087997437, + "learning_rate": 3.457784750203849e-06, + "loss": 0.6908, + "step": 9589 + }, + { + "epoch": 0.7256630471794484, + "grad_norm": 2.455974817276001, + "learning_rate": 3.456010399216431e-06, + "loss": 0.8746, + "step": 9590 + }, + { + "epoch": 0.725738715901782, + "grad_norm": 1.8864761590957642, + "learning_rate": 3.454236395411574e-06, + "loss": 0.6697, + "step": 9591 + }, + { + "epoch": 0.7258143846241156, + "grad_norm": 1.9574358463287354, + "learning_rate": 3.4524627389003745e-06, + "loss": 0.7325, + "step": 9592 + }, + { + "epoch": 0.7258900533464493, + "grad_norm": 1.9608203172683716, + "learning_rate": 3.450689429793897e-06, + "loss": 0.7059, + "step": 9593 + }, + { + "epoch": 0.7259657220687828, + "grad_norm": 2.011075496673584, + "learning_rate": 3.4489164682031966e-06, + "loss": 0.6755, + "step": 9594 + }, + { + "epoch": 0.7260413907911165, + "grad_norm": 1.925155520439148, + "learning_rate": 3.4471438542392987e-06, + "loss": 0.8509, + "step": 9595 + }, + { + "epoch": 0.7261170595134501, + "grad_norm": 1.9459024667739868, + "learning_rate": 3.4453715880132183e-06, + "loss": 0.6895, + "step": 9596 + }, + { + "epoch": 0.7261927282357837, + "grad_norm": 1.7830241918563843, + "learning_rate": 3.4435996696359328e-06, + "loss": 0.6713, + "step": 9597 + }, + { + "epoch": 0.7262683969581174, + "grad_norm": 2.3200533390045166, + "learning_rate": 3.441828099218406e-06, + "loss": 0.7796, + "step": 9598 + }, + { + "epoch": 0.726344065680451, + "grad_norm": 2.053757429122925, + "learning_rate": 3.4400568768715827e-06, + "loss": 0.6481, + "step": 9599 + }, + { + "epoch": 0.7264197344027846, + "grad_norm": 2.283618688583374, + "learning_rate": 3.4382860027063798e-06, + "loss": 0.7214, + "step": 9600 + }, + { + "epoch": 0.7264954031251182, + "grad_norm": 2.036465644836426, + "learning_rate": 3.436515476833696e-06, + "loss": 0.6602, + "step": 9601 + }, + { + "epoch": 0.7265710718474518, + "grad_norm": 1.9989351034164429, + "learning_rate": 3.434745299364408e-06, + "loss": 0.6376, + "step": 9602 + }, + { + "epoch": 0.7266467405697855, + "grad_norm": 1.9617687463760376, + "learning_rate": 3.4329754704093725e-06, + "loss": 0.5082, + "step": 9603 + }, + { + "epoch": 0.7267224092921191, + "grad_norm": 2.9853837490081787, + "learning_rate": 3.431205990079416e-06, + "loss": 0.7962, + "step": 9604 + }, + { + "epoch": 0.7267980780144527, + "grad_norm": 2.0819427967071533, + "learning_rate": 3.4294368584853484e-06, + "loss": 0.6982, + "step": 9605 + }, + { + "epoch": 0.7268737467367864, + "grad_norm": 2.134868621826172, + "learning_rate": 3.4276680757379687e-06, + "loss": 0.6123, + "step": 9606 + }, + { + "epoch": 0.72694941545912, + "grad_norm": 2.1825947761535645, + "learning_rate": 3.425899641948035e-06, + "loss": 0.7086, + "step": 9607 + }, + { + "epoch": 0.7270250841814536, + "grad_norm": 2.3707220554351807, + "learning_rate": 3.4241315572262933e-06, + "loss": 0.8065, + "step": 9608 + }, + { + "epoch": 0.7271007529037872, + "grad_norm": 1.2709568738937378, + "learning_rate": 3.4223638216834683e-06, + "loss": 0.8368, + "step": 9609 + }, + { + "epoch": 0.7271764216261208, + "grad_norm": 2.1164627075195312, + "learning_rate": 3.4205964354302608e-06, + "loss": 0.7194, + 
"step": 9610 + }, + { + "epoch": 0.7272520903484545, + "grad_norm": 1.7704885005950928, + "learning_rate": 3.4188293985773507e-06, + "loss": 0.6807, + "step": 9611 + }, + { + "epoch": 0.7273277590707881, + "grad_norm": 1.7768155336380005, + "learning_rate": 3.417062711235396e-06, + "loss": 0.6752, + "step": 9612 + }, + { + "epoch": 0.7274034277931217, + "grad_norm": 1.9417698383331299, + "learning_rate": 3.415296373515031e-06, + "loss": 0.7535, + "step": 9613 + }, + { + "epoch": 0.7274790965154553, + "grad_norm": 2.049741506576538, + "learning_rate": 3.413530385526874e-06, + "loss": 0.7368, + "step": 9614 + }, + { + "epoch": 0.727554765237789, + "grad_norm": 1.6550544500350952, + "learning_rate": 3.411764747381506e-06, + "loss": 0.6998, + "step": 9615 + }, + { + "epoch": 0.7276304339601226, + "grad_norm": 1.9627418518066406, + "learning_rate": 3.409999459189508e-06, + "loss": 0.6864, + "step": 9616 + }, + { + "epoch": 0.7277061026824562, + "grad_norm": 2.080371379852295, + "learning_rate": 3.4082345210614273e-06, + "loss": 0.6129, + "step": 9617 + }, + { + "epoch": 0.7277817714047898, + "grad_norm": 1.9414567947387695, + "learning_rate": 3.406469933107783e-06, + "loss": 0.6578, + "step": 9618 + }, + { + "epoch": 0.7278574401271235, + "grad_norm": 2.097715139389038, + "learning_rate": 3.404705695439083e-06, + "loss": 0.6798, + "step": 9619 + }, + { + "epoch": 0.7279331088494571, + "grad_norm": 2.0292246341705322, + "learning_rate": 3.40294180816581e-06, + "loss": 0.6346, + "step": 9620 + }, + { + "epoch": 0.7280087775717907, + "grad_norm": 2.0286881923675537, + "learning_rate": 3.401178271398425e-06, + "loss": 0.7645, + "step": 9621 + }, + { + "epoch": 0.7280844462941243, + "grad_norm": 2.190192461013794, + "learning_rate": 3.3994150852473645e-06, + "loss": 0.6803, + "step": 9622 + }, + { + "epoch": 0.7281601150164579, + "grad_norm": 2.6516058444976807, + "learning_rate": 3.3976522498230454e-06, + "loss": 0.9133, + "step": 9623 + }, + { + "epoch": 0.7282357837387916, + "grad_norm": 1.7994333505630493, + "learning_rate": 3.395889765235864e-06, + "loss": 0.7207, + "step": 9624 + }, + { + "epoch": 0.7283114524611252, + "grad_norm": 2.710233211517334, + "learning_rate": 3.3941276315961903e-06, + "loss": 0.6214, + "step": 9625 + }, + { + "epoch": 0.7283871211834588, + "grad_norm": 2.217609167098999, + "learning_rate": 3.3923658490143767e-06, + "loss": 0.6707, + "step": 9626 + }, + { + "epoch": 0.7284627899057925, + "grad_norm": 2.534865379333496, + "learning_rate": 3.3906044176007505e-06, + "loss": 0.8433, + "step": 9627 + }, + { + "epoch": 0.728538458628126, + "grad_norm": 2.2182860374450684, + "learning_rate": 3.3888433374656217e-06, + "loss": 0.7009, + "step": 9628 + }, + { + "epoch": 0.7286141273504597, + "grad_norm": 2.057269811630249, + "learning_rate": 3.387082608719268e-06, + "loss": 0.7962, + "step": 9629 + }, + { + "epoch": 0.7286897960727933, + "grad_norm": 2.081799268722534, + "learning_rate": 3.385322231471954e-06, + "loss": 0.6249, + "step": 9630 + }, + { + "epoch": 0.7287654647951269, + "grad_norm": 2.1988329887390137, + "learning_rate": 3.383562205833927e-06, + "loss": 0.8234, + "step": 9631 + }, + { + "epoch": 0.7288411335174606, + "grad_norm": 2.533674716949463, + "learning_rate": 3.381802531915398e-06, + "loss": 0.6977, + "step": 9632 + }, + { + "epoch": 0.7289168022397942, + "grad_norm": 1.9693000316619873, + "learning_rate": 3.380043209826566e-06, + "loss": 0.5226, + "step": 9633 + }, + { + "epoch": 0.7289924709621278, + "grad_norm": 2.4341700077056885, + "learning_rate": 
3.3782842396776048e-06, + "loss": 0.6874, + "step": 9634 + }, + { + "epoch": 0.7290681396844614, + "grad_norm": 2.3296284675598145, + "learning_rate": 3.3765256215786707e-06, + "loss": 0.4436, + "step": 9635 + }, + { + "epoch": 0.729143808406795, + "grad_norm": 1.8959673643112183, + "learning_rate": 3.374767355639885e-06, + "loss": 0.6406, + "step": 9636 + }, + { + "epoch": 0.7292194771291287, + "grad_norm": 2.5320215225219727, + "learning_rate": 3.373009441971364e-06, + "loss": 0.7049, + "step": 9637 + }, + { + "epoch": 0.7292951458514623, + "grad_norm": 2.963879346847534, + "learning_rate": 3.3712518806831915e-06, + "loss": 0.6362, + "step": 9638 + }, + { + "epoch": 0.7293708145737959, + "grad_norm": 1.864016056060791, + "learning_rate": 3.3694946718854357e-06, + "loss": 0.5834, + "step": 9639 + }, + { + "epoch": 0.7294464832961296, + "grad_norm": 2.2070538997650146, + "learning_rate": 3.3677378156881313e-06, + "loss": 0.64, + "step": 9640 + }, + { + "epoch": 0.7295221520184632, + "grad_norm": 2.1202077865600586, + "learning_rate": 3.3659813122012987e-06, + "loss": 0.619, + "step": 9641 + }, + { + "epoch": 0.7295978207407968, + "grad_norm": 1.996546983718872, + "learning_rate": 3.364225161534945e-06, + "loss": 0.5529, + "step": 9642 + }, + { + "epoch": 0.7296734894631304, + "grad_norm": 1.7262141704559326, + "learning_rate": 3.362469363799037e-06, + "loss": 0.6483, + "step": 9643 + }, + { + "epoch": 0.729749158185464, + "grad_norm": 2.2432174682617188, + "learning_rate": 3.360713919103532e-06, + "loss": 0.6979, + "step": 9644 + }, + { + "epoch": 0.7298248269077977, + "grad_norm": 2.1169657707214355, + "learning_rate": 3.35895882755836e-06, + "loss": 0.7512, + "step": 9645 + }, + { + "epoch": 0.7299004956301313, + "grad_norm": 2.215263843536377, + "learning_rate": 3.357204089273432e-06, + "loss": 0.6911, + "step": 9646 + }, + { + "epoch": 0.7299761643524649, + "grad_norm": 2.0325393676757812, + "learning_rate": 3.3554497043586354e-06, + "loss": 0.6089, + "step": 9647 + }, + { + "epoch": 0.7300518330747985, + "grad_norm": 2.492884874343872, + "learning_rate": 3.353695672923835e-06, + "loss": 0.6432, + "step": 9648 + }, + { + "epoch": 0.7301275017971322, + "grad_norm": 1.838275671005249, + "learning_rate": 3.351941995078877e-06, + "loss": 0.6128, + "step": 9649 + }, + { + "epoch": 0.7302031705194658, + "grad_norm": 2.5768980979919434, + "learning_rate": 3.3501886709335755e-06, + "loss": 0.542, + "step": 9650 + }, + { + "epoch": 0.7302788392417994, + "grad_norm": 2.3133151531219482, + "learning_rate": 3.3484357005977307e-06, + "loss": 0.6435, + "step": 9651 + }, + { + "epoch": 0.730354507964133, + "grad_norm": 2.1591763496398926, + "learning_rate": 3.346683084181125e-06, + "loss": 0.8351, + "step": 9652 + }, + { + "epoch": 0.7304301766864667, + "grad_norm": 2.5849671363830566, + "learning_rate": 3.344930821793512e-06, + "loss": 0.5672, + "step": 9653 + }, + { + "epoch": 0.7305058454088003, + "grad_norm": 2.141481876373291, + "learning_rate": 3.343178913544619e-06, + "loss": 0.6263, + "step": 9654 + }, + { + "epoch": 0.7305815141311339, + "grad_norm": 2.7578744888305664, + "learning_rate": 3.341427359544158e-06, + "loss": 0.7468, + "step": 9655 + }, + { + "epoch": 0.7306571828534675, + "grad_norm": 1.959076166152954, + "learning_rate": 3.339676159901819e-06, + "loss": 0.582, + "step": 9656 + }, + { + "epoch": 0.7307328515758011, + "grad_norm": 2.0008225440979004, + "learning_rate": 3.3379253147272654e-06, + "loss": 0.6107, + "step": 9657 + }, + { + "epoch": 0.7308085202981348, + "grad_norm": 
2.1886539459228516, + "learning_rate": 3.336174824130143e-06, + "loss": 0.7106, + "step": 9658 + }, + { + "epoch": 0.7308841890204684, + "grad_norm": 2.4869959354400635, + "learning_rate": 3.334424688220071e-06, + "loss": 0.7828, + "step": 9659 + }, + { + "epoch": 0.730959857742802, + "grad_norm": 3.1968321800231934, + "learning_rate": 3.3326749071066546e-06, + "loss": 0.6548, + "step": 9660 + }, + { + "epoch": 0.7310355264651356, + "grad_norm": 2.0156288146972656, + "learning_rate": 3.330925480899458e-06, + "loss": 0.7084, + "step": 9661 + }, + { + "epoch": 0.7311111951874693, + "grad_norm": 2.093147039413452, + "learning_rate": 3.329176409708048e-06, + "loss": 0.716, + "step": 9662 + }, + { + "epoch": 0.7311868639098029, + "grad_norm": 1.8537280559539795, + "learning_rate": 3.3274276936419558e-06, + "loss": 0.7604, + "step": 9663 + }, + { + "epoch": 0.7312625326321365, + "grad_norm": 1.8829224109649658, + "learning_rate": 3.325679332810685e-06, + "loss": 0.5923, + "step": 9664 + }, + { + "epoch": 0.7313382013544701, + "grad_norm": 2.2655227184295654, + "learning_rate": 3.323931327323727e-06, + "loss": 0.6448, + "step": 9665 + }, + { + "epoch": 0.7314138700768038, + "grad_norm": 2.4388043880462646, + "learning_rate": 3.322183677290546e-06, + "loss": 0.6538, + "step": 9666 + }, + { + "epoch": 0.7314895387991374, + "grad_norm": 2.1966893672943115, + "learning_rate": 3.3204363828205933e-06, + "loss": 0.609, + "step": 9667 + }, + { + "epoch": 0.731565207521471, + "grad_norm": 1.9812705516815186, + "learning_rate": 3.318689444023281e-06, + "loss": 0.6558, + "step": 9668 + }, + { + "epoch": 0.7316408762438046, + "grad_norm": 2.1352076530456543, + "learning_rate": 3.3169428610080107e-06, + "loss": 0.6868, + "step": 9669 + }, + { + "epoch": 0.7317165449661382, + "grad_norm": 1.9275273084640503, + "learning_rate": 3.315196633884161e-06, + "loss": 0.663, + "step": 9670 + }, + { + "epoch": 0.7317922136884719, + "grad_norm": 2.548799991607666, + "learning_rate": 3.3134507627610867e-06, + "loss": 0.743, + "step": 9671 + }, + { + "epoch": 0.7318678824108055, + "grad_norm": 1.8957780599594116, + "learning_rate": 3.311705247748113e-06, + "loss": 0.5394, + "step": 9672 + }, + { + "epoch": 0.7319435511331391, + "grad_norm": 1.761271595954895, + "learning_rate": 3.3099600889545576e-06, + "loss": 0.7391, + "step": 9673 + }, + { + "epoch": 0.7320192198554727, + "grad_norm": 1.8656989336013794, + "learning_rate": 3.308215286489708e-06, + "loss": 0.5925, + "step": 9674 + }, + { + "epoch": 0.7320948885778064, + "grad_norm": 2.2291691303253174, + "learning_rate": 3.306470840462824e-06, + "loss": 0.6399, + "step": 9675 + }, + { + "epoch": 0.73217055730014, + "grad_norm": 2.45021390914917, + "learning_rate": 3.304726750983151e-06, + "loss": 0.7225, + "step": 9676 + }, + { + "epoch": 0.7322462260224736, + "grad_norm": 1.7993860244750977, + "learning_rate": 3.30298301815991e-06, + "loss": 0.6022, + "step": 9677 + }, + { + "epoch": 0.7323218947448072, + "grad_norm": 2.358670234680176, + "learning_rate": 3.301239642102298e-06, + "loss": 0.691, + "step": 9678 + }, + { + "epoch": 0.7323975634671409, + "grad_norm": 4.461367130279541, + "learning_rate": 3.2994966229194917e-06, + "loss": 0.6848, + "step": 9679 + }, + { + "epoch": 0.7324732321894745, + "grad_norm": 2.1369030475616455, + "learning_rate": 3.297753960720645e-06, + "loss": 0.7066, + "step": 9680 + }, + { + "epoch": 0.7325489009118081, + "grad_norm": 2.010079860687256, + "learning_rate": 3.296011655614891e-06, + "loss": 0.7084, + "step": 9681 + }, + { + "epoch": 
0.7326245696341417, + "grad_norm": 2.3091893196105957, + "learning_rate": 3.2942697077113305e-06, + "loss": 0.7503, + "step": 9682 + }, + { + "epoch": 0.7327002383564754, + "grad_norm": 2.3691303730010986, + "learning_rate": 3.292528117119058e-06, + "loss": 0.6997, + "step": 9683 + }, + { + "epoch": 0.732775907078809, + "grad_norm": 3.361497402191162, + "learning_rate": 3.2907868839471364e-06, + "loss": 0.7454, + "step": 9684 + }, + { + "epoch": 0.7328515758011426, + "grad_norm": 2.057619571685791, + "learning_rate": 3.2890460083046072e-06, + "loss": 0.7054, + "step": 9685 + }, + { + "epoch": 0.7329272445234762, + "grad_norm": 2.1395699977874756, + "learning_rate": 3.2873054903004863e-06, + "loss": 0.5957, + "step": 9686 + }, + { + "epoch": 0.7330029132458098, + "grad_norm": 1.947824478149414, + "learning_rate": 3.28556533004377e-06, + "loss": 0.5955, + "step": 9687 + }, + { + "epoch": 0.7330785819681435, + "grad_norm": 2.4234938621520996, + "learning_rate": 3.283825527643441e-06, + "loss": 0.7185, + "step": 9688 + }, + { + "epoch": 0.7331542506904771, + "grad_norm": 4.633688926696777, + "learning_rate": 3.282086083208443e-06, + "loss": 0.7757, + "step": 9689 + }, + { + "epoch": 0.7332299194128107, + "grad_norm": 2.446262836456299, + "learning_rate": 3.280346996847709e-06, + "loss": 0.655, + "step": 9690 + }, + { + "epoch": 0.7333055881351443, + "grad_norm": 1.894422173500061, + "learning_rate": 3.2786082686701447e-06, + "loss": 0.7366, + "step": 9691 + }, + { + "epoch": 0.733381256857478, + "grad_norm": 2.1454946994781494, + "learning_rate": 3.2768698987846356e-06, + "loss": 0.5931, + "step": 9692 + }, + { + "epoch": 0.7334569255798116, + "grad_norm": 2.107937812805176, + "learning_rate": 3.2751318873000444e-06, + "loss": 0.5901, + "step": 9693 + }, + { + "epoch": 0.7335325943021452, + "grad_norm": 1.8799843788146973, + "learning_rate": 3.2733942343252114e-06, + "loss": 0.7529, + "step": 9694 + }, + { + "epoch": 0.7336082630244788, + "grad_norm": 2.41536808013916, + "learning_rate": 3.271656939968957e-06, + "loss": 0.7178, + "step": 9695 + }, + { + "epoch": 0.7336839317468125, + "grad_norm": 2.3177335262298584, + "learning_rate": 3.2699200043400684e-06, + "loss": 0.6441, + "step": 9696 + }, + { + "epoch": 0.7337596004691461, + "grad_norm": 1.8029228448867798, + "learning_rate": 3.2681834275473205e-06, + "loss": 0.6193, + "step": 9697 + }, + { + "epoch": 0.7338352691914797, + "grad_norm": 1.7246633768081665, + "learning_rate": 3.2664472096994678e-06, + "loss": 0.5477, + "step": 9698 + }, + { + "epoch": 0.7339109379138133, + "grad_norm": 2.265120029449463, + "learning_rate": 3.2647113509052387e-06, + "loss": 0.7033, + "step": 9699 + }, + { + "epoch": 0.7339866066361469, + "grad_norm": 2.030282974243164, + "learning_rate": 3.2629758512733326e-06, + "loss": 0.6291, + "step": 9700 + }, + { + "epoch": 0.7340622753584806, + "grad_norm": 2.392416477203369, + "learning_rate": 3.261240710912433e-06, + "loss": 0.7904, + "step": 9701 + }, + { + "epoch": 0.7341379440808142, + "grad_norm": 2.1410059928894043, + "learning_rate": 3.2595059299312027e-06, + "loss": 0.5866, + "step": 9702 + }, + { + "epoch": 0.7342136128031478, + "grad_norm": 2.2164275646209717, + "learning_rate": 3.2577715084382777e-06, + "loss": 0.7813, + "step": 9703 + }, + { + "epoch": 0.7342892815254815, + "grad_norm": 2.500359535217285, + "learning_rate": 3.256037446542273e-06, + "loss": 0.7013, + "step": 9704 + }, + { + "epoch": 0.7343649502478151, + "grad_norm": 2.0464277267456055, + "learning_rate": 3.2543037443517825e-06, + 
"loss": 0.6824, + "step": 9705 + }, + { + "epoch": 0.7344406189701487, + "grad_norm": 1.9826641082763672, + "learning_rate": 3.252570401975377e-06, + "loss": 0.6748, + "step": 9706 + }, + { + "epoch": 0.7345162876924823, + "grad_norm": 2.0672097206115723, + "learning_rate": 3.250837419521598e-06, + "loss": 0.5698, + "step": 9707 + }, + { + "epoch": 0.7345919564148159, + "grad_norm": 1.9913432598114014, + "learning_rate": 3.2491047970989765e-06, + "loss": 0.7454, + "step": 9708 + }, + { + "epoch": 0.7346676251371496, + "grad_norm": 1.714163064956665, + "learning_rate": 3.2473725348160173e-06, + "loss": 0.5349, + "step": 9709 + }, + { + "epoch": 0.7347432938594832, + "grad_norm": 1.9784096479415894, + "learning_rate": 3.2456406327811926e-06, + "loss": 0.6531, + "step": 9710 + }, + { + "epoch": 0.7348189625818168, + "grad_norm": 2.2186923027038574, + "learning_rate": 3.243909091102964e-06, + "loss": 0.695, + "step": 9711 + }, + { + "epoch": 0.7348946313041504, + "grad_norm": 2.7318224906921387, + "learning_rate": 3.2421779098897644e-06, + "loss": 0.7293, + "step": 9712 + }, + { + "epoch": 0.734970300026484, + "grad_norm": 2.105350971221924, + "learning_rate": 3.240447089250008e-06, + "loss": 0.6585, + "step": 9713 + }, + { + "epoch": 0.7350459687488177, + "grad_norm": 2.2211616039276123, + "learning_rate": 3.2387166292920837e-06, + "loss": 0.7232, + "step": 9714 + }, + { + "epoch": 0.7351216374711513, + "grad_norm": 2.1771297454833984, + "learning_rate": 3.2369865301243573e-06, + "loss": 0.5941, + "step": 9715 + }, + { + "epoch": 0.7351973061934849, + "grad_norm": 30.021831512451172, + "learning_rate": 3.2352567918551753e-06, + "loss": 0.7043, + "step": 9716 + }, + { + "epoch": 0.7352729749158186, + "grad_norm": 1.4489507675170898, + "learning_rate": 3.233527414592861e-06, + "loss": 0.7254, + "step": 9717 + }, + { + "epoch": 0.7353486436381522, + "grad_norm": 2.0714879035949707, + "learning_rate": 3.231798398445705e-06, + "loss": 0.7017, + "step": 9718 + }, + { + "epoch": 0.7354243123604858, + "grad_norm": 2.2503437995910645, + "learning_rate": 3.230069743521993e-06, + "loss": 0.7195, + "step": 9719 + }, + { + "epoch": 0.7354999810828194, + "grad_norm": 1.817765235900879, + "learning_rate": 3.2283414499299786e-06, + "loss": 0.6089, + "step": 9720 + }, + { + "epoch": 0.735575649805153, + "grad_norm": 1.9589232206344604, + "learning_rate": 3.2266135177778883e-06, + "loss": 0.7062, + "step": 9721 + }, + { + "epoch": 0.7356513185274867, + "grad_norm": 2.7293508052825928, + "learning_rate": 3.224885947173932e-06, + "loss": 0.7512, + "step": 9722 + }, + { + "epoch": 0.7357269872498203, + "grad_norm": 2.42242169380188, + "learning_rate": 3.223158738226297e-06, + "loss": 0.8047, + "step": 9723 + }, + { + "epoch": 0.7358026559721539, + "grad_norm": 2.5627288818359375, + "learning_rate": 3.221431891043146e-06, + "loss": 0.7915, + "step": 9724 + }, + { + "epoch": 0.7358783246944876, + "grad_norm": 1.7673484086990356, + "learning_rate": 3.2197054057326203e-06, + "loss": 0.7325, + "step": 9725 + }, + { + "epoch": 0.7359539934168211, + "grad_norm": 2.0500118732452393, + "learning_rate": 3.217979282402839e-06, + "loss": 0.6227, + "step": 9726 + }, + { + "epoch": 0.7360296621391548, + "grad_norm": 2.1804354190826416, + "learning_rate": 3.216253521161894e-06, + "loss": 0.6206, + "step": 9727 + }, + { + "epoch": 0.7361053308614884, + "grad_norm": 1.8224960565567017, + "learning_rate": 3.214528122117862e-06, + "loss": 0.7576, + "step": 9728 + }, + { + "epoch": 0.736180999583822, + "grad_norm": 
2.191704750061035, + "learning_rate": 3.212803085378792e-06, + "loss": 0.6808, + "step": 9729 + }, + { + "epoch": 0.7362566683061557, + "grad_norm": 1.6620792150497437, + "learning_rate": 3.2110784110527098e-06, + "loss": 0.771, + "step": 9730 + }, + { + "epoch": 0.7363323370284893, + "grad_norm": 2.0463523864746094, + "learning_rate": 3.2093540992476243e-06, + "loss": 0.5801, + "step": 9731 + }, + { + "epoch": 0.7364080057508229, + "grad_norm": 1.8782941102981567, + "learning_rate": 3.207630150071512e-06, + "loss": 0.778, + "step": 9732 + }, + { + "epoch": 0.7364836744731565, + "grad_norm": 2.127807378768921, + "learning_rate": 3.205906563632331e-06, + "loss": 0.7317, + "step": 9733 + }, + { + "epoch": 0.7365593431954901, + "grad_norm": 2.123108386993408, + "learning_rate": 3.2041833400380274e-06, + "loss": 0.5925, + "step": 9734 + }, + { + "epoch": 0.7366350119178238, + "grad_norm": 1.8136372566223145, + "learning_rate": 3.202460479396505e-06, + "loss": 0.7108, + "step": 9735 + }, + { + "epoch": 0.7367106806401574, + "grad_norm": 2.5573692321777344, + "learning_rate": 3.200737981815661e-06, + "loss": 0.8463, + "step": 9736 + }, + { + "epoch": 0.736786349362491, + "grad_norm": 2.7275161743164062, + "learning_rate": 3.19901584740336e-06, + "loss": 0.6705, + "step": 9737 + }, + { + "epoch": 0.7368620180848247, + "grad_norm": 1.9936103820800781, + "learning_rate": 3.1972940762674494e-06, + "loss": 0.6206, + "step": 9738 + }, + { + "epoch": 0.7369376868071582, + "grad_norm": 1.7031792402267456, + "learning_rate": 3.195572668515753e-06, + "loss": 0.6619, + "step": 9739 + }, + { + "epoch": 0.7370133555294919, + "grad_norm": 1.8480082750320435, + "learning_rate": 3.193851624256069e-06, + "loss": 0.6239, + "step": 9740 + }, + { + "epoch": 0.7370890242518255, + "grad_norm": 1.9207595586776733, + "learning_rate": 3.192130943596176e-06, + "loss": 0.8244, + "step": 9741 + }, + { + "epoch": 0.7371646929741591, + "grad_norm": 1.9544023275375366, + "learning_rate": 3.190410626643831e-06, + "loss": 0.6302, + "step": 9742 + }, + { + "epoch": 0.7372403616964928, + "grad_norm": 2.287046194076538, + "learning_rate": 3.188690673506757e-06, + "loss": 0.5985, + "step": 9743 + }, + { + "epoch": 0.7373160304188264, + "grad_norm": 2.2530996799468994, + "learning_rate": 3.186971084292673e-06, + "loss": 0.7136, + "step": 9744 + }, + { + "epoch": 0.73739169914116, + "grad_norm": 2.2167491912841797, + "learning_rate": 3.1852518591092636e-06, + "loss": 0.6572, + "step": 9745 + }, + { + "epoch": 0.7374673678634937, + "grad_norm": 1.5545406341552734, + "learning_rate": 3.1835329980641866e-06, + "loss": 0.7841, + "step": 9746 + }, + { + "epoch": 0.7375430365858272, + "grad_norm": 2.0609633922576904, + "learning_rate": 3.181814501265086e-06, + "loss": 0.6042, + "step": 9747 + }, + { + "epoch": 0.7376187053081609, + "grad_norm": 2.269827365875244, + "learning_rate": 3.18009636881958e-06, + "loss": 0.8208, + "step": 9748 + }, + { + "epoch": 0.7376943740304945, + "grad_norm": 2.116123914718628, + "learning_rate": 3.178378600835264e-06, + "loss": 0.693, + "step": 9749 + }, + { + "epoch": 0.7377700427528281, + "grad_norm": 2.1836884021759033, + "learning_rate": 3.176661197419708e-06, + "loss": 0.707, + "step": 9750 + }, + { + "epoch": 0.7378457114751618, + "grad_norm": 1.9921830892562866, + "learning_rate": 3.1749441586804633e-06, + "loss": 0.58, + "step": 9751 + }, + { + "epoch": 0.7379213801974953, + "grad_norm": 2.1456127166748047, + "learning_rate": 3.173227484725059e-06, + "loss": 0.6073, + "step": 9752 + }, + { + "epoch": 
0.737997048919829, + "grad_norm": 2.2834341526031494, + "learning_rate": 3.1715111756609924e-06, + "loss": 0.6229, + "step": 9753 + }, + { + "epoch": 0.7380727176421626, + "grad_norm": 2.3917596340179443, + "learning_rate": 3.1697952315957453e-06, + "loss": 0.7978, + "step": 9754 + }, + { + "epoch": 0.7381483863644962, + "grad_norm": 2.0401668548583984, + "learning_rate": 3.1680796526367804e-06, + "loss": 0.7177, + "step": 9755 + }, + { + "epoch": 0.7382240550868299, + "grad_norm": 2.360987424850464, + "learning_rate": 3.1663644388915333e-06, + "loss": 0.7348, + "step": 9756 + }, + { + "epoch": 0.7382997238091635, + "grad_norm": 2.4300918579101562, + "learning_rate": 3.1646495904674113e-06, + "loss": 0.667, + "step": 9757 + }, + { + "epoch": 0.7383753925314971, + "grad_norm": 2.076064109802246, + "learning_rate": 3.162935107471805e-06, + "loss": 0.6606, + "step": 9758 + }, + { + "epoch": 0.7384510612538308, + "grad_norm": 2.9375874996185303, + "learning_rate": 3.1612209900120817e-06, + "loss": 0.6929, + "step": 9759 + }, + { + "epoch": 0.7385267299761643, + "grad_norm": 2.2996110916137695, + "learning_rate": 3.159507238195584e-06, + "loss": 0.6716, + "step": 9760 + }, + { + "epoch": 0.738602398698498, + "grad_norm": 1.980484127998352, + "learning_rate": 3.1577938521296352e-06, + "loss": 0.6685, + "step": 9761 + }, + { + "epoch": 0.7386780674208316, + "grad_norm": 3.1064846515655518, + "learning_rate": 3.1560808319215305e-06, + "loss": 0.7042, + "step": 9762 + }, + { + "epoch": 0.7387537361431652, + "grad_norm": 2.007359743118286, + "learning_rate": 3.154368177678548e-06, + "loss": 0.6777, + "step": 9763 + }, + { + "epoch": 0.7388294048654989, + "grad_norm": 2.1503567695617676, + "learning_rate": 3.1526558895079316e-06, + "loss": 0.6476, + "step": 9764 + }, + { + "epoch": 0.7389050735878324, + "grad_norm": 1.7849518060684204, + "learning_rate": 3.15094396751692e-06, + "loss": 0.5983, + "step": 9765 + }, + { + "epoch": 0.7389807423101661, + "grad_norm": 1.9997239112854004, + "learning_rate": 3.1492324118127173e-06, + "loss": 0.69, + "step": 9766 + }, + { + "epoch": 0.7390564110324997, + "grad_norm": 2.347898244857788, + "learning_rate": 3.147521222502502e-06, + "loss": 0.7001, + "step": 9767 + }, + { + "epoch": 0.7391320797548333, + "grad_norm": 2.870927095413208, + "learning_rate": 3.145810399693437e-06, + "loss": 0.692, + "step": 9768 + }, + { + "epoch": 0.739207748477167, + "grad_norm": 2.2380945682525635, + "learning_rate": 3.1440999434926564e-06, + "loss": 0.7641, + "step": 9769 + }, + { + "epoch": 0.7392834171995006, + "grad_norm": 1.807690143585205, + "learning_rate": 3.1423898540072832e-06, + "loss": 0.6217, + "step": 9770 + }, + { + "epoch": 0.7393590859218342, + "grad_norm": 2.3498446941375732, + "learning_rate": 3.140680131344401e-06, + "loss": 0.6596, + "step": 9771 + }, + { + "epoch": 0.7394347546441679, + "grad_norm": 2.41398024559021, + "learning_rate": 3.13897077561108e-06, + "loss": 0.7398, + "step": 9772 + }, + { + "epoch": 0.7395104233665014, + "grad_norm": 2.0578386783599854, + "learning_rate": 3.137261786914366e-06, + "loss": 0.7848, + "step": 9773 + }, + { + "epoch": 0.7395860920888351, + "grad_norm": 2.1671581268310547, + "learning_rate": 3.1355531653612802e-06, + "loss": 0.562, + "step": 9774 + }, + { + "epoch": 0.7396617608111687, + "grad_norm": 2.2448394298553467, + "learning_rate": 3.1338449110588247e-06, + "loss": 0.7788, + "step": 9775 + }, + { + "epoch": 0.7397374295335023, + "grad_norm": 1.992663025856018, + "learning_rate": 3.132137024113973e-06, + "loss": 
0.7574, + "step": 9776 + }, + { + "epoch": 0.739813098255836, + "grad_norm": 3.92378830909729, + "learning_rate": 3.1304295046336836e-06, + "loss": 0.5947, + "step": 9777 + }, + { + "epoch": 0.7398887669781695, + "grad_norm": 1.903420090675354, + "learning_rate": 3.12872235272488e-06, + "loss": 0.6038, + "step": 9778 + }, + { + "epoch": 0.7399644357005032, + "grad_norm": 2.0623772144317627, + "learning_rate": 3.1270155684944695e-06, + "loss": 0.7105, + "step": 9779 + }, + { + "epoch": 0.7400401044228369, + "grad_norm": 2.4272897243499756, + "learning_rate": 3.125309152049346e-06, + "loss": 0.6364, + "step": 9780 + }, + { + "epoch": 0.7401157731451704, + "grad_norm": 2.0178956985473633, + "learning_rate": 3.1236031034963617e-06, + "loss": 0.7385, + "step": 9781 + }, + { + "epoch": 0.7401914418675041, + "grad_norm": 1.974817156791687, + "learning_rate": 3.1218974229423575e-06, + "loss": 0.5617, + "step": 9782 + }, + { + "epoch": 0.7402671105898377, + "grad_norm": 2.286247968673706, + "learning_rate": 3.1201921104941478e-06, + "loss": 0.6671, + "step": 9783 + }, + { + "epoch": 0.7403427793121713, + "grad_norm": 2.1284759044647217, + "learning_rate": 3.118487166258527e-06, + "loss": 0.6746, + "step": 9784 + }, + { + "epoch": 0.740418448034505, + "grad_norm": 2.2996256351470947, + "learning_rate": 3.1167825903422616e-06, + "loss": 0.6687, + "step": 9785 + }, + { + "epoch": 0.7404941167568385, + "grad_norm": 2.304643154144287, + "learning_rate": 3.1150783828521005e-06, + "loss": 0.6445, + "step": 9786 + }, + { + "epoch": 0.7405697854791722, + "grad_norm": 2.089303731918335, + "learning_rate": 3.1133745438947643e-06, + "loss": 0.5833, + "step": 9787 + }, + { + "epoch": 0.7406454542015058, + "grad_norm": 2.256558895111084, + "learning_rate": 3.1116710735769567e-06, + "loss": 0.7369, + "step": 9788 + }, + { + "epoch": 0.7407211229238394, + "grad_norm": 2.6049532890319824, + "learning_rate": 3.109967972005349e-06, + "loss": 0.5936, + "step": 9789 + }, + { + "epoch": 0.7407967916461731, + "grad_norm": 2.2916321754455566, + "learning_rate": 3.1082652392865946e-06, + "loss": 0.6695, + "step": 9790 + }, + { + "epoch": 0.7408724603685066, + "grad_norm": 2.5603201389312744, + "learning_rate": 3.1065628755273324e-06, + "loss": 0.5951, + "step": 9791 + }, + { + "epoch": 0.7409481290908403, + "grad_norm": 2.134892225265503, + "learning_rate": 3.1048608808341624e-06, + "loss": 0.7521, + "step": 9792 + }, + { + "epoch": 0.741023797813174, + "grad_norm": 10.978889465332031, + "learning_rate": 3.103159255313671e-06, + "loss": 0.6364, + "step": 9793 + }, + { + "epoch": 0.7410994665355075, + "grad_norm": 1.8416504859924316, + "learning_rate": 3.10145799907242e-06, + "loss": 0.5399, + "step": 9794 + }, + { + "epoch": 0.7411751352578412, + "grad_norm": 2.196185350418091, + "learning_rate": 3.099757112216947e-06, + "loss": 0.6477, + "step": 9795 + }, + { + "epoch": 0.7412508039801748, + "grad_norm": 2.4861955642700195, + "learning_rate": 3.098056594853767e-06, + "loss": 0.5316, + "step": 9796 + }, + { + "epoch": 0.7413264727025084, + "grad_norm": 2.2695884704589844, + "learning_rate": 3.0963564470893736e-06, + "loss": 0.7883, + "step": 9797 + }, + { + "epoch": 0.7414021414248421, + "grad_norm": 2.2510933876037598, + "learning_rate": 3.094656669030236e-06, + "loss": 0.7622, + "step": 9798 + }, + { + "epoch": 0.7414778101471756, + "grad_norm": 2.05856990814209, + "learning_rate": 3.0929572607827946e-06, + "loss": 0.5341, + "step": 9799 + }, + { + "epoch": 0.7415534788695093, + "grad_norm": 2.074747323989868, + 
"learning_rate": 3.0912582224534737e-06, + "loss": 0.6792, + "step": 9800 + }, + { + "epoch": 0.741629147591843, + "grad_norm": 2.0920283794403076, + "learning_rate": 3.089559554148676e-06, + "loss": 0.7247, + "step": 9801 + }, + { + "epoch": 0.7417048163141765, + "grad_norm": 2.252413034439087, + "learning_rate": 3.0878612559747785e-06, + "loss": 0.6384, + "step": 9802 + }, + { + "epoch": 0.7417804850365102, + "grad_norm": 1.887231707572937, + "learning_rate": 3.0861633280381293e-06, + "loss": 0.5092, + "step": 9803 + }, + { + "epoch": 0.7418561537588437, + "grad_norm": 2.2161378860473633, + "learning_rate": 3.08446577044506e-06, + "loss": 0.8336, + "step": 9804 + }, + { + "epoch": 0.7419318224811774, + "grad_norm": 2.0824790000915527, + "learning_rate": 3.082768583301876e-06, + "loss": 0.7406, + "step": 9805 + }, + { + "epoch": 0.7420074912035111, + "grad_norm": 2.0326271057128906, + "learning_rate": 3.0810717667148635e-06, + "loss": 0.6042, + "step": 9806 + }, + { + "epoch": 0.7420831599258446, + "grad_norm": 1.864272117614746, + "learning_rate": 3.07937532079028e-06, + "loss": 0.6879, + "step": 9807 + }, + { + "epoch": 0.7421588286481783, + "grad_norm": 2.2721335887908936, + "learning_rate": 3.0776792456343648e-06, + "loss": 0.7037, + "step": 9808 + }, + { + "epoch": 0.742234497370512, + "grad_norm": 1.7374581098556519, + "learning_rate": 3.0759835413533324e-06, + "loss": 0.6843, + "step": 9809 + }, + { + "epoch": 0.7423101660928455, + "grad_norm": 2.2523179054260254, + "learning_rate": 3.0742882080533656e-06, + "loss": 0.705, + "step": 9810 + }, + { + "epoch": 0.7423858348151792, + "grad_norm": 1.5839877128601074, + "learning_rate": 3.0725932458406395e-06, + "loss": 0.7204, + "step": 9811 + }, + { + "epoch": 0.7424615035375127, + "grad_norm": 2.310640335083008, + "learning_rate": 3.0708986548212998e-06, + "loss": 0.635, + "step": 9812 + }, + { + "epoch": 0.7425371722598464, + "grad_norm": 2.1407198905944824, + "learning_rate": 3.06920443510146e-06, + "loss": 0.6782, + "step": 9813 + }, + { + "epoch": 0.7426128409821801, + "grad_norm": 2.077183246612549, + "learning_rate": 3.067510586787221e-06, + "loss": 0.7058, + "step": 9814 + }, + { + "epoch": 0.7426885097045136, + "grad_norm": 1.9576934576034546, + "learning_rate": 3.065817109984654e-06, + "loss": 0.5691, + "step": 9815 + }, + { + "epoch": 0.7427641784268473, + "grad_norm": 2.2817611694335938, + "learning_rate": 3.0641240047998196e-06, + "loss": 0.7766, + "step": 9816 + }, + { + "epoch": 0.7428398471491808, + "grad_norm": 2.1753251552581787, + "learning_rate": 3.062431271338736e-06, + "loss": 0.6152, + "step": 9817 + }, + { + "epoch": 0.7429155158715145, + "grad_norm": 2.1816024780273438, + "learning_rate": 3.0607389097074095e-06, + "loss": 0.6559, + "step": 9818 + }, + { + "epoch": 0.7429911845938482, + "grad_norm": 2.068418264389038, + "learning_rate": 3.059046920011823e-06, + "loss": 0.7292, + "step": 9819 + }, + { + "epoch": 0.7430668533161817, + "grad_norm": 2.037598133087158, + "learning_rate": 3.057355302357934e-06, + "loss": 0.5673, + "step": 9820 + }, + { + "epoch": 0.7431425220385154, + "grad_norm": 2.198431968688965, + "learning_rate": 3.055664056851677e-06, + "loss": 0.6868, + "step": 9821 + }, + { + "epoch": 0.743218190760849, + "grad_norm": 2.101435899734497, + "learning_rate": 3.0539731835989625e-06, + "loss": 0.6842, + "step": 9822 + }, + { + "epoch": 0.7432938594831826, + "grad_norm": 2.1221351623535156, + "learning_rate": 3.052282682705682e-06, + "loss": 0.6233, + "step": 9823 + }, + { + "epoch": 
0.7433695282055163, + "grad_norm": 1.9273860454559326, + "learning_rate": 3.0505925542776946e-06, + "loss": 0.6363, + "step": 9824 + }, + { + "epoch": 0.7434451969278498, + "grad_norm": 2.380946159362793, + "learning_rate": 3.048902798420844e-06, + "loss": 0.665, + "step": 9825 + }, + { + "epoch": 0.7435208656501835, + "grad_norm": 2.2486279010772705, + "learning_rate": 3.047213415240948e-06, + "loss": 0.6321, + "step": 9826 + }, + { + "epoch": 0.7435965343725172, + "grad_norm": 1.9640283584594727, + "learning_rate": 3.0455244048438014e-06, + "loss": 0.5942, + "step": 9827 + }, + { + "epoch": 0.7436722030948507, + "grad_norm": 2.507197141647339, + "learning_rate": 3.043835767335177e-06, + "loss": 0.7769, + "step": 9828 + }, + { + "epoch": 0.7437478718171844, + "grad_norm": 2.5422580242156982, + "learning_rate": 3.0421475028208205e-06, + "loss": 0.7886, + "step": 9829 + }, + { + "epoch": 0.7438235405395179, + "grad_norm": 2.2974729537963867, + "learning_rate": 3.0404596114064573e-06, + "loss": 0.4964, + "step": 9830 + }, + { + "epoch": 0.7438992092618516, + "grad_norm": 2.1887059211730957, + "learning_rate": 3.038772093197789e-06, + "loss": 0.7204, + "step": 9831 + }, + { + "epoch": 0.7439748779841853, + "grad_norm": 2.602665901184082, + "learning_rate": 3.0370849483004927e-06, + "loss": 0.5673, + "step": 9832 + }, + { + "epoch": 0.7440505467065188, + "grad_norm": 2.0209755897521973, + "learning_rate": 3.0353981768202243e-06, + "loss": 0.6575, + "step": 9833 + }, + { + "epoch": 0.7441262154288525, + "grad_norm": 2.425705671310425, + "learning_rate": 3.033711778862616e-06, + "loss": 0.6502, + "step": 9834 + }, + { + "epoch": 0.7442018841511862, + "grad_norm": 3.1738603115081787, + "learning_rate": 3.032025754533271e-06, + "loss": 0.5545, + "step": 9835 + }, + { + "epoch": 0.7442775528735197, + "grad_norm": 1.8896595239639282, + "learning_rate": 3.0303401039377725e-06, + "loss": 0.5624, + "step": 9836 + }, + { + "epoch": 0.7443532215958534, + "grad_norm": 2.21313738822937, + "learning_rate": 3.0286548271816916e-06, + "loss": 0.6534, + "step": 9837 + }, + { + "epoch": 0.7444288903181869, + "grad_norm": 2.004441499710083, + "learning_rate": 3.0269699243705555e-06, + "loss": 0.7336, + "step": 9838 + }, + { + "epoch": 0.7445045590405206, + "grad_norm": 2.018430233001709, + "learning_rate": 3.025285395609882e-06, + "loss": 0.6419, + "step": 9839 + }, + { + "epoch": 0.7445802277628543, + "grad_norm": 2.3819639682769775, + "learning_rate": 3.0236012410051617e-06, + "loss": 0.6499, + "step": 9840 + }, + { + "epoch": 0.7446558964851878, + "grad_norm": 2.396756172180176, + "learning_rate": 3.0219174606618614e-06, + "loss": 0.7293, + "step": 9841 + }, + { + "epoch": 0.7447315652075215, + "grad_norm": 1.8934662342071533, + "learning_rate": 3.0202340546854254e-06, + "loss": 0.6671, + "step": 9842 + }, + { + "epoch": 0.744807233929855, + "grad_norm": 2.2285642623901367, + "learning_rate": 3.0185510231812736e-06, + "loss": 0.5863, + "step": 9843 + }, + { + "epoch": 0.7448829026521887, + "grad_norm": 1.9590516090393066, + "learning_rate": 3.0168683662548037e-06, + "loss": 0.57, + "step": 9844 + }, + { + "epoch": 0.7449585713745224, + "grad_norm": 2.253278970718384, + "learning_rate": 3.0151860840113916e-06, + "loss": 0.6678, + "step": 9845 + }, + { + "epoch": 0.7450342400968559, + "grad_norm": 1.9592149257659912, + "learning_rate": 3.0135041765563778e-06, + "loss": 0.7633, + "step": 9846 + }, + { + "epoch": 0.7451099088191896, + "grad_norm": 2.0090091228485107, + "learning_rate": 3.011822643995098e-06, + 
"loss": 0.7217, + "step": 9847 + }, + { + "epoch": 0.7451855775415233, + "grad_norm": 2.6432924270629883, + "learning_rate": 3.0101414864328547e-06, + "loss": 0.6495, + "step": 9848 + }, + { + "epoch": 0.7452612462638568, + "grad_norm": 2.1458330154418945, + "learning_rate": 3.0084607039749234e-06, + "loss": 0.556, + "step": 9849 + }, + { + "epoch": 0.7453369149861905, + "grad_norm": 1.9538377523422241, + "learning_rate": 3.006780296726561e-06, + "loss": 0.582, + "step": 9850 + }, + { + "epoch": 0.745412583708524, + "grad_norm": 1.9687731266021729, + "learning_rate": 3.0051002647930002e-06, + "loss": 0.6966, + "step": 9851 + }, + { + "epoch": 0.7454882524308577, + "grad_norm": 2.3369882106781006, + "learning_rate": 3.0034206082794515e-06, + "loss": 0.5864, + "step": 9852 + }, + { + "epoch": 0.7455639211531914, + "grad_norm": 2.699866533279419, + "learning_rate": 3.0017413272911e-06, + "loss": 0.7418, + "step": 9853 + }, + { + "epoch": 0.7456395898755249, + "grad_norm": 1.819517731666565, + "learning_rate": 3.000062421933107e-06, + "loss": 0.5972, + "step": 9854 + }, + { + "epoch": 0.7457152585978586, + "grad_norm": 2.184372663497925, + "learning_rate": 2.9983838923106146e-06, + "loss": 0.6785, + "step": 9855 + }, + { + "epoch": 0.7457909273201923, + "grad_norm": 1.910994291305542, + "learning_rate": 2.996705738528728e-06, + "loss": 0.6254, + "step": 9856 + }, + { + "epoch": 0.7458665960425258, + "grad_norm": 5.832062244415283, + "learning_rate": 2.995027960692548e-06, + "loss": 0.5108, + "step": 9857 + }, + { + "epoch": 0.7459422647648595, + "grad_norm": 4.642062664031982, + "learning_rate": 2.9933505589071393e-06, + "loss": 0.7348, + "step": 9858 + }, + { + "epoch": 0.746017933487193, + "grad_norm": 2.5428013801574707, + "learning_rate": 2.9916735332775504e-06, + "loss": 0.6369, + "step": 9859 + }, + { + "epoch": 0.7460936022095267, + "grad_norm": 1.667399525642395, + "learning_rate": 2.989996883908794e-06, + "loss": 0.594, + "step": 9860 + }, + { + "epoch": 0.7461692709318604, + "grad_norm": 3.054075002670288, + "learning_rate": 2.9883206109058685e-06, + "loss": 0.7789, + "step": 9861 + }, + { + "epoch": 0.7462449396541939, + "grad_norm": 2.8136708736419678, + "learning_rate": 2.9866447143737572e-06, + "loss": 0.5894, + "step": 9862 + }, + { + "epoch": 0.7463206083765276, + "grad_norm": 2.113799571990967, + "learning_rate": 2.9849691944174e-06, + "loss": 0.5714, + "step": 9863 + }, + { + "epoch": 0.7463962770988611, + "grad_norm": 1.9772562980651855, + "learning_rate": 2.983294051141727e-06, + "loss": 0.6968, + "step": 9864 + }, + { + "epoch": 0.7464719458211948, + "grad_norm": 2.416429042816162, + "learning_rate": 2.9816192846516415e-06, + "loss": 0.6939, + "step": 9865 + }, + { + "epoch": 0.7465476145435285, + "grad_norm": 1.8433407545089722, + "learning_rate": 2.9799448950520247e-06, + "loss": 0.5994, + "step": 9866 + }, + { + "epoch": 0.746623283265862, + "grad_norm": 2.278648853302002, + "learning_rate": 2.978270882447723e-06, + "loss": 0.7247, + "step": 9867 + }, + { + "epoch": 0.7466989519881957, + "grad_norm": 2.0349230766296387, + "learning_rate": 2.976597246943579e-06, + "loss": 0.7355, + "step": 9868 + }, + { + "epoch": 0.7467746207105294, + "grad_norm": 1.8349040746688843, + "learning_rate": 2.974923988644401e-06, + "loss": 0.7532, + "step": 9869 + }, + { + "epoch": 0.7468502894328629, + "grad_norm": 2.4737563133239746, + "learning_rate": 2.973251107654966e-06, + "loss": 0.6121, + "step": 9870 + }, + { + "epoch": 0.7469259581551966, + "grad_norm": 2.696403741836548, + 
"learning_rate": 2.9715786040800403e-06, + "loss": 0.7659, + "step": 9871 + }, + { + "epoch": 0.7470016268775301, + "grad_norm": 2.2334213256835938, + "learning_rate": 2.969906478024358e-06, + "loss": 0.7986, + "step": 9872 + }, + { + "epoch": 0.7470772955998638, + "grad_norm": 1.8761075735092163, + "learning_rate": 2.9682347295926405e-06, + "loss": 0.6269, + "step": 9873 + }, + { + "epoch": 0.7471529643221975, + "grad_norm": 2.75467586517334, + "learning_rate": 2.9665633588895718e-06, + "loss": 0.6236, + "step": 9874 + }, + { + "epoch": 0.747228633044531, + "grad_norm": 2.219914197921753, + "learning_rate": 2.964892366019819e-06, + "loss": 0.6861, + "step": 9875 + }, + { + "epoch": 0.7473043017668647, + "grad_norm": 1.8167731761932373, + "learning_rate": 2.9632217510880267e-06, + "loss": 0.6217, + "step": 9876 + }, + { + "epoch": 0.7473799704891982, + "grad_norm": 1.8947856426239014, + "learning_rate": 2.9615515141988137e-06, + "loss": 0.65, + "step": 9877 + }, + { + "epoch": 0.7474556392115319, + "grad_norm": 2.316633701324463, + "learning_rate": 2.959881655456775e-06, + "loss": 0.7939, + "step": 9878 + }, + { + "epoch": 0.7475313079338656, + "grad_norm": 2.0706794261932373, + "learning_rate": 2.9582121749664843e-06, + "loss": 0.7122, + "step": 9879 + }, + { + "epoch": 0.7476069766561991, + "grad_norm": 2.491586446762085, + "learning_rate": 2.956543072832491e-06, + "loss": 0.6208, + "step": 9880 + }, + { + "epoch": 0.7476826453785328, + "grad_norm": 1.9248894453048706, + "learning_rate": 2.954874349159314e-06, + "loss": 0.5814, + "step": 9881 + }, + { + "epoch": 0.7477583141008665, + "grad_norm": 2.1573891639709473, + "learning_rate": 2.9532060040514544e-06, + "loss": 0.7211, + "step": 9882 + }, + { + "epoch": 0.7478339828232, + "grad_norm": 2.213338613510132, + "learning_rate": 2.9515380376133995e-06, + "loss": 0.5858, + "step": 9883 + }, + { + "epoch": 0.7479096515455337, + "grad_norm": 2.1346771717071533, + "learning_rate": 2.9498704499495923e-06, + "loss": 0.6022, + "step": 9884 + }, + { + "epoch": 0.7479853202678672, + "grad_norm": 2.2232656478881836, + "learning_rate": 2.9482032411644665e-06, + "loss": 0.5621, + "step": 9885 + }, + { + "epoch": 0.7480609889902009, + "grad_norm": 2.403750419616699, + "learning_rate": 2.946536411362427e-06, + "loss": 0.6615, + "step": 9886 + }, + { + "epoch": 0.7481366577125346, + "grad_norm": 2.009737491607666, + "learning_rate": 2.9448699606478564e-06, + "loss": 0.7192, + "step": 9887 + }, + { + "epoch": 0.7482123264348681, + "grad_norm": 1.6389305591583252, + "learning_rate": 2.943203889125114e-06, + "loss": 0.6288, + "step": 9888 + }, + { + "epoch": 0.7482879951572018, + "grad_norm": 2.2867612838745117, + "learning_rate": 2.941538196898534e-06, + "loss": 0.8133, + "step": 9889 + }, + { + "epoch": 0.7483636638795353, + "grad_norm": 3.108665704727173, + "learning_rate": 2.939872884072428e-06, + "loss": 0.5923, + "step": 9890 + }, + { + "epoch": 0.748439332601869, + "grad_norm": 1.8590794801712036, + "learning_rate": 2.9382079507510856e-06, + "loss": 0.5962, + "step": 9891 + }, + { + "epoch": 0.7485150013242027, + "grad_norm": 2.1245317459106445, + "learning_rate": 2.9365433970387614e-06, + "loss": 0.6682, + "step": 9892 + }, + { + "epoch": 0.7485906700465362, + "grad_norm": 2.0722525119781494, + "learning_rate": 2.9348792230397044e-06, + "loss": 0.616, + "step": 9893 + }, + { + "epoch": 0.7486663387688699, + "grad_norm": 2.0355629920959473, + "learning_rate": 2.9332154288581305e-06, + "loss": 0.7896, + "step": 9894 + }, + { + "epoch": 
0.7487420074912036, + "grad_norm": 2.689260244369507, + "learning_rate": 2.9315520145982257e-06, + "loss": 0.6665, + "step": 9895 + }, + { + "epoch": 0.7488176762135371, + "grad_norm": 1.7102781534194946, + "learning_rate": 2.929888980364161e-06, + "loss": 0.6444, + "step": 9896 + }, + { + "epoch": 0.7488933449358708, + "grad_norm": 2.3687448501586914, + "learning_rate": 2.9282263262600825e-06, + "loss": 0.8416, + "step": 9897 + }, + { + "epoch": 0.7489690136582043, + "grad_norm": 2.313998222351074, + "learning_rate": 2.926564052390109e-06, + "loss": 0.6892, + "step": 9898 + }, + { + "epoch": 0.749044682380538, + "grad_norm": 1.732693076133728, + "learning_rate": 2.9249021588583393e-06, + "loss": 0.6822, + "step": 9899 + }, + { + "epoch": 0.7491203511028717, + "grad_norm": 2.4626896381378174, + "learning_rate": 2.9232406457688444e-06, + "loss": 0.5485, + "step": 9900 + }, + { + "epoch": 0.7491960198252052, + "grad_norm": 2.252591133117676, + "learning_rate": 2.9215795132256786e-06, + "loss": 0.7695, + "step": 9901 + }, + { + "epoch": 0.7492716885475389, + "grad_norm": 1.9376341104507446, + "learning_rate": 2.9199187613328577e-06, + "loss": 0.6194, + "step": 9902 + }, + { + "epoch": 0.7493473572698724, + "grad_norm": 2.1779584884643555, + "learning_rate": 2.9182583901943925e-06, + "loss": 0.7618, + "step": 9903 + }, + { + "epoch": 0.7494230259922061, + "grad_norm": 2.131627321243286, + "learning_rate": 2.9165983999142577e-06, + "loss": 0.7612, + "step": 9904 + }, + { + "epoch": 0.7494986947145398, + "grad_norm": 2.43209171295166, + "learning_rate": 2.9149387905964096e-06, + "loss": 0.5466, + "step": 9905 + }, + { + "epoch": 0.7495743634368733, + "grad_norm": 2.0824105739593506, + "learning_rate": 2.9132795623447736e-06, + "loss": 0.8629, + "step": 9906 + }, + { + "epoch": 0.749650032159207, + "grad_norm": 2.1721785068511963, + "learning_rate": 2.9116207152632575e-06, + "loss": 0.6502, + "step": 9907 + }, + { + "epoch": 0.7497257008815407, + "grad_norm": 1.9518764019012451, + "learning_rate": 2.909962249455746e-06, + "loss": 0.7207, + "step": 9908 + }, + { + "epoch": 0.7498013696038742, + "grad_norm": 3.3193180561065674, + "learning_rate": 2.908304165026094e-06, + "loss": 0.7037, + "step": 9909 + }, + { + "epoch": 0.7498770383262079, + "grad_norm": 2.249847412109375, + "learning_rate": 2.906646462078139e-06, + "loss": 0.8591, + "step": 9910 + }, + { + "epoch": 0.7499527070485414, + "grad_norm": 1.7684543132781982, + "learning_rate": 2.904989140715691e-06, + "loss": 0.6102, + "step": 9911 + }, + { + "epoch": 0.7500283757708751, + "grad_norm": 2.1191229820251465, + "learning_rate": 2.9033322010425397e-06, + "loss": 0.6022, + "step": 9912 + }, + { + "epoch": 0.7501040444932088, + "grad_norm": 2.044253349304199, + "learning_rate": 2.901675643162439e-06, + "loss": 0.6631, + "step": 9913 + }, + { + "epoch": 0.7501797132155423, + "grad_norm": 2.230672597885132, + "learning_rate": 2.9000194671791366e-06, + "loss": 0.7228, + "step": 9914 + }, + { + "epoch": 0.750255381937876, + "grad_norm": 4.245325088500977, + "learning_rate": 2.898363673196348e-06, + "loss": 0.7393, + "step": 9915 + }, + { + "epoch": 0.7503310506602096, + "grad_norm": 2.3195321559906006, + "learning_rate": 2.896708261317758e-06, + "loss": 0.5678, + "step": 9916 + }, + { + "epoch": 0.7504067193825432, + "grad_norm": 2.411245346069336, + "learning_rate": 2.8950532316470373e-06, + "loss": 0.7304, + "step": 9917 + }, + { + "epoch": 0.7504823881048769, + "grad_norm": 3.026913642883301, + "learning_rate": 2.893398584287826e-06, + "loss": 
0.6865, + "step": 9918 + }, + { + "epoch": 0.7505580568272104, + "grad_norm": 1.9332554340362549, + "learning_rate": 2.8917443193437524e-06, + "loss": 0.6483, + "step": 9919 + }, + { + "epoch": 0.7506337255495441, + "grad_norm": 1.6384657621383667, + "learning_rate": 2.890090436918403e-06, + "loss": 0.659, + "step": 9920 + }, + { + "epoch": 0.7507093942718778, + "grad_norm": 2.61690354347229, + "learning_rate": 2.888436937115353e-06, + "loss": 0.6423, + "step": 9921 + }, + { + "epoch": 0.7507850629942113, + "grad_norm": 1.9042266607284546, + "learning_rate": 2.886783820038149e-06, + "loss": 0.5829, + "step": 9922 + }, + { + "epoch": 0.750860731716545, + "grad_norm": 2.1988930702209473, + "learning_rate": 2.885131085790314e-06, + "loss": 0.6954, + "step": 9923 + }, + { + "epoch": 0.7509364004388785, + "grad_norm": 2.5003511905670166, + "learning_rate": 2.8834787344753483e-06, + "loss": 0.5322, + "step": 9924 + }, + { + "epoch": 0.7510120691612122, + "grad_norm": 2.1539666652679443, + "learning_rate": 2.8818267661967285e-06, + "loss": 0.7318, + "step": 9925 + }, + { + "epoch": 0.7510877378835459, + "grad_norm": 1.925028681755066, + "learning_rate": 2.8801751810579074e-06, + "loss": 0.6704, + "step": 9926 + }, + { + "epoch": 0.7511634066058794, + "grad_norm": 1.9884802103042603, + "learning_rate": 2.8785239791623075e-06, + "loss": 0.6807, + "step": 9927 + }, + { + "epoch": 0.7512390753282131, + "grad_norm": 2.154848575592041, + "learning_rate": 2.8768731606133323e-06, + "loss": 0.6473, + "step": 9928 + }, + { + "epoch": 0.7513147440505467, + "grad_norm": 2.2960104942321777, + "learning_rate": 2.8752227255143707e-06, + "loss": 0.6503, + "step": 9929 + }, + { + "epoch": 0.7513904127728803, + "grad_norm": 2.1243772506713867, + "learning_rate": 2.873572673968768e-06, + "loss": 0.9263, + "step": 9930 + }, + { + "epoch": 0.751466081495214, + "grad_norm": 4.902968406677246, + "learning_rate": 2.8719230060798606e-06, + "loss": 0.7779, + "step": 9931 + }, + { + "epoch": 0.7515417502175475, + "grad_norm": 2.171704053878784, + "learning_rate": 2.870273721950955e-06, + "loss": 0.6809, + "step": 9932 + }, + { + "epoch": 0.7516174189398812, + "grad_norm": 2.409769296646118, + "learning_rate": 2.868624821685335e-06, + "loss": 0.6696, + "step": 9933 + }, + { + "epoch": 0.7516930876622149, + "grad_norm": 2.15291690826416, + "learning_rate": 2.8669763053862595e-06, + "loss": 0.6879, + "step": 9934 + }, + { + "epoch": 0.7517687563845484, + "grad_norm": 2.0499536991119385, + "learning_rate": 2.8653281731569645e-06, + "loss": 0.5733, + "step": 9935 + }, + { + "epoch": 0.7518444251068821, + "grad_norm": 1.5915296077728271, + "learning_rate": 2.8636804251006612e-06, + "loss": 0.593, + "step": 9936 + }, + { + "epoch": 0.7519200938292157, + "grad_norm": 2.1210434436798096, + "learning_rate": 2.862033061320541e-06, + "loss": 0.5076, + "step": 9937 + }, + { + "epoch": 0.7519957625515493, + "grad_norm": 2.2604899406433105, + "learning_rate": 2.8603860819197558e-06, + "loss": 0.6894, + "step": 9938 + }, + { + "epoch": 0.752071431273883, + "grad_norm": 2.7228496074676514, + "learning_rate": 2.8587394870014557e-06, + "loss": 0.7777, + "step": 9939 + }, + { + "epoch": 0.7521470999962165, + "grad_norm": 1.9101241827011108, + "learning_rate": 2.857093276668755e-06, + "loss": 0.6761, + "step": 9940 + }, + { + "epoch": 0.7522227687185502, + "grad_norm": 1.973021388053894, + "learning_rate": 2.8554474510247377e-06, + "loss": 0.4929, + "step": 9941 + }, + { + "epoch": 0.7522984374408838, + "grad_norm": 1.9737138748168945, + 
"learning_rate": 2.8538020101724762e-06, + "loss": 0.6213, + "step": 9942 + }, + { + "epoch": 0.7523741061632174, + "grad_norm": 2.246549606323242, + "learning_rate": 2.852156954215012e-06, + "loss": 0.7567, + "step": 9943 + }, + { + "epoch": 0.7524497748855511, + "grad_norm": 1.7363240718841553, + "learning_rate": 2.850512283255364e-06, + "loss": 0.7059, + "step": 9944 + }, + { + "epoch": 0.7525254436078846, + "grad_norm": 2.246518135070801, + "learning_rate": 2.8488679973965264e-06, + "loss": 0.8108, + "step": 9945 + }, + { + "epoch": 0.7526011123302183, + "grad_norm": 1.421762466430664, + "learning_rate": 2.84722409674147e-06, + "loss": 0.7699, + "step": 9946 + }, + { + "epoch": 0.752676781052552, + "grad_norm": 2.417525291442871, + "learning_rate": 2.8455805813931415e-06, + "loss": 0.6468, + "step": 9947 + }, + { + "epoch": 0.7527524497748855, + "grad_norm": 2.013603687286377, + "learning_rate": 2.8439374514544645e-06, + "loss": 0.7207, + "step": 9948 + }, + { + "epoch": 0.7528281184972192, + "grad_norm": 2.3147010803222656, + "learning_rate": 2.8422947070283305e-06, + "loss": 0.6962, + "step": 9949 + }, + { + "epoch": 0.7529037872195528, + "grad_norm": 2.1023120880126953, + "learning_rate": 2.840652348217622e-06, + "loss": 0.7563, + "step": 9950 + }, + { + "epoch": 0.7529794559418864, + "grad_norm": 2.0633652210235596, + "learning_rate": 2.8390103751251867e-06, + "loss": 0.5911, + "step": 9951 + }, + { + "epoch": 0.7530551246642201, + "grad_norm": 2.3509156703948975, + "learning_rate": 2.8373687878538466e-06, + "loss": 0.7062, + "step": 9952 + }, + { + "epoch": 0.7531307933865536, + "grad_norm": 2.152987480163574, + "learning_rate": 2.8357275865064056e-06, + "loss": 0.6786, + "step": 9953 + }, + { + "epoch": 0.7532064621088873, + "grad_norm": 2.2282655239105225, + "learning_rate": 2.834086771185641e-06, + "loss": 0.648, + "step": 9954 + }, + { + "epoch": 0.7532821308312209, + "grad_norm": 3.348574638366699, + "learning_rate": 2.8324463419943045e-06, + "loss": 0.6576, + "step": 9955 + }, + { + "epoch": 0.7533577995535545, + "grad_norm": 1.9638891220092773, + "learning_rate": 2.8308062990351275e-06, + "loss": 0.6466, + "step": 9956 + }, + { + "epoch": 0.7534334682758882, + "grad_norm": 2.2350552082061768, + "learning_rate": 2.8291666424108125e-06, + "loss": 0.6518, + "step": 9957 + }, + { + "epoch": 0.7535091369982218, + "grad_norm": 1.747753620147705, + "learning_rate": 2.827527372224046e-06, + "loss": 0.756, + "step": 9958 + }, + { + "epoch": 0.7535848057205554, + "grad_norm": 2.364982843399048, + "learning_rate": 2.8258884885774716e-06, + "loss": 0.8052, + "step": 9959 + }, + { + "epoch": 0.7536604744428891, + "grad_norm": 1.823944330215454, + "learning_rate": 2.8242499915737346e-06, + "loss": 0.5839, + "step": 9960 + }, + { + "epoch": 0.7537361431652226, + "grad_norm": 1.797965168952942, + "learning_rate": 2.822611881315437e-06, + "loss": 0.6997, + "step": 9961 + }, + { + "epoch": 0.7538118118875563, + "grad_norm": 3.2685093879699707, + "learning_rate": 2.8209741579051656e-06, + "loss": 0.8114, + "step": 9962 + }, + { + "epoch": 0.7538874806098899, + "grad_norm": 2.2074337005615234, + "learning_rate": 2.8193368214454753e-06, + "loss": 0.6622, + "step": 9963 + }, + { + "epoch": 0.7539631493322235, + "grad_norm": 3.020928382873535, + "learning_rate": 2.8176998720389014e-06, + "loss": 0.7776, + "step": 9964 + }, + { + "epoch": 0.7540388180545572, + "grad_norm": 2.763895034790039, + "learning_rate": 2.816063309787964e-06, + "loss": 0.7328, + "step": 9965 + }, + { + "epoch": 
0.7541144867768907, + "grad_norm": 2.075176477432251, + "learning_rate": 2.8144271347951395e-06, + "loss": 0.6773, + "step": 9966 + }, + { + "epoch": 0.7541901554992244, + "grad_norm": 2.109731912612915, + "learning_rate": 2.8127913471628942e-06, + "loss": 0.5774, + "step": 9967 + }, + { + "epoch": 0.754265824221558, + "grad_norm": 1.6521979570388794, + "learning_rate": 2.811155946993668e-06, + "loss": 0.7379, + "step": 9968 + }, + { + "epoch": 0.7543414929438916, + "grad_norm": 2.2940516471862793, + "learning_rate": 2.809520934389872e-06, + "loss": 0.611, + "step": 9969 + }, + { + "epoch": 0.7544171616662253, + "grad_norm": 3.290804862976074, + "learning_rate": 2.8078863094538983e-06, + "loss": 0.678, + "step": 9970 + }, + { + "epoch": 0.7544928303885589, + "grad_norm": 1.9227198362350464, + "learning_rate": 2.8062520722881114e-06, + "loss": 0.659, + "step": 9971 + }, + { + "epoch": 0.7545684991108925, + "grad_norm": 2.2773373126983643, + "learning_rate": 2.8046182229948555e-06, + "loss": 0.5751, + "step": 9972 + }, + { + "epoch": 0.7546441678332262, + "grad_norm": 2.041769504547119, + "learning_rate": 2.802984761676443e-06, + "loss": 0.6616, + "step": 9973 + }, + { + "epoch": 0.7547198365555597, + "grad_norm": 2.143829107284546, + "learning_rate": 2.8013516884351637e-06, + "loss": 0.7292, + "step": 9974 + }, + { + "epoch": 0.7547955052778934, + "grad_norm": 1.9663937091827393, + "learning_rate": 2.7997190033732943e-06, + "loss": 0.5109, + "step": 9975 + }, + { + "epoch": 0.754871174000227, + "grad_norm": 2.2332675457000732, + "learning_rate": 2.7980867065930774e-06, + "loss": 0.6587, + "step": 9976 + }, + { + "epoch": 0.7549468427225606, + "grad_norm": 2.470423460006714, + "learning_rate": 2.796454798196729e-06, + "loss": 0.5856, + "step": 9977 + }, + { + "epoch": 0.7550225114448943, + "grad_norm": 2.361738681793213, + "learning_rate": 2.7948232782864444e-06, + "loss": 0.6109, + "step": 9978 + }, + { + "epoch": 0.7550981801672279, + "grad_norm": 2.0614898204803467, + "learning_rate": 2.793192146964397e-06, + "loss": 0.5809, + "step": 9979 + }, + { + "epoch": 0.7551738488895615, + "grad_norm": 2.270296096801758, + "learning_rate": 2.791561404332731e-06, + "loss": 0.6174, + "step": 9980 + }, + { + "epoch": 0.7552495176118951, + "grad_norm": 7.146130561828613, + "learning_rate": 2.7899310504935724e-06, + "loss": 0.5779, + "step": 9981 + }, + { + "epoch": 0.7553251863342287, + "grad_norm": 2.3047327995300293, + "learning_rate": 2.788301085549016e-06, + "loss": 0.7432, + "step": 9982 + }, + { + "epoch": 0.7554008550565624, + "grad_norm": 2.4408271312713623, + "learning_rate": 2.78667150960114e-06, + "loss": 0.6569, + "step": 9983 + }, + { + "epoch": 0.755476523778896, + "grad_norm": 2.420197010040283, + "learning_rate": 2.785042322751987e-06, + "loss": 0.8204, + "step": 9984 + }, + { + "epoch": 0.7555521925012296, + "grad_norm": 5.947091579437256, + "learning_rate": 2.7834135251035825e-06, + "loss": 0.7756, + "step": 9985 + }, + { + "epoch": 0.7556278612235633, + "grad_norm": 2.492147445678711, + "learning_rate": 2.781785116757936e-06, + "loss": 0.6647, + "step": 9986 + }, + { + "epoch": 0.7557035299458968, + "grad_norm": 2.13313889503479, + "learning_rate": 2.780157097817015e-06, + "loss": 0.7473, + "step": 9987 + }, + { + "epoch": 0.7557791986682305, + "grad_norm": 7.684272289276123, + "learning_rate": 2.778529468382774e-06, + "loss": 0.6594, + "step": 9988 + }, + { + "epoch": 0.7558548673905641, + "grad_norm": 1.934888482093811, + "learning_rate": 2.7769022285571394e-06, + "loss": 0.5884, 
+ "step": 9989 + }, + { + "epoch": 0.7559305361128977, + "grad_norm": 2.01039457321167, + "learning_rate": 2.7752753784420167e-06, + "loss": 0.6648, + "step": 9990 + }, + { + "epoch": 0.7560062048352314, + "grad_norm": 2.073888063430786, + "learning_rate": 2.7736489181392825e-06, + "loss": 0.6572, + "step": 9991 + }, + { + "epoch": 0.756081873557565, + "grad_norm": 1.679289698600769, + "learning_rate": 2.772022847750791e-06, + "loss": 0.8544, + "step": 9992 + }, + { + "epoch": 0.7561575422798986, + "grad_norm": 1.9221335649490356, + "learning_rate": 2.7703971673783728e-06, + "loss": 0.7504, + "step": 9993 + }, + { + "epoch": 0.7562332110022322, + "grad_norm": 2.1600656509399414, + "learning_rate": 2.768771877123836e-06, + "loss": 0.7859, + "step": 9994 + }, + { + "epoch": 0.7563088797245658, + "grad_norm": 2.2807884216308594, + "learning_rate": 2.7671469770889522e-06, + "loss": 0.6178, + "step": 9995 + }, + { + "epoch": 0.7563845484468995, + "grad_norm": 2.230642557144165, + "learning_rate": 2.765522467375487e-06, + "loss": 0.5125, + "step": 9996 + }, + { + "epoch": 0.7564602171692331, + "grad_norm": 1.8637601137161255, + "learning_rate": 2.7638983480851724e-06, + "loss": 0.6613, + "step": 9997 + }, + { + "epoch": 0.7565358858915667, + "grad_norm": 2.0826594829559326, + "learning_rate": 2.7622746193197115e-06, + "loss": 0.8624, + "step": 9998 + }, + { + "epoch": 0.7566115546139004, + "grad_norm": 2.7239291667938232, + "learning_rate": 2.7606512811807885e-06, + "loss": 0.7223, + "step": 9999 + }, + { + "epoch": 0.756687223336234, + "grad_norm": 1.7906194925308228, + "learning_rate": 2.7590283337700626e-06, + "loss": 0.8105, + "step": 10000 + }, + { + "epoch": 0.7567628920585676, + "grad_norm": 2.736328125, + "learning_rate": 2.757405777189168e-06, + "loss": 0.8085, + "step": 10001 + }, + { + "epoch": 0.7568385607809012, + "grad_norm": 3.0120749473571777, + "learning_rate": 2.7557836115397153e-06, + "loss": 0.5936, + "step": 10002 + }, + { + "epoch": 0.7569142295032348, + "grad_norm": 2.22469425201416, + "learning_rate": 2.754161836923289e-06, + "loss": 0.4523, + "step": 10003 + }, + { + "epoch": 0.7569898982255685, + "grad_norm": 2.1429076194763184, + "learning_rate": 2.7525404534414494e-06, + "loss": 0.5562, + "step": 10004 + }, + { + "epoch": 0.7570655669479021, + "grad_norm": 2.4758174419403076, + "learning_rate": 2.750919461195734e-06, + "loss": 0.6039, + "step": 10005 + }, + { + "epoch": 0.7571412356702357, + "grad_norm": 1.938726782798767, + "learning_rate": 2.749298860287653e-06, + "loss": 0.6375, + "step": 10006 + }, + { + "epoch": 0.7572169043925693, + "grad_norm": 2.25929594039917, + "learning_rate": 2.7476786508186953e-06, + "loss": 0.7259, + "step": 10007 + }, + { + "epoch": 0.757292573114903, + "grad_norm": 1.9672093391418457, + "learning_rate": 2.7460588328903265e-06, + "loss": 0.6031, + "step": 10008 + }, + { + "epoch": 0.7573682418372366, + "grad_norm": 2.4729254245758057, + "learning_rate": 2.7444394066039776e-06, + "loss": 0.6826, + "step": 10009 + }, + { + "epoch": 0.7574439105595702, + "grad_norm": 2.6728367805480957, + "learning_rate": 2.742820372061063e-06, + "loss": 0.6732, + "step": 10010 + }, + { + "epoch": 0.7575195792819038, + "grad_norm": 2.688096523284912, + "learning_rate": 2.7412017293629802e-06, + "loss": 0.7156, + "step": 10011 + }, + { + "epoch": 0.7575952480042375, + "grad_norm": 3.0258054733276367, + "learning_rate": 2.7395834786110872e-06, + "loss": 0.5589, + "step": 10012 + }, + { + "epoch": 0.757670916726571, + "grad_norm": 1.7880308628082275, + 
"learning_rate": 2.7379656199067244e-06, + "loss": 0.6092, + "step": 10013 + }, + { + "epoch": 0.7577465854489047, + "grad_norm": 2.20332407951355, + "learning_rate": 2.736348153351208e-06, + "loss": 0.9061, + "step": 10014 + }, + { + "epoch": 0.7578222541712383, + "grad_norm": 1.8708069324493408, + "learning_rate": 2.73473107904583e-06, + "loss": 0.6117, + "step": 10015 + }, + { + "epoch": 0.7578979228935719, + "grad_norm": 2.2508816719055176, + "learning_rate": 2.7331143970918554e-06, + "loss": 0.8897, + "step": 10016 + }, + { + "epoch": 0.7579735916159056, + "grad_norm": 2.168473243713379, + "learning_rate": 2.7314981075905277e-06, + "loss": 0.6392, + "step": 10017 + }, + { + "epoch": 0.7580492603382392, + "grad_norm": 1.8958176374435425, + "learning_rate": 2.729882210643066e-06, + "loss": 0.631, + "step": 10018 + }, + { + "epoch": 0.7581249290605728, + "grad_norm": 2.8258678913116455, + "learning_rate": 2.7282667063506567e-06, + "loss": 0.6969, + "step": 10019 + }, + { + "epoch": 0.7582005977829064, + "grad_norm": 1.941758155822754, + "learning_rate": 2.7266515948144726e-06, + "loss": 0.6123, + "step": 10020 + }, + { + "epoch": 0.75827626650524, + "grad_norm": 2.0148000717163086, + "learning_rate": 2.7250368761356524e-06, + "loss": 0.7288, + "step": 10021 + }, + { + "epoch": 0.7583519352275737, + "grad_norm": 2.3907501697540283, + "learning_rate": 2.723422550415325e-06, + "loss": 0.7616, + "step": 10022 + }, + { + "epoch": 0.7584276039499073, + "grad_norm": 2.1037983894348145, + "learning_rate": 2.7218086177545744e-06, + "loss": 0.6279, + "step": 10023 + }, + { + "epoch": 0.7585032726722409, + "grad_norm": 1.8898531198501587, + "learning_rate": 2.7201950782544758e-06, + "loss": 0.6425, + "step": 10024 + }, + { + "epoch": 0.7585789413945746, + "grad_norm": 2.113624334335327, + "learning_rate": 2.7185819320160714e-06, + "loss": 0.6748, + "step": 10025 + }, + { + "epoch": 0.7586546101169082, + "grad_norm": 2.033940553665161, + "learning_rate": 2.7169691791403844e-06, + "loss": 0.5771, + "step": 10026 + }, + { + "epoch": 0.7587302788392418, + "grad_norm": 2.1298940181732178, + "learning_rate": 2.715356819728408e-06, + "loss": 0.7522, + "step": 10027 + }, + { + "epoch": 0.7588059475615754, + "grad_norm": 2.124619960784912, + "learning_rate": 2.7137448538811158e-06, + "loss": 0.6404, + "step": 10028 + }, + { + "epoch": 0.758881616283909, + "grad_norm": 1.9840469360351562, + "learning_rate": 2.712133281699454e-06, + "loss": 0.5566, + "step": 10029 + }, + { + "epoch": 0.7589572850062427, + "grad_norm": 2.5573880672454834, + "learning_rate": 2.710522103284342e-06, + "loss": 0.7079, + "step": 10030 + }, + { + "epoch": 0.7590329537285763, + "grad_norm": 2.176572561264038, + "learning_rate": 2.7089113187366758e-06, + "loss": 0.5414, + "step": 10031 + }, + { + "epoch": 0.7591086224509099, + "grad_norm": 2.228116273880005, + "learning_rate": 2.7073009281573362e-06, + "loss": 0.7058, + "step": 10032 + }, + { + "epoch": 0.7591842911732435, + "grad_norm": 2.373213052749634, + "learning_rate": 2.705690931647162e-06, + "loss": 0.7606, + "step": 10033 + }, + { + "epoch": 0.7592599598955772, + "grad_norm": 1.8033760786056519, + "learning_rate": 2.704081329306981e-06, + "loss": 0.716, + "step": 10034 + }, + { + "epoch": 0.7593356286179108, + "grad_norm": 2.362074375152588, + "learning_rate": 2.70247212123759e-06, + "loss": 0.7064, + "step": 10035 + }, + { + "epoch": 0.7594112973402444, + "grad_norm": 2.1984481811523438, + "learning_rate": 2.700863307539763e-06, + "loss": 0.7345, + "step": 10036 + }, + { + 
"epoch": 0.759486966062578, + "grad_norm": 3.2609875202178955, + "learning_rate": 2.699254888314251e-06, + "loss": 0.7909, + "step": 10037 + }, + { + "epoch": 0.7595626347849117, + "grad_norm": 1.9195688962936401, + "learning_rate": 2.697646863661776e-06, + "loss": 0.6074, + "step": 10038 + }, + { + "epoch": 0.7596383035072453, + "grad_norm": 2.3153951168060303, + "learning_rate": 2.6960392336830385e-06, + "loss": 0.7519, + "step": 10039 + }, + { + "epoch": 0.7597139722295789, + "grad_norm": 2.753959894180298, + "learning_rate": 2.6944319984787166e-06, + "loss": 0.6649, + "step": 10040 + }, + { + "epoch": 0.7597896409519125, + "grad_norm": 2.4450736045837402, + "learning_rate": 2.692825158149452e-06, + "loss": 0.7087, + "step": 10041 + }, + { + "epoch": 0.7598653096742461, + "grad_norm": 1.9317896366119385, + "learning_rate": 2.691218712795879e-06, + "loss": 0.5861, + "step": 10042 + }, + { + "epoch": 0.7599409783965798, + "grad_norm": 2.266354560852051, + "learning_rate": 2.689612662518598e-06, + "loss": 0.6434, + "step": 10043 + }, + { + "epoch": 0.7600166471189134, + "grad_norm": 1.8731240034103394, + "learning_rate": 2.6880070074181794e-06, + "loss": 0.5763, + "step": 10044 + }, + { + "epoch": 0.760092315841247, + "grad_norm": 2.1654038429260254, + "learning_rate": 2.6864017475951778e-06, + "loss": 0.5699, + "step": 10045 + }, + { + "epoch": 0.7601679845635806, + "grad_norm": 2.76196026802063, + "learning_rate": 2.6847968831501187e-06, + "loss": 0.6435, + "step": 10046 + }, + { + "epoch": 0.7602436532859143, + "grad_norm": 2.265397548675537, + "learning_rate": 2.6831924141835052e-06, + "loss": 0.6767, + "step": 10047 + }, + { + "epoch": 0.7603193220082479, + "grad_norm": 1.6206222772598267, + "learning_rate": 2.6815883407958136e-06, + "loss": 0.6914, + "step": 10048 + }, + { + "epoch": 0.7603949907305815, + "grad_norm": 2.7728967666625977, + "learning_rate": 2.6799846630874965e-06, + "loss": 0.7587, + "step": 10049 + }, + { + "epoch": 0.7604706594529151, + "grad_norm": 2.363548517227173, + "learning_rate": 2.678381381158981e-06, + "loss": 0.7465, + "step": 10050 + }, + { + "epoch": 0.7605463281752488, + "grad_norm": 1.4986516237258911, + "learning_rate": 2.67677849511067e-06, + "loss": 0.5781, + "step": 10051 + }, + { + "epoch": 0.7606219968975824, + "grad_norm": 2.2090630531311035, + "learning_rate": 2.6751760050429415e-06, + "loss": 0.7364, + "step": 10052 + }, + { + "epoch": 0.760697665619916, + "grad_norm": 2.0763497352600098, + "learning_rate": 2.673573911056148e-06, + "loss": 0.9323, + "step": 10053 + }, + { + "epoch": 0.7607733343422496, + "grad_norm": 1.6616204977035522, + "learning_rate": 2.6719722132506225e-06, + "loss": 0.6069, + "step": 10054 + }, + { + "epoch": 0.7608490030645833, + "grad_norm": 2.090778112411499, + "learning_rate": 2.67037091172666e-06, + "loss": 0.676, + "step": 10055 + }, + { + "epoch": 0.7609246717869169, + "grad_norm": 1.8353502750396729, + "learning_rate": 2.6687700065845417e-06, + "loss": 0.4939, + "step": 10056 + }, + { + "epoch": 0.7610003405092505, + "grad_norm": 1.8233598470687866, + "learning_rate": 2.667169497924528e-06, + "loss": 0.5752, + "step": 10057 + }, + { + "epoch": 0.7610760092315841, + "grad_norm": 1.976509928703308, + "learning_rate": 2.6655693858468413e-06, + "loss": 0.6282, + "step": 10058 + }, + { + "epoch": 0.7611516779539177, + "grad_norm": 3.537311553955078, + "learning_rate": 2.6639696704516876e-06, + "loss": 0.7171, + "step": 10059 + }, + { + "epoch": 0.7612273466762514, + "grad_norm": 2.408458709716797, + "learning_rate": 
2.6623703518392456e-06, + "loss": 0.733, + "step": 10060 + }, + { + "epoch": 0.761303015398585, + "grad_norm": 1.9443494081497192, + "learning_rate": 2.6607714301096737e-06, + "loss": 0.6078, + "step": 10061 + }, + { + "epoch": 0.7613786841209186, + "grad_norm": 2.5981321334838867, + "learning_rate": 2.659172905363094e-06, + "loss": 0.7041, + "step": 10062 + }, + { + "epoch": 0.7614543528432522, + "grad_norm": 2.7510721683502197, + "learning_rate": 2.657574777699617e-06, + "loss": 0.7332, + "step": 10063 + }, + { + "epoch": 0.7615300215655859, + "grad_norm": 2.1146605014801025, + "learning_rate": 2.6559770472193217e-06, + "loss": 0.6687, + "step": 10064 + }, + { + "epoch": 0.7616056902879195, + "grad_norm": 2.1599109172821045, + "learning_rate": 2.654379714022266e-06, + "loss": 0.6745, + "step": 10065 + }, + { + "epoch": 0.7616813590102531, + "grad_norm": 2.144578695297241, + "learning_rate": 2.6527827782084733e-06, + "loss": 0.6877, + "step": 10066 + }, + { + "epoch": 0.7617570277325867, + "grad_norm": 3.8134841918945312, + "learning_rate": 2.6511862398779495e-06, + "loss": 0.6743, + "step": 10067 + }, + { + "epoch": 0.7618326964549204, + "grad_norm": 1.7415173053741455, + "learning_rate": 2.6495900991306847e-06, + "loss": 0.6796, + "step": 10068 + }, + { + "epoch": 0.761908365177254, + "grad_norm": 2.2260866165161133, + "learning_rate": 2.647994356066624e-06, + "loss": 0.7098, + "step": 10069 + }, + { + "epoch": 0.7619840338995876, + "grad_norm": 2.0269641876220703, + "learning_rate": 2.6463990107857016e-06, + "loss": 0.7222, + "step": 10070 + }, + { + "epoch": 0.7620597026219212, + "grad_norm": 2.0193490982055664, + "learning_rate": 2.6448040633878226e-06, + "loss": 0.6655, + "step": 10071 + }, + { + "epoch": 0.7621353713442548, + "grad_norm": 2.8648266792297363, + "learning_rate": 2.6432095139728695e-06, + "loss": 0.6558, + "step": 10072 + }, + { + "epoch": 0.7622110400665885, + "grad_norm": 2.253610610961914, + "learning_rate": 2.641615362640696e-06, + "loss": 0.5851, + "step": 10073 + }, + { + "epoch": 0.7622867087889221, + "grad_norm": 2.2244420051574707, + "learning_rate": 2.6400216094911348e-06, + "loss": 0.6313, + "step": 10074 + }, + { + "epoch": 0.7623623775112557, + "grad_norm": 2.7857227325439453, + "learning_rate": 2.638428254623993e-06, + "loss": 0.7529, + "step": 10075 + }, + { + "epoch": 0.7624380462335894, + "grad_norm": 1.7802857160568237, + "learning_rate": 2.636835298139048e-06, + "loss": 0.616, + "step": 10076 + }, + { + "epoch": 0.762513714955923, + "grad_norm": 2.1480860710144043, + "learning_rate": 2.635242740136054e-06, + "loss": 0.6676, + "step": 10077 + }, + { + "epoch": 0.7625893836782566, + "grad_norm": 1.7942516803741455, + "learning_rate": 2.6336505807147486e-06, + "loss": 0.6174, + "step": 10078 + }, + { + "epoch": 0.7626650524005902, + "grad_norm": 1.9763191938400269, + "learning_rate": 2.6320588199748383e-06, + "loss": 0.7353, + "step": 10079 + }, + { + "epoch": 0.7627407211229238, + "grad_norm": 1.8652597665786743, + "learning_rate": 2.6304674580159983e-06, + "loss": 0.7269, + "step": 10080 + }, + { + "epoch": 0.7628163898452575, + "grad_norm": 1.8752639293670654, + "learning_rate": 2.628876494937888e-06, + "loss": 0.6001, + "step": 10081 + }, + { + "epoch": 0.7628920585675911, + "grad_norm": 1.8885382413864136, + "learning_rate": 2.6272859308401375e-06, + "loss": 0.6663, + "step": 10082 + }, + { + "epoch": 0.7629677272899247, + "grad_norm": 2.4886248111724854, + "learning_rate": 2.6256957658223537e-06, + "loss": 0.7086, + "step": 10083 + }, + { + 
"epoch": 0.7630433960122583, + "grad_norm": 2.2221665382385254, + "learning_rate": 2.6241059999841183e-06, + "loss": 0.6899, + "step": 10084 + }, + { + "epoch": 0.7631190647345919, + "grad_norm": 1.9292700290679932, + "learning_rate": 2.6225166334249877e-06, + "loss": 0.8171, + "step": 10085 + }, + { + "epoch": 0.7631947334569256, + "grad_norm": 2.5671818256378174, + "learning_rate": 2.620927666244496e-06, + "loss": 0.6218, + "step": 10086 + }, + { + "epoch": 0.7632704021792592, + "grad_norm": 2.0714309215545654, + "learning_rate": 2.6193390985421403e-06, + "loss": 0.6615, + "step": 10087 + }, + { + "epoch": 0.7633460709015928, + "grad_norm": 1.8017868995666504, + "learning_rate": 2.6177509304174105e-06, + "loss": 0.6723, + "step": 10088 + }, + { + "epoch": 0.7634217396239265, + "grad_norm": 1.793261170387268, + "learning_rate": 2.616163161969762e-06, + "loss": 0.5502, + "step": 10089 + }, + { + "epoch": 0.7634974083462601, + "grad_norm": 2.2344236373901367, + "learning_rate": 2.614575793298622e-06, + "loss": 0.6795, + "step": 10090 + }, + { + "epoch": 0.7635730770685937, + "grad_norm": 1.8927500247955322, + "learning_rate": 2.612988824503399e-06, + "loss": 0.8756, + "step": 10091 + }, + { + "epoch": 0.7636487457909273, + "grad_norm": 3.597015142440796, + "learning_rate": 2.6114022556834717e-06, + "loss": 0.6986, + "step": 10092 + }, + { + "epoch": 0.7637244145132609, + "grad_norm": 2.1370482444763184, + "learning_rate": 2.6098160869382026e-06, + "loss": 0.6671, + "step": 10093 + }, + { + "epoch": 0.7638000832355946, + "grad_norm": 2.2916955947875977, + "learning_rate": 2.6082303183669164e-06, + "loss": 0.5868, + "step": 10094 + }, + { + "epoch": 0.7638757519579282, + "grad_norm": 1.7020882368087769, + "learning_rate": 2.606644950068921e-06, + "loss": 0.5157, + "step": 10095 + }, + { + "epoch": 0.7639514206802618, + "grad_norm": 1.9924485683441162, + "learning_rate": 2.6050599821434985e-06, + "loss": 0.6293, + "step": 10096 + }, + { + "epoch": 0.7640270894025954, + "grad_norm": 2.115015983581543, + "learning_rate": 2.603475414689905e-06, + "loss": 0.7781, + "step": 10097 + }, + { + "epoch": 0.7641027581249291, + "grad_norm": 2.8572585582733154, + "learning_rate": 2.6018912478073657e-06, + "loss": 0.5423, + "step": 10098 + }, + { + "epoch": 0.7641784268472627, + "grad_norm": 1.8709588050842285, + "learning_rate": 2.600307481595092e-06, + "loss": 0.7225, + "step": 10099 + }, + { + "epoch": 0.7642540955695963, + "grad_norm": 2.634243965148926, + "learning_rate": 2.5987241161522665e-06, + "loss": 0.837, + "step": 10100 + }, + { + "epoch": 0.7643297642919299, + "grad_norm": 1.8500239849090576, + "learning_rate": 2.597141151578038e-06, + "loss": 0.6693, + "step": 10101 + }, + { + "epoch": 0.7644054330142636, + "grad_norm": 1.8766469955444336, + "learning_rate": 2.5955585879715396e-06, + "loss": 0.6458, + "step": 10102 + }, + { + "epoch": 0.7644811017365972, + "grad_norm": 2.2552802562713623, + "learning_rate": 2.5939764254318767e-06, + "loss": 0.7068, + "step": 10103 + }, + { + "epoch": 0.7645567704589308, + "grad_norm": 2.3717575073242188, + "learning_rate": 2.5923946640581307e-06, + "loss": 0.685, + "step": 10104 + }, + { + "epoch": 0.7646324391812644, + "grad_norm": 2.0326504707336426, + "learning_rate": 2.590813303949355e-06, + "loss": 0.8291, + "step": 10105 + }, + { + "epoch": 0.764708107903598, + "grad_norm": 2.752199172973633, + "learning_rate": 2.5892323452045797e-06, + "loss": 0.7143, + "step": 10106 + }, + { + "epoch": 0.7647837766259317, + "grad_norm": 2.53975510597229, + 
"learning_rate": 2.5876517879228106e-06, + "loss": 0.8028, + "step": 10107 + }, + { + "epoch": 0.7648594453482653, + "grad_norm": 1.4467507600784302, + "learning_rate": 2.5860716322030263e-06, + "loss": 0.7469, + "step": 10108 + }, + { + "epoch": 0.7649351140705989, + "grad_norm": 2.353743553161621, + "learning_rate": 2.5844918781441815e-06, + "loss": 0.5224, + "step": 10109 + }, + { + "epoch": 0.7650107827929326, + "grad_norm": 2.355583906173706, + "learning_rate": 2.582912525845205e-06, + "loss": 0.7163, + "step": 10110 + }, + { + "epoch": 0.7650864515152662, + "grad_norm": 1.8890095949172974, + "learning_rate": 2.5813335754050047e-06, + "loss": 0.5741, + "step": 10111 + }, + { + "epoch": 0.7651621202375998, + "grad_norm": 2.1098287105560303, + "learning_rate": 2.5797550269224544e-06, + "loss": 0.6586, + "step": 10112 + }, + { + "epoch": 0.7652377889599334, + "grad_norm": 2.207130193710327, + "learning_rate": 2.5781768804964063e-06, + "loss": 0.7282, + "step": 10113 + }, + { + "epoch": 0.765313457682267, + "grad_norm": 2.3205008506774902, + "learning_rate": 2.576599136225698e-06, + "loss": 0.6441, + "step": 10114 + }, + { + "epoch": 0.7653891264046007, + "grad_norm": 2.5144574642181396, + "learning_rate": 2.5750217942091252e-06, + "loss": 0.6996, + "step": 10115 + }, + { + "epoch": 0.7654647951269343, + "grad_norm": 2.885598659515381, + "learning_rate": 2.573444854545468e-06, + "loss": 0.5839, + "step": 10116 + }, + { + "epoch": 0.7655404638492679, + "grad_norm": 1.7929126024246216, + "learning_rate": 2.571868317333481e-06, + "loss": 0.4543, + "step": 10117 + }, + { + "epoch": 0.7656161325716015, + "grad_norm": 2.23551082611084, + "learning_rate": 2.5702921826718902e-06, + "loss": 0.7682, + "step": 10118 + }, + { + "epoch": 0.7656918012939351, + "grad_norm": 2.1573288440704346, + "learning_rate": 2.5687164506593993e-06, + "loss": 0.6006, + "step": 10119 + }, + { + "epoch": 0.7657674700162688, + "grad_norm": 3.274181842803955, + "learning_rate": 2.5671411213946864e-06, + "loss": 0.5829, + "step": 10120 + }, + { + "epoch": 0.7658431387386024, + "grad_norm": 2.04793381690979, + "learning_rate": 2.565566194976402e-06, + "loss": 0.5535, + "step": 10121 + }, + { + "epoch": 0.765918807460936, + "grad_norm": 2.4527783393859863, + "learning_rate": 2.5639916715031764e-06, + "loss": 0.6092, + "step": 10122 + }, + { + "epoch": 0.7659944761832697, + "grad_norm": 2.0146374702453613, + "learning_rate": 2.5624175510736047e-06, + "loss": 0.7187, + "step": 10123 + }, + { + "epoch": 0.7660701449056033, + "grad_norm": 2.2973668575286865, + "learning_rate": 2.5608438337862695e-06, + "loss": 0.7676, + "step": 10124 + }, + { + "epoch": 0.7661458136279369, + "grad_norm": 2.033846139907837, + "learning_rate": 2.559270519739723e-06, + "loss": 0.6969, + "step": 10125 + }, + { + "epoch": 0.7662214823502705, + "grad_norm": 2.0946013927459717, + "learning_rate": 2.5576976090324856e-06, + "loss": 0.7496, + "step": 10126 + }, + { + "epoch": 0.7662971510726041, + "grad_norm": 1.880980372428894, + "learning_rate": 2.556125101763061e-06, + "loss": 0.784, + "step": 10127 + }, + { + "epoch": 0.7663728197949378, + "grad_norm": 1.8983718156814575, + "learning_rate": 2.554552998029924e-06, + "loss": 0.6489, + "step": 10128 + }, + { + "epoch": 0.7664484885172714, + "grad_norm": 2.347182512283325, + "learning_rate": 2.552981297931526e-06, + "loss": 0.6194, + "step": 10129 + }, + { + "epoch": 0.766524157239605, + "grad_norm": 3.0601274967193604, + "learning_rate": 2.5514100015662915e-06, + "loss": 0.5625, + "step": 10130 + }, + { 
+ "epoch": 0.7665998259619387, + "grad_norm": 2.182065725326538, + "learning_rate": 2.5498391090326193e-06, + "loss": 0.7296, + "step": 10131 + }, + { + "epoch": 0.7666754946842722, + "grad_norm": 1.8279647827148438, + "learning_rate": 2.5482686204288874e-06, + "loss": 0.6273, + "step": 10132 + }, + { + "epoch": 0.7667511634066059, + "grad_norm": 2.4019768238067627, + "learning_rate": 2.5466985358534365e-06, + "loss": 0.6006, + "step": 10133 + }, + { + "epoch": 0.7668268321289395, + "grad_norm": 1.9945967197418213, + "learning_rate": 2.5451288554045986e-06, + "loss": 0.8876, + "step": 10134 + }, + { + "epoch": 0.7669025008512731, + "grad_norm": 2.012547254562378, + "learning_rate": 2.5435595791806693e-06, + "loss": 0.6557, + "step": 10135 + }, + { + "epoch": 0.7669781695736068, + "grad_norm": 2.253406047821045, + "learning_rate": 2.541990707279925e-06, + "loss": 0.599, + "step": 10136 + }, + { + "epoch": 0.7670538382959404, + "grad_norm": 1.9846205711364746, + "learning_rate": 2.5404222398006072e-06, + "loss": 0.6009, + "step": 10137 + }, + { + "epoch": 0.767129507018274, + "grad_norm": 2.01389741897583, + "learning_rate": 2.538854176840941e-06, + "loss": 0.6161, + "step": 10138 + }, + { + "epoch": 0.7672051757406076, + "grad_norm": 1.8462287187576294, + "learning_rate": 2.537286518499125e-06, + "loss": 0.7132, + "step": 10139 + }, + { + "epoch": 0.7672808444629412, + "grad_norm": 2.2837891578674316, + "learning_rate": 2.5357192648733296e-06, + "loss": 0.7349, + "step": 10140 + }, + { + "epoch": 0.7673565131852749, + "grad_norm": 2.81986403465271, + "learning_rate": 2.534152416061703e-06, + "loss": 0.6632, + "step": 10141 + }, + { + "epoch": 0.7674321819076085, + "grad_norm": 2.114987850189209, + "learning_rate": 2.5325859721623636e-06, + "loss": 0.6577, + "step": 10142 + }, + { + "epoch": 0.7675078506299421, + "grad_norm": 2.2492518424987793, + "learning_rate": 2.5310199332734123e-06, + "loss": 0.7331, + "step": 10143 + }, + { + "epoch": 0.7675835193522758, + "grad_norm": 2.2491164207458496, + "learning_rate": 2.52945429949291e-06, + "loss": 0.6299, + "step": 10144 + }, + { + "epoch": 0.7676591880746093, + "grad_norm": 4.50223445892334, + "learning_rate": 2.527889070918911e-06, + "loss": 0.6815, + "step": 10145 + }, + { + "epoch": 0.767734856796943, + "grad_norm": 1.7851481437683105, + "learning_rate": 2.526324247649435e-06, + "loss": 0.5633, + "step": 10146 + }, + { + "epoch": 0.7678105255192766, + "grad_norm": 2.163814067840576, + "learning_rate": 2.5247598297824694e-06, + "loss": 0.6235, + "step": 10147 + }, + { + "epoch": 0.7678861942416102, + "grad_norm": 2.0847737789154053, + "learning_rate": 2.523195817415987e-06, + "loss": 0.6388, + "step": 10148 + }, + { + "epoch": 0.7679618629639439, + "grad_norm": 1.7596197128295898, + "learning_rate": 2.5216322106479305e-06, + "loss": 0.6007, + "step": 10149 + }, + { + "epoch": 0.7680375316862775, + "grad_norm": 3.082669973373413, + "learning_rate": 2.52006900957622e-06, + "loss": 0.6749, + "step": 10150 + }, + { + "epoch": 0.7681132004086111, + "grad_norm": 2.0572025775909424, + "learning_rate": 2.518506214298745e-06, + "loss": 0.737, + "step": 10151 + }, + { + "epoch": 0.7681888691309448, + "grad_norm": 2.777792453765869, + "learning_rate": 2.5169438249133753e-06, + "loss": 0.7714, + "step": 10152 + }, + { + "epoch": 0.7682645378532783, + "grad_norm": 2.290621280670166, + "learning_rate": 2.515381841517952e-06, + "loss": 0.5603, + "step": 10153 + }, + { + "epoch": 0.768340206575612, + "grad_norm": 2.1354238986968994, + "learning_rate": 
2.5138202642102922e-06, + "loss": 0.762, + "step": 10154 + }, + { + "epoch": 0.7684158752979456, + "grad_norm": 1.9276994466781616, + "learning_rate": 2.512259093088186e-06, + "loss": 0.7416, + "step": 10155 + }, + { + "epoch": 0.7684915440202792, + "grad_norm": 1.9895371198654175, + "learning_rate": 2.5106983282493985e-06, + "loss": 0.564, + "step": 10156 + }, + { + "epoch": 0.7685672127426129, + "grad_norm": 3.0279078483581543, + "learning_rate": 2.5091379697916745e-06, + "loss": 0.6865, + "step": 10157 + }, + { + "epoch": 0.7686428814649464, + "grad_norm": 1.848624348640442, + "learning_rate": 2.5075780178127215e-06, + "loss": 0.6344, + "step": 10158 + }, + { + "epoch": 0.7687185501872801, + "grad_norm": 1.8406096696853638, + "learning_rate": 2.506018472410229e-06, + "loss": 0.6756, + "step": 10159 + }, + { + "epoch": 0.7687942189096137, + "grad_norm": 9.052090644836426, + "learning_rate": 2.5044593336818697e-06, + "loss": 0.5753, + "step": 10160 + }, + { + "epoch": 0.7688698876319473, + "grad_norm": 2.3040497303009033, + "learning_rate": 2.502900601725274e-06, + "loss": 0.5959, + "step": 10161 + }, + { + "epoch": 0.768945556354281, + "grad_norm": 2.6110458374023438, + "learning_rate": 2.501342276638056e-06, + "loss": 0.8063, + "step": 10162 + }, + { + "epoch": 0.7690212250766146, + "grad_norm": 2.4658761024475098, + "learning_rate": 2.4997843585178035e-06, + "loss": 0.6493, + "step": 10163 + }, + { + "epoch": 0.7690968937989482, + "grad_norm": 2.334641456604004, + "learning_rate": 2.4982268474620786e-06, + "loss": 0.7169, + "step": 10164 + }, + { + "epoch": 0.7691725625212819, + "grad_norm": 1.93704354763031, + "learning_rate": 2.4966697435684195e-06, + "loss": 0.6254, + "step": 10165 + }, + { + "epoch": 0.7692482312436154, + "grad_norm": 2.5095372200012207, + "learning_rate": 2.495113046934334e-06, + "loss": 0.7474, + "step": 10166 + }, + { + "epoch": 0.7693238999659491, + "grad_norm": 2.160459041595459, + "learning_rate": 2.4935567576573085e-06, + "loss": 0.7978, + "step": 10167 + }, + { + "epoch": 0.7693995686882827, + "grad_norm": 2.0386502742767334, + "learning_rate": 2.4920008758348072e-06, + "loss": 0.696, + "step": 10168 + }, + { + "epoch": 0.7694752374106163, + "grad_norm": 2.827305555343628, + "learning_rate": 2.4904454015642546e-06, + "loss": 0.6549, + "step": 10169 + }, + { + "epoch": 0.76955090613295, + "grad_norm": 1.8038434982299805, + "learning_rate": 2.4888903349430677e-06, + "loss": 0.658, + "step": 10170 + }, + { + "epoch": 0.7696265748552835, + "grad_norm": 3.055170774459839, + "learning_rate": 2.48733567606863e-06, + "loss": 0.657, + "step": 10171 + }, + { + "epoch": 0.7697022435776172, + "grad_norm": 3.3407459259033203, + "learning_rate": 2.485781425038294e-06, + "loss": 0.6095, + "step": 10172 + }, + { + "epoch": 0.7697779122999509, + "grad_norm": 2.0380611419677734, + "learning_rate": 2.484227581949396e-06, + "loss": 0.5941, + "step": 10173 + }, + { + "epoch": 0.7698535810222844, + "grad_norm": 2.155287504196167, + "learning_rate": 2.4826741468992407e-06, + "loss": 0.7884, + "step": 10174 + }, + { + "epoch": 0.7699292497446181, + "grad_norm": 2.009636163711548, + "learning_rate": 2.4811211199851102e-06, + "loss": 0.6486, + "step": 10175 + }, + { + "epoch": 0.7700049184669517, + "grad_norm": 2.339906930923462, + "learning_rate": 2.479568501304259e-06, + "loss": 0.4735, + "step": 10176 + }, + { + "epoch": 0.7700805871892853, + "grad_norm": 2.612977981567383, + "learning_rate": 2.4780162909539178e-06, + "loss": 0.7915, + "step": 10177 + }, + { + "epoch": 
0.770156255911619, + "grad_norm": 2.249410629272461, + "learning_rate": 2.4764644890312947e-06, + "loss": 0.6992, + "step": 10178 + }, + { + "epoch": 0.7702319246339525, + "grad_norm": 1.8891581296920776, + "learning_rate": 2.474913095633562e-06, + "loss": 0.6526, + "step": 10179 + }, + { + "epoch": 0.7703075933562862, + "grad_norm": 3.0311803817749023, + "learning_rate": 2.473362110857873e-06, + "loss": 0.7193, + "step": 10180 + }, + { + "epoch": 0.7703832620786198, + "grad_norm": 2.6356663703918457, + "learning_rate": 2.4718115348013604e-06, + "loss": 0.6858, + "step": 10181 + }, + { + "epoch": 0.7704589308009534, + "grad_norm": 2.00958514213562, + "learning_rate": 2.4702613675611284e-06, + "loss": 0.7983, + "step": 10182 + }, + { + "epoch": 0.7705345995232871, + "grad_norm": 2.9431421756744385, + "learning_rate": 2.468711609234246e-06, + "loss": 0.6924, + "step": 10183 + }, + { + "epoch": 0.7706102682456206, + "grad_norm": 1.6894335746765137, + "learning_rate": 2.467162259917767e-06, + "loss": 0.5111, + "step": 10184 + }, + { + "epoch": 0.7706859369679543, + "grad_norm": 2.004911184310913, + "learning_rate": 2.4656133197087166e-06, + "loss": 0.6717, + "step": 10185 + }, + { + "epoch": 0.770761605690288, + "grad_norm": 2.340867280960083, + "learning_rate": 2.4640647887040957e-06, + "loss": 0.8315, + "step": 10186 + }, + { + "epoch": 0.7708372744126215, + "grad_norm": 2.400015115737915, + "learning_rate": 2.4625166670008777e-06, + "loss": 0.691, + "step": 10187 + }, + { + "epoch": 0.7709129431349552, + "grad_norm": 2.0905392169952393, + "learning_rate": 2.46096895469601e-06, + "loss": 0.6846, + "step": 10188 + }, + { + "epoch": 0.7709886118572888, + "grad_norm": 2.421705961227417, + "learning_rate": 2.45942165188642e-06, + "loss": 0.622, + "step": 10189 + }, + { + "epoch": 0.7710642805796224, + "grad_norm": 2.1648590564727783, + "learning_rate": 2.457874758668995e-06, + "loss": 0.7249, + "step": 10190 + }, + { + "epoch": 0.7711399493019561, + "grad_norm": 2.4879276752471924, + "learning_rate": 2.4563282751406145e-06, + "loss": 0.7729, + "step": 10191 + }, + { + "epoch": 0.7712156180242896, + "grad_norm": 1.9964536428451538, + "learning_rate": 2.4547822013981253e-06, + "loss": 0.735, + "step": 10192 + }, + { + "epoch": 0.7712912867466233, + "grad_norm": 2.6276025772094727, + "learning_rate": 2.4532365375383423e-06, + "loss": 0.6983, + "step": 10193 + }, + { + "epoch": 0.771366955468957, + "grad_norm": 2.3542721271514893, + "learning_rate": 2.451691283658061e-06, + "loss": 0.751, + "step": 10194 + }, + { + "epoch": 0.7714426241912905, + "grad_norm": 1.8065311908721924, + "learning_rate": 2.4501464398540494e-06, + "loss": 0.67, + "step": 10195 + }, + { + "epoch": 0.7715182929136242, + "grad_norm": 1.8417807817459106, + "learning_rate": 2.4486020062230577e-06, + "loss": 0.6552, + "step": 10196 + }, + { + "epoch": 0.7715939616359577, + "grad_norm": 2.1254916191101074, + "learning_rate": 2.4470579828617955e-06, + "loss": 0.6009, + "step": 10197 + }, + { + "epoch": 0.7716696303582914, + "grad_norm": 2.3638763427734375, + "learning_rate": 2.4455143698669573e-06, + "loss": 0.6551, + "step": 10198 + }, + { + "epoch": 0.7717452990806251, + "grad_norm": 2.0712759494781494, + "learning_rate": 2.4439711673352094e-06, + "loss": 0.6991, + "step": 10199 + }, + { + "epoch": 0.7718209678029586, + "grad_norm": 1.8671433925628662, + "learning_rate": 2.4424283753631906e-06, + "loss": 0.7074, + "step": 10200 + }, + { + "epoch": 0.7718966365252923, + "grad_norm": 2.3705897331237793, + "learning_rate": 
2.4408859940475177e-06, + "loss": 0.6934, + "step": 10201 + }, + { + "epoch": 0.771972305247626, + "grad_norm": 2.1461682319641113, + "learning_rate": 2.4393440234847788e-06, + "loss": 0.6653, + "step": 10202 + }, + { + "epoch": 0.7720479739699595, + "grad_norm": 3.3979363441467285, + "learning_rate": 2.4378024637715394e-06, + "loss": 0.509, + "step": 10203 + }, + { + "epoch": 0.7721236426922932, + "grad_norm": 1.9009852409362793, + "learning_rate": 2.4362613150043307e-06, + "loss": 0.657, + "step": 10204 + }, + { + "epoch": 0.7721993114146267, + "grad_norm": 2.3193490505218506, + "learning_rate": 2.4347205772796663e-06, + "loss": 0.8041, + "step": 10205 + }, + { + "epoch": 0.7722749801369604, + "grad_norm": 2.223050355911255, + "learning_rate": 2.4331802506940397e-06, + "loss": 0.6525, + "step": 10206 + }, + { + "epoch": 0.772350648859294, + "grad_norm": 2.456544876098633, + "learning_rate": 2.4316403353439026e-06, + "loss": 0.6296, + "step": 10207 + }, + { + "epoch": 0.7724263175816276, + "grad_norm": 2.209892511367798, + "learning_rate": 2.430100831325692e-06, + "loss": 0.6408, + "step": 10208 + }, + { + "epoch": 0.7725019863039613, + "grad_norm": 2.1688029766082764, + "learning_rate": 2.428561738735817e-06, + "loss": 0.8956, + "step": 10209 + }, + { + "epoch": 0.7725776550262948, + "grad_norm": 2.3047657012939453, + "learning_rate": 2.4270230576706603e-06, + "loss": 0.6141, + "step": 10210 + }, + { + "epoch": 0.7726533237486285, + "grad_norm": 2.0687978267669678, + "learning_rate": 2.42548478822658e-06, + "loss": 0.5368, + "step": 10211 + }, + { + "epoch": 0.7727289924709622, + "grad_norm": 3.177206039428711, + "learning_rate": 2.4239469304999065e-06, + "loss": 0.7155, + "step": 10212 + }, + { + "epoch": 0.7728046611932957, + "grad_norm": 1.9328745603561401, + "learning_rate": 2.4224094845869464e-06, + "loss": 0.7287, + "step": 10213 + }, + { + "epoch": 0.7728803299156294, + "grad_norm": 2.771430492401123, + "learning_rate": 2.420872450583981e-06, + "loss": 0.6553, + "step": 10214 + }, + { + "epoch": 0.772955998637963, + "grad_norm": 2.3271167278289795, + "learning_rate": 2.419335828587259e-06, + "loss": 0.5808, + "step": 10215 + }, + { + "epoch": 0.7730316673602966, + "grad_norm": 2.1030850410461426, + "learning_rate": 2.4177996186930102e-06, + "loss": 0.6993, + "step": 10216 + }, + { + "epoch": 0.7731073360826303, + "grad_norm": 2.1259703636169434, + "learning_rate": 2.4162638209974437e-06, + "loss": 0.8063, + "step": 10217 + }, + { + "epoch": 0.7731830048049638, + "grad_norm": 1.9268971681594849, + "learning_rate": 2.414728435596728e-06, + "loss": 0.7061, + "step": 10218 + }, + { + "epoch": 0.7732586735272975, + "grad_norm": 2.1795198917388916, + "learning_rate": 2.413193462587017e-06, + "loss": 0.6964, + "step": 10219 + }, + { + "epoch": 0.7733343422496312, + "grad_norm": 1.8316737413406372, + "learning_rate": 2.4116589020644367e-06, + "loss": 0.6009, + "step": 10220 + }, + { + "epoch": 0.7734100109719647, + "grad_norm": 2.2390308380126953, + "learning_rate": 2.4101247541250833e-06, + "loss": 0.5806, + "step": 10221 + }, + { + "epoch": 0.7734856796942984, + "grad_norm": 2.265646457672119, + "learning_rate": 2.408591018865034e-06, + "loss": 0.701, + "step": 10222 + }, + { + "epoch": 0.7735613484166319, + "grad_norm": 1.9274543523788452, + "learning_rate": 2.407057696380334e-06, + "loss": 0.624, + "step": 10223 + }, + { + "epoch": 0.7736370171389656, + "grad_norm": 1.9996583461761475, + "learning_rate": 2.4055247867670044e-06, + "loss": 0.663, + "step": 10224 + }, + { + "epoch": 
0.7737126858612993, + "grad_norm": 2.1675891876220703, + "learning_rate": 2.4039922901210444e-06, + "loss": 0.6858, + "step": 10225 + }, + { + "epoch": 0.7737883545836328, + "grad_norm": 2.129465103149414, + "learning_rate": 2.4024602065384162e-06, + "loss": 0.598, + "step": 10226 + }, + { + "epoch": 0.7738640233059665, + "grad_norm": 2.162490129470825, + "learning_rate": 2.4009285361150723e-06, + "loss": 0.7343, + "step": 10227 + }, + { + "epoch": 0.7739396920283002, + "grad_norm": 2.1317222118377686, + "learning_rate": 2.39939727894693e-06, + "loss": 0.7082, + "step": 10228 + }, + { + "epoch": 0.7740153607506337, + "grad_norm": 2.5465569496154785, + "learning_rate": 2.3978664351298754e-06, + "loss": 0.616, + "step": 10229 + }, + { + "epoch": 0.7740910294729674, + "grad_norm": 2.2197041511535645, + "learning_rate": 2.396336004759779e-06, + "loss": 0.7131, + "step": 10230 + }, + { + "epoch": 0.7741666981953009, + "grad_norm": 1.9501080513000488, + "learning_rate": 2.39480598793248e-06, + "loss": 0.6171, + "step": 10231 + }, + { + "epoch": 0.7742423669176346, + "grad_norm": 1.9470248222351074, + "learning_rate": 2.393276384743795e-06, + "loss": 0.6346, + "step": 10232 + }, + { + "epoch": 0.7743180356399683, + "grad_norm": 2.145937442779541, + "learning_rate": 2.3917471952895117e-06, + "loss": 0.6556, + "step": 10233 + }, + { + "epoch": 0.7743937043623018, + "grad_norm": 1.949892520904541, + "learning_rate": 2.3902184196653922e-06, + "loss": 0.6536, + "step": 10234 + }, + { + "epoch": 0.7744693730846355, + "grad_norm": 2.531683921813965, + "learning_rate": 2.3886900579671765e-06, + "loss": 0.6373, + "step": 10235 + }, + { + "epoch": 0.774545041806969, + "grad_norm": 1.8003000020980835, + "learning_rate": 2.3871621102905676e-06, + "loss": 0.6487, + "step": 10236 + }, + { + "epoch": 0.7746207105293027, + "grad_norm": 2.1588857173919678, + "learning_rate": 2.385634576731258e-06, + "loss": 0.7224, + "step": 10237 + }, + { + "epoch": 0.7746963792516364, + "grad_norm": 1.7768288850784302, + "learning_rate": 2.3841074573849058e-06, + "loss": 0.6877, + "step": 10238 + }, + { + "epoch": 0.7747720479739699, + "grad_norm": 2.2642805576324463, + "learning_rate": 2.382580752347145e-06, + "loss": 0.621, + "step": 10239 + }, + { + "epoch": 0.7748477166963036, + "grad_norm": 1.7709468603134155, + "learning_rate": 2.381054461713579e-06, + "loss": 0.7062, + "step": 10240 + }, + { + "epoch": 0.7749233854186373, + "grad_norm": 2.4417378902435303, + "learning_rate": 2.3795285855797874e-06, + "loss": 0.6432, + "step": 10241 + }, + { + "epoch": 0.7749990541409708, + "grad_norm": 2.286708354949951, + "learning_rate": 2.3780031240413338e-06, + "loss": 0.6124, + "step": 10242 + }, + { + "epoch": 0.7750747228633045, + "grad_norm": 2.1248910427093506, + "learning_rate": 2.376478077193741e-06, + "loss": 0.6614, + "step": 10243 + }, + { + "epoch": 0.775150391585638, + "grad_norm": 3.0439376831054688, + "learning_rate": 2.3749534451325134e-06, + "loss": 0.8119, + "step": 10244 + }, + { + "epoch": 0.7752260603079717, + "grad_norm": 2.391871213912964, + "learning_rate": 2.37342922795313e-06, + "loss": 0.6768, + "step": 10245 + }, + { + "epoch": 0.7753017290303054, + "grad_norm": 2.5745506286621094, + "learning_rate": 2.3719054257510398e-06, + "loss": 0.7174, + "step": 10246 + }, + { + "epoch": 0.7753773977526389, + "grad_norm": 2.4691545963287354, + "learning_rate": 2.370382038621671e-06, + "loss": 0.7401, + "step": 10247 + }, + { + "epoch": 0.7754530664749726, + "grad_norm": 2.357771396636963, + "learning_rate": 
2.368859066660421e-06, + "loss": 0.7197, + "step": 10248 + }, + { + "epoch": 0.7755287351973061, + "grad_norm": 2.1640098094940186, + "learning_rate": 2.3673365099626673e-06, + "loss": 0.5828, + "step": 10249 + }, + { + "epoch": 0.7756044039196398, + "grad_norm": 1.9653394222259521, + "learning_rate": 2.365814368623751e-06, + "loss": 0.6857, + "step": 10250 + }, + { + "epoch": 0.7756800726419735, + "grad_norm": 2.2896316051483154, + "learning_rate": 2.364292642738996e-06, + "loss": 0.6718, + "step": 10251 + }, + { + "epoch": 0.775755741364307, + "grad_norm": 2.249856948852539, + "learning_rate": 2.3627713324036957e-06, + "loss": 0.588, + "step": 10252 + }, + { + "epoch": 0.7758314100866407, + "grad_norm": 1.9826165437698364, + "learning_rate": 2.3612504377131283e-06, + "loss": 0.6545, + "step": 10253 + }, + { + "epoch": 0.7759070788089744, + "grad_norm": 3.986067295074463, + "learning_rate": 2.359729958762527e-06, + "loss": 0.7375, + "step": 10254 + }, + { + "epoch": 0.7759827475313079, + "grad_norm": 2.1496474742889404, + "learning_rate": 2.3582098956471134e-06, + "loss": 0.7683, + "step": 10255 + }, + { + "epoch": 0.7760584162536416, + "grad_norm": 2.206681966781616, + "learning_rate": 2.3566902484620785e-06, + "loss": 0.5398, + "step": 10256 + }, + { + "epoch": 0.7761340849759751, + "grad_norm": 2.004941940307617, + "learning_rate": 2.355171017302587e-06, + "loss": 0.7467, + "step": 10257 + }, + { + "epoch": 0.7762097536983088, + "grad_norm": 2.200032949447632, + "learning_rate": 2.353652202263778e-06, + "loss": 0.6155, + "step": 10258 + }, + { + "epoch": 0.7762854224206425, + "grad_norm": 1.9743727445602417, + "learning_rate": 2.352133803440765e-06, + "loss": 0.5706, + "step": 10259 + }, + { + "epoch": 0.776361091142976, + "grad_norm": 1.8890496492385864, + "learning_rate": 2.350615820928639e-06, + "loss": 0.578, + "step": 10260 + }, + { + "epoch": 0.7764367598653097, + "grad_norm": 2.166748523712158, + "learning_rate": 2.3490982548224532e-06, + "loss": 0.7314, + "step": 10261 + }, + { + "epoch": 0.7765124285876432, + "grad_norm": 2.333298683166504, + "learning_rate": 2.3475811052172434e-06, + "loss": 0.7265, + "step": 10262 + }, + { + "epoch": 0.7765880973099769, + "grad_norm": 2.2891969680786133, + "learning_rate": 2.3460643722080277e-06, + "loss": 0.6929, + "step": 10263 + }, + { + "epoch": 0.7766637660323106, + "grad_norm": 2.030637741088867, + "learning_rate": 2.344548055889779e-06, + "loss": 0.7014, + "step": 10264 + }, + { + "epoch": 0.7767394347546441, + "grad_norm": 2.0292556285858154, + "learning_rate": 2.3430321563574577e-06, + "loss": 0.6334, + "step": 10265 + }, + { + "epoch": 0.7768151034769778, + "grad_norm": 2.329683542251587, + "learning_rate": 2.3415166737059937e-06, + "loss": 0.7243, + "step": 10266 + }, + { + "epoch": 0.7768907721993115, + "grad_norm": 2.4206478595733643, + "learning_rate": 2.340001608030292e-06, + "loss": 0.604, + "step": 10267 + }, + { + "epoch": 0.776966440921645, + "grad_norm": 2.233008861541748, + "learning_rate": 2.3384869594252304e-06, + "loss": 0.7065, + "step": 10268 + }, + { + "epoch": 0.7770421096439787, + "grad_norm": 1.844909906387329, + "learning_rate": 2.336972727985662e-06, + "loss": 0.7302, + "step": 10269 + }, + { + "epoch": 0.7771177783663122, + "grad_norm": 1.7668637037277222, + "learning_rate": 2.335458913806411e-06, + "loss": 0.6437, + "step": 10270 + }, + { + "epoch": 0.7771934470886459, + "grad_norm": 2.1801650524139404, + "learning_rate": 2.3339455169822822e-06, + "loss": 0.8086, + "step": 10271 + }, + { + "epoch": 
0.7772691158109796, + "grad_norm": 1.8977692127227783, + "learning_rate": 2.33243253760804e-06, + "loss": 0.6991, + "step": 10272 + }, + { + "epoch": 0.7773447845333131, + "grad_norm": 2.6029887199401855, + "learning_rate": 2.3309199757784408e-06, + "loss": 0.6931, + "step": 10273 + }, + { + "epoch": 0.7774204532556468, + "grad_norm": 2.1565604209899902, + "learning_rate": 2.3294078315882057e-06, + "loss": 0.66, + "step": 10274 + }, + { + "epoch": 0.7774961219779803, + "grad_norm": 2.308840751647949, + "learning_rate": 2.3278961051320257e-06, + "loss": 0.6124, + "step": 10275 + }, + { + "epoch": 0.777571790700314, + "grad_norm": 2.039461851119995, + "learning_rate": 2.3263847965045705e-06, + "loss": 0.5688, + "step": 10276 + }, + { + "epoch": 0.7776474594226477, + "grad_norm": 2.1434340476989746, + "learning_rate": 2.324873905800485e-06, + "loss": 0.656, + "step": 10277 + }, + { + "epoch": 0.7777231281449812, + "grad_norm": 1.743505597114563, + "learning_rate": 2.323363433114385e-06, + "loss": 0.6187, + "step": 10278 + }, + { + "epoch": 0.7777987968673149, + "grad_norm": 2.4365437030792236, + "learning_rate": 2.321853378540862e-06, + "loss": 0.7503, + "step": 10279 + }, + { + "epoch": 0.7778744655896486, + "grad_norm": 2.0435638427734375, + "learning_rate": 2.3203437421744804e-06, + "loss": 0.7011, + "step": 10280 + }, + { + "epoch": 0.7779501343119821, + "grad_norm": 2.0592703819274902, + "learning_rate": 2.318834524109781e-06, + "loss": 0.6205, + "step": 10281 + }, + { + "epoch": 0.7780258030343158, + "grad_norm": 2.7824881076812744, + "learning_rate": 2.3173257244412673e-06, + "loss": 0.5982, + "step": 10282 + }, + { + "epoch": 0.7781014717566493, + "grad_norm": 2.0062973499298096, + "learning_rate": 2.3158173432634347e-06, + "loss": 0.6368, + "step": 10283 + }, + { + "epoch": 0.778177140478983, + "grad_norm": 2.447920322418213, + "learning_rate": 2.314309380670739e-06, + "loss": 0.7854, + "step": 10284 + }, + { + "epoch": 0.7782528092013167, + "grad_norm": 2.387455463409424, + "learning_rate": 2.312801836757616e-06, + "loss": 0.6015, + "step": 10285 + }, + { + "epoch": 0.7783284779236502, + "grad_norm": 2.1444883346557617, + "learning_rate": 2.3112947116184693e-06, + "loss": 0.5855, + "step": 10286 + }, + { + "epoch": 0.7784041466459839, + "grad_norm": 2.148451328277588, + "learning_rate": 2.3097880053476777e-06, + "loss": 0.6432, + "step": 10287 + }, + { + "epoch": 0.7784798153683175, + "grad_norm": 2.473336696624756, + "learning_rate": 2.308281718039607e-06, + "loss": 0.6729, + "step": 10288 + }, + { + "epoch": 0.7785554840906511, + "grad_norm": 2.9266908168792725, + "learning_rate": 2.306775849788575e-06, + "loss": 0.5335, + "step": 10289 + }, + { + "epoch": 0.7786311528129848, + "grad_norm": 2.390139102935791, + "learning_rate": 2.3052704006888876e-06, + "loss": 0.6986, + "step": 10290 + }, + { + "epoch": 0.7787068215353183, + "grad_norm": 2.233603000640869, + "learning_rate": 2.3037653708348215e-06, + "loss": 0.6058, + "step": 10291 + }, + { + "epoch": 0.778782490257652, + "grad_norm": 2.33750057220459, + "learning_rate": 2.302260760320629e-06, + "loss": 0.769, + "step": 10292 + }, + { + "epoch": 0.7788581589799857, + "grad_norm": 2.0690042972564697, + "learning_rate": 2.3007565692405256e-06, + "loss": 0.6749, + "step": 10293 + }, + { + "epoch": 0.7789338277023192, + "grad_norm": 2.1784255504608154, + "learning_rate": 2.2992527976887156e-06, + "loss": 0.5672, + "step": 10294 + }, + { + "epoch": 0.7790094964246529, + "grad_norm": 1.8492722511291504, + "learning_rate": 
2.2977494457593715e-06, + "loss": 0.7427, + "step": 10295 + }, + { + "epoch": 0.7790851651469864, + "grad_norm": 2.1456425189971924, + "learning_rate": 2.2962465135466325e-06, + "loss": 0.6621, + "step": 10296 + }, + { + "epoch": 0.7791608338693201, + "grad_norm": 2.1436009407043457, + "learning_rate": 2.294744001144619e-06, + "loss": 0.7521, + "step": 10297 + }, + { + "epoch": 0.7792365025916538, + "grad_norm": 2.698065996170044, + "learning_rate": 2.2932419086474206e-06, + "loss": 0.7116, + "step": 10298 + }, + { + "epoch": 0.7793121713139873, + "grad_norm": 2.1838340759277344, + "learning_rate": 2.291740236149112e-06, + "loss": 0.6111, + "step": 10299 + }, + { + "epoch": 0.779387840036321, + "grad_norm": 2.1678380966186523, + "learning_rate": 2.290238983743724e-06, + "loss": 0.5987, + "step": 10300 + }, + { + "epoch": 0.7794635087586546, + "grad_norm": 2.3915209770202637, + "learning_rate": 2.288738151525273e-06, + "loss": 0.5449, + "step": 10301 + }, + { + "epoch": 0.7795391774809882, + "grad_norm": 2.2159979343414307, + "learning_rate": 2.2872377395877457e-06, + "loss": 0.6592, + "step": 10302 + }, + { + "epoch": 0.7796148462033219, + "grad_norm": 2.2043135166168213, + "learning_rate": 2.285737748025103e-06, + "loss": 0.671, + "step": 10303 + }, + { + "epoch": 0.7796905149256554, + "grad_norm": 2.210493564605713, + "learning_rate": 2.2842381769312798e-06, + "loss": 0.589, + "step": 10304 + }, + { + "epoch": 0.7797661836479891, + "grad_norm": 2.3865721225738525, + "learning_rate": 2.282739026400182e-06, + "loss": 0.6478, + "step": 10305 + }, + { + "epoch": 0.7798418523703228, + "grad_norm": 1.9948359727859497, + "learning_rate": 2.2812402965256957e-06, + "loss": 0.7697, + "step": 10306 + }, + { + "epoch": 0.7799175210926563, + "grad_norm": 3.1462361812591553, + "learning_rate": 2.27974198740167e-06, + "loss": 0.6498, + "step": 10307 + }, + { + "epoch": 0.77999318981499, + "grad_norm": 1.8765848875045776, + "learning_rate": 2.278244099121936e-06, + "loss": 0.6286, + "step": 10308 + }, + { + "epoch": 0.7800688585373236, + "grad_norm": 4.639185905456543, + "learning_rate": 2.276746631780301e-06, + "loss": 0.5139, + "step": 10309 + }, + { + "epoch": 0.7801445272596572, + "grad_norm": 2.111081838607788, + "learning_rate": 2.2752495854705357e-06, + "loss": 0.6906, + "step": 10310 + }, + { + "epoch": 0.7802201959819909, + "grad_norm": 6.2167744636535645, + "learning_rate": 2.2737529602863918e-06, + "loss": 0.8498, + "step": 10311 + }, + { + "epoch": 0.7802958647043244, + "grad_norm": 2.314579486846924, + "learning_rate": 2.2722567563215922e-06, + "loss": 0.745, + "step": 10312 + }, + { + "epoch": 0.7803715334266581, + "grad_norm": 1.908983826637268, + "learning_rate": 2.270760973669836e-06, + "loss": 0.6662, + "step": 10313 + }, + { + "epoch": 0.7804472021489917, + "grad_norm": 1.937185287475586, + "learning_rate": 2.269265612424791e-06, + "loss": 0.5662, + "step": 10314 + }, + { + "epoch": 0.7805228708713253, + "grad_norm": 2.0278525352478027, + "learning_rate": 2.2677706726801044e-06, + "loss": 0.8562, + "step": 10315 + }, + { + "epoch": 0.780598539593659, + "grad_norm": 2.70788311958313, + "learning_rate": 2.266276154529393e-06, + "loss": 0.8197, + "step": 10316 + }, + { + "epoch": 0.7806742083159925, + "grad_norm": 1.852952480316162, + "learning_rate": 2.2647820580662505e-06, + "loss": 0.7382, + "step": 10317 + }, + { + "epoch": 0.7807498770383262, + "grad_norm": 2.082524299621582, + "learning_rate": 2.263288383384234e-06, + "loss": 0.6123, + "step": 10318 + }, + { + "epoch": 
0.7808255457606599, + "grad_norm": 2.4691317081451416, + "learning_rate": 2.2617951305768917e-06, + "loss": 0.7913, + "step": 10319 + }, + { + "epoch": 0.7809012144829934, + "grad_norm": 2.056469678878784, + "learning_rate": 2.2603022997377337e-06, + "loss": 0.7534, + "step": 10320 + }, + { + "epoch": 0.7809768832053271, + "grad_norm": 2.364788293838501, + "learning_rate": 2.2588098909602435e-06, + "loss": 0.6309, + "step": 10321 + }, + { + "epoch": 0.7810525519276607, + "grad_norm": 2.2668612003326416, + "learning_rate": 2.2573179043378803e-06, + "loss": 0.7426, + "step": 10322 + }, + { + "epoch": 0.7811282206499943, + "grad_norm": 2.3786261081695557, + "learning_rate": 2.255826339964079e-06, + "loss": 0.6801, + "step": 10323 + }, + { + "epoch": 0.781203889372328, + "grad_norm": 2.557690382003784, + "learning_rate": 2.254335197932246e-06, + "loss": 0.5807, + "step": 10324 + }, + { + "epoch": 0.7812795580946615, + "grad_norm": 2.1861777305603027, + "learning_rate": 2.25284447833576e-06, + "loss": 0.6966, + "step": 10325 + }, + { + "epoch": 0.7813552268169952, + "grad_norm": 2.943876028060913, + "learning_rate": 2.251354181267977e-06, + "loss": 0.6657, + "step": 10326 + }, + { + "epoch": 0.7814308955393289, + "grad_norm": 1.9715664386749268, + "learning_rate": 2.249864306822222e-06, + "loss": 0.7396, + "step": 10327 + }, + { + "epoch": 0.7815065642616624, + "grad_norm": 1.953696608543396, + "learning_rate": 2.248374855091797e-06, + "loss": 0.52, + "step": 10328 + }, + { + "epoch": 0.7815822329839961, + "grad_norm": 2.13394832611084, + "learning_rate": 2.246885826169975e-06, + "loss": 0.6673, + "step": 10329 + }, + { + "epoch": 0.7816579017063296, + "grad_norm": 1.9963359832763672, + "learning_rate": 2.2453972201500055e-06, + "loss": 0.6992, + "step": 10330 + }, + { + "epoch": 0.7817335704286633, + "grad_norm": 2.229747772216797, + "learning_rate": 2.243909037125112e-06, + "loss": 0.8009, + "step": 10331 + }, + { + "epoch": 0.781809239150997, + "grad_norm": 1.8210551738739014, + "learning_rate": 2.2424212771884842e-06, + "loss": 0.7221, + "step": 10332 + }, + { + "epoch": 0.7818849078733305, + "grad_norm": 2.274820566177368, + "learning_rate": 2.2409339404332924e-06, + "loss": 0.8382, + "step": 10333 + }, + { + "epoch": 0.7819605765956642, + "grad_norm": 2.1577939987182617, + "learning_rate": 2.2394470269526785e-06, + "loss": 0.7821, + "step": 10334 + }, + { + "epoch": 0.7820362453179978, + "grad_norm": 1.9666999578475952, + "learning_rate": 2.2379605368397578e-06, + "loss": 0.7119, + "step": 10335 + }, + { + "epoch": 0.7821119140403314, + "grad_norm": 2.317742109298706, + "learning_rate": 2.2364744701876195e-06, + "loss": 0.5406, + "step": 10336 + }, + { + "epoch": 0.7821875827626651, + "grad_norm": 2.2449235916137695, + "learning_rate": 2.234988827089326e-06, + "loss": 0.6204, + "step": 10337 + }, + { + "epoch": 0.7822632514849986, + "grad_norm": 2.313387632369995, + "learning_rate": 2.2335036076379153e-06, + "loss": 0.7333, + "step": 10338 + }, + { + "epoch": 0.7823389202073323, + "grad_norm": 2.1615755558013916, + "learning_rate": 2.2320188119263895e-06, + "loss": 0.7058, + "step": 10339 + }, + { + "epoch": 0.782414588929666, + "grad_norm": 2.0965301990509033, + "learning_rate": 2.230534440047738e-06, + "loss": 0.5865, + "step": 10340 + }, + { + "epoch": 0.7824902576519995, + "grad_norm": 2.1409988403320312, + "learning_rate": 2.2290504920949155e-06, + "loss": 0.643, + "step": 10341 + }, + { + "epoch": 0.7825659263743332, + "grad_norm": 2.249452590942383, + "learning_rate": 
2.2275669681608534e-06, + "loss": 0.6476, + "step": 10342 + }, + { + "epoch": 0.7826415950966668, + "grad_norm": 2.122205972671509, + "learning_rate": 2.22608386833845e-06, + "loss": 0.7286, + "step": 10343 + }, + { + "epoch": 0.7827172638190004, + "grad_norm": 2.537824869155884, + "learning_rate": 2.224601192720581e-06, + "loss": 0.7216, + "step": 10344 + }, + { + "epoch": 0.7827929325413341, + "grad_norm": 2.4190261363983154, + "learning_rate": 2.2231189414001053e-06, + "loss": 0.544, + "step": 10345 + }, + { + "epoch": 0.7828686012636676, + "grad_norm": 2.434654712677002, + "learning_rate": 2.221637114469837e-06, + "loss": 0.7063, + "step": 10346 + }, + { + "epoch": 0.7829442699860013, + "grad_norm": 2.0389528274536133, + "learning_rate": 2.2201557120225783e-06, + "loss": 0.5486, + "step": 10347 + }, + { + "epoch": 0.7830199387083349, + "grad_norm": 2.8061161041259766, + "learning_rate": 2.2186747341510968e-06, + "loss": 0.6029, + "step": 10348 + }, + { + "epoch": 0.7830956074306685, + "grad_norm": 1.9045759439468384, + "learning_rate": 2.2171941809481367e-06, + "loss": 0.7342, + "step": 10349 + }, + { + "epoch": 0.7831712761530022, + "grad_norm": 2.0165064334869385, + "learning_rate": 2.2157140525064155e-06, + "loss": 0.7228, + "step": 10350 + }, + { + "epoch": 0.7832469448753357, + "grad_norm": 2.1648142337799072, + "learning_rate": 2.214234348918623e-06, + "loss": 0.7189, + "step": 10351 + }, + { + "epoch": 0.7833226135976694, + "grad_norm": 1.7740191221237183, + "learning_rate": 2.2127550702774267e-06, + "loss": 0.6009, + "step": 10352 + }, + { + "epoch": 0.7833982823200031, + "grad_norm": 2.149911642074585, + "learning_rate": 2.2112762166754567e-06, + "loss": 0.7393, + "step": 10353 + }, + { + "epoch": 0.7834739510423366, + "grad_norm": 2.2649195194244385, + "learning_rate": 2.209797788205326e-06, + "loss": 0.657, + "step": 10354 + }, + { + "epoch": 0.7835496197646703, + "grad_norm": 2.4505796432495117, + "learning_rate": 2.208319784959622e-06, + "loss": 0.569, + "step": 10355 + }, + { + "epoch": 0.7836252884870039, + "grad_norm": 2.0467987060546875, + "learning_rate": 2.2068422070309032e-06, + "loss": 0.8597, + "step": 10356 + }, + { + "epoch": 0.7837009572093375, + "grad_norm": 2.103842258453369, + "learning_rate": 2.2053650545116936e-06, + "loss": 0.6934, + "step": 10357 + }, + { + "epoch": 0.7837766259316712, + "grad_norm": 1.9594662189483643, + "learning_rate": 2.2038883274945015e-06, + "loss": 0.6439, + "step": 10358 + }, + { + "epoch": 0.7838522946540047, + "grad_norm": 2.5224552154541016, + "learning_rate": 2.2024120260718035e-06, + "loss": 0.6937, + "step": 10359 + }, + { + "epoch": 0.7839279633763384, + "grad_norm": 1.9823088645935059, + "learning_rate": 2.2009361503360506e-06, + "loss": 0.5863, + "step": 10360 + }, + { + "epoch": 0.784003632098672, + "grad_norm": 2.8926243782043457, + "learning_rate": 2.199460700379666e-06, + "loss": 0.6975, + "step": 10361 + }, + { + "epoch": 0.7840793008210056, + "grad_norm": 2.1695239543914795, + "learning_rate": 2.1979856762950488e-06, + "loss": 0.5354, + "step": 10362 + }, + { + "epoch": 0.7841549695433393, + "grad_norm": 4.557194709777832, + "learning_rate": 2.196511078174571e-06, + "loss": 0.7417, + "step": 10363 + }, + { + "epoch": 0.7842306382656729, + "grad_norm": 2.158311605453491, + "learning_rate": 2.19503690611057e-06, + "loss": 0.6356, + "step": 10364 + }, + { + "epoch": 0.7843063069880065, + "grad_norm": 2.076812505722046, + "learning_rate": 2.1935631601953705e-06, + "loss": 0.6391, + "step": 10365 + }, + { + "epoch": 
0.7843819757103402, + "grad_norm": 2.2256968021392822, + "learning_rate": 2.192089840521263e-06, + "loss": 0.5693, + "step": 10366 + }, + { + "epoch": 0.7844576444326737, + "grad_norm": 3.5892693996429443, + "learning_rate": 2.1906169471805065e-06, + "loss": 0.6821, + "step": 10367 + }, + { + "epoch": 0.7845333131550074, + "grad_norm": 2.188708543777466, + "learning_rate": 2.1891444802653406e-06, + "loss": 0.6475, + "step": 10368 + }, + { + "epoch": 0.784608981877341, + "grad_norm": 2.307600975036621, + "learning_rate": 2.187672439867977e-06, + "loss": 0.7261, + "step": 10369 + }, + { + "epoch": 0.7846846505996746, + "grad_norm": 1.9995859861373901, + "learning_rate": 2.1862008260805987e-06, + "loss": 0.6297, + "step": 10370 + }, + { + "epoch": 0.7847603193220083, + "grad_norm": 2.0694448947906494, + "learning_rate": 2.184729638995363e-06, + "loss": 0.6715, + "step": 10371 + }, + { + "epoch": 0.7848359880443418, + "grad_norm": 2.0987765789031982, + "learning_rate": 2.1832588787044003e-06, + "loss": 0.6215, + "step": 10372 + }, + { + "epoch": 0.7849116567666755, + "grad_norm": 2.225297689437866, + "learning_rate": 2.1817885452998156e-06, + "loss": 0.5915, + "step": 10373 + }, + { + "epoch": 0.7849873254890091, + "grad_norm": 3.953972101211548, + "learning_rate": 2.1803186388736867e-06, + "loss": 0.789, + "step": 10374 + }, + { + "epoch": 0.7850629942113427, + "grad_norm": 2.2498698234558105, + "learning_rate": 2.1788491595180567e-06, + "loss": 0.6853, + "step": 10375 + }, + { + "epoch": 0.7851386629336764, + "grad_norm": 2.0867936611175537, + "learning_rate": 2.177380107324958e-06, + "loss": 0.7266, + "step": 10376 + }, + { + "epoch": 0.78521433165601, + "grad_norm": 2.7368338108062744, + "learning_rate": 2.175911482386386e-06, + "loss": 0.6268, + "step": 10377 + }, + { + "epoch": 0.7852900003783436, + "grad_norm": 2.203387975692749, + "learning_rate": 2.174443284794307e-06, + "loss": 0.7792, + "step": 10378 + }, + { + "epoch": 0.7853656691006773, + "grad_norm": 1.9313303232192993, + "learning_rate": 2.1729755146406653e-06, + "loss": 0.7007, + "step": 10379 + }, + { + "epoch": 0.7854413378230108, + "grad_norm": 2.8301777839660645, + "learning_rate": 2.171508172017378e-06, + "loss": 0.7817, + "step": 10380 + }, + { + "epoch": 0.7855170065453445, + "grad_norm": 2.140004873275757, + "learning_rate": 2.170041257016336e-06, + "loss": 0.7564, + "step": 10381 + }, + { + "epoch": 0.7855926752676781, + "grad_norm": 9.439233779907227, + "learning_rate": 2.1685747697294005e-06, + "loss": 0.6332, + "step": 10382 + }, + { + "epoch": 0.7856683439900117, + "grad_norm": 3.0512073040008545, + "learning_rate": 2.167108710248408e-06, + "loss": 0.6822, + "step": 10383 + }, + { + "epoch": 0.7857440127123454, + "grad_norm": 2.452768564224243, + "learning_rate": 2.165643078665172e-06, + "loss": 0.6007, + "step": 10384 + }, + { + "epoch": 0.785819681434679, + "grad_norm": 2.016571044921875, + "learning_rate": 2.1641778750714653e-06, + "loss": 0.6412, + "step": 10385 + }, + { + "epoch": 0.7858953501570126, + "grad_norm": 2.0821337699890137, + "learning_rate": 2.162713099559053e-06, + "loss": 0.6281, + "step": 10386 + }, + { + "epoch": 0.7859710188793462, + "grad_norm": 1.9462223052978516, + "learning_rate": 2.16124875221966e-06, + "loss": 0.6654, + "step": 10387 + }, + { + "epoch": 0.7860466876016798, + "grad_norm": 1.9950424432754517, + "learning_rate": 2.1597848331449925e-06, + "loss": 0.6193, + "step": 10388 + }, + { + "epoch": 0.7861223563240135, + "grad_norm": 2.3755111694335938, + "learning_rate": 
2.1583213424267207e-06, + "loss": 0.6631, + "step": 10389 + }, + { + "epoch": 0.7861980250463471, + "grad_norm": 2.3503427505493164, + "learning_rate": 2.1568582801564918e-06, + "loss": 0.7469, + "step": 10390 + }, + { + "epoch": 0.7862736937686807, + "grad_norm": 2.1532740592956543, + "learning_rate": 2.1553956464259367e-06, + "loss": 0.675, + "step": 10391 + }, + { + "epoch": 0.7863493624910144, + "grad_norm": 3.5984179973602295, + "learning_rate": 2.153933441326641e-06, + "loss": 0.5442, + "step": 10392 + }, + { + "epoch": 0.786425031213348, + "grad_norm": 1.983961820602417, + "learning_rate": 2.1524716649501764e-06, + "loss": 0.6268, + "step": 10393 + }, + { + "epoch": 0.7865006999356816, + "grad_norm": 2.3665051460266113, + "learning_rate": 2.151010317388083e-06, + "loss": 0.6089, + "step": 10394 + }, + { + "epoch": 0.7865763686580152, + "grad_norm": 3.3090527057647705, + "learning_rate": 2.1495493987318773e-06, + "loss": 0.7426, + "step": 10395 + }, + { + "epoch": 0.7866520373803488, + "grad_norm": 2.405433416366577, + "learning_rate": 2.148088909073044e-06, + "loss": 0.7821, + "step": 10396 + }, + { + "epoch": 0.7867277061026825, + "grad_norm": 2.0059757232666016, + "learning_rate": 2.1466288485030456e-06, + "loss": 0.6236, + "step": 10397 + }, + { + "epoch": 0.786803374825016, + "grad_norm": 2.419422149658203, + "learning_rate": 2.145169217113317e-06, + "loss": 0.7572, + "step": 10398 + }, + { + "epoch": 0.7868790435473497, + "grad_norm": 3.1536026000976562, + "learning_rate": 2.143710014995261e-06, + "loss": 0.6652, + "step": 10399 + }, + { + "epoch": 0.7869547122696833, + "grad_norm": 2.279188394546509, + "learning_rate": 2.142251242240258e-06, + "loss": 0.6332, + "step": 10400 + }, + { + "epoch": 0.7870303809920169, + "grad_norm": 2.087526321411133, + "learning_rate": 2.1407928989396655e-06, + "loss": 0.5919, + "step": 10401 + }, + { + "epoch": 0.7871060497143506, + "grad_norm": 3.789907693862915, + "learning_rate": 2.1393349851848084e-06, + "loss": 0.6605, + "step": 10402 + }, + { + "epoch": 0.7871817184366842, + "grad_norm": 2.147244930267334, + "learning_rate": 2.1378775010669824e-06, + "loss": 0.7815, + "step": 10403 + }, + { + "epoch": 0.7872573871590178, + "grad_norm": 3.6964704990386963, + "learning_rate": 2.1364204466774623e-06, + "loss": 0.4579, + "step": 10404 + }, + { + "epoch": 0.7873330558813515, + "grad_norm": 2.3750951290130615, + "learning_rate": 2.134963822107494e-06, + "loss": 0.6549, + "step": 10405 + }, + { + "epoch": 0.787408724603685, + "grad_norm": 2.594712972640991, + "learning_rate": 2.1335076274482954e-06, + "loss": 0.6412, + "step": 10406 + }, + { + "epoch": 0.7874843933260187, + "grad_norm": 2.0209412574768066, + "learning_rate": 2.132051862791057e-06, + "loss": 0.7044, + "step": 10407 + }, + { + "epoch": 0.7875600620483523, + "grad_norm": 2.0428459644317627, + "learning_rate": 2.130596528226945e-06, + "loss": 0.6416, + "step": 10408 + }, + { + "epoch": 0.7876357307706859, + "grad_norm": 2.461974859237671, + "learning_rate": 2.1291416238470994e-06, + "loss": 0.7138, + "step": 10409 + }, + { + "epoch": 0.7877113994930196, + "grad_norm": 1.9830410480499268, + "learning_rate": 2.127687149742626e-06, + "loss": 0.6454, + "step": 10410 + }, + { + "epoch": 0.7877870682153532, + "grad_norm": 2.3147575855255127, + "learning_rate": 2.126233106004608e-06, + "loss": 0.7328, + "step": 10411 + }, + { + "epoch": 0.7878627369376868, + "grad_norm": 2.058706283569336, + "learning_rate": 2.124779492724111e-06, + "loss": 0.6221, + "step": 10412 + }, + { + "epoch": 
0.7879384056600204, + "grad_norm": 2.767449140548706, + "learning_rate": 2.1233263099921565e-06, + "loss": 0.6106, + "step": 10413 + }, + { + "epoch": 0.788014074382354, + "grad_norm": 2.3518853187561035, + "learning_rate": 2.12187355789975e-06, + "loss": 0.7009, + "step": 10414 + }, + { + "epoch": 0.7880897431046877, + "grad_norm": 1.9216002225875854, + "learning_rate": 2.1204212365378685e-06, + "loss": 0.7082, + "step": 10415 + }, + { + "epoch": 0.7881654118270213, + "grad_norm": 2.5170297622680664, + "learning_rate": 2.1189693459974597e-06, + "loss": 0.6601, + "step": 10416 + }, + { + "epoch": 0.7882410805493549, + "grad_norm": 3.180408000946045, + "learning_rate": 2.117517886369447e-06, + "loss": 0.8731, + "step": 10417 + }, + { + "epoch": 0.7883167492716886, + "grad_norm": 1.9902843236923218, + "learning_rate": 2.116066857744725e-06, + "loss": 0.6012, + "step": 10418 + }, + { + "epoch": 0.7883924179940222, + "grad_norm": 2.8387811183929443, + "learning_rate": 2.1146162602141614e-06, + "loss": 0.6855, + "step": 10419 + }, + { + "epoch": 0.7884680867163558, + "grad_norm": 2.100433349609375, + "learning_rate": 2.1131660938685998e-06, + "loss": 0.6094, + "step": 10420 + }, + { + "epoch": 0.7885437554386894, + "grad_norm": 2.3717706203460693, + "learning_rate": 2.1117163587988477e-06, + "loss": 0.6863, + "step": 10421 + }, + { + "epoch": 0.788619424161023, + "grad_norm": 1.9035117626190186, + "learning_rate": 2.1102670550956986e-06, + "loss": 0.645, + "step": 10422 + }, + { + "epoch": 0.7886950928833567, + "grad_norm": 2.151751756668091, + "learning_rate": 2.108818182849914e-06, + "loss": 0.7161, + "step": 10423 + }, + { + "epoch": 0.7887707616056903, + "grad_norm": 2.054111957550049, + "learning_rate": 2.10736974215222e-06, + "loss": 0.6702, + "step": 10424 + }, + { + "epoch": 0.7888464303280239, + "grad_norm": 2.213700532913208, + "learning_rate": 2.1059217330933273e-06, + "loss": 0.5848, + "step": 10425 + }, + { + "epoch": 0.7889220990503575, + "grad_norm": 1.8522429466247559, + "learning_rate": 2.104474155763913e-06, + "loss": 0.6449, + "step": 10426 + }, + { + "epoch": 0.7889977677726911, + "grad_norm": 2.314577341079712, + "learning_rate": 2.1030270102546303e-06, + "loss": 0.7704, + "step": 10427 + }, + { + "epoch": 0.7890734364950248, + "grad_norm": 2.8026366233825684, + "learning_rate": 2.1015802966561037e-06, + "loss": 0.5235, + "step": 10428 + }, + { + "epoch": 0.7891491052173584, + "grad_norm": 2.0576069355010986, + "learning_rate": 2.100134015058931e-06, + "loss": 0.6654, + "step": 10429 + }, + { + "epoch": 0.789224773939692, + "grad_norm": 2.624588966369629, + "learning_rate": 2.098688165553683e-06, + "loss": 0.8377, + "step": 10430 + }, + { + "epoch": 0.7893004426620257, + "grad_norm": 1.8975677490234375, + "learning_rate": 2.0972427482309034e-06, + "loss": 0.8135, + "step": 10431 + }, + { + "epoch": 0.7893761113843593, + "grad_norm": 2.2255985736846924, + "learning_rate": 2.09579776318111e-06, + "loss": 0.7366, + "step": 10432 + }, + { + "epoch": 0.7894517801066929, + "grad_norm": 2.165253162384033, + "learning_rate": 2.0943532104947906e-06, + "loss": 0.6496, + "step": 10433 + }, + { + "epoch": 0.7895274488290265, + "grad_norm": 2.220125436782837, + "learning_rate": 2.0929090902624117e-06, + "loss": 0.697, + "step": 10434 + }, + { + "epoch": 0.7896031175513601, + "grad_norm": 1.6284946203231812, + "learning_rate": 2.0914654025744034e-06, + "loss": 0.589, + "step": 10435 + }, + { + "epoch": 0.7896787862736938, + "grad_norm": 1.925709843635559, + "learning_rate": 
2.090022147521174e-06, + "loss": 0.6547, + "step": 10436 + }, + { + "epoch": 0.7897544549960274, + "grad_norm": 2.4849398136138916, + "learning_rate": 2.088579325193112e-06, + "loss": 0.5531, + "step": 10437 + }, + { + "epoch": 0.789830123718361, + "grad_norm": 1.8222843408584595, + "learning_rate": 2.0871369356805653e-06, + "loss": 0.6774, + "step": 10438 + }, + { + "epoch": 0.7899057924406946, + "grad_norm": 2.412240505218506, + "learning_rate": 2.085694979073861e-06, + "loss": 0.7183, + "step": 10439 + }, + { + "epoch": 0.7899814611630283, + "grad_norm": 2.244049549102783, + "learning_rate": 2.084253455463302e-06, + "loss": 0.6786, + "step": 10440 + }, + { + "epoch": 0.7900571298853619, + "grad_norm": 3.5137126445770264, + "learning_rate": 2.0828123649391594e-06, + "loss": 0.6426, + "step": 10441 + }, + { + "epoch": 0.7901327986076955, + "grad_norm": 2.3122079372406006, + "learning_rate": 2.0813717075916797e-06, + "loss": 0.6122, + "step": 10442 + }, + { + "epoch": 0.7902084673300291, + "grad_norm": 2.5689215660095215, + "learning_rate": 2.0799314835110808e-06, + "loss": 0.7128, + "step": 10443 + }, + { + "epoch": 0.7902841360523628, + "grad_norm": 1.6637498140335083, + "learning_rate": 2.0784916927875547e-06, + "loss": 0.6082, + "step": 10444 + }, + { + "epoch": 0.7903598047746964, + "grad_norm": 2.4732978343963623, + "learning_rate": 2.0770523355112686e-06, + "loss": 0.7996, + "step": 10445 + }, + { + "epoch": 0.79043547349703, + "grad_norm": 1.848886251449585, + "learning_rate": 2.075613411772353e-06, + "loss": 0.6386, + "step": 10446 + }, + { + "epoch": 0.7905111422193636, + "grad_norm": 2.2179338932037354, + "learning_rate": 2.074174921660921e-06, + "loss": 0.7844, + "step": 10447 + }, + { + "epoch": 0.7905868109416972, + "grad_norm": 2.1151363849639893, + "learning_rate": 2.0727368652670605e-06, + "loss": 0.6207, + "step": 10448 + }, + { + "epoch": 0.7906624796640309, + "grad_norm": 1.6570191383361816, + "learning_rate": 2.07129924268082e-06, + "loss": 0.5132, + "step": 10449 + }, + { + "epoch": 0.7907381483863645, + "grad_norm": 2.8186240196228027, + "learning_rate": 2.069862053992231e-06, + "loss": 0.7471, + "step": 10450 + }, + { + "epoch": 0.7908138171086981, + "grad_norm": 2.399502992630005, + "learning_rate": 2.0684252992912963e-06, + "loss": 0.7079, + "step": 10451 + }, + { + "epoch": 0.7908894858310317, + "grad_norm": 2.391279935836792, + "learning_rate": 2.0669889786679883e-06, + "loss": 0.7076, + "step": 10452 + }, + { + "epoch": 0.7909651545533654, + "grad_norm": 2.1151928901672363, + "learning_rate": 2.065553092212254e-06, + "loss": 0.6934, + "step": 10453 + }, + { + "epoch": 0.791040823275699, + "grad_norm": 1.9356576204299927, + "learning_rate": 2.0641176400140136e-06, + "loss": 0.5707, + "step": 10454 + }, + { + "epoch": 0.7911164919980326, + "grad_norm": 2.005326986312866, + "learning_rate": 2.0626826221631627e-06, + "loss": 0.7772, + "step": 10455 + }, + { + "epoch": 0.7911921607203662, + "grad_norm": 2.1144111156463623, + "learning_rate": 2.0612480387495613e-06, + "loss": 0.6407, + "step": 10456 + }, + { + "epoch": 0.7912678294426999, + "grad_norm": 1.8899532556533813, + "learning_rate": 2.0598138898630487e-06, + "loss": 0.7053, + "step": 10457 + }, + { + "epoch": 0.7913434981650335, + "grad_norm": 2.0663957595825195, + "learning_rate": 2.0583801755934396e-06, + "loss": 0.5835, + "step": 10458 + }, + { + "epoch": 0.7914191668873671, + "grad_norm": 2.444054126739502, + "learning_rate": 2.0569468960305178e-06, + "loss": 0.6798, + "step": 10459 + }, + { + "epoch": 
0.7914948356097007, + "grad_norm": 2.1964471340179443, + "learning_rate": 2.055514051264036e-06, + "loss": 0.7087, + "step": 10460 + }, + { + "epoch": 0.7915705043320344, + "grad_norm": 2.126030683517456, + "learning_rate": 2.0540816413837256e-06, + "loss": 0.6266, + "step": 10461 + }, + { + "epoch": 0.791646173054368, + "grad_norm": 2.224245071411133, + "learning_rate": 2.052649666479289e-06, + "loss": 0.6503, + "step": 10462 + }, + { + "epoch": 0.7917218417767016, + "grad_norm": 2.307697057723999, + "learning_rate": 2.0512181266404004e-06, + "loss": 0.6489, + "step": 10463 + }, + { + "epoch": 0.7917975104990352, + "grad_norm": 2.7697362899780273, + "learning_rate": 2.0497870219567073e-06, + "loss": 0.6928, + "step": 10464 + }, + { + "epoch": 0.7918731792213688, + "grad_norm": 2.193354368209839, + "learning_rate": 2.048356352517831e-06, + "loss": 0.7207, + "step": 10465 + }, + { + "epoch": 0.7919488479437025, + "grad_norm": 2.0516974925994873, + "learning_rate": 2.0469261184133664e-06, + "loss": 0.6624, + "step": 10466 + }, + { + "epoch": 0.7920245166660361, + "grad_norm": 2.0919811725616455, + "learning_rate": 2.0454963197328724e-06, + "loss": 0.6431, + "step": 10467 + }, + { + "epoch": 0.7921001853883697, + "grad_norm": 2.1068317890167236, + "learning_rate": 2.044066956565895e-06, + "loss": 0.6878, + "step": 10468 + }, + { + "epoch": 0.7921758541107033, + "grad_norm": 2.4329023361206055, + "learning_rate": 2.0426380290019456e-06, + "loss": 0.778, + "step": 10469 + }, + { + "epoch": 0.792251522833037, + "grad_norm": 2.205319881439209, + "learning_rate": 2.0412095371305034e-06, + "loss": 0.7374, + "step": 10470 + }, + { + "epoch": 0.7923271915553706, + "grad_norm": 1.9677940607070923, + "learning_rate": 2.0397814810410265e-06, + "loss": 0.6473, + "step": 10471 + }, + { + "epoch": 0.7924028602777042, + "grad_norm": 2.6336700916290283, + "learning_rate": 2.038353860822944e-06, + "loss": 0.6716, + "step": 10472 + }, + { + "epoch": 0.7924785290000378, + "grad_norm": 1.8802638053894043, + "learning_rate": 2.0369266765656644e-06, + "loss": 0.7585, + "step": 10473 + }, + { + "epoch": 0.7925541977223715, + "grad_norm": 4.260136127471924, + "learning_rate": 2.035499928358554e-06, + "loss": 0.6322, + "step": 10474 + }, + { + "epoch": 0.7926298664447051, + "grad_norm": 1.8927791118621826, + "learning_rate": 2.034073616290965e-06, + "loss": 0.5409, + "step": 10475 + }, + { + "epoch": 0.7927055351670387, + "grad_norm": 2.0177175998687744, + "learning_rate": 2.0326477404522163e-06, + "loss": 0.6721, + "step": 10476 + }, + { + "epoch": 0.7927812038893723, + "grad_norm": 2.187217950820923, + "learning_rate": 2.031222300931601e-06, + "loss": 0.7119, + "step": 10477 + }, + { + "epoch": 0.7928568726117059, + "grad_norm": 1.9448692798614502, + "learning_rate": 2.029797297818385e-06, + "loss": 0.5877, + "step": 10478 + }, + { + "epoch": 0.7929325413340396, + "grad_norm": 2.5591869354248047, + "learning_rate": 2.0283727312018075e-06, + "loss": 0.5605, + "step": 10479 + }, + { + "epoch": 0.7930082100563732, + "grad_norm": 2.923570394515991, + "learning_rate": 2.02694860117108e-06, + "loss": 0.6858, + "step": 10480 + }, + { + "epoch": 0.7930838787787068, + "grad_norm": 1.8390942811965942, + "learning_rate": 2.0255249078153825e-06, + "loss": 0.719, + "step": 10481 + }, + { + "epoch": 0.7931595475010405, + "grad_norm": 2.0713329315185547, + "learning_rate": 2.0241016512238716e-06, + "loss": 0.7737, + "step": 10482 + }, + { + "epoch": 0.7932352162233741, + "grad_norm": 1.8896489143371582, + "learning_rate": 
2.0226788314856824e-06, + "loss": 0.5676, + "step": 10483 + }, + { + "epoch": 0.7933108849457077, + "grad_norm": 2.4428815841674805, + "learning_rate": 2.021256448689909e-06, + "loss": 0.6226, + "step": 10484 + }, + { + "epoch": 0.7933865536680413, + "grad_norm": 2.0781445503234863, + "learning_rate": 2.01983450292563e-06, + "loss": 0.63, + "step": 10485 + }, + { + "epoch": 0.7934622223903749, + "grad_norm": 2.051621198654175, + "learning_rate": 2.0184129942818912e-06, + "loss": 0.6363, + "step": 10486 + }, + { + "epoch": 0.7935378911127086, + "grad_norm": 2.4698948860168457, + "learning_rate": 2.0169919228477136e-06, + "loss": 0.6365, + "step": 10487 + }, + { + "epoch": 0.7936135598350422, + "grad_norm": 2.3092644214630127, + "learning_rate": 2.0155712887120822e-06, + "loss": 0.6834, + "step": 10488 + }, + { + "epoch": 0.7936892285573758, + "grad_norm": 1.9739770889282227, + "learning_rate": 2.01415109196397e-06, + "loss": 0.5711, + "step": 10489 + }, + { + "epoch": 0.7937648972797094, + "grad_norm": 2.368990659713745, + "learning_rate": 2.0127313326923118e-06, + "loss": 0.6306, + "step": 10490 + }, + { + "epoch": 0.793840566002043, + "grad_norm": 1.8285222053527832, + "learning_rate": 2.01131201098602e-06, + "loss": 0.7374, + "step": 10491 + }, + { + "epoch": 0.7939162347243767, + "grad_norm": 2.3828883171081543, + "learning_rate": 2.0098931269339706e-06, + "loss": 0.694, + "step": 10492 + }, + { + "epoch": 0.7939919034467103, + "grad_norm": 2.278015375137329, + "learning_rate": 2.00847468062502e-06, + "loss": 0.6483, + "step": 10493 + }, + { + "epoch": 0.7940675721690439, + "grad_norm": 2.0421197414398193, + "learning_rate": 2.0070566721480044e-06, + "loss": 0.6897, + "step": 10494 + }, + { + "epoch": 0.7941432408913776, + "grad_norm": 1.976413369178772, + "learning_rate": 2.005639101591714e-06, + "loss": 0.6678, + "step": 10495 + }, + { + "epoch": 0.7942189096137112, + "grad_norm": 1.767219066619873, + "learning_rate": 2.0042219690449255e-06, + "loss": 0.6366, + "step": 10496 + }, + { + "epoch": 0.7942945783360448, + "grad_norm": 3.3812007904052734, + "learning_rate": 2.002805274596386e-06, + "loss": 0.5554, + "step": 10497 + }, + { + "epoch": 0.7943702470583784, + "grad_norm": 2.365610361099243, + "learning_rate": 2.0013890183348107e-06, + "loss": 0.778, + "step": 10498 + }, + { + "epoch": 0.794445915780712, + "grad_norm": 1.9793906211853027, + "learning_rate": 1.9999732003488917e-06, + "loss": 0.7294, + "step": 10499 + }, + { + "epoch": 0.7945215845030457, + "grad_norm": 1.898956060409546, + "learning_rate": 1.9985578207272914e-06, + "loss": 0.649, + "step": 10500 + }, + { + "epoch": 0.7945972532253793, + "grad_norm": 1.8723238706588745, + "learning_rate": 1.997142879558649e-06, + "loss": 0.584, + "step": 10501 + }, + { + "epoch": 0.7946729219477129, + "grad_norm": 2.4054598808288574, + "learning_rate": 1.9957283769315654e-06, + "loss": 0.7476, + "step": 10502 + }, + { + "epoch": 0.7947485906700466, + "grad_norm": 2.132002830505371, + "learning_rate": 1.994314312934624e-06, + "loss": 0.8334, + "step": 10503 + }, + { + "epoch": 0.7948242593923801, + "grad_norm": 2.4048826694488525, + "learning_rate": 1.9929006876563824e-06, + "loss": 0.6341, + "step": 10504 + }, + { + "epoch": 0.7948999281147138, + "grad_norm": 6.029483318328857, + "learning_rate": 1.991487501185365e-06, + "loss": 0.695, + "step": 10505 + }, + { + "epoch": 0.7949755968370474, + "grad_norm": 2.2719666957855225, + "learning_rate": 1.9900747536100666e-06, + "loss": 0.6664, + "step": 10506 + }, + { + "epoch": 
0.795051265559381, + "grad_norm": 2.24306058883667, + "learning_rate": 1.9886624450189597e-06, + "loss": 0.6445, + "step": 10507 + }, + { + "epoch": 0.7951269342817147, + "grad_norm": 2.079287052154541, + "learning_rate": 1.9872505755004876e-06, + "loss": 0.6169, + "step": 10508 + }, + { + "epoch": 0.7952026030040483, + "grad_norm": 1.9773471355438232, + "learning_rate": 1.985839145143068e-06, + "loss": 0.5757, + "step": 10509 + }, + { + "epoch": 0.7952782717263819, + "grad_norm": 1.859931468963623, + "learning_rate": 1.984428154035086e-06, + "loss": 0.655, + "step": 10510 + }, + { + "epoch": 0.7953539404487155, + "grad_norm": 1.6292186975479126, + "learning_rate": 1.983017602264904e-06, + "loss": 0.7832, + "step": 10511 + }, + { + "epoch": 0.7954296091710491, + "grad_norm": 1.8309117555618286, + "learning_rate": 1.981607489920859e-06, + "loss": 0.6333, + "step": 10512 + }, + { + "epoch": 0.7955052778933828, + "grad_norm": 1.76371169090271, + "learning_rate": 1.9801978170912485e-06, + "loss": 0.5041, + "step": 10513 + }, + { + "epoch": 0.7955809466157164, + "grad_norm": 1.981345534324646, + "learning_rate": 1.978788583864357e-06, + "loss": 0.6277, + "step": 10514 + }, + { + "epoch": 0.79565661533805, + "grad_norm": 2.2104129791259766, + "learning_rate": 1.9773797903284367e-06, + "loss": 0.6721, + "step": 10515 + }, + { + "epoch": 0.7957322840603837, + "grad_norm": 2.0217669010162354, + "learning_rate": 1.975971436571705e-06, + "loss": 0.585, + "step": 10516 + }, + { + "epoch": 0.7958079527827172, + "grad_norm": 2.435241222381592, + "learning_rate": 1.97456352268236e-06, + "loss": 0.6876, + "step": 10517 + }, + { + "epoch": 0.7958836215050509, + "grad_norm": 2.3520796298980713, + "learning_rate": 1.973156048748569e-06, + "loss": 0.4607, + "step": 10518 + }, + { + "epoch": 0.7959592902273845, + "grad_norm": 2.4293181896209717, + "learning_rate": 1.9717490148584775e-06, + "loss": 0.6131, + "step": 10519 + }, + { + "epoch": 0.7960349589497181, + "grad_norm": 2.50989031791687, + "learning_rate": 1.9703424211001926e-06, + "loss": 0.7233, + "step": 10520 + }, + { + "epoch": 0.7961106276720518, + "grad_norm": 2.3708267211914062, + "learning_rate": 1.968936267561803e-06, + "loss": 0.7077, + "step": 10521 + }, + { + "epoch": 0.7961862963943854, + "grad_norm": 1.8803597688674927, + "learning_rate": 1.9675305543313647e-06, + "loss": 0.632, + "step": 10522 + }, + { + "epoch": 0.796261965116719, + "grad_norm": 1.908509373664856, + "learning_rate": 1.9661252814969117e-06, + "loss": 0.6277, + "step": 10523 + }, + { + "epoch": 0.7963376338390526, + "grad_norm": 2.45629620552063, + "learning_rate": 1.964720449146439e-06, + "loss": 0.6072, + "step": 10524 + }, + { + "epoch": 0.7964133025613862, + "grad_norm": 3.3856143951416016, + "learning_rate": 1.9633160573679287e-06, + "loss": 0.6671, + "step": 10525 + }, + { + "epoch": 0.7964889712837199, + "grad_norm": 2.470855474472046, + "learning_rate": 1.9619121062493283e-06, + "loss": 0.6902, + "step": 10526 + }, + { + "epoch": 0.7965646400060535, + "grad_norm": 2.5444891452789307, + "learning_rate": 1.960508595878554e-06, + "loss": 0.5386, + "step": 10527 + }, + { + "epoch": 0.7966403087283871, + "grad_norm": 2.180934429168701, + "learning_rate": 1.9591055263434998e-06, + "loss": 0.605, + "step": 10528 + }, + { + "epoch": 0.7967159774507208, + "grad_norm": 2.0110931396484375, + "learning_rate": 1.9577028977320297e-06, + "loss": 0.6318, + "step": 10529 + }, + { + "epoch": 0.7967916461730543, + "grad_norm": 2.564516544342041, + "learning_rate": 
1.9563007101319826e-06, + "loss": 0.7039, + "step": 10530 + }, + { + "epoch": 0.796867314895388, + "grad_norm": 2.044375419616699, + "learning_rate": 1.9548989636311673e-06, + "loss": 0.7169, + "step": 10531 + }, + { + "epoch": 0.7969429836177216, + "grad_norm": 2.060332775115967, + "learning_rate": 1.9534976583173652e-06, + "loss": 0.5094, + "step": 10532 + }, + { + "epoch": 0.7970186523400552, + "grad_norm": 2.0547759532928467, + "learning_rate": 1.9520967942783307e-06, + "loss": 0.6053, + "step": 10533 + }, + { + "epoch": 0.7970943210623889, + "grad_norm": 2.1505582332611084, + "learning_rate": 1.950696371601791e-06, + "loss": 0.6574, + "step": 10534 + }, + { + "epoch": 0.7971699897847225, + "grad_norm": 2.3827664852142334, + "learning_rate": 1.949296390375445e-06, + "loss": 0.5319, + "step": 10535 + }, + { + "epoch": 0.7972456585070561, + "grad_norm": 2.6274614334106445, + "learning_rate": 1.947896850686963e-06, + "loss": 0.8266, + "step": 10536 + }, + { + "epoch": 0.7973213272293898, + "grad_norm": 2.0706329345703125, + "learning_rate": 1.946497752623993e-06, + "loss": 0.7172, + "step": 10537 + }, + { + "epoch": 0.7973969959517233, + "grad_norm": 2.9049019813537598, + "learning_rate": 1.945099096274144e-06, + "loss": 0.7605, + "step": 10538 + }, + { + "epoch": 0.797472664674057, + "grad_norm": 2.054645299911499, + "learning_rate": 1.943700881725006e-06, + "loss": 0.6115, + "step": 10539 + }, + { + "epoch": 0.7975483333963906, + "grad_norm": 2.4279866218566895, + "learning_rate": 1.9423031090641456e-06, + "loss": 0.6093, + "step": 10540 + }, + { + "epoch": 0.7976240021187242, + "grad_norm": 2.322181224822998, + "learning_rate": 1.9409057783790908e-06, + "loss": 0.7224, + "step": 10541 + }, + { + "epoch": 0.7976996708410579, + "grad_norm": 2.0702531337738037, + "learning_rate": 1.9395088897573463e-06, + "loss": 0.8073, + "step": 10542 + }, + { + "epoch": 0.7977753395633914, + "grad_norm": 3.023908853530884, + "learning_rate": 1.9381124432863933e-06, + "loss": 0.6391, + "step": 10543 + }, + { + "epoch": 0.7978510082857251, + "grad_norm": 2.054307699203491, + "learning_rate": 1.936716439053679e-06, + "loss": 0.7798, + "step": 10544 + }, + { + "epoch": 0.7979266770080587, + "grad_norm": 2.4131476879119873, + "learning_rate": 1.935320877146627e-06, + "loss": 0.7464, + "step": 10545 + }, + { + "epoch": 0.7980023457303923, + "grad_norm": 2.163196086883545, + "learning_rate": 1.9339257576526325e-06, + "loss": 0.7076, + "step": 10546 + }, + { + "epoch": 0.798078014452726, + "grad_norm": 2.453834056854248, + "learning_rate": 1.9325310806590596e-06, + "loss": 0.7057, + "step": 10547 + }, + { + "epoch": 0.7981536831750596, + "grad_norm": 1.4434751272201538, + "learning_rate": 1.9311368462532536e-06, + "loss": 0.8129, + "step": 10548 + }, + { + "epoch": 0.7982293518973932, + "grad_norm": 2.3839449882507324, + "learning_rate": 1.929743054522516e-06, + "loss": 0.7117, + "step": 10549 + }, + { + "epoch": 0.7983050206197269, + "grad_norm": 2.3013927936553955, + "learning_rate": 1.9283497055541383e-06, + "loss": 0.6521, + "step": 10550 + }, + { + "epoch": 0.7983806893420604, + "grad_norm": 2.079742670059204, + "learning_rate": 1.926956799435378e-06, + "loss": 0.5988, + "step": 10551 + }, + { + "epoch": 0.7984563580643941, + "grad_norm": 2.3923838138580322, + "learning_rate": 1.9255643362534573e-06, + "loss": 0.6227, + "step": 10552 + }, + { + "epoch": 0.7985320267867277, + "grad_norm": 2.4555153846740723, + "learning_rate": 1.9241723160955793e-06, + "loss": 0.6377, + "step": 10553 + }, + { + "epoch": 
0.7986076955090613, + "grad_norm": 1.8996168375015259, + "learning_rate": 1.9227807390489167e-06, + "loss": 0.6329, + "step": 10554 + }, + { + "epoch": 0.798683364231395, + "grad_norm": 2.6470742225646973, + "learning_rate": 1.9213896052006145e-06, + "loss": 0.7298, + "step": 10555 + }, + { + "epoch": 0.7987590329537286, + "grad_norm": 1.7246336936950684, + "learning_rate": 1.9199989146377903e-06, + "loss": 0.6901, + "step": 10556 + }, + { + "epoch": 0.7988347016760622, + "grad_norm": 3.4669127464294434, + "learning_rate": 1.918608667447534e-06, + "loss": 0.5589, + "step": 10557 + }, + { + "epoch": 0.7989103703983959, + "grad_norm": 2.552844762802124, + "learning_rate": 1.9172188637169087e-06, + "loss": 0.5577, + "step": 10558 + }, + { + "epoch": 0.7989860391207294, + "grad_norm": 3.0051169395446777, + "learning_rate": 1.9158295035329425e-06, + "loss": 0.5776, + "step": 10559 + }, + { + "epoch": 0.7990617078430631, + "grad_norm": 2.2740957736968994, + "learning_rate": 1.9144405869826475e-06, + "loss": 0.6686, + "step": 10560 + }, + { + "epoch": 0.7991373765653967, + "grad_norm": 2.724806308746338, + "learning_rate": 1.9130521141530013e-06, + "loss": 0.6296, + "step": 10561 + }, + { + "epoch": 0.7992130452877303, + "grad_norm": 2.1365087032318115, + "learning_rate": 1.9116640851309554e-06, + "loss": 0.7783, + "step": 10562 + }, + { + "epoch": 0.799288714010064, + "grad_norm": 2.0151805877685547, + "learning_rate": 1.9102765000034293e-06, + "loss": 0.6014, + "step": 10563 + }, + { + "epoch": 0.7993643827323975, + "grad_norm": 1.9399892091751099, + "learning_rate": 1.9088893588573187e-06, + "loss": 0.6421, + "step": 10564 + }, + { + "epoch": 0.7994400514547312, + "grad_norm": 2.5492935180664062, + "learning_rate": 1.9075026617794924e-06, + "loss": 0.8427, + "step": 10565 + }, + { + "epoch": 0.7995157201770648, + "grad_norm": 2.4646923542022705, + "learning_rate": 1.9061164088567896e-06, + "loss": 0.5545, + "step": 10566 + }, + { + "epoch": 0.7995913888993984, + "grad_norm": 1.8194031715393066, + "learning_rate": 1.9047306001760213e-06, + "loss": 0.6374, + "step": 10567 + }, + { + "epoch": 0.7996670576217321, + "grad_norm": 1.9019348621368408, + "learning_rate": 1.9033452358239716e-06, + "loss": 0.6827, + "step": 10568 + }, + { + "epoch": 0.7997427263440657, + "grad_norm": 2.4975194931030273, + "learning_rate": 1.9019603158873995e-06, + "loss": 0.7033, + "step": 10569 + }, + { + "epoch": 0.7998183950663993, + "grad_norm": 3.534217596054077, + "learning_rate": 1.9005758404530242e-06, + "loss": 0.6896, + "step": 10570 + }, + { + "epoch": 0.799894063788733, + "grad_norm": 1.9258010387420654, + "learning_rate": 1.8991918096075558e-06, + "loss": 0.6669, + "step": 10571 + }, + { + "epoch": 0.7999697325110665, + "grad_norm": 1.901734709739685, + "learning_rate": 1.8978082234376657e-06, + "loss": 0.6585, + "step": 10572 + }, + { + "epoch": 0.8000454012334002, + "grad_norm": 2.1467180252075195, + "learning_rate": 1.8964250820299927e-06, + "loss": 0.5655, + "step": 10573 + }, + { + "epoch": 0.8001210699557338, + "grad_norm": 2.114226818084717, + "learning_rate": 1.8950423854711563e-06, + "loss": 0.6701, + "step": 10574 + }, + { + "epoch": 0.8001967386780674, + "grad_norm": 2.7017455101013184, + "learning_rate": 1.8936601338477445e-06, + "loss": 0.5554, + "step": 10575 + }, + { + "epoch": 0.8002724074004011, + "grad_norm": 1.8943901062011719, + "learning_rate": 1.8922783272463251e-06, + "loss": 0.5889, + "step": 10576 + }, + { + "epoch": 0.8003480761227346, + "grad_norm": 2.2416491508483887, + 
"learning_rate": 1.8908969657534225e-06, + "loss": 0.7364, + "step": 10577 + }, + { + "epoch": 0.8004237448450683, + "grad_norm": 2.565739154815674, + "learning_rate": 1.889516049455546e-06, + "loss": 0.772, + "step": 10578 + }, + { + "epoch": 0.800499413567402, + "grad_norm": 2.755770683288574, + "learning_rate": 1.888135578439172e-06, + "loss": 0.6644, + "step": 10579 + }, + { + "epoch": 0.8005750822897355, + "grad_norm": 1.84762704372406, + "learning_rate": 1.8867555527907516e-06, + "loss": 0.6432, + "step": 10580 + }, + { + "epoch": 0.8006507510120692, + "grad_norm": 2.085479497909546, + "learning_rate": 1.8853759725967045e-06, + "loss": 0.7186, + "step": 10581 + }, + { + "epoch": 0.8007264197344028, + "grad_norm": 1.9291927814483643, + "learning_rate": 1.8839968379434267e-06, + "loss": 0.5889, + "step": 10582 + }, + { + "epoch": 0.8008020884567364, + "grad_norm": 2.066859483718872, + "learning_rate": 1.8826181489172843e-06, + "loss": 0.6291, + "step": 10583 + }, + { + "epoch": 0.8008777571790701, + "grad_norm": 2.1499745845794678, + "learning_rate": 1.8812399056046118e-06, + "loss": 0.5774, + "step": 10584 + }, + { + "epoch": 0.8009534259014036, + "grad_norm": 2.3581552505493164, + "learning_rate": 1.8798621080917184e-06, + "loss": 0.632, + "step": 10585 + }, + { + "epoch": 0.8010290946237373, + "grad_norm": 2.0463876724243164, + "learning_rate": 1.8784847564648952e-06, + "loss": 0.7999, + "step": 10586 + }, + { + "epoch": 0.801104763346071, + "grad_norm": 1.8345085382461548, + "learning_rate": 1.877107850810387e-06, + "loss": 0.6854, + "step": 10587 + }, + { + "epoch": 0.8011804320684045, + "grad_norm": 2.0658600330352783, + "learning_rate": 1.8757313912144227e-06, + "loss": 0.6487, + "step": 10588 + }, + { + "epoch": 0.8012561007907382, + "grad_norm": 2.1300668716430664, + "learning_rate": 1.874355377763203e-06, + "loss": 0.8562, + "step": 10589 + }, + { + "epoch": 0.8013317695130717, + "grad_norm": 2.3721659183502197, + "learning_rate": 1.8729798105428951e-06, + "loss": 0.6192, + "step": 10590 + }, + { + "epoch": 0.8014074382354054, + "grad_norm": 1.95151686668396, + "learning_rate": 1.8716046896396437e-06, + "loss": 0.6669, + "step": 10591 + }, + { + "epoch": 0.801483106957739, + "grad_norm": 2.4539577960968018, + "learning_rate": 1.8702300151395627e-06, + "loss": 0.8109, + "step": 10592 + }, + { + "epoch": 0.8015587756800726, + "grad_norm": 2.4067583084106445, + "learning_rate": 1.8688557871287382e-06, + "loss": 0.6994, + "step": 10593 + }, + { + "epoch": 0.8016344444024063, + "grad_norm": 2.889045476913452, + "learning_rate": 1.8674820056932325e-06, + "loss": 0.6797, + "step": 10594 + }, + { + "epoch": 0.8017101131247399, + "grad_norm": 2.1616549491882324, + "learning_rate": 1.8661086709190677e-06, + "loss": 0.7378, + "step": 10595 + }, + { + "epoch": 0.8017857818470735, + "grad_norm": 1.9277255535125732, + "learning_rate": 1.864735782892254e-06, + "loss": 0.6307, + "step": 10596 + }, + { + "epoch": 0.8018614505694072, + "grad_norm": 2.1379098892211914, + "learning_rate": 1.8633633416987667e-06, + "loss": 0.5359, + "step": 10597 + }, + { + "epoch": 0.8019371192917407, + "grad_norm": 2.582929849624634, + "learning_rate": 1.861991347424547e-06, + "loss": 0.6166, + "step": 10598 + }, + { + "epoch": 0.8020127880140744, + "grad_norm": 2.745823383331299, + "learning_rate": 1.8606198001555162e-06, + "loss": 0.7859, + "step": 10599 + }, + { + "epoch": 0.802088456736408, + "grad_norm": 1.990330696105957, + "learning_rate": 1.8592486999775644e-06, + "loss": 0.7426, + "step": 10600 + }, + 
{ + "epoch": 0.8021641254587416, + "grad_norm": 2.0357511043548584, + "learning_rate": 1.8578780469765562e-06, + "loss": 0.8174, + "step": 10601 + }, + { + "epoch": 0.8022397941810753, + "grad_norm": 1.937957763671875, + "learning_rate": 1.8565078412383238e-06, + "loss": 0.8851, + "step": 10602 + }, + { + "epoch": 0.8023154629034088, + "grad_norm": 2.319841146469116, + "learning_rate": 1.8551380828486765e-06, + "loss": 0.6238, + "step": 10603 + }, + { + "epoch": 0.8023911316257425, + "grad_norm": 2.1866133213043213, + "learning_rate": 1.8537687718933928e-06, + "loss": 0.7628, + "step": 10604 + }, + { + "epoch": 0.8024668003480762, + "grad_norm": 2.112520933151245, + "learning_rate": 1.852399908458221e-06, + "loss": 0.565, + "step": 10605 + }, + { + "epoch": 0.8025424690704097, + "grad_norm": 2.1112277507781982, + "learning_rate": 1.8510314926288826e-06, + "loss": 0.7111, + "step": 10606 + }, + { + "epoch": 0.8026181377927434, + "grad_norm": 2.6001243591308594, + "learning_rate": 1.8496635244910772e-06, + "loss": 0.6784, + "step": 10607 + }, + { + "epoch": 0.802693806515077, + "grad_norm": 2.205632448196411, + "learning_rate": 1.84829600413047e-06, + "loss": 0.6225, + "step": 10608 + }, + { + "epoch": 0.8027694752374106, + "grad_norm": 2.1408543586730957, + "learning_rate": 1.8469289316326977e-06, + "loss": 0.6406, + "step": 10609 + }, + { + "epoch": 0.8028451439597443, + "grad_norm": 2.142906904220581, + "learning_rate": 1.8455623070833706e-06, + "loss": 0.7465, + "step": 10610 + }, + { + "epoch": 0.8029208126820778, + "grad_norm": 2.3650617599487305, + "learning_rate": 1.8441961305680726e-06, + "loss": 0.6459, + "step": 10611 + }, + { + "epoch": 0.8029964814044115, + "grad_norm": 2.328948497772217, + "learning_rate": 1.842830402172357e-06, + "loss": 0.7611, + "step": 10612 + }, + { + "epoch": 0.8030721501267452, + "grad_norm": 1.9152311086654663, + "learning_rate": 1.8414651219817513e-06, + "loss": 0.5491, + "step": 10613 + }, + { + "epoch": 0.8031478188490787, + "grad_norm": 3.1804325580596924, + "learning_rate": 1.8401002900817533e-06, + "loss": 0.6511, + "step": 10614 + }, + { + "epoch": 0.8032234875714124, + "grad_norm": 2.4679126739501953, + "learning_rate": 1.8387359065578344e-06, + "loss": 0.6399, + "step": 10615 + }, + { + "epoch": 0.8032991562937459, + "grad_norm": 2.6551365852355957, + "learning_rate": 1.8373719714954315e-06, + "loss": 0.7862, + "step": 10616 + }, + { + "epoch": 0.8033748250160796, + "grad_norm": 1.9284342527389526, + "learning_rate": 1.8360084849799643e-06, + "loss": 0.6186, + "step": 10617 + }, + { + "epoch": 0.8034504937384133, + "grad_norm": 2.3558826446533203, + "learning_rate": 1.8346454470968194e-06, + "loss": 0.7019, + "step": 10618 + }, + { + "epoch": 0.8035261624607468, + "grad_norm": 2.2871432304382324, + "learning_rate": 1.8332828579313505e-06, + "loss": 0.7563, + "step": 10619 + }, + { + "epoch": 0.8036018311830805, + "grad_norm": 2.3927693367004395, + "learning_rate": 1.8319207175688881e-06, + "loss": 0.7131, + "step": 10620 + }, + { + "epoch": 0.8036774999054141, + "grad_norm": 2.0817835330963135, + "learning_rate": 1.8305590260947336e-06, + "loss": 0.6026, + "step": 10621 + }, + { + "epoch": 0.8037531686277477, + "grad_norm": 2.117398738861084, + "learning_rate": 1.8291977835941651e-06, + "loss": 0.6481, + "step": 10622 + }, + { + "epoch": 0.8038288373500814, + "grad_norm": 2.352769136428833, + "learning_rate": 1.827836990152423e-06, + "loss": 0.6245, + "step": 10623 + }, + { + "epoch": 0.8039045060724149, + "grad_norm": 1.9620620012283325, + 
"learning_rate": 1.8264766458547258e-06, + "loss": 0.8225, + "step": 10624 + }, + { + "epoch": 0.8039801747947486, + "grad_norm": 1.7931766510009766, + "learning_rate": 1.8251167507862633e-06, + "loss": 0.5747, + "step": 10625 + }, + { + "epoch": 0.8040558435170823, + "grad_norm": 2.456925630569458, + "learning_rate": 1.8237573050321955e-06, + "loss": 0.6384, + "step": 10626 + }, + { + "epoch": 0.8041315122394158, + "grad_norm": 2.1616482734680176, + "learning_rate": 1.8223983086776574e-06, + "loss": 0.6522, + "step": 10627 + }, + { + "epoch": 0.8042071809617495, + "grad_norm": 1.8963215351104736, + "learning_rate": 1.8210397618077507e-06, + "loss": 0.679, + "step": 10628 + }, + { + "epoch": 0.804282849684083, + "grad_norm": 2.105846643447876, + "learning_rate": 1.8196816645075575e-06, + "loss": 0.7072, + "step": 10629 + }, + { + "epoch": 0.8043585184064167, + "grad_norm": 2.052081823348999, + "learning_rate": 1.8183240168621198e-06, + "loss": 0.5888, + "step": 10630 + }, + { + "epoch": 0.8044341871287504, + "grad_norm": 2.6387953758239746, + "learning_rate": 1.8169668189564574e-06, + "loss": 0.5963, + "step": 10631 + }, + { + "epoch": 0.8045098558510839, + "grad_norm": 3.556173801422119, + "learning_rate": 1.8156100708755705e-06, + "loss": 0.6972, + "step": 10632 + }, + { + "epoch": 0.8045855245734176, + "grad_norm": 2.080299139022827, + "learning_rate": 1.8142537727044158e-06, + "loss": 0.5975, + "step": 10633 + }, + { + "epoch": 0.8046611932957513, + "grad_norm": 2.5786876678466797, + "learning_rate": 1.812897924527932e-06, + "loss": 0.64, + "step": 10634 + }, + { + "epoch": 0.8047368620180848, + "grad_norm": 3.127788543701172, + "learning_rate": 1.8115425264310257e-06, + "loss": 0.7167, + "step": 10635 + }, + { + "epoch": 0.8048125307404185, + "grad_norm": 1.6849464178085327, + "learning_rate": 1.810187578498577e-06, + "loss": 0.6677, + "step": 10636 + }, + { + "epoch": 0.804888199462752, + "grad_norm": 2.1735470294952393, + "learning_rate": 1.8088330808154364e-06, + "loss": 0.6236, + "step": 10637 + }, + { + "epoch": 0.8049638681850857, + "grad_norm": 2.128405809402466, + "learning_rate": 1.8074790334664275e-06, + "loss": 0.6748, + "step": 10638 + }, + { + "epoch": 0.8050395369074194, + "grad_norm": 2.5104458332061768, + "learning_rate": 1.806125436536345e-06, + "loss": 0.6915, + "step": 10639 + }, + { + "epoch": 0.8051152056297529, + "grad_norm": 2.0362462997436523, + "learning_rate": 1.8047722901099575e-06, + "loss": 0.5961, + "step": 10640 + }, + { + "epoch": 0.8051908743520866, + "grad_norm": 3.8204987049102783, + "learning_rate": 1.803419594271999e-06, + "loss": 0.6853, + "step": 10641 + }, + { + "epoch": 0.8052665430744201, + "grad_norm": 2.5584213733673096, + "learning_rate": 1.80206734910718e-06, + "loss": 0.8556, + "step": 10642 + }, + { + "epoch": 0.8053422117967538, + "grad_norm": 2.950486898422241, + "learning_rate": 1.800715554700189e-06, + "loss": 0.7538, + "step": 10643 + }, + { + "epoch": 0.8054178805190875, + "grad_norm": 1.961512565612793, + "learning_rate": 1.7993642111356726e-06, + "loss": 0.6005, + "step": 10644 + }, + { + "epoch": 0.805493549241421, + "grad_norm": 2.0798726081848145, + "learning_rate": 1.7980133184982597e-06, + "loss": 0.7188, + "step": 10645 + }, + { + "epoch": 0.8055692179637547, + "grad_norm": 2.015023946762085, + "learning_rate": 1.796662876872547e-06, + "loss": 0.6464, + "step": 10646 + }, + { + "epoch": 0.8056448866860884, + "grad_norm": 2.150325059890747, + "learning_rate": 1.7953128863431025e-06, + "loss": 0.6864, + "step": 10647 + }, + { 
+ "epoch": 0.8057205554084219, + "grad_norm": 2.242133617401123, + "learning_rate": 1.7939633469944687e-06, + "loss": 0.5954, + "step": 10648 + }, + { + "epoch": 0.8057962241307556, + "grad_norm": 2.3537936210632324, + "learning_rate": 1.792614258911157e-06, + "loss": 0.734, + "step": 10649 + }, + { + "epoch": 0.8058718928530891, + "grad_norm": 4.886502742767334, + "learning_rate": 1.7912656221776517e-06, + "loss": 0.5386, + "step": 10650 + }, + { + "epoch": 0.8059475615754228, + "grad_norm": 2.2850396633148193, + "learning_rate": 1.7899174368784116e-06, + "loss": 0.7146, + "step": 10651 + }, + { + "epoch": 0.8060232302977565, + "grad_norm": 1.9249435663223267, + "learning_rate": 1.7885697030978569e-06, + "loss": 0.7584, + "step": 10652 + }, + { + "epoch": 0.80609889902009, + "grad_norm": 1.7427829504013062, + "learning_rate": 1.787222420920394e-06, + "loss": 0.694, + "step": 10653 + }, + { + "epoch": 0.8061745677424237, + "grad_norm": 2.255074977874756, + "learning_rate": 1.7858755904303947e-06, + "loss": 0.7371, + "step": 10654 + }, + { + "epoch": 0.8062502364647572, + "grad_norm": 2.0775415897369385, + "learning_rate": 1.7845292117121972e-06, + "loss": 0.601, + "step": 10655 + }, + { + "epoch": 0.8063259051870909, + "grad_norm": 3.3672308921813965, + "learning_rate": 1.7831832848501183e-06, + "loss": 0.6938, + "step": 10656 + }, + { + "epoch": 0.8064015739094246, + "grad_norm": 2.231694459915161, + "learning_rate": 1.7818378099284435e-06, + "loss": 0.6689, + "step": 10657 + }, + { + "epoch": 0.8064772426317581, + "grad_norm": 2.523862838745117, + "learning_rate": 1.7804927870314314e-06, + "loss": 0.6083, + "step": 10658 + }, + { + "epoch": 0.8065529113540918, + "grad_norm": 2.5286412239074707, + "learning_rate": 1.7791482162433126e-06, + "loss": 0.7218, + "step": 10659 + }, + { + "epoch": 0.8066285800764255, + "grad_norm": 1.992052674293518, + "learning_rate": 1.7778040976482867e-06, + "loss": 0.7306, + "step": 10660 + }, + { + "epoch": 0.806704248798759, + "grad_norm": 2.2717790603637695, + "learning_rate": 1.7764604313305307e-06, + "loss": 0.7695, + "step": 10661 + }, + { + "epoch": 0.8067799175210927, + "grad_norm": 2.3762831687927246, + "learning_rate": 1.7751172173741807e-06, + "loss": 0.6682, + "step": 10662 + }, + { + "epoch": 0.8068555862434262, + "grad_norm": 1.9005999565124512, + "learning_rate": 1.773774455863361e-06, + "loss": 0.7671, + "step": 10663 + }, + { + "epoch": 0.8069312549657599, + "grad_norm": 1.9350870847702026, + "learning_rate": 1.772432146882158e-06, + "loss": 0.5733, + "step": 10664 + }, + { + "epoch": 0.8070069236880936, + "grad_norm": 1.9355764389038086, + "learning_rate": 1.7710902905146324e-06, + "loss": 0.5401, + "step": 10665 + }, + { + "epoch": 0.8070825924104271, + "grad_norm": 8.177988052368164, + "learning_rate": 1.7697488868448123e-06, + "loss": 0.6673, + "step": 10666 + }, + { + "epoch": 0.8071582611327608, + "grad_norm": 2.334674835205078, + "learning_rate": 1.7684079359567002e-06, + "loss": 0.6466, + "step": 10667 + }, + { + "epoch": 0.8072339298550943, + "grad_norm": 1.965072751045227, + "learning_rate": 1.7670674379342773e-06, + "loss": 0.6563, + "step": 10668 + }, + { + "epoch": 0.807309598577428, + "grad_norm": 2.178422451019287, + "learning_rate": 1.7657273928614828e-06, + "loss": 0.7631, + "step": 10669 + }, + { + "epoch": 0.8073852672997617, + "grad_norm": 2.614917755126953, + "learning_rate": 1.7643878008222373e-06, + "loss": 0.63, + "step": 10670 + }, + { + "epoch": 0.8074609360220952, + "grad_norm": 2.7337183952331543, + 
"learning_rate": 1.7630486619004313e-06, + "loss": 0.7156, + "step": 10671 + }, + { + "epoch": 0.8075366047444289, + "grad_norm": 2.084549903869629, + "learning_rate": 1.7617099761799246e-06, + "loss": 0.49, + "step": 10672 + }, + { + "epoch": 0.8076122734667626, + "grad_norm": 2.35610294342041, + "learning_rate": 1.7603717437445506e-06, + "loss": 0.6401, + "step": 10673 + }, + { + "epoch": 0.8076879421890961, + "grad_norm": 2.1338629722595215, + "learning_rate": 1.7590339646781149e-06, + "loss": 0.5452, + "step": 10674 + }, + { + "epoch": 0.8077636109114298, + "grad_norm": 2.5002894401550293, + "learning_rate": 1.7576966390643935e-06, + "loss": 0.6739, + "step": 10675 + }, + { + "epoch": 0.8078392796337633, + "grad_norm": 2.1118812561035156, + "learning_rate": 1.7563597669871315e-06, + "loss": 0.5796, + "step": 10676 + }, + { + "epoch": 0.807914948356097, + "grad_norm": 2.3569159507751465, + "learning_rate": 1.7550233485300469e-06, + "loss": 0.7535, + "step": 10677 + }, + { + "epoch": 0.8079906170784307, + "grad_norm": 1.8390127420425415, + "learning_rate": 1.7536873837768358e-06, + "loss": 0.6276, + "step": 10678 + }, + { + "epoch": 0.8080662858007642, + "grad_norm": 2.113814115524292, + "learning_rate": 1.7523518728111603e-06, + "loss": 0.5618, + "step": 10679 + }, + { + "epoch": 0.8081419545230979, + "grad_norm": 2.1201260089874268, + "learning_rate": 1.7510168157166506e-06, + "loss": 0.6731, + "step": 10680 + }, + { + "epoch": 0.8082176232454314, + "grad_norm": 2.6762893199920654, + "learning_rate": 1.7496822125769133e-06, + "loss": 0.5319, + "step": 10681 + }, + { + "epoch": 0.8082932919677651, + "grad_norm": 2.3151755332946777, + "learning_rate": 1.7483480634755262e-06, + "loss": 0.7085, + "step": 10682 + }, + { + "epoch": 0.8083689606900988, + "grad_norm": 2.3955535888671875, + "learning_rate": 1.7470143684960382e-06, + "loss": 0.6417, + "step": 10683 + }, + { + "epoch": 0.8084446294124323, + "grad_norm": 2.628687858581543, + "learning_rate": 1.7456811277219693e-06, + "loss": 0.5682, + "step": 10684 + }, + { + "epoch": 0.808520298134766, + "grad_norm": 2.5121073722839355, + "learning_rate": 1.7443483412368119e-06, + "loss": 0.675, + "step": 10685 + }, + { + "epoch": 0.8085959668570997, + "grad_norm": 2.235436201095581, + "learning_rate": 1.7430160091240313e-06, + "loss": 0.599, + "step": 10686 + }, + { + "epoch": 0.8086716355794332, + "grad_norm": 2.1594743728637695, + "learning_rate": 1.7416841314670577e-06, + "loss": 0.6628, + "step": 10687 + }, + { + "epoch": 0.8087473043017669, + "grad_norm": 2.264970064163208, + "learning_rate": 1.7403527083492974e-06, + "loss": 0.5556, + "step": 10688 + }, + { + "epoch": 0.8088229730241004, + "grad_norm": 2.473870038986206, + "learning_rate": 1.7390217398541355e-06, + "loss": 0.6411, + "step": 10689 + }, + { + "epoch": 0.8088986417464341, + "grad_norm": 2.277695894241333, + "learning_rate": 1.7376912260649158e-06, + "loss": 0.6542, + "step": 10690 + }, + { + "epoch": 0.8089743104687678, + "grad_norm": 2.1290292739868164, + "learning_rate": 1.73636116706496e-06, + "loss": 0.6924, + "step": 10691 + }, + { + "epoch": 0.8090499791911013, + "grad_norm": 2.1614058017730713, + "learning_rate": 1.7350315629375611e-06, + "loss": 0.588, + "step": 10692 + }, + { + "epoch": 0.809125647913435, + "grad_norm": 1.7077982425689697, + "learning_rate": 1.733702413765984e-06, + "loss": 0.5756, + "step": 10693 + }, + { + "epoch": 0.8092013166357686, + "grad_norm": 2.483567953109741, + "learning_rate": 1.7323737196334635e-06, + "loss": 0.6524, + "step": 10694 + }, 
+ { + "epoch": 0.8092769853581022, + "grad_norm": 2.2687125205993652, + "learning_rate": 1.7310454806232077e-06, + "loss": 0.6016, + "step": 10695 + }, + { + "epoch": 0.8093526540804359, + "grad_norm": 2.1247775554656982, + "learning_rate": 1.7297176968183935e-06, + "loss": 0.6076, + "step": 10696 + }, + { + "epoch": 0.8094283228027694, + "grad_norm": 2.6836366653442383, + "learning_rate": 1.7283903683021748e-06, + "loss": 0.6584, + "step": 10697 + }, + { + "epoch": 0.8095039915251031, + "grad_norm": 2.096525192260742, + "learning_rate": 1.7270634951576667e-06, + "loss": 0.5641, + "step": 10698 + }, + { + "epoch": 0.8095796602474368, + "grad_norm": 2.056962251663208, + "learning_rate": 1.7257370774679675e-06, + "loss": 0.6351, + "step": 10699 + }, + { + "epoch": 0.8096553289697703, + "grad_norm": 2.368328809738159, + "learning_rate": 1.7244111153161425e-06, + "loss": 0.696, + "step": 10700 + }, + { + "epoch": 0.809730997692104, + "grad_norm": 2.048150062561035, + "learning_rate": 1.7230856087852236e-06, + "loss": 0.6948, + "step": 10701 + }, + { + "epoch": 0.8098066664144375, + "grad_norm": 3.076840400695801, + "learning_rate": 1.7217605579582204e-06, + "loss": 0.6797, + "step": 10702 + }, + { + "epoch": 0.8098823351367712, + "grad_norm": 1.980602502822876, + "learning_rate": 1.7204359629181112e-06, + "loss": 0.6077, + "step": 10703 + }, + { + "epoch": 0.8099580038591049, + "grad_norm": 2.4633147716522217, + "learning_rate": 1.719111823747847e-06, + "loss": 0.6029, + "step": 10704 + }, + { + "epoch": 0.8100336725814384, + "grad_norm": 6.110641956329346, + "learning_rate": 1.7177881405303505e-06, + "loss": 0.7389, + "step": 10705 + }, + { + "epoch": 0.8101093413037721, + "grad_norm": 2.669058322906494, + "learning_rate": 1.716464913348514e-06, + "loss": 0.6563, + "step": 10706 + }, + { + "epoch": 0.8101850100261057, + "grad_norm": 2.2113122940063477, + "learning_rate": 1.7151421422852045e-06, + "loss": 0.6166, + "step": 10707 + }, + { + "epoch": 0.8102606787484393, + "grad_norm": 2.3939402103424072, + "learning_rate": 1.7138198274232508e-06, + "loss": 0.618, + "step": 10708 + }, + { + "epoch": 0.810336347470773, + "grad_norm": 2.17268967628479, + "learning_rate": 1.7124979688454684e-06, + "loss": 0.8182, + "step": 10709 + }, + { + "epoch": 0.8104120161931065, + "grad_norm": 9.700509071350098, + "learning_rate": 1.7111765666346343e-06, + "loss": 0.6712, + "step": 10710 + }, + { + "epoch": 0.8104876849154402, + "grad_norm": 2.0205531120300293, + "learning_rate": 1.7098556208735011e-06, + "loss": 0.7657, + "step": 10711 + }, + { + "epoch": 0.8105633536377739, + "grad_norm": 1.8378758430480957, + "learning_rate": 1.708535131644785e-06, + "loss": 0.6912, + "step": 10712 + }, + { + "epoch": 0.8106390223601074, + "grad_norm": 1.6077888011932373, + "learning_rate": 1.7072150990311805e-06, + "loss": 0.6132, + "step": 10713 + }, + { + "epoch": 0.8107146910824411, + "grad_norm": 1.9418666362762451, + "learning_rate": 1.7058955231153598e-06, + "loss": 0.6918, + "step": 10714 + }, + { + "epoch": 0.8107903598047747, + "grad_norm": 2.010127544403076, + "learning_rate": 1.7045764039799502e-06, + "loss": 0.6395, + "step": 10715 + }, + { + "epoch": 0.8108660285271083, + "grad_norm": 1.9228168725967407, + "learning_rate": 1.7032577417075624e-06, + "loss": 0.7397, + "step": 10716 + }, + { + "epoch": 0.810941697249442, + "grad_norm": 2.564603328704834, + "learning_rate": 1.7019395363807748e-06, + "loss": 0.7028, + "step": 10717 + }, + { + "epoch": 0.8110173659717755, + "grad_norm": 2.034059762954712, + 
"learning_rate": 1.7006217880821414e-06, + "loss": 0.6505, + "step": 10718 + }, + { + "epoch": 0.8110930346941092, + "grad_norm": 2.007988691329956, + "learning_rate": 1.6993044968941754e-06, + "loss": 0.7674, + "step": 10719 + }, + { + "epoch": 0.8111687034164428, + "grad_norm": 2.383240222930908, + "learning_rate": 1.6979876628993777e-06, + "loss": 0.7798, + "step": 10720 + }, + { + "epoch": 0.8112443721387764, + "grad_norm": 2.532583236694336, + "learning_rate": 1.6966712861802135e-06, + "loss": 0.7291, + "step": 10721 + }, + { + "epoch": 0.8113200408611101, + "grad_norm": 2.3727943897247314, + "learning_rate": 1.6953553668191115e-06, + "loss": 0.6543, + "step": 10722 + }, + { + "epoch": 0.8113957095834436, + "grad_norm": 2.5050201416015625, + "learning_rate": 1.6940399048984833e-06, + "loss": 0.5767, + "step": 10723 + }, + { + "epoch": 0.8114713783057773, + "grad_norm": 2.1862447261810303, + "learning_rate": 1.6927249005007034e-06, + "loss": 0.7536, + "step": 10724 + }, + { + "epoch": 0.811547047028111, + "grad_norm": 2.4321300983428955, + "learning_rate": 1.6914103537081305e-06, + "loss": 0.7764, + "step": 10725 + }, + { + "epoch": 0.8116227157504445, + "grad_norm": 2.240424871444702, + "learning_rate": 1.6900962646030772e-06, + "loss": 0.6716, + "step": 10726 + }, + { + "epoch": 0.8116983844727782, + "grad_norm": 2.253845453262329, + "learning_rate": 1.6887826332678393e-06, + "loss": 0.6926, + "step": 10727 + }, + { + "epoch": 0.8117740531951118, + "grad_norm": 1.9651557207107544, + "learning_rate": 1.6874694597846795e-06, + "loss": 0.7082, + "step": 10728 + }, + { + "epoch": 0.8118497219174454, + "grad_norm": 2.718416929244995, + "learning_rate": 1.686156744235834e-06, + "loss": 0.7596, + "step": 10729 + }, + { + "epoch": 0.8119253906397791, + "grad_norm": 2.3345022201538086, + "learning_rate": 1.6848444867035093e-06, + "loss": 0.6771, + "step": 10730 + }, + { + "epoch": 0.8120010593621126, + "grad_norm": 2.0901668071746826, + "learning_rate": 1.6835326872698826e-06, + "loss": 0.6624, + "step": 10731 + }, + { + "epoch": 0.8120767280844463, + "grad_norm": 2.0496819019317627, + "learning_rate": 1.6822213460171061e-06, + "loss": 0.6007, + "step": 10732 + }, + { + "epoch": 0.8121523968067799, + "grad_norm": 1.9577654600143433, + "learning_rate": 1.6809104630272944e-06, + "loss": 0.8049, + "step": 10733 + }, + { + "epoch": 0.8122280655291135, + "grad_norm": 2.862490177154541, + "learning_rate": 1.6796000383825414e-06, + "loss": 0.6048, + "step": 10734 + }, + { + "epoch": 0.8123037342514472, + "grad_norm": 2.0185632705688477, + "learning_rate": 1.6782900721649146e-06, + "loss": 0.67, + "step": 10735 + }, + { + "epoch": 0.8123794029737808, + "grad_norm": 2.023618221282959, + "learning_rate": 1.6769805644564426e-06, + "loss": 0.599, + "step": 10736 + }, + { + "epoch": 0.8124550716961144, + "grad_norm": 2.3943023681640625, + "learning_rate": 1.6756715153391327e-06, + "loss": 0.5855, + "step": 10737 + }, + { + "epoch": 0.8125307404184481, + "grad_norm": 2.002091407775879, + "learning_rate": 1.6743629248949631e-06, + "loss": 0.7371, + "step": 10738 + }, + { + "epoch": 0.8126064091407816, + "grad_norm": 2.1737380027770996, + "learning_rate": 1.6730547932058806e-06, + "loss": 0.5976, + "step": 10739 + }, + { + "epoch": 0.8126820778631153, + "grad_norm": 2.151082992553711, + "learning_rate": 1.6717471203538053e-06, + "loss": 0.7263, + "step": 10740 + }, + { + "epoch": 0.8127577465854489, + "grad_norm": 2.34664249420166, + "learning_rate": 1.670439906420628e-06, + "loss": 0.6413, + "step": 10741 + 
}, + { + "epoch": 0.8128334153077825, + "grad_norm": 2.2073299884796143, + "learning_rate": 1.66913315148821e-06, + "loss": 0.7334, + "step": 10742 + }, + { + "epoch": 0.8129090840301162, + "grad_norm": 1.9917361736297607, + "learning_rate": 1.667826855638388e-06, + "loss": 0.6073, + "step": 10743 + }, + { + "epoch": 0.8129847527524497, + "grad_norm": 2.251788854598999, + "learning_rate": 1.6665210189529585e-06, + "loss": 0.6817, + "step": 10744 + }, + { + "epoch": 0.8130604214747834, + "grad_norm": 2.446361541748047, + "learning_rate": 1.6652156415137041e-06, + "loss": 0.7737, + "step": 10745 + }, + { + "epoch": 0.813136090197117, + "grad_norm": 2.6486692428588867, + "learning_rate": 1.6639107234023723e-06, + "loss": 0.5922, + "step": 10746 + }, + { + "epoch": 0.8132117589194506, + "grad_norm": 2.2547566890716553, + "learning_rate": 1.662606264700676e-06, + "loss": 0.6869, + "step": 10747 + }, + { + "epoch": 0.8132874276417843, + "grad_norm": 2.7549359798431396, + "learning_rate": 1.6613022654903086e-06, + "loss": 0.5634, + "step": 10748 + }, + { + "epoch": 0.8133630963641179, + "grad_norm": 2.0122973918914795, + "learning_rate": 1.6599987258529288e-06, + "loss": 0.5678, + "step": 10749 + }, + { + "epoch": 0.8134387650864515, + "grad_norm": 2.138713836669922, + "learning_rate": 1.6586956458701685e-06, + "loss": 0.7879, + "step": 10750 + }, + { + "epoch": 0.8135144338087852, + "grad_norm": 2.357614040374756, + "learning_rate": 1.6573930256236323e-06, + "loss": 0.6622, + "step": 10751 + }, + { + "epoch": 0.8135901025311187, + "grad_norm": 2.2221148014068604, + "learning_rate": 1.656090865194894e-06, + "loss": 0.79, + "step": 10752 + }, + { + "epoch": 0.8136657712534524, + "grad_norm": 2.7023918628692627, + "learning_rate": 1.654789164665499e-06, + "loss": 0.716, + "step": 10753 + }, + { + "epoch": 0.813741439975786, + "grad_norm": 2.392548084259033, + "learning_rate": 1.6534879241169625e-06, + "loss": 0.6097, + "step": 10754 + }, + { + "epoch": 0.8138171086981196, + "grad_norm": 2.4918103218078613, + "learning_rate": 1.6521871436307754e-06, + "loss": 0.5744, + "step": 10755 + }, + { + "epoch": 0.8138927774204533, + "grad_norm": 1.715964913368225, + "learning_rate": 1.6508868232883932e-06, + "loss": 0.614, + "step": 10756 + }, + { + "epoch": 0.8139684461427869, + "grad_norm": 1.9560282230377197, + "learning_rate": 1.649586963171252e-06, + "loss": 0.6038, + "step": 10757 + }, + { + "epoch": 0.8140441148651205, + "grad_norm": 1.8922325372695923, + "learning_rate": 1.6482875633607465e-06, + "loss": 0.6643, + "step": 10758 + }, + { + "epoch": 0.8141197835874541, + "grad_norm": 2.1275534629821777, + "learning_rate": 1.6469886239382518e-06, + "loss": 0.7323, + "step": 10759 + }, + { + "epoch": 0.8141954523097877, + "grad_norm": 2.745668649673462, + "learning_rate": 1.6456901449851118e-06, + "loss": 0.655, + "step": 10760 + }, + { + "epoch": 0.8142711210321214, + "grad_norm": 2.2803080081939697, + "learning_rate": 1.6443921265826423e-06, + "loss": 0.5338, + "step": 10761 + }, + { + "epoch": 0.814346789754455, + "grad_norm": 2.5156054496765137, + "learning_rate": 1.6430945688121284e-06, + "loss": 0.6988, + "step": 10762 + }, + { + "epoch": 0.8144224584767886, + "grad_norm": 1.8427619934082031, + "learning_rate": 1.6417974717548272e-06, + "loss": 0.5556, + "step": 10763 + }, + { + "epoch": 0.8144981271991223, + "grad_norm": 2.746143341064453, + "learning_rate": 1.6405008354919705e-06, + "loss": 0.7378, + "step": 10764 + }, + { + "epoch": 0.8145737959214558, + "grad_norm": 2.229966163635254, + 
"learning_rate": 1.6392046601047505e-06, + "loss": 0.7431, + "step": 10765 + }, + { + "epoch": 0.8146494646437895, + "grad_norm": 3.1086843013763428, + "learning_rate": 1.637908945674344e-06, + "loss": 0.7127, + "step": 10766 + }, + { + "epoch": 0.8147251333661231, + "grad_norm": 2.0336904525756836, + "learning_rate": 1.6366136922818926e-06, + "loss": 0.6288, + "step": 10767 + }, + { + "epoch": 0.8148008020884567, + "grad_norm": 2.8491573333740234, + "learning_rate": 1.635318900008509e-06, + "loss": 0.7268, + "step": 10768 + }, + { + "epoch": 0.8148764708107904, + "grad_norm": 2.5054476261138916, + "learning_rate": 1.6340245689352744e-06, + "loss": 0.7421, + "step": 10769 + }, + { + "epoch": 0.814952139533124, + "grad_norm": 2.3975508213043213, + "learning_rate": 1.6327306991432431e-06, + "loss": 0.6014, + "step": 10770 + }, + { + "epoch": 0.8150278082554576, + "grad_norm": 1.5673209428787231, + "learning_rate": 1.6314372907134484e-06, + "loss": 0.8127, + "step": 10771 + }, + { + "epoch": 0.8151034769777912, + "grad_norm": 1.5236127376556396, + "learning_rate": 1.630144343726882e-06, + "loss": 0.6618, + "step": 10772 + }, + { + "epoch": 0.8151791457001248, + "grad_norm": 2.371945858001709, + "learning_rate": 1.6288518582645128e-06, + "loss": 0.7702, + "step": 10773 + }, + { + "epoch": 0.8152548144224585, + "grad_norm": 3.2651684284210205, + "learning_rate": 1.6275598344072825e-06, + "loss": 0.53, + "step": 10774 + }, + { + "epoch": 0.8153304831447921, + "grad_norm": 2.621706008911133, + "learning_rate": 1.6262682722360997e-06, + "loss": 0.6125, + "step": 10775 + }, + { + "epoch": 0.8154061518671257, + "grad_norm": 2.606985092163086, + "learning_rate": 1.6249771718318475e-06, + "loss": 0.7299, + "step": 10776 + }, + { + "epoch": 0.8154818205894594, + "grad_norm": 2.2531168460845947, + "learning_rate": 1.6236865332753782e-06, + "loss": 0.6822, + "step": 10777 + }, + { + "epoch": 0.815557489311793, + "grad_norm": 1.9697020053863525, + "learning_rate": 1.6223963566475195e-06, + "loss": 0.5334, + "step": 10778 + }, + { + "epoch": 0.8156331580341266, + "grad_norm": 2.3771908283233643, + "learning_rate": 1.6211066420290594e-06, + "loss": 0.6614, + "step": 10779 + }, + { + "epoch": 0.8157088267564602, + "grad_norm": 2.4156370162963867, + "learning_rate": 1.6198173895007665e-06, + "loss": 0.6298, + "step": 10780 + }, + { + "epoch": 0.8157844954787938, + "grad_norm": 4.180350303649902, + "learning_rate": 1.6185285991433812e-06, + "loss": 0.7348, + "step": 10781 + }, + { + "epoch": 0.8158601642011275, + "grad_norm": 2.094904899597168, + "learning_rate": 1.6172402710376108e-06, + "loss": 0.6079, + "step": 10782 + }, + { + "epoch": 0.8159358329234611, + "grad_norm": 2.2320287227630615, + "learning_rate": 1.6159524052641319e-06, + "loss": 0.7023, + "step": 10783 + }, + { + "epoch": 0.8160115016457947, + "grad_norm": 2.9694509506225586, + "learning_rate": 1.6146650019035967e-06, + "loss": 0.7397, + "step": 10784 + }, + { + "epoch": 0.8160871703681284, + "grad_norm": 5.22353982925415, + "learning_rate": 1.6133780610366253e-06, + "loss": 0.6958, + "step": 10785 + }, + { + "epoch": 0.816162839090462, + "grad_norm": 2.167146682739258, + "learning_rate": 1.6120915827438116e-06, + "loss": 0.8393, + "step": 10786 + }, + { + "epoch": 0.8162385078127956, + "grad_norm": 2.2224442958831787, + "learning_rate": 1.6108055671057176e-06, + "loss": 0.5897, + "step": 10787 + }, + { + "epoch": 0.8163141765351292, + "grad_norm": 2.2738425731658936, + "learning_rate": 1.6095200142028796e-06, + "loss": 0.6285, + "step": 10788 
+ }, + { + "epoch": 0.8163898452574628, + "grad_norm": 2.0885982513427734, + "learning_rate": 1.6082349241158033e-06, + "loss": 0.6971, + "step": 10789 + }, + { + "epoch": 0.8164655139797965, + "grad_norm": 1.7838362455368042, + "learning_rate": 1.6069502969249595e-06, + "loss": 0.6409, + "step": 10790 + }, + { + "epoch": 0.81654118270213, + "grad_norm": 2.1843435764312744, + "learning_rate": 1.6056661327108026e-06, + "loss": 0.7144, + "step": 10791 + }, + { + "epoch": 0.8166168514244637, + "grad_norm": 2.814628839492798, + "learning_rate": 1.6043824315537513e-06, + "loss": 0.5826, + "step": 10792 + }, + { + "epoch": 0.8166925201467973, + "grad_norm": 1.8284931182861328, + "learning_rate": 1.6030991935341905e-06, + "loss": 0.6479, + "step": 10793 + }, + { + "epoch": 0.8167681888691309, + "grad_norm": 2.4346306324005127, + "learning_rate": 1.6018164187324818e-06, + "loss": 0.7205, + "step": 10794 + }, + { + "epoch": 0.8168438575914646, + "grad_norm": 2.5444185733795166, + "learning_rate": 1.6005341072289578e-06, + "loss": 0.8004, + "step": 10795 + }, + { + "epoch": 0.8169195263137982, + "grad_norm": 3.5385169982910156, + "learning_rate": 1.5992522591039204e-06, + "loss": 0.7806, + "step": 10796 + }, + { + "epoch": 0.8169951950361318, + "grad_norm": 2.1103341579437256, + "learning_rate": 1.5979708744376443e-06, + "loss": 0.6149, + "step": 10797 + }, + { + "epoch": 0.8170708637584655, + "grad_norm": 2.047963857650757, + "learning_rate": 1.5966899533103725e-06, + "loss": 0.6581, + "step": 10798 + }, + { + "epoch": 0.817146532480799, + "grad_norm": 2.359651565551758, + "learning_rate": 1.5954094958023217e-06, + "loss": 0.6508, + "step": 10799 + }, + { + "epoch": 0.8172222012031327, + "grad_norm": 2.0635018348693848, + "learning_rate": 1.5941295019936786e-06, + "loss": 0.7374, + "step": 10800 + }, + { + "epoch": 0.8172978699254663, + "grad_norm": 1.9403828382492065, + "learning_rate": 1.5928499719645964e-06, + "loss": 0.6434, + "step": 10801 + }, + { + "epoch": 0.8173735386477999, + "grad_norm": 2.187650442123413, + "learning_rate": 1.5915709057952078e-06, + "loss": 0.7436, + "step": 10802 + }, + { + "epoch": 0.8174492073701336, + "grad_norm": 2.316969871520996, + "learning_rate": 1.5902923035656138e-06, + "loss": 0.6399, + "step": 10803 + }, + { + "epoch": 0.8175248760924672, + "grad_norm": 2.0138914585113525, + "learning_rate": 1.5890141653558796e-06, + "loss": 0.7807, + "step": 10804 + }, + { + "epoch": 0.8176005448148008, + "grad_norm": 2.207831382751465, + "learning_rate": 1.5877364912460476e-06, + "loss": 0.5119, + "step": 10805 + }, + { + "epoch": 0.8176762135371344, + "grad_norm": 2.2698066234588623, + "learning_rate": 1.586459281316131e-06, + "loss": 0.7117, + "step": 10806 + }, + { + "epoch": 0.817751882259468, + "grad_norm": 2.2364702224731445, + "learning_rate": 1.5851825356461133e-06, + "loss": 0.6974, + "step": 10807 + }, + { + "epoch": 0.8178275509818017, + "grad_norm": 1.6945065259933472, + "learning_rate": 1.583906254315947e-06, + "loss": 0.6608, + "step": 10808 + }, + { + "epoch": 0.8179032197041353, + "grad_norm": 1.855660319328308, + "learning_rate": 1.5826304374055573e-06, + "loss": 0.5783, + "step": 10809 + }, + { + "epoch": 0.8179788884264689, + "grad_norm": 2.5465874671936035, + "learning_rate": 1.5813550849948433e-06, + "loss": 0.6822, + "step": 10810 + }, + { + "epoch": 0.8180545571488026, + "grad_norm": 2.3344638347625732, + "learning_rate": 1.580080197163663e-06, + "loss": 0.6158, + "step": 10811 + }, + { + "epoch": 0.8181302258711362, + "grad_norm": 4.9126877784729, 
+ "learning_rate": 1.578805773991863e-06, + "loss": 0.6967, + "step": 10812 + }, + { + "epoch": 0.8182058945934698, + "grad_norm": 2.2319014072418213, + "learning_rate": 1.577531815559248e-06, + "loss": 0.6559, + "step": 10813 + }, + { + "epoch": 0.8182815633158034, + "grad_norm": 5.378940582275391, + "learning_rate": 1.5762583219456002e-06, + "loss": 0.762, + "step": 10814 + }, + { + "epoch": 0.818357232038137, + "grad_norm": 3.3257508277893066, + "learning_rate": 1.574985293230666e-06, + "loss": 0.7037, + "step": 10815 + }, + { + "epoch": 0.8184329007604707, + "grad_norm": 2.033773899078369, + "learning_rate": 1.5737127294941647e-06, + "loss": 0.6447, + "step": 10816 + }, + { + "epoch": 0.8185085694828043, + "grad_norm": 2.516923666000366, + "learning_rate": 1.5724406308157973e-06, + "loss": 0.546, + "step": 10817 + }, + { + "epoch": 0.8185842382051379, + "grad_norm": 1.9066401720046997, + "learning_rate": 1.5711689972752181e-06, + "loss": 0.5628, + "step": 10818 + }, + { + "epoch": 0.8186599069274715, + "grad_norm": 2.1507344245910645, + "learning_rate": 1.5698978289520646e-06, + "loss": 0.6484, + "step": 10819 + }, + { + "epoch": 0.8187355756498051, + "grad_norm": 2.1909475326538086, + "learning_rate": 1.568627125925941e-06, + "loss": 0.5734, + "step": 10820 + }, + { + "epoch": 0.8188112443721388, + "grad_norm": 2.0895121097564697, + "learning_rate": 1.5673568882764225e-06, + "loss": 0.5795, + "step": 10821 + }, + { + "epoch": 0.8188869130944724, + "grad_norm": 1.9548548460006714, + "learning_rate": 1.5660871160830558e-06, + "loss": 0.6567, + "step": 10822 + }, + { + "epoch": 0.818962581816806, + "grad_norm": 2.28955340385437, + "learning_rate": 1.564817809425358e-06, + "loss": 0.7006, + "step": 10823 + }, + { + "epoch": 0.8190382505391397, + "grad_norm": 2.538539409637451, + "learning_rate": 1.5635489683828196e-06, + "loss": 0.5051, + "step": 10824 + }, + { + "epoch": 0.8191139192614733, + "grad_norm": 2.679903984069824, + "learning_rate": 1.5622805930348953e-06, + "loss": 0.667, + "step": 10825 + }, + { + "epoch": 0.8191895879838069, + "grad_norm": 2.0182337760925293, + "learning_rate": 1.5610126834610141e-06, + "loss": 0.6008, + "step": 10826 + }, + { + "epoch": 0.8192652567061405, + "grad_norm": 1.927375316619873, + "learning_rate": 1.5597452397405818e-06, + "loss": 0.612, + "step": 10827 + }, + { + "epoch": 0.8193409254284741, + "grad_norm": 2.192244291305542, + "learning_rate": 1.5584782619529688e-06, + "loss": 0.6674, + "step": 10828 + }, + { + "epoch": 0.8194165941508078, + "grad_norm": 2.570380449295044, + "learning_rate": 1.5572117501775148e-06, + "loss": 0.4887, + "step": 10829 + }, + { + "epoch": 0.8194922628731414, + "grad_norm": 2.119783401489258, + "learning_rate": 1.555945704493533e-06, + "loss": 0.7094, + "step": 10830 + }, + { + "epoch": 0.819567931595475, + "grad_norm": 2.8816754817962646, + "learning_rate": 1.5546801249803083e-06, + "loss": 0.7619, + "step": 10831 + }, + { + "epoch": 0.8196436003178086, + "grad_norm": 1.987670660018921, + "learning_rate": 1.5534150117170953e-06, + "loss": 0.6066, + "step": 10832 + }, + { + "epoch": 0.8197192690401423, + "grad_norm": 2.288383722305298, + "learning_rate": 1.5521503647831193e-06, + "loss": 0.65, + "step": 10833 + }, + { + "epoch": 0.8197949377624759, + "grad_norm": 1.7874622344970703, + "learning_rate": 1.5508861842575773e-06, + "loss": 0.5712, + "step": 10834 + }, + { + "epoch": 0.8198706064848095, + "grad_norm": 2.939530372619629, + "learning_rate": 1.549622470219638e-06, + "loss": 0.6617, + "step": 10835 + }, + { + 
"epoch": 0.8199462752071431, + "grad_norm": 2.5253098011016846, + "learning_rate": 1.5483592227484347e-06, + "loss": 0.8274, + "step": 10836 + }, + { + "epoch": 0.8200219439294768, + "grad_norm": 2.590799570083618, + "learning_rate": 1.5470964419230754e-06, + "loss": 0.7798, + "step": 10837 + }, + { + "epoch": 0.8200976126518104, + "grad_norm": 2.3619234561920166, + "learning_rate": 1.5458341278226478e-06, + "loss": 0.6679, + "step": 10838 + }, + { + "epoch": 0.820173281374144, + "grad_norm": 2.0205016136169434, + "learning_rate": 1.544572280526195e-06, + "loss": 0.6351, + "step": 10839 + }, + { + "epoch": 0.8202489500964776, + "grad_norm": 2.110157012939453, + "learning_rate": 1.543310900112738e-06, + "loss": 0.6308, + "step": 10840 + }, + { + "epoch": 0.8203246188188112, + "grad_norm": 2.0330491065979004, + "learning_rate": 1.5420499866612723e-06, + "loss": 0.7454, + "step": 10841 + }, + { + "epoch": 0.8204002875411449, + "grad_norm": 2.687309980392456, + "learning_rate": 1.5407895402507574e-06, + "loss": 0.78, + "step": 10842 + }, + { + "epoch": 0.8204759562634785, + "grad_norm": 3.1039085388183594, + "learning_rate": 1.5395295609601274e-06, + "loss": 0.6781, + "step": 10843 + }, + { + "epoch": 0.8205516249858121, + "grad_norm": 2.10479474067688, + "learning_rate": 1.538270048868286e-06, + "loss": 0.6809, + "step": 10844 + }, + { + "epoch": 0.8206272937081457, + "grad_norm": 5.9827752113342285, + "learning_rate": 1.5370110040541093e-06, + "loss": 0.6575, + "step": 10845 + }, + { + "epoch": 0.8207029624304794, + "grad_norm": 2.722191572189331, + "learning_rate": 1.535752426596444e-06, + "loss": 0.6425, + "step": 10846 + }, + { + "epoch": 0.820778631152813, + "grad_norm": 2.3565070629119873, + "learning_rate": 1.534494316574099e-06, + "loss": 0.6546, + "step": 10847 + }, + { + "epoch": 0.8208542998751466, + "grad_norm": 2.351691961288452, + "learning_rate": 1.5332366740658685e-06, + "loss": 0.6076, + "step": 10848 + }, + { + "epoch": 0.8209299685974802, + "grad_norm": 3.5153005123138428, + "learning_rate": 1.5319794991505105e-06, + "loss": 0.6766, + "step": 10849 + }, + { + "epoch": 0.8210056373198139, + "grad_norm": 2.6627254486083984, + "learning_rate": 1.530722791906748e-06, + "loss": 0.6439, + "step": 10850 + }, + { + "epoch": 0.8210813060421475, + "grad_norm": 3.6057939529418945, + "learning_rate": 1.5294665524132828e-06, + "loss": 0.6652, + "step": 10851 + }, + { + "epoch": 0.8211569747644811, + "grad_norm": 1.9653656482696533, + "learning_rate": 1.5282107807487854e-06, + "loss": 0.7099, + "step": 10852 + }, + { + "epoch": 0.8212326434868147, + "grad_norm": 2.3477768898010254, + "learning_rate": 1.5269554769918955e-06, + "loss": 0.6548, + "step": 10853 + }, + { + "epoch": 0.8213083122091484, + "grad_norm": 2.1630735397338867, + "learning_rate": 1.5257006412212244e-06, + "loss": 0.6972, + "step": 10854 + }, + { + "epoch": 0.821383980931482, + "grad_norm": 1.8949837684631348, + "learning_rate": 1.524446273515353e-06, + "loss": 0.5324, + "step": 10855 + }, + { + "epoch": 0.8214596496538156, + "grad_norm": 2.782655954360962, + "learning_rate": 1.523192373952836e-06, + "loss": 0.6504, + "step": 10856 + }, + { + "epoch": 0.8215353183761492, + "grad_norm": 2.5171873569488525, + "learning_rate": 1.5219389426121952e-06, + "loss": 0.6931, + "step": 10857 + }, + { + "epoch": 0.8216109870984828, + "grad_norm": 2.055389404296875, + "learning_rate": 1.5206859795719249e-06, + "loss": 0.6662, + "step": 10858 + }, + { + "epoch": 0.8216866558208165, + "grad_norm": 2.1020753383636475, + 
"learning_rate": 1.5194334849104892e-06, + "loss": 0.7256, + "step": 10859 + }, + { + "epoch": 0.8217623245431501, + "grad_norm": 2.210233211517334, + "learning_rate": 1.5181814587063255e-06, + "loss": 0.7089, + "step": 10860 + }, + { + "epoch": 0.8218379932654837, + "grad_norm": 1.9540194272994995, + "learning_rate": 1.5169299010378372e-06, + "loss": 0.6508, + "step": 10861 + }, + { + "epoch": 0.8219136619878173, + "grad_norm": 2.1212716102600098, + "learning_rate": 1.5156788119833983e-06, + "loss": 0.6668, + "step": 10862 + }, + { + "epoch": 0.821989330710151, + "grad_norm": 2.4180808067321777, + "learning_rate": 1.5144281916213645e-06, + "loss": 0.6415, + "step": 10863 + }, + { + "epoch": 0.8220649994324846, + "grad_norm": 2.248098373413086, + "learning_rate": 1.5131780400300459e-06, + "loss": 0.6769, + "step": 10864 + }, + { + "epoch": 0.8221406681548182, + "grad_norm": 2.054067611694336, + "learning_rate": 1.5119283572877336e-06, + "loss": 0.8053, + "step": 10865 + }, + { + "epoch": 0.8222163368771518, + "grad_norm": 2.5258889198303223, + "learning_rate": 1.5106791434726876e-06, + "loss": 0.691, + "step": 10866 + }, + { + "epoch": 0.8222920055994855, + "grad_norm": 2.0589208602905273, + "learning_rate": 1.509430398663137e-06, + "loss": 0.6954, + "step": 10867 + }, + { + "epoch": 0.8223676743218191, + "grad_norm": 2.6181116104125977, + "learning_rate": 1.5081821229372813e-06, + "loss": 0.7106, + "step": 10868 + }, + { + "epoch": 0.8224433430441527, + "grad_norm": 2.312509059906006, + "learning_rate": 1.5069343163732939e-06, + "loss": 0.6206, + "step": 10869 + }, + { + "epoch": 0.8225190117664863, + "grad_norm": 2.3199472427368164, + "learning_rate": 1.5056869790493144e-06, + "loss": 0.7401, + "step": 10870 + }, + { + "epoch": 0.8225946804888199, + "grad_norm": 2.136983871459961, + "learning_rate": 1.5044401110434582e-06, + "loss": 0.6275, + "step": 10871 + }, + { + "epoch": 0.8226703492111536, + "grad_norm": 1.8843696117401123, + "learning_rate": 1.503193712433803e-06, + "loss": 0.6358, + "step": 10872 + }, + { + "epoch": 0.8227460179334872, + "grad_norm": 2.7073588371276855, + "learning_rate": 1.5019477832984042e-06, + "loss": 0.8258, + "step": 10873 + }, + { + "epoch": 0.8228216866558208, + "grad_norm": 2.6563751697540283, + "learning_rate": 1.5007023237152905e-06, + "loss": 0.5735, + "step": 10874 + }, + { + "epoch": 0.8228973553781544, + "grad_norm": 2.3090715408325195, + "learning_rate": 1.4994573337624505e-06, + "loss": 0.661, + "step": 10875 + }, + { + "epoch": 0.8229730241004881, + "grad_norm": 2.4490222930908203, + "learning_rate": 1.4982128135178528e-06, + "loss": 0.6891, + "step": 10876 + }, + { + "epoch": 0.8230486928228217, + "grad_norm": 2.2669460773468018, + "learning_rate": 1.496968763059431e-06, + "loss": 0.6218, + "step": 10877 + }, + { + "epoch": 0.8231243615451553, + "grad_norm": 2.8864951133728027, + "learning_rate": 1.4957251824650948e-06, + "loss": 0.474, + "step": 10878 + }, + { + "epoch": 0.8232000302674889, + "grad_norm": 2.1421849727630615, + "learning_rate": 1.4944820718127179e-06, + "loss": 0.6624, + "step": 10879 + }, + { + "epoch": 0.8232756989898226, + "grad_norm": 2.4659669399261475, + "learning_rate": 1.49323943118015e-06, + "loss": 0.6561, + "step": 10880 + }, + { + "epoch": 0.8233513677121562, + "grad_norm": 1.961225986480713, + "learning_rate": 1.4919972606452113e-06, + "loss": 0.6731, + "step": 10881 + }, + { + "epoch": 0.8234270364344898, + "grad_norm": 2.310224771499634, + "learning_rate": 1.4907555602856849e-06, + "loss": 0.7196, + "step": 10882 
+ }, + { + "epoch": 0.8235027051568234, + "grad_norm": 2.2171401977539062, + "learning_rate": 1.4895143301793321e-06, + "loss": 0.8323, + "step": 10883 + }, + { + "epoch": 0.823578373879157, + "grad_norm": 3.4674880504608154, + "learning_rate": 1.4882735704038853e-06, + "loss": 0.6263, + "step": 10884 + }, + { + "epoch": 0.8236540426014907, + "grad_norm": 2.4391887187957764, + "learning_rate": 1.4870332810370457e-06, + "loss": 0.7007, + "step": 10885 + }, + { + "epoch": 0.8237297113238243, + "grad_norm": 2.19388747215271, + "learning_rate": 1.48579346215648e-06, + "loss": 0.744, + "step": 10886 + }, + { + "epoch": 0.8238053800461579, + "grad_norm": 2.4731009006500244, + "learning_rate": 1.4845541138398313e-06, + "loss": 0.6056, + "step": 10887 + }, + { + "epoch": 0.8238810487684916, + "grad_norm": 1.8592519760131836, + "learning_rate": 1.4833152361647122e-06, + "loss": 0.642, + "step": 10888 + }, + { + "epoch": 0.8239567174908252, + "grad_norm": 1.9034373760223389, + "learning_rate": 1.4820768292087048e-06, + "loss": 0.7955, + "step": 10889 + }, + { + "epoch": 0.8240323862131588, + "grad_norm": 2.9963486194610596, + "learning_rate": 1.480838893049362e-06, + "loss": 0.7364, + "step": 10890 + }, + { + "epoch": 0.8241080549354924, + "grad_norm": 2.4194231033325195, + "learning_rate": 1.4796014277642077e-06, + "loss": 0.638, + "step": 10891 + }, + { + "epoch": 0.824183723657826, + "grad_norm": 2.1171884536743164, + "learning_rate": 1.4783644334307374e-06, + "loss": 0.7346, + "step": 10892 + }, + { + "epoch": 0.8242593923801597, + "grad_norm": 2.5294923782348633, + "learning_rate": 1.4771279101264106e-06, + "loss": 0.5958, + "step": 10893 + }, + { + "epoch": 0.8243350611024933, + "grad_norm": 2.560204029083252, + "learning_rate": 1.4758918579286686e-06, + "loss": 0.7642, + "step": 10894 + }, + { + "epoch": 0.8244107298248269, + "grad_norm": 2.2272257804870605, + "learning_rate": 1.4746562769149163e-06, + "loss": 0.6933, + "step": 10895 + }, + { + "epoch": 0.8244863985471605, + "grad_norm": 2.733365774154663, + "learning_rate": 1.473421167162525e-06, + "loss": 0.6733, + "step": 10896 + }, + { + "epoch": 0.8245620672694941, + "grad_norm": 2.285641670227051, + "learning_rate": 1.4721865287488448e-06, + "loss": 0.6461, + "step": 10897 + }, + { + "epoch": 0.8246377359918278, + "grad_norm": 2.4050447940826416, + "learning_rate": 1.4709523617511898e-06, + "loss": 0.6534, + "step": 10898 + }, + { + "epoch": 0.8247134047141614, + "grad_norm": 2.5170106887817383, + "learning_rate": 1.4697186662468542e-06, + "loss": 0.5798, + "step": 10899 + }, + { + "epoch": 0.824789073436495, + "grad_norm": 2.0948894023895264, + "learning_rate": 1.4684854423130891e-06, + "loss": 0.7217, + "step": 10900 + }, + { + "epoch": 0.8248647421588287, + "grad_norm": 2.6201298236846924, + "learning_rate": 1.467252690027126e-06, + "loss": 0.6472, + "step": 10901 + }, + { + "epoch": 0.8249404108811623, + "grad_norm": 2.4255621433258057, + "learning_rate": 1.466020409466163e-06, + "loss": 0.7014, + "step": 10902 + }, + { + "epoch": 0.8250160796034959, + "grad_norm": 2.1704020500183105, + "learning_rate": 1.4647886007073692e-06, + "loss": 0.6657, + "step": 10903 + }, + { + "epoch": 0.8250917483258295, + "grad_norm": 2.4163525104522705, + "learning_rate": 1.463557263827886e-06, + "loss": 0.7631, + "step": 10904 + }, + { + "epoch": 0.8251674170481631, + "grad_norm": 2.173043966293335, + "learning_rate": 1.4623263989048226e-06, + "loss": 0.7403, + "step": 10905 + }, + { + "epoch": 0.8252430857704968, + "grad_norm": 2.1846542358398438, + 
"learning_rate": 1.4610960060152616e-06, + "loss": 0.7112, + "step": 10906 + }, + { + "epoch": 0.8253187544928304, + "grad_norm": 2.4031577110290527, + "learning_rate": 1.4598660852362505e-06, + "loss": 0.6655, + "step": 10907 + }, + { + "epoch": 0.825394423215164, + "grad_norm": 3.223851442337036, + "learning_rate": 1.4586366366448113e-06, + "loss": 0.6503, + "step": 10908 + }, + { + "epoch": 0.8254700919374977, + "grad_norm": 2.421539545059204, + "learning_rate": 1.4574076603179413e-06, + "loss": 0.6976, + "step": 10909 + }, + { + "epoch": 0.8255457606598312, + "grad_norm": 3.1366801261901855, + "learning_rate": 1.4561791563325965e-06, + "loss": 0.62, + "step": 10910 + }, + { + "epoch": 0.8256214293821649, + "grad_norm": 2.170020818710327, + "learning_rate": 1.454951124765714e-06, + "loss": 0.66, + "step": 10911 + }, + { + "epoch": 0.8256970981044985, + "grad_norm": 2.331679344177246, + "learning_rate": 1.4537235656941952e-06, + "loss": 0.6694, + "step": 10912 + }, + { + "epoch": 0.8257727668268321, + "grad_norm": 1.9958034753799438, + "learning_rate": 1.4524964791949157e-06, + "loss": 0.6669, + "step": 10913 + }, + { + "epoch": 0.8258484355491658, + "grad_norm": 1.878063440322876, + "learning_rate": 1.4512698653447153e-06, + "loss": 0.6129, + "step": 10914 + }, + { + "epoch": 0.8259241042714994, + "grad_norm": 2.260777711868286, + "learning_rate": 1.450043724220413e-06, + "loss": 0.6623, + "step": 10915 + }, + { + "epoch": 0.825999772993833, + "grad_norm": 2.2355360984802246, + "learning_rate": 1.4488180558987921e-06, + "loss": 0.5674, + "step": 10916 + }, + { + "epoch": 0.8260754417161666, + "grad_norm": 2.1847424507141113, + "learning_rate": 1.4475928604566107e-06, + "loss": 0.7828, + "step": 10917 + }, + { + "epoch": 0.8261511104385002, + "grad_norm": 2.2542574405670166, + "learning_rate": 1.4463681379705883e-06, + "loss": 0.804, + "step": 10918 + }, + { + "epoch": 0.8262267791608339, + "grad_norm": 1.649489402770996, + "learning_rate": 1.4451438885174242e-06, + "loss": 0.7146, + "step": 10919 + }, + { + "epoch": 0.8263024478831675, + "grad_norm": 2.6824514865875244, + "learning_rate": 1.4439201121737882e-06, + "loss": 0.6701, + "step": 10920 + }, + { + "epoch": 0.8263781166055011, + "grad_norm": 3.05550217628479, + "learning_rate": 1.4426968090163127e-06, + "loss": 0.4885, + "step": 10921 + }, + { + "epoch": 0.8264537853278348, + "grad_norm": 2.190661907196045, + "learning_rate": 1.4414739791216062e-06, + "loss": 0.5637, + "step": 10922 + }, + { + "epoch": 0.8265294540501683, + "grad_norm": 2.1329610347747803, + "learning_rate": 1.4402516225662454e-06, + "loss": 0.6271, + "step": 10923 + }, + { + "epoch": 0.826605122772502, + "grad_norm": 2.2187185287475586, + "learning_rate": 1.43902973942678e-06, + "loss": 0.7154, + "step": 10924 + }, + { + "epoch": 0.8266807914948356, + "grad_norm": 2.099266529083252, + "learning_rate": 1.4378083297797278e-06, + "loss": 0.5802, + "step": 10925 + }, + { + "epoch": 0.8267564602171692, + "grad_norm": 2.433722734451294, + "learning_rate": 1.4365873937015758e-06, + "loss": 0.7381, + "step": 10926 + }, + { + "epoch": 0.8268321289395029, + "grad_norm": 2.2790136337280273, + "learning_rate": 1.4353669312687878e-06, + "loss": 0.6474, + "step": 10927 + }, + { + "epoch": 0.8269077976618365, + "grad_norm": 2.4600353240966797, + "learning_rate": 1.4341469425577866e-06, + "loss": 0.7024, + "step": 10928 + }, + { + "epoch": 0.8269834663841701, + "grad_norm": 2.4430034160614014, + "learning_rate": 1.432927427644973e-06, + "loss": 0.6797, + "step": 10929 + }, + { 
+ "epoch": 0.8270591351065038, + "grad_norm": 6.050069332122803, + "learning_rate": 1.431708386606721e-06, + "loss": 0.6611, + "step": 10930 + }, + { + "epoch": 0.8271348038288373, + "grad_norm": 2.662050485610962, + "learning_rate": 1.4304898195193705e-06, + "loss": 0.6946, + "step": 10931 + }, + { + "epoch": 0.827210472551171, + "grad_norm": 2.608130693435669, + "learning_rate": 1.4292717264592286e-06, + "loss": 0.7405, + "step": 10932 + }, + { + "epoch": 0.8272861412735046, + "grad_norm": 2.265187978744507, + "learning_rate": 1.428054107502577e-06, + "loss": 0.7296, + "step": 10933 + }, + { + "epoch": 0.8273618099958382, + "grad_norm": 1.9278501272201538, + "learning_rate": 1.426836962725669e-06, + "loss": 0.594, + "step": 10934 + }, + { + "epoch": 0.8274374787181719, + "grad_norm": 2.370166540145874, + "learning_rate": 1.4256202922047243e-06, + "loss": 0.5605, + "step": 10935 + }, + { + "epoch": 0.8275131474405054, + "grad_norm": 2.0922703742980957, + "learning_rate": 1.4244040960159356e-06, + "loss": 0.5532, + "step": 10936 + }, + { + "epoch": 0.8275888161628391, + "grad_norm": 2.2597086429595947, + "learning_rate": 1.423188374235464e-06, + "loss": 0.609, + "step": 10937 + }, + { + "epoch": 0.8276644848851727, + "grad_norm": 2.991779088973999, + "learning_rate": 1.4219731269394455e-06, + "loss": 0.6567, + "step": 10938 + }, + { + "epoch": 0.8277401536075063, + "grad_norm": 1.8932214975357056, + "learning_rate": 1.4207583542039767e-06, + "loss": 0.5854, + "step": 10939 + }, + { + "epoch": 0.82781582232984, + "grad_norm": 2.025179147720337, + "learning_rate": 1.4195440561051349e-06, + "loss": 0.7445, + "step": 10940 + }, + { + "epoch": 0.8278914910521736, + "grad_norm": 2.18306827545166, + "learning_rate": 1.4183302327189654e-06, + "loss": 0.7864, + "step": 10941 + }, + { + "epoch": 0.8279671597745072, + "grad_norm": 2.069181203842163, + "learning_rate": 1.4171168841214762e-06, + "loss": 0.6587, + "step": 10942 + }, + { + "epoch": 0.8280428284968409, + "grad_norm": 1.9649893045425415, + "learning_rate": 1.4159040103886545e-06, + "loss": 0.6386, + "step": 10943 + }, + { + "epoch": 0.8281184972191744, + "grad_norm": 2.0835180282592773, + "learning_rate": 1.4146916115964507e-06, + "loss": 0.5745, + "step": 10944 + }, + { + "epoch": 0.8281941659415081, + "grad_norm": 1.8389742374420166, + "learning_rate": 1.413479687820796e-06, + "loss": 0.6632, + "step": 10945 + }, + { + "epoch": 0.8282698346638417, + "grad_norm": 2.533998489379883, + "learning_rate": 1.4122682391375796e-06, + "loss": 0.6618, + "step": 10946 + }, + { + "epoch": 0.8283455033861753, + "grad_norm": 6.8104963302612305, + "learning_rate": 1.411057265622668e-06, + "loss": 0.7536, + "step": 10947 + }, + { + "epoch": 0.828421172108509, + "grad_norm": 2.065166473388672, + "learning_rate": 1.4098467673518954e-06, + "loss": 0.5601, + "step": 10948 + }, + { + "epoch": 0.8284968408308425, + "grad_norm": 2.1348156929016113, + "learning_rate": 1.4086367444010704e-06, + "loss": 0.7066, + "step": 10949 + }, + { + "epoch": 0.8285725095531762, + "grad_norm": 2.0122158527374268, + "learning_rate": 1.4074271968459609e-06, + "loss": 0.6169, + "step": 10950 + }, + { + "epoch": 0.8286481782755099, + "grad_norm": 2.3869214057922363, + "learning_rate": 1.4062181247623206e-06, + "loss": 0.651, + "step": 10951 + }, + { + "epoch": 0.8287238469978434, + "grad_norm": 2.8264942169189453, + "learning_rate": 1.4050095282258642e-06, + "loss": 0.6669, + "step": 10952 + }, + { + "epoch": 0.8287995157201771, + "grad_norm": 2.2256650924682617, + 
"learning_rate": 1.4038014073122747e-06, + "loss": 0.6945, + "step": 10953 + }, + { + "epoch": 0.8288751844425107, + "grad_norm": 2.1724610328674316, + "learning_rate": 1.40259376209721e-06, + "loss": 0.618, + "step": 10954 + }, + { + "epoch": 0.8289508531648443, + "grad_norm": 2.1436386108398438, + "learning_rate": 1.401386592656297e-06, + "loss": 0.6178, + "step": 10955 + }, + { + "epoch": 0.829026521887178, + "grad_norm": 1.8904942274093628, + "learning_rate": 1.4001798990651317e-06, + "loss": 0.565, + "step": 10956 + }, + { + "epoch": 0.8291021906095115, + "grad_norm": 1.9519450664520264, + "learning_rate": 1.3989736813992826e-06, + "loss": 0.6762, + "step": 10957 + }, + { + "epoch": 0.8291778593318452, + "grad_norm": 2.116001605987549, + "learning_rate": 1.3977679397342863e-06, + "loss": 0.7516, + "step": 10958 + }, + { + "epoch": 0.8292535280541788, + "grad_norm": 2.4981284141540527, + "learning_rate": 1.3965626741456495e-06, + "loss": 0.6231, + "step": 10959 + }, + { + "epoch": 0.8293291967765124, + "grad_norm": 2.15065860748291, + "learning_rate": 1.3953578847088513e-06, + "loss": 0.5323, + "step": 10960 + }, + { + "epoch": 0.8294048654988461, + "grad_norm": 2.2730371952056885, + "learning_rate": 1.394153571499339e-06, + "loss": 0.7206, + "step": 10961 + }, + { + "epoch": 0.8294805342211796, + "grad_norm": 2.4419403076171875, + "learning_rate": 1.3929497345925299e-06, + "loss": 0.667, + "step": 10962 + }, + { + "epoch": 0.8295562029435133, + "grad_norm": 2.3964638710021973, + "learning_rate": 1.3917463740638146e-06, + "loss": 0.7303, + "step": 10963 + }, + { + "epoch": 0.829631871665847, + "grad_norm": 2.3041999340057373, + "learning_rate": 1.3905434899885471e-06, + "loss": 0.657, + "step": 10964 + }, + { + "epoch": 0.8297075403881805, + "grad_norm": 2.5561516284942627, + "learning_rate": 1.389341082442057e-06, + "loss": 0.6983, + "step": 10965 + }, + { + "epoch": 0.8297832091105142, + "grad_norm": 1.8878254890441895, + "learning_rate": 1.3881391514996473e-06, + "loss": 0.6237, + "step": 10966 + }, + { + "epoch": 0.8298588778328478, + "grad_norm": 3.115190029144287, + "learning_rate": 1.3869376972365825e-06, + "loss": 0.798, + "step": 10967 + }, + { + "epoch": 0.8299345465551814, + "grad_norm": 2.7332189083099365, + "learning_rate": 1.3857367197281024e-06, + "loss": 0.6883, + "step": 10968 + }, + { + "epoch": 0.8300102152775151, + "grad_norm": 2.638230562210083, + "learning_rate": 1.3845362190494161e-06, + "loss": 0.5863, + "step": 10969 + }, + { + "epoch": 0.8300858839998486, + "grad_norm": 2.4270436763763428, + "learning_rate": 1.3833361952757031e-06, + "loss": 0.6456, + "step": 10970 + }, + { + "epoch": 0.8301615527221823, + "grad_norm": 1.9929462671279907, + "learning_rate": 1.3821366484821138e-06, + "loss": 0.6827, + "step": 10971 + }, + { + "epoch": 0.830237221444516, + "grad_norm": 4.07589864730835, + "learning_rate": 1.3809375787437656e-06, + "loss": 0.6311, + "step": 10972 + }, + { + "epoch": 0.8303128901668495, + "grad_norm": 2.5167202949523926, + "learning_rate": 1.3797389861357507e-06, + "loss": 0.7506, + "step": 10973 + }, + { + "epoch": 0.8303885588891832, + "grad_norm": 1.9688563346862793, + "learning_rate": 1.378540870733128e-06, + "loss": 0.5853, + "step": 10974 + }, + { + "epoch": 0.8304642276115167, + "grad_norm": 2.244810104370117, + "learning_rate": 1.3773432326109234e-06, + "loss": 0.6253, + "step": 10975 + }, + { + "epoch": 0.8305398963338504, + "grad_norm": 1.8359615802764893, + "learning_rate": 1.376146071844142e-06, + "loss": 0.7278, + "step": 10976 + }, + 
{ + "epoch": 0.8306155650561841, + "grad_norm": 2.00067138671875, + "learning_rate": 1.374949388507754e-06, + "loss": 0.6246, + "step": 10977 + }, + { + "epoch": 0.8306912337785176, + "grad_norm": 2.020059108734131, + "learning_rate": 1.3737531826766962e-06, + "loss": 0.6065, + "step": 10978 + }, + { + "epoch": 0.8307669025008513, + "grad_norm": 2.200312614440918, + "learning_rate": 1.3725574544258797e-06, + "loss": 0.7528, + "step": 10979 + }, + { + "epoch": 0.830842571223185, + "grad_norm": 2.1670212745666504, + "learning_rate": 1.3713622038301856e-06, + "loss": 0.6273, + "step": 10980 + }, + { + "epoch": 0.8309182399455185, + "grad_norm": 2.5770716667175293, + "learning_rate": 1.3701674309644652e-06, + "loss": 0.6216, + "step": 10981 + }, + { + "epoch": 0.8309939086678522, + "grad_norm": 3.1510024070739746, + "learning_rate": 1.3689731359035375e-06, + "loss": 0.6588, + "step": 10982 + }, + { + "epoch": 0.8310695773901857, + "grad_norm": 3.46155047416687, + "learning_rate": 1.3677793187221936e-06, + "loss": 0.5913, + "step": 10983 + }, + { + "epoch": 0.8311452461125194, + "grad_norm": 1.877045750617981, + "learning_rate": 1.3665859794951969e-06, + "loss": 0.5908, + "step": 10984 + }, + { + "epoch": 0.831220914834853, + "grad_norm": 2.1516778469085693, + "learning_rate": 1.3653931182972716e-06, + "loss": 0.675, + "step": 10985 + }, + { + "epoch": 0.8312965835571866, + "grad_norm": 2.5128893852233887, + "learning_rate": 1.3642007352031238e-06, + "loss": 0.7084, + "step": 10986 + }, + { + "epoch": 0.8313722522795203, + "grad_norm": 2.5535311698913574, + "learning_rate": 1.3630088302874237e-06, + "loss": 0.7207, + "step": 10987 + }, + { + "epoch": 0.8314479210018538, + "grad_norm": 2.579092502593994, + "learning_rate": 1.3618174036248138e-06, + "loss": 0.6827, + "step": 10988 + }, + { + "epoch": 0.8315235897241875, + "grad_norm": 2.279123306274414, + "learning_rate": 1.3606264552899005e-06, + "loss": 0.7558, + "step": 10989 + }, + { + "epoch": 0.8315992584465212, + "grad_norm": 2.163329839706421, + "learning_rate": 1.359435985357268e-06, + "loss": 0.6043, + "step": 10990 + }, + { + "epoch": 0.8316749271688547, + "grad_norm": 2.061577320098877, + "learning_rate": 1.3582459939014655e-06, + "loss": 0.7602, + "step": 10991 + }, + { + "epoch": 0.8317505958911884, + "grad_norm": 2.2457826137542725, + "learning_rate": 1.3570564809970164e-06, + "loss": 0.6957, + "step": 10992 + }, + { + "epoch": 0.831826264613522, + "grad_norm": 2.5345616340637207, + "learning_rate": 1.3558674467184096e-06, + "loss": 0.6992, + "step": 10993 + }, + { + "epoch": 0.8319019333358556, + "grad_norm": 1.9230178594589233, + "learning_rate": 1.354678891140108e-06, + "loss": 0.7109, + "step": 10994 + }, + { + "epoch": 0.8319776020581893, + "grad_norm": 1.9315879344940186, + "learning_rate": 1.3534908143365452e-06, + "loss": 0.6629, + "step": 10995 + }, + { + "epoch": 0.8320532707805228, + "grad_norm": 2.1779463291168213, + "learning_rate": 1.352303216382114e-06, + "loss": 0.7043, + "step": 10996 + }, + { + "epoch": 0.8321289395028565, + "grad_norm": 1.9600178003311157, + "learning_rate": 1.3511160973511935e-06, + "loss": 0.7524, + "step": 10997 + }, + { + "epoch": 0.8322046082251902, + "grad_norm": 4.411078929901123, + "learning_rate": 1.3499294573181253e-06, + "loss": 0.749, + "step": 10998 + }, + { + "epoch": 0.8322802769475237, + "grad_norm": 3.7787492275238037, + "learning_rate": 1.3487432963572152e-06, + "loss": 0.5859, + "step": 10999 + }, + { + "epoch": 0.8323559456698574, + "grad_norm": 2.2628886699676514, + 
"learning_rate": 1.3475576145427465e-06, + "loss": 0.7292, + "step": 11000 + }, + { + "epoch": 0.8324316143921909, + "grad_norm": 2.6907804012298584, + "learning_rate": 1.346372411948969e-06, + "loss": 0.6431, + "step": 11001 + }, + { + "epoch": 0.8325072831145246, + "grad_norm": 3.0805437564849854, + "learning_rate": 1.3451876886501101e-06, + "loss": 0.5905, + "step": 11002 + }, + { + "epoch": 0.8325829518368583, + "grad_norm": 2.1811044216156006, + "learning_rate": 1.344003444720356e-06, + "loss": 0.593, + "step": 11003 + }, + { + "epoch": 0.8326586205591918, + "grad_norm": 2.125622272491455, + "learning_rate": 1.3428196802338676e-06, + "loss": 0.5257, + "step": 11004 + }, + { + "epoch": 0.8327342892815255, + "grad_norm": 3.8492448329925537, + "learning_rate": 1.3416363952647772e-06, + "loss": 0.6362, + "step": 11005 + }, + { + "epoch": 0.8328099580038592, + "grad_norm": 2.9568638801574707, + "learning_rate": 1.340453589887185e-06, + "loss": 0.655, + "step": 11006 + }, + { + "epoch": 0.8328856267261927, + "grad_norm": 2.092532157897949, + "learning_rate": 1.3392712641751645e-06, + "loss": 0.6265, + "step": 11007 + }, + { + "epoch": 0.8329612954485264, + "grad_norm": 2.0288496017456055, + "learning_rate": 1.3380894182027548e-06, + "loss": 0.5884, + "step": 11008 + }, + { + "epoch": 0.8330369641708599, + "grad_norm": 2.5260086059570312, + "learning_rate": 1.336908052043969e-06, + "loss": 0.6618, + "step": 11009 + }, + { + "epoch": 0.8331126328931936, + "grad_norm": 2.303161382675171, + "learning_rate": 1.3357271657727847e-06, + "loss": 0.6813, + "step": 11010 + }, + { + "epoch": 0.8331883016155273, + "grad_norm": 2.152733087539673, + "learning_rate": 1.334546759463152e-06, + "loss": 0.5689, + "step": 11011 + }, + { + "epoch": 0.8332639703378608, + "grad_norm": 2.2623064517974854, + "learning_rate": 1.3333668331889998e-06, + "loss": 0.6485, + "step": 11012 + }, + { + "epoch": 0.8333396390601945, + "grad_norm": 2.2851810455322266, + "learning_rate": 1.3321873870242097e-06, + "loss": 0.6046, + "step": 11013 + }, + { + "epoch": 0.8334153077825281, + "grad_norm": 1.8821451663970947, + "learning_rate": 1.3310084210426468e-06, + "loss": 0.8392, + "step": 11014 + }, + { + "epoch": 0.8334909765048617, + "grad_norm": 2.2863385677337646, + "learning_rate": 1.3298299353181411e-06, + "loss": 0.6131, + "step": 11015 + }, + { + "epoch": 0.8335666452271954, + "grad_norm": 2.0595953464508057, + "learning_rate": 1.3286519299244936e-06, + "loss": 0.7144, + "step": 11016 + }, + { + "epoch": 0.8336423139495289, + "grad_norm": 1.9019546508789062, + "learning_rate": 1.3274744049354739e-06, + "loss": 0.574, + "step": 11017 + }, + { + "epoch": 0.8337179826718626, + "grad_norm": 2.477344036102295, + "learning_rate": 1.3262973604248235e-06, + "loss": 0.6929, + "step": 11018 + }, + { + "epoch": 0.8337936513941963, + "grad_norm": 2.8345818519592285, + "learning_rate": 1.325120796466251e-06, + "loss": 0.6346, + "step": 11019 + }, + { + "epoch": 0.8338693201165298, + "grad_norm": 2.6509807109832764, + "learning_rate": 1.323944713133441e-06, + "loss": 0.6845, + "step": 11020 + }, + { + "epoch": 0.8339449888388635, + "grad_norm": 2.0513241291046143, + "learning_rate": 1.322769110500036e-06, + "loss": 0.5085, + "step": 11021 + }, + { + "epoch": 0.834020657561197, + "grad_norm": 2.247509002685547, + "learning_rate": 1.3215939886396625e-06, + "loss": 0.7244, + "step": 11022 + }, + { + "epoch": 0.8340963262835307, + "grad_norm": 2.2295756340026855, + "learning_rate": 1.3204193476259096e-06, + "loss": 0.6197, + "step": 11023 + 
}, + { + "epoch": 0.8341719950058644, + "grad_norm": 2.411051034927368, + "learning_rate": 1.3192451875323353e-06, + "loss": 0.6679, + "step": 11024 + }, + { + "epoch": 0.8342476637281979, + "grad_norm": 2.1455461978912354, + "learning_rate": 1.3180715084324689e-06, + "loss": 0.6545, + "step": 11025 + }, + { + "epoch": 0.8343233324505316, + "grad_norm": 2.34451961517334, + "learning_rate": 1.3168983103998115e-06, + "loss": 0.6976, + "step": 11026 + }, + { + "epoch": 0.8343990011728653, + "grad_norm": 2.02919602394104, + "learning_rate": 1.3157255935078313e-06, + "loss": 0.5394, + "step": 11027 + }, + { + "epoch": 0.8344746698951988, + "grad_norm": 2.013793468475342, + "learning_rate": 1.3145533578299699e-06, + "loss": 0.8555, + "step": 11028 + }, + { + "epoch": 0.8345503386175325, + "grad_norm": 2.240281581878662, + "learning_rate": 1.3133816034396343e-06, + "loss": 0.6576, + "step": 11029 + }, + { + "epoch": 0.834626007339866, + "grad_norm": 1.8225212097167969, + "learning_rate": 1.3122103304102057e-06, + "loss": 0.7601, + "step": 11030 + }, + { + "epoch": 0.8347016760621997, + "grad_norm": 2.2404656410217285, + "learning_rate": 1.3110395388150296e-06, + "loss": 0.6523, + "step": 11031 + }, + { + "epoch": 0.8347773447845334, + "grad_norm": 1.9908554553985596, + "learning_rate": 1.3098692287274252e-06, + "loss": 0.5598, + "step": 11032 + }, + { + "epoch": 0.8348530135068669, + "grad_norm": 2.0587451457977295, + "learning_rate": 1.3086994002206843e-06, + "loss": 0.5893, + "step": 11033 + }, + { + "epoch": 0.8349286822292006, + "grad_norm": 2.665961503982544, + "learning_rate": 1.3075300533680657e-06, + "loss": 0.7025, + "step": 11034 + }, + { + "epoch": 0.8350043509515341, + "grad_norm": 2.2750911712646484, + "learning_rate": 1.3063611882427943e-06, + "loss": 0.8163, + "step": 11035 + }, + { + "epoch": 0.8350800196738678, + "grad_norm": 2.3275375366210938, + "learning_rate": 1.3051928049180683e-06, + "loss": 0.5284, + "step": 11036 + }, + { + "epoch": 0.8351556883962015, + "grad_norm": 1.4327739477157593, + "learning_rate": 1.304024903467057e-06, + "loss": 0.7461, + "step": 11037 + }, + { + "epoch": 0.835231357118535, + "grad_norm": 3.3276166915893555, + "learning_rate": 1.3028574839628995e-06, + "loss": 0.8109, + "step": 11038 + }, + { + "epoch": 0.8353070258408687, + "grad_norm": 1.9185841083526611, + "learning_rate": 1.3016905464787009e-06, + "loss": 0.6655, + "step": 11039 + }, + { + "epoch": 0.8353826945632024, + "grad_norm": 1.899623990058899, + "learning_rate": 1.3005240910875395e-06, + "loss": 0.6141, + "step": 11040 + }, + { + "epoch": 0.8354583632855359, + "grad_norm": 2.365811586380005, + "learning_rate": 1.2993581178624644e-06, + "loss": 0.6686, + "step": 11041 + }, + { + "epoch": 0.8355340320078696, + "grad_norm": 2.494126558303833, + "learning_rate": 1.298192626876488e-06, + "loss": 0.6618, + "step": 11042 + }, + { + "epoch": 0.8356097007302031, + "grad_norm": 2.977726459503174, + "learning_rate": 1.2970276182026006e-06, + "loss": 0.6845, + "step": 11043 + }, + { + "epoch": 0.8356853694525368, + "grad_norm": 2.094541549682617, + "learning_rate": 1.2958630919137614e-06, + "loss": 0.5355, + "step": 11044 + }, + { + "epoch": 0.8357610381748705, + "grad_norm": 1.4946520328521729, + "learning_rate": 1.2946990480828904e-06, + "loss": 0.9194, + "step": 11045 + }, + { + "epoch": 0.835836706897204, + "grad_norm": 2.1046907901763916, + "learning_rate": 1.293535486782888e-06, + "loss": 0.7676, + "step": 11046 + }, + { + "epoch": 0.8359123756195377, + "grad_norm": 2.2517387866973877, + 
"learning_rate": 1.2923724080866165e-06, + "loss": 0.5016, + "step": 11047 + }, + { + "epoch": 0.8359880443418712, + "grad_norm": 3.1959762573242188, + "learning_rate": 1.2912098120669186e-06, + "loss": 0.633, + "step": 11048 + }, + { + "epoch": 0.8360637130642049, + "grad_norm": 2.66766095161438, + "learning_rate": 1.2900476987965934e-06, + "loss": 0.6543, + "step": 11049 + }, + { + "epoch": 0.8361393817865386, + "grad_norm": 2.5261588096618652, + "learning_rate": 1.2888860683484182e-06, + "loss": 0.6519, + "step": 11050 + }, + { + "epoch": 0.8362150505088721, + "grad_norm": 2.3242642879486084, + "learning_rate": 1.2877249207951384e-06, + "loss": 0.6523, + "step": 11051 + }, + { + "epoch": 0.8362907192312058, + "grad_norm": 2.110452890396118, + "learning_rate": 1.2865642562094692e-06, + "loss": 0.658, + "step": 11052 + }, + { + "epoch": 0.8363663879535395, + "grad_norm": 2.2709860801696777, + "learning_rate": 1.285404074664094e-06, + "loss": 0.796, + "step": 11053 + }, + { + "epoch": 0.836442056675873, + "grad_norm": 1.7346996068954468, + "learning_rate": 1.284244376231667e-06, + "loss": 0.5757, + "step": 11054 + }, + { + "epoch": 0.8365177253982067, + "grad_norm": 2.3549299240112305, + "learning_rate": 1.283085160984816e-06, + "loss": 0.8578, + "step": 11055 + }, + { + "epoch": 0.8365933941205402, + "grad_norm": 2.3999102115631104, + "learning_rate": 1.2819264289961293e-06, + "loss": 0.5272, + "step": 11056 + }, + { + "epoch": 0.8366690628428739, + "grad_norm": 2.2719616889953613, + "learning_rate": 1.2807681803381701e-06, + "loss": 0.7264, + "step": 11057 + }, + { + "epoch": 0.8367447315652076, + "grad_norm": 2.6402430534362793, + "learning_rate": 1.2796104150834793e-06, + "loss": 0.6027, + "step": 11058 + }, + { + "epoch": 0.8368204002875411, + "grad_norm": 2.883604049682617, + "learning_rate": 1.2784531333045529e-06, + "loss": 0.8013, + "step": 11059 + }, + { + "epoch": 0.8368960690098748, + "grad_norm": 1.8437893390655518, + "learning_rate": 1.277296335073866e-06, + "loss": 0.7198, + "step": 11060 + }, + { + "epoch": 0.8369717377322083, + "grad_norm": 2.287963628768921, + "learning_rate": 1.2761400204638605e-06, + "loss": 0.7142, + "step": 11061 + }, + { + "epoch": 0.837047406454542, + "grad_norm": 2.6593968868255615, + "learning_rate": 1.2749841895469497e-06, + "loss": 0.5975, + "step": 11062 + }, + { + "epoch": 0.8371230751768757, + "grad_norm": 2.181272268295288, + "learning_rate": 1.2738288423955146e-06, + "loss": 0.6539, + "step": 11063 + }, + { + "epoch": 0.8371987438992092, + "grad_norm": 2.1479294300079346, + "learning_rate": 1.2726739790819062e-06, + "loss": 0.6179, + "step": 11064 + }, + { + "epoch": 0.8372744126215429, + "grad_norm": 2.6610589027404785, + "learning_rate": 1.2715195996784468e-06, + "loss": 0.696, + "step": 11065 + }, + { + "epoch": 0.8373500813438766, + "grad_norm": 2.6823954582214355, + "learning_rate": 1.2703657042574284e-06, + "loss": 0.6255, + "step": 11066 + }, + { + "epoch": 0.8374257500662101, + "grad_norm": 2.4057936668395996, + "learning_rate": 1.2692122928911085e-06, + "loss": 0.5914, + "step": 11067 + }, + { + "epoch": 0.8375014187885438, + "grad_norm": 1.9947429895401, + "learning_rate": 1.268059365651718e-06, + "loss": 0.6166, + "step": 11068 + }, + { + "epoch": 0.8375770875108773, + "grad_norm": 2.460395574569702, + "learning_rate": 1.2669069226114614e-06, + "loss": 0.6943, + "step": 11069 + }, + { + "epoch": 0.837652756233211, + "grad_norm": 2.2644882202148438, + "learning_rate": 1.2657549638425028e-06, + "loss": 0.6663, + "step": 11070 + }, 
+ { + "epoch": 0.8377284249555447, + "grad_norm": 2.3289754390716553, + "learning_rate": 1.2646034894169848e-06, + "loss": 0.6335, + "step": 11071 + }, + { + "epoch": 0.8378040936778782, + "grad_norm": 2.1823723316192627, + "learning_rate": 1.2634524994070152e-06, + "loss": 0.7363, + "step": 11072 + }, + { + "epoch": 0.8378797624002119, + "grad_norm": 2.479443311691284, + "learning_rate": 1.2623019938846735e-06, + "loss": 0.705, + "step": 11073 + }, + { + "epoch": 0.8379554311225454, + "grad_norm": 2.303396701812744, + "learning_rate": 1.2611519729220074e-06, + "loss": 0.8203, + "step": 11074 + }, + { + "epoch": 0.8380310998448791, + "grad_norm": 2.0928168296813965, + "learning_rate": 1.2600024365910352e-06, + "loss": 0.6169, + "step": 11075 + }, + { + "epoch": 0.8381067685672128, + "grad_norm": 2.577695608139038, + "learning_rate": 1.258853384963745e-06, + "loss": 0.6813, + "step": 11076 + }, + { + "epoch": 0.8381824372895463, + "grad_norm": 1.9073697328567505, + "learning_rate": 1.2577048181120954e-06, + "loss": 0.7995, + "step": 11077 + }, + { + "epoch": 0.83825810601188, + "grad_norm": 2.2130842208862305, + "learning_rate": 1.256556736108007e-06, + "loss": 0.6344, + "step": 11078 + }, + { + "epoch": 0.8383337747342137, + "grad_norm": 2.398723840713501, + "learning_rate": 1.2554091390233841e-06, + "loss": 0.7178, + "step": 11079 + }, + { + "epoch": 0.8384094434565472, + "grad_norm": 2.5538170337677, + "learning_rate": 1.2542620269300912e-06, + "loss": 0.7334, + "step": 11080 + }, + { + "epoch": 0.8384851121788809, + "grad_norm": 2.482813596725464, + "learning_rate": 1.253115399899962e-06, + "loss": 0.6849, + "step": 11081 + }, + { + "epoch": 0.8385607809012144, + "grad_norm": 2.486558198928833, + "learning_rate": 1.2519692580048022e-06, + "loss": 0.6309, + "step": 11082 + }, + { + "epoch": 0.8386364496235481, + "grad_norm": 2.245243549346924, + "learning_rate": 1.250823601316388e-06, + "loss": 0.7787, + "step": 11083 + }, + { + "epoch": 0.8387121183458818, + "grad_norm": 2.0628883838653564, + "learning_rate": 1.2496784299064634e-06, + "loss": 0.6715, + "step": 11084 + }, + { + "epoch": 0.8387877870682153, + "grad_norm": 1.9107873439788818, + "learning_rate": 1.2485337438467425e-06, + "loss": 0.5899, + "step": 11085 + }, + { + "epoch": 0.838863455790549, + "grad_norm": 2.893704891204834, + "learning_rate": 1.2473895432089116e-06, + "loss": 0.7503, + "step": 11086 + }, + { + "epoch": 0.8389391245128826, + "grad_norm": 2.1818957328796387, + "learning_rate": 1.246245828064623e-06, + "loss": 0.6855, + "step": 11087 + }, + { + "epoch": 0.8390147932352162, + "grad_norm": 2.1986918449401855, + "learning_rate": 1.2451025984854952e-06, + "loss": 0.7521, + "step": 11088 + }, + { + "epoch": 0.8390904619575499, + "grad_norm": 2.2782137393951416, + "learning_rate": 1.2439598545431285e-06, + "loss": 0.6727, + "step": 11089 + }, + { + "epoch": 0.8391661306798834, + "grad_norm": 2.482480764389038, + "learning_rate": 1.2428175963090803e-06, + "loss": 0.7597, + "step": 11090 + }, + { + "epoch": 0.8392417994022171, + "grad_norm": 2.7252655029296875, + "learning_rate": 1.2416758238548872e-06, + "loss": 0.5514, + "step": 11091 + }, + { + "epoch": 0.8393174681245508, + "grad_norm": 2.0488505363464355, + "learning_rate": 1.2405345372520447e-06, + "loss": 0.589, + "step": 11092 + }, + { + "epoch": 0.8393931368468843, + "grad_norm": 2.3147764205932617, + "learning_rate": 1.2393937365720247e-06, + "loss": 0.6551, + "step": 11093 + }, + { + "epoch": 0.839468805569218, + "grad_norm": 1.9987397193908691, + 
"learning_rate": 1.2382534218862738e-06, + "loss": 0.8516, + "step": 11094 + }, + { + "epoch": 0.8395444742915515, + "grad_norm": 1.991015911102295, + "learning_rate": 1.2371135932661967e-06, + "loss": 0.5578, + "step": 11095 + }, + { + "epoch": 0.8396201430138852, + "grad_norm": 2.4074630737304688, + "learning_rate": 1.235974250783174e-06, + "loss": 0.8004, + "step": 11096 + }, + { + "epoch": 0.8396958117362189, + "grad_norm": 1.9555567502975464, + "learning_rate": 1.234835394508556e-06, + "loss": 0.7125, + "step": 11097 + }, + { + "epoch": 0.8397714804585524, + "grad_norm": 2.148538827896118, + "learning_rate": 1.2336970245136604e-06, + "loss": 0.6414, + "step": 11098 + }, + { + "epoch": 0.8398471491808861, + "grad_norm": 2.0188326835632324, + "learning_rate": 1.2325591408697773e-06, + "loss": 0.6591, + "step": 11099 + }, + { + "epoch": 0.8399228179032197, + "grad_norm": 2.890838146209717, + "learning_rate": 1.2314217436481636e-06, + "loss": 0.6491, + "step": 11100 + }, + { + "epoch": 0.8399984866255533, + "grad_norm": 1.9198577404022217, + "learning_rate": 1.2302848329200484e-06, + "loss": 0.5948, + "step": 11101 + }, + { + "epoch": 0.840074155347887, + "grad_norm": 12.436066627502441, + "learning_rate": 1.2291484087566258e-06, + "loss": 0.7092, + "step": 11102 + }, + { + "epoch": 0.8401498240702205, + "grad_norm": 2.7914984226226807, + "learning_rate": 1.2280124712290618e-06, + "loss": 0.6803, + "step": 11103 + }, + { + "epoch": 0.8402254927925542, + "grad_norm": 2.818707227706909, + "learning_rate": 1.2268770204084955e-06, + "loss": 0.7888, + "step": 11104 + }, + { + "epoch": 0.8403011615148879, + "grad_norm": 2.656458854675293, + "learning_rate": 1.225742056366035e-06, + "loss": 0.6282, + "step": 11105 + }, + { + "epoch": 0.8403768302372214, + "grad_norm": 2.1726672649383545, + "learning_rate": 1.2246075791727494e-06, + "loss": 0.6697, + "step": 11106 + }, + { + "epoch": 0.8404524989595551, + "grad_norm": 2.160932779312134, + "learning_rate": 1.223473588899685e-06, + "loss": 0.7079, + "step": 11107 + }, + { + "epoch": 0.8405281676818886, + "grad_norm": 1.9137705564498901, + "learning_rate": 1.222340085617858e-06, + "loss": 0.6773, + "step": 11108 + }, + { + "epoch": 0.8406038364042223, + "grad_norm": 2.2129149436950684, + "learning_rate": 1.2212070693982505e-06, + "loss": 0.7601, + "step": 11109 + }, + { + "epoch": 0.840679505126556, + "grad_norm": 1.9030208587646484, + "learning_rate": 1.2200745403118159e-06, + "loss": 0.7088, + "step": 11110 + }, + { + "epoch": 0.8407551738488895, + "grad_norm": 4.24271821975708, + "learning_rate": 1.2189424984294774e-06, + "loss": 0.6975, + "step": 11111 + }, + { + "epoch": 0.8408308425712232, + "grad_norm": 2.1732661724090576, + "learning_rate": 1.217810943822128e-06, + "loss": 0.7062, + "step": 11112 + }, + { + "epoch": 0.8409065112935568, + "grad_norm": 2.135040521621704, + "learning_rate": 1.2166798765606255e-06, + "loss": 0.5736, + "step": 11113 + }, + { + "epoch": 0.8409821800158904, + "grad_norm": 1.6705262660980225, + "learning_rate": 1.2155492967158019e-06, + "loss": 0.5572, + "step": 11114 + }, + { + "epoch": 0.8410578487382241, + "grad_norm": 2.2693963050842285, + "learning_rate": 1.2144192043584637e-06, + "loss": 0.6938, + "step": 11115 + }, + { + "epoch": 0.8411335174605576, + "grad_norm": 2.54014253616333, + "learning_rate": 1.2132895995593742e-06, + "loss": 0.6641, + "step": 11116 + }, + { + "epoch": 0.8412091861828913, + "grad_norm": 1.9177050590515137, + "learning_rate": 1.212160482389275e-06, + "loss": 0.6905, + "step": 11117 + 
}, + { + "epoch": 0.841284854905225, + "grad_norm": 2.4678313732147217, + "learning_rate": 1.2110318529188764e-06, + "loss": 0.8327, + "step": 11118 + }, + { + "epoch": 0.8413605236275585, + "grad_norm": 1.8306165933609009, + "learning_rate": 1.209903711218855e-06, + "loss": 0.6428, + "step": 11119 + }, + { + "epoch": 0.8414361923498922, + "grad_norm": 10.050545692443848, + "learning_rate": 1.208776057359859e-06, + "loss": 0.5172, + "step": 11120 + }, + { + "epoch": 0.8415118610722258, + "grad_norm": 2.323607921600342, + "learning_rate": 1.207648891412507e-06, + "loss": 0.7041, + "step": 11121 + }, + { + "epoch": 0.8415875297945594, + "grad_norm": 2.1820437908172607, + "learning_rate": 1.206522213447384e-06, + "loss": 0.727, + "step": 11122 + }, + { + "epoch": 0.8416631985168931, + "grad_norm": 2.826960563659668, + "learning_rate": 1.2053960235350498e-06, + "loss": 0.6435, + "step": 11123 + }, + { + "epoch": 0.8417388672392266, + "grad_norm": 2.4186246395111084, + "learning_rate": 1.2042703217460235e-06, + "loss": 0.6031, + "step": 11124 + }, + { + "epoch": 0.8418145359615603, + "grad_norm": 1.740941047668457, + "learning_rate": 1.2031451081508057e-06, + "loss": 0.619, + "step": 11125 + }, + { + "epoch": 0.8418902046838939, + "grad_norm": 2.0095555782318115, + "learning_rate": 1.2020203828198617e-06, + "loss": 0.7084, + "step": 11126 + }, + { + "epoch": 0.8419658734062275, + "grad_norm": 2.2118263244628906, + "learning_rate": 1.2008961458236206e-06, + "loss": 0.6634, + "step": 11127 + }, + { + "epoch": 0.8420415421285612, + "grad_norm": 2.157186269760132, + "learning_rate": 1.1997723972324888e-06, + "loss": 0.62, + "step": 11128 + }, + { + "epoch": 0.8421172108508947, + "grad_norm": 2.188436985015869, + "learning_rate": 1.198649137116838e-06, + "loss": 0.6341, + "step": 11129 + }, + { + "epoch": 0.8421928795732284, + "grad_norm": 2.3916919231414795, + "learning_rate": 1.197526365547011e-06, + "loss": 0.6438, + "step": 11130 + }, + { + "epoch": 0.8422685482955621, + "grad_norm": 2.1905405521392822, + "learning_rate": 1.1964040825933196e-06, + "loss": 0.7768, + "step": 11131 + }, + { + "epoch": 0.8423442170178956, + "grad_norm": 2.423164129257202, + "learning_rate": 1.1952822883260445e-06, + "loss": 0.6209, + "step": 11132 + }, + { + "epoch": 0.8424198857402293, + "grad_norm": 2.417698621749878, + "learning_rate": 1.1941609828154374e-06, + "loss": 0.7285, + "step": 11133 + }, + { + "epoch": 0.8424955544625629, + "grad_norm": 2.6083669662475586, + "learning_rate": 1.1930401661317124e-06, + "loss": 0.6182, + "step": 11134 + }, + { + "epoch": 0.8425712231848965, + "grad_norm": 2.146847724914551, + "learning_rate": 1.1919198383450663e-06, + "loss": 0.6415, + "step": 11135 + }, + { + "epoch": 0.8426468919072302, + "grad_norm": 2.49105167388916, + "learning_rate": 1.190799999525653e-06, + "loss": 0.6844, + "step": 11136 + }, + { + "epoch": 0.8427225606295637, + "grad_norm": 2.345392942428589, + "learning_rate": 1.189680649743604e-06, + "loss": 0.56, + "step": 11137 + }, + { + "epoch": 0.8427982293518974, + "grad_norm": 2.192192316055298, + "learning_rate": 1.1885617890690128e-06, + "loss": 0.6248, + "step": 11138 + }, + { + "epoch": 0.842873898074231, + "grad_norm": 3.144986867904663, + "learning_rate": 1.1874434175719458e-06, + "loss": 0.6772, + "step": 11139 + }, + { + "epoch": 0.8429495667965646, + "grad_norm": 1.7006821632385254, + "learning_rate": 1.1863255353224444e-06, + "loss": 0.5736, + "step": 11140 + }, + { + "epoch": 0.8430252355188983, + "grad_norm": 3.1362383365631104, + 
"learning_rate": 1.1852081423905087e-06, + "loss": 0.6766, + "step": 11141 + }, + { + "epoch": 0.8431009042412319, + "grad_norm": 2.2844927310943604, + "learning_rate": 1.1840912388461152e-06, + "loss": 0.6861, + "step": 11142 + }, + { + "epoch": 0.8431765729635655, + "grad_norm": 2.2476208209991455, + "learning_rate": 1.1829748247592082e-06, + "loss": 0.7521, + "step": 11143 + }, + { + "epoch": 0.8432522416858992, + "grad_norm": 2.138150215148926, + "learning_rate": 1.181858900199702e-06, + "loss": 0.6996, + "step": 11144 + }, + { + "epoch": 0.8433279104082327, + "grad_norm": 2.889930009841919, + "learning_rate": 1.1807434652374754e-06, + "loss": 0.6901, + "step": 11145 + }, + { + "epoch": 0.8434035791305664, + "grad_norm": 1.7074170112609863, + "learning_rate": 1.1796285199423857e-06, + "loss": 0.6359, + "step": 11146 + }, + { + "epoch": 0.8434792478529, + "grad_norm": 2.217623710632324, + "learning_rate": 1.178514064384254e-06, + "loss": 0.6327, + "step": 11147 + }, + { + "epoch": 0.8435549165752336, + "grad_norm": 1.8675048351287842, + "learning_rate": 1.1774000986328665e-06, + "loss": 0.5452, + "step": 11148 + }, + { + "epoch": 0.8436305852975673, + "grad_norm": 1.739334225654602, + "learning_rate": 1.1762866227579872e-06, + "loss": 0.6498, + "step": 11149 + }, + { + "epoch": 0.8437062540199008, + "grad_norm": 1.5697089433670044, + "learning_rate": 1.1751736368293417e-06, + "loss": 0.5786, + "step": 11150 + }, + { + "epoch": 0.8437819227422345, + "grad_norm": 2.253666639328003, + "learning_rate": 1.1740611409166368e-06, + "loss": 0.7211, + "step": 11151 + }, + { + "epoch": 0.8438575914645681, + "grad_norm": 3.369987964630127, + "learning_rate": 1.172949135089532e-06, + "loss": 0.595, + "step": 11152 + }, + { + "epoch": 0.8439332601869017, + "grad_norm": 2.647265672683716, + "learning_rate": 1.171837619417669e-06, + "loss": 0.6779, + "step": 11153 + }, + { + "epoch": 0.8440089289092354, + "grad_norm": 2.462435483932495, + "learning_rate": 1.1707265939706543e-06, + "loss": 0.6241, + "step": 11154 + }, + { + "epoch": 0.844084597631569, + "grad_norm": 2.3992083072662354, + "learning_rate": 1.1696160588180617e-06, + "loss": 0.7099, + "step": 11155 + }, + { + "epoch": 0.8441602663539026, + "grad_norm": 2.0883431434631348, + "learning_rate": 1.1685060140294388e-06, + "loss": 0.7162, + "step": 11156 + }, + { + "epoch": 0.8442359350762363, + "grad_norm": 2.320479393005371, + "learning_rate": 1.1673964596742994e-06, + "loss": 0.6615, + "step": 11157 + }, + { + "epoch": 0.8443116037985698, + "grad_norm": 2.3560616970062256, + "learning_rate": 1.1662873958221294e-06, + "loss": 0.6776, + "step": 11158 + }, + { + "epoch": 0.8443872725209035, + "grad_norm": 2.070371389389038, + "learning_rate": 1.165178822542378e-06, + "loss": 0.5541, + "step": 11159 + }, + { + "epoch": 0.8444629412432371, + "grad_norm": 7.825291633605957, + "learning_rate": 1.164070739904468e-06, + "loss": 0.8369, + "step": 11160 + }, + { + "epoch": 0.8445386099655707, + "grad_norm": 2.257880687713623, + "learning_rate": 1.1629631479777953e-06, + "loss": 0.7629, + "step": 11161 + }, + { + "epoch": 0.8446142786879044, + "grad_norm": 2.6338634490966797, + "learning_rate": 1.161856046831718e-06, + "loss": 0.7076, + "step": 11162 + }, + { + "epoch": 0.844689947410238, + "grad_norm": 2.4323246479034424, + "learning_rate": 1.1607494365355664e-06, + "loss": 0.7328, + "step": 11163 + }, + { + "epoch": 0.8447656161325716, + "grad_norm": 2.372086524963379, + "learning_rate": 1.1596433171586389e-06, + "loss": 0.7085, + "step": 11164 + }, + { 
+ "epoch": 0.8448412848549052, + "grad_norm": 2.4352121353149414, + "learning_rate": 1.1585376887702074e-06, + "loss": 0.7244, + "step": 11165 + }, + { + "epoch": 0.8449169535772388, + "grad_norm": 1.8406596183776855, + "learning_rate": 1.1574325514395073e-06, + "loss": 0.6614, + "step": 11166 + }, + { + "epoch": 0.8449926222995725, + "grad_norm": 2.4071455001831055, + "learning_rate": 1.1563279052357464e-06, + "loss": 0.8433, + "step": 11167 + }, + { + "epoch": 0.8450682910219061, + "grad_norm": 1.97054123878479, + "learning_rate": 1.1552237502281023e-06, + "loss": 0.624, + "step": 11168 + }, + { + "epoch": 0.8451439597442397, + "grad_norm": 2.932701826095581, + "learning_rate": 1.1541200864857225e-06, + "loss": 0.6386, + "step": 11169 + }, + { + "epoch": 0.8452196284665734, + "grad_norm": 2.2148234844207764, + "learning_rate": 1.153016914077714e-06, + "loss": 0.6634, + "step": 11170 + }, + { + "epoch": 0.845295297188907, + "grad_norm": 2.325047254562378, + "learning_rate": 1.1519142330731705e-06, + "loss": 0.6842, + "step": 11171 + }, + { + "epoch": 0.8453709659112406, + "grad_norm": 2.387718677520752, + "learning_rate": 1.1508120435411416e-06, + "loss": 0.6088, + "step": 11172 + }, + { + "epoch": 0.8454466346335742, + "grad_norm": 1.7872700691223145, + "learning_rate": 1.149710345550649e-06, + "loss": 0.6071, + "step": 11173 + }, + { + "epoch": 0.8455223033559078, + "grad_norm": 1.974530577659607, + "learning_rate": 1.148609139170685e-06, + "loss": 0.7307, + "step": 11174 + }, + { + "epoch": 0.8455979720782415, + "grad_norm": 2.9320738315582275, + "learning_rate": 1.147508424470212e-06, + "loss": 0.6461, + "step": 11175 + }, + { + "epoch": 0.845673640800575, + "grad_norm": 2.1946792602539062, + "learning_rate": 1.146408201518159e-06, + "loss": 0.8129, + "step": 11176 + }, + { + "epoch": 0.8457493095229087, + "grad_norm": 2.6159780025482178, + "learning_rate": 1.1453084703834259e-06, + "loss": 0.7393, + "step": 11177 + }, + { + "epoch": 0.8458249782452423, + "grad_norm": 2.3954858779907227, + "learning_rate": 1.1442092311348814e-06, + "loss": 0.6058, + "step": 11178 + }, + { + "epoch": 0.8459006469675759, + "grad_norm": 2.413275957107544, + "learning_rate": 1.1431104838413637e-06, + "loss": 0.6777, + "step": 11179 + }, + { + "epoch": 0.8459763156899096, + "grad_norm": 1.921155333518982, + "learning_rate": 1.1420122285716798e-06, + "loss": 0.5244, + "step": 11180 + }, + { + "epoch": 0.8460519844122432, + "grad_norm": 2.647214651107788, + "learning_rate": 1.1409144653946064e-06, + "loss": 0.6092, + "step": 11181 + }, + { + "epoch": 0.8461276531345768, + "grad_norm": 3.575390338897705, + "learning_rate": 1.1398171943788878e-06, + "loss": 0.6721, + "step": 11182 + }, + { + "epoch": 0.8462033218569105, + "grad_norm": 2.585902452468872, + "learning_rate": 1.1387204155932418e-06, + "loss": 0.6889, + "step": 11183 + }, + { + "epoch": 0.846278990579244, + "grad_norm": 2.851043462753296, + "learning_rate": 1.1376241291063476e-06, + "loss": 0.6995, + "step": 11184 + }, + { + "epoch": 0.8463546593015777, + "grad_norm": 1.9983781576156616, + "learning_rate": 1.1365283349868602e-06, + "loss": 0.6446, + "step": 11185 + }, + { + "epoch": 0.8464303280239113, + "grad_norm": 2.99881911277771, + "learning_rate": 1.1354330333034028e-06, + "loss": 0.7288, + "step": 11186 + }, + { + "epoch": 0.8465059967462449, + "grad_norm": 2.3670239448547363, + "learning_rate": 1.1343382241245656e-06, + "loss": 0.7242, + "step": 11187 + }, + { + "epoch": 0.8465816654685786, + "grad_norm": 2.0711822509765625, + 
"learning_rate": 1.1332439075189095e-06, + "loss": 0.6653, + "step": 11188 + }, + { + "epoch": 0.8466573341909122, + "grad_norm": 1.839362382888794, + "learning_rate": 1.132150083554964e-06, + "loss": 0.6028, + "step": 11189 + }, + { + "epoch": 0.8467330029132458, + "grad_norm": 2.130958080291748, + "learning_rate": 1.1310567523012298e-06, + "loss": 0.7051, + "step": 11190 + }, + { + "epoch": 0.8468086716355794, + "grad_norm": 2.0340983867645264, + "learning_rate": 1.1299639138261687e-06, + "loss": 0.6712, + "step": 11191 + }, + { + "epoch": 0.846884340357913, + "grad_norm": 2.854250192642212, + "learning_rate": 1.1288715681982247e-06, + "loss": 0.6333, + "step": 11192 + }, + { + "epoch": 0.8469600090802467, + "grad_norm": 2.552464485168457, + "learning_rate": 1.127779715485802e-06, + "loss": 0.6445, + "step": 11193 + }, + { + "epoch": 0.8470356778025803, + "grad_norm": 2.156697988510132, + "learning_rate": 1.1266883557572762e-06, + "loss": 0.6876, + "step": 11194 + }, + { + "epoch": 0.8471113465249139, + "grad_norm": 1.9202338457107544, + "learning_rate": 1.1255974890809892e-06, + "loss": 0.8281, + "step": 11195 + }, + { + "epoch": 0.8471870152472476, + "grad_norm": 2.571288585662842, + "learning_rate": 1.1245071155252547e-06, + "loss": 0.7569, + "step": 11196 + }, + { + "epoch": 0.8472626839695812, + "grad_norm": 2.219005823135376, + "learning_rate": 1.1234172351583611e-06, + "loss": 0.5319, + "step": 11197 + }, + { + "epoch": 0.8473383526919148, + "grad_norm": 1.8485450744628906, + "learning_rate": 1.1223278480485535e-06, + "loss": 0.5917, + "step": 11198 + }, + { + "epoch": 0.8474140214142484, + "grad_norm": 2.1197338104248047, + "learning_rate": 1.1212389542640566e-06, + "loss": 0.6706, + "step": 11199 + }, + { + "epoch": 0.847489690136582, + "grad_norm": 2.1624670028686523, + "learning_rate": 1.1201505538730586e-06, + "loss": 0.6419, + "step": 11200 + }, + { + "epoch": 0.8475653588589157, + "grad_norm": 6.582333087921143, + "learning_rate": 1.1190626469437192e-06, + "loss": 0.6285, + "step": 11201 + }, + { + "epoch": 0.8476410275812493, + "grad_norm": 2.2934112548828125, + "learning_rate": 1.117975233544168e-06, + "loss": 0.6322, + "step": 11202 + }, + { + "epoch": 0.8477166963035829, + "grad_norm": 2.3044066429138184, + "learning_rate": 1.1168883137425003e-06, + "loss": 0.6555, + "step": 11203 + }, + { + "epoch": 0.8477923650259165, + "grad_norm": 2.2896509170532227, + "learning_rate": 1.1158018876067855e-06, + "loss": 0.6963, + "step": 11204 + }, + { + "epoch": 0.8478680337482501, + "grad_norm": 3.5141971111297607, + "learning_rate": 1.1147159552050557e-06, + "loss": 0.6393, + "step": 11205 + }, + { + "epoch": 0.8479437024705838, + "grad_norm": 2.2190842628479004, + "learning_rate": 1.113630516605315e-06, + "loss": 0.666, + "step": 11206 + }, + { + "epoch": 0.8480193711929174, + "grad_norm": 1.9215750694274902, + "learning_rate": 1.1125455718755402e-06, + "loss": 0.4681, + "step": 11207 + }, + { + "epoch": 0.848095039915251, + "grad_norm": 1.961254596710205, + "learning_rate": 1.1114611210836752e-06, + "loss": 0.6418, + "step": 11208 + }, + { + "epoch": 0.8481707086375847, + "grad_norm": 1.8327749967575073, + "learning_rate": 1.1103771642976272e-06, + "loss": 0.7484, + "step": 11209 + }, + { + "epoch": 0.8482463773599183, + "grad_norm": 2.4051928520202637, + "learning_rate": 1.1092937015852793e-06, + "loss": 0.6549, + "step": 11210 + }, + { + "epoch": 0.8483220460822519, + "grad_norm": 2.0945074558258057, + "learning_rate": 1.108210733014482e-06, + "loss": 0.6683, + "step": 11211 + 
}, + { + "epoch": 0.8483977148045855, + "grad_norm": 2.168553113937378, + "learning_rate": 1.1071282586530533e-06, + "loss": 0.7284, + "step": 11212 + }, + { + "epoch": 0.8484733835269191, + "grad_norm": 2.0133745670318604, + "learning_rate": 1.1060462785687816e-06, + "loss": 0.6258, + "step": 11213 + }, + { + "epoch": 0.8485490522492528, + "grad_norm": 2.4811995029449463, + "learning_rate": 1.104964792829424e-06, + "loss": 0.6499, + "step": 11214 + }, + { + "epoch": 0.8486247209715864, + "grad_norm": 2.246004343032837, + "learning_rate": 1.1038838015027091e-06, + "loss": 0.7389, + "step": 11215 + }, + { + "epoch": 0.84870038969392, + "grad_norm": 3.04372501373291, + "learning_rate": 1.1028033046563251e-06, + "loss": 0.7067, + "step": 11216 + }, + { + "epoch": 0.8487760584162536, + "grad_norm": 4.410548210144043, + "learning_rate": 1.1017233023579434e-06, + "loss": 0.7638, + "step": 11217 + }, + { + "epoch": 0.8488517271385873, + "grad_norm": 2.3239493370056152, + "learning_rate": 1.1006437946751964e-06, + "loss": 0.6129, + "step": 11218 + }, + { + "epoch": 0.8489273958609209, + "grad_norm": 1.7490601539611816, + "learning_rate": 1.0995647816756827e-06, + "loss": 0.6007, + "step": 11219 + }, + { + "epoch": 0.8490030645832545, + "grad_norm": 4.648648738861084, + "learning_rate": 1.0984862634269753e-06, + "loss": 0.5775, + "step": 11220 + }, + { + "epoch": 0.8490787333055881, + "grad_norm": 1.797368049621582, + "learning_rate": 1.0974082399966151e-06, + "loss": 0.6743, + "step": 11221 + }, + { + "epoch": 0.8491544020279218, + "grad_norm": 2.3413918018341064, + "learning_rate": 1.0963307114521103e-06, + "loss": 0.9284, + "step": 11222 + }, + { + "epoch": 0.8492300707502554, + "grad_norm": 2.0559816360473633, + "learning_rate": 1.0952536778609407e-06, + "loss": 0.6177, + "step": 11223 + }, + { + "epoch": 0.849305739472589, + "grad_norm": 1.990634560585022, + "learning_rate": 1.0941771392905526e-06, + "loss": 0.5489, + "step": 11224 + }, + { + "epoch": 0.8493814081949226, + "grad_norm": 1.8581657409667969, + "learning_rate": 1.0931010958083619e-06, + "loss": 0.6661, + "step": 11225 + }, + { + "epoch": 0.8494570769172562, + "grad_norm": 3.590940475463867, + "learning_rate": 1.0920255474817577e-06, + "loss": 0.5803, + "step": 11226 + }, + { + "epoch": 0.8495327456395899, + "grad_norm": 1.9839054346084595, + "learning_rate": 1.0909504943780863e-06, + "loss": 0.7265, + "step": 11227 + }, + { + "epoch": 0.8496084143619235, + "grad_norm": 2.0688846111297607, + "learning_rate": 1.0898759365646786e-06, + "loss": 0.6612, + "step": 11228 + }, + { + "epoch": 0.8496840830842571, + "grad_norm": 2.5211896896362305, + "learning_rate": 1.0888018741088258e-06, + "loss": 0.5971, + "step": 11229 + }, + { + "epoch": 0.8497597518065907, + "grad_norm": 1.9265793561935425, + "learning_rate": 1.0877283070777852e-06, + "loss": 0.7328, + "step": 11230 + }, + { + "epoch": 0.8498354205289244, + "grad_norm": 2.134287118911743, + "learning_rate": 1.08665523553879e-06, + "loss": 0.6763, + "step": 11231 + }, + { + "epoch": 0.849911089251258, + "grad_norm": 2.190967082977295, + "learning_rate": 1.0855826595590385e-06, + "loss": 0.7203, + "step": 11232 + }, + { + "epoch": 0.8499867579735916, + "grad_norm": 2.094014883041382, + "learning_rate": 1.0845105792056989e-06, + "loss": 0.7426, + "step": 11233 + }, + { + "epoch": 0.8500624266959252, + "grad_norm": 2.0137734413146973, + "learning_rate": 1.0834389945459096e-06, + "loss": 0.6875, + "step": 11234 + }, + { + "epoch": 0.8501380954182589, + "grad_norm": 2.131049156188965, + 
"learning_rate": 1.0823679056467746e-06, + "loss": 0.6814, + "step": 11235 + }, + { + "epoch": 0.8502137641405925, + "grad_norm": 2.530545949935913, + "learning_rate": 1.0812973125753708e-06, + "loss": 0.5937, + "step": 11236 + }, + { + "epoch": 0.8502894328629261, + "grad_norm": 2.1850709915161133, + "learning_rate": 1.080227215398741e-06, + "loss": 0.7354, + "step": 11237 + }, + { + "epoch": 0.8503651015852597, + "grad_norm": 3.4205210208892822, + "learning_rate": 1.0791576141838997e-06, + "loss": 0.6784, + "step": 11238 + }, + { + "epoch": 0.8504407703075934, + "grad_norm": 1.7098037004470825, + "learning_rate": 1.0780885089978268e-06, + "loss": 0.6278, + "step": 11239 + }, + { + "epoch": 0.850516439029927, + "grad_norm": 2.0882728099823, + "learning_rate": 1.0770198999074763e-06, + "loss": 0.5592, + "step": 11240 + }, + { + "epoch": 0.8505921077522606, + "grad_norm": 1.8970396518707275, + "learning_rate": 1.0759517869797636e-06, + "loss": 0.577, + "step": 11241 + }, + { + "epoch": 0.8506677764745942, + "grad_norm": 2.3336918354034424, + "learning_rate": 1.0748841702815775e-06, + "loss": 0.6625, + "step": 11242 + }, + { + "epoch": 0.8507434451969278, + "grad_norm": 2.1653034687042236, + "learning_rate": 1.0738170498797813e-06, + "loss": 0.7601, + "step": 11243 + }, + { + "epoch": 0.8508191139192615, + "grad_norm": 2.735419988632202, + "learning_rate": 1.0727504258411958e-06, + "loss": 0.7221, + "step": 11244 + }, + { + "epoch": 0.8508947826415951, + "grad_norm": 2.724544048309326, + "learning_rate": 1.0716842982326182e-06, + "loss": 0.6285, + "step": 11245 + }, + { + "epoch": 0.8509704513639287, + "grad_norm": 1.9268399477005005, + "learning_rate": 1.0706186671208144e-06, + "loss": 0.6552, + "step": 11246 + }, + { + "epoch": 0.8510461200862623, + "grad_norm": 3.0010483264923096, + "learning_rate": 1.069553532572515e-06, + "loss": 0.6105, + "step": 11247 + }, + { + "epoch": 0.851121788808596, + "grad_norm": 2.1853091716766357, + "learning_rate": 1.0684888946544244e-06, + "loss": 0.6504, + "step": 11248 + }, + { + "epoch": 0.8511974575309296, + "grad_norm": 2.737804412841797, + "learning_rate": 1.0674247534332125e-06, + "loss": 0.6128, + "step": 11249 + }, + { + "epoch": 0.8512731262532632, + "grad_norm": 2.3587794303894043, + "learning_rate": 1.066361108975522e-06, + "loss": 0.6185, + "step": 11250 + }, + { + "epoch": 0.8513487949755968, + "grad_norm": 1.3188940286636353, + "learning_rate": 1.0652979613479574e-06, + "loss": 0.7213, + "step": 11251 + }, + { + "epoch": 0.8514244636979305, + "grad_norm": 2.1479740142822266, + "learning_rate": 1.0642353106170956e-06, + "loss": 0.4939, + "step": 11252 + }, + { + "epoch": 0.8515001324202641, + "grad_norm": 2.823068141937256, + "learning_rate": 1.0631731568494884e-06, + "loss": 0.7325, + "step": 11253 + }, + { + "epoch": 0.8515758011425977, + "grad_norm": 1.8597098588943481, + "learning_rate": 1.0621115001116516e-06, + "loss": 0.5473, + "step": 11254 + }, + { + "epoch": 0.8516514698649313, + "grad_norm": 2.1567814350128174, + "learning_rate": 1.0610503404700639e-06, + "loss": 0.7219, + "step": 11255 + }, + { + "epoch": 0.851727138587265, + "grad_norm": 2.5054931640625, + "learning_rate": 1.0599896779911822e-06, + "loss": 0.7023, + "step": 11256 + }, + { + "epoch": 0.8518028073095986, + "grad_norm": 2.3276219367980957, + "learning_rate": 1.0589295127414283e-06, + "loss": 0.661, + "step": 11257 + }, + { + "epoch": 0.8518784760319322, + "grad_norm": 2.0778353214263916, + "learning_rate": 1.0578698447871923e-06, + "loss": 0.7084, + "step": 11258 + 
}, + { + "epoch": 0.8519541447542658, + "grad_norm": 2.253126621246338, + "learning_rate": 1.056810674194835e-06, + "loss": 0.6965, + "step": 11259 + }, + { + "epoch": 0.8520298134765995, + "grad_norm": 2.866034984588623, + "learning_rate": 1.0557520010306842e-06, + "loss": 0.7603, + "step": 11260 + }, + { + "epoch": 0.8521054821989331, + "grad_norm": 2.482037305831909, + "learning_rate": 1.0546938253610389e-06, + "loss": 0.5681, + "step": 11261 + }, + { + "epoch": 0.8521811509212667, + "grad_norm": 1.911925196647644, + "learning_rate": 1.0536361472521644e-06, + "loss": 0.7476, + "step": 11262 + }, + { + "epoch": 0.8522568196436003, + "grad_norm": 2.388798236846924, + "learning_rate": 1.0525789667702935e-06, + "loss": 0.6213, + "step": 11263 + }, + { + "epoch": 0.8523324883659339, + "grad_norm": 2.2827963829040527, + "learning_rate": 1.051522283981636e-06, + "loss": 0.7701, + "step": 11264 + }, + { + "epoch": 0.8524081570882676, + "grad_norm": 2.937349796295166, + "learning_rate": 1.0504660989523604e-06, + "loss": 0.7427, + "step": 11265 + }, + { + "epoch": 0.8524838258106012, + "grad_norm": 1.9935485124588013, + "learning_rate": 1.0494104117486086e-06, + "loss": 0.6395, + "step": 11266 + }, + { + "epoch": 0.8525594945329348, + "grad_norm": 1.938444972038269, + "learning_rate": 1.0483552224364936e-06, + "loss": 0.639, + "step": 11267 + }, + { + "epoch": 0.8526351632552684, + "grad_norm": 2.6357712745666504, + "learning_rate": 1.047300531082092e-06, + "loss": 0.7034, + "step": 11268 + }, + { + "epoch": 0.8527108319776021, + "grad_norm": 1.8678675889968872, + "learning_rate": 1.0462463377514543e-06, + "loss": 0.7342, + "step": 11269 + }, + { + "epoch": 0.8527865006999357, + "grad_norm": 2.1463828086853027, + "learning_rate": 1.045192642510596e-06, + "loss": 0.6495, + "step": 11270 + }, + { + "epoch": 0.8528621694222693, + "grad_norm": 1.9937474727630615, + "learning_rate": 1.0441394454255035e-06, + "loss": 0.6389, + "step": 11271 + }, + { + "epoch": 0.8529378381446029, + "grad_norm": 2.2576024532318115, + "learning_rate": 1.0430867465621328e-06, + "loss": 0.676, + "step": 11272 + }, + { + "epoch": 0.8530135068669366, + "grad_norm": 2.1842191219329834, + "learning_rate": 1.0420345459864023e-06, + "loss": 0.5946, + "step": 11273 + }, + { + "epoch": 0.8530891755892702, + "grad_norm": 1.927615761756897, + "learning_rate": 1.0409828437642092e-06, + "loss": 0.6436, + "step": 11274 + }, + { + "epoch": 0.8531648443116038, + "grad_norm": 3.0947635173797607, + "learning_rate": 1.039931639961416e-06, + "loss": 0.5957, + "step": 11275 + }, + { + "epoch": 0.8532405130339374, + "grad_norm": 2.468949556350708, + "learning_rate": 1.0388809346438467e-06, + "loss": 0.618, + "step": 11276 + }, + { + "epoch": 0.853316181756271, + "grad_norm": 3.7624311447143555, + "learning_rate": 1.037830727877303e-06, + "loss": 0.6508, + "step": 11277 + }, + { + "epoch": 0.8533918504786047, + "grad_norm": 1.588759183883667, + "learning_rate": 1.036781019727552e-06, + "loss": 0.6209, + "step": 11278 + }, + { + "epoch": 0.8534675192009383, + "grad_norm": 2.215994119644165, + "learning_rate": 1.0357318102603293e-06, + "loss": 0.6864, + "step": 11279 + }, + { + "epoch": 0.8535431879232719, + "grad_norm": 2.4442219734191895, + "learning_rate": 1.0346830995413405e-06, + "loss": 0.5362, + "step": 11280 + }, + { + "epoch": 0.8536188566456056, + "grad_norm": 1.9089670181274414, + "learning_rate": 1.0336348876362587e-06, + "loss": 0.6868, + "step": 11281 + }, + { + "epoch": 0.8536945253679392, + "grad_norm": 2.9302616119384766, + 
"learning_rate": 1.0325871746107266e-06, + "loss": 0.8656, + "step": 11282 + }, + { + "epoch": 0.8537701940902728, + "grad_norm": 2.2180144786834717, + "learning_rate": 1.0315399605303558e-06, + "loss": 0.5997, + "step": 11283 + }, + { + "epoch": 0.8538458628126064, + "grad_norm": 2.2152187824249268, + "learning_rate": 1.0304932454607254e-06, + "loss": 0.675, + "step": 11284 + }, + { + "epoch": 0.85392153153494, + "grad_norm": 1.842807412147522, + "learning_rate": 1.0294470294673846e-06, + "loss": 0.6631, + "step": 11285 + }, + { + "epoch": 0.8539972002572737, + "grad_norm": 2.476421356201172, + "learning_rate": 1.0284013126158527e-06, + "loss": 0.5843, + "step": 11286 + }, + { + "epoch": 0.8540728689796073, + "grad_norm": 2.07633113861084, + "learning_rate": 1.0273560949716123e-06, + "loss": 0.6895, + "step": 11287 + }, + { + "epoch": 0.8541485377019409, + "grad_norm": 2.113013744354248, + "learning_rate": 1.026311376600117e-06, + "loss": 0.7096, + "step": 11288 + }, + { + "epoch": 0.8542242064242745, + "grad_norm": 1.9778804779052734, + "learning_rate": 1.0252671575667984e-06, + "loss": 0.5172, + "step": 11289 + }, + { + "epoch": 0.8542998751466081, + "grad_norm": 2.0087730884552, + "learning_rate": 1.0242234379370402e-06, + "loss": 0.5877, + "step": 11290 + }, + { + "epoch": 0.8543755438689418, + "grad_norm": 2.063204288482666, + "learning_rate": 1.0231802177762084e-06, + "loss": 0.6326, + "step": 11291 + }, + { + "epoch": 0.8544512125912754, + "grad_norm": 2.7657787799835205, + "learning_rate": 1.0221374971496316e-06, + "loss": 0.7622, + "step": 11292 + }, + { + "epoch": 0.854526881313609, + "grad_norm": 1.7157849073410034, + "learning_rate": 1.0210952761226075e-06, + "loss": 0.7045, + "step": 11293 + }, + { + "epoch": 0.8546025500359427, + "grad_norm": 2.532585620880127, + "learning_rate": 1.020053554760405e-06, + "loss": 0.5766, + "step": 11294 + }, + { + "epoch": 0.8546782187582763, + "grad_norm": 2.225200891494751, + "learning_rate": 1.0190123331282586e-06, + "loss": 0.6826, + "step": 11295 + }, + { + "epoch": 0.8547538874806099, + "grad_norm": 2.5582962036132812, + "learning_rate": 1.0179716112913737e-06, + "loss": 0.6516, + "step": 11296 + }, + { + "epoch": 0.8548295562029435, + "grad_norm": 1.9359956979751587, + "learning_rate": 1.0169313893149256e-06, + "loss": 0.6565, + "step": 11297 + }, + { + "epoch": 0.8549052249252771, + "grad_norm": 1.8306528329849243, + "learning_rate": 1.0158916672640524e-06, + "loss": 0.7271, + "step": 11298 + }, + { + "epoch": 0.8549808936476108, + "grad_norm": 2.077057123184204, + "learning_rate": 1.0148524452038643e-06, + "loss": 0.6396, + "step": 11299 + }, + { + "epoch": 0.8550565623699444, + "grad_norm": 2.082470655441284, + "learning_rate": 1.0138137231994477e-06, + "loss": 0.7199, + "step": 11300 + }, + { + "epoch": 0.855132231092278, + "grad_norm": 1.7077547311782837, + "learning_rate": 1.0127755013158432e-06, + "loss": 0.679, + "step": 11301 + }, + { + "epoch": 0.8552078998146116, + "grad_norm": 2.193718671798706, + "learning_rate": 1.0117377796180712e-06, + "loss": 0.7218, + "step": 11302 + }, + { + "epoch": 0.8552835685369452, + "grad_norm": 2.146713972091675, + "learning_rate": 1.010700558171117e-06, + "loss": 0.8642, + "step": 11303 + }, + { + "epoch": 0.8553592372592789, + "grad_norm": 2.3448047637939453, + "learning_rate": 1.0096638370399347e-06, + "loss": 0.6397, + "step": 11304 + }, + { + "epoch": 0.8554349059816125, + "grad_norm": 2.3762691020965576, + "learning_rate": 1.0086276162894462e-06, + "loss": 0.5452, + "step": 11305 + }, + 
{ + "epoch": 0.8555105747039461, + "grad_norm": 2.4727251529693604, + "learning_rate": 1.0075918959845437e-06, + "loss": 0.7184, + "step": 11306 + }, + { + "epoch": 0.8555862434262798, + "grad_norm": 2.2382912635803223, + "learning_rate": 1.0065566761900882e-06, + "loss": 0.5858, + "step": 11307 + }, + { + "epoch": 0.8556619121486134, + "grad_norm": 2.0968942642211914, + "learning_rate": 1.005521956970907e-06, + "loss": 0.6225, + "step": 11308 + }, + { + "epoch": 0.855737580870947, + "grad_norm": 2.303739309310913, + "learning_rate": 1.0044877383917962e-06, + "loss": 0.7995, + "step": 11309 + }, + { + "epoch": 0.8558132495932806, + "grad_norm": 2.176666498184204, + "learning_rate": 1.003454020517525e-06, + "loss": 0.6295, + "step": 11310 + }, + { + "epoch": 0.8558889183156142, + "grad_norm": 2.0503792762756348, + "learning_rate": 1.0024208034128285e-06, + "loss": 0.599, + "step": 11311 + }, + { + "epoch": 0.8559645870379479, + "grad_norm": 2.041576385498047, + "learning_rate": 1.0013880871424082e-06, + "loss": 0.6261, + "step": 11312 + }, + { + "epoch": 0.8560402557602815, + "grad_norm": 2.608793258666992, + "learning_rate": 1.0003558717709352e-06, + "loss": 0.5635, + "step": 11313 + }, + { + "epoch": 0.8561159244826151, + "grad_norm": 2.738176107406616, + "learning_rate": 9.99324157363053e-07, + "loss": 0.7301, + "step": 11314 + }, + { + "epoch": 0.8561915932049488, + "grad_norm": 2.0771257877349854, + "learning_rate": 9.982929439833684e-07, + "loss": 0.6853, + "step": 11315 + }, + { + "epoch": 0.8562672619272823, + "grad_norm": 2.2196590900421143, + "learning_rate": 9.972622316964602e-07, + "loss": 0.5945, + "step": 11316 + }, + { + "epoch": 0.856342930649616, + "grad_norm": 2.066857099533081, + "learning_rate": 9.962320205668747e-07, + "loss": 0.6929, + "step": 11317 + }, + { + "epoch": 0.8564185993719496, + "grad_norm": 3.5094408988952637, + "learning_rate": 9.9520231065913e-07, + "loss": 0.7241, + "step": 11318 + }, + { + "epoch": 0.8564942680942832, + "grad_norm": 2.093404531478882, + "learning_rate": 9.94173102037703e-07, + "loss": 0.6483, + "step": 11319 + }, + { + "epoch": 0.8565699368166169, + "grad_norm": 2.078911304473877, + "learning_rate": 9.931443947670527e-07, + "loss": 0.6671, + "step": 11320 + }, + { + "epoch": 0.8566456055389505, + "grad_norm": 1.9345077276229858, + "learning_rate": 9.92116188911599e-07, + "loss": 0.665, + "step": 11321 + }, + { + "epoch": 0.8567212742612841, + "grad_norm": 1.7748900651931763, + "learning_rate": 9.91088484535729e-07, + "loss": 0.6294, + "step": 11322 + }, + { + "epoch": 0.8567969429836177, + "grad_norm": 2.2130126953125, + "learning_rate": 9.900612817038015e-07, + "loss": 0.637, + "step": 11323 + }, + { + "epoch": 0.8568726117059513, + "grad_norm": 2.3196053504943848, + "learning_rate": 9.890345804801428e-07, + "loss": 0.5467, + "step": 11324 + }, + { + "epoch": 0.856948280428285, + "grad_norm": 3.2533650398254395, + "learning_rate": 9.880083809290526e-07, + "loss": 0.6086, + "step": 11325 + }, + { + "epoch": 0.8570239491506186, + "grad_norm": 2.1937711238861084, + "learning_rate": 9.869826831147895e-07, + "loss": 0.5794, + "step": 11326 + }, + { + "epoch": 0.8570996178729522, + "grad_norm": 2.005005121231079, + "learning_rate": 9.85957487101588e-07, + "loss": 0.5671, + "step": 11327 + }, + { + "epoch": 0.8571752865952859, + "grad_norm": 2.504263401031494, + "learning_rate": 9.84932792953649e-07, + "loss": 0.6214, + "step": 11328 + }, + { + "epoch": 0.8572509553176194, + "grad_norm": 2.071230888366699, + "learning_rate": 
9.83908600735142e-07, + "loss": 0.6633, + "step": 11329 + }, + { + "epoch": 0.8573266240399531, + "grad_norm": 2.571350574493408, + "learning_rate": 9.828849105102067e-07, + "loss": 0.6164, + "step": 11330 + }, + { + "epoch": 0.8574022927622867, + "grad_norm": 2.430696725845337, + "learning_rate": 9.81861722342948e-07, + "loss": 0.6413, + "step": 11331 + }, + { + "epoch": 0.8574779614846203, + "grad_norm": 3.106407403945923, + "learning_rate": 9.80839036297444e-07, + "loss": 0.5968, + "step": 11332 + }, + { + "epoch": 0.857553630206954, + "grad_norm": 2.656306505203247, + "learning_rate": 9.798168524377353e-07, + "loss": 0.621, + "step": 11333 + }, + { + "epoch": 0.8576292989292876, + "grad_norm": 2.564502000808716, + "learning_rate": 9.787951708278334e-07, + "loss": 0.8049, + "step": 11334 + }, + { + "epoch": 0.8577049676516212, + "grad_norm": 2.267310619354248, + "learning_rate": 9.77773991531726e-07, + "loss": 0.724, + "step": 11335 + }, + { + "epoch": 0.8577806363739549, + "grad_norm": 2.382852554321289, + "learning_rate": 9.767533146133558e-07, + "loss": 0.6997, + "step": 11336 + }, + { + "epoch": 0.8578563050962884, + "grad_norm": 3.9764015674591064, + "learning_rate": 9.757331401366431e-07, + "loss": 0.5756, + "step": 11337 + }, + { + "epoch": 0.8579319738186221, + "grad_norm": 2.423640727996826, + "learning_rate": 9.747134681654754e-07, + "loss": 0.6611, + "step": 11338 + }, + { + "epoch": 0.8580076425409557, + "grad_norm": 1.80607008934021, + "learning_rate": 9.736942987637069e-07, + "loss": 0.7413, + "step": 11339 + }, + { + "epoch": 0.8580833112632893, + "grad_norm": 2.182513952255249, + "learning_rate": 9.726756319951625e-07, + "loss": 0.81, + "step": 11340 + }, + { + "epoch": 0.858158979985623, + "grad_norm": 2.507810592651367, + "learning_rate": 9.716574679236322e-07, + "loss": 0.6897, + "step": 11341 + }, + { + "epoch": 0.8582346487079565, + "grad_norm": 2.9260153770446777, + "learning_rate": 9.706398066128786e-07, + "loss": 0.6551, + "step": 11342 + }, + { + "epoch": 0.8583103174302902, + "grad_norm": 1.9764418601989746, + "learning_rate": 9.696226481266323e-07, + "loss": 0.7505, + "step": 11343 + }, + { + "epoch": 0.8583859861526238, + "grad_norm": 2.561077356338501, + "learning_rate": 9.68605992528588e-07, + "loss": 0.6391, + "step": 11344 + }, + { + "epoch": 0.8584616548749574, + "grad_norm": 2.1961679458618164, + "learning_rate": 9.675898398824107e-07, + "loss": 0.7866, + "step": 11345 + }, + { + "epoch": 0.8585373235972911, + "grad_norm": 4.30953311920166, + "learning_rate": 9.66574190251741e-07, + "loss": 0.7338, + "step": 11346 + }, + { + "epoch": 0.8586129923196247, + "grad_norm": 5.359286308288574, + "learning_rate": 9.655590437001774e-07, + "loss": 0.6028, + "step": 11347 + }, + { + "epoch": 0.8586886610419583, + "grad_norm": 2.5870842933654785, + "learning_rate": 9.645444002912923e-07, + "loss": 0.6086, + "step": 11348 + }, + { + "epoch": 0.858764329764292, + "grad_norm": 2.1643459796905518, + "learning_rate": 9.63530260088627e-07, + "loss": 0.6189, + "step": 11349 + }, + { + "epoch": 0.8588399984866255, + "grad_norm": 2.0352864265441895, + "learning_rate": 9.625166231556905e-07, + "loss": 0.6887, + "step": 11350 + }, + { + "epoch": 0.8589156672089592, + "grad_norm": 2.4447758197784424, + "learning_rate": 9.615034895559582e-07, + "loss": 0.821, + "step": 11351 + }, + { + "epoch": 0.8589913359312928, + "grad_norm": 1.9808082580566406, + "learning_rate": 9.604908593528783e-07, + "loss": 0.8302, + "step": 11352 + }, + { + "epoch": 0.8590670046536264, + "grad_norm": 
2.222776174545288, + "learning_rate": 9.594787326098629e-07, + "loss": 0.6017, + "step": 11353 + }, + { + "epoch": 0.8591426733759601, + "grad_norm": 2.0380992889404297, + "learning_rate": 9.584671093902976e-07, + "loss": 0.6517, + "step": 11354 + }, + { + "epoch": 0.8592183420982936, + "grad_norm": 2.0588114261627197, + "learning_rate": 9.574559897575285e-07, + "loss": 0.768, + "step": 11355 + }, + { + "epoch": 0.8592940108206273, + "grad_norm": 2.0195462703704834, + "learning_rate": 9.564453737748789e-07, + "loss": 0.5548, + "step": 11356 + }, + { + "epoch": 0.859369679542961, + "grad_norm": 2.724370002746582, + "learning_rate": 9.554352615056375e-07, + "loss": 0.7525, + "step": 11357 + }, + { + "epoch": 0.8594453482652945, + "grad_norm": 4.9696855545043945, + "learning_rate": 9.544256530130582e-07, + "loss": 0.5823, + "step": 11358 + }, + { + "epoch": 0.8595210169876282, + "grad_norm": 7.338983058929443, + "learning_rate": 9.534165483603669e-07, + "loss": 0.6556, + "step": 11359 + }, + { + "epoch": 0.8595966857099618, + "grad_norm": 2.392385244369507, + "learning_rate": 9.524079476107569e-07, + "loss": 0.6524, + "step": 11360 + }, + { + "epoch": 0.8596723544322954, + "grad_norm": 2.5599746704101562, + "learning_rate": 9.513998508273906e-07, + "loss": 0.6916, + "step": 11361 + }, + { + "epoch": 0.8597480231546291, + "grad_norm": 2.130995750427246, + "learning_rate": 9.503922580733985e-07, + "loss": 0.6197, + "step": 11362 + }, + { + "epoch": 0.8598236918769626, + "grad_norm": 1.7371002435684204, + "learning_rate": 9.493851694118781e-07, + "loss": 0.8717, + "step": 11363 + }, + { + "epoch": 0.8598993605992963, + "grad_norm": 2.7299606800079346, + "learning_rate": 9.483785849058991e-07, + "loss": 0.7653, + "step": 11364 + }, + { + "epoch": 0.85997502932163, + "grad_norm": 2.7302086353302, + "learning_rate": 9.47372504618491e-07, + "loss": 0.6537, + "step": 11365 + }, + { + "epoch": 0.8600506980439635, + "grad_norm": 4.3424153327941895, + "learning_rate": 9.463669286126653e-07, + "loss": 0.6663, + "step": 11366 + }, + { + "epoch": 0.8601263667662972, + "grad_norm": 2.4264373779296875, + "learning_rate": 9.453618569513898e-07, + "loss": 0.7241, + "step": 11367 + }, + { + "epoch": 0.8602020354886307, + "grad_norm": 2.623892307281494, + "learning_rate": 9.443572896976091e-07, + "loss": 0.6928, + "step": 11368 + }, + { + "epoch": 0.8602777042109644, + "grad_norm": 2.2946858406066895, + "learning_rate": 9.433532269142278e-07, + "loss": 0.6384, + "step": 11369 + }, + { + "epoch": 0.860353372933298, + "grad_norm": 2.276153326034546, + "learning_rate": 9.423496686641248e-07, + "loss": 0.7294, + "step": 11370 + }, + { + "epoch": 0.8604290416556316, + "grad_norm": 1.7452298402786255, + "learning_rate": 9.413466150101505e-07, + "loss": 0.7299, + "step": 11371 + }, + { + "epoch": 0.8605047103779653, + "grad_norm": 3.4095396995544434, + "learning_rate": 9.403440660151139e-07, + "loss": 0.641, + "step": 11372 + }, + { + "epoch": 0.8605803791002989, + "grad_norm": 3.2750163078308105, + "learning_rate": 9.393420217417997e-07, + "loss": 0.7713, + "step": 11373 + }, + { + "epoch": 0.8606560478226325, + "grad_norm": 2.141160726547241, + "learning_rate": 9.383404822529598e-07, + "loss": 0.6203, + "step": 11374 + }, + { + "epoch": 0.8607317165449662, + "grad_norm": 2.0613651275634766, + "learning_rate": 9.373394476113149e-07, + "loss": 0.5827, + "step": 11375 + }, + { + "epoch": 0.8608073852672997, + "grad_norm": 2.971005916595459, + "learning_rate": 9.363389178795488e-07, + "loss": 0.8215, + "step": 11376 + }, 
+ { + "epoch": 0.8608830539896334, + "grad_norm": 2.2407732009887695, + "learning_rate": 9.353388931203216e-07, + "loss": 0.6456, + "step": 11377 + }, + { + "epoch": 0.860958722711967, + "grad_norm": 2.820349931716919, + "learning_rate": 9.343393733962601e-07, + "loss": 0.7552, + "step": 11378 + }, + { + "epoch": 0.8610343914343006, + "grad_norm": 2.415344476699829, + "learning_rate": 9.333403587699511e-07, + "loss": 0.712, + "step": 11379 + }, + { + "epoch": 0.8611100601566343, + "grad_norm": 2.1280593872070312, + "learning_rate": 9.323418493039609e-07, + "loss": 0.518, + "step": 11380 + }, + { + "epoch": 0.8611857288789678, + "grad_norm": 2.12552547454834, + "learning_rate": 9.31343845060818e-07, + "loss": 0.6181, + "step": 11381 + }, + { + "epoch": 0.8612613976013015, + "grad_norm": 2.4220640659332275, + "learning_rate": 9.303463461030208e-07, + "loss": 0.632, + "step": 11382 + }, + { + "epoch": 0.8613370663236352, + "grad_norm": 1.9439997673034668, + "learning_rate": 9.293493524930352e-07, + "loss": 0.511, + "step": 11383 + }, + { + "epoch": 0.8614127350459687, + "grad_norm": 2.1692566871643066, + "learning_rate": 9.283528642932972e-07, + "loss": 0.7734, + "step": 11384 + }, + { + "epoch": 0.8614884037683024, + "grad_norm": 2.1930227279663086, + "learning_rate": 9.27356881566209e-07, + "loss": 0.512, + "step": 11385 + }, + { + "epoch": 0.861564072490636, + "grad_norm": 2.2688231468200684, + "learning_rate": 9.263614043741437e-07, + "loss": 0.7379, + "step": 11386 + }, + { + "epoch": 0.8616397412129696, + "grad_norm": 1.992141842842102, + "learning_rate": 9.253664327794402e-07, + "loss": 0.7816, + "step": 11387 + }, + { + "epoch": 0.8617154099353033, + "grad_norm": 1.8397136926651, + "learning_rate": 9.243719668444079e-07, + "loss": 0.6154, + "step": 11388 + }, + { + "epoch": 0.8617910786576368, + "grad_norm": 2.1748602390289307, + "learning_rate": 9.23378006631324e-07, + "loss": 0.7455, + "step": 11389 + }, + { + "epoch": 0.8618667473799705, + "grad_norm": 2.877929925918579, + "learning_rate": 9.223845522024305e-07, + "loss": 0.7016, + "step": 11390 + }, + { + "epoch": 0.8619424161023042, + "grad_norm": 2.6606006622314453, + "learning_rate": 9.213916036199409e-07, + "loss": 0.7219, + "step": 11391 + }, + { + "epoch": 0.8620180848246377, + "grad_norm": 2.1790380477905273, + "learning_rate": 9.203991609460422e-07, + "loss": 0.6841, + "step": 11392 + }, + { + "epoch": 0.8620937535469714, + "grad_norm": 2.446531295776367, + "learning_rate": 9.19407224242879e-07, + "loss": 0.63, + "step": 11393 + }, + { + "epoch": 0.8621694222693049, + "grad_norm": 2.1365866661071777, + "learning_rate": 9.184157935725702e-07, + "loss": 0.7192, + "step": 11394 + }, + { + "epoch": 0.8622450909916386, + "grad_norm": 3.078911304473877, + "learning_rate": 9.174248689972045e-07, + "loss": 0.7079, + "step": 11395 + }, + { + "epoch": 0.8623207597139723, + "grad_norm": 1.73885977268219, + "learning_rate": 9.164344505788351e-07, + "loss": 0.7938, + "step": 11396 + }, + { + "epoch": 0.8623964284363058, + "grad_norm": 2.480379581451416, + "learning_rate": 9.154445383794863e-07, + "loss": 0.5902, + "step": 11397 + }, + { + "epoch": 0.8624720971586395, + "grad_norm": 2.1541407108306885, + "learning_rate": 9.144551324611486e-07, + "loss": 0.6969, + "step": 11398 + }, + { + "epoch": 0.8625477658809731, + "grad_norm": 2.4464011192321777, + "learning_rate": 9.134662328857826e-07, + "loss": 0.6242, + "step": 11399 + }, + { + "epoch": 0.8626234346033067, + "grad_norm": 2.167466640472412, + "learning_rate": 
9.124778397153175e-07, + "loss": 0.6878, + "step": 11400 + }, + { + "epoch": 0.8626991033256404, + "grad_norm": 2.1103248596191406, + "learning_rate": 9.114899530116459e-07, + "loss": 0.6499, + "step": 11401 + }, + { + "epoch": 0.8627747720479739, + "grad_norm": 1.9201300144195557, + "learning_rate": 9.105025728366354e-07, + "loss": 0.6265, + "step": 11402 + }, + { + "epoch": 0.8628504407703076, + "grad_norm": 2.0732924938201904, + "learning_rate": 9.095156992521204e-07, + "loss": 0.6943, + "step": 11403 + }, + { + "epoch": 0.8629261094926413, + "grad_norm": 2.583040952682495, + "learning_rate": 9.085293323198995e-07, + "loss": 0.6831, + "step": 11404 + }, + { + "epoch": 0.8630017782149748, + "grad_norm": 2.722351312637329, + "learning_rate": 9.075434721017414e-07, + "loss": 0.6832, + "step": 11405 + }, + { + "epoch": 0.8630774469373085, + "grad_norm": 2.765360116958618, + "learning_rate": 9.065581186593855e-07, + "loss": 0.6247, + "step": 11406 + }, + { + "epoch": 0.863153115659642, + "grad_norm": 2.3917152881622314, + "learning_rate": 9.055732720545377e-07, + "loss": 0.647, + "step": 11407 + }, + { + "epoch": 0.8632287843819757, + "grad_norm": 2.091081380844116, + "learning_rate": 9.045889323488724e-07, + "loss": 0.5273, + "step": 11408 + }, + { + "epoch": 0.8633044531043094, + "grad_norm": 2.154362201690674, + "learning_rate": 9.036050996040325e-07, + "loss": 0.5733, + "step": 11409 + }, + { + "epoch": 0.8633801218266429, + "grad_norm": 1.648404836654663, + "learning_rate": 9.026217738816286e-07, + "loss": 0.4534, + "step": 11410 + }, + { + "epoch": 0.8634557905489766, + "grad_norm": 2.2857422828674316, + "learning_rate": 9.016389552432365e-07, + "loss": 0.6723, + "step": 11411 + }, + { + "epoch": 0.8635314592713103, + "grad_norm": 2.5683135986328125, + "learning_rate": 9.006566437504079e-07, + "loss": 0.6025, + "step": 11412 + }, + { + "epoch": 0.8636071279936438, + "grad_norm": 5.81996488571167, + "learning_rate": 8.996748394646567e-07, + "loss": 0.7606, + "step": 11413 + }, + { + "epoch": 0.8636827967159775, + "grad_norm": 2.0632705688476562, + "learning_rate": 8.986935424474686e-07, + "loss": 0.4855, + "step": 11414 + }, + { + "epoch": 0.863758465438311, + "grad_norm": 2.0599944591522217, + "learning_rate": 8.977127527602925e-07, + "loss": 0.6972, + "step": 11415 + }, + { + "epoch": 0.8638341341606447, + "grad_norm": 2.6218225955963135, + "learning_rate": 8.967324704645483e-07, + "loss": 0.7039, + "step": 11416 + }, + { + "epoch": 0.8639098028829784, + "grad_norm": 2.5727875232696533, + "learning_rate": 8.957526956216269e-07, + "loss": 0.7001, + "step": 11417 + }, + { + "epoch": 0.8639854716053119, + "grad_norm": 2.259443998336792, + "learning_rate": 8.947734282928841e-07, + "loss": 0.6443, + "step": 11418 + }, + { + "epoch": 0.8640611403276456, + "grad_norm": 2.0039772987365723, + "learning_rate": 8.93794668539645e-07, + "loss": 0.6373, + "step": 11419 + }, + { + "epoch": 0.8641368090499791, + "grad_norm": 3.764800548553467, + "learning_rate": 8.928164164232015e-07, + "loss": 0.5713, + "step": 11420 + }, + { + "epoch": 0.8642124777723128, + "grad_norm": 2.341893196105957, + "learning_rate": 8.918386720048185e-07, + "loss": 0.6677, + "step": 11421 + }, + { + "epoch": 0.8642881464946465, + "grad_norm": 2.431959867477417, + "learning_rate": 8.908614353457182e-07, + "loss": 0.7105, + "step": 11422 + }, + { + "epoch": 0.86436381521698, + "grad_norm": 2.6695075035095215, + "learning_rate": 8.898847065071055e-07, + "loss": 0.7028, + "step": 11423 + }, + { + "epoch": 0.8644394839393137, + 
"grad_norm": 2.1305665969848633, + "learning_rate": 8.889084855501456e-07, + "loss": 0.6387, + "step": 11424 + }, + { + "epoch": 0.8645151526616474, + "grad_norm": 2.250269889831543, + "learning_rate": 8.879327725359684e-07, + "loss": 0.6831, + "step": 11425 + }, + { + "epoch": 0.8645908213839809, + "grad_norm": 2.085334300994873, + "learning_rate": 8.869575675256794e-07, + "loss": 0.8023, + "step": 11426 + }, + { + "epoch": 0.8646664901063146, + "grad_norm": 2.079393148422241, + "learning_rate": 8.859828705803463e-07, + "loss": 0.5462, + "step": 11427 + }, + { + "epoch": 0.8647421588286481, + "grad_norm": 2.8435254096984863, + "learning_rate": 8.850086817610126e-07, + "loss": 0.6934, + "step": 11428 + }, + { + "epoch": 0.8648178275509818, + "grad_norm": 2.8293187618255615, + "learning_rate": 8.840350011286813e-07, + "loss": 0.6062, + "step": 11429 + }, + { + "epoch": 0.8648934962733155, + "grad_norm": 1.95986008644104, + "learning_rate": 8.830618287443277e-07, + "loss": 0.6402, + "step": 11430 + }, + { + "epoch": 0.864969164995649, + "grad_norm": 1.9640858173370361, + "learning_rate": 8.820891646688961e-07, + "loss": 0.7183, + "step": 11431 + }, + { + "epoch": 0.8650448337179827, + "grad_norm": 2.166637659072876, + "learning_rate": 8.811170089632977e-07, + "loss": 0.7106, + "step": 11432 + }, + { + "epoch": 0.8651205024403162, + "grad_norm": 2.3195998668670654, + "learning_rate": 8.801453616884119e-07, + "loss": 0.605, + "step": 11433 + }, + { + "epoch": 0.8651961711626499, + "grad_norm": 2.1643240451812744, + "learning_rate": 8.791742229050869e-07, + "loss": 0.5808, + "step": 11434 + }, + { + "epoch": 0.8652718398849836, + "grad_norm": 2.442021131515503, + "learning_rate": 8.782035926741381e-07, + "loss": 0.6294, + "step": 11435 + }, + { + "epoch": 0.8653475086073171, + "grad_norm": 2.4725868701934814, + "learning_rate": 8.772334710563489e-07, + "loss": 0.5829, + "step": 11436 + }, + { + "epoch": 0.8654231773296508, + "grad_norm": 2.438480854034424, + "learning_rate": 8.762638581124707e-07, + "loss": 0.6427, + "step": 11437 + }, + { + "epoch": 0.8654988460519845, + "grad_norm": 2.3041131496429443, + "learning_rate": 8.752947539032268e-07, + "loss": 0.564, + "step": 11438 + }, + { + "epoch": 0.865574514774318, + "grad_norm": 1.981211543083191, + "learning_rate": 8.74326158489304e-07, + "loss": 0.7893, + "step": 11439 + }, + { + "epoch": 0.8656501834966517, + "grad_norm": 4.369067668914795, + "learning_rate": 8.733580719313574e-07, + "loss": 0.6974, + "step": 11440 + }, + { + "epoch": 0.8657258522189852, + "grad_norm": 2.5769243240356445, + "learning_rate": 8.723904942900137e-07, + "loss": 0.5311, + "step": 11441 + }, + { + "epoch": 0.8658015209413189, + "grad_norm": 2.2570548057556152, + "learning_rate": 8.714234256258654e-07, + "loss": 0.6526, + "step": 11442 + }, + { + "epoch": 0.8658771896636526, + "grad_norm": 1.7594521045684814, + "learning_rate": 8.704568659994721e-07, + "loss": 0.6366, + "step": 11443 + }, + { + "epoch": 0.8659528583859861, + "grad_norm": 1.9956740140914917, + "learning_rate": 8.694908154713652e-07, + "loss": 0.6423, + "step": 11444 + }, + { + "epoch": 0.8660285271083198, + "grad_norm": 2.2370998859405518, + "learning_rate": 8.685252741020405e-07, + "loss": 0.6358, + "step": 11445 + }, + { + "epoch": 0.8661041958306533, + "grad_norm": 2.284075975418091, + "learning_rate": 8.675602419519646e-07, + "loss": 0.6383, + "step": 11446 + }, + { + "epoch": 0.866179864552987, + "grad_norm": 2.2030980587005615, + "learning_rate": 8.665957190815671e-07, + "loss": 0.5712, + 
"step": 11447 + }, + { + "epoch": 0.8662555332753207, + "grad_norm": 2.6359593868255615, + "learning_rate": 8.656317055512537e-07, + "loss": 0.6499, + "step": 11448 + }, + { + "epoch": 0.8663312019976542, + "grad_norm": 14.77769660949707, + "learning_rate": 8.646682014213941e-07, + "loss": 0.8428, + "step": 11449 + }, + { + "epoch": 0.8664068707199879, + "grad_norm": 2.7199504375457764, + "learning_rate": 8.637052067523231e-07, + "loss": 0.6286, + "step": 11450 + }, + { + "epoch": 0.8664825394423216, + "grad_norm": 1.821079134941101, + "learning_rate": 8.627427216043474e-07, + "loss": 0.578, + "step": 11451 + }, + { + "epoch": 0.8665582081646551, + "grad_norm": 2.893192768096924, + "learning_rate": 8.617807460377428e-07, + "loss": 0.7833, + "step": 11452 + }, + { + "epoch": 0.8666338768869888, + "grad_norm": 2.2512125968933105, + "learning_rate": 8.608192801127491e-07, + "loss": 0.7754, + "step": 11453 + }, + { + "epoch": 0.8667095456093223, + "grad_norm": 1.8546165227890015, + "learning_rate": 8.598583238895782e-07, + "loss": 0.457, + "step": 11454 + }, + { + "epoch": 0.866785214331656, + "grad_norm": 2.225909471511841, + "learning_rate": 8.588978774284069e-07, + "loss": 0.6653, + "step": 11455 + }, + { + "epoch": 0.8668608830539897, + "grad_norm": 2.2338666915893555, + "learning_rate": 8.57937940789382e-07, + "loss": 0.6326, + "step": 11456 + }, + { + "epoch": 0.8669365517763232, + "grad_norm": 2.0527563095092773, + "learning_rate": 8.569785140326197e-07, + "loss": 0.8369, + "step": 11457 + }, + { + "epoch": 0.8670122204986569, + "grad_norm": 3.0348801612854004, + "learning_rate": 8.560195972181965e-07, + "loss": 0.705, + "step": 11458 + }, + { + "epoch": 0.8670878892209904, + "grad_norm": 2.616718292236328, + "learning_rate": 8.550611904061698e-07, + "loss": 0.7044, + "step": 11459 + }, + { + "epoch": 0.8671635579433241, + "grad_norm": 2.5653347969055176, + "learning_rate": 8.541032936565564e-07, + "loss": 0.7478, + "step": 11460 + }, + { + "epoch": 0.8672392266656578, + "grad_norm": 2.5321779251098633, + "learning_rate": 8.531459070293403e-07, + "loss": 0.5518, + "step": 11461 + }, + { + "epoch": 0.8673148953879913, + "grad_norm": 2.7409920692443848, + "learning_rate": 8.521890305844775e-07, + "loss": 0.6368, + "step": 11462 + }, + { + "epoch": 0.867390564110325, + "grad_norm": 2.6878130435943604, + "learning_rate": 8.512326643818912e-07, + "loss": 0.6114, + "step": 11463 + }, + { + "epoch": 0.8674662328326587, + "grad_norm": 2.02453351020813, + "learning_rate": 8.502768084814714e-07, + "loss": 0.6421, + "step": 11464 + }, + { + "epoch": 0.8675419015549922, + "grad_norm": 2.029902696609497, + "learning_rate": 8.493214629430773e-07, + "loss": 0.6868, + "step": 11465 + }, + { + "epoch": 0.8676175702773259, + "grad_norm": 2.0235416889190674, + "learning_rate": 8.483666278265348e-07, + "loss": 0.6933, + "step": 11466 + }, + { + "epoch": 0.8676932389996594, + "grad_norm": 1.4413191080093384, + "learning_rate": 8.474123031916425e-07, + "loss": 0.7743, + "step": 11467 + }, + { + "epoch": 0.8677689077219931, + "grad_norm": 2.967958688735962, + "learning_rate": 8.464584890981572e-07, + "loss": 0.7331, + "step": 11468 + }, + { + "epoch": 0.8678445764443268, + "grad_norm": 1.6408123970031738, + "learning_rate": 8.455051856058142e-07, + "loss": 0.6527, + "step": 11469 + }, + { + "epoch": 0.8679202451666603, + "grad_norm": 2.9439444541931152, + "learning_rate": 8.44552392774311e-07, + "loss": 0.6358, + "step": 11470 + }, + { + "epoch": 0.867995913888994, + "grad_norm": 2.3100738525390625, + 
"learning_rate": 8.436001106633165e-07, + "loss": 0.7387, + "step": 11471 + }, + { + "epoch": 0.8680715826113276, + "grad_norm": 2.5663743019104004, + "learning_rate": 8.426483393324633e-07, + "loss": 0.658, + "step": 11472 + }, + { + "epoch": 0.8681472513336612, + "grad_norm": 3.1118669509887695, + "learning_rate": 8.416970788413527e-07, + "loss": 0.7758, + "step": 11473 + }, + { + "epoch": 0.8682229200559949, + "grad_norm": 2.1538186073303223, + "learning_rate": 8.407463292495617e-07, + "loss": 0.6391, + "step": 11474 + }, + { + "epoch": 0.8682985887783284, + "grad_norm": 2.083836078643799, + "learning_rate": 8.39796090616625e-07, + "loss": 0.6682, + "step": 11475 + }, + { + "epoch": 0.8683742575006621, + "grad_norm": 2.3815724849700928, + "learning_rate": 8.38846363002049e-07, + "loss": 0.4637, + "step": 11476 + }, + { + "epoch": 0.8684499262229958, + "grad_norm": 1.9574769735336304, + "learning_rate": 8.378971464653112e-07, + "loss": 0.7101, + "step": 11477 + }, + { + "epoch": 0.8685255949453293, + "grad_norm": 2.0645639896392822, + "learning_rate": 8.369484410658537e-07, + "loss": 0.5948, + "step": 11478 + }, + { + "epoch": 0.868601263667663, + "grad_norm": 2.0333096981048584, + "learning_rate": 8.360002468630862e-07, + "loss": 0.6257, + "step": 11479 + }, + { + "epoch": 0.8686769323899965, + "grad_norm": 2.2880470752716064, + "learning_rate": 8.350525639163892e-07, + "loss": 0.7798, + "step": 11480 + }, + { + "epoch": 0.8687526011123302, + "grad_norm": 2.19171404838562, + "learning_rate": 8.341053922851111e-07, + "loss": 0.7149, + "step": 11481 + }, + { + "epoch": 0.8688282698346639, + "grad_norm": 2.084751605987549, + "learning_rate": 8.331587320285638e-07, + "loss": 0.5851, + "step": 11482 + }, + { + "epoch": 0.8689039385569974, + "grad_norm": 2.067612886428833, + "learning_rate": 8.322125832060294e-07, + "loss": 0.6274, + "step": 11483 + }, + { + "epoch": 0.8689796072793311, + "grad_norm": 2.2991397380828857, + "learning_rate": 8.312669458767629e-07, + "loss": 0.5268, + "step": 11484 + }, + { + "epoch": 0.8690552760016648, + "grad_norm": 2.2263104915618896, + "learning_rate": 8.303218200999817e-07, + "loss": 0.642, + "step": 11485 + }, + { + "epoch": 0.8691309447239983, + "grad_norm": 2.2175941467285156, + "learning_rate": 8.293772059348716e-07, + "loss": 0.5625, + "step": 11486 + }, + { + "epoch": 0.869206613446332, + "grad_norm": 2.1039774417877197, + "learning_rate": 8.28433103440587e-07, + "loss": 0.6794, + "step": 11487 + }, + { + "epoch": 0.8692822821686655, + "grad_norm": 2.4359757900238037, + "learning_rate": 8.27489512676252e-07, + "loss": 0.5836, + "step": 11488 + }, + { + "epoch": 0.8693579508909992, + "grad_norm": 2.079524278640747, + "learning_rate": 8.265464337009572e-07, + "loss": 0.6959, + "step": 11489 + }, + { + "epoch": 0.8694336196133329, + "grad_norm": 2.333555221557617, + "learning_rate": 8.256038665737602e-07, + "loss": 0.766, + "step": 11490 + }, + { + "epoch": 0.8695092883356664, + "grad_norm": 2.577230215072632, + "learning_rate": 8.246618113536889e-07, + "loss": 0.6448, + "step": 11491 + }, + { + "epoch": 0.8695849570580001, + "grad_norm": 2.1624553203582764, + "learning_rate": 8.237202680997381e-07, + "loss": 0.6679, + "step": 11492 + }, + { + "epoch": 0.8696606257803337, + "grad_norm": 2.065601110458374, + "learning_rate": 8.227792368708686e-07, + "loss": 0.5157, + "step": 11493 + }, + { + "epoch": 0.8697362945026673, + "grad_norm": 2.3269245624542236, + "learning_rate": 8.218387177260094e-07, + "loss": 0.7201, + "step": 11494 + }, + { + "epoch": 
0.869811963225001, + "grad_norm": 2.024136543273926, + "learning_rate": 8.208987107240642e-07, + "loss": 0.5877, + "step": 11495 + }, + { + "epoch": 0.8698876319473345, + "grad_norm": 2.4356894493103027, + "learning_rate": 8.19959215923895e-07, + "loss": 0.6413, + "step": 11496 + }, + { + "epoch": 0.8699633006696682, + "grad_norm": 2.5709826946258545, + "learning_rate": 8.190202333843368e-07, + "loss": 0.7845, + "step": 11497 + }, + { + "epoch": 0.8700389693920019, + "grad_norm": 2.712878704071045, + "learning_rate": 8.180817631641923e-07, + "loss": 0.623, + "step": 11498 + }, + { + "epoch": 0.8701146381143354, + "grad_norm": 3.3466601371765137, + "learning_rate": 8.171438053222318e-07, + "loss": 0.6294, + "step": 11499 + }, + { + "epoch": 0.8701903068366691, + "grad_norm": 2.0400924682617188, + "learning_rate": 8.162063599171923e-07, + "loss": 0.6766, + "step": 11500 + }, + { + "epoch": 0.8702659755590026, + "grad_norm": 2.169499397277832, + "learning_rate": 8.152694270077796e-07, + "loss": 0.6908, + "step": 11501 + }, + { + "epoch": 0.8703416442813363, + "grad_norm": 2.0179340839385986, + "learning_rate": 8.143330066526689e-07, + "loss": 0.7238, + "step": 11502 + }, + { + "epoch": 0.87041731300367, + "grad_norm": 2.6831698417663574, + "learning_rate": 8.133970989105024e-07, + "loss": 0.6951, + "step": 11503 + }, + { + "epoch": 0.8704929817260035, + "grad_norm": 2.6158101558685303, + "learning_rate": 8.12461703839884e-07, + "loss": 0.561, + "step": 11504 + }, + { + "epoch": 0.8705686504483372, + "grad_norm": 2.133500337600708, + "learning_rate": 8.115268214993981e-07, + "loss": 0.7176, + "step": 11505 + }, + { + "epoch": 0.8706443191706708, + "grad_norm": 2.4899628162384033, + "learning_rate": 8.105924519475886e-07, + "loss": 0.7863, + "step": 11506 + }, + { + "epoch": 0.8707199878930044, + "grad_norm": 2.085965394973755, + "learning_rate": 8.096585952429668e-07, + "loss": 0.6698, + "step": 11507 + }, + { + "epoch": 0.8707956566153381, + "grad_norm": 2.165194272994995, + "learning_rate": 8.08725251444013e-07, + "loss": 0.7282, + "step": 11508 + }, + { + "epoch": 0.8708713253376716, + "grad_norm": 2.1750943660736084, + "learning_rate": 8.077924206091794e-07, + "loss": 0.7459, + "step": 11509 + }, + { + "epoch": 0.8709469940600053, + "grad_norm": 2.4543588161468506, + "learning_rate": 8.068601027968802e-07, + "loss": 0.7384, + "step": 11510 + }, + { + "epoch": 0.871022662782339, + "grad_norm": 2.087395668029785, + "learning_rate": 8.059282980655007e-07, + "loss": 0.5855, + "step": 11511 + }, + { + "epoch": 0.8710983315046725, + "grad_norm": 2.3441057205200195, + "learning_rate": 8.049970064733953e-07, + "loss": 0.7226, + "step": 11512 + }, + { + "epoch": 0.8711740002270062, + "grad_norm": 1.9657081365585327, + "learning_rate": 8.040662280788844e-07, + "loss": 0.631, + "step": 11513 + }, + { + "epoch": 0.8712496689493398, + "grad_norm": 3.1376612186431885, + "learning_rate": 8.031359629402512e-07, + "loss": 0.7256, + "step": 11514 + }, + { + "epoch": 0.8713253376716734, + "grad_norm": 2.253770112991333, + "learning_rate": 8.022062111157583e-07, + "loss": 0.5113, + "step": 11515 + }, + { + "epoch": 0.8714010063940071, + "grad_norm": 2.1332924365997314, + "learning_rate": 8.01276972663627e-07, + "loss": 0.6469, + "step": 11516 + }, + { + "epoch": 0.8714766751163406, + "grad_norm": 2.4022717475891113, + "learning_rate": 8.003482476420517e-07, + "loss": 0.6562, + "step": 11517 + }, + { + "epoch": 0.8715523438386743, + "grad_norm": 2.587195634841919, + "learning_rate": 7.99420036109188e-07, + 
"loss": 0.7556, + "step": 11518 + }, + { + "epoch": 0.8716280125610079, + "grad_norm": 2.437520742416382, + "learning_rate": 7.984923381231634e-07, + "loss": 0.8001, + "step": 11519 + }, + { + "epoch": 0.8717036812833415, + "grad_norm": 2.373514413833618, + "learning_rate": 7.975651537420793e-07, + "loss": 0.6267, + "step": 11520 + }, + { + "epoch": 0.8717793500056752, + "grad_norm": 1.9270378351211548, + "learning_rate": 7.966384830239933e-07, + "loss": 0.5706, + "step": 11521 + }, + { + "epoch": 0.8718550187280087, + "grad_norm": 2.1025631427764893, + "learning_rate": 7.957123260269391e-07, + "loss": 0.6488, + "step": 11522 + }, + { + "epoch": 0.8719306874503424, + "grad_norm": 2.036912441253662, + "learning_rate": 7.947866828089142e-07, + "loss": 0.6159, + "step": 11523 + }, + { + "epoch": 0.8720063561726761, + "grad_norm": 2.3144853115081787, + "learning_rate": 7.938615534278862e-07, + "loss": 0.7151, + "step": 11524 + }, + { + "epoch": 0.8720820248950096, + "grad_norm": 2.1579651832580566, + "learning_rate": 7.929369379417899e-07, + "loss": 0.6715, + "step": 11525 + }, + { + "epoch": 0.8721576936173433, + "grad_norm": 2.2118868827819824, + "learning_rate": 7.920128364085268e-07, + "loss": 0.6239, + "step": 11526 + }, + { + "epoch": 0.8722333623396769, + "grad_norm": 2.359009265899658, + "learning_rate": 7.910892488859698e-07, + "loss": 0.6618, + "step": 11527 + }, + { + "epoch": 0.8723090310620105, + "grad_norm": 2.0738399028778076, + "learning_rate": 7.901661754319534e-07, + "loss": 0.5986, + "step": 11528 + }, + { + "epoch": 0.8723846997843442, + "grad_norm": 3.8261353969573975, + "learning_rate": 7.892436161042826e-07, + "loss": 0.7576, + "step": 11529 + }, + { + "epoch": 0.8724603685066777, + "grad_norm": 1.8722835779190063, + "learning_rate": 7.883215709607351e-07, + "loss": 0.7365, + "step": 11530 + }, + { + "epoch": 0.8725360372290114, + "grad_norm": 1.9653728008270264, + "learning_rate": 7.874000400590526e-07, + "loss": 0.6152, + "step": 11531 + }, + { + "epoch": 0.872611705951345, + "grad_norm": 3.910702705383301, + "learning_rate": 7.864790234569411e-07, + "loss": 0.5375, + "step": 11532 + }, + { + "epoch": 0.8726873746736786, + "grad_norm": 1.5178172588348389, + "learning_rate": 7.855585212120783e-07, + "loss": 0.7787, + "step": 11533 + }, + { + "epoch": 0.8727630433960123, + "grad_norm": 2.157939910888672, + "learning_rate": 7.846385333821103e-07, + "loss": 0.5934, + "step": 11534 + }, + { + "epoch": 0.8728387121183458, + "grad_norm": 2.1008949279785156, + "learning_rate": 7.837190600246489e-07, + "loss": 0.7366, + "step": 11535 + }, + { + "epoch": 0.8729143808406795, + "grad_norm": 2.356288433074951, + "learning_rate": 7.82800101197274e-07, + "loss": 0.5139, + "step": 11536 + }, + { + "epoch": 0.8729900495630132, + "grad_norm": 1.809434413909912, + "learning_rate": 7.818816569575346e-07, + "loss": 0.5801, + "step": 11537 + }, + { + "epoch": 0.8730657182853467, + "grad_norm": 4.456824779510498, + "learning_rate": 7.809637273629486e-07, + "loss": 0.732, + "step": 11538 + }, + { + "epoch": 0.8731413870076804, + "grad_norm": 2.023094415664673, + "learning_rate": 7.800463124709952e-07, + "loss": 0.4706, + "step": 11539 + }, + { + "epoch": 0.873217055730014, + "grad_norm": 2.1191153526306152, + "learning_rate": 7.791294123391274e-07, + "loss": 0.6217, + "step": 11540 + }, + { + "epoch": 0.8732927244523476, + "grad_norm": 3.088059186935425, + "learning_rate": 7.782130270247681e-07, + "loss": 0.727, + "step": 11541 + }, + { + "epoch": 0.8733683931746813, + "grad_norm": 
2.3737430572509766, + "learning_rate": 7.772971565852997e-07, + "loss": 0.6377, + "step": 11542 + }, + { + "epoch": 0.8734440618970148, + "grad_norm": 3.2077624797821045, + "learning_rate": 7.76381801078079e-07, + "loss": 0.7562, + "step": 11543 + }, + { + "epoch": 0.8735197306193485, + "grad_norm": 2.7162673473358154, + "learning_rate": 7.754669605604284e-07, + "loss": 0.5585, + "step": 11544 + }, + { + "epoch": 0.8735953993416821, + "grad_norm": 1.9874165058135986, + "learning_rate": 7.745526350896388e-07, + "loss": 0.695, + "step": 11545 + }, + { + "epoch": 0.8736710680640157, + "grad_norm": 2.2441554069519043, + "learning_rate": 7.736388247229667e-07, + "loss": 0.716, + "step": 11546 + }, + { + "epoch": 0.8737467367863494, + "grad_norm": 2.2191619873046875, + "learning_rate": 7.727255295176391e-07, + "loss": 0.725, + "step": 11547 + }, + { + "epoch": 0.873822405508683, + "grad_norm": 2.3463375568389893, + "learning_rate": 7.718127495308483e-07, + "loss": 0.7155, + "step": 11548 + }, + { + "epoch": 0.8738980742310166, + "grad_norm": 2.282282590866089, + "learning_rate": 7.709004848197588e-07, + "loss": 0.8187, + "step": 11549 + }, + { + "epoch": 0.8739737429533503, + "grad_norm": 2.2139830589294434, + "learning_rate": 7.699887354414935e-07, + "loss": 0.7518, + "step": 11550 + }, + { + "epoch": 0.8740494116756838, + "grad_norm": 2.0290632247924805, + "learning_rate": 7.69077501453154e-07, + "loss": 0.6057, + "step": 11551 + }, + { + "epoch": 0.8741250803980175, + "grad_norm": 1.9807419776916504, + "learning_rate": 7.681667829118057e-07, + "loss": 0.6753, + "step": 11552 + }, + { + "epoch": 0.8742007491203511, + "grad_norm": 2.6558451652526855, + "learning_rate": 7.672565798744757e-07, + "loss": 0.6585, + "step": 11553 + }, + { + "epoch": 0.8742764178426847, + "grad_norm": 2.189072847366333, + "learning_rate": 7.663468923981677e-07, + "loss": 0.6509, + "step": 11554 + }, + { + "epoch": 0.8743520865650184, + "grad_norm": 4.953122138977051, + "learning_rate": 7.654377205398479e-07, + "loss": 0.7143, + "step": 11555 + }, + { + "epoch": 0.874427755287352, + "grad_norm": 1.9904096126556396, + "learning_rate": 7.64529064356451e-07, + "loss": 0.7092, + "step": 11556 + }, + { + "epoch": 0.8745034240096856, + "grad_norm": 2.697661876678467, + "learning_rate": 7.636209239048823e-07, + "loss": 0.6954, + "step": 11557 + }, + { + "epoch": 0.8745790927320192, + "grad_norm": 2.0726523399353027, + "learning_rate": 7.627132992420103e-07, + "loss": 0.7002, + "step": 11558 + }, + { + "epoch": 0.8746547614543528, + "grad_norm": 3.2430927753448486, + "learning_rate": 7.618061904246736e-07, + "loss": 0.6645, + "step": 11559 + }, + { + "epoch": 0.8747304301766865, + "grad_norm": 1.8486747741699219, + "learning_rate": 7.608995975096797e-07, + "loss": 0.6222, + "step": 11560 + }, + { + "epoch": 0.8748060988990201, + "grad_norm": 1.8390332460403442, + "learning_rate": 7.599935205538003e-07, + "loss": 0.6021, + "step": 11561 + }, + { + "epoch": 0.8748817676213537, + "grad_norm": 2.2097909450531006, + "learning_rate": 7.590879596137789e-07, + "loss": 0.7386, + "step": 11562 + }, + { + "epoch": 0.8749574363436874, + "grad_norm": 2.2231502532958984, + "learning_rate": 7.581829147463252e-07, + "loss": 0.7924, + "step": 11563 + }, + { + "epoch": 0.8750331050660209, + "grad_norm": 3.517443895339966, + "learning_rate": 7.572783860081139e-07, + "loss": 0.7055, + "step": 11564 + }, + { + "epoch": 0.8751087737883546, + "grad_norm": 2.743074417114258, + "learning_rate": 7.563743734557877e-07, + "loss": 0.6501, + "step": 11565 
+ }, + { + "epoch": 0.8751844425106882, + "grad_norm": 2.1922786235809326, + "learning_rate": 7.554708771459651e-07, + "loss": 0.7249, + "step": 11566 + }, + { + "epoch": 0.8752601112330218, + "grad_norm": 2.300901174545288, + "learning_rate": 7.5456789713522e-07, + "loss": 0.6253, + "step": 11567 + }, + { + "epoch": 0.8753357799553555, + "grad_norm": 2.120061159133911, + "learning_rate": 7.536654334801022e-07, + "loss": 0.6272, + "step": 11568 + }, + { + "epoch": 0.875411448677689, + "grad_norm": 2.244152307510376, + "learning_rate": 7.527634862371274e-07, + "loss": 0.7696, + "step": 11569 + }, + { + "epoch": 0.8754871174000227, + "grad_norm": 2.225641965866089, + "learning_rate": 7.518620554627785e-07, + "loss": 0.598, + "step": 11570 + }, + { + "epoch": 0.8755627861223563, + "grad_norm": 2.236870288848877, + "learning_rate": 7.509611412135034e-07, + "loss": 0.7316, + "step": 11571 + }, + { + "epoch": 0.8756384548446899, + "grad_norm": 2.152217149734497, + "learning_rate": 7.500607435457238e-07, + "loss": 0.7231, + "step": 11572 + }, + { + "epoch": 0.8757141235670236, + "grad_norm": 2.509950637817383, + "learning_rate": 7.491608625158226e-07, + "loss": 0.6495, + "step": 11573 + }, + { + "epoch": 0.8757897922893572, + "grad_norm": 2.5385403633117676, + "learning_rate": 7.482614981801579e-07, + "loss": 0.652, + "step": 11574 + }, + { + "epoch": 0.8758654610116908, + "grad_norm": 2.698321580886841, + "learning_rate": 7.473626505950445e-07, + "loss": 0.7599, + "step": 11575 + }, + { + "epoch": 0.8759411297340245, + "grad_norm": 2.6774983406066895, + "learning_rate": 7.464643198167735e-07, + "loss": 0.6847, + "step": 11576 + }, + { + "epoch": 0.876016798456358, + "grad_norm": 2.8981354236602783, + "learning_rate": 7.455665059016056e-07, + "loss": 0.7735, + "step": 11577 + }, + { + "epoch": 0.8760924671786917, + "grad_norm": 1.9417154788970947, + "learning_rate": 7.446692089057583e-07, + "loss": 0.5627, + "step": 11578 + }, + { + "epoch": 0.8761681359010253, + "grad_norm": 2.02998948097229, + "learning_rate": 7.437724288854273e-07, + "loss": 0.6576, + "step": 11579 + }, + { + "epoch": 0.8762438046233589, + "grad_norm": 2.600839614868164, + "learning_rate": 7.428761658967697e-07, + "loss": 0.7377, + "step": 11580 + }, + { + "epoch": 0.8763194733456926, + "grad_norm": 2.005692720413208, + "learning_rate": 7.419804199959138e-07, + "loss": 0.5753, + "step": 11581 + }, + { + "epoch": 0.8763951420680262, + "grad_norm": 2.1509876251220703, + "learning_rate": 7.410851912389536e-07, + "loss": 0.7596, + "step": 11582 + }, + { + "epoch": 0.8764708107903598, + "grad_norm": 2.080975294113159, + "learning_rate": 7.401904796819512e-07, + "loss": 0.5581, + "step": 11583 + }, + { + "epoch": 0.8765464795126934, + "grad_norm": 2.399707317352295, + "learning_rate": 7.392962853809388e-07, + "loss": 0.4842, + "step": 11584 + }, + { + "epoch": 0.876622148235027, + "grad_norm": 2.309359550476074, + "learning_rate": 7.384026083919087e-07, + "loss": 0.6132, + "step": 11585 + }, + { + "epoch": 0.8766978169573607, + "grad_norm": 2.053427219390869, + "learning_rate": 7.375094487708281e-07, + "loss": 0.6819, + "step": 11586 + }, + { + "epoch": 0.8767734856796943, + "grad_norm": 1.9425896406173706, + "learning_rate": 7.366168065736302e-07, + "loss": 0.6406, + "step": 11587 + }, + { + "epoch": 0.8768491544020279, + "grad_norm": 2.2914395332336426, + "learning_rate": 7.357246818562174e-07, + "loss": 0.6731, + "step": 11588 + }, + { + "epoch": 0.8769248231243616, + "grad_norm": 1.905128836631775, + "learning_rate": 
7.348330746744529e-07, + "loss": 0.6197, + "step": 11589 + }, + { + "epoch": 0.8770004918466952, + "grad_norm": 2.3496506214141846, + "learning_rate": 7.339419850841741e-07, + "loss": 0.6112, + "step": 11590 + }, + { + "epoch": 0.8770761605690288, + "grad_norm": 2.184185743331909, + "learning_rate": 7.330514131411843e-07, + "loss": 0.4918, + "step": 11591 + }, + { + "epoch": 0.8771518292913624, + "grad_norm": 2.560593843460083, + "learning_rate": 7.321613589012529e-07, + "loss": 0.6898, + "step": 11592 + }, + { + "epoch": 0.877227498013696, + "grad_norm": 1.9485561847686768, + "learning_rate": 7.312718224201194e-07, + "loss": 0.6156, + "step": 11593 + }, + { + "epoch": 0.8773031667360297, + "grad_norm": 2.1661360263824463, + "learning_rate": 7.303828037534881e-07, + "loss": 0.5933, + "step": 11594 + }, + { + "epoch": 0.8773788354583633, + "grad_norm": 2.1465418338775635, + "learning_rate": 7.294943029570345e-07, + "loss": 0.5613, + "step": 11595 + }, + { + "epoch": 0.8774545041806969, + "grad_norm": 2.0607333183288574, + "learning_rate": 7.286063200863953e-07, + "loss": 0.4852, + "step": 11596 + }, + { + "epoch": 0.8775301729030305, + "grad_norm": 2.41105580329895, + "learning_rate": 7.277188551971817e-07, + "loss": 0.6185, + "step": 11597 + }, + { + "epoch": 0.8776058416253641, + "grad_norm": 1.9282902479171753, + "learning_rate": 7.268319083449715e-07, + "loss": 0.506, + "step": 11598 + }, + { + "epoch": 0.8776815103476978, + "grad_norm": 2.118765354156494, + "learning_rate": 7.259454795853041e-07, + "loss": 0.856, + "step": 11599 + }, + { + "epoch": 0.8777571790700314, + "grad_norm": 2.2869462966918945, + "learning_rate": 7.250595689736921e-07, + "loss": 0.6693, + "step": 11600 + }, + { + "epoch": 0.877832847792365, + "grad_norm": 2.2871768474578857, + "learning_rate": 7.241741765656124e-07, + "loss": 0.6692, + "step": 11601 + }, + { + "epoch": 0.8779085165146987, + "grad_norm": 2.093322992324829, + "learning_rate": 7.232893024165172e-07, + "loss": 0.6427, + "step": 11602 + }, + { + "epoch": 0.8779841852370323, + "grad_norm": 1.7536505460739136, + "learning_rate": 7.224049465818136e-07, + "loss": 0.646, + "step": 11603 + }, + { + "epoch": 0.8780598539593659, + "grad_norm": 2.1361818313598633, + "learning_rate": 7.215211091168859e-07, + "loss": 0.7456, + "step": 11604 + }, + { + "epoch": 0.8781355226816995, + "grad_norm": 2.706083297729492, + "learning_rate": 7.206377900770812e-07, + "loss": 0.5506, + "step": 11605 + }, + { + "epoch": 0.8782111914040331, + "grad_norm": 2.02065372467041, + "learning_rate": 7.19754989517718e-07, + "loss": 0.6444, + "step": 11606 + }, + { + "epoch": 0.8782868601263668, + "grad_norm": 2.013063669204712, + "learning_rate": 7.188727074940781e-07, + "loss": 0.7695, + "step": 11607 + }, + { + "epoch": 0.8783625288487004, + "grad_norm": 1.6234900951385498, + "learning_rate": 7.179909440614135e-07, + "loss": 0.8568, + "step": 11608 + }, + { + "epoch": 0.878438197571034, + "grad_norm": 2.600998878479004, + "learning_rate": 7.171096992749458e-07, + "loss": 0.5912, + "step": 11609 + }, + { + "epoch": 0.8785138662933676, + "grad_norm": 2.169203281402588, + "learning_rate": 7.162289731898561e-07, + "loss": 0.6923, + "step": 11610 + }, + { + "epoch": 0.8785895350157013, + "grad_norm": 2.445180892944336, + "learning_rate": 7.153487658613019e-07, + "loss": 0.5911, + "step": 11611 + }, + { + "epoch": 0.8786652037380349, + "grad_norm": 2.2180263996124268, + "learning_rate": 7.144690773444034e-07, + "loss": 0.6728, + "step": 11612 + }, + { + "epoch": 0.8787408724603685, + 
"grad_norm": 2.4425292015075684, + "learning_rate": 7.135899076942506e-07, + "loss": 0.6479, + "step": 11613 + }, + { + "epoch": 0.8788165411827021, + "grad_norm": 2.1483867168426514, + "learning_rate": 7.127112569658982e-07, + "loss": 0.6243, + "step": 11614 + }, + { + "epoch": 0.8788922099050358, + "grad_norm": 1.742142677307129, + "learning_rate": 7.118331252143724e-07, + "loss": 0.6261, + "step": 11615 + }, + { + "epoch": 0.8789678786273694, + "grad_norm": 2.120514392852783, + "learning_rate": 7.109555124946641e-07, + "loss": 0.7177, + "step": 11616 + }, + { + "epoch": 0.879043547349703, + "grad_norm": 2.504844903945923, + "learning_rate": 7.100784188617293e-07, + "loss": 0.5731, + "step": 11617 + }, + { + "epoch": 0.8791192160720366, + "grad_norm": 2.5175814628601074, + "learning_rate": 7.092018443704971e-07, + "loss": 0.6151, + "step": 11618 + }, + { + "epoch": 0.8791948847943702, + "grad_norm": 2.128124713897705, + "learning_rate": 7.083257890758618e-07, + "loss": 0.6207, + "step": 11619 + }, + { + "epoch": 0.8792705535167039, + "grad_norm": 1.736514687538147, + "learning_rate": 7.074502530326862e-07, + "loss": 0.6287, + "step": 11620 + }, + { + "epoch": 0.8793462222390375, + "grad_norm": 6.643167018890381, + "learning_rate": 7.065752362957955e-07, + "loss": 0.6171, + "step": 11621 + }, + { + "epoch": 0.8794218909613711, + "grad_norm": 1.8215636014938354, + "learning_rate": 7.057007389199851e-07, + "loss": 0.6954, + "step": 11622 + }, + { + "epoch": 0.8794975596837047, + "grad_norm": 2.094165325164795, + "learning_rate": 7.048267609600249e-07, + "loss": 0.56, + "step": 11623 + }, + { + "epoch": 0.8795732284060384, + "grad_norm": 1.876628041267395, + "learning_rate": 7.039533024706424e-07, + "loss": 0.7134, + "step": 11624 + }, + { + "epoch": 0.879648897128372, + "grad_norm": 2.3218555450439453, + "learning_rate": 7.030803635065356e-07, + "loss": 0.7137, + "step": 11625 + }, + { + "epoch": 0.8797245658507056, + "grad_norm": 3.562688112258911, + "learning_rate": 7.022079441223718e-07, + "loss": 0.6009, + "step": 11626 + }, + { + "epoch": 0.8798002345730392, + "grad_norm": 3.2988367080688477, + "learning_rate": 7.013360443727855e-07, + "loss": 0.5274, + "step": 11627 + }, + { + "epoch": 0.8798759032953729, + "grad_norm": 2.6240482330322266, + "learning_rate": 7.004646643123769e-07, + "loss": 0.7412, + "step": 11628 + }, + { + "epoch": 0.8799515720177065, + "grad_norm": 3.6613142490386963, + "learning_rate": 6.995938039957153e-07, + "loss": 0.6822, + "step": 11629 + }, + { + "epoch": 0.8800272407400401, + "grad_norm": 1.9795455932617188, + "learning_rate": 6.987234634773381e-07, + "loss": 0.6066, + "step": 11630 + }, + { + "epoch": 0.8801029094623737, + "grad_norm": 2.0627286434173584, + "learning_rate": 6.978536428117447e-07, + "loss": 0.6163, + "step": 11631 + }, + { + "epoch": 0.8801785781847073, + "grad_norm": 2.3510892391204834, + "learning_rate": 6.969843420534085e-07, + "loss": 0.7384, + "step": 11632 + }, + { + "epoch": 0.880254246907041, + "grad_norm": 2.18217396736145, + "learning_rate": 6.961155612567681e-07, + "loss": 0.5873, + "step": 11633 + }, + { + "epoch": 0.8803299156293746, + "grad_norm": 2.205120086669922, + "learning_rate": 6.952473004762319e-07, + "loss": 0.6898, + "step": 11634 + }, + { + "epoch": 0.8804055843517082, + "grad_norm": 2.0564920902252197, + "learning_rate": 6.943795597661683e-07, + "loss": 0.5414, + "step": 11635 + }, + { + "epoch": 0.8804812530740418, + "grad_norm": 2.376497983932495, + "learning_rate": 6.935123391809209e-07, + "loss": 0.8052, + 
"step": 11636 + }, + { + "epoch": 0.8805569217963755, + "grad_norm": 2.4886834621429443, + "learning_rate": 6.926456387747964e-07, + "loss": 0.6784, + "step": 11637 + }, + { + "epoch": 0.8806325905187091, + "grad_norm": 2.1387746334075928, + "learning_rate": 6.917794586020722e-07, + "loss": 0.6157, + "step": 11638 + }, + { + "epoch": 0.8807082592410427, + "grad_norm": 3.309083938598633, + "learning_rate": 6.909137987169899e-07, + "loss": 0.761, + "step": 11639 + }, + { + "epoch": 0.8807839279633763, + "grad_norm": 2.392918109893799, + "learning_rate": 6.900486591737603e-07, + "loss": 0.7521, + "step": 11640 + }, + { + "epoch": 0.88085959668571, + "grad_norm": 2.2521510124206543, + "learning_rate": 6.891840400265629e-07, + "loss": 0.6259, + "step": 11641 + }, + { + "epoch": 0.8809352654080436, + "grad_norm": 2.2190327644348145, + "learning_rate": 6.883199413295384e-07, + "loss": 0.7168, + "step": 11642 + }, + { + "epoch": 0.8810109341303772, + "grad_norm": 1.9417771100997925, + "learning_rate": 6.874563631368037e-07, + "loss": 0.7122, + "step": 11643 + }, + { + "epoch": 0.8810866028527108, + "grad_norm": 2.9581193923950195, + "learning_rate": 6.865933055024394e-07, + "loss": 0.6767, + "step": 11644 + }, + { + "epoch": 0.8811622715750445, + "grad_norm": 3.3201589584350586, + "learning_rate": 6.857307684804902e-07, + "loss": 0.7296, + "step": 11645 + }, + { + "epoch": 0.8812379402973781, + "grad_norm": 2.456382989883423, + "learning_rate": 6.848687521249711e-07, + "loss": 0.6919, + "step": 11646 + }, + { + "epoch": 0.8813136090197117, + "grad_norm": 2.552457332611084, + "learning_rate": 6.840072564898647e-07, + "loss": 0.7836, + "step": 11647 + }, + { + "epoch": 0.8813892777420453, + "grad_norm": 2.111968755722046, + "learning_rate": 6.831462816291219e-07, + "loss": 0.5549, + "step": 11648 + }, + { + "epoch": 0.8814649464643789, + "grad_norm": 3.1068809032440186, + "learning_rate": 6.822858275966585e-07, + "loss": 0.6755, + "step": 11649 + }, + { + "epoch": 0.8815406151867126, + "grad_norm": 2.4133050441741943, + "learning_rate": 6.814258944463598e-07, + "loss": 0.6294, + "step": 11650 + }, + { + "epoch": 0.8816162839090462, + "grad_norm": 2.4189083576202393, + "learning_rate": 6.805664822320762e-07, + "loss": 0.6289, + "step": 11651 + }, + { + "epoch": 0.8816919526313798, + "grad_norm": 1.988824486732483, + "learning_rate": 6.797075910076299e-07, + "loss": 0.6044, + "step": 11652 + }, + { + "epoch": 0.8817676213537134, + "grad_norm": 2.337367534637451, + "learning_rate": 6.788492208268029e-07, + "loss": 0.6373, + "step": 11653 + }, + { + "epoch": 0.8818432900760471, + "grad_norm": 4.022536277770996, + "learning_rate": 6.779913717433521e-07, + "loss": 0.6707, + "step": 11654 + }, + { + "epoch": 0.8819189587983807, + "grad_norm": 2.011725902557373, + "learning_rate": 6.771340438109996e-07, + "loss": 0.7432, + "step": 11655 + }, + { + "epoch": 0.8819946275207143, + "grad_norm": 1.9914313554763794, + "learning_rate": 6.762772370834324e-07, + "loss": 0.5172, + "step": 11656 + }, + { + "epoch": 0.8820702962430479, + "grad_norm": 1.9742159843444824, + "learning_rate": 6.754209516143058e-07, + "loss": 0.6344, + "step": 11657 + }, + { + "epoch": 0.8821459649653816, + "grad_norm": 2.7394680976867676, + "learning_rate": 6.745651874572445e-07, + "loss": 0.6957, + "step": 11658 + }, + { + "epoch": 0.8822216336877152, + "grad_norm": 2.470006227493286, + "learning_rate": 6.737099446658389e-07, + "loss": 0.7392, + "step": 11659 + }, + { + "epoch": 0.8822973024100488, + "grad_norm": 2.4756081104278564, + 
"learning_rate": 6.728552232936471e-07, + "loss": 0.7639, + "step": 11660 + }, + { + "epoch": 0.8823729711323824, + "grad_norm": 2.942502021789551, + "learning_rate": 6.720010233941943e-07, + "loss": 0.6419, + "step": 11661 + }, + { + "epoch": 0.882448639854716, + "grad_norm": 1.9842886924743652, + "learning_rate": 6.711473450209737e-07, + "loss": 0.7802, + "step": 11662 + }, + { + "epoch": 0.8825243085770497, + "grad_norm": 1.7675484418869019, + "learning_rate": 6.702941882274446e-07, + "loss": 0.6591, + "step": 11663 + }, + { + "epoch": 0.8825999772993833, + "grad_norm": 2.0675406455993652, + "learning_rate": 6.694415530670351e-07, + "loss": 0.6647, + "step": 11664 + }, + { + "epoch": 0.8826756460217169, + "grad_norm": 2.01813006401062, + "learning_rate": 6.685894395931396e-07, + "loss": 0.631, + "step": 11665 + }, + { + "epoch": 0.8827513147440506, + "grad_norm": 1.776025414466858, + "learning_rate": 6.677378478591225e-07, + "loss": 0.819, + "step": 11666 + }, + { + "epoch": 0.8828269834663842, + "grad_norm": 2.657186269760132, + "learning_rate": 6.668867779183099e-07, + "loss": 0.6946, + "step": 11667 + }, + { + "epoch": 0.8829026521887178, + "grad_norm": 2.1002979278564453, + "learning_rate": 6.660362298239985e-07, + "loss": 0.7027, + "step": 11668 + }, + { + "epoch": 0.8829783209110514, + "grad_norm": 1.9301815032958984, + "learning_rate": 6.651862036294554e-07, + "loss": 0.7285, + "step": 11669 + }, + { + "epoch": 0.883053989633385, + "grad_norm": 2.0952084064483643, + "learning_rate": 6.6433669938791e-07, + "loss": 0.652, + "step": 11670 + }, + { + "epoch": 0.8831296583557187, + "grad_norm": 2.0233519077301025, + "learning_rate": 6.634877171525611e-07, + "loss": 0.6332, + "step": 11671 + }, + { + "epoch": 0.8832053270780523, + "grad_norm": 2.134343385696411, + "learning_rate": 6.626392569765738e-07, + "loss": 0.7207, + "step": 11672 + }, + { + "epoch": 0.8832809958003859, + "grad_norm": 1.9051159620285034, + "learning_rate": 6.617913189130837e-07, + "loss": 0.6636, + "step": 11673 + }, + { + "epoch": 0.8833566645227195, + "grad_norm": 3.0139591693878174, + "learning_rate": 6.609439030151905e-07, + "loss": 0.5951, + "step": 11674 + }, + { + "epoch": 0.8834323332450531, + "grad_norm": 2.278470277786255, + "learning_rate": 6.600970093359605e-07, + "loss": 0.519, + "step": 11675 + }, + { + "epoch": 0.8835080019673868, + "grad_norm": 2.205688953399658, + "learning_rate": 6.592506379284314e-07, + "loss": 0.7323, + "step": 11676 + }, + { + "epoch": 0.8835836706897204, + "grad_norm": 2.893535852432251, + "learning_rate": 6.584047888456058e-07, + "loss": 0.6726, + "step": 11677 + }, + { + "epoch": 0.883659339412054, + "grad_norm": 3.5614638328552246, + "learning_rate": 6.575594621404494e-07, + "loss": 0.7368, + "step": 11678 + }, + { + "epoch": 0.8837350081343877, + "grad_norm": 2.1940793991088867, + "learning_rate": 6.567146578659037e-07, + "loss": 0.694, + "step": 11679 + }, + { + "epoch": 0.8838106768567213, + "grad_norm": 2.0482017993927, + "learning_rate": 6.558703760748725e-07, + "loss": 0.6786, + "step": 11680 + }, + { + "epoch": 0.8838863455790549, + "grad_norm": 1.887109637260437, + "learning_rate": 6.550266168202263e-07, + "loss": 0.7625, + "step": 11681 + }, + { + "epoch": 0.8839620143013885, + "grad_norm": 1.6695749759674072, + "learning_rate": 6.541833801548032e-07, + "loss": 0.6128, + "step": 11682 + }, + { + "epoch": 0.8840376830237221, + "grad_norm": 2.545053720474243, + "learning_rate": 6.533406661314107e-07, + "loss": 0.6178, + "step": 11683 + }, + { + "epoch": 
0.8841133517460558, + "grad_norm": 2.606736183166504, + "learning_rate": 6.524984748028226e-07, + "loss": 0.7046, + "step": 11684 + }, + { + "epoch": 0.8841890204683894, + "grad_norm": 2.562772512435913, + "learning_rate": 6.516568062217777e-07, + "loss": 0.5824, + "step": 11685 + }, + { + "epoch": 0.884264689190723, + "grad_norm": 2.3864822387695312, + "learning_rate": 6.50815660440987e-07, + "loss": 0.7149, + "step": 11686 + }, + { + "epoch": 0.8843403579130567, + "grad_norm": 1.7921406030654907, + "learning_rate": 6.499750375131251e-07, + "loss": 0.7122, + "step": 11687 + }, + { + "epoch": 0.8844160266353902, + "grad_norm": 2.3052492141723633, + "learning_rate": 6.491349374908321e-07, + "loss": 0.6496, + "step": 11688 + }, + { + "epoch": 0.8844916953577239, + "grad_norm": 1.8889057636260986, + "learning_rate": 6.482953604267179e-07, + "loss": 0.5084, + "step": 11689 + }, + { + "epoch": 0.8845673640800575, + "grad_norm": 2.0114352703094482, + "learning_rate": 6.474563063733615e-07, + "loss": 0.7584, + "step": 11690 + }, + { + "epoch": 0.8846430328023911, + "grad_norm": 3.2486960887908936, + "learning_rate": 6.466177753833097e-07, + "loss": 0.6731, + "step": 11691 + }, + { + "epoch": 0.8847187015247248, + "grad_norm": 1.9725160598754883, + "learning_rate": 6.457797675090685e-07, + "loss": 0.6108, + "step": 11692 + }, + { + "epoch": 0.8847943702470584, + "grad_norm": 2.553386926651001, + "learning_rate": 6.449422828031191e-07, + "loss": 0.5285, + "step": 11693 + }, + { + "epoch": 0.884870038969392, + "grad_norm": 1.3259893655776978, + "learning_rate": 6.441053213179074e-07, + "loss": 0.7945, + "step": 11694 + }, + { + "epoch": 0.8849457076917256, + "grad_norm": 2.1485307216644287, + "learning_rate": 6.432688831058464e-07, + "loss": 0.534, + "step": 11695 + }, + { + "epoch": 0.8850213764140592, + "grad_norm": 1.817466139793396, + "learning_rate": 6.424329682193174e-07, + "loss": 0.7578, + "step": 11696 + }, + { + "epoch": 0.8850970451363929, + "grad_norm": 2.9842936992645264, + "learning_rate": 6.415975767106674e-07, + "loss": 0.8272, + "step": 11697 + }, + { + "epoch": 0.8851727138587265, + "grad_norm": 2.0747790336608887, + "learning_rate": 6.407627086322136e-07, + "loss": 0.7141, + "step": 11698 + }, + { + "epoch": 0.8852483825810601, + "grad_norm": 2.0690724849700928, + "learning_rate": 6.399283640362322e-07, + "loss": 0.6518, + "step": 11699 + }, + { + "epoch": 0.8853240513033938, + "grad_norm": 1.9558627605438232, + "learning_rate": 6.390945429749784e-07, + "loss": 0.689, + "step": 11700 + }, + { + "epoch": 0.8853997200257273, + "grad_norm": 2.246373414993286, + "learning_rate": 6.382612455006684e-07, + "loss": 0.6619, + "step": 11701 + }, + { + "epoch": 0.885475388748061, + "grad_norm": 2.5528721809387207, + "learning_rate": 6.374284716654823e-07, + "loss": 0.6938, + "step": 11702 + }, + { + "epoch": 0.8855510574703946, + "grad_norm": 3.1837098598480225, + "learning_rate": 6.365962215215737e-07, + "loss": 0.7115, + "step": 11703 + }, + { + "epoch": 0.8856267261927282, + "grad_norm": 1.9520882368087769, + "learning_rate": 6.357644951210588e-07, + "loss": 0.6254, + "step": 11704 + }, + { + "epoch": 0.8857023949150619, + "grad_norm": 2.3639538288116455, + "learning_rate": 6.349332925160267e-07, + "loss": 0.8559, + "step": 11705 + }, + { + "epoch": 0.8857780636373955, + "grad_norm": 2.534674644470215, + "learning_rate": 6.341026137585271e-07, + "loss": 0.7494, + "step": 11706 + }, + { + "epoch": 0.8858537323597291, + "grad_norm": 2.048832416534424, + "learning_rate": 
6.332724589005792e-07, + "loss": 0.7289, + "step": 11707 + }, + { + "epoch": 0.8859294010820628, + "grad_norm": 2.1212267875671387, + "learning_rate": 6.324428279941724e-07, + "loss": 0.7757, + "step": 11708 + }, + { + "epoch": 0.8860050698043963, + "grad_norm": 2.0502870082855225, + "learning_rate": 6.316137210912593e-07, + "loss": 0.7043, + "step": 11709 + }, + { + "epoch": 0.88608073852673, + "grad_norm": 2.497807502746582, + "learning_rate": 6.307851382437612e-07, + "loss": 0.653, + "step": 11710 + }, + { + "epoch": 0.8861564072490636, + "grad_norm": 1.9462158679962158, + "learning_rate": 6.299570795035676e-07, + "loss": 0.6508, + "step": 11711 + }, + { + "epoch": 0.8862320759713972, + "grad_norm": 3.3071272373199463, + "learning_rate": 6.291295449225352e-07, + "loss": 0.7087, + "step": 11712 + }, + { + "epoch": 0.8863077446937309, + "grad_norm": 2.035278797149658, + "learning_rate": 6.283025345524833e-07, + "loss": 0.6981, + "step": 11713 + }, + { + "epoch": 0.8863834134160645, + "grad_norm": 1.910923719406128, + "learning_rate": 6.274760484452027e-07, + "loss": 0.5813, + "step": 11714 + }, + { + "epoch": 0.8864590821383981, + "grad_norm": 1.7221661806106567, + "learning_rate": 6.266500866524558e-07, + "loss": 0.8203, + "step": 11715 + }, + { + "epoch": 0.8865347508607317, + "grad_norm": 2.013583183288574, + "learning_rate": 6.258246492259604e-07, + "loss": 0.7452, + "step": 11716 + }, + { + "epoch": 0.8866104195830653, + "grad_norm": 2.711366653442383, + "learning_rate": 6.24999736217412e-07, + "loss": 0.7126, + "step": 11717 + }, + { + "epoch": 0.886686088305399, + "grad_norm": 2.107581853866577, + "learning_rate": 6.241753476784674e-07, + "loss": 0.655, + "step": 11718 + }, + { + "epoch": 0.8867617570277326, + "grad_norm": 2.0487051010131836, + "learning_rate": 6.233514836607533e-07, + "loss": 0.6132, + "step": 11719 + }, + { + "epoch": 0.8868374257500662, + "grad_norm": 2.098557949066162, + "learning_rate": 6.225281442158633e-07, + "loss": 0.6881, + "step": 11720 + }, + { + "epoch": 0.8869130944723999, + "grad_norm": 2.5246660709381104, + "learning_rate": 6.217053293953562e-07, + "loss": 0.6164, + "step": 11721 + }, + { + "epoch": 0.8869887631947334, + "grad_norm": 2.3522937297821045, + "learning_rate": 6.208830392507609e-07, + "loss": 0.661, + "step": 11722 + }, + { + "epoch": 0.8870644319170671, + "grad_norm": 2.151280164718628, + "learning_rate": 6.20061273833572e-07, + "loss": 0.7885, + "step": 11723 + }, + { + "epoch": 0.8871401006394007, + "grad_norm": 2.2462844848632812, + "learning_rate": 6.192400331952486e-07, + "loss": 0.7719, + "step": 11724 + }, + { + "epoch": 0.8872157693617343, + "grad_norm": 2.168433427810669, + "learning_rate": 6.184193173872194e-07, + "loss": 0.5961, + "step": 11725 + }, + { + "epoch": 0.887291438084068, + "grad_norm": 2.1609530448913574, + "learning_rate": 6.175991264608853e-07, + "loss": 0.8232, + "step": 11726 + }, + { + "epoch": 0.8873671068064016, + "grad_norm": 1.7684580087661743, + "learning_rate": 6.167794604676032e-07, + "loss": 0.7733, + "step": 11727 + }, + { + "epoch": 0.8874427755287352, + "grad_norm": 2.24383807182312, + "learning_rate": 6.15960319458707e-07, + "loss": 0.682, + "step": 11728 + }, + { + "epoch": 0.8875184442510688, + "grad_norm": 2.093867540359497, + "learning_rate": 6.151417034854928e-07, + "loss": 0.6278, + "step": 11729 + }, + { + "epoch": 0.8875941129734024, + "grad_norm": 1.9309519529342651, + "learning_rate": 6.143236125992245e-07, + "loss": 0.6813, + "step": 11730 + }, + { + "epoch": 0.8876697816957361, + 
"grad_norm": 2.1042773723602295, + "learning_rate": 6.135060468511352e-07, + "loss": 0.6952, + "step": 11731 + }, + { + "epoch": 0.8877454504180697, + "grad_norm": 4.267055988311768, + "learning_rate": 6.126890062924218e-07, + "loss": 0.685, + "step": 11732 + }, + { + "epoch": 0.8878211191404033, + "grad_norm": 1.7628538608551025, + "learning_rate": 6.118724909742515e-07, + "loss": 0.5026, + "step": 11733 + }, + { + "epoch": 0.887896787862737, + "grad_norm": 1.8866184949874878, + "learning_rate": 6.110565009477555e-07, + "loss": 0.5225, + "step": 11734 + }, + { + "epoch": 0.8879724565850705, + "grad_norm": 2.075368881225586, + "learning_rate": 6.102410362640336e-07, + "loss": 0.6586, + "step": 11735 + }, + { + "epoch": 0.8880481253074042, + "grad_norm": 3.18533992767334, + "learning_rate": 6.094260969741542e-07, + "loss": 0.7591, + "step": 11736 + }, + { + "epoch": 0.8881237940297378, + "grad_norm": 2.374406576156616, + "learning_rate": 6.086116831291534e-07, + "loss": 0.8184, + "step": 11737 + }, + { + "epoch": 0.8881994627520714, + "grad_norm": 2.989527940750122, + "learning_rate": 6.077977947800284e-07, + "loss": 0.8288, + "step": 11738 + }, + { + "epoch": 0.8882751314744051, + "grad_norm": 2.4953837394714355, + "learning_rate": 6.069844319777485e-07, + "loss": 0.6191, + "step": 11739 + }, + { + "epoch": 0.8883508001967387, + "grad_norm": 2.7576467990875244, + "learning_rate": 6.061715947732508e-07, + "loss": 0.5426, + "step": 11740 + }, + { + "epoch": 0.8884264689190723, + "grad_norm": 2.4133262634277344, + "learning_rate": 6.053592832174357e-07, + "loss": 0.6257, + "step": 11741 + }, + { + "epoch": 0.888502137641406, + "grad_norm": 2.3025264739990234, + "learning_rate": 6.045474973611746e-07, + "loss": 0.6992, + "step": 11742 + }, + { + "epoch": 0.8885778063637395, + "grad_norm": 2.0357069969177246, + "learning_rate": 6.037362372553026e-07, + "loss": 0.6468, + "step": 11743 + }, + { + "epoch": 0.8886534750860732, + "grad_norm": 2.5537617206573486, + "learning_rate": 6.029255029506262e-07, + "loss": 0.6608, + "step": 11744 + }, + { + "epoch": 0.8887291438084068, + "grad_norm": 2.2393789291381836, + "learning_rate": 6.021152944979118e-07, + "loss": 0.6493, + "step": 11745 + }, + { + "epoch": 0.8888048125307404, + "grad_norm": 2.217641830444336, + "learning_rate": 6.013056119479008e-07, + "loss": 0.7603, + "step": 11746 + }, + { + "epoch": 0.8888804812530741, + "grad_norm": 2.554548978805542, + "learning_rate": 6.004964553512986e-07, + "loss": 0.7148, + "step": 11747 + }, + { + "epoch": 0.8889561499754076, + "grad_norm": 1.9256244897842407, + "learning_rate": 5.996878247587737e-07, + "loss": 0.7035, + "step": 11748 + }, + { + "epoch": 0.8890318186977413, + "grad_norm": 4.53993034362793, + "learning_rate": 5.988797202209676e-07, + "loss": 0.4874, + "step": 11749 + }, + { + "epoch": 0.889107487420075, + "grad_norm": 2.335663080215454, + "learning_rate": 5.980721417884838e-07, + "loss": 0.7383, + "step": 11750 + }, + { + "epoch": 0.8891831561424085, + "grad_norm": 2.05330228805542, + "learning_rate": 5.972650895119018e-07, + "loss": 0.7447, + "step": 11751 + }, + { + "epoch": 0.8892588248647422, + "grad_norm": 2.6175127029418945, + "learning_rate": 5.964585634417553e-07, + "loss": 0.6698, + "step": 11752 + }, + { + "epoch": 0.8893344935870758, + "grad_norm": 1.903134822845459, + "learning_rate": 5.956525636285538e-07, + "loss": 0.623, + "step": 11753 + }, + { + "epoch": 0.8894101623094094, + "grad_norm": 2.384854316711426, + "learning_rate": 5.94847090122772e-07, + "loss": 0.6605, + "step": 
11754 + }, + { + "epoch": 0.8894858310317431, + "grad_norm": 3.7998368740081787, + "learning_rate": 5.940421429748514e-07, + "loss": 0.627, + "step": 11755 + }, + { + "epoch": 0.8895614997540766, + "grad_norm": 2.438006639480591, + "learning_rate": 5.932377222351987e-07, + "loss": 0.5972, + "step": 11756 + }, + { + "epoch": 0.8896371684764103, + "grad_norm": 2.115206718444824, + "learning_rate": 5.924338279541919e-07, + "loss": 0.5647, + "step": 11757 + }, + { + "epoch": 0.8897128371987439, + "grad_norm": 2.2133545875549316, + "learning_rate": 5.916304601821733e-07, + "loss": 0.6166, + "step": 11758 + }, + { + "epoch": 0.8897885059210775, + "grad_norm": 2.2478818893432617, + "learning_rate": 5.90827618969449e-07, + "loss": 0.6041, + "step": 11759 + }, + { + "epoch": 0.8898641746434112, + "grad_norm": 2.2799623012542725, + "learning_rate": 5.900253043662977e-07, + "loss": 0.59, + "step": 11760 + }, + { + "epoch": 0.8899398433657447, + "grad_norm": 2.2203476428985596, + "learning_rate": 5.89223516422965e-07, + "loss": 0.6151, + "step": 11761 + }, + { + "epoch": 0.8900155120880784, + "grad_norm": 3.3624329566955566, + "learning_rate": 5.88422255189658e-07, + "loss": 0.7139, + "step": 11762 + }, + { + "epoch": 0.890091180810412, + "grad_norm": 1.922843337059021, + "learning_rate": 5.876215207165554e-07, + "loss": 0.6256, + "step": 11763 + }, + { + "epoch": 0.8901668495327456, + "grad_norm": 2.0801382064819336, + "learning_rate": 5.868213130538032e-07, + "loss": 0.6623, + "step": 11764 + }, + { + "epoch": 0.8902425182550793, + "grad_norm": 1.8652883768081665, + "learning_rate": 5.860216322515112e-07, + "loss": 0.6678, + "step": 11765 + }, + { + "epoch": 0.8903181869774129, + "grad_norm": 1.8340070247650146, + "learning_rate": 5.852224783597584e-07, + "loss": 0.7897, + "step": 11766 + }, + { + "epoch": 0.8903938556997465, + "grad_norm": 2.2345545291900635, + "learning_rate": 5.844238514285908e-07, + "loss": 0.6383, + "step": 11767 + }, + { + "epoch": 0.8904695244220802, + "grad_norm": 3.453942060470581, + "learning_rate": 5.836257515080213e-07, + "loss": 0.5833, + "step": 11768 + }, + { + "epoch": 0.8905451931444137, + "grad_norm": 1.9211537837982178, + "learning_rate": 5.82828178648031e-07, + "loss": 0.6603, + "step": 11769 + }, + { + "epoch": 0.8906208618667474, + "grad_norm": 1.9571908712387085, + "learning_rate": 5.82031132898562e-07, + "loss": 0.5726, + "step": 11770 + }, + { + "epoch": 0.890696530589081, + "grad_norm": 2.344529151916504, + "learning_rate": 5.812346143095303e-07, + "loss": 0.7352, + "step": 11771 + }, + { + "epoch": 0.8907721993114146, + "grad_norm": 1.8455153703689575, + "learning_rate": 5.80438622930818e-07, + "loss": 0.6122, + "step": 11772 + }, + { + "epoch": 0.8908478680337483, + "grad_norm": 2.3054256439208984, + "learning_rate": 5.796431588122711e-07, + "loss": 0.5951, + "step": 11773 + }, + { + "epoch": 0.8909235367560818, + "grad_norm": 2.202211618423462, + "learning_rate": 5.788482220037041e-07, + "loss": 0.5783, + "step": 11774 + }, + { + "epoch": 0.8909992054784155, + "grad_norm": 3.3458003997802734, + "learning_rate": 5.780538125548977e-07, + "loss": 0.6301, + "step": 11775 + }, + { + "epoch": 0.8910748742007492, + "grad_norm": 6.680607795715332, + "learning_rate": 5.772599305156026e-07, + "loss": 0.6437, + "step": 11776 + }, + { + "epoch": 0.8911505429230827, + "grad_norm": 2.2788503170013428, + "learning_rate": 5.764665759355326e-07, + "loss": 0.5404, + "step": 11777 + }, + { + "epoch": 0.8912262116454164, + "grad_norm": 2.148418426513672, + "learning_rate": 
5.756737488643713e-07, + "loss": 0.6569, + "step": 11778 + }, + { + "epoch": 0.89130188036775, + "grad_norm": 2.1473772525787354, + "learning_rate": 5.748814493517668e-07, + "loss": 0.4821, + "step": 11779 + }, + { + "epoch": 0.8913775490900836, + "grad_norm": 2.9149558544158936, + "learning_rate": 5.740896774473374e-07, + "loss": 0.7051, + "step": 11780 + }, + { + "epoch": 0.8914532178124173, + "grad_norm": 2.491338014602661, + "learning_rate": 5.732984332006625e-07, + "loss": 0.6287, + "step": 11781 + }, + { + "epoch": 0.8915288865347508, + "grad_norm": 1.9785722494125366, + "learning_rate": 5.725077166612966e-07, + "loss": 0.5939, + "step": 11782 + }, + { + "epoch": 0.8916045552570845, + "grad_norm": 2.247680187225342, + "learning_rate": 5.717175278787568e-07, + "loss": 0.7237, + "step": 11783 + }, + { + "epoch": 0.8916802239794182, + "grad_norm": 2.0729784965515137, + "learning_rate": 5.709278669025236e-07, + "loss": 0.6251, + "step": 11784 + }, + { + "epoch": 0.8917558927017517, + "grad_norm": 8.808799743652344, + "learning_rate": 5.701387337820506e-07, + "loss": 0.6845, + "step": 11785 + }, + { + "epoch": 0.8918315614240854, + "grad_norm": 2.378300666809082, + "learning_rate": 5.693501285667561e-07, + "loss": 0.6736, + "step": 11786 + }, + { + "epoch": 0.8919072301464189, + "grad_norm": 1.9735015630722046, + "learning_rate": 5.685620513060238e-07, + "loss": 0.648, + "step": 11787 + }, + { + "epoch": 0.8919828988687526, + "grad_norm": 2.113264799118042, + "learning_rate": 5.67774502049207e-07, + "loss": 0.6149, + "step": 11788 + }, + { + "epoch": 0.8920585675910863, + "grad_norm": 2.309506893157959, + "learning_rate": 5.669874808456244e-07, + "loss": 0.8259, + "step": 11789 + }, + { + "epoch": 0.8921342363134198, + "grad_norm": 2.5161781311035156, + "learning_rate": 5.662009877445614e-07, + "loss": 0.6214, + "step": 11790 + }, + { + "epoch": 0.8922099050357535, + "grad_norm": 1.9568015336990356, + "learning_rate": 5.654150227952688e-07, + "loss": 0.6697, + "step": 11791 + }, + { + "epoch": 0.8922855737580871, + "grad_norm": 2.603675127029419, + "learning_rate": 5.646295860469701e-07, + "loss": 0.6682, + "step": 11792 + }, + { + "epoch": 0.8923612424804207, + "grad_norm": 2.497239589691162, + "learning_rate": 5.63844677548849e-07, + "loss": 0.6721, + "step": 11793 + }, + { + "epoch": 0.8924369112027544, + "grad_norm": 2.2003486156463623, + "learning_rate": 5.630602973500622e-07, + "loss": 0.6348, + "step": 11794 + }, + { + "epoch": 0.8925125799250879, + "grad_norm": 2.175077199935913, + "learning_rate": 5.622764454997265e-07, + "loss": 0.7373, + "step": 11795 + }, + { + "epoch": 0.8925882486474216, + "grad_norm": 2.1149489879608154, + "learning_rate": 5.614931220469294e-07, + "loss": 0.5557, + "step": 11796 + }, + { + "epoch": 0.8926639173697553, + "grad_norm": 2.437793493270874, + "learning_rate": 5.607103270407288e-07, + "loss": 0.8061, + "step": 11797 + }, + { + "epoch": 0.8927395860920888, + "grad_norm": 2.231208562850952, + "learning_rate": 5.599280605301424e-07, + "loss": 0.6489, + "step": 11798 + }, + { + "epoch": 0.8928152548144225, + "grad_norm": 1.8926862478256226, + "learning_rate": 5.591463225641592e-07, + "loss": 0.7406, + "step": 11799 + }, + { + "epoch": 0.892890923536756, + "grad_norm": 3.975660562515259, + "learning_rate": 5.583651131917338e-07, + "loss": 0.6159, + "step": 11800 + }, + { + "epoch": 0.8929665922590897, + "grad_norm": 1.7848179340362549, + "learning_rate": 5.575844324617914e-07, + "loss": 0.6479, + "step": 11801 + }, + { + "epoch": 0.8930422609814234, + 
"grad_norm": 2.325378656387329, + "learning_rate": 5.568042804232135e-07, + "loss": 0.7373, + "step": 11802 + }, + { + "epoch": 0.8931179297037569, + "grad_norm": 2.4571590423583984, + "learning_rate": 5.560246571248623e-07, + "loss": 0.8312, + "step": 11803 + }, + { + "epoch": 0.8931935984260906, + "grad_norm": 2.313886880874634, + "learning_rate": 5.552455626155596e-07, + "loss": 0.7574, + "step": 11804 + }, + { + "epoch": 0.8932692671484243, + "grad_norm": 3.1456427574157715, + "learning_rate": 5.544669969440924e-07, + "loss": 0.5273, + "step": 11805 + }, + { + "epoch": 0.8933449358707578, + "grad_norm": 2.003390073776245, + "learning_rate": 5.536889601592178e-07, + "loss": 0.7509, + "step": 11806 + }, + { + "epoch": 0.8934206045930915, + "grad_norm": 2.1938109397888184, + "learning_rate": 5.529114523096576e-07, + "loss": 0.627, + "step": 11807 + }, + { + "epoch": 0.893496273315425, + "grad_norm": 3.804588794708252, + "learning_rate": 5.521344734441061e-07, + "loss": 0.6258, + "step": 11808 + }, + { + "epoch": 0.8935719420377587, + "grad_norm": 5.310543060302734, + "learning_rate": 5.513580236112163e-07, + "loss": 0.743, + "step": 11809 + }, + { + "epoch": 0.8936476107600924, + "grad_norm": 2.3194751739501953, + "learning_rate": 5.505821028596133e-07, + "loss": 0.8416, + "step": 11810 + }, + { + "epoch": 0.8937232794824259, + "grad_norm": 2.0355045795440674, + "learning_rate": 5.498067112378881e-07, + "loss": 0.7552, + "step": 11811 + }, + { + "epoch": 0.8937989482047596, + "grad_norm": 2.1219053268432617, + "learning_rate": 5.490318487945971e-07, + "loss": 0.6533, + "step": 11812 + }, + { + "epoch": 0.8938746169270931, + "grad_norm": 2.342350959777832, + "learning_rate": 5.482575155782663e-07, + "loss": 0.6366, + "step": 11813 + }, + { + "epoch": 0.8939502856494268, + "grad_norm": 2.078801393508911, + "learning_rate": 5.47483711637386e-07, + "loss": 0.7247, + "step": 11814 + }, + { + "epoch": 0.8940259543717605, + "grad_norm": 2.194209337234497, + "learning_rate": 5.467104370204153e-07, + "loss": 0.5571, + "step": 11815 + }, + { + "epoch": 0.894101623094094, + "grad_norm": 2.5547542572021484, + "learning_rate": 5.459376917757776e-07, + "loss": 0.8075, + "step": 11816 + }, + { + "epoch": 0.8941772918164277, + "grad_norm": 2.040752410888672, + "learning_rate": 5.451654759518632e-07, + "loss": 0.5734, + "step": 11817 + }, + { + "epoch": 0.8942529605387614, + "grad_norm": 2.9403791427612305, + "learning_rate": 5.443937895970364e-07, + "loss": 0.694, + "step": 11818 + }, + { + "epoch": 0.8943286292610949, + "grad_norm": 2.2212417125701904, + "learning_rate": 5.436226327596176e-07, + "loss": 0.5364, + "step": 11819 + }, + { + "epoch": 0.8944042979834286, + "grad_norm": 2.305257558822632, + "learning_rate": 5.428520054879009e-07, + "loss": 0.5975, + "step": 11820 + }, + { + "epoch": 0.8944799667057621, + "grad_norm": 2.220174551010132, + "learning_rate": 5.42081907830145e-07, + "loss": 0.7507, + "step": 11821 + }, + { + "epoch": 0.8945556354280958, + "grad_norm": 2.3511104583740234, + "learning_rate": 5.413123398345761e-07, + "loss": 0.7957, + "step": 11822 + }, + { + "epoch": 0.8946313041504295, + "grad_norm": 2.9117751121520996, + "learning_rate": 5.405433015493879e-07, + "loss": 0.6301, + "step": 11823 + }, + { + "epoch": 0.894706972872763, + "grad_norm": 2.1860318183898926, + "learning_rate": 5.397747930227386e-07, + "loss": 0.7152, + "step": 11824 + }, + { + "epoch": 0.8947826415950967, + "grad_norm": 2.775752067565918, + "learning_rate": 5.39006814302756e-07, + "loss": 0.7394, + "step": 
11825 + }, + { + "epoch": 0.8948583103174302, + "grad_norm": 2.653651475906372, + "learning_rate": 5.382393654375344e-07, + "loss": 0.6355, + "step": 11826 + }, + { + "epoch": 0.8949339790397639, + "grad_norm": 1.9705297946929932, + "learning_rate": 5.374724464751294e-07, + "loss": 0.5801, + "step": 11827 + }, + { + "epoch": 0.8950096477620976, + "grad_norm": 1.8360956907272339, + "learning_rate": 5.367060574635726e-07, + "loss": 0.7124, + "step": 11828 + }, + { + "epoch": 0.8950853164844311, + "grad_norm": 2.0758135318756104, + "learning_rate": 5.359401984508566e-07, + "loss": 0.7395, + "step": 11829 + }, + { + "epoch": 0.8951609852067648, + "grad_norm": 2.226825475692749, + "learning_rate": 5.351748694849411e-07, + "loss": 0.6466, + "step": 11830 + }, + { + "epoch": 0.8952366539290985, + "grad_norm": 1.9071615934371948, + "learning_rate": 5.344100706137527e-07, + "loss": 0.708, + "step": 11831 + }, + { + "epoch": 0.895312322651432, + "grad_norm": 2.7298760414123535, + "learning_rate": 5.336458018851881e-07, + "loss": 0.6113, + "step": 11832 + }, + { + "epoch": 0.8953879913737657, + "grad_norm": 2.248567581176758, + "learning_rate": 5.32882063347106e-07, + "loss": 0.5838, + "step": 11833 + }, + { + "epoch": 0.8954636600960992, + "grad_norm": 2.2924344539642334, + "learning_rate": 5.321188550473351e-07, + "loss": 0.5261, + "step": 11834 + }, + { + "epoch": 0.8955393288184329, + "grad_norm": 1.8909533023834229, + "learning_rate": 5.313561770336704e-07, + "loss": 0.5217, + "step": 11835 + }, + { + "epoch": 0.8956149975407666, + "grad_norm": 3.445955514907837, + "learning_rate": 5.305940293538733e-07, + "loss": 0.6061, + "step": 11836 + }, + { + "epoch": 0.8956906662631001, + "grad_norm": 1.8945612907409668, + "learning_rate": 5.2983241205567e-07, + "loss": 0.6668, + "step": 11837 + }, + { + "epoch": 0.8957663349854338, + "grad_norm": 2.5115628242492676, + "learning_rate": 5.290713251867571e-07, + "loss": 0.5408, + "step": 11838 + }, + { + "epoch": 0.8958420037077673, + "grad_norm": 2.304194688796997, + "learning_rate": 5.283107687947967e-07, + "loss": 0.5717, + "step": 11839 + }, + { + "epoch": 0.895917672430101, + "grad_norm": 2.0823569297790527, + "learning_rate": 5.275507429274185e-07, + "loss": 0.6496, + "step": 11840 + }, + { + "epoch": 0.8959933411524347, + "grad_norm": 2.2738540172576904, + "learning_rate": 5.267912476322134e-07, + "loss": 0.5803, + "step": 11841 + }, + { + "epoch": 0.8960690098747682, + "grad_norm": 1.8352035284042358, + "learning_rate": 5.260322829567465e-07, + "loss": 0.6032, + "step": 11842 + }, + { + "epoch": 0.8961446785971019, + "grad_norm": 2.4070634841918945, + "learning_rate": 5.252738489485467e-07, + "loss": 0.6438, + "step": 11843 + }, + { + "epoch": 0.8962203473194356, + "grad_norm": 2.4445745944976807, + "learning_rate": 5.245159456551092e-07, + "loss": 0.6391, + "step": 11844 + }, + { + "epoch": 0.8962960160417691, + "grad_norm": 2.533707857131958, + "learning_rate": 5.237585731238958e-07, + "loss": 0.6541, + "step": 11845 + }, + { + "epoch": 0.8963716847641028, + "grad_norm": 2.023911476135254, + "learning_rate": 5.230017314023366e-07, + "loss": 0.667, + "step": 11846 + }, + { + "epoch": 0.8964473534864363, + "grad_norm": 2.331500768661499, + "learning_rate": 5.222454205378277e-07, + "loss": 0.7697, + "step": 11847 + }, + { + "epoch": 0.89652302220877, + "grad_norm": 2.1052236557006836, + "learning_rate": 5.214896405777281e-07, + "loss": 0.6318, + "step": 11848 + }, + { + "epoch": 0.8965986909311037, + "grad_norm": 1.6714646816253662, + 
"learning_rate": 5.207343915693713e-07, + "loss": 0.6142, + "step": 11849 + }, + { + "epoch": 0.8966743596534372, + "grad_norm": 1.8316816091537476, + "learning_rate": 5.199796735600541e-07, + "loss": 0.63, + "step": 11850 + }, + { + "epoch": 0.8967500283757709, + "grad_norm": 1.962476134300232, + "learning_rate": 5.19225486597036e-07, + "loss": 0.5936, + "step": 11851 + }, + { + "epoch": 0.8968256970981044, + "grad_norm": 2.385847330093384, + "learning_rate": 5.184718307275479e-07, + "loss": 0.6865, + "step": 11852 + }, + { + "epoch": 0.8969013658204381, + "grad_norm": 6.628912925720215, + "learning_rate": 5.177187059987842e-07, + "loss": 0.7063, + "step": 11853 + }, + { + "epoch": 0.8969770345427718, + "grad_norm": 2.5971126556396484, + "learning_rate": 5.169661124579143e-07, + "loss": 0.7106, + "step": 11854 + }, + { + "epoch": 0.8970527032651053, + "grad_norm": 1.8385004997253418, + "learning_rate": 5.162140501520612e-07, + "loss": 0.5338, + "step": 11855 + }, + { + "epoch": 0.897128371987439, + "grad_norm": 1.9331872463226318, + "learning_rate": 5.154625191283256e-07, + "loss": 0.7039, + "step": 11856 + }, + { + "epoch": 0.8972040407097727, + "grad_norm": 2.373424530029297, + "learning_rate": 5.147115194337685e-07, + "loss": 0.7406, + "step": 11857 + }, + { + "epoch": 0.8972797094321062, + "grad_norm": 2.4697651863098145, + "learning_rate": 5.139610511154204e-07, + "loss": 0.7292, + "step": 11858 + }, + { + "epoch": 0.8973553781544399, + "grad_norm": 2.0015337467193604, + "learning_rate": 5.132111142202799e-07, + "loss": 0.6327, + "step": 11859 + }, + { + "epoch": 0.8974310468767734, + "grad_norm": 2.549948215484619, + "learning_rate": 5.124617087953082e-07, + "loss": 0.7159, + "step": 11860 + }, + { + "epoch": 0.8975067155991071, + "grad_norm": 1.9285160303115845, + "learning_rate": 5.117128348874368e-07, + "loss": 0.6433, + "step": 11861 + }, + { + "epoch": 0.8975823843214408, + "grad_norm": 2.8049867153167725, + "learning_rate": 5.109644925435622e-07, + "loss": 0.7273, + "step": 11862 + }, + { + "epoch": 0.8976580530437743, + "grad_norm": 6.72634744644165, + "learning_rate": 5.10216681810546e-07, + "loss": 0.7476, + "step": 11863 + }, + { + "epoch": 0.897733721766108, + "grad_norm": 2.0858564376831055, + "learning_rate": 5.094694027352227e-07, + "loss": 0.6723, + "step": 11864 + }, + { + "epoch": 0.8978093904884415, + "grad_norm": 2.3242621421813965, + "learning_rate": 5.087226553643868e-07, + "loss": 0.6651, + "step": 11865 + }, + { + "epoch": 0.8978850592107752, + "grad_norm": 2.657841205596924, + "learning_rate": 5.079764397448019e-07, + "loss": 0.7593, + "step": 11866 + }, + { + "epoch": 0.8979607279331089, + "grad_norm": 1.891689658164978, + "learning_rate": 5.072307559231986e-07, + "loss": 0.6064, + "step": 11867 + }, + { + "epoch": 0.8980363966554424, + "grad_norm": 2.2270796298980713, + "learning_rate": 5.064856039462747e-07, + "loss": 0.712, + "step": 11868 + }, + { + "epoch": 0.8981120653777761, + "grad_norm": 2.0540242195129395, + "learning_rate": 5.057409838606928e-07, + "loss": 0.6487, + "step": 11869 + }, + { + "epoch": 0.8981877341001098, + "grad_norm": 2.1607933044433594, + "learning_rate": 5.049968957130855e-07, + "loss": 0.5223, + "step": 11870 + }, + { + "epoch": 0.8982634028224433, + "grad_norm": 2.262275457382202, + "learning_rate": 5.042533395500475e-07, + "loss": 0.7798, + "step": 11871 + }, + { + "epoch": 0.898339071544777, + "grad_norm": 2.3458058834075928, + "learning_rate": 5.035103154181458e-07, + "loss": 0.7041, + "step": 11872 + }, + { + "epoch": 
0.8984147402671105, + "grad_norm": 2.098083257675171, + "learning_rate": 5.02767823363907e-07, + "loss": 0.6414, + "step": 11873 + }, + { + "epoch": 0.8984904089894442, + "grad_norm": 2.3881022930145264, + "learning_rate": 5.020258634338309e-07, + "loss": 0.7463, + "step": 11874 + }, + { + "epoch": 0.8985660777117779, + "grad_norm": 1.8996672630310059, + "learning_rate": 5.012844356743834e-07, + "loss": 0.5266, + "step": 11875 + }, + { + "epoch": 0.8986417464341114, + "grad_norm": 2.157325506210327, + "learning_rate": 5.005435401319904e-07, + "loss": 0.6429, + "step": 11876 + }, + { + "epoch": 0.8987174151564451, + "grad_norm": 2.011565685272217, + "learning_rate": 4.998031768530525e-07, + "loss": 0.6239, + "step": 11877 + }, + { + "epoch": 0.8987930838787787, + "grad_norm": 2.1313388347625732, + "learning_rate": 4.99063345883932e-07, + "loss": 0.6479, + "step": 11878 + }, + { + "epoch": 0.8988687526011123, + "grad_norm": 2.972902536392212, + "learning_rate": 4.983240472709606e-07, + "loss": 0.7627, + "step": 11879 + }, + { + "epoch": 0.898944421323446, + "grad_norm": 2.6932406425476074, + "learning_rate": 4.975852810604343e-07, + "loss": 0.8005, + "step": 11880 + }, + { + "epoch": 0.8990200900457795, + "grad_norm": 2.0103864669799805, + "learning_rate": 4.968470472986182e-07, + "loss": 0.5966, + "step": 11881 + }, + { + "epoch": 0.8990957587681132, + "grad_norm": 2.020077705383301, + "learning_rate": 4.961093460317422e-07, + "loss": 0.7393, + "step": 11882 + }, + { + "epoch": 0.8991714274904469, + "grad_norm": 2.478301763534546, + "learning_rate": 4.953721773060064e-07, + "loss": 0.5821, + "step": 11883 + }, + { + "epoch": 0.8992470962127804, + "grad_norm": 2.066504955291748, + "learning_rate": 4.946355411675688e-07, + "loss": 0.5965, + "step": 11884 + }, + { + "epoch": 0.8993227649351141, + "grad_norm": 2.5086374282836914, + "learning_rate": 4.938994376625646e-07, + "loss": 0.7674, + "step": 11885 + }, + { + "epoch": 0.8993984336574476, + "grad_norm": 2.390106201171875, + "learning_rate": 4.931638668370909e-07, + "loss": 0.604, + "step": 11886 + }, + { + "epoch": 0.8994741023797813, + "grad_norm": 2.292848825454712, + "learning_rate": 4.924288287372089e-07, + "loss": 0.8054, + "step": 11887 + }, + { + "epoch": 0.899549771102115, + "grad_norm": 2.5886833667755127, + "learning_rate": 4.916943234089506e-07, + "loss": 0.6667, + "step": 11888 + }, + { + "epoch": 0.8996254398244485, + "grad_norm": 2.0663654804229736, + "learning_rate": 4.909603508983124e-07, + "loss": 0.7111, + "step": 11889 + }, + { + "epoch": 0.8997011085467822, + "grad_norm": 2.1520442962646484, + "learning_rate": 4.902269112512594e-07, + "loss": 0.7455, + "step": 11890 + }, + { + "epoch": 0.8997767772691158, + "grad_norm": 2.241392135620117, + "learning_rate": 4.894940045137209e-07, + "loss": 0.6856, + "step": 11891 + }, + { + "epoch": 0.8998524459914494, + "grad_norm": 2.823662757873535, + "learning_rate": 4.887616307315943e-07, + "loss": 0.6895, + "step": 11892 + }, + { + "epoch": 0.8999281147137831, + "grad_norm": 2.172192335128784, + "learning_rate": 4.880297899507438e-07, + "loss": 0.7223, + "step": 11893 + }, + { + "epoch": 0.9000037834361166, + "grad_norm": 2.1436941623687744, + "learning_rate": 4.872984822169967e-07, + "loss": 0.5677, + "step": 11894 + }, + { + "epoch": 0.9000794521584503, + "grad_norm": 2.2516303062438965, + "learning_rate": 4.865677075761534e-07, + "loss": 0.8164, + "step": 11895 + }, + { + "epoch": 0.900155120880784, + "grad_norm": 3.5125577449798584, + "learning_rate": 4.858374660739764e-07, + 
"loss": 0.6542, + "step": 11896 + }, + { + "epoch": 0.9002307896031175, + "grad_norm": 2.0232994556427, + "learning_rate": 4.85107757756196e-07, + "loss": 0.5626, + "step": 11897 + }, + { + "epoch": 0.9003064583254512, + "grad_norm": 2.3365819454193115, + "learning_rate": 4.843785826685076e-07, + "loss": 0.7357, + "step": 11898 + }, + { + "epoch": 0.9003821270477848, + "grad_norm": 1.9569621086120605, + "learning_rate": 4.836499408565738e-07, + "loss": 0.7641, + "step": 11899 + }, + { + "epoch": 0.9004577957701184, + "grad_norm": 2.693556547164917, + "learning_rate": 4.829218323660282e-07, + "loss": 0.6307, + "step": 11900 + }, + { + "epoch": 0.9005334644924521, + "grad_norm": 1.9152913093566895, + "learning_rate": 4.821942572424641e-07, + "loss": 0.5283, + "step": 11901 + }, + { + "epoch": 0.9006091332147856, + "grad_norm": 8.86640739440918, + "learning_rate": 4.81467215531445e-07, + "loss": 0.7087, + "step": 11902 + }, + { + "epoch": 0.9006848019371193, + "grad_norm": 2.024935722351074, + "learning_rate": 4.807407072785018e-07, + "loss": 0.657, + "step": 11903 + }, + { + "epoch": 0.9007604706594529, + "grad_norm": 2.7039053440093994, + "learning_rate": 4.800147325291298e-07, + "loss": 0.6988, + "step": 11904 + }, + { + "epoch": 0.9008361393817865, + "grad_norm": 2.0335659980773926, + "learning_rate": 4.792892913287927e-07, + "loss": 0.5938, + "step": 11905 + }, + { + "epoch": 0.9009118081041202, + "grad_norm": 2.8437318801879883, + "learning_rate": 4.785643837229183e-07, + "loss": 0.6922, + "step": 11906 + }, + { + "epoch": 0.9009874768264537, + "grad_norm": 2.2634239196777344, + "learning_rate": 4.778400097569062e-07, + "loss": 0.7243, + "step": 11907 + }, + { + "epoch": 0.9010631455487874, + "grad_norm": 1.859001636505127, + "learning_rate": 4.771161694761152e-07, + "loss": 0.8049, + "step": 11908 + }, + { + "epoch": 0.9011388142711211, + "grad_norm": 2.409040927886963, + "learning_rate": 4.763928629258748e-07, + "loss": 0.5497, + "step": 11909 + }, + { + "epoch": 0.9012144829934546, + "grad_norm": 2.5353803634643555, + "learning_rate": 4.75670090151483e-07, + "loss": 0.7495, + "step": 11910 + }, + { + "epoch": 0.9012901517157883, + "grad_norm": 4.32592248916626, + "learning_rate": 4.749478511982025e-07, + "loss": 0.5614, + "step": 11911 + }, + { + "epoch": 0.9013658204381219, + "grad_norm": 1.791941523551941, + "learning_rate": 4.7422614611126013e-07, + "loss": 0.5837, + "step": 11912 + }, + { + "epoch": 0.9014414891604555, + "grad_norm": 1.7201472520828247, + "learning_rate": 4.7350497493585175e-07, + "loss": 0.7207, + "step": 11913 + }, + { + "epoch": 0.9015171578827892, + "grad_norm": 2.2074570655822754, + "learning_rate": 4.7278433771714027e-07, + "loss": 0.5863, + "step": 11914 + }, + { + "epoch": 0.9015928266051227, + "grad_norm": 1.8226673603057861, + "learning_rate": 4.720642345002535e-07, + "loss": 0.5865, + "step": 11915 + }, + { + "epoch": 0.9016684953274564, + "grad_norm": 2.0403738021850586, + "learning_rate": 4.7134466533028643e-07, + "loss": 0.694, + "step": 11916 + }, + { + "epoch": 0.90174416404979, + "grad_norm": 2.33229398727417, + "learning_rate": 4.70625630252303e-07, + "loss": 0.7215, + "step": 11917 + }, + { + "epoch": 0.9018198327721236, + "grad_norm": 2.150709629058838, + "learning_rate": 4.6990712931133015e-07, + "loss": 0.7984, + "step": 11918 + }, + { + "epoch": 0.9018955014944573, + "grad_norm": 9.452735900878906, + "learning_rate": 4.69189162552361e-07, + "loss": 0.5954, + "step": 11919 + }, + { + "epoch": 0.9019711702167909, + "grad_norm": 
2.4279088973999023, + "learning_rate": 4.6847173002035747e-07, + "loss": 0.6056, + "step": 11920 + }, + { + "epoch": 0.9020468389391245, + "grad_norm": 2.4450533390045166, + "learning_rate": 4.677548317602517e-07, + "loss": 0.6681, + "step": 11921 + }, + { + "epoch": 0.9021225076614582, + "grad_norm": 3.3857312202453613, + "learning_rate": 4.670384678169337e-07, + "loss": 0.5885, + "step": 11922 + }, + { + "epoch": 0.9021981763837917, + "grad_norm": 2.306729793548584, + "learning_rate": 4.6632263823526467e-07, + "loss": 0.7093, + "step": 11923 + }, + { + "epoch": 0.9022738451061254, + "grad_norm": 1.9452950954437256, + "learning_rate": 4.656073430600747e-07, + "loss": 0.6975, + "step": 11924 + }, + { + "epoch": 0.902349513828459, + "grad_norm": 2.4310667514801025, + "learning_rate": 4.6489258233615596e-07, + "loss": 0.6344, + "step": 11925 + }, + { + "epoch": 0.9024251825507926, + "grad_norm": 2.114664077758789, + "learning_rate": 4.6417835610826863e-07, + "loss": 0.7136, + "step": 11926 + }, + { + "epoch": 0.9025008512731263, + "grad_norm": 2.3902628421783447, + "learning_rate": 4.63464664421142e-07, + "loss": 0.5602, + "step": 11927 + }, + { + "epoch": 0.9025765199954598, + "grad_norm": 2.4101219177246094, + "learning_rate": 4.6275150731946827e-07, + "loss": 0.576, + "step": 11928 + }, + { + "epoch": 0.9026521887177935, + "grad_norm": 5.150691509246826, + "learning_rate": 4.620388848479087e-07, + "loss": 0.7431, + "step": 11929 + }, + { + "epoch": 0.9027278574401271, + "grad_norm": 2.332685708999634, + "learning_rate": 4.613267970510876e-07, + "loss": 0.67, + "step": 11930 + }, + { + "epoch": 0.9028035261624607, + "grad_norm": 2.803213119506836, + "learning_rate": 4.606152439736003e-07, + "loss": 0.6053, + "step": 11931 + }, + { + "epoch": 0.9028791948847944, + "grad_norm": 2.4882402420043945, + "learning_rate": 4.5990422566000824e-07, + "loss": 0.6151, + "step": 11932 + }, + { + "epoch": 0.902954863607128, + "grad_norm": 2.111898422241211, + "learning_rate": 4.591937421548337e-07, + "loss": 0.7308, + "step": 11933 + }, + { + "epoch": 0.9030305323294616, + "grad_norm": 1.9006226062774658, + "learning_rate": 4.584837935025721e-07, + "loss": 0.667, + "step": 11934 + }, + { + "epoch": 0.9031062010517953, + "grad_norm": 2.327817678451538, + "learning_rate": 4.5777437974768186e-07, + "loss": 0.6714, + "step": 11935 + }, + { + "epoch": 0.9031818697741288, + "grad_norm": 2.1458778381347656, + "learning_rate": 4.5706550093458856e-07, + "loss": 0.744, + "step": 11936 + }, + { + "epoch": 0.9032575384964625, + "grad_norm": 1.835634708404541, + "learning_rate": 4.5635715710768457e-07, + "loss": 0.6296, + "step": 11937 + }, + { + "epoch": 0.9033332072187961, + "grad_norm": 2.9382152557373047, + "learning_rate": 4.5564934831132844e-07, + "loss": 0.6187, + "step": 11938 + }, + { + "epoch": 0.9034088759411297, + "grad_norm": 2.5697879791259766, + "learning_rate": 4.5494207458984773e-07, + "loss": 0.698, + "step": 11939 + }, + { + "epoch": 0.9034845446634634, + "grad_norm": 2.2090299129486084, + "learning_rate": 4.5423533598752997e-07, + "loss": 0.6213, + "step": 11940 + }, + { + "epoch": 0.903560213385797, + "grad_norm": 3.206984043121338, + "learning_rate": 4.5352913254863683e-07, + "loss": 0.5154, + "step": 11941 + }, + { + "epoch": 0.9036358821081306, + "grad_norm": 3.334904670715332, + "learning_rate": 4.5282346431739285e-07, + "loss": 0.5593, + "step": 11942 + }, + { + "epoch": 0.9037115508304643, + "grad_norm": 2.202554225921631, + "learning_rate": 4.5211833133798873e-07, + "loss": 0.6092, + 
"step": 11943 + }, + { + "epoch": 0.9037872195527978, + "grad_norm": 2.342474937438965, + "learning_rate": 4.5141373365458116e-07, + "loss": 0.658, + "step": 11944 + }, + { + "epoch": 0.9038628882751315, + "grad_norm": 2.030301094055176, + "learning_rate": 4.5070967131129383e-07, + "loss": 0.4765, + "step": 11945 + }, + { + "epoch": 0.9039385569974651, + "grad_norm": 1.962053656578064, + "learning_rate": 4.500061443522214e-07, + "loss": 0.5477, + "step": 11946 + }, + { + "epoch": 0.9040142257197987, + "grad_norm": 2.112226724624634, + "learning_rate": 4.4930315282141574e-07, + "loss": 0.4979, + "step": 11947 + }, + { + "epoch": 0.9040898944421324, + "grad_norm": 2.1766979694366455, + "learning_rate": 4.486006967629046e-07, + "loss": 0.7071, + "step": 11948 + }, + { + "epoch": 0.904165563164466, + "grad_norm": 2.6591641902923584, + "learning_rate": 4.478987762206748e-07, + "loss": 0.6703, + "step": 11949 + }, + { + "epoch": 0.9042412318867996, + "grad_norm": 2.473120927810669, + "learning_rate": 4.471973912386861e-07, + "loss": 0.5526, + "step": 11950 + }, + { + "epoch": 0.9043169006091332, + "grad_norm": 2.2571189403533936, + "learning_rate": 4.464965418608584e-07, + "loss": 0.6613, + "step": 11951 + }, + { + "epoch": 0.9043925693314668, + "grad_norm": 2.007202386856079, + "learning_rate": 4.4579622813108365e-07, + "loss": 0.4949, + "step": 11952 + }, + { + "epoch": 0.9044682380538005, + "grad_norm": 2.1282033920288086, + "learning_rate": 4.4509645009321774e-07, + "loss": 0.6742, + "step": 11953 + }, + { + "epoch": 0.904543906776134, + "grad_norm": 2.2609729766845703, + "learning_rate": 4.443972077910806e-07, + "loss": 0.6759, + "step": 11954 + }, + { + "epoch": 0.9046195754984677, + "grad_norm": 2.033329486846924, + "learning_rate": 4.436985012684612e-07, + "loss": 0.6447, + "step": 11955 + }, + { + "epoch": 0.9046952442208014, + "grad_norm": 2.3120713233947754, + "learning_rate": 4.430003305691176e-07, + "loss": 0.741, + "step": 11956 + }, + { + "epoch": 0.9047709129431349, + "grad_norm": 2.225062847137451, + "learning_rate": 4.423026957367707e-07, + "loss": 0.5616, + "step": 11957 + }, + { + "epoch": 0.9048465816654686, + "grad_norm": 1.9412503242492676, + "learning_rate": 4.416055968151077e-07, + "loss": 0.6355, + "step": 11958 + }, + { + "epoch": 0.9049222503878022, + "grad_norm": 1.9755481481552124, + "learning_rate": 4.409090338477826e-07, + "loss": 0.6126, + "step": 11959 + }, + { + "epoch": 0.9049979191101358, + "grad_norm": 2.4795279502868652, + "learning_rate": 4.4021300687841747e-07, + "loss": 0.6847, + "step": 11960 + }, + { + "epoch": 0.9050735878324695, + "grad_norm": 2.1052944660186768, + "learning_rate": 4.395175159505995e-07, + "loss": 0.6671, + "step": 11961 + }, + { + "epoch": 0.905149256554803, + "grad_norm": 3.0458271503448486, + "learning_rate": 4.3882256110788286e-07, + "loss": 0.6731, + "step": 11962 + }, + { + "epoch": 0.9052249252771367, + "grad_norm": 3.665377378463745, + "learning_rate": 4.3812814239378774e-07, + "loss": 0.7169, + "step": 11963 + }, + { + "epoch": 0.9053005939994703, + "grad_norm": 1.8927977085113525, + "learning_rate": 4.374342598518013e-07, + "loss": 0.6032, + "step": 11964 + }, + { + "epoch": 0.9053762627218039, + "grad_norm": 1.916894555091858, + "learning_rate": 4.367409135253758e-07, + "loss": 0.6076, + "step": 11965 + }, + { + "epoch": 0.9054519314441376, + "grad_norm": 1.8118383884429932, + "learning_rate": 4.3604810345792956e-07, + "loss": 0.7221, + "step": 11966 + }, + { + "epoch": 0.9055276001664712, + "grad_norm": 
1.7049998044967651, + "learning_rate": 4.353558296928528e-07, + "loss": 0.6027, + "step": 11967 + }, + { + "epoch": 0.9056032688888048, + "grad_norm": 1.889378547668457, + "learning_rate": 4.346640922734949e-07, + "loss": 0.6484, + "step": 11968 + }, + { + "epoch": 0.9056789376111385, + "grad_norm": 2.0997726917266846, + "learning_rate": 4.339728912431742e-07, + "loss": 0.6673, + "step": 11969 + }, + { + "epoch": 0.905754606333472, + "grad_norm": 3.0155575275421143, + "learning_rate": 4.33282226645177e-07, + "loss": 0.7523, + "step": 11970 + }, + { + "epoch": 0.9058302750558057, + "grad_norm": 2.474846839904785, + "learning_rate": 4.3259209852275583e-07, + "loss": 0.6474, + "step": 11971 + }, + { + "epoch": 0.9059059437781393, + "grad_norm": 2.1822690963745117, + "learning_rate": 4.31902506919127e-07, + "loss": 0.6469, + "step": 11972 + }, + { + "epoch": 0.9059816125004729, + "grad_norm": 2.6391453742980957, + "learning_rate": 4.312134518774761e-07, + "loss": 0.7557, + "step": 11973 + }, + { + "epoch": 0.9060572812228066, + "grad_norm": 1.6580966711044312, + "learning_rate": 4.3052493344095346e-07, + "loss": 0.8282, + "step": 11974 + }, + { + "epoch": 0.9061329499451402, + "grad_norm": 2.3595659732818604, + "learning_rate": 4.298369516526777e-07, + "loss": 0.6902, + "step": 11975 + }, + { + "epoch": 0.9062086186674738, + "grad_norm": 2.4112627506256104, + "learning_rate": 4.2914950655572827e-07, + "loss": 0.66, + "step": 11976 + }, + { + "epoch": 0.9062842873898074, + "grad_norm": 2.256720781326294, + "learning_rate": 4.284625981931608e-07, + "loss": 0.7697, + "step": 11977 + }, + { + "epoch": 0.906359956112141, + "grad_norm": 2.7008187770843506, + "learning_rate": 4.277762266079899e-07, + "loss": 0.5343, + "step": 11978 + }, + { + "epoch": 0.9064356248344747, + "grad_norm": 2.01400089263916, + "learning_rate": 4.270903918431961e-07, + "loss": 0.5767, + "step": 11979 + }, + { + "epoch": 0.9065112935568083, + "grad_norm": 2.3042778968811035, + "learning_rate": 4.264050939417301e-07, + "loss": 0.789, + "step": 11980 + }, + { + "epoch": 0.9065869622791419, + "grad_norm": 2.1162967681884766, + "learning_rate": 4.2572033294650756e-07, + "loss": 0.6247, + "step": 11981 + }, + { + "epoch": 0.9066626310014756, + "grad_norm": 1.9995644092559814, + "learning_rate": 4.2503610890041023e-07, + "loss": 0.6561, + "step": 11982 + }, + { + "epoch": 0.9067382997238091, + "grad_norm": 2.2234203815460205, + "learning_rate": 4.2435242184628677e-07, + "loss": 0.719, + "step": 11983 + }, + { + "epoch": 0.9068139684461428, + "grad_norm": 2.4800000190734863, + "learning_rate": 4.236692718269519e-07, + "loss": 0.635, + "step": 11984 + }, + { + "epoch": 0.9068896371684764, + "grad_norm": 2.198791742324829, + "learning_rate": 4.229866588851855e-07, + "loss": 0.5428, + "step": 11985 + }, + { + "epoch": 0.90696530589081, + "grad_norm": 2.6701927185058594, + "learning_rate": 4.2230458306373634e-07, + "loss": 0.6647, + "step": 11986 + }, + { + "epoch": 0.9070409746131437, + "grad_norm": 2.1392929553985596, + "learning_rate": 4.216230444053182e-07, + "loss": 0.6216, + "step": 11987 + }, + { + "epoch": 0.9071166433354773, + "grad_norm": 2.303083658218384, + "learning_rate": 4.2094204295261095e-07, + "loss": 0.7002, + "step": 11988 + }, + { + "epoch": 0.9071923120578109, + "grad_norm": 2.640005111694336, + "learning_rate": 4.2026157874826254e-07, + "loss": 0.734, + "step": 11989 + }, + { + "epoch": 0.9072679807801445, + "grad_norm": 2.1459431648254395, + "learning_rate": 4.1958165183488185e-07, + "loss": 0.7214, + "step": 
11990 + }, + { + "epoch": 0.9073436495024781, + "grad_norm": 2.6311416625976562, + "learning_rate": 4.189022622550508e-07, + "loss": 0.6757, + "step": 11991 + }, + { + "epoch": 0.9074193182248118, + "grad_norm": 2.1436285972595215, + "learning_rate": 4.1822341005131636e-07, + "loss": 0.7854, + "step": 11992 + }, + { + "epoch": 0.9074949869471454, + "grad_norm": 2.0433895587921143, + "learning_rate": 4.1754509526618754e-07, + "loss": 0.6659, + "step": 11993 + }, + { + "epoch": 0.907570655669479, + "grad_norm": 1.8191251754760742, + "learning_rate": 4.1686731794214337e-07, + "loss": 0.6189, + "step": 11994 + }, + { + "epoch": 0.9076463243918127, + "grad_norm": 2.218461751937866, + "learning_rate": 4.161900781216299e-07, + "loss": 0.5692, + "step": 11995 + }, + { + "epoch": 0.9077219931141463, + "grad_norm": 2.302450656890869, + "learning_rate": 4.1551337584705815e-07, + "loss": 0.5076, + "step": 11996 + }, + { + "epoch": 0.9077976618364799, + "grad_norm": 2.4182517528533936, + "learning_rate": 4.148372111608023e-07, + "loss": 0.6981, + "step": 11997 + }, + { + "epoch": 0.9078733305588135, + "grad_norm": 3.4762744903564453, + "learning_rate": 4.1416158410520845e-07, + "loss": 0.5797, + "step": 11998 + }, + { + "epoch": 0.9079489992811471, + "grad_norm": 2.167374610900879, + "learning_rate": 4.1348649472258673e-07, + "loss": 0.6399, + "step": 11999 + }, + { + "epoch": 0.9080246680034808, + "grad_norm": 2.449777841567993, + "learning_rate": 4.128119430552133e-07, + "loss": 0.6094, + "step": 12000 + }, + { + "epoch": 0.9081003367258144, + "grad_norm": 2.1977956295013428, + "learning_rate": 4.1213792914533046e-07, + "loss": 0.6119, + "step": 12001 + }, + { + "epoch": 0.908176005448148, + "grad_norm": 1.5713450908660889, + "learning_rate": 4.1146445303514537e-07, + "loss": 0.8275, + "step": 12002 + }, + { + "epoch": 0.9082516741704816, + "grad_norm": 2.628474235534668, + "learning_rate": 4.107915147668363e-07, + "loss": 0.8309, + "step": 12003 + }, + { + "epoch": 0.9083273428928152, + "grad_norm": 2.199763774871826, + "learning_rate": 4.1011911438254357e-07, + "loss": 0.6514, + "step": 12004 + }, + { + "epoch": 0.9084030116151489, + "grad_norm": 5.241161823272705, + "learning_rate": 4.094472519243745e-07, + "loss": 0.7267, + "step": 12005 + }, + { + "epoch": 0.9084786803374825, + "grad_norm": 2.1290805339813232, + "learning_rate": 4.087759274344034e-07, + "loss": 0.6428, + "step": 12006 + }, + { + "epoch": 0.9085543490598161, + "grad_norm": 1.9543763399124146, + "learning_rate": 4.0810514095467164e-07, + "loss": 0.6201, + "step": 12007 + }, + { + "epoch": 0.9086300177821498, + "grad_norm": 2.172839879989624, + "learning_rate": 4.074348925271847e-07, + "loss": 0.739, + "step": 12008 + }, + { + "epoch": 0.9087056865044834, + "grad_norm": 2.1397907733917236, + "learning_rate": 4.067651821939169e-07, + "loss": 0.6348, + "step": 12009 + }, + { + "epoch": 0.908781355226817, + "grad_norm": 2.9188485145568848, + "learning_rate": 4.0609600999680875e-07, + "loss": 0.687, + "step": 12010 + }, + { + "epoch": 0.9088570239491506, + "grad_norm": 2.0263235569000244, + "learning_rate": 4.054273759777627e-07, + "loss": 0.677, + "step": 12011 + }, + { + "epoch": 0.9089326926714842, + "grad_norm": 2.360248327255249, + "learning_rate": 4.047592801786523e-07, + "loss": 0.6305, + "step": 12012 + }, + { + "epoch": 0.9090083613938179, + "grad_norm": 2.5668838024139404, + "learning_rate": 4.04091722641317e-07, + "loss": 0.7643, + "step": 12013 + }, + { + "epoch": 0.9090840301161515, + "grad_norm": 2.728790521621704, + 
"learning_rate": 4.0342470340756145e-07, + "loss": 0.5854, + "step": 12014 + }, + { + "epoch": 0.9091596988384851, + "grad_norm": 2.55326771736145, + "learning_rate": 4.0275822251915517e-07, + "loss": 0.7622, + "step": 12015 + }, + { + "epoch": 0.9092353675608187, + "grad_norm": 2.370495080947876, + "learning_rate": 4.0209228001783484e-07, + "loss": 0.7064, + "step": 12016 + }, + { + "epoch": 0.9093110362831524, + "grad_norm": 1.9629454612731934, + "learning_rate": 4.0142687594530604e-07, + "loss": 0.686, + "step": 12017 + }, + { + "epoch": 0.909386705005486, + "grad_norm": 2.457399368286133, + "learning_rate": 4.0076201034323647e-07, + "loss": 0.5467, + "step": 12018 + }, + { + "epoch": 0.9094623737278196, + "grad_norm": 1.8854044675827026, + "learning_rate": 4.000976832532638e-07, + "loss": 0.6292, + "step": 12019 + }, + { + "epoch": 0.9095380424501532, + "grad_norm": 2.935149908065796, + "learning_rate": 3.994338947169888e-07, + "loss": 0.5731, + "step": 12020 + }, + { + "epoch": 0.9096137111724869, + "grad_norm": 2.078005790710449, + "learning_rate": 3.987706447759831e-07, + "loss": 0.5449, + "step": 12021 + }, + { + "epoch": 0.9096893798948205, + "grad_norm": 2.3061161041259766, + "learning_rate": 3.9810793347177663e-07, + "loss": 0.5563, + "step": 12022 + }, + { + "epoch": 0.9097650486171541, + "grad_norm": 2.2766451835632324, + "learning_rate": 3.9744576084587413e-07, + "loss": 0.6867, + "step": 12023 + }, + { + "epoch": 0.9098407173394877, + "grad_norm": 2.109184503555298, + "learning_rate": 3.967841269397434e-07, + "loss": 0.5578, + "step": 12024 + }, + { + "epoch": 0.9099163860618213, + "grad_norm": 2.568835735321045, + "learning_rate": 3.9612303179481634e-07, + "loss": 0.6472, + "step": 12025 + }, + { + "epoch": 0.909992054784155, + "grad_norm": 2.184821128845215, + "learning_rate": 3.9546247545249284e-07, + "loss": 0.753, + "step": 12026 + }, + { + "epoch": 0.9100677235064886, + "grad_norm": 1.9900933504104614, + "learning_rate": 3.948024579541377e-07, + "loss": 0.5763, + "step": 12027 + }, + { + "epoch": 0.9101433922288222, + "grad_norm": 2.366878032684326, + "learning_rate": 3.94142979341089e-07, + "loss": 0.6965, + "step": 12028 + }, + { + "epoch": 0.9102190609511558, + "grad_norm": 2.603574275970459, + "learning_rate": 3.934840396546396e-07, + "loss": 0.7252, + "step": 12029 + }, + { + "epoch": 0.9102947296734895, + "grad_norm": 2.070781707763672, + "learning_rate": 3.928256389360566e-07, + "loss": 0.5665, + "step": 12030 + }, + { + "epoch": 0.9103703983958231, + "grad_norm": 1.9723803997039795, + "learning_rate": 3.921677772265709e-07, + "loss": 0.7802, + "step": 12031 + }, + { + "epoch": 0.9104460671181567, + "grad_norm": 2.6558046340942383, + "learning_rate": 3.915104545673807e-07, + "loss": 0.6873, + "step": 12032 + }, + { + "epoch": 0.9105217358404903, + "grad_norm": 2.883211374282837, + "learning_rate": 3.9085367099964786e-07, + "loss": 0.7914, + "step": 12033 + }, + { + "epoch": 0.910597404562824, + "grad_norm": 2.342439651489258, + "learning_rate": 3.9019742656450465e-07, + "loss": 0.6175, + "step": 12034 + }, + { + "epoch": 0.9106730732851576, + "grad_norm": 2.299743413925171, + "learning_rate": 3.895417213030471e-07, + "loss": 0.7706, + "step": 12035 + }, + { + "epoch": 0.9107487420074912, + "grad_norm": 2.087432861328125, + "learning_rate": 3.8888655525633544e-07, + "loss": 0.6474, + "step": 12036 + }, + { + "epoch": 0.9108244107298248, + "grad_norm": 1.8561009168624878, + "learning_rate": 3.882319284653988e-07, + "loss": 0.7941, + "step": 12037 + }, + { + 
"epoch": 0.9109000794521585, + "grad_norm": 1.991654634475708, + "learning_rate": 3.8757784097123236e-07, + "loss": 0.6651, + "step": 12038 + }, + { + "epoch": 0.9109757481744921, + "grad_norm": 2.475132942199707, + "learning_rate": 3.8692429281479845e-07, + "loss": 0.6396, + "step": 12039 + }, + { + "epoch": 0.9110514168968257, + "grad_norm": 1.9189382791519165, + "learning_rate": 3.8627128403702326e-07, + "loss": 0.7264, + "step": 12040 + }, + { + "epoch": 0.9111270856191593, + "grad_norm": 2.612868547439575, + "learning_rate": 3.856188146788001e-07, + "loss": 0.6025, + "step": 12041 + }, + { + "epoch": 0.9112027543414929, + "grad_norm": 2.8287506103515625, + "learning_rate": 3.849668847809903e-07, + "loss": 0.5975, + "step": 12042 + }, + { + "epoch": 0.9112784230638266, + "grad_norm": 2.0225670337677, + "learning_rate": 3.8431549438441616e-07, + "loss": 0.6964, + "step": 12043 + }, + { + "epoch": 0.9113540917861602, + "grad_norm": 2.5549564361572266, + "learning_rate": 3.8366464352987405e-07, + "loss": 0.5845, + "step": 12044 + }, + { + "epoch": 0.9114297605084938, + "grad_norm": 2.6276886463165283, + "learning_rate": 3.8301433225811945e-07, + "loss": 0.8134, + "step": 12045 + }, + { + "epoch": 0.9115054292308274, + "grad_norm": 2.4634835720062256, + "learning_rate": 3.8236456060987967e-07, + "loss": 0.6734, + "step": 12046 + }, + { + "epoch": 0.9115810979531611, + "grad_norm": 2.0669867992401123, + "learning_rate": 3.8171532862584326e-07, + "loss": 0.6143, + "step": 12047 + }, + { + "epoch": 0.9116567666754947, + "grad_norm": 2.2459287643432617, + "learning_rate": 3.810666363466666e-07, + "loss": 0.6493, + "step": 12048 + }, + { + "epoch": 0.9117324353978283, + "grad_norm": 1.9273678064346313, + "learning_rate": 3.8041848381297626e-07, + "loss": 0.6154, + "step": 12049 + }, + { + "epoch": 0.9118081041201619, + "grad_norm": 2.385669231414795, + "learning_rate": 3.797708710653588e-07, + "loss": 0.6914, + "step": 12050 + }, + { + "epoch": 0.9118837728424956, + "grad_norm": 2.3687374591827393, + "learning_rate": 3.791237981443697e-07, + "loss": 0.8302, + "step": 12051 + }, + { + "epoch": 0.9119594415648292, + "grad_norm": 2.349445343017578, + "learning_rate": 3.784772650905326e-07, + "loss": 0.6558, + "step": 12052 + }, + { + "epoch": 0.9120351102871628, + "grad_norm": 2.4349260330200195, + "learning_rate": 3.778312719443341e-07, + "loss": 0.7113, + "step": 12053 + }, + { + "epoch": 0.9121107790094964, + "grad_norm": 2.196366786956787, + "learning_rate": 3.771858187462288e-07, + "loss": 0.6412, + "step": 12054 + }, + { + "epoch": 0.91218644773183, + "grad_norm": 2.005478620529175, + "learning_rate": 3.7654090553663747e-07, + "loss": 0.5158, + "step": 12055 + }, + { + "epoch": 0.9122621164541637, + "grad_norm": 2.5596768856048584, + "learning_rate": 3.758965323559467e-07, + "loss": 0.7028, + "step": 12056 + }, + { + "epoch": 0.9123377851764973, + "grad_norm": 2.0070645809173584, + "learning_rate": 3.752526992445082e-07, + "loss": 0.5788, + "step": 12057 + }, + { + "epoch": 0.9124134538988309, + "grad_norm": 2.1916732788085938, + "learning_rate": 3.7460940624263985e-07, + "loss": 0.7334, + "step": 12058 + }, + { + "epoch": 0.9124891226211646, + "grad_norm": 2.4893784523010254, + "learning_rate": 3.739666533906303e-07, + "loss": 0.7867, + "step": 12059 + }, + { + "epoch": 0.9125647913434982, + "grad_norm": 2.1541330814361572, + "learning_rate": 3.733244407287294e-07, + "loss": 0.6831, + "step": 12060 + }, + { + "epoch": 0.9126404600658318, + "grad_norm": 2.3403072357177734, + "learning_rate": 
3.72682768297153e-07, + "loss": 0.7134, + "step": 12061 + }, + { + "epoch": 0.9127161287881654, + "grad_norm": 2.0377376079559326, + "learning_rate": 3.720416361360859e-07, + "loss": 0.6679, + "step": 12062 + }, + { + "epoch": 0.912791797510499, + "grad_norm": 2.7400593757629395, + "learning_rate": 3.71401044285678e-07, + "loss": 0.7331, + "step": 12063 + }, + { + "epoch": 0.9128674662328327, + "grad_norm": 1.9614194631576538, + "learning_rate": 3.7076099278604527e-07, + "loss": 0.57, + "step": 12064 + }, + { + "epoch": 0.9129431349551663, + "grad_norm": 1.974969506263733, + "learning_rate": 3.7012148167726855e-07, + "loss": 0.4305, + "step": 12065 + }, + { + "epoch": 0.9130188036774999, + "grad_norm": 2.233001470565796, + "learning_rate": 3.694825109993979e-07, + "loss": 0.74, + "step": 12066 + }, + { + "epoch": 0.9130944723998335, + "grad_norm": 2.316767454147339, + "learning_rate": 3.688440807924472e-07, + "loss": 0.8145, + "step": 12067 + }, + { + "epoch": 0.9131701411221671, + "grad_norm": 1.535016417503357, + "learning_rate": 3.682061910963956e-07, + "loss": 0.7931, + "step": 12068 + }, + { + "epoch": 0.9132458098445008, + "grad_norm": 1.9249364137649536, + "learning_rate": 3.6756884195119114e-07, + "loss": 0.6361, + "step": 12069 + }, + { + "epoch": 0.9133214785668344, + "grad_norm": 2.287426471710205, + "learning_rate": 3.669320333967477e-07, + "loss": 0.6633, + "step": 12070 + }, + { + "epoch": 0.913397147289168, + "grad_norm": 2.515241861343384, + "learning_rate": 3.662957654729416e-07, + "loss": 0.7616, + "step": 12071 + }, + { + "epoch": 0.9134728160115017, + "grad_norm": 2.161454677581787, + "learning_rate": 3.656600382196199e-07, + "loss": 0.7475, + "step": 12072 + }, + { + "epoch": 0.9135484847338353, + "grad_norm": 1.951395869255066, + "learning_rate": 3.650248516765937e-07, + "loss": 0.77, + "step": 12073 + }, + { + "epoch": 0.9136241534561689, + "grad_norm": 2.1375932693481445, + "learning_rate": 3.6439020588364023e-07, + "loss": 0.5793, + "step": 12074 + }, + { + "epoch": 0.9136998221785025, + "grad_norm": 2.560678720474243, + "learning_rate": 3.637561008805027e-07, + "loss": 0.6161, + "step": 12075 + }, + { + "epoch": 0.9137754909008361, + "grad_norm": 2.45676851272583, + "learning_rate": 3.631225367068913e-07, + "loss": 0.7046, + "step": 12076 + }, + { + "epoch": 0.9138511596231698, + "grad_norm": 2.3351166248321533, + "learning_rate": 3.6248951340248136e-07, + "loss": 0.6914, + "step": 12077 + }, + { + "epoch": 0.9139268283455034, + "grad_norm": 2.179896354675293, + "learning_rate": 3.6185703100691615e-07, + "loss": 0.6828, + "step": 12078 + }, + { + "epoch": 0.914002497067837, + "grad_norm": 2.130155563354492, + "learning_rate": 3.6122508955980094e-07, + "loss": 0.6875, + "step": 12079 + }, + { + "epoch": 0.9140781657901706, + "grad_norm": 2.144454002380371, + "learning_rate": 3.6059368910071313e-07, + "loss": 0.702, + "step": 12080 + }, + { + "epoch": 0.9141538345125042, + "grad_norm": 1.5945395231246948, + "learning_rate": 3.5996282966919303e-07, + "loss": 0.7236, + "step": 12081 + }, + { + "epoch": 0.9142295032348379, + "grad_norm": 1.7243108749389648, + "learning_rate": 3.593325113047441e-07, + "loss": 0.5943, + "step": 12082 + }, + { + "epoch": 0.9143051719571715, + "grad_norm": 2.1047885417938232, + "learning_rate": 3.5870273404684073e-07, + "loss": 0.7484, + "step": 12083 + }, + { + "epoch": 0.9143808406795051, + "grad_norm": 2.1068966388702393, + "learning_rate": 3.580734979349214e-07, + "loss": 0.7103, + "step": 12084 + }, + { + "epoch": 0.9144565094018388, 
+ "grad_norm": 3.1981070041656494, + "learning_rate": 3.5744480300839156e-07, + "loss": 0.6087, + "step": 12085 + }, + { + "epoch": 0.9145321781241724, + "grad_norm": 3.288362741470337, + "learning_rate": 3.5681664930662075e-07, + "loss": 0.5641, + "step": 12086 + }, + { + "epoch": 0.914607846846506, + "grad_norm": 2.8169796466827393, + "learning_rate": 3.5618903686894745e-07, + "loss": 0.6044, + "step": 12087 + }, + { + "epoch": 0.9146835155688396, + "grad_norm": 2.2789206504821777, + "learning_rate": 3.5556196573467426e-07, + "loss": 0.6518, + "step": 12088 + }, + { + "epoch": 0.9147591842911732, + "grad_norm": 2.204268455505371, + "learning_rate": 3.5493543594306974e-07, + "loss": 0.6927, + "step": 12089 + }, + { + "epoch": 0.9148348530135069, + "grad_norm": 2.291841983795166, + "learning_rate": 3.5430944753336956e-07, + "loss": 0.7375, + "step": 12090 + }, + { + "epoch": 0.9149105217358405, + "grad_norm": 2.3435380458831787, + "learning_rate": 3.5368400054477637e-07, + "loss": 0.6929, + "step": 12091 + }, + { + "epoch": 0.9149861904581741, + "grad_norm": 2.182016134262085, + "learning_rate": 3.530590950164567e-07, + "loss": 0.7357, + "step": 12092 + }, + { + "epoch": 0.9150618591805078, + "grad_norm": 2.184502363204956, + "learning_rate": 3.524347309875434e-07, + "loss": 0.7398, + "step": 12093 + }, + { + "epoch": 0.9151375279028413, + "grad_norm": 6.090785026550293, + "learning_rate": 3.5181090849713617e-07, + "loss": 0.6725, + "step": 12094 + }, + { + "epoch": 0.915213196625175, + "grad_norm": 2.206554412841797, + "learning_rate": 3.511876275843037e-07, + "loss": 0.6784, + "step": 12095 + }, + { + "epoch": 0.9152888653475086, + "grad_norm": 2.320699453353882, + "learning_rate": 3.5056488828807377e-07, + "loss": 0.6322, + "step": 12096 + }, + { + "epoch": 0.9153645340698422, + "grad_norm": 2.412203311920166, + "learning_rate": 3.4994269064744624e-07, + "loss": 0.6949, + "step": 12097 + }, + { + "epoch": 0.9154402027921759, + "grad_norm": 2.6732378005981445, + "learning_rate": 3.493210347013859e-07, + "loss": 0.7427, + "step": 12098 + }, + { + "epoch": 0.9155158715145095, + "grad_norm": 1.9151742458343506, + "learning_rate": 3.486999204888216e-07, + "loss": 0.6351, + "step": 12099 + }, + { + "epoch": 0.9155915402368431, + "grad_norm": 1.943975567817688, + "learning_rate": 3.480793480486493e-07, + "loss": 0.614, + "step": 12100 + }, + { + "epoch": 0.9156672089591767, + "grad_norm": 2.587817907333374, + "learning_rate": 3.474593174197328e-07, + "loss": 0.6968, + "step": 12101 + }, + { + "epoch": 0.9157428776815103, + "grad_norm": 3.2089931964874268, + "learning_rate": 3.4683982864090013e-07, + "loss": 0.5855, + "step": 12102 + }, + { + "epoch": 0.915818546403844, + "grad_norm": 1.9807599782943726, + "learning_rate": 3.462208817509452e-07, + "loss": 0.5766, + "step": 12103 + }, + { + "epoch": 0.9158942151261776, + "grad_norm": 2.119952917098999, + "learning_rate": 3.456024767886261e-07, + "loss": 0.6788, + "step": 12104 + }, + { + "epoch": 0.9159698838485112, + "grad_norm": 2.423600196838379, + "learning_rate": 3.4498461379267277e-07, + "loss": 0.631, + "step": 12105 + }, + { + "epoch": 0.9160455525708449, + "grad_norm": 1.9820013046264648, + "learning_rate": 3.4436729280177823e-07, + "loss": 0.8097, + "step": 12106 + }, + { + "epoch": 0.9161212212931784, + "grad_norm": 1.9202264547348022, + "learning_rate": 3.4375051385459864e-07, + "loss": 0.5799, + "step": 12107 + }, + { + "epoch": 0.9161968900155121, + "grad_norm": 2.3052821159362793, + "learning_rate": 3.431342769897591e-07, + 
"loss": 0.7043, + "step": 12108 + }, + { + "epoch": 0.9162725587378457, + "grad_norm": 2.5394515991210938, + "learning_rate": 3.4251858224585064e-07, + "loss": 0.6048, + "step": 12109 + }, + { + "epoch": 0.9163482274601793, + "grad_norm": 2.8350353240966797, + "learning_rate": 3.419034296614305e-07, + "loss": 0.5312, + "step": 12110 + }, + { + "epoch": 0.916423896182513, + "grad_norm": 2.1664719581604004, + "learning_rate": 3.4128881927502086e-07, + "loss": 0.598, + "step": 12111 + }, + { + "epoch": 0.9164995649048466, + "grad_norm": 2.737666606903076, + "learning_rate": 3.406747511251119e-07, + "loss": 0.7566, + "step": 12112 + }, + { + "epoch": 0.9165752336271802, + "grad_norm": 2.4136788845062256, + "learning_rate": 3.4006122525015793e-07, + "loss": 0.745, + "step": 12113 + }, + { + "epoch": 0.9166509023495139, + "grad_norm": 2.2882440090179443, + "learning_rate": 3.3944824168857914e-07, + "loss": 0.7246, + "step": 12114 + }, + { + "epoch": 0.9167265710718474, + "grad_norm": 3.440190553665161, + "learning_rate": 3.3883580047876186e-07, + "loss": 0.6751, + "step": 12115 + }, + { + "epoch": 0.9168022397941811, + "grad_norm": 2.7280311584472656, + "learning_rate": 3.3822390165906134e-07, + "loss": 0.7142, + "step": 12116 + }, + { + "epoch": 0.9168779085165147, + "grad_norm": 2.077730417251587, + "learning_rate": 3.376125452677971e-07, + "loss": 0.6187, + "step": 12117 + }, + { + "epoch": 0.9169535772388483, + "grad_norm": 2.064181327819824, + "learning_rate": 3.370017313432513e-07, + "loss": 0.6946, + "step": 12118 + }, + { + "epoch": 0.917029245961182, + "grad_norm": 2.3035082817077637, + "learning_rate": 3.3639145992367647e-07, + "loss": 0.6829, + "step": 12119 + }, + { + "epoch": 0.9171049146835155, + "grad_norm": 1.866401195526123, + "learning_rate": 3.3578173104729005e-07, + "loss": 0.6442, + "step": 12120 + }, + { + "epoch": 0.9171805834058492, + "grad_norm": 2.719041585922241, + "learning_rate": 3.3517254475227544e-07, + "loss": 0.7242, + "step": 12121 + }, + { + "epoch": 0.9172562521281828, + "grad_norm": 2.2494893074035645, + "learning_rate": 3.345639010767811e-07, + "loss": 0.701, + "step": 12122 + }, + { + "epoch": 0.9173319208505164, + "grad_norm": 2.3660080432891846, + "learning_rate": 3.3395580005892365e-07, + "loss": 0.7243, + "step": 12123 + }, + { + "epoch": 0.9174075895728501, + "grad_norm": 2.231206178665161, + "learning_rate": 3.333482417367836e-07, + "loss": 0.6173, + "step": 12124 + }, + { + "epoch": 0.9174832582951837, + "grad_norm": 2.3385729789733887, + "learning_rate": 3.327412261484064e-07, + "loss": 0.7117, + "step": 12125 + }, + { + "epoch": 0.9175589270175173, + "grad_norm": 2.6792593002319336, + "learning_rate": 3.3213475333180777e-07, + "loss": 0.7404, + "step": 12126 + }, + { + "epoch": 0.917634595739851, + "grad_norm": 2.788846492767334, + "learning_rate": 3.315288233249663e-07, + "loss": 0.7583, + "step": 12127 + }, + { + "epoch": 0.9177102644621845, + "grad_norm": 1.883687973022461, + "learning_rate": 3.3092343616582753e-07, + "loss": 0.7129, + "step": 12128 + }, + { + "epoch": 0.9177859331845182, + "grad_norm": 2.291748285293579, + "learning_rate": 3.303185918923013e-07, + "loss": 0.6957, + "step": 12129 + }, + { + "epoch": 0.9178616019068518, + "grad_norm": 1.4826828241348267, + "learning_rate": 3.297142905422652e-07, + "loss": 0.6491, + "step": 12130 + }, + { + "epoch": 0.9179372706291854, + "grad_norm": 2.145691394805908, + "learning_rate": 3.29110532153566e-07, + "loss": 0.7542, + "step": 12131 + }, + { + "epoch": 0.9180129393515191, + "grad_norm": 
2.223241090774536, + "learning_rate": 3.2850731676400945e-07, + "loss": 0.615, + "step": 12132 + }, + { + "epoch": 0.9180886080738526, + "grad_norm": 4.687051773071289, + "learning_rate": 3.2790464441137037e-07, + "loss": 0.7339, + "step": 12133 + }, + { + "epoch": 0.9181642767961863, + "grad_norm": 2.535534620285034, + "learning_rate": 3.273025151333925e-07, + "loss": 0.7676, + "step": 12134 + }, + { + "epoch": 0.91823994551852, + "grad_norm": 2.9152448177337646, + "learning_rate": 3.267009289677817e-07, + "loss": 0.6491, + "step": 12135 + }, + { + "epoch": 0.9183156142408535, + "grad_norm": 2.461850881576538, + "learning_rate": 3.2609988595221183e-07, + "loss": 0.612, + "step": 12136 + }, + { + "epoch": 0.9183912829631872, + "grad_norm": 2.5751256942749023, + "learning_rate": 3.254993861243218e-07, + "loss": 0.4901, + "step": 12137 + }, + { + "epoch": 0.9184669516855208, + "grad_norm": 2.4287333488464355, + "learning_rate": 3.248994295217176e-07, + "loss": 0.6079, + "step": 12138 + }, + { + "epoch": 0.9185426204078544, + "grad_norm": 2.0941126346588135, + "learning_rate": 3.24300016181969e-07, + "loss": 0.7313, + "step": 12139 + }, + { + "epoch": 0.9186182891301881, + "grad_norm": 2.2236790657043457, + "learning_rate": 3.2370114614261313e-07, + "loss": 0.9, + "step": 12140 + }, + { + "epoch": 0.9186939578525216, + "grad_norm": 2.147141933441162, + "learning_rate": 3.231028194411569e-07, + "loss": 0.6299, + "step": 12141 + }, + { + "epoch": 0.9187696265748553, + "grad_norm": 2.7429134845733643, + "learning_rate": 3.2250503611506444e-07, + "loss": 0.8518, + "step": 12142 + }, + { + "epoch": 0.918845295297189, + "grad_norm": 2.0502939224243164, + "learning_rate": 3.2190779620177267e-07, + "loss": 0.5715, + "step": 12143 + }, + { + "epoch": 0.9189209640195225, + "grad_norm": 2.8797767162323, + "learning_rate": 3.213110997386838e-07, + "loss": 0.6093, + "step": 12144 + }, + { + "epoch": 0.9189966327418562, + "grad_norm": 2.793109655380249, + "learning_rate": 3.2071494676316484e-07, + "loss": 0.7734, + "step": 12145 + }, + { + "epoch": 0.9190723014641897, + "grad_norm": 2.135164976119995, + "learning_rate": 3.2011933731254697e-07, + "loss": 0.6241, + "step": 12146 + }, + { + "epoch": 0.9191479701865234, + "grad_norm": 1.7029752731323242, + "learning_rate": 3.1952427142413033e-07, + "loss": 0.6607, + "step": 12147 + }, + { + "epoch": 0.919223638908857, + "grad_norm": 6.995512008666992, + "learning_rate": 3.1892974913518016e-07, + "loss": 0.7122, + "step": 12148 + }, + { + "epoch": 0.9192993076311906, + "grad_norm": 2.4727792739868164, + "learning_rate": 3.183357704829286e-07, + "loss": 0.7127, + "step": 12149 + }, + { + "epoch": 0.9193749763535243, + "grad_norm": 1.784459114074707, + "learning_rate": 3.1774233550457e-07, + "loss": 0.5731, + "step": 12150 + }, + { + "epoch": 0.9194506450758579, + "grad_norm": 1.7401350736618042, + "learning_rate": 3.1714944423726653e-07, + "loss": 0.6354, + "step": 12151 + }, + { + "epoch": 0.9195263137981915, + "grad_norm": 2.0081143379211426, + "learning_rate": 3.165570967181506e-07, + "loss": 0.7686, + "step": 12152 + }, + { + "epoch": 0.9196019825205252, + "grad_norm": 1.8823308944702148, + "learning_rate": 3.1596529298431445e-07, + "loss": 0.6569, + "step": 12153 + }, + { + "epoch": 0.9196776512428587, + "grad_norm": 2.5882723331451416, + "learning_rate": 3.1537403307281843e-07, + "loss": 0.5767, + "step": 12154 + }, + { + "epoch": 0.9197533199651924, + "grad_norm": 2.073834180831909, + "learning_rate": 3.14783317020691e-07, + "loss": 0.7049, + "step": 
12155 + }, + { + "epoch": 0.919828988687526, + "grad_norm": 2.439730644226074, + "learning_rate": 3.1419314486492245e-07, + "loss": 0.6684, + "step": 12156 + }, + { + "epoch": 0.9199046574098596, + "grad_norm": 2.621870756149292, + "learning_rate": 3.136035166424733e-07, + "loss": 0.7028, + "step": 12157 + }, + { + "epoch": 0.9199803261321933, + "grad_norm": 2.3154456615448, + "learning_rate": 3.1301443239026705e-07, + "loss": 0.7616, + "step": 12158 + }, + { + "epoch": 0.9200559948545268, + "grad_norm": 2.4377593994140625, + "learning_rate": 3.1242589214519513e-07, + "loss": 0.5677, + "step": 12159 + }, + { + "epoch": 0.9201316635768605, + "grad_norm": 2.1143412590026855, + "learning_rate": 3.1183789594411203e-07, + "loss": 0.591, + "step": 12160 + }, + { + "epoch": 0.9202073322991942, + "grad_norm": 1.7824926376342773, + "learning_rate": 3.112504438238394e-07, + "loss": 0.6487, + "step": 12161 + }, + { + "epoch": 0.9202830010215277, + "grad_norm": 2.3391568660736084, + "learning_rate": 3.106635358211687e-07, + "loss": 0.6635, + "step": 12162 + }, + { + "epoch": 0.9203586697438614, + "grad_norm": 2.347287178039551, + "learning_rate": 3.100771719728526e-07, + "loss": 0.6899, + "step": 12163 + }, + { + "epoch": 0.920434338466195, + "grad_norm": 2.611984968185425, + "learning_rate": 3.0949135231560864e-07, + "loss": 0.5357, + "step": 12164 + }, + { + "epoch": 0.9205100071885286, + "grad_norm": 2.079094886779785, + "learning_rate": 3.089060768861256e-07, + "loss": 0.6983, + "step": 12165 + }, + { + "epoch": 0.9205856759108623, + "grad_norm": 2.0612375736236572, + "learning_rate": 3.0832134572105507e-07, + "loss": 0.7183, + "step": 12166 + }, + { + "epoch": 0.9206613446331958, + "grad_norm": 2.252366542816162, + "learning_rate": 3.0773715885701284e-07, + "loss": 0.6214, + "step": 12167 + }, + { + "epoch": 0.9207370133555295, + "grad_norm": 2.7472410202026367, + "learning_rate": 3.071535163305845e-07, + "loss": 0.6864, + "step": 12168 + }, + { + "epoch": 0.9208126820778632, + "grad_norm": 2.145517349243164, + "learning_rate": 3.0657041817831897e-07, + "loss": 0.6973, + "step": 12169 + }, + { + "epoch": 0.9208883508001967, + "grad_norm": 1.8512307405471802, + "learning_rate": 3.05987864436733e-07, + "loss": 0.7021, + "step": 12170 + }, + { + "epoch": 0.9209640195225304, + "grad_norm": 2.517260789871216, + "learning_rate": 3.054058551423053e-07, + "loss": 0.6665, + "step": 12171 + }, + { + "epoch": 0.9210396882448639, + "grad_norm": 2.0453529357910156, + "learning_rate": 3.048243903314849e-07, + "loss": 0.6017, + "step": 12172 + }, + { + "epoch": 0.9211153569671976, + "grad_norm": 2.419189453125, + "learning_rate": 3.0424347004068555e-07, + "loss": 0.5802, + "step": 12173 + }, + { + "epoch": 0.9211910256895313, + "grad_norm": 2.2323241233825684, + "learning_rate": 3.0366309430628516e-07, + "loss": 0.7098, + "step": 12174 + }, + { + "epoch": 0.9212666944118648, + "grad_norm": 2.0994818210601807, + "learning_rate": 3.0308326316462966e-07, + "loss": 0.6425, + "step": 12175 + }, + { + "epoch": 0.9213423631341985, + "grad_norm": 1.8704789876937866, + "learning_rate": 3.02503976652027e-07, + "loss": 0.558, + "step": 12176 + }, + { + "epoch": 0.9214180318565321, + "grad_norm": 2.0065083503723145, + "learning_rate": 3.019252348047602e-07, + "loss": 0.623, + "step": 12177 + }, + { + "epoch": 0.9214937005788657, + "grad_norm": 2.0805394649505615, + "learning_rate": 3.0134703765906626e-07, + "loss": 0.6629, + "step": 12178 + }, + { + "epoch": 0.9215693693011994, + "grad_norm": 2.2282912731170654, + 
"learning_rate": 3.007693852511552e-07, + "loss": 0.7508, + "step": 12179 + }, + { + "epoch": 0.9216450380235329, + "grad_norm": 2.3103513717651367, + "learning_rate": 3.0019227761720304e-07, + "loss": 0.6449, + "step": 12180 + }, + { + "epoch": 0.9217207067458666, + "grad_norm": 2.332411050796509, + "learning_rate": 2.9961571479334794e-07, + "loss": 0.5846, + "step": 12181 + }, + { + "epoch": 0.9217963754682003, + "grad_norm": 1.9255980253219604, + "learning_rate": 2.99039696815698e-07, + "loss": 0.6451, + "step": 12182 + }, + { + "epoch": 0.9218720441905338, + "grad_norm": 4.481695652008057, + "learning_rate": 2.9846422372032434e-07, + "loss": 0.5743, + "step": 12183 + }, + { + "epoch": 0.9219477129128675, + "grad_norm": 2.167587995529175, + "learning_rate": 2.9788929554326614e-07, + "loss": 0.7151, + "step": 12184 + }, + { + "epoch": 0.9220233816352011, + "grad_norm": 2.3407418727874756, + "learning_rate": 2.9731491232052466e-07, + "loss": 0.6573, + "step": 12185 + }, + { + "epoch": 0.9220990503575347, + "grad_norm": 2.2735655307769775, + "learning_rate": 2.9674107408807107e-07, + "loss": 0.6146, + "step": 12186 + }, + { + "epoch": 0.9221747190798684, + "grad_norm": 2.0419833660125732, + "learning_rate": 2.961677808818436e-07, + "loss": 0.6306, + "step": 12187 + }, + { + "epoch": 0.9222503878022019, + "grad_norm": 2.499739646911621, + "learning_rate": 2.955950327377396e-07, + "loss": 0.6939, + "step": 12188 + }, + { + "epoch": 0.9223260565245356, + "grad_norm": 2.449592351913452, + "learning_rate": 2.950228296916283e-07, + "loss": 0.834, + "step": 12189 + }, + { + "epoch": 0.9224017252468693, + "grad_norm": 2.2888007164001465, + "learning_rate": 2.944511717793421e-07, + "loss": 0.6675, + "step": 12190 + }, + { + "epoch": 0.9224773939692028, + "grad_norm": 2.9520766735076904, + "learning_rate": 2.938800590366814e-07, + "loss": 0.7078, + "step": 12191 + }, + { + "epoch": 0.9225530626915365, + "grad_norm": 2.2078518867492676, + "learning_rate": 2.9330949149941044e-07, + "loss": 0.6119, + "step": 12192 + }, + { + "epoch": 0.92262873141387, + "grad_norm": 2.88496470451355, + "learning_rate": 2.927394692032598e-07, + "loss": 0.6607, + "step": 12193 + }, + { + "epoch": 0.9227044001362037, + "grad_norm": 2.3544445037841797, + "learning_rate": 2.921699921839258e-07, + "loss": 0.7694, + "step": 12194 + }, + { + "epoch": 0.9227800688585374, + "grad_norm": 2.2602198123931885, + "learning_rate": 2.91601060477073e-07, + "loss": 0.7182, + "step": 12195 + }, + { + "epoch": 0.9228557375808709, + "grad_norm": 2.0758039951324463, + "learning_rate": 2.910326741183269e-07, + "loss": 0.6234, + "step": 12196 + }, + { + "epoch": 0.9229314063032046, + "grad_norm": 2.403632402420044, + "learning_rate": 2.9046483314328296e-07, + "loss": 0.6832, + "step": 12197 + }, + { + "epoch": 0.9230070750255382, + "grad_norm": 2.320176362991333, + "learning_rate": 2.898975375875018e-07, + "loss": 0.7362, + "step": 12198 + }, + { + "epoch": 0.9230827437478718, + "grad_norm": 2.266352891921997, + "learning_rate": 2.89330787486508e-07, + "loss": 0.6202, + "step": 12199 + }, + { + "epoch": 0.9231584124702055, + "grad_norm": 2.6983745098114014, + "learning_rate": 2.887645828757951e-07, + "loss": 0.6643, + "step": 12200 + }, + { + "epoch": 0.923234081192539, + "grad_norm": 3.239264488220215, + "learning_rate": 2.881989237908188e-07, + "loss": 0.6703, + "step": 12201 + }, + { + "epoch": 0.9233097499148727, + "grad_norm": 1.9743820428848267, + "learning_rate": 2.876338102670028e-07, + "loss": 0.6001, + "step": 12202 + }, + { + 
"epoch": 0.9233854186372064, + "grad_norm": 2.7023541927337646, + "learning_rate": 2.8706924233973765e-07, + "loss": 0.6539, + "step": 12203 + }, + { + "epoch": 0.9234610873595399, + "grad_norm": 1.8627772331237793, + "learning_rate": 2.865052200443772e-07, + "loss": 0.6342, + "step": 12204 + }, + { + "epoch": 0.9235367560818736, + "grad_norm": 2.2735352516174316, + "learning_rate": 2.8594174341624216e-07, + "loss": 0.6798, + "step": 12205 + }, + { + "epoch": 0.9236124248042071, + "grad_norm": 2.6565213203430176, + "learning_rate": 2.8537881249062225e-07, + "loss": 0.6429, + "step": 12206 + }, + { + "epoch": 0.9236880935265408, + "grad_norm": 1.921863317489624, + "learning_rate": 2.8481642730276434e-07, + "loss": 0.5624, + "step": 12207 + }, + { + "epoch": 0.9237637622488745, + "grad_norm": 2.8346986770629883, + "learning_rate": 2.8425458788789126e-07, + "loss": 0.6731, + "step": 12208 + }, + { + "epoch": 0.923839430971208, + "grad_norm": 1.9833852052688599, + "learning_rate": 2.8369329428118784e-07, + "loss": 0.7168, + "step": 12209 + }, + { + "epoch": 0.9239150996935417, + "grad_norm": 2.3957440853118896, + "learning_rate": 2.8313254651779997e-07, + "loss": 0.7039, + "step": 12210 + }, + { + "epoch": 0.9239907684158754, + "grad_norm": 2.2904884815216064, + "learning_rate": 2.8257234463284653e-07, + "loss": 0.7522, + "step": 12211 + }, + { + "epoch": 0.9240664371382089, + "grad_norm": 2.137908458709717, + "learning_rate": 2.820126886614085e-07, + "loss": 0.5352, + "step": 12212 + }, + { + "epoch": 0.9241421058605426, + "grad_norm": 2.9669129848480225, + "learning_rate": 2.814535786385338e-07, + "loss": 0.6325, + "step": 12213 + }, + { + "epoch": 0.9242177745828761, + "grad_norm": 4.183269500732422, + "learning_rate": 2.808950145992345e-07, + "loss": 0.7076, + "step": 12214 + }, + { + "epoch": 0.9242934433052098, + "grad_norm": 1.7913990020751953, + "learning_rate": 2.8033699657849056e-07, + "loss": 0.6799, + "step": 12215 + }, + { + "epoch": 0.9243691120275435, + "grad_norm": 1.807224988937378, + "learning_rate": 2.79779524611248e-07, + "loss": 0.5429, + "step": 12216 + }, + { + "epoch": 0.924444780749877, + "grad_norm": 3.026785135269165, + "learning_rate": 2.7922259873241397e-07, + "loss": 0.6853, + "step": 12217 + }, + { + "epoch": 0.9245204494722107, + "grad_norm": 2.0761656761169434, + "learning_rate": 2.786662189768685e-07, + "loss": 0.5143, + "step": 12218 + }, + { + "epoch": 0.9245961181945442, + "grad_norm": 2.615605354309082, + "learning_rate": 2.7811038537945177e-07, + "loss": 0.734, + "step": 12219 + }, + { + "epoch": 0.9246717869168779, + "grad_norm": 2.0705811977386475, + "learning_rate": 2.775550979749739e-07, + "loss": 0.5934, + "step": 12220 + }, + { + "epoch": 0.9247474556392116, + "grad_norm": 2.260209321975708, + "learning_rate": 2.7700035679820714e-07, + "loss": 0.8695, + "step": 12221 + }, + { + "epoch": 0.9248231243615451, + "grad_norm": 2.3986639976501465, + "learning_rate": 2.764461618838906e-07, + "loss": 0.7589, + "step": 12222 + }, + { + "epoch": 0.9248987930838788, + "grad_norm": 2.1753921508789062, + "learning_rate": 2.758925132667326e-07, + "loss": 0.6476, + "step": 12223 + }, + { + "epoch": 0.9249744618062125, + "grad_norm": 1.9226336479187012, + "learning_rate": 2.7533941098140234e-07, + "loss": 0.7568, + "step": 12224 + }, + { + "epoch": 0.925050130528546, + "grad_norm": 2.058516502380371, + "learning_rate": 2.747868550625362e-07, + "loss": 0.6215, + "step": 12225 + }, + { + "epoch": 0.9251257992508797, + "grad_norm": 1.9395172595977783, + 
"learning_rate": 2.742348455447384e-07, + "loss": 0.571, + "step": 12226 + }, + { + "epoch": 0.9252014679732132, + "grad_norm": 2.5205371379852295, + "learning_rate": 2.736833824625774e-07, + "loss": 0.6528, + "step": 12227 + }, + { + "epoch": 0.9252771366955469, + "grad_norm": 2.5392866134643555, + "learning_rate": 2.7313246585058647e-07, + "loss": 0.6829, + "step": 12228 + }, + { + "epoch": 0.9253528054178806, + "grad_norm": 3.4775822162628174, + "learning_rate": 2.7258209574326707e-07, + "loss": 0.5941, + "step": 12229 + }, + { + "epoch": 0.9254284741402141, + "grad_norm": 2.08290433883667, + "learning_rate": 2.7203227217508565e-07, + "loss": 0.586, + "step": 12230 + }, + { + "epoch": 0.9255041428625478, + "grad_norm": 2.0628485679626465, + "learning_rate": 2.714829951804716e-07, + "loss": 0.6772, + "step": 12231 + }, + { + "epoch": 0.9255798115848813, + "grad_norm": 2.3678271770477295, + "learning_rate": 2.709342647938244e-07, + "loss": 0.6361, + "step": 12232 + }, + { + "epoch": 0.925655480307215, + "grad_norm": 2.2154617309570312, + "learning_rate": 2.703860810495057e-07, + "loss": 0.6324, + "step": 12233 + }, + { + "epoch": 0.9257311490295487, + "grad_norm": 2.3665215969085693, + "learning_rate": 2.698384439818479e-07, + "loss": 0.6778, + "step": 12234 + }, + { + "epoch": 0.9258068177518822, + "grad_norm": 2.475618362426758, + "learning_rate": 2.692913536251416e-07, + "loss": 0.5842, + "step": 12235 + }, + { + "epoch": 0.9258824864742159, + "grad_norm": 2.6356213092803955, + "learning_rate": 2.6874481001365035e-07, + "loss": 0.8052, + "step": 12236 + }, + { + "epoch": 0.9259581551965496, + "grad_norm": 2.1997127532958984, + "learning_rate": 2.681988131815989e-07, + "loss": 0.564, + "step": 12237 + }, + { + "epoch": 0.9260338239188831, + "grad_norm": 2.037262201309204, + "learning_rate": 2.676533631631798e-07, + "loss": 0.6311, + "step": 12238 + }, + { + "epoch": 0.9261094926412168, + "grad_norm": 1.9841945171356201, + "learning_rate": 2.6710845999255076e-07, + "loss": 0.6565, + "step": 12239 + }, + { + "epoch": 0.9261851613635503, + "grad_norm": 2.244075298309326, + "learning_rate": 2.6656410370383544e-07, + "loss": 0.614, + "step": 12240 + }, + { + "epoch": 0.926260830085884, + "grad_norm": 2.1905736923217773, + "learning_rate": 2.660202943311246e-07, + "loss": 0.8046, + "step": 12241 + }, + { + "epoch": 0.9263364988082177, + "grad_norm": 2.341736078262329, + "learning_rate": 2.6547703190847105e-07, + "loss": 0.6139, + "step": 12242 + }, + { + "epoch": 0.9264121675305512, + "grad_norm": 2.3669471740722656, + "learning_rate": 2.649343164698965e-07, + "loss": 0.7966, + "step": 12243 + }, + { + "epoch": 0.9264878362528849, + "grad_norm": 2.272200345993042, + "learning_rate": 2.643921480493888e-07, + "loss": 0.6013, + "step": 12244 + }, + { + "epoch": 0.9265635049752184, + "grad_norm": 2.1507675647735596, + "learning_rate": 2.6385052668089784e-07, + "loss": 0.6805, + "step": 12245 + }, + { + "epoch": 0.9266391736975521, + "grad_norm": 2.2726826667785645, + "learning_rate": 2.6330945239834336e-07, + "loss": 0.5546, + "step": 12246 + }, + { + "epoch": 0.9267148424198858, + "grad_norm": 2.1980509757995605, + "learning_rate": 2.6276892523560934e-07, + "loss": 0.6105, + "step": 12247 + }, + { + "epoch": 0.9267905111422193, + "grad_norm": 2.0372631549835205, + "learning_rate": 2.6222894522654375e-07, + "loss": 0.6964, + "step": 12248 + }, + { + "epoch": 0.926866179864553, + "grad_norm": 2.292754888534546, + "learning_rate": 2.6168951240496443e-07, + "loss": 0.761, + "step": 12249 + }, + { + 
"epoch": 0.9269418485868867, + "grad_norm": 2.1714630126953125, + "learning_rate": 2.611506268046494e-07, + "loss": 0.6707, + "step": 12250 + }, + { + "epoch": 0.9270175173092202, + "grad_norm": 2.125967025756836, + "learning_rate": 2.606122884593477e-07, + "loss": 0.6586, + "step": 12251 + }, + { + "epoch": 0.9270931860315539, + "grad_norm": 2.231060266494751, + "learning_rate": 2.6007449740277235e-07, + "loss": 0.5914, + "step": 12252 + }, + { + "epoch": 0.9271688547538874, + "grad_norm": 2.295248031616211, + "learning_rate": 2.5953725366859836e-07, + "loss": 0.6735, + "step": 12253 + }, + { + "epoch": 0.9272445234762211, + "grad_norm": 3.4666316509246826, + "learning_rate": 2.590005572904729e-07, + "loss": 0.6238, + "step": 12254 + }, + { + "epoch": 0.9273201921985548, + "grad_norm": 2.405599355697632, + "learning_rate": 2.5846440830200404e-07, + "loss": 0.7202, + "step": 12255 + }, + { + "epoch": 0.9273958609208883, + "grad_norm": 2.2995810508728027, + "learning_rate": 2.579288067367679e-07, + "loss": 0.6795, + "step": 12256 + }, + { + "epoch": 0.927471529643222, + "grad_norm": 2.8682172298431396, + "learning_rate": 2.5739375262830464e-07, + "loss": 0.5819, + "step": 12257 + }, + { + "epoch": 0.9275471983655555, + "grad_norm": 3.253770589828491, + "learning_rate": 2.5685924601012157e-07, + "loss": 0.6481, + "step": 12258 + }, + { + "epoch": 0.9276228670878892, + "grad_norm": 2.7173924446105957, + "learning_rate": 2.563252869156908e-07, + "loss": 0.4945, + "step": 12259 + }, + { + "epoch": 0.9276985358102229, + "grad_norm": 2.2100868225097656, + "learning_rate": 2.5579187537845164e-07, + "loss": 0.619, + "step": 12260 + }, + { + "epoch": 0.9277742045325564, + "grad_norm": 2.2828118801116943, + "learning_rate": 2.552590114318073e-07, + "loss": 0.783, + "step": 12261 + }, + { + "epoch": 0.9278498732548901, + "grad_norm": 2.5905113220214844, + "learning_rate": 2.5472669510912916e-07, + "loss": 0.6722, + "step": 12262 + }, + { + "epoch": 0.9279255419772238, + "grad_norm": 2.3828647136688232, + "learning_rate": 2.5419492644374855e-07, + "loss": 0.6373, + "step": 12263 + }, + { + "epoch": 0.9280012106995573, + "grad_norm": 2.7237589359283447, + "learning_rate": 2.536637054689698e-07, + "loss": 0.697, + "step": 12264 + }, + { + "epoch": 0.928076879421891, + "grad_norm": 2.399646282196045, + "learning_rate": 2.531330322180593e-07, + "loss": 0.5974, + "step": 12265 + }, + { + "epoch": 0.9281525481442245, + "grad_norm": 2.0155105590820312, + "learning_rate": 2.5260290672424947e-07, + "loss": 0.5291, + "step": 12266 + }, + { + "epoch": 0.9282282168665582, + "grad_norm": 2.4780659675598145, + "learning_rate": 2.5207332902073776e-07, + "loss": 0.6977, + "step": 12267 + }, + { + "epoch": 0.9283038855888919, + "grad_norm": 2.1585497856140137, + "learning_rate": 2.5154429914068764e-07, + "loss": 0.7286, + "step": 12268 + }, + { + "epoch": 0.9283795543112254, + "grad_norm": 2.1875948905944824, + "learning_rate": 2.510158171172296e-07, + "loss": 0.6065, + "step": 12269 + }, + { + "epoch": 0.9284552230335591, + "grad_norm": 2.345557928085327, + "learning_rate": 2.5048788298345926e-07, + "loss": 0.4536, + "step": 12270 + }, + { + "epoch": 0.9285308917558927, + "grad_norm": 2.446751356124878, + "learning_rate": 2.4996049677243703e-07, + "loss": 0.6631, + "step": 12271 + }, + { + "epoch": 0.9286065604782263, + "grad_norm": 2.7235686779022217, + "learning_rate": 2.494336585171896e-07, + "loss": 0.6673, + "step": 12272 + }, + { + "epoch": 0.92868222920056, + "grad_norm": 2.1927380561828613, + "learning_rate": 
2.489073682507105e-07, + "loss": 0.6072, + "step": 12273 + }, + { + "epoch": 0.9287578979228935, + "grad_norm": 2.9181969165802, + "learning_rate": 2.483816260059534e-07, + "loss": 0.6484, + "step": 12274 + }, + { + "epoch": 0.9288335666452272, + "grad_norm": 2.1326992511749268, + "learning_rate": 2.4785643181584696e-07, + "loss": 0.7141, + "step": 12275 + }, + { + "epoch": 0.9289092353675609, + "grad_norm": 2.227208375930786, + "learning_rate": 2.4733178571327887e-07, + "loss": 0.554, + "step": 12276 + }, + { + "epoch": 0.9289849040898944, + "grad_norm": 2.5068063735961914, + "learning_rate": 2.4680768773110383e-07, + "loss": 0.7795, + "step": 12277 + }, + { + "epoch": 0.9290605728122281, + "grad_norm": 2.883193016052246, + "learning_rate": 2.462841379021417e-07, + "loss": 0.6704, + "step": 12278 + }, + { + "epoch": 0.9291362415345616, + "grad_norm": 2.217097520828247, + "learning_rate": 2.4576113625918005e-07, + "loss": 0.6113, + "step": 12279 + }, + { + "epoch": 0.9292119102568953, + "grad_norm": 3.0743801593780518, + "learning_rate": 2.4523868283497186e-07, + "loss": 0.7329, + "step": 12280 + }, + { + "epoch": 0.929287578979229, + "grad_norm": 2.3578438758850098, + "learning_rate": 2.447167776622329e-07, + "loss": 0.7069, + "step": 12281 + }, + { + "epoch": 0.9293632477015625, + "grad_norm": 2.4407949447631836, + "learning_rate": 2.44195420773647e-07, + "loss": 0.6827, + "step": 12282 + }, + { + "epoch": 0.9294389164238962, + "grad_norm": 2.97994065284729, + "learning_rate": 2.4367461220186406e-07, + "loss": 0.572, + "step": 12283 + }, + { + "epoch": 0.9295145851462298, + "grad_norm": 2.102031946182251, + "learning_rate": 2.43154351979498e-07, + "loss": 0.685, + "step": 12284 + }, + { + "epoch": 0.9295902538685634, + "grad_norm": 2.1757426261901855, + "learning_rate": 2.426346401391287e-07, + "loss": 0.6041, + "step": 12285 + }, + { + "epoch": 0.9296659225908971, + "grad_norm": 2.5137274265289307, + "learning_rate": 2.4211547671330423e-07, + "loss": 0.6202, + "step": 12286 + }, + { + "epoch": 0.9297415913132306, + "grad_norm": 2.278620719909668, + "learning_rate": 2.415968617345355e-07, + "loss": 0.7326, + "step": 12287 + }, + { + "epoch": 0.9298172600355643, + "grad_norm": 1.830971360206604, + "learning_rate": 2.410787952352986e-07, + "loss": 0.6772, + "step": 12288 + }, + { + "epoch": 0.929892928757898, + "grad_norm": 2.2332699298858643, + "learning_rate": 2.4056127724803656e-07, + "loss": 0.6575, + "step": 12289 + }, + { + "epoch": 0.9299685974802315, + "grad_norm": 3.241168975830078, + "learning_rate": 2.400443078051604e-07, + "loss": 0.7853, + "step": 12290 + }, + { + "epoch": 0.9300442662025652, + "grad_norm": 1.8960295915603638, + "learning_rate": 2.3952788693904125e-07, + "loss": 0.5753, + "step": 12291 + }, + { + "epoch": 0.9301199349248988, + "grad_norm": 2.006786584854126, + "learning_rate": 2.3901201468202126e-07, + "loss": 0.5916, + "step": 12292 + }, + { + "epoch": 0.9301956036472324, + "grad_norm": 2.6823008060455322, + "learning_rate": 2.3849669106640557e-07, + "loss": 0.7097, + "step": 12293 + }, + { + "epoch": 0.9302712723695661, + "grad_norm": 1.6065900325775146, + "learning_rate": 2.379819161244654e-07, + "loss": 0.5853, + "step": 12294 + }, + { + "epoch": 0.9303469410918996, + "grad_norm": 3.367147207260132, + "learning_rate": 2.3746768988843693e-07, + "loss": 0.7193, + "step": 12295 + }, + { + "epoch": 0.9304226098142333, + "grad_norm": 2.1877822875976562, + "learning_rate": 2.3695401239052338e-07, + "loss": 0.7386, + "step": 12296 + }, + { + "epoch": 
0.9304982785365669, + "grad_norm": 2.3759443759918213, + "learning_rate": 2.3644088366289208e-07, + "loss": 0.6824, + "step": 12297 + }, + { + "epoch": 0.9305739472589005, + "grad_norm": 1.9915658235549927, + "learning_rate": 2.3592830373767925e-07, + "loss": 0.7018, + "step": 12298 + }, + { + "epoch": 0.9306496159812342, + "grad_norm": 2.2175698280334473, + "learning_rate": 2.3541627264698028e-07, + "loss": 0.7714, + "step": 12299 + }, + { + "epoch": 0.9307252847035677, + "grad_norm": 2.8158113956451416, + "learning_rate": 2.349047904228635e-07, + "loss": 0.6239, + "step": 12300 + }, + { + "epoch": 0.9308009534259014, + "grad_norm": 2.3479740619659424, + "learning_rate": 2.3439385709735928e-07, + "loss": 0.7483, + "step": 12301 + }, + { + "epoch": 0.9308766221482351, + "grad_norm": 2.05190372467041, + "learning_rate": 2.3388347270246202e-07, + "loss": 0.5696, + "step": 12302 + }, + { + "epoch": 0.9309522908705686, + "grad_norm": 2.0475738048553467, + "learning_rate": 2.3337363727013515e-07, + "loss": 0.5486, + "step": 12303 + }, + { + "epoch": 0.9310279595929023, + "grad_norm": 2.140483856201172, + "learning_rate": 2.3286435083230618e-07, + "loss": 0.6311, + "step": 12304 + }, + { + "epoch": 0.9311036283152359, + "grad_norm": 2.0714380741119385, + "learning_rate": 2.3235561342086753e-07, + "loss": 0.6138, + "step": 12305 + }, + { + "epoch": 0.9311792970375695, + "grad_norm": 2.3134987354278564, + "learning_rate": 2.3184742506767775e-07, + "loss": 0.6722, + "step": 12306 + }, + { + "epoch": 0.9312549657599032, + "grad_norm": 2.5604090690612793, + "learning_rate": 2.313397858045624e-07, + "loss": 0.708, + "step": 12307 + }, + { + "epoch": 0.9313306344822367, + "grad_norm": 3.3896567821502686, + "learning_rate": 2.30832695663311e-07, + "loss": 0.6125, + "step": 12308 + }, + { + "epoch": 0.9314063032045704, + "grad_norm": 1.9573801755905151, + "learning_rate": 2.303261546756802e-07, + "loss": 0.5785, + "step": 12309 + }, + { + "epoch": 0.931481971926904, + "grad_norm": 1.7080085277557373, + "learning_rate": 2.298201628733876e-07, + "loss": 0.7792, + "step": 12310 + }, + { + "epoch": 0.9315576406492376, + "grad_norm": 1.9010158777236938, + "learning_rate": 2.2931472028812384e-07, + "loss": 0.5984, + "step": 12311 + }, + { + "epoch": 0.9316333093715713, + "grad_norm": 2.1762309074401855, + "learning_rate": 2.2880982695154162e-07, + "loss": 0.6983, + "step": 12312 + }, + { + "epoch": 0.9317089780939048, + "grad_norm": 1.8109301328659058, + "learning_rate": 2.2830548289525666e-07, + "loss": 0.6687, + "step": 12313 + }, + { + "epoch": 0.9317846468162385, + "grad_norm": 1.976968765258789, + "learning_rate": 2.2780168815085267e-07, + "loss": 0.6647, + "step": 12314 + }, + { + "epoch": 0.9318603155385722, + "grad_norm": 3.7262067794799805, + "learning_rate": 2.2729844274987942e-07, + "loss": 0.6439, + "step": 12315 + }, + { + "epoch": 0.9319359842609057, + "grad_norm": 1.9241440296173096, + "learning_rate": 2.2679574672385272e-07, + "loss": 0.7134, + "step": 12316 + }, + { + "epoch": 0.9320116529832394, + "grad_norm": 2.4006447792053223, + "learning_rate": 2.2629360010425237e-07, + "loss": 0.598, + "step": 12317 + }, + { + "epoch": 0.932087321705573, + "grad_norm": 2.9187188148498535, + "learning_rate": 2.2579200292252422e-07, + "loss": 0.7099, + "step": 12318 + }, + { + "epoch": 0.9321629904279066, + "grad_norm": 2.171943426132202, + "learning_rate": 2.2529095521008114e-07, + "loss": 0.6414, + "step": 12319 + }, + { + "epoch": 0.9322386591502403, + "grad_norm": 2.2062504291534424, + "learning_rate": 
2.2479045699829803e-07, + "loss": 0.653, + "step": 12320 + }, + { + "epoch": 0.9323143278725738, + "grad_norm": 2.55442476272583, + "learning_rate": 2.2429050831851882e-07, + "loss": 0.6587, + "step": 12321 + }, + { + "epoch": 0.9323899965949075, + "grad_norm": 2.2659802436828613, + "learning_rate": 2.2379110920205248e-07, + "loss": 0.6878, + "step": 12322 + }, + { + "epoch": 0.9324656653172411, + "grad_norm": 3.343369245529175, + "learning_rate": 2.2329225968017296e-07, + "loss": 0.7144, + "step": 12323 + }, + { + "epoch": 0.9325413340395747, + "grad_norm": 1.9788143634796143, + "learning_rate": 2.2279395978411932e-07, + "loss": 0.5868, + "step": 12324 + }, + { + "epoch": 0.9326170027619084, + "grad_norm": 3.297405481338501, + "learning_rate": 2.2229620954509554e-07, + "loss": 0.6762, + "step": 12325 + }, + { + "epoch": 0.932692671484242, + "grad_norm": 2.0477468967437744, + "learning_rate": 2.2179900899427574e-07, + "loss": 0.5926, + "step": 12326 + }, + { + "epoch": 0.9327683402065756, + "grad_norm": 2.3132870197296143, + "learning_rate": 2.21302358162793e-07, + "loss": 0.6212, + "step": 12327 + }, + { + "epoch": 0.9328440089289093, + "grad_norm": 2.395486354827881, + "learning_rate": 2.208062570817514e-07, + "loss": 0.664, + "step": 12328 + }, + { + "epoch": 0.9329196776512428, + "grad_norm": 2.0944366455078125, + "learning_rate": 2.2031070578221612e-07, + "loss": 0.6979, + "step": 12329 + }, + { + "epoch": 0.9329953463735765, + "grad_norm": 2.699592351913452, + "learning_rate": 2.1981570429522134e-07, + "loss": 0.6174, + "step": 12330 + }, + { + "epoch": 0.9330710150959101, + "grad_norm": 2.9009218215942383, + "learning_rate": 2.1932125265176628e-07, + "loss": 0.7047, + "step": 12331 + }, + { + "epoch": 0.9331466838182437, + "grad_norm": 2.090672254562378, + "learning_rate": 2.1882735088281414e-07, + "loss": 0.7085, + "step": 12332 + }, + { + "epoch": 0.9332223525405774, + "grad_norm": 2.3523049354553223, + "learning_rate": 2.1833399901929618e-07, + "loss": 0.5658, + "step": 12333 + }, + { + "epoch": 0.933298021262911, + "grad_norm": 1.9913511276245117, + "learning_rate": 2.178411970921057e-07, + "loss": 0.6611, + "step": 12334 + }, + { + "epoch": 0.9333736899852446, + "grad_norm": 2.0843513011932373, + "learning_rate": 2.1734894513210303e-07, + "loss": 0.64, + "step": 12335 + }, + { + "epoch": 0.9334493587075782, + "grad_norm": 2.296250343322754, + "learning_rate": 2.1685724317011746e-07, + "loss": 0.6017, + "step": 12336 + }, + { + "epoch": 0.9335250274299118, + "grad_norm": 2.035994291305542, + "learning_rate": 2.163660912369404e-07, + "loss": 0.6316, + "step": 12337 + }, + { + "epoch": 0.9336006961522455, + "grad_norm": 2.133603096008301, + "learning_rate": 2.1587548936332723e-07, + "loss": 0.7112, + "step": 12338 + }, + { + "epoch": 0.9336763648745791, + "grad_norm": 9.666108131408691, + "learning_rate": 2.1538543758000239e-07, + "loss": 0.6514, + "step": 12339 + }, + { + "epoch": 0.9337520335969127, + "grad_norm": 2.0413687229156494, + "learning_rate": 2.1489593591765434e-07, + "loss": 0.7346, + "step": 12340 + }, + { + "epoch": 0.9338277023192464, + "grad_norm": 7.327420711517334, + "learning_rate": 2.144069844069365e-07, + "loss": 0.6501, + "step": 12341 + }, + { + "epoch": 0.9339033710415799, + "grad_norm": 2.2991442680358887, + "learning_rate": 2.1391858307847045e-07, + "loss": 0.7028, + "step": 12342 + }, + { + "epoch": 0.9339790397639136, + "grad_norm": 10.1016845703125, + "learning_rate": 2.134307319628397e-07, + "loss": 0.6521, + "step": 12343 + }, + { + "epoch": 
0.9340547084862472, + "grad_norm": 1.8840327262878418, + "learning_rate": 2.1294343109059677e-07, + "loss": 0.6156, + "step": 12344 + }, + { + "epoch": 0.9341303772085808, + "grad_norm": 1.7931628227233887, + "learning_rate": 2.124566804922563e-07, + "loss": 0.632, + "step": 12345 + }, + { + "epoch": 0.9342060459309145, + "grad_norm": 2.3598361015319824, + "learning_rate": 2.119704801982999e-07, + "loss": 0.6864, + "step": 12346 + }, + { + "epoch": 0.934281714653248, + "grad_norm": 1.6346832513809204, + "learning_rate": 2.114848302391772e-07, + "loss": 0.5736, + "step": 12347 + }, + { + "epoch": 0.9343573833755817, + "grad_norm": 2.1589810848236084, + "learning_rate": 2.1099973064529987e-07, + "loss": 0.657, + "step": 12348 + }, + { + "epoch": 0.9344330520979153, + "grad_norm": 3.102557897567749, + "learning_rate": 2.1051518144704562e-07, + "loss": 0.6275, + "step": 12349 + }, + { + "epoch": 0.9345087208202489, + "grad_norm": 2.09216046333313, + "learning_rate": 2.100311826747602e-07, + "loss": 0.7042, + "step": 12350 + }, + { + "epoch": 0.9345843895425826, + "grad_norm": 1.9065994024276733, + "learning_rate": 2.095477343587513e-07, + "loss": 0.6763, + "step": 12351 + }, + { + "epoch": 0.9346600582649162, + "grad_norm": 2.5319108963012695, + "learning_rate": 2.0906483652929576e-07, + "loss": 0.6599, + "step": 12352 + }, + { + "epoch": 0.9347357269872498, + "grad_norm": 2.6786136627197266, + "learning_rate": 2.0858248921663337e-07, + "loss": 0.7684, + "step": 12353 + }, + { + "epoch": 0.9348113957095835, + "grad_norm": 2.0246286392211914, + "learning_rate": 2.0810069245097097e-07, + "loss": 0.7829, + "step": 12354 + }, + { + "epoch": 0.934887064431917, + "grad_norm": 2.1122617721557617, + "learning_rate": 2.0761944626247942e-07, + "loss": 0.7041, + "step": 12355 + }, + { + "epoch": 0.9349627331542507, + "grad_norm": 2.1690616607666016, + "learning_rate": 2.0713875068129563e-07, + "loss": 0.6317, + "step": 12356 + }, + { + "epoch": 0.9350384018765843, + "grad_norm": 2.203425407409668, + "learning_rate": 2.066586057375225e-07, + "loss": 0.7214, + "step": 12357 + }, + { + "epoch": 0.9351140705989179, + "grad_norm": 2.0501766204833984, + "learning_rate": 2.0617901146122998e-07, + "loss": 0.595, + "step": 12358 + }, + { + "epoch": 0.9351897393212516, + "grad_norm": 3.1031334400177, + "learning_rate": 2.0569996788245005e-07, + "loss": 0.6128, + "step": 12359 + }, + { + "epoch": 0.9352654080435852, + "grad_norm": 2.129321336746216, + "learning_rate": 2.052214750311817e-07, + "loss": 0.6836, + "step": 12360 + }, + { + "epoch": 0.9353410767659188, + "grad_norm": 2.268303871154785, + "learning_rate": 2.04743532937391e-07, + "loss": 0.6845, + "step": 12361 + }, + { + "epoch": 0.9354167454882524, + "grad_norm": 2.151488780975342, + "learning_rate": 2.0426614163100698e-07, + "loss": 0.6448, + "step": 12362 + }, + { + "epoch": 0.935492414210586, + "grad_norm": 2.471512794494629, + "learning_rate": 2.0378930114192572e-07, + "loss": 0.6199, + "step": 12363 + }, + { + "epoch": 0.9355680829329197, + "grad_norm": 2.2034783363342285, + "learning_rate": 2.0331301150000935e-07, + "loss": 0.6998, + "step": 12364 + }, + { + "epoch": 0.9356437516552533, + "grad_norm": 2.128675937652588, + "learning_rate": 2.02837272735085e-07, + "loss": 0.7106, + "step": 12365 + }, + { + "epoch": 0.9357194203775869, + "grad_norm": 2.1250526905059814, + "learning_rate": 2.0236208487694285e-07, + "loss": 0.7074, + "step": 12366 + }, + { + "epoch": 0.9357950890999206, + "grad_norm": 3.409339189529419, + "learning_rate": 
2.018874479553421e-07, + "loss": 0.6652, + "step": 12367 + }, + { + "epoch": 0.9358707578222542, + "grad_norm": 2.3269472122192383, + "learning_rate": 2.0141336200000592e-07, + "loss": 0.5631, + "step": 12368 + }, + { + "epoch": 0.9359464265445878, + "grad_norm": 2.5745363235473633, + "learning_rate": 2.0093982704062463e-07, + "loss": 0.6937, + "step": 12369 + }, + { + "epoch": 0.9360220952669214, + "grad_norm": 2.068528413772583, + "learning_rate": 2.0046684310684948e-07, + "loss": 0.7959, + "step": 12370 + }, + { + "epoch": 0.936097763989255, + "grad_norm": 2.4575674533843994, + "learning_rate": 1.9999441022830078e-07, + "loss": 0.6793, + "step": 12371 + }, + { + "epoch": 0.9361734327115887, + "grad_norm": 1.9595268964767456, + "learning_rate": 1.9952252843456685e-07, + "loss": 0.8498, + "step": 12372 + }, + { + "epoch": 0.9362491014339223, + "grad_norm": 3.2010905742645264, + "learning_rate": 1.990511977551951e-07, + "loss": 0.6914, + "step": 12373 + }, + { + "epoch": 0.9363247701562559, + "grad_norm": 2.2969300746917725, + "learning_rate": 1.9858041821970386e-07, + "loss": 0.6583, + "step": 12374 + }, + { + "epoch": 0.9364004388785895, + "grad_norm": 2.4109859466552734, + "learning_rate": 1.9811018985757357e-07, + "loss": 0.7333, + "step": 12375 + }, + { + "epoch": 0.9364761076009231, + "grad_norm": 1.9935057163238525, + "learning_rate": 1.9764051269825168e-07, + "loss": 0.6399, + "step": 12376 + }, + { + "epoch": 0.9365517763232568, + "grad_norm": 2.352203130722046, + "learning_rate": 1.9717138677115164e-07, + "loss": 0.6009, + "step": 12377 + }, + { + "epoch": 0.9366274450455904, + "grad_norm": 2.5235226154327393, + "learning_rate": 1.96702812105651e-07, + "loss": 0.6322, + "step": 12378 + }, + { + "epoch": 0.936703113767924, + "grad_norm": 2.3327648639678955, + "learning_rate": 1.9623478873109424e-07, + "loss": 0.6441, + "step": 12379 + }, + { + "epoch": 0.9367787824902577, + "grad_norm": 2.742035388946533, + "learning_rate": 1.9576731667678993e-07, + "loss": 0.6182, + "step": 12380 + }, + { + "epoch": 0.9368544512125913, + "grad_norm": 3.735501766204834, + "learning_rate": 1.9530039597201066e-07, + "loss": 0.6479, + "step": 12381 + }, + { + "epoch": 0.9369301199349249, + "grad_norm": 7.853936195373535, + "learning_rate": 1.9483402664600002e-07, + "loss": 0.7149, + "step": 12382 + }, + { + "epoch": 0.9370057886572585, + "grad_norm": 2.559588670730591, + "learning_rate": 1.9436820872796169e-07, + "loss": 0.5942, + "step": 12383 + }, + { + "epoch": 0.9370814573795921, + "grad_norm": 2.3932056427001953, + "learning_rate": 1.939029422470673e-07, + "loss": 0.6378, + "step": 12384 + }, + { + "epoch": 0.9371571261019258, + "grad_norm": 2.621971368789673, + "learning_rate": 1.9343822723245251e-07, + "loss": 0.7658, + "step": 12385 + }, + { + "epoch": 0.9372327948242594, + "grad_norm": 2.2142841815948486, + "learning_rate": 1.9297406371322012e-07, + "loss": 0.67, + "step": 12386 + }, + { + "epoch": 0.937308463546593, + "grad_norm": 2.741013526916504, + "learning_rate": 1.9251045171843684e-07, + "loss": 0.8169, + "step": 12387 + }, + { + "epoch": 0.9373841322689266, + "grad_norm": 2.1864612102508545, + "learning_rate": 1.9204739127713644e-07, + "loss": 0.6494, + "step": 12388 + }, + { + "epoch": 0.9374598009912603, + "grad_norm": 2.0697124004364014, + "learning_rate": 1.9158488241831672e-07, + "loss": 0.8498, + "step": 12389 + }, + { + "epoch": 0.9375354697135939, + "grad_norm": 2.364565372467041, + "learning_rate": 1.9112292517094255e-07, + "loss": 0.8094, + "step": 12390 + }, + { + "epoch": 
0.9376111384359275, + "grad_norm": 2.1771390438079834, + "learning_rate": 1.9066151956394074e-07, + "loss": 0.7019, + "step": 12391 + }, + { + "epoch": 0.9376868071582611, + "grad_norm": 2.1345317363739014, + "learning_rate": 1.902006656262062e-07, + "loss": 0.5427, + "step": 12392 + }, + { + "epoch": 0.9377624758805948, + "grad_norm": 2.93278431892395, + "learning_rate": 1.8974036338660283e-07, + "loss": 0.6624, + "step": 12393 + }, + { + "epoch": 0.9378381446029284, + "grad_norm": 2.2009384632110596, + "learning_rate": 1.892806128739526e-07, + "loss": 0.7585, + "step": 12394 + }, + { + "epoch": 0.937913813325262, + "grad_norm": 2.6381959915161133, + "learning_rate": 1.8882141411704845e-07, + "loss": 0.8012, + "step": 12395 + }, + { + "epoch": 0.9379894820475956, + "grad_norm": 2.5263381004333496, + "learning_rate": 1.883627671446454e-07, + "loss": 0.6025, + "step": 12396 + }, + { + "epoch": 0.9380651507699292, + "grad_norm": 2.298753261566162, + "learning_rate": 1.8790467198546647e-07, + "loss": 0.687, + "step": 12397 + }, + { + "epoch": 0.9381408194922629, + "grad_norm": 3.0033249855041504, + "learning_rate": 1.8744712866819768e-07, + "loss": 0.8006, + "step": 12398 + }, + { + "epoch": 0.9382164882145965, + "grad_norm": 1.8784987926483154, + "learning_rate": 1.8699013722149417e-07, + "loss": 0.6037, + "step": 12399 + }, + { + "epoch": 0.9382921569369301, + "grad_norm": 2.2286734580993652, + "learning_rate": 1.8653369767397298e-07, + "loss": 0.7111, + "step": 12400 + }, + { + "epoch": 0.9383678256592637, + "grad_norm": 2.3188793659210205, + "learning_rate": 1.8607781005421832e-07, + "loss": 0.6128, + "step": 12401 + }, + { + "epoch": 0.9384434943815974, + "grad_norm": 2.3316099643707275, + "learning_rate": 1.856224743907773e-07, + "loss": 0.6353, + "step": 12402 + }, + { + "epoch": 0.938519163103931, + "grad_norm": 2.368476629257202, + "learning_rate": 1.851676907121671e-07, + "loss": 0.6556, + "step": 12403 + }, + { + "epoch": 0.9385948318262646, + "grad_norm": 2.1538264751434326, + "learning_rate": 1.8471345904686699e-07, + "loss": 0.6717, + "step": 12404 + }, + { + "epoch": 0.9386705005485982, + "grad_norm": 2.1779584884643555, + "learning_rate": 1.8425977942332118e-07, + "loss": 0.7298, + "step": 12405 + }, + { + "epoch": 0.9387461692709319, + "grad_norm": 11.318231582641602, + "learning_rate": 1.8380665186994294e-07, + "loss": 0.708, + "step": 12406 + }, + { + "epoch": 0.9388218379932655, + "grad_norm": 2.0828261375427246, + "learning_rate": 1.833540764151056e-07, + "loss": 0.6225, + "step": 12407 + }, + { + "epoch": 0.9388975067155991, + "grad_norm": 10.891009330749512, + "learning_rate": 1.8290205308715346e-07, + "loss": 0.634, + "step": 12408 + }, + { + "epoch": 0.9389731754379327, + "grad_norm": 3.8643298149108887, + "learning_rate": 1.824505819143929e-07, + "loss": 0.7067, + "step": 12409 + }, + { + "epoch": 0.9390488441602663, + "grad_norm": 3.798006534576416, + "learning_rate": 1.819996629250953e-07, + "loss": 0.876, + "step": 12410 + }, + { + "epoch": 0.9391245128826, + "grad_norm": 1.9390292167663574, + "learning_rate": 1.8154929614750004e-07, + "loss": 0.6763, + "step": 12411 + }, + { + "epoch": 0.9392001816049336, + "grad_norm": 2.075162172317505, + "learning_rate": 1.810994816098106e-07, + "loss": 0.6476, + "step": 12412 + }, + { + "epoch": 0.9392758503272672, + "grad_norm": 2.536733865737915, + "learning_rate": 1.8065021934019542e-07, + "loss": 0.6086, + "step": 12413 + }, + { + "epoch": 0.9393515190496009, + "grad_norm": 2.2751870155334473, + "learning_rate": 
1.8020150936678804e-07, + "loss": 0.7458, + "step": 12414 + }, + { + "epoch": 0.9394271877719345, + "grad_norm": 2.1633825302124023, + "learning_rate": 1.7975335171768992e-07, + "loss": 0.727, + "step": 12415 + }, + { + "epoch": 0.9395028564942681, + "grad_norm": 1.8044906854629517, + "learning_rate": 1.7930574642096464e-07, + "loss": 0.5019, + "step": 12416 + }, + { + "epoch": 0.9395785252166017, + "grad_norm": 2.0498223304748535, + "learning_rate": 1.788586935046428e-07, + "loss": 0.6854, + "step": 12417 + }, + { + "epoch": 0.9396541939389353, + "grad_norm": 1.9652925729751587, + "learning_rate": 1.7841219299672096e-07, + "loss": 0.7548, + "step": 12418 + }, + { + "epoch": 0.939729862661269, + "grad_norm": 2.0633480548858643, + "learning_rate": 1.7796624492515978e-07, + "loss": 0.6056, + "step": 12419 + }, + { + "epoch": 0.9398055313836026, + "grad_norm": 2.8833742141723633, + "learning_rate": 1.775208493178869e-07, + "loss": 0.657, + "step": 12420 + }, + { + "epoch": 0.9398812001059362, + "grad_norm": 3.0172793865203857, + "learning_rate": 1.7707600620279307e-07, + "loss": 0.7535, + "step": 12421 + }, + { + "epoch": 0.9399568688282698, + "grad_norm": 2.2233293056488037, + "learning_rate": 1.7663171560773694e-07, + "loss": 0.7763, + "step": 12422 + }, + { + "epoch": 0.9400325375506035, + "grad_norm": 1.8882126808166504, + "learning_rate": 1.761879775605403e-07, + "loss": 0.718, + "step": 12423 + }, + { + "epoch": 0.9401082062729371, + "grad_norm": 2.3364417552948, + "learning_rate": 1.7574479208899286e-07, + "loss": 0.6638, + "step": 12424 + }, + { + "epoch": 0.9401838749952707, + "grad_norm": 2.688260316848755, + "learning_rate": 1.7530215922084646e-07, + "loss": 0.6975, + "step": 12425 + }, + { + "epoch": 0.9402595437176043, + "grad_norm": 1.90416419506073, + "learning_rate": 1.7486007898382393e-07, + "loss": 0.607, + "step": 12426 + }, + { + "epoch": 0.940335212439938, + "grad_norm": 2.5654749870300293, + "learning_rate": 1.7441855140560515e-07, + "loss": 0.709, + "step": 12427 + }, + { + "epoch": 0.9404108811622716, + "grad_norm": 2.002608060836792, + "learning_rate": 1.7397757651384194e-07, + "loss": 0.5844, + "step": 12428 + }, + { + "epoch": 0.9404865498846052, + "grad_norm": 1.7451132535934448, + "learning_rate": 1.7353715433615125e-07, + "loss": 0.6631, + "step": 12429 + }, + { + "epoch": 0.9405622186069388, + "grad_norm": 2.49381685256958, + "learning_rate": 1.73097284900111e-07, + "loss": 0.6062, + "step": 12430 + }, + { + "epoch": 0.9406378873292724, + "grad_norm": 3.437695264816284, + "learning_rate": 1.726579682332682e-07, + "loss": 0.6267, + "step": 12431 + }, + { + "epoch": 0.9407135560516061, + "grad_norm": 4.435547828674316, + "learning_rate": 1.7221920436313577e-07, + "loss": 0.65, + "step": 12432 + }, + { + "epoch": 0.9407892247739397, + "grad_norm": 2.160989761352539, + "learning_rate": 1.7178099331718776e-07, + "loss": 0.6809, + "step": 12433 + }, + { + "epoch": 0.9408648934962733, + "grad_norm": 1.8670154809951782, + "learning_rate": 1.7134333512286925e-07, + "loss": 0.6438, + "step": 12434 + }, + { + "epoch": 0.9409405622186069, + "grad_norm": 2.766958713531494, + "learning_rate": 1.709062298075853e-07, + "loss": 0.6525, + "step": 12435 + }, + { + "epoch": 0.9410162309409406, + "grad_norm": 2.01920747756958, + "learning_rate": 1.70469677398711e-07, + "loss": 0.7654, + "step": 12436 + }, + { + "epoch": 0.9410918996632742, + "grad_norm": 2.5431160926818848, + "learning_rate": 1.700336779235835e-07, + "loss": 0.6332, + "step": 12437 + }, + { + "epoch": 
0.9411675683856078, + "grad_norm": 2.6801328659057617, + "learning_rate": 1.695982314095059e-07, + "loss": 0.6775, + "step": 12438 + }, + { + "epoch": 0.9412432371079414, + "grad_norm": 2.0385563373565674, + "learning_rate": 1.6916333788374849e-07, + "loss": 0.8871, + "step": 12439 + }, + { + "epoch": 0.9413189058302751, + "grad_norm": 1.3202354907989502, + "learning_rate": 1.687289973735454e-07, + "loss": 0.7429, + "step": 12440 + }, + { + "epoch": 0.9413945745526087, + "grad_norm": 2.1803438663482666, + "learning_rate": 1.6829520990609592e-07, + "loss": 0.6388, + "step": 12441 + }, + { + "epoch": 0.9414702432749423, + "grad_norm": 2.1765198707580566, + "learning_rate": 1.678619755085663e-07, + "loss": 0.6521, + "step": 12442 + }, + { + "epoch": 0.9415459119972759, + "grad_norm": 2.3797476291656494, + "learning_rate": 1.6742929420808584e-07, + "loss": 0.7488, + "step": 12443 + }, + { + "epoch": 0.9416215807196096, + "grad_norm": 2.2045605182647705, + "learning_rate": 1.6699716603175086e-07, + "loss": 0.7189, + "step": 12444 + }, + { + "epoch": 0.9416972494419432, + "grad_norm": 2.272555351257324, + "learning_rate": 1.6656559100662272e-07, + "loss": 0.6715, + "step": 12445 + }, + { + "epoch": 0.9417729181642768, + "grad_norm": 2.058490514755249, + "learning_rate": 1.661345691597288e-07, + "loss": 0.6625, + "step": 12446 + }, + { + "epoch": 0.9418485868866104, + "grad_norm": 4.0172038078308105, + "learning_rate": 1.657041005180605e-07, + "loss": 0.7327, + "step": 12447 + }, + { + "epoch": 0.941924255608944, + "grad_norm": 2.1872177124023438, + "learning_rate": 1.6527418510857328e-07, + "loss": 0.7891, + "step": 12448 + }, + { + "epoch": 0.9419999243312777, + "grad_norm": 1.810878038406372, + "learning_rate": 1.6484482295819258e-07, + "loss": 0.5745, + "step": 12449 + }, + { + "epoch": 0.9420755930536113, + "grad_norm": 2.1320366859436035, + "learning_rate": 1.6441601409380591e-07, + "loss": 0.7669, + "step": 12450 + }, + { + "epoch": 0.9421512617759449, + "grad_norm": 2.2419726848602295, + "learning_rate": 1.6398775854226578e-07, + "loss": 0.6414, + "step": 12451 + }, + { + "epoch": 0.9422269304982785, + "grad_norm": 1.9634507894515991, + "learning_rate": 1.6356005633039074e-07, + "loss": 0.7198, + "step": 12452 + }, + { + "epoch": 0.9423025992206122, + "grad_norm": 2.088675022125244, + "learning_rate": 1.6313290748496534e-07, + "loss": 0.5175, + "step": 12453 + }, + { + "epoch": 0.9423782679429458, + "grad_norm": 2.3254096508026123, + "learning_rate": 1.6270631203274023e-07, + "loss": 0.5991, + "step": 12454 + }, + { + "epoch": 0.9424539366652794, + "grad_norm": 1.976793646812439, + "learning_rate": 1.62280270000429e-07, + "loss": 0.6158, + "step": 12455 + }, + { + "epoch": 0.942529605387613, + "grad_norm": 1.8767503499984741, + "learning_rate": 1.6185478141471132e-07, + "loss": 0.6511, + "step": 12456 + }, + { + "epoch": 0.9426052741099467, + "grad_norm": 2.4342007637023926, + "learning_rate": 1.614298463022339e-07, + "loss": 0.7071, + "step": 12457 + }, + { + "epoch": 0.9426809428322803, + "grad_norm": 1.5997973680496216, + "learning_rate": 1.6100546468960642e-07, + "loss": 0.7338, + "step": 12458 + }, + { + "epoch": 0.9427566115546139, + "grad_norm": 1.8619414567947388, + "learning_rate": 1.6058163660340563e-07, + "loss": 0.7648, + "step": 12459 + }, + { + "epoch": 0.9428322802769475, + "grad_norm": 1.8532341718673706, + "learning_rate": 1.601583620701733e-07, + "loss": 0.7516, + "step": 12460 + }, + { + "epoch": 0.9429079489992811, + "grad_norm": 2.581843614578247, + "learning_rate": 
1.5973564111641625e-07, + "loss": 0.6256, + "step": 12461 + }, + { + "epoch": 0.9429836177216148, + "grad_norm": 2.2200398445129395, + "learning_rate": 1.5931347376860528e-07, + "loss": 0.6617, + "step": 12462 + }, + { + "epoch": 0.9430592864439484, + "grad_norm": 3.4904468059539795, + "learning_rate": 1.5889186005317923e-07, + "loss": 0.6086, + "step": 12463 + }, + { + "epoch": 0.943134955166282, + "grad_norm": 3.9420273303985596, + "learning_rate": 1.5847079999654e-07, + "loss": 0.6135, + "step": 12464 + }, + { + "epoch": 0.9432106238886157, + "grad_norm": 1.8716062307357788, + "learning_rate": 1.5805029362505652e-07, + "loss": 0.6254, + "step": 12465 + }, + { + "epoch": 0.9432862926109493, + "grad_norm": 2.7966954708099365, + "learning_rate": 1.5763034096506167e-07, + "loss": 0.601, + "step": 12466 + }, + { + "epoch": 0.9433619613332829, + "grad_norm": 1.860151767730713, + "learning_rate": 1.5721094204285547e-07, + "loss": 0.6642, + "step": 12467 + }, + { + "epoch": 0.9434376300556165, + "grad_norm": 2.0277626514434814, + "learning_rate": 1.5679209688470087e-07, + "loss": 0.754, + "step": 12468 + }, + { + "epoch": 0.9435132987779501, + "grad_norm": 1.7337976694107056, + "learning_rate": 1.563738055168269e-07, + "loss": 0.6579, + "step": 12469 + }, + { + "epoch": 0.9435889675002838, + "grad_norm": 2.3716025352478027, + "learning_rate": 1.559560679654296e-07, + "loss": 0.6865, + "step": 12470 + }, + { + "epoch": 0.9436646362226174, + "grad_norm": 2.448322057723999, + "learning_rate": 1.5553888425666806e-07, + "loss": 0.693, + "step": 12471 + }, + { + "epoch": 0.943740304944951, + "grad_norm": 2.229336977005005, + "learning_rate": 1.551222544166684e-07, + "loss": 0.6458, + "step": 12472 + }, + { + "epoch": 0.9438159736672846, + "grad_norm": 2.3972370624542236, + "learning_rate": 1.5470617847152068e-07, + "loss": 0.697, + "step": 12473 + }, + { + "epoch": 0.9438916423896182, + "grad_norm": 5.422283172607422, + "learning_rate": 1.5429065644728113e-07, + "loss": 0.6889, + "step": 12474 + }, + { + "epoch": 0.9439673111119519, + "grad_norm": 2.0572350025177, + "learning_rate": 1.538756883699719e-07, + "loss": 0.7075, + "step": 12475 + }, + { + "epoch": 0.9440429798342855, + "grad_norm": 3.023061990737915, + "learning_rate": 1.5346127426557822e-07, + "loss": 0.7228, + "step": 12476 + }, + { + "epoch": 0.9441186485566191, + "grad_norm": 2.5621566772460938, + "learning_rate": 1.530474141600523e-07, + "loss": 0.6039, + "step": 12477 + }, + { + "epoch": 0.9441943172789528, + "grad_norm": 2.2269248962402344, + "learning_rate": 1.5263410807931244e-07, + "loss": 0.6546, + "step": 12478 + }, + { + "epoch": 0.9442699860012864, + "grad_norm": 2.0988047122955322, + "learning_rate": 1.5222135604924093e-07, + "loss": 0.5909, + "step": 12479 + }, + { + "epoch": 0.94434565472362, + "grad_norm": 2.0287554264068604, + "learning_rate": 1.5180915809568507e-07, + "loss": 0.7294, + "step": 12480 + }, + { + "epoch": 0.9444213234459536, + "grad_norm": 2.013700008392334, + "learning_rate": 1.5139751424445726e-07, + "loss": 0.7171, + "step": 12481 + }, + { + "epoch": 0.9444969921682872, + "grad_norm": 2.1935606002807617, + "learning_rate": 1.5098642452133883e-07, + "loss": 0.6523, + "step": 12482 + }, + { + "epoch": 0.9445726608906209, + "grad_norm": 1.8394144773483276, + "learning_rate": 1.505758889520702e-07, + "loss": 0.6384, + "step": 12483 + }, + { + "epoch": 0.9446483296129545, + "grad_norm": 2.9377388954162598, + "learning_rate": 1.5016590756236183e-07, + "loss": 0.646, + "step": 12484 + }, + { + "epoch": 
0.9447239983352881, + "grad_norm": 2.116981267929077, + "learning_rate": 1.4975648037788914e-07, + "loss": 0.6948, + "step": 12485 + }, + { + "epoch": 0.9447996670576218, + "grad_norm": 2.6826493740081787, + "learning_rate": 1.4934760742429066e-07, + "loss": 0.5801, + "step": 12486 + }, + { + "epoch": 0.9448753357799553, + "grad_norm": 2.8828585147857666, + "learning_rate": 1.489392887271709e-07, + "loss": 0.641, + "step": 12487 + }, + { + "epoch": 0.944951004502289, + "grad_norm": 1.9045580625534058, + "learning_rate": 1.4853152431210138e-07, + "loss": 0.5268, + "step": 12488 + }, + { + "epoch": 0.9450266732246226, + "grad_norm": 6.540375709533691, + "learning_rate": 1.481243142046157e-07, + "loss": 0.6434, + "step": 12489 + }, + { + "epoch": 0.9451023419469562, + "grad_norm": 1.9567054510116577, + "learning_rate": 1.4771765843021746e-07, + "loss": 0.6509, + "step": 12490 + }, + { + "epoch": 0.9451780106692899, + "grad_norm": 4.807526111602783, + "learning_rate": 1.4731155701437028e-07, + "loss": 0.6686, + "step": 12491 + }, + { + "epoch": 0.9452536793916235, + "grad_norm": 2.3727920055389404, + "learning_rate": 1.469060099825068e-07, + "loss": 0.7862, + "step": 12492 + }, + { + "epoch": 0.9453293481139571, + "grad_norm": 2.6744470596313477, + "learning_rate": 1.4650101736002374e-07, + "loss": 0.6864, + "step": 12493 + }, + { + "epoch": 0.9454050168362907, + "grad_norm": 2.174910306930542, + "learning_rate": 1.460965791722808e-07, + "loss": 0.4797, + "step": 12494 + }, + { + "epoch": 0.9454806855586243, + "grad_norm": 2.039815664291382, + "learning_rate": 1.4569269544460872e-07, + "loss": 0.621, + "step": 12495 + }, + { + "epoch": 0.945556354280958, + "grad_norm": 2.9486753940582275, + "learning_rate": 1.4528936620229826e-07, + "loss": 0.7348, + "step": 12496 + }, + { + "epoch": 0.9456320230032916, + "grad_norm": 2.2600271701812744, + "learning_rate": 1.4488659147060723e-07, + "loss": 0.6521, + "step": 12497 + }, + { + "epoch": 0.9457076917256252, + "grad_norm": 2.1503450870513916, + "learning_rate": 1.4448437127475844e-07, + "loss": 0.6144, + "step": 12498 + }, + { + "epoch": 0.9457833604479589, + "grad_norm": 2.493232250213623, + "learning_rate": 1.4408270563994075e-07, + "loss": 0.6895, + "step": 12499 + }, + { + "epoch": 0.9458590291702924, + "grad_norm": 2.2982404232025146, + "learning_rate": 1.4368159459130704e-07, + "loss": 0.6948, + "step": 12500 + }, + { + "epoch": 0.9459346978926261, + "grad_norm": 1.8142317533493042, + "learning_rate": 1.432810381539772e-07, + "loss": 0.6644, + "step": 12501 + }, + { + "epoch": 0.9460103666149597, + "grad_norm": 2.070129871368408, + "learning_rate": 1.4288103635303517e-07, + "loss": 0.5584, + "step": 12502 + }, + { + "epoch": 0.9460860353372933, + "grad_norm": 2.6108832359313965, + "learning_rate": 1.4248158921352894e-07, + "loss": 0.6624, + "step": 12503 + }, + { + "epoch": 0.946161704059627, + "grad_norm": 1.9606751203536987, + "learning_rate": 1.4208269676047547e-07, + "loss": 0.6436, + "step": 12504 + }, + { + "epoch": 0.9462373727819606, + "grad_norm": 1.9400218725204468, + "learning_rate": 1.416843590188528e-07, + "loss": 0.7202, + "step": 12505 + }, + { + "epoch": 0.9463130415042942, + "grad_norm": 3.4818761348724365, + "learning_rate": 1.4128657601360696e-07, + "loss": 0.8214, + "step": 12506 + }, + { + "epoch": 0.9463887102266278, + "grad_norm": 2.120473623275757, + "learning_rate": 1.4088934776964902e-07, + "loss": 0.651, + "step": 12507 + }, + { + "epoch": 0.9464643789489614, + "grad_norm": 2.1492528915405273, + "learning_rate": 
1.404926743118531e-07, + "loss": 0.6341, + "step": 12508 + }, + { + "epoch": 0.9465400476712951, + "grad_norm": 2.199296712875366, + "learning_rate": 1.400965556650613e-07, + "loss": 0.6321, + "step": 12509 + }, + { + "epoch": 0.9466157163936287, + "grad_norm": 1.9306163787841797, + "learning_rate": 1.3970099185407982e-07, + "loss": 0.6515, + "step": 12510 + }, + { + "epoch": 0.9466913851159623, + "grad_norm": 1.7523746490478516, + "learning_rate": 1.393059829036788e-07, + "loss": 0.672, + "step": 12511 + }, + { + "epoch": 0.946767053838296, + "grad_norm": 1.741031527519226, + "learning_rate": 1.3891152883859748e-07, + "loss": 0.5683, + "step": 12512 + }, + { + "epoch": 0.9468427225606295, + "grad_norm": 2.261603355407715, + "learning_rate": 1.385176296835361e-07, + "loss": 0.6403, + "step": 12513 + }, + { + "epoch": 0.9469183912829632, + "grad_norm": 2.020596504211426, + "learning_rate": 1.381242854631619e-07, + "loss": 0.7049, + "step": 12514 + }, + { + "epoch": 0.9469940600052968, + "grad_norm": 1.8669471740722656, + "learning_rate": 1.3773149620210723e-07, + "loss": 0.6478, + "step": 12515 + }, + { + "epoch": 0.9470697287276304, + "grad_norm": 2.3542299270629883, + "learning_rate": 1.3733926192497136e-07, + "loss": 0.7477, + "step": 12516 + }, + { + "epoch": 0.9471453974499641, + "grad_norm": 2.5550339221954346, + "learning_rate": 1.3694758265631568e-07, + "loss": 0.6786, + "step": 12517 + }, + { + "epoch": 0.9472210661722977, + "grad_norm": 2.4507362842559814, + "learning_rate": 1.3655645842066956e-07, + "loss": 0.6481, + "step": 12518 + }, + { + "epoch": 0.9472967348946313, + "grad_norm": 1.8866103887557983, + "learning_rate": 1.3616588924252538e-07, + "loss": 0.6686, + "step": 12519 + }, + { + "epoch": 0.947372403616965, + "grad_norm": 2.28023624420166, + "learning_rate": 1.357758751463416e-07, + "loss": 0.7095, + "step": 12520 + }, + { + "epoch": 0.9474480723392985, + "grad_norm": 2.679999351501465, + "learning_rate": 1.3538641615654468e-07, + "loss": 0.7683, + "step": 12521 + }, + { + "epoch": 0.9475237410616322, + "grad_norm": 2.136809825897217, + "learning_rate": 1.349975122975211e-07, + "loss": 0.771, + "step": 12522 + }, + { + "epoch": 0.9475994097839658, + "grad_norm": 2.2693591117858887, + "learning_rate": 1.346091635936254e-07, + "loss": 0.6842, + "step": 12523 + }, + { + "epoch": 0.9476750785062994, + "grad_norm": 1.6307967901229858, + "learning_rate": 1.3422137006917913e-07, + "loss": 0.801, + "step": 12524 + }, + { + "epoch": 0.9477507472286331, + "grad_norm": 2.6975574493408203, + "learning_rate": 1.3383413174846582e-07, + "loss": 0.6235, + "step": 12525 + }, + { + "epoch": 0.9478264159509666, + "grad_norm": 2.2791359424591064, + "learning_rate": 1.334474486557351e-07, + "loss": 0.591, + "step": 12526 + }, + { + "epoch": 0.9479020846733003, + "grad_norm": 2.0071561336517334, + "learning_rate": 1.3306132081520362e-07, + "loss": 0.6949, + "step": 12527 + }, + { + "epoch": 0.947977753395634, + "grad_norm": 3.1164028644561768, + "learning_rate": 1.32675748251052e-07, + "loss": 0.6275, + "step": 12528 + }, + { + "epoch": 0.9480534221179675, + "grad_norm": 2.2760443687438965, + "learning_rate": 1.3229073098742496e-07, + "loss": 0.7345, + "step": 12529 + }, + { + "epoch": 0.9481290908403012, + "grad_norm": 3.998307228088379, + "learning_rate": 1.3190626904843317e-07, + "loss": 0.7405, + "step": 12530 + }, + { + "epoch": 0.9482047595626348, + "grad_norm": 2.044447183609009, + "learning_rate": 1.315223624581544e-07, + "loss": 0.6031, + "step": 12531 + }, + { + "epoch": 
0.9482804282849684, + "grad_norm": 3.347496509552002, + "learning_rate": 1.3113901124063045e-07, + "loss": 0.7423, + "step": 12532 + }, + { + "epoch": 0.9483560970073021, + "grad_norm": 1.8583823442459106, + "learning_rate": 1.3075621541986605e-07, + "loss": 0.6394, + "step": 12533 + }, + { + "epoch": 0.9484317657296356, + "grad_norm": 2.4121158123016357, + "learning_rate": 1.3037397501983406e-07, + "loss": 0.6032, + "step": 12534 + }, + { + "epoch": 0.9485074344519693, + "grad_norm": 2.290224313735962, + "learning_rate": 1.2999229006447134e-07, + "loss": 0.6019, + "step": 12535 + }, + { + "epoch": 0.9485831031743029, + "grad_norm": 2.389235019683838, + "learning_rate": 1.2961116057768074e-07, + "loss": 0.7305, + "step": 12536 + }, + { + "epoch": 0.9486587718966365, + "grad_norm": 2.0536439418792725, + "learning_rate": 1.292305865833292e-07, + "loss": 0.5639, + "step": 12537 + }, + { + "epoch": 0.9487344406189702, + "grad_norm": 2.742306709289551, + "learning_rate": 1.2885056810525063e-07, + "loss": 0.7101, + "step": 12538 + }, + { + "epoch": 0.9488101093413037, + "grad_norm": 1.4020804166793823, + "learning_rate": 1.2847110516724202e-07, + "loss": 0.8055, + "step": 12539 + }, + { + "epoch": 0.9488857780636374, + "grad_norm": 1.8928502798080444, + "learning_rate": 1.2809219779306735e-07, + "loss": 0.4833, + "step": 12540 + }, + { + "epoch": 0.948961446785971, + "grad_norm": 1.8035271167755127, + "learning_rate": 1.2771384600645264e-07, + "loss": 0.727, + "step": 12541 + }, + { + "epoch": 0.9490371155083046, + "grad_norm": 2.05291485786438, + "learning_rate": 1.2733604983109493e-07, + "loss": 0.6463, + "step": 12542 + }, + { + "epoch": 0.9491127842306383, + "grad_norm": 2.0966427326202393, + "learning_rate": 1.269588092906513e-07, + "loss": 0.5246, + "step": 12543 + }, + { + "epoch": 0.9491884529529719, + "grad_norm": 1.9400415420532227, + "learning_rate": 1.2658212440874585e-07, + "loss": 0.5948, + "step": 12544 + }, + { + "epoch": 0.9492641216753055, + "grad_norm": 2.1425304412841797, + "learning_rate": 1.262059952089677e-07, + "loss": 0.5876, + "step": 12545 + }, + { + "epoch": 0.9493397903976392, + "grad_norm": 2.972198724746704, + "learning_rate": 1.2583042171487103e-07, + "loss": 0.6889, + "step": 12546 + }, + { + "epoch": 0.9494154591199727, + "grad_norm": 1.892687201499939, + "learning_rate": 1.25455403949976e-07, + "loss": 0.6808, + "step": 12547 + }, + { + "epoch": 0.9494911278423064, + "grad_norm": 2.389648914337158, + "learning_rate": 1.2508094193776786e-07, + "loss": 0.7109, + "step": 12548 + }, + { + "epoch": 0.94956679656464, + "grad_norm": 2.40558123588562, + "learning_rate": 1.2470703570169583e-07, + "loss": 0.6682, + "step": 12549 + }, + { + "epoch": 0.9496424652869736, + "grad_norm": 5.251361846923828, + "learning_rate": 1.2433368526517619e-07, + "loss": 0.7605, + "step": 12550 + }, + { + "epoch": 0.9497181340093073, + "grad_norm": 1.9533052444458008, + "learning_rate": 1.2396089065158722e-07, + "loss": 0.619, + "step": 12551 + }, + { + "epoch": 0.9497938027316408, + "grad_norm": 2.2325778007507324, + "learning_rate": 1.2358865188427626e-07, + "loss": 0.707, + "step": 12552 + }, + { + "epoch": 0.9498694714539745, + "grad_norm": 2.874405860900879, + "learning_rate": 1.2321696898655465e-07, + "loss": 0.6691, + "step": 12553 + }, + { + "epoch": 0.9499451401763082, + "grad_norm": 6.6497697830200195, + "learning_rate": 1.228458419816968e-07, + "loss": 0.7334, + "step": 12554 + }, + { + "epoch": 0.9500208088986417, + "grad_norm": 2.4262197017669678, + "learning_rate": 
1.2247527089294408e-07, + "loss": 0.7098, + "step": 12555 + }, + { + "epoch": 0.9500964776209754, + "grad_norm": 2.1515469551086426, + "learning_rate": 1.2210525574350296e-07, + "loss": 0.5938, + "step": 12556 + }, + { + "epoch": 0.950172146343309, + "grad_norm": 2.0948853492736816, + "learning_rate": 1.2173579655654686e-07, + "loss": 0.7028, + "step": 12557 + }, + { + "epoch": 0.9502478150656426, + "grad_norm": 2.3338329792022705, + "learning_rate": 1.2136689335521035e-07, + "loss": 0.7445, + "step": 12558 + }, + { + "epoch": 0.9503234837879763, + "grad_norm": 1.8340742588043213, + "learning_rate": 1.2099854616259587e-07, + "loss": 0.6667, + "step": 12559 + }, + { + "epoch": 0.9503991525103098, + "grad_norm": 3.241755962371826, + "learning_rate": 1.2063075500177e-07, + "loss": 0.6357, + "step": 12560 + }, + { + "epoch": 0.9504748212326435, + "grad_norm": 2.4523792266845703, + "learning_rate": 1.2026351989576633e-07, + "loss": 0.5422, + "step": 12561 + }, + { + "epoch": 0.9505504899549772, + "grad_norm": 2.439547538757324, + "learning_rate": 1.1989684086758147e-07, + "loss": 0.7088, + "step": 12562 + }, + { + "epoch": 0.9506261586773107, + "grad_norm": 2.1670706272125244, + "learning_rate": 1.19530717940178e-07, + "loss": 0.7398, + "step": 12563 + }, + { + "epoch": 0.9507018273996444, + "grad_norm": 3.0719752311706543, + "learning_rate": 1.1916515113648463e-07, + "loss": 0.7488, + "step": 12564 + }, + { + "epoch": 0.9507774961219779, + "grad_norm": 4.41519832611084, + "learning_rate": 1.1880014047939302e-07, + "loss": 0.58, + "step": 12565 + }, + { + "epoch": 0.9508531648443116, + "grad_norm": 2.505060911178589, + "learning_rate": 1.1843568599176091e-07, + "loss": 0.6799, + "step": 12566 + }, + { + "epoch": 0.9509288335666453, + "grad_norm": 2.1886472702026367, + "learning_rate": 1.1807178769641402e-07, + "loss": 0.6989, + "step": 12567 + }, + { + "epoch": 0.9510045022889788, + "grad_norm": 2.5705251693725586, + "learning_rate": 1.1770844561613913e-07, + "loss": 0.6411, + "step": 12568 + }, + { + "epoch": 0.9510801710113125, + "grad_norm": 2.1216318607330322, + "learning_rate": 1.1734565977369005e-07, + "loss": 0.6649, + "step": 12569 + }, + { + "epoch": 0.9511558397336461, + "grad_norm": 2.3023529052734375, + "learning_rate": 1.1698343019178559e-07, + "loss": 0.6896, + "step": 12570 + }, + { + "epoch": 0.9512315084559797, + "grad_norm": 2.0217092037200928, + "learning_rate": 1.166217568931096e-07, + "loss": 0.4953, + "step": 12571 + }, + { + "epoch": 0.9513071771783134, + "grad_norm": 2.0048747062683105, + "learning_rate": 1.1626063990031199e-07, + "loss": 0.5621, + "step": 12572 + }, + { + "epoch": 0.9513828459006469, + "grad_norm": 2.298532009124756, + "learning_rate": 1.1590007923600665e-07, + "loss": 0.733, + "step": 12573 + }, + { + "epoch": 0.9514585146229806, + "grad_norm": 2.250257730484009, + "learning_rate": 1.1554007492277252e-07, + "loss": 0.7155, + "step": 12574 + }, + { + "epoch": 0.9515341833453143, + "grad_norm": 2.662155866622925, + "learning_rate": 1.1518062698315557e-07, + "loss": 0.608, + "step": 12575 + }, + { + "epoch": 0.9516098520676478, + "grad_norm": 2.0141024589538574, + "learning_rate": 1.1482173543966479e-07, + "loss": 0.5917, + "step": 12576 + }, + { + "epoch": 0.9516855207899815, + "grad_norm": 7.37333345413208, + "learning_rate": 1.144634003147742e-07, + "loss": 0.8291, + "step": 12577 + }, + { + "epoch": 0.951761189512315, + "grad_norm": 2.1229794025421143, + "learning_rate": 1.1410562163092486e-07, + "loss": 0.5582, + "step": 12578 + }, + { + "epoch": 
0.9518368582346487, + "grad_norm": 2.4636754989624023, + "learning_rate": 1.1374839941052284e-07, + "loss": 0.8073, + "step": 12579 + }, + { + "epoch": 0.9519125269569824, + "grad_norm": 2.854630947113037, + "learning_rate": 1.1339173367593725e-07, + "loss": 0.5689, + "step": 12580 + }, + { + "epoch": 0.9519881956793159, + "grad_norm": 1.9160290956497192, + "learning_rate": 1.1303562444950321e-07, + "loss": 0.5861, + "step": 12581 + }, + { + "epoch": 0.9520638644016496, + "grad_norm": 2.2296550273895264, + "learning_rate": 1.1268007175352291e-07, + "loss": 0.6746, + "step": 12582 + }, + { + "epoch": 0.9521395331239833, + "grad_norm": 2.0400097370147705, + "learning_rate": 1.123250756102625e-07, + "loss": 0.7203, + "step": 12583 + }, + { + "epoch": 0.9522152018463168, + "grad_norm": 1.9765279293060303, + "learning_rate": 1.1197063604195123e-07, + "loss": 0.589, + "step": 12584 + }, + { + "epoch": 0.9522908705686505, + "grad_norm": 2.517303228378296, + "learning_rate": 1.1161675307078534e-07, + "loss": 0.741, + "step": 12585 + }, + { + "epoch": 0.952366539290984, + "grad_norm": 2.5748841762542725, + "learning_rate": 1.1126342671892908e-07, + "loss": 0.6368, + "step": 12586 + }, + { + "epoch": 0.9524422080133177, + "grad_norm": 1.8756376504898071, + "learning_rate": 1.1091065700850378e-07, + "loss": 0.6526, + "step": 12587 + }, + { + "epoch": 0.9525178767356514, + "grad_norm": 3.596529960632324, + "learning_rate": 1.1055844396160574e-07, + "loss": 0.5991, + "step": 12588 + }, + { + "epoch": 0.9525935454579849, + "grad_norm": 6.150516033172607, + "learning_rate": 1.1020678760029035e-07, + "loss": 0.6345, + "step": 12589 + }, + { + "epoch": 0.9526692141803186, + "grad_norm": 2.0023791790008545, + "learning_rate": 1.0985568794657797e-07, + "loss": 0.5204, + "step": 12590 + }, + { + "epoch": 0.9527448829026521, + "grad_norm": 1.9689321517944336, + "learning_rate": 1.0950514502245701e-07, + "loss": 0.666, + "step": 12591 + }, + { + "epoch": 0.9528205516249858, + "grad_norm": 2.2433865070343018, + "learning_rate": 1.0915515884987892e-07, + "loss": 0.6823, + "step": 12592 + }, + { + "epoch": 0.9528962203473195, + "grad_norm": 2.0992069244384766, + "learning_rate": 1.0880572945076217e-07, + "loss": 0.7694, + "step": 12593 + }, + { + "epoch": 0.952971889069653, + "grad_norm": 3.2551393508911133, + "learning_rate": 1.0845685684698726e-07, + "loss": 0.6114, + "step": 12594 + }, + { + "epoch": 0.9530475577919867, + "grad_norm": 2.008338212966919, + "learning_rate": 1.0810854106040268e-07, + "loss": 0.5985, + "step": 12595 + }, + { + "epoch": 0.9531232265143204, + "grad_norm": 2.3150219917297363, + "learning_rate": 1.0776078211282203e-07, + "loss": 0.7655, + "step": 12596 + }, + { + "epoch": 0.9531988952366539, + "grad_norm": 2.6294565200805664, + "learning_rate": 1.0741358002602086e-07, + "loss": 0.6856, + "step": 12597 + }, + { + "epoch": 0.9532745639589876, + "grad_norm": 1.913138747215271, + "learning_rate": 1.0706693482174479e-07, + "loss": 0.597, + "step": 12598 + }, + { + "epoch": 0.9533502326813211, + "grad_norm": 2.5744075775146484, + "learning_rate": 1.0672084652169944e-07, + "loss": 0.8647, + "step": 12599 + }, + { + "epoch": 0.9534259014036548, + "grad_norm": 1.7659814357757568, + "learning_rate": 1.0637531514756049e-07, + "loss": 0.6648, + "step": 12600 + }, + { + "epoch": 0.9535015701259885, + "grad_norm": 1.7846674919128418, + "learning_rate": 1.0603034072096363e-07, + "loss": 0.8073, + "step": 12601 + }, + { + "epoch": 0.953577238848322, + "grad_norm": 2.20290207862854, + "learning_rate": 
1.0568592326351257e-07, + "loss": 0.7963, + "step": 12602 + }, + { + "epoch": 0.9536529075706557, + "grad_norm": 1.7884464263916016, + "learning_rate": 1.0534206279677904e-07, + "loss": 0.6471, + "step": 12603 + }, + { + "epoch": 0.9537285762929892, + "grad_norm": 2.2755014896392822, + "learning_rate": 1.0499875934229286e-07, + "loss": 0.6275, + "step": 12604 + }, + { + "epoch": 0.9538042450153229, + "grad_norm": 1.9066975116729736, + "learning_rate": 1.046560129215538e-07, + "loss": 0.6213, + "step": 12605 + }, + { + "epoch": 0.9538799137376566, + "grad_norm": 3.067195177078247, + "learning_rate": 1.043138235560267e-07, + "loss": 0.6876, + "step": 12606 + }, + { + "epoch": 0.9539555824599901, + "grad_norm": 2.0356249809265137, + "learning_rate": 1.0397219126714042e-07, + "loss": 0.6169, + "step": 12607 + }, + { + "epoch": 0.9540312511823238, + "grad_norm": 2.0770580768585205, + "learning_rate": 1.0363111607628884e-07, + "loss": 0.7419, + "step": 12608 + }, + { + "epoch": 0.9541069199046575, + "grad_norm": 3.184155225753784, + "learning_rate": 1.0329059800483087e-07, + "loss": 0.7008, + "step": 12609 + }, + { + "epoch": 0.954182588626991, + "grad_norm": 1.850549578666687, + "learning_rate": 1.0295063707409147e-07, + "loss": 0.6739, + "step": 12610 + }, + { + "epoch": 0.9542582573493247, + "grad_norm": 2.2812001705169678, + "learning_rate": 1.026112333053596e-07, + "loss": 0.6861, + "step": 12611 + }, + { + "epoch": 0.9543339260716582, + "grad_norm": 2.713418960571289, + "learning_rate": 1.0227238671988925e-07, + "loss": 0.5918, + "step": 12612 + }, + { + "epoch": 0.9544095947939919, + "grad_norm": 4.540587902069092, + "learning_rate": 1.0193409733890147e-07, + "loss": 0.6968, + "step": 12613 + }, + { + "epoch": 0.9544852635163256, + "grad_norm": 6.917119979858398, + "learning_rate": 1.0159636518358029e-07, + "loss": 0.6452, + "step": 12614 + }, + { + "epoch": 0.9545609322386591, + "grad_norm": 1.9576506614685059, + "learning_rate": 1.012591902750758e-07, + "loss": 0.7493, + "step": 12615 + }, + { + "epoch": 0.9546366009609928, + "grad_norm": 2.1393682956695557, + "learning_rate": 1.009225726345021e-07, + "loss": 0.7141, + "step": 12616 + }, + { + "epoch": 0.9547122696833263, + "grad_norm": 2.726297378540039, + "learning_rate": 1.0058651228294036e-07, + "loss": 0.697, + "step": 12617 + }, + { + "epoch": 0.95478793840566, + "grad_norm": 2.0172853469848633, + "learning_rate": 1.0025100924143571e-07, + "loss": 0.6984, + "step": 12618 + }, + { + "epoch": 0.9548636071279937, + "grad_norm": 2.437422752380371, + "learning_rate": 9.991606353099836e-08, + "loss": 0.7801, + "step": 12619 + }, + { + "epoch": 0.9549392758503272, + "grad_norm": 2.639605760574341, + "learning_rate": 9.958167517260252e-08, + "loss": 0.6264, + "step": 12620 + }, + { + "epoch": 0.9550149445726609, + "grad_norm": 2.6709601879119873, + "learning_rate": 9.924784418719146e-08, + "loss": 0.7681, + "step": 12621 + }, + { + "epoch": 0.9550906132949946, + "grad_norm": 1.9807193279266357, + "learning_rate": 9.891457059566745e-08, + "loss": 0.6269, + "step": 12622 + }, + { + "epoch": 0.9551662820173281, + "grad_norm": 2.4436280727386475, + "learning_rate": 9.858185441890177e-08, + "loss": 0.5812, + "step": 12623 + }, + { + "epoch": 0.9552419507396618, + "grad_norm": 2.2791218757629395, + "learning_rate": 9.824969567773278e-08, + "loss": 0.6978, + "step": 12624 + }, + { + "epoch": 0.9553176194619953, + "grad_norm": 1.9470678567886353, + "learning_rate": 9.791809439295885e-08, + "loss": 0.8761, + "step": 12625 + }, + { + "epoch": 
0.955393288184329, + "grad_norm": 2.1823413372039795, + "learning_rate": 9.758705058534634e-08, + "loss": 0.5794, + "step": 12626 + }, + { + "epoch": 0.9554689569066627, + "grad_norm": 2.474637508392334, + "learning_rate": 9.725656427562769e-08, + "loss": 0.6105, + "step": 12627 + }, + { + "epoch": 0.9555446256289962, + "grad_norm": 2.545193910598755, + "learning_rate": 9.692663548449732e-08, + "loss": 0.6524, + "step": 12628 + }, + { + "epoch": 0.9556202943513299, + "grad_norm": 1.8600281476974487, + "learning_rate": 9.659726423261672e-08, + "loss": 0.594, + "step": 12629 + }, + { + "epoch": 0.9556959630736634, + "grad_norm": 1.867389440536499, + "learning_rate": 9.626845054061239e-08, + "loss": 0.624, + "step": 12630 + }, + { + "epoch": 0.9557716317959971, + "grad_norm": 1.7528537511825562, + "learning_rate": 9.594019442907686e-08, + "loss": 0.5813, + "step": 12631 + }, + { + "epoch": 0.9558473005183308, + "grad_norm": 4.033431053161621, + "learning_rate": 9.561249591856569e-08, + "loss": 0.6462, + "step": 12632 + }, + { + "epoch": 0.9559229692406643, + "grad_norm": 2.031341075897217, + "learning_rate": 9.528535502959845e-08, + "loss": 0.6628, + "step": 12633 + }, + { + "epoch": 0.955998637962998, + "grad_norm": 2.2305564880371094, + "learning_rate": 9.495877178266477e-08, + "loss": 0.6545, + "step": 12634 + }, + { + "epoch": 0.9560743066853317, + "grad_norm": 1.8292995691299438, + "learning_rate": 9.463274619821627e-08, + "loss": 0.5837, + "step": 12635 + }, + { + "epoch": 0.9561499754076652, + "grad_norm": 1.8190289735794067, + "learning_rate": 9.430727829666763e-08, + "loss": 0.6024, + "step": 12636 + }, + { + "epoch": 0.9562256441299989, + "grad_norm": 1.8149267435073853, + "learning_rate": 9.398236809840155e-08, + "loss": 0.7985, + "step": 12637 + }, + { + "epoch": 0.9563013128523324, + "grad_norm": 2.005405902862549, + "learning_rate": 9.365801562376474e-08, + "loss": 0.7324, + "step": 12638 + }, + { + "epoch": 0.9563769815746661, + "grad_norm": 3.069957971572876, + "learning_rate": 9.333422089307097e-08, + "loss": 0.5228, + "step": 12639 + }, + { + "epoch": 0.9564526502969998, + "grad_norm": 2.064141035079956, + "learning_rate": 9.301098392659502e-08, + "loss": 0.7289, + "step": 12640 + }, + { + "epoch": 0.9565283190193333, + "grad_norm": 1.8948678970336914, + "learning_rate": 9.268830474457967e-08, + "loss": 0.6597, + "step": 12641 + }, + { + "epoch": 0.956603987741667, + "grad_norm": 2.4123117923736572, + "learning_rate": 9.236618336723379e-08, + "loss": 0.7717, + "step": 12642 + }, + { + "epoch": 0.9566796564640007, + "grad_norm": 2.754319667816162, + "learning_rate": 9.204461981472623e-08, + "loss": 0.5993, + "step": 12643 + }, + { + "epoch": 0.9567553251863342, + "grad_norm": 2.069659471511841, + "learning_rate": 9.172361410719787e-08, + "loss": 0.692, + "step": 12644 + }, + { + "epoch": 0.9568309939086679, + "grad_norm": 2.8982131481170654, + "learning_rate": 9.140316626474865e-08, + "loss": 0.6309, + "step": 12645 + }, + { + "epoch": 0.9569066626310014, + "grad_norm": 2.2474629878997803, + "learning_rate": 9.10832763074485e-08, + "loss": 0.5228, + "step": 12646 + }, + { + "epoch": 0.9569823313533351, + "grad_norm": 3.5946133136749268, + "learning_rate": 9.076394425532741e-08, + "loss": 0.6798, + "step": 12647 + }, + { + "epoch": 0.9570580000756688, + "grad_norm": 1.9877676963806152, + "learning_rate": 9.044517012838438e-08, + "loss": 0.6968, + "step": 12648 + }, + { + "epoch": 0.9571336687980023, + "grad_norm": 2.1319262981414795, + "learning_rate": 9.012695394658143e-08, + 
"loss": 0.6383, + "step": 12649 + }, + { + "epoch": 0.957209337520336, + "grad_norm": 2.4543004035949707, + "learning_rate": 8.980929572984764e-08, + "loss": 0.7055, + "step": 12650 + }, + { + "epoch": 0.9572850062426695, + "grad_norm": 2.64933180809021, + "learning_rate": 8.949219549807408e-08, + "loss": 0.5386, + "step": 12651 + }, + { + "epoch": 0.9573606749650032, + "grad_norm": 2.116211414337158, + "learning_rate": 8.917565327111888e-08, + "loss": 0.6279, + "step": 12652 + }, + { + "epoch": 0.9574363436873369, + "grad_norm": 2.017352342605591, + "learning_rate": 8.885966906880616e-08, + "loss": 0.5641, + "step": 12653 + }, + { + "epoch": 0.9575120124096704, + "grad_norm": 2.1703391075134277, + "learning_rate": 8.854424291092311e-08, + "loss": 0.7587, + "step": 12654 + }, + { + "epoch": 0.9575876811320041, + "grad_norm": 2.2390897274017334, + "learning_rate": 8.822937481722194e-08, + "loss": 0.6269, + "step": 12655 + }, + { + "epoch": 0.9576633498543378, + "grad_norm": 1.785683035850525, + "learning_rate": 8.791506480742284e-08, + "loss": 0.6123, + "step": 12656 + }, + { + "epoch": 0.9577390185766713, + "grad_norm": 1.9447267055511475, + "learning_rate": 8.76013129012061e-08, + "loss": 0.6235, + "step": 12657 + }, + { + "epoch": 0.957814687299005, + "grad_norm": 1.9245911836624146, + "learning_rate": 8.728811911822199e-08, + "loss": 0.5552, + "step": 12658 + }, + { + "epoch": 0.9578903560213385, + "grad_norm": 3.584024667739868, + "learning_rate": 8.697548347808281e-08, + "loss": 0.8091, + "step": 12659 + }, + { + "epoch": 0.9579660247436722, + "grad_norm": 2.3792314529418945, + "learning_rate": 8.666340600036793e-08, + "loss": 0.6981, + "step": 12660 + }, + { + "epoch": 0.9580416934660059, + "grad_norm": 3.4363317489624023, + "learning_rate": 8.635188670461869e-08, + "loss": 0.5795, + "step": 12661 + }, + { + "epoch": 0.9581173621883394, + "grad_norm": 2.3299102783203125, + "learning_rate": 8.604092561034549e-08, + "loss": 0.6234, + "step": 12662 + }, + { + "epoch": 0.9581930309106731, + "grad_norm": 2.2737045288085938, + "learning_rate": 8.573052273701975e-08, + "loss": 0.7343, + "step": 12663 + }, + { + "epoch": 0.9582686996330066, + "grad_norm": 2.334836483001709, + "learning_rate": 8.542067810408194e-08, + "loss": 0.7093, + "step": 12664 + }, + { + "epoch": 0.9583443683553403, + "grad_norm": 2.3729231357574463, + "learning_rate": 8.511139173093352e-08, + "loss": 0.6742, + "step": 12665 + }, + { + "epoch": 0.958420037077674, + "grad_norm": 2.322737693786621, + "learning_rate": 8.4802663636945e-08, + "loss": 0.6121, + "step": 12666 + }, + { + "epoch": 0.9584957058000075, + "grad_norm": 2.7772295475006104, + "learning_rate": 8.449449384144891e-08, + "loss": 0.4887, + "step": 12667 + }, + { + "epoch": 0.9585713745223412, + "grad_norm": 2.32209849357605, + "learning_rate": 8.418688236374283e-08, + "loss": 0.5629, + "step": 12668 + }, + { + "epoch": 0.9586470432446749, + "grad_norm": 2.3986520767211914, + "learning_rate": 8.387982922309135e-08, + "loss": 0.5758, + "step": 12669 + }, + { + "epoch": 0.9587227119670084, + "grad_norm": 2.683042526245117, + "learning_rate": 8.357333443872406e-08, + "loss": 0.7964, + "step": 12670 + }, + { + "epoch": 0.9587983806893421, + "grad_norm": 3.9749975204467773, + "learning_rate": 8.326739802983363e-08, + "loss": 0.6979, + "step": 12671 + }, + { + "epoch": 0.9588740494116756, + "grad_norm": 2.608891010284424, + "learning_rate": 8.296202001557873e-08, + "loss": 0.7334, + "step": 12672 + }, + { + "epoch": 0.9589497181340093, + "grad_norm": 
2.276517152786255, + "learning_rate": 8.265720041508407e-08, + "loss": 0.6777, + "step": 12673 + }, + { + "epoch": 0.959025386856343, + "grad_norm": 2.6598875522613525, + "learning_rate": 8.235293924743636e-08, + "loss": 0.6954, + "step": 12674 + }, + { + "epoch": 0.9591010555786765, + "grad_norm": 1.9157791137695312, + "learning_rate": 8.204923653169139e-08, + "loss": 0.5896, + "step": 12675 + }, + { + "epoch": 0.9591767243010102, + "grad_norm": 2.493894100189209, + "learning_rate": 8.174609228686792e-08, + "loss": 0.6405, + "step": 12676 + }, + { + "epoch": 0.9592523930233438, + "grad_norm": 2.365548610687256, + "learning_rate": 8.144350653194877e-08, + "loss": 0.6652, + "step": 12677 + }, + { + "epoch": 0.9593280617456774, + "grad_norm": 1.957023024559021, + "learning_rate": 8.114147928588377e-08, + "loss": 0.6683, + "step": 12678 + }, + { + "epoch": 0.9594037304680111, + "grad_norm": 2.2314658164978027, + "learning_rate": 8.084001056758583e-08, + "loss": 0.5744, + "step": 12679 + }, + { + "epoch": 0.9594793991903446, + "grad_norm": 2.346789836883545, + "learning_rate": 8.053910039593481e-08, + "loss": 0.608, + "step": 12680 + }, + { + "epoch": 0.9595550679126783, + "grad_norm": 2.3163628578186035, + "learning_rate": 8.023874878977467e-08, + "loss": 0.6973, + "step": 12681 + }, + { + "epoch": 0.959630736635012, + "grad_norm": 2.010727643966675, + "learning_rate": 7.993895576791333e-08, + "loss": 0.7321, + "step": 12682 + }, + { + "epoch": 0.9597064053573455, + "grad_norm": 2.24619197845459, + "learning_rate": 7.963972134912578e-08, + "loss": 0.8042, + "step": 12683 + }, + { + "epoch": 0.9597820740796792, + "grad_norm": 2.5780961513519287, + "learning_rate": 7.934104555215105e-08, + "loss": 0.5879, + "step": 12684 + }, + { + "epoch": 0.9598577428020127, + "grad_norm": 2.02319598197937, + "learning_rate": 7.904292839569315e-08, + "loss": 0.6226, + "step": 12685 + }, + { + "epoch": 0.9599334115243464, + "grad_norm": 2.3072006702423096, + "learning_rate": 7.874536989842018e-08, + "loss": 0.742, + "step": 12686 + }, + { + "epoch": 0.9600090802466801, + "grad_norm": 2.2716474533081055, + "learning_rate": 7.844837007896821e-08, + "loss": 0.7209, + "step": 12687 + }, + { + "epoch": 0.9600847489690136, + "grad_norm": 2.065999984741211, + "learning_rate": 7.815192895593437e-08, + "loss": 0.6923, + "step": 12688 + }, + { + "epoch": 0.9601604176913473, + "grad_norm": 2.5855307579040527, + "learning_rate": 7.785604654788281e-08, + "loss": 0.4902, + "step": 12689 + }, + { + "epoch": 0.9602360864136809, + "grad_norm": 1.992203950881958, + "learning_rate": 7.75607228733447e-08, + "loss": 0.6572, + "step": 12690 + }, + { + "epoch": 0.9603117551360145, + "grad_norm": 1.9567265510559082, + "learning_rate": 7.726595795081226e-08, + "loss": 0.6281, + "step": 12691 + }, + { + "epoch": 0.9603874238583482, + "grad_norm": 1.9469960927963257, + "learning_rate": 7.697175179874472e-08, + "loss": 0.6364, + "step": 12692 + }, + { + "epoch": 0.9604630925806817, + "grad_norm": 1.9473823308944702, + "learning_rate": 7.667810443556733e-08, + "loss": 0.5201, + "step": 12693 + }, + { + "epoch": 0.9605387613030154, + "grad_norm": 2.243472099304199, + "learning_rate": 7.638501587966839e-08, + "loss": 0.569, + "step": 12694 + }, + { + "epoch": 0.9606144300253491, + "grad_norm": 1.9308743476867676, + "learning_rate": 7.609248614940123e-08, + "loss": 0.5012, + "step": 12695 + }, + { + "epoch": 0.9606900987476826, + "grad_norm": 2.4356298446655273, + "learning_rate": 7.580051526308718e-08, + "loss": 0.7305, + "step": 12696 + }, 
+ { + "epoch": 0.9607657674700163, + "grad_norm": 1.883434534072876, + "learning_rate": 7.550910323900862e-08, + "loss": 0.6199, + "step": 12697 + }, + { + "epoch": 0.9608414361923499, + "grad_norm": 2.298492908477783, + "learning_rate": 7.521825009541594e-08, + "loss": 0.5387, + "step": 12698 + }, + { + "epoch": 0.9609171049146835, + "grad_norm": 1.997653841972351, + "learning_rate": 7.49279558505226e-08, + "loss": 0.5514, + "step": 12699 + }, + { + "epoch": 0.9609927736370172, + "grad_norm": 2.485727310180664, + "learning_rate": 7.463822052250702e-08, + "loss": 0.7501, + "step": 12700 + }, + { + "epoch": 0.9610684423593507, + "grad_norm": 4.434056758880615, + "learning_rate": 7.434904412951471e-08, + "loss": 0.6913, + "step": 12701 + }, + { + "epoch": 0.9611441110816844, + "grad_norm": 2.3401598930358887, + "learning_rate": 7.406042668965419e-08, + "loss": 0.654, + "step": 12702 + }, + { + "epoch": 0.961219779804018, + "grad_norm": 1.9578860998153687, + "learning_rate": 7.377236822099998e-08, + "loss": 0.831, + "step": 12703 + }, + { + "epoch": 0.9612954485263516, + "grad_norm": 2.5894887447357178, + "learning_rate": 7.348486874159166e-08, + "loss": 0.6493, + "step": 12704 + }, + { + "epoch": 0.9613711172486853, + "grad_norm": 2.045994281768799, + "learning_rate": 7.319792826943084e-08, + "loss": 0.6242, + "step": 12705 + }, + { + "epoch": 0.9614467859710188, + "grad_norm": 1.8696407079696655, + "learning_rate": 7.291154682249013e-08, + "loss": 0.7577, + "step": 12706 + }, + { + "epoch": 0.9615224546933525, + "grad_norm": 2.9939026832580566, + "learning_rate": 7.262572441870219e-08, + "loss": 0.7651, + "step": 12707 + }, + { + "epoch": 0.9615981234156862, + "grad_norm": 2.0975334644317627, + "learning_rate": 7.234046107596471e-08, + "loss": 0.6157, + "step": 12708 + }, + { + "epoch": 0.9616737921380197, + "grad_norm": 2.63741397857666, + "learning_rate": 7.205575681214438e-08, + "loss": 0.6313, + "step": 12709 + }, + { + "epoch": 0.9617494608603534, + "grad_norm": 2.186922788619995, + "learning_rate": 7.177161164506795e-08, + "loss": 0.622, + "step": 12710 + }, + { + "epoch": 0.961825129582687, + "grad_norm": 4.426486492156982, + "learning_rate": 7.14880255925312e-08, + "loss": 0.7372, + "step": 12711 + }, + { + "epoch": 0.9619007983050206, + "grad_norm": 2.0475969314575195, + "learning_rate": 7.12049986722919e-08, + "loss": 0.7598, + "step": 12712 + }, + { + "epoch": 0.9619764670273543, + "grad_norm": 2.091733932495117, + "learning_rate": 7.09225309020759e-08, + "loss": 0.6758, + "step": 12713 + }, + { + "epoch": 0.9620521357496878, + "grad_norm": 1.9092284440994263, + "learning_rate": 7.064062229957102e-08, + "loss": 0.6692, + "step": 12714 + }, + { + "epoch": 0.9621278044720215, + "grad_norm": 2.5357465744018555, + "learning_rate": 7.035927288243016e-08, + "loss": 0.6938, + "step": 12715 + }, + { + "epoch": 0.9622034731943551, + "grad_norm": 2.2587950229644775, + "learning_rate": 7.007848266827521e-08, + "loss": 0.6561, + "step": 12716 + }, + { + "epoch": 0.9622791419166887, + "grad_norm": 2.3485636711120605, + "learning_rate": 6.979825167468812e-08, + "loss": 0.6759, + "step": 12717 + }, + { + "epoch": 0.9623548106390224, + "grad_norm": 1.9571853876113892, + "learning_rate": 6.951857991921783e-08, + "loss": 0.6174, + "step": 12718 + }, + { + "epoch": 0.962430479361356, + "grad_norm": 1.7149250507354736, + "learning_rate": 6.923946741937836e-08, + "loss": 0.6175, + "step": 12719 + }, + { + "epoch": 0.9625061480836896, + "grad_norm": 1.5973360538482666, + "learning_rate": 
6.896091419264971e-08, + "loss": 0.4001, + "step": 12720 + }, + { + "epoch": 0.9625818168060233, + "grad_norm": 2.3491268157958984, + "learning_rate": 6.868292025647494e-08, + "loss": 0.5713, + "step": 12721 + }, + { + "epoch": 0.9626574855283568, + "grad_norm": 2.8299319744110107, + "learning_rate": 6.84054856282631e-08, + "loss": 0.7462, + "step": 12722 + }, + { + "epoch": 0.9627331542506905, + "grad_norm": 2.6515955924987793, + "learning_rate": 6.81286103253883e-08, + "loss": 0.7521, + "step": 12723 + }, + { + "epoch": 0.9628088229730241, + "grad_norm": 2.534411668777466, + "learning_rate": 6.785229436518969e-08, + "loss": 0.5264, + "step": 12724 + }, + { + "epoch": 0.9628844916953577, + "grad_norm": 2.334911823272705, + "learning_rate": 6.757653776496841e-08, + "loss": 0.5722, + "step": 12725 + }, + { + "epoch": 0.9629601604176914, + "grad_norm": 2.0208029747009277, + "learning_rate": 6.730134054199665e-08, + "loss": 0.6558, + "step": 12726 + }, + { + "epoch": 0.963035829140025, + "grad_norm": 2.0118119716644287, + "learning_rate": 6.702670271350764e-08, + "loss": 0.7271, + "step": 12727 + }, + { + "epoch": 0.9631114978623586, + "grad_norm": 2.5050179958343506, + "learning_rate": 6.675262429669759e-08, + "loss": 0.7486, + "step": 12728 + }, + { + "epoch": 0.9631871665846922, + "grad_norm": 2.230125665664673, + "learning_rate": 6.64791053087328e-08, + "loss": 0.6211, + "step": 12729 + }, + { + "epoch": 0.9632628353070258, + "grad_norm": 2.61617112159729, + "learning_rate": 6.620614576673956e-08, + "loss": 0.6537, + "step": 12730 + }, + { + "epoch": 0.9633385040293595, + "grad_norm": 2.5779638290405273, + "learning_rate": 6.593374568781519e-08, + "loss": 0.6734, + "step": 12731 + }, + { + "epoch": 0.963414172751693, + "grad_norm": 2.4025802612304688, + "learning_rate": 6.566190508901404e-08, + "loss": 0.7389, + "step": 12732 + }, + { + "epoch": 0.9634898414740267, + "grad_norm": 2.136965751647949, + "learning_rate": 6.539062398736251e-08, + "loss": 0.5392, + "step": 12733 + }, + { + "epoch": 0.9635655101963604, + "grad_norm": 4.406826496124268, + "learning_rate": 6.5119902399848e-08, + "loss": 0.6547, + "step": 12734 + }, + { + "epoch": 0.9636411789186939, + "grad_norm": 1.7842594385147095, + "learning_rate": 6.484974034342395e-08, + "loss": 0.6089, + "step": 12735 + }, + { + "epoch": 0.9637168476410276, + "grad_norm": 2.642629623413086, + "learning_rate": 6.458013783500882e-08, + "loss": 0.7135, + "step": 12736 + }, + { + "epoch": 0.9637925163633612, + "grad_norm": 2.0451202392578125, + "learning_rate": 6.431109489148612e-08, + "loss": 0.745, + "step": 12737 + }, + { + "epoch": 0.9638681850856948, + "grad_norm": 2.547163486480713, + "learning_rate": 6.404261152970437e-08, + "loss": 0.5514, + "step": 12738 + }, + { + "epoch": 0.9639438538080285, + "grad_norm": 2.487988233566284, + "learning_rate": 6.37746877664771e-08, + "loss": 0.6643, + "step": 12739 + }, + { + "epoch": 0.964019522530362, + "grad_norm": 2.42266583442688, + "learning_rate": 6.350732361858092e-08, + "loss": 0.6916, + "step": 12740 + }, + { + "epoch": 0.9640951912526957, + "grad_norm": 2.2286386489868164, + "learning_rate": 6.324051910276141e-08, + "loss": 0.7287, + "step": 12741 + }, + { + "epoch": 0.9641708599750293, + "grad_norm": 2.0510995388031006, + "learning_rate": 6.297427423572521e-08, + "loss": 0.6814, + "step": 12742 + }, + { + "epoch": 0.9642465286973629, + "grad_norm": 2.0005273818969727, + "learning_rate": 6.2708589034146e-08, + "loss": 0.6475, + "step": 12743 + }, + { + "epoch": 0.9643221974196966, + 
"grad_norm": 1.9023462533950806, + "learning_rate": 6.244346351466146e-08, + "loss": 0.716, + "step": 12744 + }, + { + "epoch": 0.9643978661420302, + "grad_norm": 2.4381988048553467, + "learning_rate": 6.21788976938743e-08, + "loss": 0.6293, + "step": 12745 + }, + { + "epoch": 0.9644735348643638, + "grad_norm": 1.6925064325332642, + "learning_rate": 6.191489158835328e-08, + "loss": 0.5658, + "step": 12746 + }, + { + "epoch": 0.9645492035866975, + "grad_norm": 2.0021846294403076, + "learning_rate": 6.165144521463117e-08, + "loss": 0.5507, + "step": 12747 + }, + { + "epoch": 0.964624872309031, + "grad_norm": 2.283257484436035, + "learning_rate": 6.138855858920577e-08, + "loss": 0.8271, + "step": 12748 + }, + { + "epoch": 0.9647005410313647, + "grad_norm": 2.3239376544952393, + "learning_rate": 6.112623172853993e-08, + "loss": 0.5572, + "step": 12749 + }, + { + "epoch": 0.9647762097536983, + "grad_norm": 1.8095026016235352, + "learning_rate": 6.086446464906148e-08, + "loss": 0.6423, + "step": 12750 + }, + { + "epoch": 0.9648518784760319, + "grad_norm": 2.217108964920044, + "learning_rate": 6.060325736716133e-08, + "loss": 0.6364, + "step": 12751 + }, + { + "epoch": 0.9649275471983656, + "grad_norm": 2.219142436981201, + "learning_rate": 6.034260989920037e-08, + "loss": 0.7133, + "step": 12752 + }, + { + "epoch": 0.9650032159206992, + "grad_norm": 2.773773670196533, + "learning_rate": 6.008252226149957e-08, + "loss": 0.5822, + "step": 12753 + }, + { + "epoch": 0.9650788846430328, + "grad_norm": 2.4652557373046875, + "learning_rate": 5.982299447034589e-08, + "loss": 0.6448, + "step": 12754 + }, + { + "epoch": 0.9651545533653664, + "grad_norm": 1.8013569116592407, + "learning_rate": 5.9564026541992333e-08, + "loss": 0.4925, + "step": 12755 + }, + { + "epoch": 0.9652302220877, + "grad_norm": 2.425915241241455, + "learning_rate": 5.930561849265592e-08, + "loss": 0.6024, + "step": 12756 + }, + { + "epoch": 0.9653058908100337, + "grad_norm": 2.936739921569824, + "learning_rate": 5.9047770338520714e-08, + "loss": 0.7074, + "step": 12757 + }, + { + "epoch": 0.9653815595323673, + "grad_norm": 1.838965892791748, + "learning_rate": 5.879048209573079e-08, + "loss": 0.7531, + "step": 12758 + }, + { + "epoch": 0.9654572282547009, + "grad_norm": 2.287705421447754, + "learning_rate": 5.853375378040227e-08, + "loss": 0.6203, + "step": 12759 + }, + { + "epoch": 0.9655328969770346, + "grad_norm": 2.7539687156677246, + "learning_rate": 5.827758540860928e-08, + "loss": 0.5758, + "step": 12760 + }, + { + "epoch": 0.9656085656993681, + "grad_norm": 1.8179734945297241, + "learning_rate": 5.8021976996394e-08, + "loss": 0.5622, + "step": 12761 + }, + { + "epoch": 0.9656842344217018, + "grad_norm": 2.9774842262268066, + "learning_rate": 5.776692855976562e-08, + "loss": 0.7241, + "step": 12762 + }, + { + "epoch": 0.9657599031440354, + "grad_norm": 1.650606632232666, + "learning_rate": 5.751244011469536e-08, + "loss": 0.743, + "step": 12763 + }, + { + "epoch": 0.965835571866369, + "grad_norm": 2.1129419803619385, + "learning_rate": 5.7258511677118485e-08, + "loss": 0.6685, + "step": 12764 + }, + { + "epoch": 0.9659112405887027, + "grad_norm": 3.256843328475952, + "learning_rate": 5.7005143262938266e-08, + "loss": 0.6937, + "step": 12765 + }, + { + "epoch": 0.9659869093110363, + "grad_norm": 2.002124309539795, + "learning_rate": 5.675233488802101e-08, + "loss": 0.7681, + "step": 12766 + }, + { + "epoch": 0.9660625780333699, + "grad_norm": 2.2396044731140137, + "learning_rate": 5.650008656819905e-08, + "loss": 0.5746, + 
"step": 12767 + }, + { + "epoch": 0.9661382467557035, + "grad_norm": 2.126986503601074, + "learning_rate": 5.624839831926776e-08, + "loss": 0.6165, + "step": 12768 + }, + { + "epoch": 0.9662139154780371, + "grad_norm": 1.8271424770355225, + "learning_rate": 5.5997270156989525e-08, + "loss": 0.6282, + "step": 12769 + }, + { + "epoch": 0.9662895842003708, + "grad_norm": 2.457106828689575, + "learning_rate": 5.574670209709176e-08, + "loss": 0.6688, + "step": 12770 + }, + { + "epoch": 0.9663652529227044, + "grad_norm": 2.185253381729126, + "learning_rate": 5.5496694155262925e-08, + "loss": 0.677, + "step": 12771 + }, + { + "epoch": 0.966440921645038, + "grad_norm": 1.8739707469940186, + "learning_rate": 5.524724634716149e-08, + "loss": 0.6415, + "step": 12772 + }, + { + "epoch": 0.9665165903673717, + "grad_norm": 2.169405221939087, + "learning_rate": 5.499835868840997e-08, + "loss": 0.5788, + "step": 12773 + }, + { + "epoch": 0.9665922590897053, + "grad_norm": 2.6439480781555176, + "learning_rate": 5.4750031194590875e-08, + "loss": 0.6599, + "step": 12774 + }, + { + "epoch": 0.9666679278120389, + "grad_norm": 2.3185319900512695, + "learning_rate": 5.4502263881258784e-08, + "loss": 0.6413, + "step": 12775 + }, + { + "epoch": 0.9667435965343725, + "grad_norm": 1.7796145677566528, + "learning_rate": 5.425505676392728e-08, + "loss": 0.601, + "step": 12776 + }, + { + "epoch": 0.9668192652567061, + "grad_norm": 1.9893208742141724, + "learning_rate": 5.4008409858077977e-08, + "loss": 0.6446, + "step": 12777 + }, + { + "epoch": 0.9668949339790398, + "grad_norm": 2.2175159454345703, + "learning_rate": 5.376232317915752e-08, + "loss": 0.6573, + "step": 12778 + }, + { + "epoch": 0.9669706027013734, + "grad_norm": 3.3002736568450928, + "learning_rate": 5.351679674257559e-08, + "loss": 0.6405, + "step": 12779 + }, + { + "epoch": 0.967046271423707, + "grad_norm": 2.012585401535034, + "learning_rate": 5.327183056370888e-08, + "loss": 0.6775, + "step": 12780 + }, + { + "epoch": 0.9671219401460406, + "grad_norm": 2.179703712463379, + "learning_rate": 5.302742465789712e-08, + "loss": 0.5052, + "step": 12781 + }, + { + "epoch": 0.9671976088683742, + "grad_norm": 2.5398223400115967, + "learning_rate": 5.278357904044606e-08, + "loss": 0.7536, + "step": 12782 + }, + { + "epoch": 0.9672732775907079, + "grad_norm": 2.5192582607269287, + "learning_rate": 5.2540293726625497e-08, + "loss": 0.7409, + "step": 12783 + }, + { + "epoch": 0.9673489463130415, + "grad_norm": 2.220503091812134, + "learning_rate": 5.229756873167224e-08, + "loss": 0.6389, + "step": 12784 + }, + { + "epoch": 0.9674246150353751, + "grad_norm": 2.697296142578125, + "learning_rate": 5.205540407078513e-08, + "loss": 0.77, + "step": 12785 + }, + { + "epoch": 0.9675002837577088, + "grad_norm": 2.3187201023101807, + "learning_rate": 5.1813799759130034e-08, + "loss": 0.6236, + "step": 12786 + }, + { + "epoch": 0.9675759524800424, + "grad_norm": 2.0090014934539795, + "learning_rate": 5.157275581183585e-08, + "loss": 0.6874, + "step": 12787 + }, + { + "epoch": 0.967651621202376, + "grad_norm": 1.9001106023788452, + "learning_rate": 5.13322722439995e-08, + "loss": 0.6184, + "step": 12788 + }, + { + "epoch": 0.9677272899247096, + "grad_norm": 2.549781560897827, + "learning_rate": 5.1092349070678944e-08, + "loss": 0.6552, + "step": 12789 + }, + { + "epoch": 0.9678029586470432, + "grad_norm": 2.1971235275268555, + "learning_rate": 5.085298630690016e-08, + "loss": 0.6244, + "step": 12790 + }, + { + "epoch": 0.9678786273693769, + "grad_norm": 3.237020492553711, + 
"learning_rate": 5.061418396765316e-08, + "loss": 0.7338, + "step": 12791 + }, + { + "epoch": 0.9679542960917105, + "grad_norm": 2.797713279724121, + "learning_rate": 5.0375942067890976e-08, + "loss": 0.7075, + "step": 12792 + }, + { + "epoch": 0.9680299648140441, + "grad_norm": 2.096550941467285, + "learning_rate": 5.013826062253368e-08, + "loss": 0.5549, + "step": 12793 + }, + { + "epoch": 0.9681056335363777, + "grad_norm": 1.996402621269226, + "learning_rate": 4.9901139646466364e-08, + "loss": 0.6456, + "step": 12794 + }, + { + "epoch": 0.9681813022587114, + "grad_norm": 1.7945371866226196, + "learning_rate": 4.966457915453815e-08, + "loss": 0.5903, + "step": 12795 + }, + { + "epoch": 0.968256970981045, + "grad_norm": 2.276614189147949, + "learning_rate": 4.9428579161562184e-08, + "loss": 0.7016, + "step": 12796 + }, + { + "epoch": 0.9683326397033786, + "grad_norm": 2.1116700172424316, + "learning_rate": 4.919313968231765e-08, + "loss": 0.6455, + "step": 12797 + }, + { + "epoch": 0.9684083084257122, + "grad_norm": 2.1539816856384277, + "learning_rate": 4.895826073155074e-08, + "loss": 0.7131, + "step": 12798 + }, + { + "epoch": 0.9684839771480459, + "grad_norm": 2.0993354320526123, + "learning_rate": 4.872394232396771e-08, + "loss": 0.6818, + "step": 12799 + }, + { + "epoch": 0.9685596458703795, + "grad_norm": 2.5427918434143066, + "learning_rate": 4.8490184474243806e-08, + "loss": 0.472, + "step": 12800 + }, + { + "epoch": 0.9686353145927131, + "grad_norm": 2.007993459701538, + "learning_rate": 4.825698719701632e-08, + "loss": 0.6107, + "step": 12801 + }, + { + "epoch": 0.9687109833150467, + "grad_norm": 2.1347367763519287, + "learning_rate": 4.802435050689058e-08, + "loss": 0.7914, + "step": 12802 + }, + { + "epoch": 0.9687866520373803, + "grad_norm": 2.216740846633911, + "learning_rate": 4.779227441843392e-08, + "loss": 0.702, + "step": 12803 + }, + { + "epoch": 0.968862320759714, + "grad_norm": 2.9680533409118652, + "learning_rate": 4.756075894618073e-08, + "loss": 0.6582, + "step": 12804 + }, + { + "epoch": 0.9689379894820476, + "grad_norm": 2.0287156105041504, + "learning_rate": 4.7329804104627394e-08, + "loss": 0.6694, + "step": 12805 + }, + { + "epoch": 0.9690136582043812, + "grad_norm": 2.646369695663452, + "learning_rate": 4.7099409908239355e-08, + "loss": 0.6483, + "step": 12806 + }, + { + "epoch": 0.9690893269267148, + "grad_norm": 2.547806978225708, + "learning_rate": 4.686957637144207e-08, + "loss": 0.5989, + "step": 12807 + }, + { + "epoch": 0.9691649956490485, + "grad_norm": 2.441589832305908, + "learning_rate": 4.664030350863102e-08, + "loss": 0.75, + "step": 12808 + }, + { + "epoch": 0.9692406643713821, + "grad_norm": 2.0199062824249268, + "learning_rate": 4.641159133416273e-08, + "loss": 0.6601, + "step": 12809 + }, + { + "epoch": 0.9693163330937157, + "grad_norm": 2.6701488494873047, + "learning_rate": 4.618343986235973e-08, + "loss": 0.7961, + "step": 12810 + }, + { + "epoch": 0.9693920018160493, + "grad_norm": 2.2760307788848877, + "learning_rate": 4.5955849107509603e-08, + "loss": 0.5234, + "step": 12811 + }, + { + "epoch": 0.969467670538383, + "grad_norm": 2.0577778816223145, + "learning_rate": 4.572881908386495e-08, + "loss": 0.5675, + "step": 12812 + }, + { + "epoch": 0.9695433392607166, + "grad_norm": 2.3686912059783936, + "learning_rate": 4.5502349805643385e-08, + "loss": 0.6366, + "step": 12813 + }, + { + "epoch": 0.9696190079830502, + "grad_norm": 2.451944351196289, + "learning_rate": 4.527644128702757e-08, + "loss": 0.7198, + "step": 12814 + }, + { + 
"epoch": 0.9696946767053838, + "grad_norm": 2.4918670654296875, + "learning_rate": 4.505109354216419e-08, + "loss": 0.5377, + "step": 12815 + }, + { + "epoch": 0.9697703454277175, + "grad_norm": 1.964171051979065, + "learning_rate": 4.4826306585164955e-08, + "loss": 0.6491, + "step": 12816 + }, + { + "epoch": 0.9698460141500511, + "grad_norm": 2.240342855453491, + "learning_rate": 4.4602080430106605e-08, + "loss": 0.7502, + "step": 12817 + }, + { + "epoch": 0.9699216828723847, + "grad_norm": 2.0190277099609375, + "learning_rate": 4.437841509103091e-08, + "loss": 0.6771, + "step": 12818 + }, + { + "epoch": 0.9699973515947183, + "grad_norm": 2.1004180908203125, + "learning_rate": 4.415531058194566e-08, + "loss": 0.675, + "step": 12819 + }, + { + "epoch": 0.9700730203170519, + "grad_norm": 2.711425304412842, + "learning_rate": 4.3932766916821684e-08, + "loss": 0.6182, + "step": 12820 + }, + { + "epoch": 0.9701486890393856, + "grad_norm": 2.490840435028076, + "learning_rate": 4.371078410959484e-08, + "loss": 0.6801, + "step": 12821 + }, + { + "epoch": 0.9702243577617192, + "grad_norm": 2.158508777618408, + "learning_rate": 4.348936217416599e-08, + "loss": 0.6393, + "step": 12822 + }, + { + "epoch": 0.9703000264840528, + "grad_norm": 2.2075629234313965, + "learning_rate": 4.326850112440306e-08, + "loss": 0.5681, + "step": 12823 + }, + { + "epoch": 0.9703756952063864, + "grad_norm": 2.199986457824707, + "learning_rate": 4.304820097413698e-08, + "loss": 0.7375, + "step": 12824 + }, + { + "epoch": 0.9704513639287201, + "grad_norm": 2.364950180053711, + "learning_rate": 4.2828461737161706e-08, + "loss": 0.6709, + "step": 12825 + }, + { + "epoch": 0.9705270326510537, + "grad_norm": 2.256744861602783, + "learning_rate": 4.2609283427239245e-08, + "loss": 0.4733, + "step": 12826 + }, + { + "epoch": 0.9706027013733873, + "grad_norm": 2.7820794582366943, + "learning_rate": 4.2390666058095606e-08, + "loss": 0.7549, + "step": 12827 + }, + { + "epoch": 0.9706783700957209, + "grad_norm": 2.726006507873535, + "learning_rate": 4.2172609643420846e-08, + "loss": 0.8422, + "step": 12828 + }, + { + "epoch": 0.9707540388180546, + "grad_norm": 2.408484697341919, + "learning_rate": 4.1955114196870035e-08, + "loss": 0.6995, + "step": 12829 + }, + { + "epoch": 0.9708297075403882, + "grad_norm": 1.8652710914611816, + "learning_rate": 4.1738179732064286e-08, + "loss": 0.6074, + "step": 12830 + }, + { + "epoch": 0.9709053762627218, + "grad_norm": 1.773630976676941, + "learning_rate": 4.152180626258772e-08, + "loss": 0.7438, + "step": 12831 + }, + { + "epoch": 0.9709810449850554, + "grad_norm": 1.8957676887512207, + "learning_rate": 4.1305993801991514e-08, + "loss": 0.6815, + "step": 12832 + }, + { + "epoch": 0.971056713707389, + "grad_norm": 2.1224136352539062, + "learning_rate": 4.109074236378885e-08, + "loss": 0.5116, + "step": 12833 + }, + { + "epoch": 0.9711323824297227, + "grad_norm": 2.1865241527557373, + "learning_rate": 4.087605196146094e-08, + "loss": 0.6051, + "step": 12834 + }, + { + "epoch": 0.9712080511520563, + "grad_norm": 2.5317890644073486, + "learning_rate": 4.066192260845303e-08, + "loss": 0.6102, + "step": 12835 + }, + { + "epoch": 0.9712837198743899, + "grad_norm": 2.466912269592285, + "learning_rate": 4.0448354318172395e-08, + "loss": 0.6871, + "step": 12836 + }, + { + "epoch": 0.9713593885967235, + "grad_norm": 2.1204843521118164, + "learning_rate": 4.023534710399435e-08, + "loss": 0.5491, + "step": 12837 + }, + { + "epoch": 0.9714350573190572, + "grad_norm": 2.1326029300689697, + "learning_rate": 
4.0022900979259206e-08, + "loss": 0.549, + "step": 12838 + }, + { + "epoch": 0.9715107260413908, + "grad_norm": 2.1497695446014404, + "learning_rate": 3.981101595726933e-08, + "loss": 0.6592, + "step": 12839 + }, + { + "epoch": 0.9715863947637244, + "grad_norm": 1.952813744544983, + "learning_rate": 3.95996920512951e-08, + "loss": 0.6218, + "step": 12840 + }, + { + "epoch": 0.971662063486058, + "grad_norm": 2.6534552574157715, + "learning_rate": 3.938892927456994e-08, + "loss": 0.6288, + "step": 12841 + }, + { + "epoch": 0.9717377322083917, + "grad_norm": 1.9842441082000732, + "learning_rate": 3.917872764029129e-08, + "loss": 0.6466, + "step": 12842 + }, + { + "epoch": 0.9718134009307253, + "grad_norm": 2.3986659049987793, + "learning_rate": 3.8969087161622616e-08, + "loss": 0.6197, + "step": 12843 + }, + { + "epoch": 0.9718890696530589, + "grad_norm": 2.6465704441070557, + "learning_rate": 3.8760007851695423e-08, + "loss": 0.7109, + "step": 12844 + }, + { + "epoch": 0.9719647383753925, + "grad_norm": 2.349898338317871, + "learning_rate": 3.855148972359923e-08, + "loss": 0.6664, + "step": 12845 + }, + { + "epoch": 0.9720404070977261, + "grad_norm": 2.368164300918579, + "learning_rate": 3.83435327903936e-08, + "loss": 0.5955, + "step": 12846 + }, + { + "epoch": 0.9721160758200598, + "grad_norm": 2.0866646766662598, + "learning_rate": 3.8136137065102104e-08, + "loss": 0.6821, + "step": 12847 + }, + { + "epoch": 0.9721917445423934, + "grad_norm": 2.3973708152770996, + "learning_rate": 3.7929302560711365e-08, + "loss": 0.6995, + "step": 12848 + }, + { + "epoch": 0.972267413264727, + "grad_norm": 2.413179397583008, + "learning_rate": 3.772302929017502e-08, + "loss": 0.6289, + "step": 12849 + }, + { + "epoch": 0.9723430819870607, + "grad_norm": 2.0390636920928955, + "learning_rate": 3.7517317266409725e-08, + "loss": 0.6386, + "step": 12850 + }, + { + "epoch": 0.9724187507093943, + "grad_norm": 2.3033881187438965, + "learning_rate": 3.7312166502298184e-08, + "loss": 0.6435, + "step": 12851 + }, + { + "epoch": 0.9724944194317279, + "grad_norm": 2.501260995864868, + "learning_rate": 3.710757701068812e-08, + "loss": 0.6013, + "step": 12852 + }, + { + "epoch": 0.9725700881540615, + "grad_norm": 2.2412755489349365, + "learning_rate": 3.6903548804390283e-08, + "loss": 0.637, + "step": 12853 + }, + { + "epoch": 0.9726457568763951, + "grad_norm": 1.7953567504882812, + "learning_rate": 3.670008189618246e-08, + "loss": 0.6847, + "step": 12854 + }, + { + "epoch": 0.9727214255987288, + "grad_norm": 2.0152835845947266, + "learning_rate": 3.6497176298807445e-08, + "loss": 0.6373, + "step": 12855 + }, + { + "epoch": 0.9727970943210624, + "grad_norm": 2.8471808433532715, + "learning_rate": 3.629483202497008e-08, + "loss": 0.6311, + "step": 12856 + }, + { + "epoch": 0.972872763043396, + "grad_norm": 1.9700920581817627, + "learning_rate": 3.6093049087342236e-08, + "loss": 0.6387, + "step": 12857 + }, + { + "epoch": 0.9729484317657296, + "grad_norm": 2.660348653793335, + "learning_rate": 3.589182749855979e-08, + "loss": 0.6581, + "step": 12858 + }, + { + "epoch": 0.9730241004880632, + "grad_norm": 2.1217596530914307, + "learning_rate": 3.5691167271225676e-08, + "loss": 0.5784, + "step": 12859 + }, + { + "epoch": 0.9730997692103969, + "grad_norm": 2.266953945159912, + "learning_rate": 3.549106841790484e-08, + "loss": 0.7355, + "step": 12860 + }, + { + "epoch": 0.9731754379327305, + "grad_norm": 2.1540300846099854, + "learning_rate": 3.5291530951127247e-08, + "loss": 0.6858, + "step": 12861 + }, + { + "epoch": 
0.9732511066550641, + "grad_norm": 2.050753355026245, + "learning_rate": 3.5092554883389916e-08, + "loss": 0.617, + "step": 12862 + }, + { + "epoch": 0.9733267753773978, + "grad_norm": 4.436022758483887, + "learning_rate": 3.489414022715287e-08, + "loss": 0.567, + "step": 12863 + }, + { + "epoch": 0.9734024440997314, + "grad_norm": 2.4525585174560547, + "learning_rate": 3.4696286994841176e-08, + "loss": 0.7939, + "step": 12864 + }, + { + "epoch": 0.973478112822065, + "grad_norm": 1.9763293266296387, + "learning_rate": 3.449899519884492e-08, + "loss": 0.5779, + "step": 12865 + }, + { + "epoch": 0.9735537815443986, + "grad_norm": 1.7585806846618652, + "learning_rate": 3.430226485152021e-08, + "loss": 0.7401, + "step": 12866 + }, + { + "epoch": 0.9736294502667322, + "grad_norm": 1.8818296194076538, + "learning_rate": 3.410609596518621e-08, + "loss": 0.6261, + "step": 12867 + }, + { + "epoch": 0.9737051189890659, + "grad_norm": 2.081613302230835, + "learning_rate": 3.3910488552127085e-08, + "loss": 0.6814, + "step": 12868 + }, + { + "epoch": 0.9737807877113995, + "grad_norm": 2.024808168411255, + "learning_rate": 3.3715442624594025e-08, + "loss": 0.6636, + "step": 12869 + }, + { + "epoch": 0.9738564564337331, + "grad_norm": 3.159187078475952, + "learning_rate": 3.352095819479928e-08, + "loss": 0.6785, + "step": 12870 + }, + { + "epoch": 0.9739321251560668, + "grad_norm": 2.1414241790771484, + "learning_rate": 3.332703527492409e-08, + "loss": 0.7564, + "step": 12871 + }, + { + "epoch": 0.9740077938784004, + "grad_norm": 2.4196348190307617, + "learning_rate": 3.3133673877111745e-08, + "loss": 0.573, + "step": 12872 + }, + { + "epoch": 0.974083462600734, + "grad_norm": 2.986185312271118, + "learning_rate": 3.2940874013470567e-08, + "loss": 0.653, + "step": 12873 + }, + { + "epoch": 0.9741591313230676, + "grad_norm": 2.1942930221557617, + "learning_rate": 3.274863569607489e-08, + "loss": 0.832, + "step": 12874 + }, + { + "epoch": 0.9742348000454012, + "grad_norm": 2.0108728408813477, + "learning_rate": 3.255695893696309e-08, + "loss": 0.5848, + "step": 12875 + }, + { + "epoch": 0.9743104687677349, + "grad_norm": 1.9388446807861328, + "learning_rate": 3.2365843748139554e-08, + "loss": 0.7127, + "step": 12876 + }, + { + "epoch": 0.9743861374900685, + "grad_norm": 4.268650054931641, + "learning_rate": 3.2175290141571725e-08, + "loss": 0.6322, + "step": 12877 + }, + { + "epoch": 0.9744618062124021, + "grad_norm": 2.069544553756714, + "learning_rate": 3.198529812919204e-08, + "loss": 0.6331, + "step": 12878 + }, + { + "epoch": 0.9745374749347357, + "grad_norm": 2.953239679336548, + "learning_rate": 3.1795867722898995e-08, + "loss": 0.6101, + "step": 12879 + }, + { + "epoch": 0.9746131436570693, + "grad_norm": 2.6282474994659424, + "learning_rate": 3.16069989345561e-08, + "loss": 0.6981, + "step": 12880 + }, + { + "epoch": 0.974688812379403, + "grad_norm": 2.169370412826538, + "learning_rate": 3.141869177598988e-08, + "loss": 0.6561, + "step": 12881 + }, + { + "epoch": 0.9747644811017366, + "grad_norm": 2.562084913253784, + "learning_rate": 3.123094625899292e-08, + "loss": 0.6634, + "step": 12882 + }, + { + "epoch": 0.9748401498240702, + "grad_norm": 2.8010025024414062, + "learning_rate": 3.1043762395321804e-08, + "loss": 0.6248, + "step": 12883 + }, + { + "epoch": 0.9749158185464039, + "grad_norm": 1.9485735893249512, + "learning_rate": 3.085714019670116e-08, + "loss": 0.6315, + "step": 12884 + }, + { + "epoch": 0.9749914872687375, + "grad_norm": 2.397359848022461, + "learning_rate": 
3.067107967481464e-08, + "loss": 0.6059, + "step": 12885 + }, + { + "epoch": 0.9750671559910711, + "grad_norm": 2.445741891860962, + "learning_rate": 3.0485580841315916e-08, + "loss": 0.5743, + "step": 12886 + }, + { + "epoch": 0.9751428247134047, + "grad_norm": 11.2733736038208, + "learning_rate": 3.030064370782171e-08, + "loss": 0.6714, + "step": 12887 + }, + { + "epoch": 0.9752184934357383, + "grad_norm": 2.1173009872436523, + "learning_rate": 3.011626828591274e-08, + "loss": 0.7012, + "step": 12888 + }, + { + "epoch": 0.975294162158072, + "grad_norm": 2.1452109813690186, + "learning_rate": 2.9932454587133784e-08, + "loss": 0.7467, + "step": 12889 + }, + { + "epoch": 0.9753698308804056, + "grad_norm": 2.0479838848114014, + "learning_rate": 2.9749202622998628e-08, + "loss": 0.6547, + "step": 12890 + }, + { + "epoch": 0.9754454996027392, + "grad_norm": 2.4938087463378906, + "learning_rate": 2.9566512404981096e-08, + "loss": 0.6852, + "step": 12891 + }, + { + "epoch": 0.9755211683250729, + "grad_norm": 1.6364085674285889, + "learning_rate": 2.9384383944522032e-08, + "loss": 0.6448, + "step": 12892 + }, + { + "epoch": 0.9755968370474064, + "grad_norm": 2.1803574562072754, + "learning_rate": 2.9202817253028314e-08, + "loss": 0.6075, + "step": 12893 + }, + { + "epoch": 0.9756725057697401, + "grad_norm": 1.8982106447219849, + "learning_rate": 2.9021812341868847e-08, + "loss": 0.6446, + "step": 12894 + }, + { + "epoch": 0.9757481744920737, + "grad_norm": 2.0787320137023926, + "learning_rate": 2.8841369222378566e-08, + "loss": 0.7196, + "step": 12895 + }, + { + "epoch": 0.9758238432144073, + "grad_norm": 2.5845391750335693, + "learning_rate": 2.866148790585843e-08, + "loss": 0.5858, + "step": 12896 + }, + { + "epoch": 0.975899511936741, + "grad_norm": 1.972965955734253, + "learning_rate": 2.8482168403573427e-08, + "loss": 0.6546, + "step": 12897 + }, + { + "epoch": 0.9759751806590746, + "grad_norm": 2.0222232341766357, + "learning_rate": 2.8303410726751576e-08, + "loss": 0.645, + "step": 12898 + }, + { + "epoch": 0.9760508493814082, + "grad_norm": 2.261725425720215, + "learning_rate": 2.8125214886588923e-08, + "loss": 0.6731, + "step": 12899 + }, + { + "epoch": 0.9761265181037418, + "grad_norm": 2.3352370262145996, + "learning_rate": 2.7947580894242542e-08, + "loss": 0.6196, + "step": 12900 + }, + { + "epoch": 0.9762021868260754, + "grad_norm": 2.054361581802368, + "learning_rate": 2.777050876083953e-08, + "loss": 0.5426, + "step": 12901 + }, + { + "epoch": 0.9762778555484091, + "grad_norm": 2.2071480751037598, + "learning_rate": 2.759399849746602e-08, + "loss": 0.6568, + "step": 12902 + }, + { + "epoch": 0.9763535242707427, + "grad_norm": 2.048191785812378, + "learning_rate": 2.7418050115176176e-08, + "loss": 0.7078, + "step": 12903 + }, + { + "epoch": 0.9764291929930763, + "grad_norm": 2.7316575050354004, + "learning_rate": 2.7242663624989172e-08, + "loss": 0.6979, + "step": 12904 + }, + { + "epoch": 0.97650486171541, + "grad_norm": 2.226996660232544, + "learning_rate": 2.706783903788823e-08, + "loss": 0.5833, + "step": 12905 + }, + { + "epoch": 0.9765805304377435, + "grad_norm": 2.25785231590271, + "learning_rate": 2.6893576364821593e-08, + "loss": 0.6875, + "step": 12906 + }, + { + "epoch": 0.9766561991600772, + "grad_norm": 2.408618927001953, + "learning_rate": 2.6719875616701528e-08, + "loss": 0.7061, + "step": 12907 + }, + { + "epoch": 0.9767318678824108, + "grad_norm": 2.1182861328125, + "learning_rate": 2.6546736804405337e-08, + "loss": 0.6309, + "step": 12908 + }, + { + "epoch": 
0.9768075366047444, + "grad_norm": 2.118018388748169, + "learning_rate": 2.6374159938777342e-08, + "loss": 0.6792, + "step": 12909 + }, + { + "epoch": 0.9768832053270781, + "grad_norm": 2.3272552490234375, + "learning_rate": 2.6202145030621904e-08, + "loss": 0.7025, + "step": 12910 + }, + { + "epoch": 0.9769588740494117, + "grad_norm": 2.4793472290039062, + "learning_rate": 2.6030692090714404e-08, + "loss": 0.6057, + "step": 12911 + }, + { + "epoch": 0.9770345427717453, + "grad_norm": 2.1847920417785645, + "learning_rate": 2.585980112978925e-08, + "loss": 0.7071, + "step": 12912 + }, + { + "epoch": 0.977110211494079, + "grad_norm": 1.9853038787841797, + "learning_rate": 2.5689472158549888e-08, + "loss": 0.6536, + "step": 12913 + }, + { + "epoch": 0.9771858802164125, + "grad_norm": 2.087451934814453, + "learning_rate": 2.5519705187662778e-08, + "loss": 0.5842, + "step": 12914 + }, + { + "epoch": 0.9772615489387462, + "grad_norm": 2.3047263622283936, + "learning_rate": 2.535050022775742e-08, + "loss": 0.555, + "step": 12915 + }, + { + "epoch": 0.9773372176610798, + "grad_norm": 1.9747389554977417, + "learning_rate": 2.518185728943234e-08, + "loss": 0.5205, + "step": 12916 + }, + { + "epoch": 0.9774128863834134, + "grad_norm": 2.134197235107422, + "learning_rate": 2.5013776383247088e-08, + "loss": 0.7488, + "step": 12917 + }, + { + "epoch": 0.9774885551057471, + "grad_norm": 2.0079126358032227, + "learning_rate": 2.4846257519727246e-08, + "loss": 0.666, + "step": 12918 + }, + { + "epoch": 0.9775642238280806, + "grad_norm": 2.0842461585998535, + "learning_rate": 2.4679300709364416e-08, + "loss": 0.6784, + "step": 12919 + }, + { + "epoch": 0.9776398925504143, + "grad_norm": 2.3097853660583496, + "learning_rate": 2.4512905962613242e-08, + "loss": 0.7302, + "step": 12920 + }, + { + "epoch": 0.977715561272748, + "grad_norm": 2.2408077716827393, + "learning_rate": 2.4347073289894382e-08, + "loss": 0.7462, + "step": 12921 + }, + { + "epoch": 0.9777912299950815, + "grad_norm": 2.217599630355835, + "learning_rate": 2.4181802701592537e-08, + "loss": 0.6856, + "step": 12922 + }, + { + "epoch": 0.9778668987174152, + "grad_norm": 2.31512188911438, + "learning_rate": 2.401709420805842e-08, + "loss": 0.6898, + "step": 12923 + }, + { + "epoch": 0.9779425674397488, + "grad_norm": 2.1751387119293213, + "learning_rate": 2.3852947819604788e-08, + "loss": 0.5326, + "step": 12924 + }, + { + "epoch": 0.9780182361620824, + "grad_norm": 2.364943265914917, + "learning_rate": 2.3689363546511413e-08, + "loss": 0.6644, + "step": 12925 + }, + { + "epoch": 0.978093904884416, + "grad_norm": 1.4398491382598877, + "learning_rate": 2.3526341399024097e-08, + "loss": 0.7215, + "step": 12926 + }, + { + "epoch": 0.9781695736067496, + "grad_norm": 2.1224849224090576, + "learning_rate": 2.3363881387349684e-08, + "loss": 0.6535, + "step": 12927 + }, + { + "epoch": 0.9782452423290833, + "grad_norm": 2.169689178466797, + "learning_rate": 2.3201983521664027e-08, + "loss": 0.6147, + "step": 12928 + }, + { + "epoch": 0.9783209110514169, + "grad_norm": 2.206355094909668, + "learning_rate": 2.304064781210402e-08, + "loss": 0.7304, + "step": 12929 + }, + { + "epoch": 0.9783965797737505, + "grad_norm": 2.7798423767089844, + "learning_rate": 2.2879874268773583e-08, + "loss": 0.6318, + "step": 12930 + }, + { + "epoch": 0.9784722484960842, + "grad_norm": 2.0683248043060303, + "learning_rate": 2.2719662901741656e-08, + "loss": 0.6629, + "step": 12931 + }, + { + "epoch": 0.9785479172184177, + "grad_norm": 3.2282752990722656, + "learning_rate": 
2.2560013721039217e-08, + "loss": 0.6315, + "step": 12932 + }, + { + "epoch": 0.9786235859407514, + "grad_norm": 2.0770423412323, + "learning_rate": 2.240092673666627e-08, + "loss": 0.5093, + "step": 12933 + }, + { + "epoch": 0.978699254663085, + "grad_norm": 2.091789722442627, + "learning_rate": 2.2242401958584847e-08, + "loss": 0.6643, + "step": 12934 + }, + { + "epoch": 0.9787749233854186, + "grad_norm": 2.1290385723114014, + "learning_rate": 2.2084439396721002e-08, + "loss": 0.6081, + "step": 12935 + }, + { + "epoch": 0.9788505921077523, + "grad_norm": 1.8989332914352417, + "learning_rate": 2.1927039060966825e-08, + "loss": 0.5779, + "step": 12936 + }, + { + "epoch": 0.9789262608300859, + "grad_norm": 2.21238112449646, + "learning_rate": 2.177020096118143e-08, + "loss": 0.6956, + "step": 12937 + }, + { + "epoch": 0.9790019295524195, + "grad_norm": 2.3626580238342285, + "learning_rate": 2.1613925107184962e-08, + "loss": 0.7432, + "step": 12938 + }, + { + "epoch": 0.9790775982747532, + "grad_norm": 2.329190254211426, + "learning_rate": 2.1458211508763594e-08, + "loss": 0.7325, + "step": 12939 + }, + { + "epoch": 0.9791532669970867, + "grad_norm": 3.697888135910034, + "learning_rate": 2.130306017566952e-08, + "loss": 0.6174, + "step": 12940 + }, + { + "epoch": 0.9792289357194204, + "grad_norm": 1.962836503982544, + "learning_rate": 2.1148471117617972e-08, + "loss": 0.6935, + "step": 12941 + }, + { + "epoch": 0.979304604441754, + "grad_norm": 2.527402877807617, + "learning_rate": 2.0994444344291207e-08, + "loss": 0.7468, + "step": 12942 + }, + { + "epoch": 0.9793802731640876, + "grad_norm": 3.003408670425415, + "learning_rate": 2.084097986533351e-08, + "loss": 0.7296, + "step": 12943 + }, + { + "epoch": 0.9794559418864213, + "grad_norm": 1.8468185663223267, + "learning_rate": 2.068807769035519e-08, + "loss": 0.6277, + "step": 12944 + }, + { + "epoch": 0.9795316106087548, + "grad_norm": 2.2292520999908447, + "learning_rate": 2.053573782893259e-08, + "loss": 0.6755, + "step": 12945 + }, + { + "epoch": 0.9796072793310885, + "grad_norm": 3.2633378505706787, + "learning_rate": 2.0383960290605076e-08, + "loss": 0.637, + "step": 12946 + }, + { + "epoch": 0.9796829480534222, + "grad_norm": 2.093501329421997, + "learning_rate": 2.0232745084878046e-08, + "loss": 0.6773, + "step": 12947 + }, + { + "epoch": 0.9797586167757557, + "grad_norm": 2.401658773422241, + "learning_rate": 2.0082092221220925e-08, + "loss": 0.6135, + "step": 12948 + }, + { + "epoch": 0.9798342854980894, + "grad_norm": 2.732320547103882, + "learning_rate": 1.9932001709066172e-08, + "loss": 0.5327, + "step": 12949 + }, + { + "epoch": 0.979909954220423, + "grad_norm": 2.547961711883545, + "learning_rate": 1.978247355781626e-08, + "loss": 0.684, + "step": 12950 + }, + { + "epoch": 0.9799856229427566, + "grad_norm": 2.245232105255127, + "learning_rate": 1.9633507776831704e-08, + "loss": 0.5505, + "step": 12951 + }, + { + "epoch": 0.9800612916650903, + "grad_norm": 2.435441732406616, + "learning_rate": 1.948510437544404e-08, + "loss": 0.5674, + "step": 12952 + }, + { + "epoch": 0.9801369603874238, + "grad_norm": 2.2396018505096436, + "learning_rate": 1.9337263362945833e-08, + "loss": 0.6784, + "step": 12953 + }, + { + "epoch": 0.9802126291097575, + "grad_norm": 1.6761008501052856, + "learning_rate": 1.918998474859468e-08, + "loss": 0.6931, + "step": 12954 + }, + { + "epoch": 0.9802882978320911, + "grad_norm": 1.8858741521835327, + "learning_rate": 1.90432685416142e-08, + "loss": 0.6703, + "step": 12955 + }, + { + "epoch": 
0.9803639665544247, + "grad_norm": 2.178541421890259, + "learning_rate": 1.8897114751192046e-08, + "loss": 0.669, + "step": 12956 + }, + { + "epoch": 0.9804396352767584, + "grad_norm": 2.8125827312469482, + "learning_rate": 1.8751523386480896e-08, + "loss": 0.7143, + "step": 12957 + }, + { + "epoch": 0.9805153039990919, + "grad_norm": 2.164923667907715, + "learning_rate": 1.8606494456599453e-08, + "loss": 0.5678, + "step": 12958 + }, + { + "epoch": 0.9805909727214256, + "grad_norm": 1.9167901277542114, + "learning_rate": 1.846202797062746e-08, + "loss": 0.5551, + "step": 12959 + }, + { + "epoch": 0.9806666414437593, + "grad_norm": 4.015124320983887, + "learning_rate": 1.8318123937612674e-08, + "loss": 0.6921, + "step": 12960 + }, + { + "epoch": 0.9807423101660928, + "grad_norm": 2.202143907546997, + "learning_rate": 1.8174782366567887e-08, + "loss": 0.6212, + "step": 12961 + }, + { + "epoch": 0.9808179788884265, + "grad_norm": 2.2554047107696533, + "learning_rate": 1.803200326646992e-08, + "loss": 0.9251, + "step": 12962 + }, + { + "epoch": 0.9808936476107601, + "grad_norm": 2.1025631427764893, + "learning_rate": 1.7889786646257622e-08, + "loss": 0.6946, + "step": 12963 + }, + { + "epoch": 0.9809693163330937, + "grad_norm": 2.0117862224578857, + "learning_rate": 1.7748132514838868e-08, + "loss": 0.7143, + "step": 12964 + }, + { + "epoch": 0.9810449850554274, + "grad_norm": 2.1950743198394775, + "learning_rate": 1.7607040881084558e-08, + "loss": 0.5661, + "step": 12965 + }, + { + "epoch": 0.9811206537777609, + "grad_norm": 2.6558985710144043, + "learning_rate": 1.7466511753830626e-08, + "loss": 0.5764, + "step": 12966 + }, + { + "epoch": 0.9811963225000946, + "grad_norm": 1.9270297288894653, + "learning_rate": 1.7326545141875038e-08, + "loss": 0.6928, + "step": 12967 + }, + { + "epoch": 0.9812719912224283, + "grad_norm": 1.9736948013305664, + "learning_rate": 1.7187141053985776e-08, + "loss": 0.6422, + "step": 12968 + }, + { + "epoch": 0.9813476599447618, + "grad_norm": 2.117433786392212, + "learning_rate": 1.7048299498891862e-08, + "loss": 0.6823, + "step": 12969 + }, + { + "epoch": 0.9814233286670955, + "grad_norm": 1.8852287530899048, + "learning_rate": 1.6910020485287338e-08, + "loss": 0.7003, + "step": 12970 + }, + { + "epoch": 0.981498997389429, + "grad_norm": 2.4513890743255615, + "learning_rate": 1.6772304021832275e-08, + "loss": 0.7829, + "step": 12971 + }, + { + "epoch": 0.9815746661117627, + "grad_norm": 2.3832831382751465, + "learning_rate": 1.6635150117150776e-08, + "loss": 0.7601, + "step": 12972 + }, + { + "epoch": 0.9816503348340964, + "grad_norm": 2.286743402481079, + "learning_rate": 1.6498558779831973e-08, + "loss": 0.7211, + "step": 12973 + }, + { + "epoch": 0.9817260035564299, + "grad_norm": 3.2696454524993896, + "learning_rate": 1.6362530018430022e-08, + "loss": 0.7254, + "step": 12974 + }, + { + "epoch": 0.9818016722787636, + "grad_norm": 2.6552700996398926, + "learning_rate": 1.6227063841462108e-08, + "loss": 0.7234, + "step": 12975 + }, + { + "epoch": 0.9818773410010972, + "grad_norm": 2.3581531047821045, + "learning_rate": 1.6092160257413446e-08, + "loss": 0.7437, + "step": 12976 + }, + { + "epoch": 0.9819530097234308, + "grad_norm": 3.002993106842041, + "learning_rate": 1.5957819274730277e-08, + "loss": 0.6852, + "step": 12977 + }, + { + "epoch": 0.9820286784457645, + "grad_norm": 2.3881561756134033, + "learning_rate": 1.5824040901826876e-08, + "loss": 0.608, + "step": 12978 + }, + { + "epoch": 0.982104347168098, + "grad_norm": 2.43519926071167, + "learning_rate": 
1.5690825147080533e-08, + "loss": 0.6577, + "step": 12979 + }, + { + "epoch": 0.9821800158904317, + "grad_norm": 2.051454544067383, + "learning_rate": 1.5558172018833584e-08, + "loss": 0.6634, + "step": 12980 + }, + { + "epoch": 0.9822556846127654, + "grad_norm": 2.9443256855010986, + "learning_rate": 1.5426081525392377e-08, + "loss": 0.5589, + "step": 12981 + }, + { + "epoch": 0.9823313533350989, + "grad_norm": 2.4073567390441895, + "learning_rate": 1.52945536750303e-08, + "loss": 0.6178, + "step": 12982 + }, + { + "epoch": 0.9824070220574326, + "grad_norm": 2.8144779205322266, + "learning_rate": 1.516358847598376e-08, + "loss": 0.6876, + "step": 12983 + }, + { + "epoch": 0.9824826907797661, + "grad_norm": 2.132742166519165, + "learning_rate": 1.50331859364522e-08, + "loss": 0.6289, + "step": 12984 + }, + { + "epoch": 0.9825583595020998, + "grad_norm": 2.0860514640808105, + "learning_rate": 1.4903346064605085e-08, + "loss": 0.6644, + "step": 12985 + }, + { + "epoch": 0.9826340282244335, + "grad_norm": 2.203529119491577, + "learning_rate": 1.4774068868570911e-08, + "loss": 0.7307, + "step": 12986 + }, + { + "epoch": 0.982709696946767, + "grad_norm": 1.9949660301208496, + "learning_rate": 1.4645354356446206e-08, + "loss": 0.7352, + "step": 12987 + }, + { + "epoch": 0.9827853656691007, + "grad_norm": 1.9843783378601074, + "learning_rate": 1.4517202536291519e-08, + "loss": 0.6633, + "step": 12988 + }, + { + "epoch": 0.9828610343914344, + "grad_norm": 2.39467191696167, + "learning_rate": 1.438961341613243e-08, + "loss": 0.4948, + "step": 12989 + }, + { + "epoch": 0.9829367031137679, + "grad_norm": 2.534012794494629, + "learning_rate": 1.4262587003959549e-08, + "loss": 0.6207, + "step": 12990 + }, + { + "epoch": 0.9830123718361016, + "grad_norm": 2.261028528213501, + "learning_rate": 1.4136123307725512e-08, + "loss": 0.6348, + "step": 12991 + }, + { + "epoch": 0.9830880405584351, + "grad_norm": 2.1703314781188965, + "learning_rate": 1.4010222335351985e-08, + "loss": 0.5724, + "step": 12992 + }, + { + "epoch": 0.9831637092807688, + "grad_norm": 2.269578218460083, + "learning_rate": 1.3884884094722662e-08, + "loss": 0.6435, + "step": 12993 + }, + { + "epoch": 0.9832393780031025, + "grad_norm": 2.1392128467559814, + "learning_rate": 1.376010859368626e-08, + "loss": 0.714, + "step": 12994 + }, + { + "epoch": 0.983315046725436, + "grad_norm": 1.8886982202529907, + "learning_rate": 1.3635895840056534e-08, + "loss": 0.5763, + "step": 12995 + }, + { + "epoch": 0.9833907154477697, + "grad_norm": 2.211404323577881, + "learning_rate": 1.3512245841613257e-08, + "loss": 0.6028, + "step": 12996 + }, + { + "epoch": 0.9834663841701032, + "grad_norm": 1.8366492986679077, + "learning_rate": 1.338915860609824e-08, + "loss": 0.6571, + "step": 12997 + }, + { + "epoch": 0.9835420528924369, + "grad_norm": 3.9259090423583984, + "learning_rate": 1.3266634141220312e-08, + "loss": 0.7254, + "step": 12998 + }, + { + "epoch": 0.9836177216147706, + "grad_norm": 2.866379499435425, + "learning_rate": 1.314467245465334e-08, + "loss": 0.6933, + "step": 12999 + }, + { + "epoch": 0.9836933903371041, + "grad_norm": 1.9616422653198242, + "learning_rate": 1.302327355403321e-08, + "loss": 0.6589, + "step": 13000 + }, + { + "epoch": 0.9837690590594378, + "grad_norm": 2.223374843597412, + "learning_rate": 1.2902437446962844e-08, + "loss": 0.6306, + "step": 13001 + }, + { + "epoch": 0.9838447277817715, + "grad_norm": 2.437225818634033, + "learning_rate": 1.2782164141010188e-08, + "loss": 0.841, + "step": 13002 + }, + { + "epoch": 
0.983920396504105, + "grad_norm": 2.194181203842163, + "learning_rate": 1.2662453643706217e-08, + "loss": 0.6298, + "step": 13003 + }, + { + "epoch": 0.9839960652264387, + "grad_norm": 2.2740933895111084, + "learning_rate": 1.2543305962548935e-08, + "loss": 0.6895, + "step": 13004 + }, + { + "epoch": 0.9840717339487722, + "grad_norm": 2.391655206680298, + "learning_rate": 1.2424721104997371e-08, + "loss": 0.7314, + "step": 13005 + }, + { + "epoch": 0.9841474026711059, + "grad_norm": 2.1336898803710938, + "learning_rate": 1.2306699078479588e-08, + "loss": 0.609, + "step": 13006 + }, + { + "epoch": 0.9842230713934396, + "grad_norm": 4.608248710632324, + "learning_rate": 1.2189239890386672e-08, + "loss": 0.7341, + "step": 13007 + }, + { + "epoch": 0.9842987401157731, + "grad_norm": 3.4450154304504395, + "learning_rate": 1.207234354807374e-08, + "loss": 0.6952, + "step": 13008 + }, + { + "epoch": 0.9843744088381068, + "grad_norm": 2.1393322944641113, + "learning_rate": 1.1956010058859934e-08, + "loss": 0.6627, + "step": 13009 + }, + { + "epoch": 0.9844500775604403, + "grad_norm": 2.124804973602295, + "learning_rate": 1.1840239430032429e-08, + "loss": 0.5678, + "step": 13010 + }, + { + "epoch": 0.984525746282774, + "grad_norm": 2.0524282455444336, + "learning_rate": 1.1725031668840425e-08, + "loss": 0.7415, + "step": 13011 + }, + { + "epoch": 0.9846014150051077, + "grad_norm": 1.9901623725891113, + "learning_rate": 1.161038678249815e-08, + "loss": 0.541, + "step": 13012 + }, + { + "epoch": 0.9846770837274412, + "grad_norm": 2.330474376678467, + "learning_rate": 1.1496304778185863e-08, + "loss": 0.7449, + "step": 13013 + }, + { + "epoch": 0.9847527524497749, + "grad_norm": 2.8459479808807373, + "learning_rate": 1.1382785663046846e-08, + "loss": 0.7962, + "step": 13014 + }, + { + "epoch": 0.9848284211721086, + "grad_norm": 2.1679039001464844, + "learning_rate": 1.1269829444191416e-08, + "loss": 0.639, + "step": 13015 + }, + { + "epoch": 0.9849040898944421, + "grad_norm": 1.800337314605713, + "learning_rate": 1.1157436128691911e-08, + "loss": 0.5612, + "step": 13016 + }, + { + "epoch": 0.9849797586167758, + "grad_norm": 2.2454311847686768, + "learning_rate": 1.1045605723586705e-08, + "loss": 0.6121, + "step": 13017 + }, + { + "epoch": 0.9850554273391093, + "grad_norm": 2.1114368438720703, + "learning_rate": 1.0934338235879193e-08, + "loss": 0.8751, + "step": 13018 + }, + { + "epoch": 0.985131096061443, + "grad_norm": 2.5677621364593506, + "learning_rate": 1.0823633672538802e-08, + "loss": 0.5946, + "step": 13019 + }, + { + "epoch": 0.9852067647837767, + "grad_norm": 2.1673922538757324, + "learning_rate": 1.0713492040495986e-08, + "loss": 0.65, + "step": 13020 + }, + { + "epoch": 0.9852824335061102, + "grad_norm": 2.8875181674957275, + "learning_rate": 1.060391334664923e-08, + "loss": 0.5534, + "step": 13021 + }, + { + "epoch": 0.9853581022284439, + "grad_norm": 2.2637150287628174, + "learning_rate": 1.0494897597861041e-08, + "loss": 0.6161, + "step": 13022 + }, + { + "epoch": 0.9854337709507774, + "grad_norm": 2.323862314224243, + "learning_rate": 1.0386444800957962e-08, + "loss": 0.7628, + "step": 13023 + }, + { + "epoch": 0.9855094396731111, + "grad_norm": 2.155294179916382, + "learning_rate": 1.0278554962731557e-08, + "loss": 0.657, + "step": 13024 + }, + { + "epoch": 0.9855851083954448, + "grad_norm": 2.271121025085449, + "learning_rate": 1.0171228089938422e-08, + "loss": 0.63, + "step": 13025 + }, + { + "epoch": 0.9856607771177783, + "grad_norm": 2.7450411319732666, + "learning_rate": 
1.0064464189300181e-08, + "loss": 0.6431, + "step": 13026 + }, + { + "epoch": 0.985736445840112, + "grad_norm": 1.9051148891448975, + "learning_rate": 9.958263267501488e-09, + "loss": 0.5382, + "step": 13027 + }, + { + "epoch": 0.9858121145624457, + "grad_norm": 2.6679155826568604, + "learning_rate": 9.852625331193021e-09, + "loss": 0.5978, + "step": 13028 + }, + { + "epoch": 0.9858877832847792, + "grad_norm": 2.288546323776245, + "learning_rate": 9.747550386991488e-09, + "loss": 0.5773, + "step": 13029 + }, + { + "epoch": 0.9859634520071129, + "grad_norm": 2.641033411026001, + "learning_rate": 9.643038441476626e-09, + "loss": 0.6219, + "step": 13030 + }, + { + "epoch": 0.9860391207294464, + "grad_norm": 3.3034932613372803, + "learning_rate": 9.539089501193199e-09, + "loss": 0.7107, + "step": 13031 + }, + { + "epoch": 0.9861147894517801, + "grad_norm": 1.9344165325164795, + "learning_rate": 9.43570357265e-09, + "loss": 0.7148, + "step": 13032 + }, + { + "epoch": 0.9861904581741138, + "grad_norm": 2.1721315383911133, + "learning_rate": 9.332880662321852e-09, + "loss": 0.6956, + "step": 13033 + }, + { + "epoch": 0.9862661268964473, + "grad_norm": 1.8685091733932495, + "learning_rate": 9.230620776648602e-09, + "loss": 0.583, + "step": 13034 + }, + { + "epoch": 0.986341795618781, + "grad_norm": 1.820870280265808, + "learning_rate": 9.128923922033128e-09, + "loss": 0.5824, + "step": 13035 + }, + { + "epoch": 0.9864174643411145, + "grad_norm": 2.135571002960205, + "learning_rate": 9.027790104845335e-09, + "loss": 0.7753, + "step": 13036 + }, + { + "epoch": 0.9864931330634482, + "grad_norm": 3.053931474685669, + "learning_rate": 8.927219331417158e-09, + "loss": 0.6846, + "step": 13037 + }, + { + "epoch": 0.9865688017857819, + "grad_norm": 2.2578110694885254, + "learning_rate": 8.82721160804656e-09, + "loss": 0.6994, + "step": 13038 + }, + { + "epoch": 0.9866444705081154, + "grad_norm": 2.533595561981201, + "learning_rate": 8.727766940997528e-09, + "loss": 0.7083, + "step": 13039 + }, + { + "epoch": 0.9867201392304491, + "grad_norm": 2.277336835861206, + "learning_rate": 8.628885336497084e-09, + "loss": 0.8314, + "step": 13040 + }, + { + "epoch": 0.9867958079527828, + "grad_norm": 2.3916232585906982, + "learning_rate": 8.530566800738272e-09, + "loss": 0.6584, + "step": 13041 + }, + { + "epoch": 0.9868714766751163, + "grad_norm": 2.5253095626831055, + "learning_rate": 8.432811339876168e-09, + "loss": 0.6123, + "step": 13042 + }, + { + "epoch": 0.98694714539745, + "grad_norm": 1.7095435857772827, + "learning_rate": 8.335618960033876e-09, + "loss": 0.8094, + "step": 13043 + }, + { + "epoch": 0.9870228141197835, + "grad_norm": 2.151761293411255, + "learning_rate": 8.238989667297526e-09, + "loss": 0.716, + "step": 13044 + }, + { + "epoch": 0.9870984828421172, + "grad_norm": 2.2348501682281494, + "learning_rate": 8.142923467718277e-09, + "loss": 0.7185, + "step": 13045 + }, + { + "epoch": 0.9871741515644509, + "grad_norm": 2.2857894897460938, + "learning_rate": 8.047420367313319e-09, + "loss": 0.7555, + "step": 13046 + }, + { + "epoch": 0.9872498202867844, + "grad_norm": 2.075570821762085, + "learning_rate": 7.952480372061866e-09, + "loss": 0.6908, + "step": 13047 + }, + { + "epoch": 0.9873254890091181, + "grad_norm": 2.3449413776397705, + "learning_rate": 7.858103487910161e-09, + "loss": 0.6586, + "step": 13048 + }, + { + "epoch": 0.9874011577314517, + "grad_norm": 1.8678113222122192, + "learning_rate": 7.764289720767482e-09, + "loss": 0.6416, + "step": 13049 + }, + { + "epoch": 0.9874768264537853, + 
"grad_norm": 1.8483861684799194, + "learning_rate": 7.671039076510123e-09, + "loss": 0.6662, + "step": 13050 + }, + { + "epoch": 0.987552495176119, + "grad_norm": 2.6870992183685303, + "learning_rate": 7.578351560976416e-09, + "loss": 0.7492, + "step": 13051 + }, + { + "epoch": 0.9876281638984525, + "grad_norm": 2.3274192810058594, + "learning_rate": 7.486227179971717e-09, + "loss": 0.7949, + "step": 13052 + }, + { + "epoch": 0.9877038326207862, + "grad_norm": 2.314833641052246, + "learning_rate": 7.394665939264411e-09, + "loss": 0.7524, + "step": 13053 + }, + { + "epoch": 0.9877795013431199, + "grad_norm": 2.826127767562866, + "learning_rate": 7.303667844589912e-09, + "loss": 0.6682, + "step": 13054 + }, + { + "epoch": 0.9878551700654534, + "grad_norm": 2.203782081604004, + "learning_rate": 7.213232901644662e-09, + "loss": 0.5775, + "step": 13055 + }, + { + "epoch": 0.9879308387877871, + "grad_norm": 2.5531766414642334, + "learning_rate": 7.12336111609313e-09, + "loss": 0.6546, + "step": 13056 + }, + { + "epoch": 0.9880065075101206, + "grad_norm": 2.4287493228912354, + "learning_rate": 7.034052493562815e-09, + "loss": 0.7344, + "step": 13057 + }, + { + "epoch": 0.9880821762324543, + "grad_norm": 1.6254581212997437, + "learning_rate": 6.945307039647242e-09, + "loss": 0.8126, + "step": 13058 + }, + { + "epoch": 0.988157844954788, + "grad_norm": 2.4416098594665527, + "learning_rate": 6.857124759903966e-09, + "loss": 0.6512, + "step": 13059 + }, + { + "epoch": 0.9882335136771215, + "grad_norm": 2.1222572326660156, + "learning_rate": 6.769505659854569e-09, + "loss": 0.7188, + "step": 13060 + }, + { + "epoch": 0.9883091823994552, + "grad_norm": 2.3706514835357666, + "learning_rate": 6.682449744986663e-09, + "loss": 0.5678, + "step": 13061 + }, + { + "epoch": 0.9883848511217888, + "grad_norm": 1.8657547235488892, + "learning_rate": 6.5959570207508864e-09, + "loss": 0.7439, + "step": 13062 + }, + { + "epoch": 0.9884605198441224, + "grad_norm": 2.860536813735962, + "learning_rate": 6.5100274925649075e-09, + "loss": 0.5991, + "step": 13063 + }, + { + "epoch": 0.9885361885664561, + "grad_norm": 2.22926664352417, + "learning_rate": 6.42466116580942e-09, + "loss": 0.817, + "step": 13064 + }, + { + "epoch": 0.9886118572887896, + "grad_norm": 1.8725168704986572, + "learning_rate": 6.339858045830149e-09, + "loss": 0.6259, + "step": 13065 + }, + { + "epoch": 0.9886875260111233, + "grad_norm": 2.0670039653778076, + "learning_rate": 6.255618137938845e-09, + "loss": 0.6289, + "step": 13066 + }, + { + "epoch": 0.988763194733457, + "grad_norm": 2.009124517440796, + "learning_rate": 6.17194144740929e-09, + "loss": 0.6825, + "step": 13067 + }, + { + "epoch": 0.9888388634557905, + "grad_norm": 1.9414106607437134, + "learning_rate": 6.088827979483291e-09, + "loss": 0.4782, + "step": 13068 + }, + { + "epoch": 0.9889145321781242, + "grad_norm": 2.9034688472747803, + "learning_rate": 6.006277739363686e-09, + "loss": 0.6774, + "step": 13069 + }, + { + "epoch": 0.9889902009004577, + "grad_norm": 2.372115135192871, + "learning_rate": 5.924290732221338e-09, + "loss": 0.6958, + "step": 13070 + }, + { + "epoch": 0.9890658696227914, + "grad_norm": 1.480709433555603, + "learning_rate": 5.842866963190141e-09, + "loss": 0.6686, + "step": 13071 + }, + { + "epoch": 0.9891415383451251, + "grad_norm": 2.762083053588867, + "learning_rate": 5.762006437370015e-09, + "loss": 0.6242, + "step": 13072 + }, + { + "epoch": 0.9892172070674586, + "grad_norm": 2.282045364379883, + "learning_rate": 5.681709159822912e-09, + "loss": 0.5928, + 
"step": 13073 + }, + { + "epoch": 0.9892928757897923, + "grad_norm": 2.124495029449463, + "learning_rate": 5.601975135578807e-09, + "loss": 0.5653, + "step": 13074 + }, + { + "epoch": 0.9893685445121259, + "grad_norm": 2.060455560684204, + "learning_rate": 5.522804369630707e-09, + "loss": 0.6898, + "step": 13075 + }, + { + "epoch": 0.9894442132344595, + "grad_norm": 2.3240835666656494, + "learning_rate": 5.444196866935647e-09, + "loss": 0.6417, + "step": 13076 + }, + { + "epoch": 0.9895198819567932, + "grad_norm": 2.27996826171875, + "learning_rate": 5.366152632417687e-09, + "loss": 0.5804, + "step": 13077 + }, + { + "epoch": 0.9895955506791267, + "grad_norm": 2.054598569869995, + "learning_rate": 5.288671670962919e-09, + "loss": 0.7136, + "step": 13078 + }, + { + "epoch": 0.9896712194014604, + "grad_norm": 2.2513556480407715, + "learning_rate": 5.211753987423462e-09, + "loss": 0.8557, + "step": 13079 + }, + { + "epoch": 0.9897468881237941, + "grad_norm": 2.1142704486846924, + "learning_rate": 5.135399586617462e-09, + "loss": 0.6724, + "step": 13080 + }, + { + "epoch": 0.9898225568461276, + "grad_norm": 2.2248411178588867, + "learning_rate": 5.059608473325095e-09, + "loss": 0.6099, + "step": 13081 + }, + { + "epoch": 0.9898982255684613, + "grad_norm": 3.098581552505493, + "learning_rate": 4.984380652293563e-09, + "loss": 0.5557, + "step": 13082 + }, + { + "epoch": 0.9899738942907949, + "grad_norm": 1.7447338104248047, + "learning_rate": 4.909716128234098e-09, + "loss": 0.6664, + "step": 13083 + }, + { + "epoch": 0.9900495630131285, + "grad_norm": 2.116192102432251, + "learning_rate": 4.835614905820962e-09, + "loss": 0.6416, + "step": 13084 + }, + { + "epoch": 0.9901252317354622, + "grad_norm": 3.5600974559783936, + "learning_rate": 4.762076989695441e-09, + "loss": 0.5656, + "step": 13085 + }, + { + "epoch": 0.9902009004577957, + "grad_norm": 2.4236409664154053, + "learning_rate": 4.689102384462851e-09, + "loss": 0.6626, + "step": 13086 + }, + { + "epoch": 0.9902765691801294, + "grad_norm": 1.842241644859314, + "learning_rate": 4.616691094693537e-09, + "loss": 0.6851, + "step": 13087 + }, + { + "epoch": 0.990352237902463, + "grad_norm": 2.1137871742248535, + "learning_rate": 4.5448431249218715e-09, + "loss": 0.5285, + "step": 13088 + }, + { + "epoch": 0.9904279066247966, + "grad_norm": 2.3001229763031006, + "learning_rate": 4.473558479646256e-09, + "loss": 0.7189, + "step": 13089 + }, + { + "epoch": 0.9905035753471303, + "grad_norm": 2.6320502758026123, + "learning_rate": 4.402837163331119e-09, + "loss": 0.6221, + "step": 13090 + }, + { + "epoch": 0.9905792440694638, + "grad_norm": 2.4288251399993896, + "learning_rate": 4.332679180406918e-09, + "loss": 0.5301, + "step": 13091 + }, + { + "epoch": 0.9906549127917975, + "grad_norm": 2.138550043106079, + "learning_rate": 4.2630845352651384e-09, + "loss": 0.6783, + "step": 13092 + }, + { + "epoch": 0.9907305815141312, + "grad_norm": 2.3440093994140625, + "learning_rate": 4.1940532322642946e-09, + "loss": 0.7674, + "step": 13093 + }, + { + "epoch": 0.9908062502364647, + "grad_norm": 1.6465092897415161, + "learning_rate": 4.125585275728927e-09, + "loss": 0.6484, + "step": 13094 + }, + { + "epoch": 0.9908819189587984, + "grad_norm": 2.4605023860931396, + "learning_rate": 4.057680669944608e-09, + "loss": 0.7061, + "step": 13095 + }, + { + "epoch": 0.990957587681132, + "grad_norm": 1.8242436647415161, + "learning_rate": 3.990339419164935e-09, + "loss": 0.7324, + "step": 13096 + }, + { + "epoch": 0.9910332564034656, + "grad_norm": 2.511643648147583, + 
"learning_rate": 3.923561527606534e-09, + "loss": 0.6348, + "step": 13097 + }, + { + "epoch": 0.9911089251257993, + "grad_norm": 2.8463854789733887, + "learning_rate": 3.857346999452061e-09, + "loss": 0.7027, + "step": 13098 + }, + { + "epoch": 0.9911845938481328, + "grad_norm": 1.9040359258651733, + "learning_rate": 3.7916958388481974e-09, + "loss": 0.6504, + "step": 13099 + }, + { + "epoch": 0.9912602625704665, + "grad_norm": 2.7657530307769775, + "learning_rate": 3.726608049904656e-09, + "loss": 0.8033, + "step": 13100 + }, + { + "epoch": 0.9913359312928002, + "grad_norm": 2.482766628265381, + "learning_rate": 3.662083636698177e-09, + "loss": 0.6571, + "step": 13101 + }, + { + "epoch": 0.9914116000151337, + "grad_norm": 2.55605411529541, + "learning_rate": 3.598122603270526e-09, + "loss": 0.6026, + "step": 13102 + }, + { + "epoch": 0.9914872687374674, + "grad_norm": 2.0801234245300293, + "learning_rate": 3.534724953625501e-09, + "loss": 0.6188, + "step": 13103 + }, + { + "epoch": 0.991562937459801, + "grad_norm": 3.96653413772583, + "learning_rate": 3.4718906917349245e-09, + "loss": 0.6812, + "step": 13104 + }, + { + "epoch": 0.9916386061821346, + "grad_norm": 2.2530484199523926, + "learning_rate": 3.4096198215326504e-09, + "loss": 0.7594, + "step": 13105 + }, + { + "epoch": 0.9917142749044683, + "grad_norm": 2.784916639328003, + "learning_rate": 3.347912346917559e-09, + "loss": 0.6711, + "step": 13106 + }, + { + "epoch": 0.9917899436268018, + "grad_norm": 2.254490852355957, + "learning_rate": 3.286768271756557e-09, + "loss": 0.6553, + "step": 13107 + }, + { + "epoch": 0.9918656123491355, + "grad_norm": 1.7609950304031372, + "learning_rate": 3.226187599875585e-09, + "loss": 0.6226, + "step": 13108 + }, + { + "epoch": 0.9919412810714691, + "grad_norm": 2.008504629135132, + "learning_rate": 3.166170335070606e-09, + "loss": 0.6512, + "step": 13109 + }, + { + "epoch": 0.9920169497938027, + "grad_norm": 2.3329126834869385, + "learning_rate": 3.106716481098615e-09, + "loss": 0.7869, + "step": 13110 + }, + { + "epoch": 0.9920926185161364, + "grad_norm": 2.263775587081909, + "learning_rate": 3.0478260416846314e-09, + "loss": 0.5358, + "step": 13111 + }, + { + "epoch": 0.99216828723847, + "grad_norm": 2.2455990314483643, + "learning_rate": 2.9894990205147076e-09, + "loss": 0.5658, + "step": 13112 + }, + { + "epoch": 0.9922439559608036, + "grad_norm": 1.8885899782180786, + "learning_rate": 2.931735421241921e-09, + "loss": 0.744, + "step": 13113 + }, + { + "epoch": 0.9923196246831373, + "grad_norm": 1.837786316871643, + "learning_rate": 2.874535247484378e-09, + "loss": 0.5932, + "step": 13114 + }, + { + "epoch": 0.9923952934054708, + "grad_norm": 2.2272021770477295, + "learning_rate": 2.817898502824212e-09, + "loss": 0.6078, + "step": 13115 + }, + { + "epoch": 0.9924709621278045, + "grad_norm": 5.379904747009277, + "learning_rate": 2.7618251908065884e-09, + "loss": 0.7149, + "step": 13116 + }, + { + "epoch": 0.9925466308501381, + "grad_norm": 2.127641201019287, + "learning_rate": 2.706315314944696e-09, + "loss": 0.6478, + "step": 13117 + }, + { + "epoch": 0.9926222995724717, + "grad_norm": 2.2539281845092773, + "learning_rate": 2.6513688787137557e-09, + "loss": 0.672, + "step": 13118 + }, + { + "epoch": 0.9926979682948054, + "grad_norm": 2.355245351791382, + "learning_rate": 2.5969858855560138e-09, + "loss": 0.5535, + "step": 13119 + }, + { + "epoch": 0.9927736370171389, + "grad_norm": 2.0226423740386963, + "learning_rate": 2.543166338874747e-09, + "loss": 0.5628, + "step": 13120 + }, + { + 
"epoch": 0.9928493057394726, + "grad_norm": 2.060910701751709, + "learning_rate": 2.4899102420422593e-09, + "loss": 0.787, + "step": 13121 + }, + { + "epoch": 0.9929249744618062, + "grad_norm": 2.399125576019287, + "learning_rate": 2.4372175983938817e-09, + "loss": 0.5982, + "step": 13122 + }, + { + "epoch": 0.9930006431841398, + "grad_norm": 3.2431602478027344, + "learning_rate": 2.385088411227976e-09, + "loss": 0.6401, + "step": 13123 + }, + { + "epoch": 0.9930763119064735, + "grad_norm": 1.9777895212173462, + "learning_rate": 2.333522683808931e-09, + "loss": 0.5933, + "step": 13124 + }, + { + "epoch": 0.993151980628807, + "grad_norm": 2.409860372543335, + "learning_rate": 2.2825204193681613e-09, + "loss": 0.7402, + "step": 13125 + }, + { + "epoch": 0.9932276493511407, + "grad_norm": 2.1435985565185547, + "learning_rate": 2.232081621097115e-09, + "loss": 0.7274, + "step": 13126 + }, + { + "epoch": 0.9933033180734744, + "grad_norm": 2.2561211585998535, + "learning_rate": 2.1822062921552644e-09, + "loss": 0.5811, + "step": 13127 + }, + { + "epoch": 0.9933789867958079, + "grad_norm": 2.5629122257232666, + "learning_rate": 2.132894435666111e-09, + "loss": 0.763, + "step": 13128 + }, + { + "epoch": 0.9934546555181416, + "grad_norm": 2.6304588317871094, + "learning_rate": 2.0841460547181833e-09, + "loss": 0.653, + "step": 13129 + }, + { + "epoch": 0.9935303242404752, + "grad_norm": 2.282959222793579, + "learning_rate": 2.035961152364041e-09, + "loss": 0.5899, + "step": 13130 + }, + { + "epoch": 0.9936059929628088, + "grad_norm": 2.0651817321777344, + "learning_rate": 1.9883397316202702e-09, + "loss": 0.693, + "step": 13131 + }, + { + "epoch": 0.9936816616851425, + "grad_norm": 2.5978622436523438, + "learning_rate": 1.941281795470484e-09, + "loss": 0.7713, + "step": 13132 + }, + { + "epoch": 0.993757330407476, + "grad_norm": 2.876408338546753, + "learning_rate": 1.894787346860327e-09, + "loss": 0.5898, + "step": 13133 + }, + { + "epoch": 0.9938329991298097, + "grad_norm": 2.2665717601776123, + "learning_rate": 1.848856388702469e-09, + "loss": 0.7736, + "step": 13134 + }, + { + "epoch": 0.9939086678521433, + "grad_norm": 2.229184150695801, + "learning_rate": 1.8034889238726093e-09, + "loss": 0.5054, + "step": 13135 + }, + { + "epoch": 0.9939843365744769, + "grad_norm": 2.301201820373535, + "learning_rate": 1.7586849552114758e-09, + "loss": 0.5685, + "step": 13136 + }, + { + "epoch": 0.9940600052968106, + "grad_norm": 2.13543701171875, + "learning_rate": 1.7144444855258234e-09, + "loss": 0.6025, + "step": 13137 + }, + { + "epoch": 0.9941356740191442, + "grad_norm": 1.9200706481933594, + "learning_rate": 1.6707675175854363e-09, + "loss": 0.4845, + "step": 13138 + }, + { + "epoch": 0.9942113427414778, + "grad_norm": 2.272440195083618, + "learning_rate": 1.6276540541261265e-09, + "loss": 0.6253, + "step": 13139 + }, + { + "epoch": 0.9942870114638115, + "grad_norm": 2.107754945755005, + "learning_rate": 1.5851040978467346e-09, + "loss": 0.7687, + "step": 13140 + }, + { + "epoch": 0.994362680186145, + "grad_norm": 2.178772449493408, + "learning_rate": 1.5431176514131285e-09, + "loss": 0.5703, + "step": 13141 + }, + { + "epoch": 0.9944383489084787, + "grad_norm": 2.395909070968628, + "learning_rate": 1.5016947174532058e-09, + "loss": 0.7377, + "step": 13142 + }, + { + "epoch": 0.9945140176308123, + "grad_norm": 1.9375349283218384, + "learning_rate": 1.4608352985628904e-09, + "loss": 0.728, + "step": 13143 + }, + { + "epoch": 0.9945896863531459, + "grad_norm": 2.407315254211426, + "learning_rate": 
1.4205393972991366e-09, + "loss": 0.7251, + "step": 13144 + }, + { + "epoch": 0.9946653550754796, + "grad_norm": 2.111492872238159, + "learning_rate": 1.3808070161859255e-09, + "loss": 0.7817, + "step": 13145 + }, + { + "epoch": 0.9947410237978132, + "grad_norm": 2.2751195430755615, + "learning_rate": 1.341638157712266e-09, + "loss": 0.7493, + "step": 13146 + }, + { + "epoch": 0.9948166925201468, + "grad_norm": 2.5054099559783936, + "learning_rate": 1.303032824330197e-09, + "loss": 0.6376, + "step": 13147 + }, + { + "epoch": 0.9948923612424804, + "grad_norm": 2.099950075149536, + "learning_rate": 1.264991018457784e-09, + "loss": 0.6259, + "step": 13148 + }, + { + "epoch": 0.994968029964814, + "grad_norm": 2.118708848953247, + "learning_rate": 1.2275127424771216e-09, + "loss": 0.6925, + "step": 13149 + }, + { + "epoch": 0.9950436986871477, + "grad_norm": 4.26461935043335, + "learning_rate": 1.190597998734333e-09, + "loss": 0.6318, + "step": 13150 + }, + { + "epoch": 0.9951193674094813, + "grad_norm": 1.8085392713546753, + "learning_rate": 1.1542467895425679e-09, + "loss": 0.5068, + "step": 13151 + }, + { + "epoch": 0.9951950361318149, + "grad_norm": 2.8070521354675293, + "learning_rate": 1.1184591171780056e-09, + "loss": 0.6108, + "step": 13152 + }, + { + "epoch": 0.9952707048541486, + "grad_norm": 2.0022857189178467, + "learning_rate": 1.0832349838808542e-09, + "loss": 0.5927, + "step": 13153 + }, + { + "epoch": 0.9953463735764821, + "grad_norm": 2.230762243270874, + "learning_rate": 1.0485743918583478e-09, + "loss": 0.6105, + "step": 13154 + }, + { + "epoch": 0.9954220422988158, + "grad_norm": 2.8104093074798584, + "learning_rate": 1.0144773432797516e-09, + "loss": 0.5951, + "step": 13155 + }, + { + "epoch": 0.9954977110211494, + "grad_norm": 2.1270148754119873, + "learning_rate": 9.809438402803572e-10, + "loss": 0.6414, + "step": 13156 + }, + { + "epoch": 0.995573379743483, + "grad_norm": 2.065595865249634, + "learning_rate": 9.479738849614838e-10, + "loss": 0.5966, + "step": 13157 + }, + { + "epoch": 0.9956490484658167, + "grad_norm": 2.2215771675109863, + "learning_rate": 9.15567479386481e-10, + "loss": 0.6672, + "step": 13158 + }, + { + "epoch": 0.9957247171881503, + "grad_norm": 2.1374623775482178, + "learning_rate": 8.837246255847253e-10, + "loss": 0.6385, + "step": 13159 + }, + { + "epoch": 0.9958003859104839, + "grad_norm": 2.5752291679382324, + "learning_rate": 8.524453255516207e-10, + "loss": 0.6717, + "step": 13160 + }, + { + "epoch": 0.9958760546328175, + "grad_norm": 2.127338409423828, + "learning_rate": 8.217295812446013e-10, + "loss": 0.5726, + "step": 13161 + }, + { + "epoch": 0.9959517233551511, + "grad_norm": 2.5369412899017334, + "learning_rate": 7.915773945881277e-10, + "loss": 0.6269, + "step": 13162 + }, + { + "epoch": 0.9960273920774848, + "grad_norm": 3.5135183334350586, + "learning_rate": 7.619887674696902e-10, + "loss": 0.5283, + "step": 13163 + }, + { + "epoch": 0.9961030607998184, + "grad_norm": 2.36592960357666, + "learning_rate": 7.329637017428059e-10, + "loss": 0.6122, + "step": 13164 + }, + { + "epoch": 0.996178729522152, + "grad_norm": 2.4969322681427, + "learning_rate": 7.045021992250211e-10, + "loss": 0.6736, + "step": 13165 + }, + { + "epoch": 0.9962543982444857, + "grad_norm": 1.717811107635498, + "learning_rate": 6.766042616989098e-10, + "loss": 0.6547, + "step": 13166 + }, + { + "epoch": 0.9963300669668192, + "grad_norm": 1.9306954145431519, + "learning_rate": 6.49269890911075e-10, + "loss": 0.7854, + "step": 13167 + }, + { + "epoch": 
0.9964057356891529, + "grad_norm": 2.652772903442383, + "learning_rate": 6.224990885721482e-10, + "loss": 0.5947, + "step": 13168 + }, + { + "epoch": 0.9964814044114865, + "grad_norm": 2.094191074371338, + "learning_rate": 5.962918563607867e-10, + "loss": 0.7139, + "step": 13169 + }, + { + "epoch": 0.9965570731338201, + "grad_norm": 2.385401725769043, + "learning_rate": 5.706481959176779e-10, + "loss": 0.6746, + "step": 13170 + }, + { + "epoch": 0.9966327418561538, + "grad_norm": 2.2519052028656006, + "learning_rate": 5.455681088475383e-10, + "loss": 0.6605, + "step": 13171 + }, + { + "epoch": 0.9967084105784874, + "grad_norm": 2.4455978870391846, + "learning_rate": 5.210515967221108e-10, + "loss": 0.6044, + "step": 13172 + }, + { + "epoch": 0.996784079300821, + "grad_norm": 2.6201171875, + "learning_rate": 4.970986610761675e-10, + "loss": 0.5899, + "step": 13173 + }, + { + "epoch": 0.9968597480231546, + "grad_norm": 2.2739479541778564, + "learning_rate": 4.737093034095086e-10, + "loss": 0.6609, + "step": 13174 + }, + { + "epoch": 0.9969354167454882, + "grad_norm": 2.2343196868896484, + "learning_rate": 4.5088352518796173e-10, + "loss": 0.6017, + "step": 13175 + }, + { + "epoch": 0.9970110854678219, + "grad_norm": 2.2220582962036133, + "learning_rate": 4.286213278393847e-10, + "loss": 0.6353, + "step": 13176 + }, + { + "epoch": 0.9970867541901555, + "grad_norm": 2.405085802078247, + "learning_rate": 4.0692271275866167e-10, + "loss": 0.5783, + "step": 13177 + }, + { + "epoch": 0.9971624229124891, + "grad_norm": 2.3258261680603027, + "learning_rate": 3.8578768130470565e-10, + "loss": 0.5282, + "step": 13178 + }, + { + "epoch": 0.9972380916348228, + "grad_norm": 2.120819568634033, + "learning_rate": 3.652162348014576e-10, + "loss": 0.7857, + "step": 13179 + }, + { + "epoch": 0.9973137603571564, + "grad_norm": 2.094067335128784, + "learning_rate": 3.4520837453688726e-10, + "loss": 0.7784, + "step": 13180 + }, + { + "epoch": 0.99738942907949, + "grad_norm": 1.7406028509140015, + "learning_rate": 3.257641017629931e-10, + "loss": 0.617, + "step": 13181 + }, + { + "epoch": 0.9974650978018236, + "grad_norm": 1.7582353353500366, + "learning_rate": 3.0688341769880004e-10, + "loss": 0.6051, + "step": 13182 + }, + { + "epoch": 0.9975407665241572, + "grad_norm": 2.7545251846313477, + "learning_rate": 2.8856632352636247e-10, + "loss": 0.5891, + "step": 13183 + }, + { + "epoch": 0.9976164352464909, + "grad_norm": 2.582125425338745, + "learning_rate": 2.708128203917637e-10, + "loss": 0.6018, + "step": 13184 + }, + { + "epoch": 0.9976921039688245, + "grad_norm": 2.3737707138061523, + "learning_rate": 2.536229094081133e-10, + "loss": 0.6962, + "step": 13185 + }, + { + "epoch": 0.9977677726911581, + "grad_norm": 2.5087530612945557, + "learning_rate": 2.369965916505512e-10, + "loss": 0.7588, + "step": 13186 + }, + { + "epoch": 0.9978434414134917, + "grad_norm": 2.271505117416382, + "learning_rate": 2.2093386816124383e-10, + "loss": 0.6266, + "step": 13187 + }, + { + "epoch": 0.9979191101358253, + "grad_norm": 2.8579909801483154, + "learning_rate": 2.054347399463863e-10, + "loss": 0.7987, + "step": 13188 + }, + { + "epoch": 0.997994778858159, + "grad_norm": 1.9885179996490479, + "learning_rate": 1.9049920797620245e-10, + "loss": 0.6924, + "step": 13189 + }, + { + "epoch": 0.9980704475804926, + "grad_norm": 2.303568124771118, + "learning_rate": 1.7612727318494503e-10, + "loss": 0.783, + "step": 13190 + }, + { + "epoch": 0.9981461163028262, + "grad_norm": 2.4012231826782227, + "learning_rate": 
1.6231893647389306e-10, + "loss": 0.6901, + "step": 13191 + }, + { + "epoch": 0.9982217850251599, + "grad_norm": 2.4622113704681396, + "learning_rate": 1.4907419870835437e-10, + "loss": 0.7907, + "step": 13192 + }, + { + "epoch": 0.9982974537474935, + "grad_norm": 2.3311586380004883, + "learning_rate": 1.3639306071566714e-10, + "loss": 0.7405, + "step": 13193 + }, + { + "epoch": 0.9983731224698271, + "grad_norm": 2.215780019760132, + "learning_rate": 1.2427552329119517e-10, + "loss": 0.6988, + "step": 13194 + }, + { + "epoch": 0.9984487911921607, + "grad_norm": 2.3962464332580566, + "learning_rate": 1.12721587194331e-10, + "loss": 0.6859, + "step": 13195 + }, + { + "epoch": 0.9985244599144943, + "grad_norm": 1.8870036602020264, + "learning_rate": 1.0173125314749676e-10, + "loss": 0.6691, + "step": 13196 + }, + { + "epoch": 0.998600128636828, + "grad_norm": 1.81784188747406, + "learning_rate": 9.130452184014093e-11, + "loss": 0.5999, + "step": 13197 + }, + { + "epoch": 0.9986757973591616, + "grad_norm": 2.0552120208740234, + "learning_rate": 8.14413939237424e-11, + "loss": 0.5859, + "step": 13198 + }, + { + "epoch": 0.9987514660814952, + "grad_norm": 2.1729273796081543, + "learning_rate": 7.21418700178056e-11, + "loss": 0.5408, + "step": 13199 + }, + { + "epoch": 0.9988271348038288, + "grad_norm": 1.8984811305999756, + "learning_rate": 6.340595070286614e-11, + "loss": 0.6977, + "step": 13200 + }, + { + "epoch": 0.9989028035261625, + "grad_norm": 2.0209014415740967, + "learning_rate": 5.5233636526486054e-11, + "loss": 0.6062, + "step": 13201 + }, + { + "epoch": 0.9989784722484961, + "grad_norm": 2.05021333694458, + "learning_rate": 4.7624928001255286e-11, + "loss": 0.8087, + "step": 13202 + }, + { + "epoch": 0.9990541409708297, + "grad_norm": 2.2087504863739014, + "learning_rate": 4.057982560279339e-11, + "loss": 0.7889, + "step": 13203 + }, + { + "epoch": 0.9991298096931633, + "grad_norm": 2.0803823471069336, + "learning_rate": 3.409832977274707e-11, + "loss": 0.6568, + "step": 13204 + }, + { + "epoch": 0.999205478415497, + "grad_norm": 2.142883777618408, + "learning_rate": 2.818044091779104e-11, + "loss": 0.6263, + "step": 13205 + }, + { + "epoch": 0.9992811471378306, + "grad_norm": 2.5195977687835693, + "learning_rate": 2.2826159406630353e-11, + "loss": 0.6548, + "step": 13206 + }, + { + "epoch": 0.9993568158601642, + "grad_norm": 1.918473720550537, + "learning_rate": 1.8035485574996458e-11, + "loss": 0.6472, + "step": 13207 + }, + { + "epoch": 0.9994324845824978, + "grad_norm": 2.169924020767212, + "learning_rate": 1.380841972464797e-11, + "loss": 0.5984, + "step": 13208 + }, + { + "epoch": 0.9995081533048314, + "grad_norm": 2.4711294174194336, + "learning_rate": 1.0144962118374678e-11, + "loss": 0.6986, + "step": 13209 + }, + { + "epoch": 0.9995838220271651, + "grad_norm": 1.8922516107559204, + "learning_rate": 7.045112986991953e-12, + "loss": 0.6675, + "step": 13210 + }, + { + "epoch": 0.9996594907494987, + "grad_norm": 1.9481332302093506, + "learning_rate": 4.508872523345531e-12, + "loss": 0.6967, + "step": 13211 + }, + { + "epoch": 0.9997351594718323, + "grad_norm": 2.386504650115967, + "learning_rate": 2.536240887307528e-12, + "loss": 0.7459, + "step": 13212 + }, + { + "epoch": 0.9998108281941659, + "grad_norm": 1.9094631671905518, + "learning_rate": 1.127218201779634e-12, + "loss": 0.6635, + "step": 13213 + }, + { + "epoch": 0.9998864969164996, + "grad_norm": 2.6249496936798096, + "learning_rate": 2.818045546915116e-13, + "loss": 0.8093, + "step": 13214 + }, + { + "epoch": 
0.9999621656388332, + "grad_norm": 2.202721118927002, + "learning_rate": 0.0, + "loss": 0.7656, + "step": 13215 + }, + { + "epoch": 0.9999621656388332, + "step": 13215, + "total_flos": 4.661948874456302e+18, + "train_loss": 0.7379108269377905, + "train_runtime": 288889.3373, + "train_samples_per_second": 2.928, + "train_steps_per_second": 0.046 + } + ], + "logging_steps": 1.0, + "max_steps": 13215, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.661948874456302e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}
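
Note (not part of the original file): a minimal sketch of how a trainer_state.json like the one added above could be inspected. The path "trainer_state.json" and the particular summary printed here are assumptions for illustration; the keys used ("log_history", "loss", "learning_rate", "step", "max_steps", "train_runtime") are the fields recorded in this file, where every per-step entry carries a "loss" and the final entry holds the run summary instead.

    # Illustrative sketch, assuming the file is saved locally as "trainer_state.json".
    import json
    from statistics import mean

    with open("trainer_state.json") as f:
        state = json.load(f)

    # Per-step records carry a "loss"; the last entry is the run summary and does not.
    log = [e for e in state["log_history"] if "loss" in e]
    summary = state["log_history"][-1]

    print(f"steps logged: {len(log)} / max_steps {state['max_steps']}")
    print(f"final loss:   {log[-1]['loss']:.4f} at step {log[-1]['step']}")
    print(f"mean of last 100 losses: {mean(e['loss'] for e in log[-100:]):.4f}")
    print(f"final learning rate:     {log[-1]['learning_rate']:.3e}")
    print(f"train_runtime (s):       {summary.get('train_runtime')}")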