{ "best_metric": null, "best_model_checkpoint": null, "epoch": 18.900343642611684, "eval_steps": 500, "global_step": 11000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01718213058419244, "grad_norm": 6.171707630157471, "learning_rate": 1.7182130584192443e-06, "loss": 0.9772, "step": 10 }, { "epoch": 0.03436426116838488, "grad_norm": 8.092323303222656, "learning_rate": 3.4364261168384886e-06, "loss": 0.8776, "step": 20 }, { "epoch": 0.05154639175257732, "grad_norm": 4.56458044052124, "learning_rate": 5.154639175257732e-06, "loss": 0.9078, "step": 30 }, { "epoch": 0.06872852233676977, "grad_norm": 4.5016679763793945, "learning_rate": 6.872852233676977e-06, "loss": 0.6541, "step": 40 }, { "epoch": 0.0859106529209622, "grad_norm": 3.0442380905151367, "learning_rate": 8.591065292096221e-06, "loss": 0.4879, "step": 50 }, { "epoch": 0.10309278350515463, "grad_norm": 3.105207681655884, "learning_rate": 1.0309278350515464e-05, "loss": 0.4411, "step": 60 }, { "epoch": 0.12027491408934708, "grad_norm": 2.3539834022521973, "learning_rate": 1.2027491408934708e-05, "loss": 0.326, "step": 70 }, { "epoch": 0.13745704467353953, "grad_norm": 2.5519046783447266, "learning_rate": 1.3745704467353954e-05, "loss": 0.285, "step": 80 }, { "epoch": 0.15463917525773196, "grad_norm": 4.304392337799072, "learning_rate": 1.5463917525773197e-05, "loss": 0.2539, "step": 90 }, { "epoch": 0.1718213058419244, "grad_norm": 2.098220109939575, "learning_rate": 1.7182130584192442e-05, "loss": 0.2023, "step": 100 }, { "epoch": 0.18900343642611683, "grad_norm": 1.672839879989624, "learning_rate": 1.8900343642611683e-05, "loss": 0.1724, "step": 110 }, { "epoch": 0.20618556701030927, "grad_norm": 1.0948542356491089, "learning_rate": 2.0618556701030927e-05, "loss": 0.1859, "step": 120 }, { "epoch": 0.22336769759450173, "grad_norm": 1.5368601083755493, "learning_rate": 2.2336769759450175e-05, "loss": 0.1309, "step": 130 }, { "epoch": 0.24054982817869416, "grad_norm": 1.4471988677978516, "learning_rate": 2.4054982817869417e-05, "loss": 0.134, "step": 140 }, { "epoch": 0.25773195876288657, "grad_norm": 1.2296886444091797, "learning_rate": 2.5773195876288658e-05, "loss": 0.1439, "step": 150 }, { "epoch": 0.27491408934707906, "grad_norm": 4.241024017333984, "learning_rate": 2.749140893470791e-05, "loss": 0.1214, "step": 160 }, { "epoch": 0.2920962199312715, "grad_norm": 0.9207940697669983, "learning_rate": 2.920962199312715e-05, "loss": 0.1141, "step": 170 }, { "epoch": 0.30927835051546393, "grad_norm": 0.7457907199859619, "learning_rate": 3.0927835051546395e-05, "loss": 0.0938, "step": 180 }, { "epoch": 0.32646048109965636, "grad_norm": 2.0314087867736816, "learning_rate": 3.2646048109965636e-05, "loss": 0.1287, "step": 190 }, { "epoch": 0.3436426116838488, "grad_norm": 1.9431196451187134, "learning_rate": 3.4364261168384884e-05, "loss": 0.112, "step": 200 }, { "epoch": 0.36082474226804123, "grad_norm": 1.023748755455017, "learning_rate": 3.6082474226804125e-05, "loss": 0.1039, "step": 210 }, { "epoch": 0.37800687285223367, "grad_norm": 1.6307401657104492, "learning_rate": 3.7800687285223366e-05, "loss": 0.1076, "step": 220 }, { "epoch": 0.3951890034364261, "grad_norm": 1.2871341705322266, "learning_rate": 3.9518900343642614e-05, "loss": 0.1007, "step": 230 }, { "epoch": 0.41237113402061853, "grad_norm": 1.1707206964492798, "learning_rate": 4.1237113402061855e-05, "loss": 0.0978, "step": 240 }, { "epoch": 0.42955326460481097, "grad_norm": 1.1074360609054565, "learning_rate": 4.2955326460481096e-05, "loss": 0.0853, "step": 250 }, { "epoch": 0.44673539518900346, "grad_norm": 1.1564663648605347, "learning_rate": 4.467353951890035e-05, "loss": 0.0928, "step": 260 }, { "epoch": 0.4639175257731959, "grad_norm": 0.8830773830413818, "learning_rate": 4.639175257731959e-05, "loss": 0.0852, "step": 270 }, { "epoch": 0.48109965635738833, "grad_norm": 0.8775057792663574, "learning_rate": 4.810996563573883e-05, "loss": 0.0897, "step": 280 }, { "epoch": 0.49828178694158076, "grad_norm": 0.5895084142684937, "learning_rate": 4.982817869415808e-05, "loss": 0.0741, "step": 290 }, { "epoch": 0.5154639175257731, "grad_norm": 1.0802148580551147, "learning_rate": 5.1546391752577315e-05, "loss": 0.0829, "step": 300 }, { "epoch": 0.5326460481099656, "grad_norm": 1.7113603353500366, "learning_rate": 5.326460481099656e-05, "loss": 0.0781, "step": 310 }, { "epoch": 0.5498281786941581, "grad_norm": 1.0171607732772827, "learning_rate": 5.498281786941582e-05, "loss": 0.0794, "step": 320 }, { "epoch": 0.5670103092783505, "grad_norm": 1.2694463729858398, "learning_rate": 5.670103092783505e-05, "loss": 0.0828, "step": 330 }, { "epoch": 0.584192439862543, "grad_norm": 1.6448224782943726, "learning_rate": 5.84192439862543e-05, "loss": 0.0738, "step": 340 }, { "epoch": 0.6013745704467354, "grad_norm": 1.3152124881744385, "learning_rate": 6.013745704467354e-05, "loss": 0.0805, "step": 350 }, { "epoch": 0.6185567010309279, "grad_norm": 0.9917396306991577, "learning_rate": 6.185567010309279e-05, "loss": 0.0719, "step": 360 }, { "epoch": 0.6357388316151202, "grad_norm": 1.0059962272644043, "learning_rate": 6.357388316151203e-05, "loss": 0.0648, "step": 370 }, { "epoch": 0.6529209621993127, "grad_norm": 0.8844972848892212, "learning_rate": 6.529209621993127e-05, "loss": 0.0778, "step": 380 }, { "epoch": 0.6701030927835051, "grad_norm": 0.7589945793151855, "learning_rate": 6.701030927835051e-05, "loss": 0.0924, "step": 390 }, { "epoch": 0.6872852233676976, "grad_norm": 1.062225580215454, "learning_rate": 6.872852233676977e-05, "loss": 0.0637, "step": 400 }, { "epoch": 0.7044673539518901, "grad_norm": 1.1478430032730103, "learning_rate": 7.044673539518901e-05, "loss": 0.0878, "step": 410 }, { "epoch": 0.7216494845360825, "grad_norm": 1.2245433330535889, "learning_rate": 7.216494845360825e-05, "loss": 0.0856, "step": 420 }, { "epoch": 0.738831615120275, "grad_norm": 0.8035943508148193, "learning_rate": 7.38831615120275e-05, "loss": 0.0789, "step": 430 }, { "epoch": 0.7560137457044673, "grad_norm": 1.5007230043411255, "learning_rate": 7.560137457044673e-05, "loss": 0.099, "step": 440 }, { "epoch": 0.7731958762886598, "grad_norm": 0.8082581162452698, "learning_rate": 7.731958762886599e-05, "loss": 0.0818, "step": 450 }, { "epoch": 0.7903780068728522, "grad_norm": 1.0343904495239258, "learning_rate": 7.903780068728523e-05, "loss": 0.0622, "step": 460 }, { "epoch": 0.8075601374570447, "grad_norm": 0.7941983342170715, "learning_rate": 8.075601374570447e-05, "loss": 0.077, "step": 470 }, { "epoch": 0.8247422680412371, "grad_norm": 0.7006020545959473, "learning_rate": 8.247422680412371e-05, "loss": 0.056, "step": 480 }, { "epoch": 0.8419243986254296, "grad_norm": 0.5468656420707703, "learning_rate": 8.419243986254296e-05, "loss": 0.0611, "step": 490 }, { "epoch": 0.8591065292096219, "grad_norm": 0.581874668598175, "learning_rate": 8.591065292096219e-05, "loss": 0.0544, "step": 500 }, { "epoch": 0.8762886597938144, "grad_norm": 0.7868462800979614, "learning_rate": 8.762886597938145e-05, "loss": 0.0639, "step": 510 }, { "epoch": 0.8934707903780069, "grad_norm": 0.9123062491416931, "learning_rate": 8.93470790378007e-05, "loss": 0.0529, "step": 520 }, { "epoch": 0.9106529209621993, "grad_norm": 0.9630204439163208, "learning_rate": 9.106529209621993e-05, "loss": 0.0599, "step": 530 }, { "epoch": 0.9278350515463918, "grad_norm": 1.0028278827667236, "learning_rate": 9.278350515463918e-05, "loss": 0.0746, "step": 540 }, { "epoch": 0.9450171821305842, "grad_norm": 0.8045145869255066, "learning_rate": 9.450171821305843e-05, "loss": 0.0604, "step": 550 }, { "epoch": 0.9621993127147767, "grad_norm": 0.5860382914543152, "learning_rate": 9.621993127147767e-05, "loss": 0.0635, "step": 560 }, { "epoch": 0.979381443298969, "grad_norm": 0.9446794986724854, "learning_rate": 9.793814432989691e-05, "loss": 0.0711, "step": 570 }, { "epoch": 0.9965635738831615, "grad_norm": 0.9152433276176453, "learning_rate": 9.965635738831616e-05, "loss": 0.0651, "step": 580 }, { "epoch": 1.013745704467354, "grad_norm": 0.7524177432060242, "learning_rate": 9.999987081161148e-05, "loss": 0.0593, "step": 590 }, { "epoch": 1.0309278350515463, "grad_norm": 1.0932648181915283, "learning_rate": 9.999934598492723e-05, "loss": 0.0585, "step": 600 }, { "epoch": 1.0481099656357389, "grad_norm": 0.5448580384254456, "learning_rate": 9.999841744990731e-05, "loss": 0.0705, "step": 610 }, { "epoch": 1.0652920962199313, "grad_norm": 0.8481371402740479, "learning_rate": 9.999708521404896e-05, "loss": 0.0763, "step": 620 }, { "epoch": 1.0824742268041236, "grad_norm": 0.8610166311264038, "learning_rate": 9.999534928810904e-05, "loss": 0.0598, "step": 630 }, { "epoch": 1.0996563573883162, "grad_norm": 0.807761549949646, "learning_rate": 9.999320968610386e-05, "loss": 0.0567, "step": 640 }, { "epoch": 1.1168384879725086, "grad_norm": 0.4783917963504791, "learning_rate": 9.999066642530917e-05, "loss": 0.056, "step": 650 }, { "epoch": 1.134020618556701, "grad_norm": 0.6751272678375244, "learning_rate": 9.998771952625992e-05, "loss": 0.0498, "step": 660 }, { "epoch": 1.1512027491408934, "grad_norm": 0.8272377848625183, "learning_rate": 9.998436901275022e-05, "loss": 0.0449, "step": 670 }, { "epoch": 1.168384879725086, "grad_norm": 0.8059535026550293, "learning_rate": 9.998061491183297e-05, "loss": 0.0624, "step": 680 }, { "epoch": 1.1855670103092784, "grad_norm": 0.7479894757270813, "learning_rate": 9.997645725381986e-05, "loss": 0.0471, "step": 690 }, { "epoch": 1.2027491408934707, "grad_norm": 0.6483791470527649, "learning_rate": 9.997189607228092e-05, "loss": 0.0497, "step": 700 }, { "epoch": 1.2199312714776633, "grad_norm": 0.8845646381378174, "learning_rate": 9.99669314040444e-05, "loss": 0.0617, "step": 710 }, { "epoch": 1.2371134020618557, "grad_norm": 0.8434107303619385, "learning_rate": 9.996156328919635e-05, "loss": 0.0447, "step": 720 }, { "epoch": 1.254295532646048, "grad_norm": 0.6829891800880432, "learning_rate": 9.995579177108041e-05, "loss": 0.059, "step": 730 }, { "epoch": 1.2714776632302405, "grad_norm": 0.5923603773117065, "learning_rate": 9.994961689629738e-05, "loss": 0.0483, "step": 740 }, { "epoch": 1.2886597938144329, "grad_norm": 0.48384591937065125, "learning_rate": 9.994303871470489e-05, "loss": 0.0565, "step": 750 }, { "epoch": 1.3058419243986255, "grad_norm": 0.7825417518615723, "learning_rate": 9.993605727941697e-05, "loss": 0.0545, "step": 760 }, { "epoch": 1.3230240549828178, "grad_norm": 0.9657111167907715, "learning_rate": 9.992867264680361e-05, "loss": 0.0532, "step": 770 }, { "epoch": 1.3402061855670104, "grad_norm": 1.0996328592300415, "learning_rate": 9.992088487649038e-05, "loss": 0.0637, "step": 780 }, { "epoch": 1.3573883161512028, "grad_norm": 0.8697621822357178, "learning_rate": 9.991269403135783e-05, "loss": 0.0445, "step": 790 }, { "epoch": 1.3745704467353952, "grad_norm": 0.4780273735523224, "learning_rate": 9.990410017754108e-05, "loss": 0.0509, "step": 800 }, { "epoch": 1.3917525773195876, "grad_norm": 0.386453777551651, "learning_rate": 9.989510338442925e-05, "loss": 0.0465, "step": 810 }, { "epoch": 1.40893470790378, "grad_norm": 0.7011645436286926, "learning_rate": 9.98857037246649e-05, "loss": 0.0659, "step": 820 }, { "epoch": 1.4261168384879725, "grad_norm": 0.47305113077163696, "learning_rate": 9.987590127414344e-05, "loss": 0.0391, "step": 830 }, { "epoch": 1.443298969072165, "grad_norm": 0.6128239035606384, "learning_rate": 9.986569611201251e-05, "loss": 0.0433, "step": 840 }, { "epoch": 1.4604810996563573, "grad_norm": 0.6045581698417664, "learning_rate": 9.985508832067139e-05, "loss": 0.0485, "step": 850 }, { "epoch": 1.47766323024055, "grad_norm": 0.6033497452735901, "learning_rate": 9.984407798577027e-05, "loss": 0.049, "step": 860 }, { "epoch": 1.4948453608247423, "grad_norm": 0.47953736782073975, "learning_rate": 9.98326651962096e-05, "loss": 0.0539, "step": 870 }, { "epoch": 1.5120274914089347, "grad_norm": 0.8113358020782471, "learning_rate": 9.982085004413933e-05, "loss": 0.0481, "step": 880 }, { "epoch": 1.529209621993127, "grad_norm": 0.5726741552352905, "learning_rate": 9.980863262495821e-05, "loss": 0.0512, "step": 890 }, { "epoch": 1.5463917525773194, "grad_norm": 0.6560239195823669, "learning_rate": 9.979601303731306e-05, "loss": 0.0464, "step": 900 }, { "epoch": 1.563573883161512, "grad_norm": 0.5235106348991394, "learning_rate": 9.978299138309781e-05, "loss": 0.0486, "step": 910 }, { "epoch": 1.5807560137457046, "grad_norm": 0.6439309120178223, "learning_rate": 9.976956776745287e-05, "loss": 0.0536, "step": 920 }, { "epoch": 1.597938144329897, "grad_norm": 0.8001301884651184, "learning_rate": 9.975574229876417e-05, "loss": 0.0641, "step": 930 }, { "epoch": 1.6151202749140894, "grad_norm": 0.6167306900024414, "learning_rate": 9.974151508866231e-05, "loss": 0.0372, "step": 940 }, { "epoch": 1.6323024054982818, "grad_norm": 0.5872222781181335, "learning_rate": 9.972688625202164e-05, "loss": 0.0452, "step": 950 }, { "epoch": 1.6494845360824741, "grad_norm": 0.4873111844062805, "learning_rate": 9.97118559069594e-05, "loss": 0.0524, "step": 960 }, { "epoch": 1.6666666666666665, "grad_norm": 0.4051572382450104, "learning_rate": 9.969642417483466e-05, "loss": 0.0407, "step": 970 }, { "epoch": 1.6838487972508591, "grad_norm": 0.43837177753448486, "learning_rate": 9.968059118024744e-05, "loss": 0.0471, "step": 980 }, { "epoch": 1.7010309278350515, "grad_norm": 0.5712767243385315, "learning_rate": 9.966435705103765e-05, "loss": 0.0398, "step": 990 }, { "epoch": 1.718213058419244, "grad_norm": 0.6849848628044128, "learning_rate": 9.964772191828407e-05, "loss": 0.0428, "step": 1000 }, { "epoch": 1.7353951890034365, "grad_norm": 0.3726734220981598, "learning_rate": 9.96306859163033e-05, "loss": 0.0584, "step": 1010 }, { "epoch": 1.7525773195876289, "grad_norm": 0.3805288076400757, "learning_rate": 9.961324918264865e-05, "loss": 0.0414, "step": 1020 }, { "epoch": 1.7697594501718212, "grad_norm": 0.45931509137153625, "learning_rate": 9.959541185810906e-05, "loss": 0.0438, "step": 1030 }, { "epoch": 1.7869415807560136, "grad_norm": 0.4892406761646271, "learning_rate": 9.957717408670793e-05, "loss": 0.0402, "step": 1040 }, { "epoch": 1.8041237113402062, "grad_norm": 0.6873617768287659, "learning_rate": 9.955853601570202e-05, "loss": 0.049, "step": 1050 }, { "epoch": 1.8213058419243986, "grad_norm": 0.8491326570510864, "learning_rate": 9.953949779558017e-05, "loss": 0.0532, "step": 1060 }, { "epoch": 1.8384879725085912, "grad_norm": 0.45153722167015076, "learning_rate": 9.952005958006217e-05, "loss": 0.0403, "step": 1070 }, { "epoch": 1.8556701030927836, "grad_norm": 0.608630359172821, "learning_rate": 9.950022152609745e-05, "loss": 0.0479, "step": 1080 }, { "epoch": 1.872852233676976, "grad_norm": 0.5155346989631653, "learning_rate": 9.947998379386388e-05, "loss": 0.045, "step": 1090 }, { "epoch": 1.8900343642611683, "grad_norm": 0.2339087724685669, "learning_rate": 9.945934654676639e-05, "loss": 0.0361, "step": 1100 }, { "epoch": 1.9072164948453607, "grad_norm": 0.4478403329849243, "learning_rate": 9.943830995143577e-05, "loss": 0.0355, "step": 1110 }, { "epoch": 1.9243986254295533, "grad_norm": 0.3183349072933197, "learning_rate": 9.941687417772718e-05, "loss": 0.0349, "step": 1120 }, { "epoch": 1.9415807560137457, "grad_norm": 0.3854424059391022, "learning_rate": 9.939503939871893e-05, "loss": 0.0329, "step": 1130 }, { "epoch": 1.9587628865979383, "grad_norm": 0.7198600172996521, "learning_rate": 9.937280579071095e-05, "loss": 0.0372, "step": 1140 }, { "epoch": 1.9759450171821307, "grad_norm": 0.5371730923652649, "learning_rate": 9.935017353322347e-05, "loss": 0.0388, "step": 1150 }, { "epoch": 1.993127147766323, "grad_norm": 0.6025398373603821, "learning_rate": 9.932714280899547e-05, "loss": 0.0334, "step": 1160 }, { "epoch": 2.0103092783505154, "grad_norm": 0.4118864834308624, "learning_rate": 9.930371380398331e-05, "loss": 0.0429, "step": 1170 }, { "epoch": 2.027491408934708, "grad_norm": 0.5828579664230347, "learning_rate": 9.927988670735915e-05, "loss": 0.0377, "step": 1180 }, { "epoch": 2.0446735395189, "grad_norm": 0.45602017641067505, "learning_rate": 9.925566171150945e-05, "loss": 0.0364, "step": 1190 }, { "epoch": 2.0618556701030926, "grad_norm": 0.236759752035141, "learning_rate": 9.923103901203343e-05, "loss": 0.0456, "step": 1200 }, { "epoch": 2.0790378006872854, "grad_norm": 0.5670115947723389, "learning_rate": 9.920601880774148e-05, "loss": 0.0514, "step": 1210 }, { "epoch": 2.0962199312714778, "grad_norm": 0.5565935373306274, "learning_rate": 9.918060130065354e-05, "loss": 0.0413, "step": 1220 }, { "epoch": 2.11340206185567, "grad_norm": 0.28620976209640503, "learning_rate": 9.915478669599747e-05, "loss": 0.0345, "step": 1230 }, { "epoch": 2.1305841924398625, "grad_norm": 0.598374605178833, "learning_rate": 9.912857520220743e-05, "loss": 0.0409, "step": 1240 }, { "epoch": 2.147766323024055, "grad_norm": 0.4782467186450958, "learning_rate": 9.910196703092216e-05, "loss": 0.0341, "step": 1250 }, { "epoch": 2.1649484536082473, "grad_norm": 0.3740648925304413, "learning_rate": 9.907496239698327e-05, "loss": 0.0334, "step": 1260 }, { "epoch": 2.1821305841924397, "grad_norm": 0.514352560043335, "learning_rate": 9.904756151843353e-05, "loss": 0.033, "step": 1270 }, { "epoch": 2.1993127147766325, "grad_norm": 0.6190779209136963, "learning_rate": 9.90197646165151e-05, "loss": 0.0382, "step": 1280 }, { "epoch": 2.216494845360825, "grad_norm": 0.45846354961395264, "learning_rate": 9.899157191566775e-05, "loss": 0.0352, "step": 1290 }, { "epoch": 2.2336769759450172, "grad_norm": 0.3824189603328705, "learning_rate": 9.8962983643527e-05, "loss": 0.0401, "step": 1300 }, { "epoch": 2.2508591065292096, "grad_norm": 0.3505632281303406, "learning_rate": 9.893400003092237e-05, "loss": 0.0335, "step": 1310 }, { "epoch": 2.268041237113402, "grad_norm": 0.55964595079422, "learning_rate": 9.890462131187543e-05, "loss": 0.0349, "step": 1320 }, { "epoch": 2.2852233676975944, "grad_norm": 0.41852259635925293, "learning_rate": 9.887484772359795e-05, "loss": 0.0489, "step": 1330 }, { "epoch": 2.3024054982817868, "grad_norm": 0.7132606506347656, "learning_rate": 9.884467950648998e-05, "loss": 0.0531, "step": 1340 }, { "epoch": 2.319587628865979, "grad_norm": 0.5213425755500793, "learning_rate": 9.881411690413796e-05, "loss": 0.034, "step": 1350 }, { "epoch": 2.336769759450172, "grad_norm": 0.6458540558815002, "learning_rate": 9.878316016331262e-05, "loss": 0.0473, "step": 1360 }, { "epoch": 2.3539518900343643, "grad_norm": 0.41432708501815796, "learning_rate": 9.875180953396714e-05, "loss": 0.0321, "step": 1370 }, { "epoch": 2.3711340206185567, "grad_norm": 0.3965621888637543, "learning_rate": 9.872006526923503e-05, "loss": 0.0351, "step": 1380 }, { "epoch": 2.388316151202749, "grad_norm": 0.2506723999977112, "learning_rate": 9.868792762542814e-05, "loss": 0.0402, "step": 1390 }, { "epoch": 2.4054982817869415, "grad_norm": 0.2948648929595947, "learning_rate": 9.865539686203455e-05, "loss": 0.0335, "step": 1400 }, { "epoch": 2.422680412371134, "grad_norm": 0.5881168842315674, "learning_rate": 9.862247324171652e-05, "loss": 0.0473, "step": 1410 }, { "epoch": 2.4398625429553267, "grad_norm": 0.5597307085990906, "learning_rate": 9.858915703030829e-05, "loss": 0.0387, "step": 1420 }, { "epoch": 2.457044673539519, "grad_norm": 0.3447171449661255, "learning_rate": 9.855544849681404e-05, "loss": 0.0395, "step": 1430 }, { "epoch": 2.4742268041237114, "grad_norm": 0.675528347492218, "learning_rate": 9.852134791340567e-05, "loss": 0.0303, "step": 1440 }, { "epoch": 2.491408934707904, "grad_norm": 0.4080379903316498, "learning_rate": 9.848685555542055e-05, "loss": 0.0414, "step": 1450 }, { "epoch": 2.508591065292096, "grad_norm": 0.34045320749282837, "learning_rate": 9.845197170135939e-05, "loss": 0.0291, "step": 1460 }, { "epoch": 2.5257731958762886, "grad_norm": 0.34041810035705566, "learning_rate": 9.841669663288391e-05, "loss": 0.0287, "step": 1470 }, { "epoch": 2.542955326460481, "grad_norm": 0.35550206899642944, "learning_rate": 9.838103063481464e-05, "loss": 0.035, "step": 1480 }, { "epoch": 2.5601374570446733, "grad_norm": 0.5085458755493164, "learning_rate": 9.834497399512855e-05, "loss": 0.0286, "step": 1490 }, { "epoch": 2.5773195876288657, "grad_norm": 0.3794465959072113, "learning_rate": 9.830852700495676e-05, "loss": 0.0383, "step": 1500 }, { "epoch": 2.5945017182130585, "grad_norm": 0.20820270478725433, "learning_rate": 9.82716899585822e-05, "loss": 0.0229, "step": 1510 }, { "epoch": 2.611683848797251, "grad_norm": 0.31715983152389526, "learning_rate": 9.823446315343723e-05, "loss": 0.0267, "step": 1520 }, { "epoch": 2.6288659793814433, "grad_norm": 0.518182635307312, "learning_rate": 9.819684689010119e-05, "loss": 0.0328, "step": 1530 }, { "epoch": 2.6460481099656357, "grad_norm": 0.3830466568470001, "learning_rate": 9.815884147229804e-05, "loss": 0.0289, "step": 1540 }, { "epoch": 2.663230240549828, "grad_norm": 0.4509371817111969, "learning_rate": 9.812044720689387e-05, "loss": 0.0369, "step": 1550 }, { "epoch": 2.680412371134021, "grad_norm": 0.5616033673286438, "learning_rate": 9.808166440389446e-05, "loss": 0.0264, "step": 1560 }, { "epoch": 2.6975945017182132, "grad_norm": 0.5223531723022461, "learning_rate": 9.80424933764427e-05, "loss": 0.0265, "step": 1570 }, { "epoch": 2.7147766323024056, "grad_norm": 0.5588796734809875, "learning_rate": 9.800293444081612e-05, "loss": 0.0298, "step": 1580 }, { "epoch": 2.731958762886598, "grad_norm": 0.5224287509918213, "learning_rate": 9.796298791642435e-05, "loss": 0.0334, "step": 1590 }, { "epoch": 2.7491408934707904, "grad_norm": 0.510735809803009, "learning_rate": 9.792265412580654e-05, "loss": 0.0344, "step": 1600 }, { "epoch": 2.7663230240549828, "grad_norm": 0.46988189220428467, "learning_rate": 9.788193339462866e-05, "loss": 0.034, "step": 1610 }, { "epoch": 2.783505154639175, "grad_norm": 0.43194422125816345, "learning_rate": 9.7840826051681e-05, "loss": 0.033, "step": 1620 }, { "epoch": 2.8006872852233675, "grad_norm": 0.5727249383926392, "learning_rate": 9.779933242887542e-05, "loss": 0.0321, "step": 1630 }, { "epoch": 2.81786941580756, "grad_norm": 0.3941832482814789, "learning_rate": 9.775745286124277e-05, "loss": 0.0286, "step": 1640 }, { "epoch": 2.8350515463917527, "grad_norm": 0.5706576704978943, "learning_rate": 9.771518768693004e-05, "loss": 0.0271, "step": 1650 }, { "epoch": 2.852233676975945, "grad_norm": 0.5128160715103149, "learning_rate": 9.76725372471978e-05, "loss": 0.0434, "step": 1660 }, { "epoch": 2.8694158075601375, "grad_norm": 0.34409016370773315, "learning_rate": 9.762950188641728e-05, "loss": 0.0314, "step": 1670 }, { "epoch": 2.88659793814433, "grad_norm": 0.532747209072113, "learning_rate": 9.758608195206771e-05, "loss": 0.0369, "step": 1680 }, { "epoch": 2.9037800687285222, "grad_norm": 0.5421701073646545, "learning_rate": 9.754227779473349e-05, "loss": 0.0404, "step": 1690 }, { "epoch": 2.9209621993127146, "grad_norm": 0.36500459909439087, "learning_rate": 9.749808976810128e-05, "loss": 0.0332, "step": 1700 }, { "epoch": 2.9381443298969074, "grad_norm": 0.5636774897575378, "learning_rate": 9.745351822895727e-05, "loss": 0.0309, "step": 1710 }, { "epoch": 2.9553264604811, "grad_norm": 0.408263236284256, "learning_rate": 9.740856353718419e-05, "loss": 0.033, "step": 1720 }, { "epoch": 2.972508591065292, "grad_norm": 0.4448431432247162, "learning_rate": 9.736322605575845e-05, "loss": 0.0248, "step": 1730 }, { "epoch": 2.9896907216494846, "grad_norm": 0.3676033020019531, "learning_rate": 9.731750615074724e-05, "loss": 0.036, "step": 1740 }, { "epoch": 3.006872852233677, "grad_norm": 0.3884856104850769, "learning_rate": 9.727140419130553e-05, "loss": 0.0256, "step": 1750 }, { "epoch": 3.0240549828178693, "grad_norm": 0.4114404320716858, "learning_rate": 9.72249205496731e-05, "loss": 0.0273, "step": 1760 }, { "epoch": 3.0412371134020617, "grad_norm": 0.5628842711448669, "learning_rate": 9.717805560117149e-05, "loss": 0.0254, "step": 1770 }, { "epoch": 3.058419243986254, "grad_norm": 0.34935763478279114, "learning_rate": 9.71308097242011e-05, "loss": 0.0246, "step": 1780 }, { "epoch": 3.075601374570447, "grad_norm": 0.8378509283065796, "learning_rate": 9.708318330023798e-05, "loss": 0.0358, "step": 1790 }, { "epoch": 3.0927835051546393, "grad_norm": 0.4501832127571106, "learning_rate": 9.703517671383086e-05, "loss": 0.0314, "step": 1800 }, { "epoch": 3.1099656357388317, "grad_norm": 0.5251947641372681, "learning_rate": 9.698679035259801e-05, "loss": 0.0291, "step": 1810 }, { "epoch": 3.127147766323024, "grad_norm": 0.36063244938850403, "learning_rate": 9.693802460722405e-05, "loss": 0.0244, "step": 1820 }, { "epoch": 3.1443298969072164, "grad_norm": 0.2640397548675537, "learning_rate": 9.688887987145691e-05, "loss": 0.0291, "step": 1830 }, { "epoch": 3.161512027491409, "grad_norm": 0.35009852051734924, "learning_rate": 9.683935654210457e-05, "loss": 0.0355, "step": 1840 }, { "epoch": 3.178694158075601, "grad_norm": 0.455991268157959, "learning_rate": 9.678945501903188e-05, "loss": 0.0244, "step": 1850 }, { "epoch": 3.195876288659794, "grad_norm": 0.2577104866504669, "learning_rate": 9.673917570515732e-05, "loss": 0.0277, "step": 1860 }, { "epoch": 3.2130584192439864, "grad_norm": 0.46351000666618347, "learning_rate": 9.668851900644975e-05, "loss": 0.0249, "step": 1870 }, { "epoch": 3.2302405498281788, "grad_norm": 0.4203677773475647, "learning_rate": 9.663748533192516e-05, "loss": 0.0251, "step": 1880 }, { "epoch": 3.247422680412371, "grad_norm": 0.24778026342391968, "learning_rate": 9.658607509364337e-05, "loss": 0.0286, "step": 1890 }, { "epoch": 3.2646048109965635, "grad_norm": 0.5941663980484009, "learning_rate": 9.653428870670459e-05, "loss": 0.0375, "step": 1900 }, { "epoch": 3.281786941580756, "grad_norm": 0.6710448265075684, "learning_rate": 9.648212658924625e-05, "loss": 0.0268, "step": 1910 }, { "epoch": 3.2989690721649483, "grad_norm": 0.40934911370277405, "learning_rate": 9.642958916243946e-05, "loss": 0.0187, "step": 1920 }, { "epoch": 3.3161512027491407, "grad_norm": 0.3697362542152405, "learning_rate": 9.637667685048575e-05, "loss": 0.0286, "step": 1930 }, { "epoch": 3.3333333333333335, "grad_norm": 0.40777045488357544, "learning_rate": 9.63233900806135e-05, "loss": 0.0232, "step": 1940 }, { "epoch": 3.350515463917526, "grad_norm": 0.2753160893917084, "learning_rate": 9.62697292830746e-05, "loss": 0.0305, "step": 1950 }, { "epoch": 3.3676975945017182, "grad_norm": 0.5245633721351624, "learning_rate": 9.6215694891141e-05, "loss": 0.0268, "step": 1960 }, { "epoch": 3.3848797250859106, "grad_norm": 0.4454520344734192, "learning_rate": 9.616128734110103e-05, "loss": 0.0334, "step": 1970 }, { "epoch": 3.402061855670103, "grad_norm": 0.40832188725471497, "learning_rate": 9.61065070722561e-05, "loss": 0.0375, "step": 1980 }, { "epoch": 3.4192439862542954, "grad_norm": 0.4421581029891968, "learning_rate": 9.6051354526917e-05, "loss": 0.0291, "step": 1990 }, { "epoch": 3.436426116838488, "grad_norm": 0.3832218050956726, "learning_rate": 9.59958301504004e-05, "loss": 0.0348, "step": 2000 }, { "epoch": 3.4536082474226806, "grad_norm": 0.2825784683227539, "learning_rate": 9.593993439102526e-05, "loss": 0.0285, "step": 2010 }, { "epoch": 3.470790378006873, "grad_norm": 0.4989912211894989, "learning_rate": 9.588366770010914e-05, "loss": 0.0298, "step": 2020 }, { "epoch": 3.4879725085910653, "grad_norm": 0.38946759700775146, "learning_rate": 9.582703053196464e-05, "loss": 0.0294, "step": 2030 }, { "epoch": 3.5051546391752577, "grad_norm": 0.3553588092327118, "learning_rate": 9.577002334389569e-05, "loss": 0.0281, "step": 2040 }, { "epoch": 3.52233676975945, "grad_norm": 0.48752427101135254, "learning_rate": 9.571264659619382e-05, "loss": 0.026, "step": 2050 }, { "epoch": 3.5395189003436425, "grad_norm": 0.3820585310459137, "learning_rate": 9.565490075213452e-05, "loss": 0.0259, "step": 2060 }, { "epoch": 3.556701030927835, "grad_norm": 0.35598281025886536, "learning_rate": 9.55967862779735e-05, "loss": 0.0343, "step": 2070 }, { "epoch": 3.5738831615120272, "grad_norm": 0.4193035364151001, "learning_rate": 9.55383036429428e-05, "loss": 0.0296, "step": 2080 }, { "epoch": 3.59106529209622, "grad_norm": 0.4993601441383362, "learning_rate": 9.547945331924717e-05, "loss": 0.0236, "step": 2090 }, { "epoch": 3.6082474226804124, "grad_norm": 0.3591003119945526, "learning_rate": 9.542023578206015e-05, "loss": 0.0301, "step": 2100 }, { "epoch": 3.625429553264605, "grad_norm": 0.30369478464126587, "learning_rate": 9.536065150952025e-05, "loss": 0.0327, "step": 2110 }, { "epoch": 3.642611683848797, "grad_norm": 0.37964117527008057, "learning_rate": 9.530070098272712e-05, "loss": 0.0351, "step": 2120 }, { "epoch": 3.6597938144329896, "grad_norm": 0.4031108617782593, "learning_rate": 9.524038468573764e-05, "loss": 0.0334, "step": 2130 }, { "epoch": 3.6769759450171824, "grad_norm": 0.24876996874809265, "learning_rate": 9.517970310556202e-05, "loss": 0.0238, "step": 2140 }, { "epoch": 3.6941580756013748, "grad_norm": 0.5632336139678955, "learning_rate": 9.511865673215986e-05, "loss": 0.0245, "step": 2150 }, { "epoch": 3.711340206185567, "grad_norm": 0.4374890625476837, "learning_rate": 9.50572460584362e-05, "loss": 0.0364, "step": 2160 }, { "epoch": 3.7285223367697595, "grad_norm": 0.4703497588634491, "learning_rate": 9.499547158023755e-05, "loss": 0.0248, "step": 2170 }, { "epoch": 3.745704467353952, "grad_norm": 0.3067072927951813, "learning_rate": 9.493333379634786e-05, "loss": 0.0203, "step": 2180 }, { "epoch": 3.7628865979381443, "grad_norm": 0.5396534204483032, "learning_rate": 9.487083320848454e-05, "loss": 0.0296, "step": 2190 }, { "epoch": 3.7800687285223367, "grad_norm": 0.2977238595485687, "learning_rate": 9.480797032129432e-05, "loss": 0.0243, "step": 2200 }, { "epoch": 3.797250859106529, "grad_norm": 0.35456737875938416, "learning_rate": 9.474474564234931e-05, "loss": 0.0331, "step": 2210 }, { "epoch": 3.8144329896907214, "grad_norm": 0.4030454456806183, "learning_rate": 9.468115968214276e-05, "loss": 0.0271, "step": 2220 }, { "epoch": 3.8316151202749142, "grad_norm": 0.4009501039981842, "learning_rate": 9.461721295408505e-05, "loss": 0.025, "step": 2230 }, { "epoch": 3.8487972508591066, "grad_norm": 0.34113046526908875, "learning_rate": 9.455290597449945e-05, "loss": 0.0298, "step": 2240 }, { "epoch": 3.865979381443299, "grad_norm": 0.4473305344581604, "learning_rate": 9.448823926261805e-05, "loss": 0.0293, "step": 2250 }, { "epoch": 3.8831615120274914, "grad_norm": 0.4152556359767914, "learning_rate": 9.442321334057748e-05, "loss": 0.0365, "step": 2260 }, { "epoch": 3.9003436426116838, "grad_norm": 0.5801966190338135, "learning_rate": 9.435782873341474e-05, "loss": 0.0283, "step": 2270 }, { "epoch": 3.917525773195876, "grad_norm": 0.5143575668334961, "learning_rate": 9.429208596906296e-05, "loss": 0.0276, "step": 2280 }, { "epoch": 3.934707903780069, "grad_norm": 0.28408244252204895, "learning_rate": 9.422598557834712e-05, "loss": 0.0266, "step": 2290 }, { "epoch": 3.9518900343642613, "grad_norm": 0.30861398577690125, "learning_rate": 9.415952809497979e-05, "loss": 0.0307, "step": 2300 }, { "epoch": 3.9690721649484537, "grad_norm": 0.5015305280685425, "learning_rate": 9.409271405555677e-05, "loss": 0.0238, "step": 2310 }, { "epoch": 3.986254295532646, "grad_norm": 0.42114853858947754, "learning_rate": 9.402554399955281e-05, "loss": 0.0297, "step": 2320 }, { "epoch": 4.0034364261168385, "grad_norm": 0.38618704676628113, "learning_rate": 9.395801846931726e-05, "loss": 0.0274, "step": 2330 }, { "epoch": 4.020618556701031, "grad_norm": 0.44997620582580566, "learning_rate": 9.389013801006961e-05, "loss": 0.0294, "step": 2340 }, { "epoch": 4.037800687285223, "grad_norm": 0.4600159227848053, "learning_rate": 9.382190316989518e-05, "loss": 0.0286, "step": 2350 }, { "epoch": 4.054982817869416, "grad_norm": 0.35218673944473267, "learning_rate": 9.375331449974066e-05, "loss": 0.0248, "step": 2360 }, { "epoch": 4.072164948453608, "grad_norm": 0.26790571212768555, "learning_rate": 9.368437255340965e-05, "loss": 0.0287, "step": 2370 }, { "epoch": 4.0893470790378, "grad_norm": 0.356351375579834, "learning_rate": 9.361507788755818e-05, "loss": 0.0207, "step": 2380 }, { "epoch": 4.106529209621993, "grad_norm": 0.3762167692184448, "learning_rate": 9.354543106169029e-05, "loss": 0.0303, "step": 2390 }, { "epoch": 4.123711340206185, "grad_norm": 0.21644559502601624, "learning_rate": 9.347543263815339e-05, "loss": 0.0262, "step": 2400 }, { "epoch": 4.140893470790378, "grad_norm": 0.3905271887779236, "learning_rate": 9.340508318213383e-05, "loss": 0.0267, "step": 2410 }, { "epoch": 4.158075601374571, "grad_norm": 0.25276127457618713, "learning_rate": 9.333438326165227e-05, "loss": 0.0292, "step": 2420 }, { "epoch": 4.175257731958763, "grad_norm": 0.21106575429439545, "learning_rate": 9.326333344755912e-05, "loss": 0.0218, "step": 2430 }, { "epoch": 4.1924398625429555, "grad_norm": 0.37403470277786255, "learning_rate": 9.319193431352993e-05, "loss": 0.0261, "step": 2440 }, { "epoch": 4.209621993127148, "grad_norm": 0.23083224892616272, "learning_rate": 9.312018643606074e-05, "loss": 0.0268, "step": 2450 }, { "epoch": 4.22680412371134, "grad_norm": 0.29775136709213257, "learning_rate": 9.304809039446347e-05, "loss": 0.0286, "step": 2460 }, { "epoch": 4.243986254295533, "grad_norm": 0.39073804020881653, "learning_rate": 9.297564677086118e-05, "loss": 0.0231, "step": 2470 }, { "epoch": 4.261168384879725, "grad_norm": 0.3536919951438904, "learning_rate": 9.290285615018342e-05, "loss": 0.0269, "step": 2480 }, { "epoch": 4.278350515463917, "grad_norm": 0.37961915135383606, "learning_rate": 9.282971912016149e-05, "loss": 0.0312, "step": 2490 }, { "epoch": 4.29553264604811, "grad_norm": 0.444950670003891, "learning_rate": 9.275623627132368e-05, "loss": 0.0275, "step": 2500 }, { "epoch": 4.312714776632302, "grad_norm": 0.3781861364841461, "learning_rate": 9.268240819699054e-05, "loss": 0.0285, "step": 2510 }, { "epoch": 4.329896907216495, "grad_norm": 0.2931497395038605, "learning_rate": 9.260823549327002e-05, "loss": 0.0258, "step": 2520 }, { "epoch": 4.347079037800687, "grad_norm": 0.26529255509376526, "learning_rate": 9.253371875905274e-05, "loss": 0.026, "step": 2530 }, { "epoch": 4.364261168384879, "grad_norm": 0.6221164464950562, "learning_rate": 9.245885859600712e-05, "loss": 0.0366, "step": 2540 }, { "epoch": 4.381443298969073, "grad_norm": 0.39402952790260315, "learning_rate": 9.238365560857447e-05, "loss": 0.0237, "step": 2550 }, { "epoch": 4.398625429553265, "grad_norm": 0.33800095319747925, "learning_rate": 9.230811040396423e-05, "loss": 0.0328, "step": 2560 }, { "epoch": 4.415807560137457, "grad_norm": 0.21751320362091064, "learning_rate": 9.223222359214891e-05, "loss": 0.0315, "step": 2570 }, { "epoch": 4.43298969072165, "grad_norm": 0.34266844391822815, "learning_rate": 9.215599578585936e-05, "loss": 0.0374, "step": 2580 }, { "epoch": 4.450171821305842, "grad_norm": 0.3306879997253418, "learning_rate": 9.207942760057958e-05, "loss": 0.023, "step": 2590 }, { "epoch": 4.4673539518900345, "grad_norm": 0.3142191767692566, "learning_rate": 9.200251965454199e-05, "loss": 0.0263, "step": 2600 }, { "epoch": 4.484536082474227, "grad_norm": 0.4040457010269165, "learning_rate": 9.192527256872226e-05, "loss": 0.0269, "step": 2610 }, { "epoch": 4.501718213058419, "grad_norm": 0.3506450951099396, "learning_rate": 9.184768696683443e-05, "loss": 0.0227, "step": 2620 }, { "epoch": 4.518900343642612, "grad_norm": 0.4573422074317932, "learning_rate": 9.176976347532575e-05, "loss": 0.0312, "step": 2630 }, { "epoch": 4.536082474226804, "grad_norm": 0.2337106168270111, "learning_rate": 9.169150272337172e-05, "loss": 0.028, "step": 2640 }, { "epoch": 4.553264604810996, "grad_norm": 0.28833648562431335, "learning_rate": 9.161290534287099e-05, "loss": 0.0245, "step": 2650 }, { "epoch": 4.570446735395189, "grad_norm": 0.24830326437950134, "learning_rate": 9.153397196844017e-05, "loss": 0.0218, "step": 2660 }, { "epoch": 4.587628865979381, "grad_norm": 0.55474853515625, "learning_rate": 9.145470323740885e-05, "loss": 0.0247, "step": 2670 }, { "epoch": 4.6048109965635735, "grad_norm": 0.49275097250938416, "learning_rate": 9.137509978981435e-05, "loss": 0.0276, "step": 2680 }, { "epoch": 4.621993127147766, "grad_norm": 0.30603882670402527, "learning_rate": 9.129516226839658e-05, "loss": 0.0208, "step": 2690 }, { "epoch": 4.639175257731958, "grad_norm": 0.32763534784317017, "learning_rate": 9.121489131859286e-05, "loss": 0.0267, "step": 2700 }, { "epoch": 4.6563573883161515, "grad_norm": 0.41010305285453796, "learning_rate": 9.113428758853268e-05, "loss": 0.0223, "step": 2710 }, { "epoch": 4.673539518900344, "grad_norm": 0.2709559500217438, "learning_rate": 9.105335172903253e-05, "loss": 0.0253, "step": 2720 }, { "epoch": 4.690721649484536, "grad_norm": 0.23412011563777924, "learning_rate": 9.097208439359057e-05, "loss": 0.0146, "step": 2730 }, { "epoch": 4.707903780068729, "grad_norm": 0.4020395576953888, "learning_rate": 9.08904862383814e-05, "loss": 0.0188, "step": 2740 }, { "epoch": 4.725085910652921, "grad_norm": 0.2301657497882843, "learning_rate": 9.080855792225076e-05, "loss": 0.0227, "step": 2750 }, { "epoch": 4.742268041237113, "grad_norm": 0.3554215133190155, "learning_rate": 9.072630010671015e-05, "loss": 0.0213, "step": 2760 }, { "epoch": 4.759450171821306, "grad_norm": 0.39176028966903687, "learning_rate": 9.064371345593161e-05, "loss": 0.0208, "step": 2770 }, { "epoch": 4.776632302405498, "grad_norm": 0.304556280374527, "learning_rate": 9.056079863674223e-05, "loss": 0.0211, "step": 2780 }, { "epoch": 4.793814432989691, "grad_norm": 0.3643367290496826, "learning_rate": 9.047755631861884e-05, "loss": 0.0237, "step": 2790 }, { "epoch": 4.810996563573883, "grad_norm": 0.379891961812973, "learning_rate": 9.039398717368259e-05, "loss": 0.025, "step": 2800 }, { "epoch": 4.828178694158075, "grad_norm": 0.3559863269329071, "learning_rate": 9.031009187669353e-05, "loss": 0.0204, "step": 2810 }, { "epoch": 4.845360824742268, "grad_norm": 0.43560439348220825, "learning_rate": 9.02258711050451e-05, "loss": 0.0254, "step": 2820 }, { "epoch": 4.862542955326461, "grad_norm": 0.28781425952911377, "learning_rate": 9.014132553875878e-05, "loss": 0.0319, "step": 2830 }, { "epoch": 4.879725085910653, "grad_norm": 0.2920217514038086, "learning_rate": 9.005645586047847e-05, "loss": 0.0259, "step": 2840 }, { "epoch": 4.896907216494846, "grad_norm": 0.28286901116371155, "learning_rate": 8.997126275546509e-05, "loss": 0.0213, "step": 2850 }, { "epoch": 4.914089347079038, "grad_norm": 0.3962228298187256, "learning_rate": 8.988574691159095e-05, "loss": 0.0257, "step": 2860 }, { "epoch": 4.9312714776632305, "grad_norm": 0.43689677119255066, "learning_rate": 8.979990901933428e-05, "loss": 0.0296, "step": 2870 }, { "epoch": 4.948453608247423, "grad_norm": 0.35005268454551697, "learning_rate": 8.971374977177356e-05, "loss": 0.0259, "step": 2880 }, { "epoch": 4.965635738831615, "grad_norm": 0.6228940486907959, "learning_rate": 8.962726986458207e-05, "loss": 0.0307, "step": 2890 }, { "epoch": 4.982817869415808, "grad_norm": 0.2236226499080658, "learning_rate": 8.954046999602211e-05, "loss": 0.0249, "step": 2900 }, { "epoch": 5.0, "grad_norm": 0.42723384499549866, "learning_rate": 8.945335086693942e-05, "loss": 0.0244, "step": 2910 }, { "epoch": 5.017182130584192, "grad_norm": 0.2944800853729248, "learning_rate": 8.936591318075764e-05, "loss": 0.0248, "step": 2920 }, { "epoch": 5.034364261168385, "grad_norm": 0.30557361245155334, "learning_rate": 8.927815764347242e-05, "loss": 0.0204, "step": 2930 }, { "epoch": 5.051546391752577, "grad_norm": 0.3732447922229767, "learning_rate": 8.919008496364587e-05, "loss": 0.0308, "step": 2940 }, { "epoch": 5.0687285223367695, "grad_norm": 0.30933091044425964, "learning_rate": 8.910169585240078e-05, "loss": 0.029, "step": 2950 }, { "epoch": 5.085910652920962, "grad_norm": 0.554576575756073, "learning_rate": 8.901299102341494e-05, "loss": 0.03, "step": 2960 }, { "epoch": 5.103092783505154, "grad_norm": 0.43166640400886536, "learning_rate": 8.892397119291526e-05, "loss": 0.0241, "step": 2970 }, { "epoch": 5.120274914089347, "grad_norm": 0.4924606680870056, "learning_rate": 8.883463707967211e-05, "loss": 0.0224, "step": 2980 }, { "epoch": 5.13745704467354, "grad_norm": 0.3466743230819702, "learning_rate": 8.874498940499346e-05, "loss": 0.0234, "step": 2990 }, { "epoch": 5.154639175257732, "grad_norm": 0.5283942222595215, "learning_rate": 8.865502889271901e-05, "loss": 0.0416, "step": 3000 }, { "epoch": 5.171821305841925, "grad_norm": 0.3286329209804535, "learning_rate": 8.85647562692145e-05, "loss": 0.0249, "step": 3010 }, { "epoch": 5.189003436426117, "grad_norm": 0.5245858430862427, "learning_rate": 8.847417226336561e-05, "loss": 0.0272, "step": 3020 }, { "epoch": 5.206185567010309, "grad_norm": 0.3810178339481354, "learning_rate": 8.83832776065723e-05, "loss": 0.0238, "step": 3030 }, { "epoch": 5.223367697594502, "grad_norm": 0.48333027958869934, "learning_rate": 8.829207303274279e-05, "loss": 0.0262, "step": 3040 }, { "epoch": 5.240549828178694, "grad_norm": 0.4119150638580322, "learning_rate": 8.820055927828762e-05, "loss": 0.0243, "step": 3050 }, { "epoch": 5.257731958762887, "grad_norm": 0.26957616209983826, "learning_rate": 8.810873708211383e-05, "loss": 0.0228, "step": 3060 }, { "epoch": 5.274914089347079, "grad_norm": 0.502048134803772, "learning_rate": 8.801660718561875e-05, "loss": 0.0215, "step": 3070 }, { "epoch": 5.292096219931271, "grad_norm": 0.3506264090538025, "learning_rate": 8.79241703326843e-05, "loss": 0.0335, "step": 3080 }, { "epoch": 5.309278350515464, "grad_norm": 0.42758750915527344, "learning_rate": 8.78314272696708e-05, "loss": 0.0294, "step": 3090 }, { "epoch": 5.326460481099656, "grad_norm": 0.25186318159103394, "learning_rate": 8.773837874541099e-05, "loss": 0.0319, "step": 3100 }, { "epoch": 5.3436426116838485, "grad_norm": 0.310088187456131, "learning_rate": 8.7645025511204e-05, "loss": 0.0263, "step": 3110 }, { "epoch": 5.360824742268041, "grad_norm": 0.3250679075717926, "learning_rate": 8.755136832080927e-05, "loss": 0.027, "step": 3120 }, { "epoch": 5.378006872852234, "grad_norm": 0.3429087698459625, "learning_rate": 8.745740793044046e-05, "loss": 0.024, "step": 3130 }, { "epoch": 5.3951890034364265, "grad_norm": 0.2694869637489319, "learning_rate": 8.736314509875934e-05, "loss": 0.0256, "step": 3140 }, { "epoch": 5.412371134020619, "grad_norm": 0.32267141342163086, "learning_rate": 8.726858058686968e-05, "loss": 0.0269, "step": 3150 }, { "epoch": 5.429553264604811, "grad_norm": 0.3753204643726349, "learning_rate": 8.717371515831112e-05, "loss": 0.0209, "step": 3160 }, { "epoch": 5.446735395189004, "grad_norm": 0.258056640625, "learning_rate": 8.707854957905294e-05, "loss": 0.0236, "step": 3170 }, { "epoch": 5.463917525773196, "grad_norm": 0.19171011447906494, "learning_rate": 8.698308461748799e-05, "loss": 0.0241, "step": 3180 }, { "epoch": 5.481099656357388, "grad_norm": 0.20204898715019226, "learning_rate": 8.688732104442632e-05, "loss": 0.0166, "step": 3190 }, { "epoch": 5.498281786941581, "grad_norm": 0.4373096227645874, "learning_rate": 8.679125963308909e-05, "loss": 0.0271, "step": 3200 }, { "epoch": 5.515463917525773, "grad_norm": 0.24470694363117218, "learning_rate": 8.669490115910234e-05, "loss": 0.0193, "step": 3210 }, { "epoch": 5.5326460481099655, "grad_norm": 0.3045484125614166, "learning_rate": 8.659824640049063e-05, "loss": 0.0203, "step": 3220 }, { "epoch": 5.549828178694158, "grad_norm": 0.18822188675403595, "learning_rate": 8.650129613767075e-05, "loss": 0.0179, "step": 3230 }, { "epoch": 5.56701030927835, "grad_norm": 0.2681497037410736, "learning_rate": 8.640405115344557e-05, "loss": 0.0179, "step": 3240 }, { "epoch": 5.584192439862543, "grad_norm": 0.2358640730381012, "learning_rate": 8.630651223299755e-05, "loss": 0.0231, "step": 3250 }, { "epoch": 5.601374570446735, "grad_norm": 0.33891239762306213, "learning_rate": 8.620868016388252e-05, "loss": 0.02, "step": 3260 }, { "epoch": 5.618556701030927, "grad_norm": 0.5138821005821228, "learning_rate": 8.611055573602323e-05, "loss": 0.0258, "step": 3270 }, { "epoch": 5.63573883161512, "grad_norm": 0.20864839851856232, "learning_rate": 8.601213974170303e-05, "loss": 0.0178, "step": 3280 }, { "epoch": 5.652920962199313, "grad_norm": 0.1877431720495224, "learning_rate": 8.591343297555947e-05, "loss": 0.0208, "step": 3290 }, { "epoch": 5.670103092783505, "grad_norm": 0.38843151926994324, "learning_rate": 8.581443623457785e-05, "loss": 0.0277, "step": 3300 }, { "epoch": 5.687285223367698, "grad_norm": 0.22977161407470703, "learning_rate": 8.571515031808484e-05, "loss": 0.0169, "step": 3310 }, { "epoch": 5.70446735395189, "grad_norm": 0.39261528849601746, "learning_rate": 8.561557602774196e-05, "loss": 0.0151, "step": 3320 }, { "epoch": 5.721649484536083, "grad_norm": 0.20939397811889648, "learning_rate": 8.551571416753912e-05, "loss": 0.0247, "step": 3330 }, { "epoch": 5.738831615120275, "grad_norm": 0.3138323724269867, "learning_rate": 8.54155655437882e-05, "loss": 0.0202, "step": 3340 }, { "epoch": 5.756013745704467, "grad_norm": 0.2958749830722809, "learning_rate": 8.531513096511646e-05, "loss": 0.0239, "step": 3350 }, { "epoch": 5.77319587628866, "grad_norm": 0.43186619877815247, "learning_rate": 8.521441124246002e-05, "loss": 0.028, "step": 3360 }, { "epoch": 5.790378006872852, "grad_norm": 0.3215327262878418, "learning_rate": 8.511340718905737e-05, "loss": 0.0273, "step": 3370 }, { "epoch": 5.8075601374570445, "grad_norm": 0.584010899066925, "learning_rate": 8.501211962044275e-05, "loss": 0.0241, "step": 3380 }, { "epoch": 5.824742268041237, "grad_norm": 0.4198577404022217, "learning_rate": 8.491054935443954e-05, "loss": 0.0197, "step": 3390 }, { "epoch": 5.841924398625429, "grad_norm": 0.4115603566169739, "learning_rate": 8.480869721115375e-05, "loss": 0.0201, "step": 3400 }, { "epoch": 5.859106529209622, "grad_norm": 0.19503287971019745, "learning_rate": 8.470656401296732e-05, "loss": 0.0226, "step": 3410 }, { "epoch": 5.876288659793815, "grad_norm": 0.3533823490142822, "learning_rate": 8.460415058453153e-05, "loss": 0.0245, "step": 3420 }, { "epoch": 5.893470790378007, "grad_norm": 0.22459329664707184, "learning_rate": 8.450145775276024e-05, "loss": 0.0203, "step": 3430 }, { "epoch": 5.9106529209622, "grad_norm": 0.5531524419784546, "learning_rate": 8.439848634682337e-05, "loss": 0.0347, "step": 3440 }, { "epoch": 5.927835051546392, "grad_norm": 0.3939720690250397, "learning_rate": 8.429523719814008e-05, "loss": 0.0217, "step": 3450 }, { "epoch": 5.945017182130584, "grad_norm": 0.26560521125793457, "learning_rate": 8.419171114037214e-05, "loss": 0.0249, "step": 3460 }, { "epoch": 5.962199312714777, "grad_norm": 0.26765570044517517, "learning_rate": 8.40879090094171e-05, "loss": 0.022, "step": 3470 }, { "epoch": 5.979381443298969, "grad_norm": 0.32663553953170776, "learning_rate": 8.398383164340167e-05, "loss": 0.0234, "step": 3480 }, { "epoch": 5.9965635738831615, "grad_norm": 0.3831205368041992, "learning_rate": 8.387947988267482e-05, "loss": 0.0265, "step": 3490 }, { "epoch": 6.013745704467354, "grad_norm": 0.35195666551589966, "learning_rate": 8.37748545698011e-05, "loss": 0.0182, "step": 3500 }, { "epoch": 6.030927835051546, "grad_norm": 0.3900887370109558, "learning_rate": 8.366995654955375e-05, "loss": 0.0234, "step": 3510 }, { "epoch": 6.048109965635739, "grad_norm": 0.41412341594696045, "learning_rate": 8.356478666890798e-05, "loss": 0.0254, "step": 3520 }, { "epoch": 6.065292096219931, "grad_norm": 0.2592662572860718, "learning_rate": 8.345934577703403e-05, "loss": 0.0163, "step": 3530 }, { "epoch": 6.082474226804123, "grad_norm": 0.3936319947242737, "learning_rate": 8.335363472529038e-05, "loss": 0.0266, "step": 3540 }, { "epoch": 6.099656357388316, "grad_norm": 0.3583790957927704, "learning_rate": 8.324765436721688e-05, "loss": 0.0178, "step": 3550 }, { "epoch": 6.116838487972508, "grad_norm": 0.4558425843715668, "learning_rate": 8.314140555852777e-05, "loss": 0.0259, "step": 3560 }, { "epoch": 6.134020618556701, "grad_norm": 0.3604467213153839, "learning_rate": 8.303488915710484e-05, "loss": 0.027, "step": 3570 }, { "epoch": 6.151202749140894, "grad_norm": 0.22830836474895477, "learning_rate": 8.292810602299059e-05, "loss": 0.0239, "step": 3580 }, { "epoch": 6.168384879725086, "grad_norm": 0.18954436480998993, "learning_rate": 8.282105701838106e-05, "loss": 0.0203, "step": 3590 }, { "epoch": 6.185567010309279, "grad_norm": 0.25453153252601624, "learning_rate": 8.271374300761911e-05, "loss": 0.0247, "step": 3600 }, { "epoch": 6.202749140893471, "grad_norm": 0.3951425552368164, "learning_rate": 8.260616485718727e-05, "loss": 0.0256, "step": 3610 }, { "epoch": 6.219931271477663, "grad_norm": 0.3867959976196289, "learning_rate": 8.249832343570082e-05, "loss": 0.0218, "step": 3620 }, { "epoch": 6.237113402061856, "grad_norm": 0.24521775543689728, "learning_rate": 8.239021961390078e-05, "loss": 0.0258, "step": 3630 }, { "epoch": 6.254295532646048, "grad_norm": 0.3367408215999603, "learning_rate": 8.228185426464684e-05, "loss": 0.0184, "step": 3640 }, { "epoch": 6.2714776632302405, "grad_norm": 0.28449004888534546, "learning_rate": 8.217322826291032e-05, "loss": 0.0235, "step": 3650 }, { "epoch": 6.288659793814433, "grad_norm": 0.23285141587257385, "learning_rate": 8.206434248576718e-05, "loss": 0.0249, "step": 3660 }, { "epoch": 6.305841924398625, "grad_norm": 0.4478093087673187, "learning_rate": 8.195519781239079e-05, "loss": 0.023, "step": 3670 }, { "epoch": 6.323024054982818, "grad_norm": 0.3469564914703369, "learning_rate": 8.1845795124045e-05, "loss": 0.022, "step": 3680 }, { "epoch": 6.34020618556701, "grad_norm": 0.24919480085372925, "learning_rate": 8.173613530407691e-05, "loss": 0.0191, "step": 3690 }, { "epoch": 6.357388316151202, "grad_norm": 0.27461591362953186, "learning_rate": 8.162621923790974e-05, "loss": 0.0222, "step": 3700 }, { "epoch": 6.374570446735396, "grad_norm": 0.35929545760154724, "learning_rate": 8.151604781303577e-05, "loss": 0.021, "step": 3710 }, { "epoch": 6.391752577319588, "grad_norm": 0.4438592791557312, "learning_rate": 8.140562191900909e-05, "loss": 0.0266, "step": 3720 }, { "epoch": 6.40893470790378, "grad_norm": 0.35622140765190125, "learning_rate": 8.129494244743842e-05, "loss": 0.0227, "step": 3730 }, { "epoch": 6.426116838487973, "grad_norm": 0.3558623790740967, "learning_rate": 8.118401029197996e-05, "loss": 0.0294, "step": 3740 }, { "epoch": 6.443298969072165, "grad_norm": 0.30522775650024414, "learning_rate": 8.107282634833015e-05, "loss": 0.0221, "step": 3750 }, { "epoch": 6.4604810996563575, "grad_norm": 0.3831705152988434, "learning_rate": 8.096139151421842e-05, "loss": 0.0198, "step": 3760 }, { "epoch": 6.47766323024055, "grad_norm": 0.2840515673160553, "learning_rate": 8.084970668939998e-05, "loss": 0.0215, "step": 3770 }, { "epoch": 6.494845360824742, "grad_norm": 0.30647212266921997, "learning_rate": 8.07377727756485e-05, "loss": 0.0179, "step": 3780 }, { "epoch": 6.512027491408935, "grad_norm": 0.2785893380641937, "learning_rate": 8.06255906767489e-05, "loss": 0.0205, "step": 3790 }, { "epoch": 6.529209621993127, "grad_norm": 0.30499890446662903, "learning_rate": 8.051316129849e-05, "loss": 0.0273, "step": 3800 }, { "epoch": 6.546391752577319, "grad_norm": 0.26266801357269287, "learning_rate": 8.04004855486572e-05, "loss": 0.0264, "step": 3810 }, { "epoch": 6.563573883161512, "grad_norm": 0.24231982231140137, "learning_rate": 8.02875643370252e-05, "loss": 0.0213, "step": 3820 }, { "epoch": 6.580756013745704, "grad_norm": 0.21040430665016174, "learning_rate": 8.01743985753506e-05, "loss": 0.0182, "step": 3830 }, { "epoch": 6.597938144329897, "grad_norm": 0.30289342999458313, "learning_rate": 8.006098917736461e-05, "loss": 0.0231, "step": 3840 }, { "epoch": 6.615120274914089, "grad_norm": 0.3555678427219391, "learning_rate": 7.994733705876558e-05, "loss": 0.0188, "step": 3850 }, { "epoch": 6.632302405498281, "grad_norm": 0.2579226493835449, "learning_rate": 7.983344313721166e-05, "loss": 0.0218, "step": 3860 }, { "epoch": 6.649484536082475, "grad_norm": 0.3365667164325714, "learning_rate": 7.971930833231338e-05, "loss": 0.0202, "step": 3870 }, { "epoch": 6.666666666666667, "grad_norm": 0.14933505654335022, "learning_rate": 7.960493356562624e-05, "loss": 0.0208, "step": 3880 }, { "epoch": 6.683848797250859, "grad_norm": 0.1804179847240448, "learning_rate": 7.949031976064327e-05, "loss": 0.0237, "step": 3890 }, { "epoch": 6.701030927835052, "grad_norm": 0.3504364788532257, "learning_rate": 7.937546784278753e-05, "loss": 0.0185, "step": 3900 }, { "epoch": 6.718213058419244, "grad_norm": 0.2884169816970825, "learning_rate": 7.926037873940469e-05, "loss": 0.0166, "step": 3910 }, { "epoch": 6.7353951890034365, "grad_norm": 0.3002106547355652, "learning_rate": 7.91450533797555e-05, "loss": 0.0198, "step": 3920 }, { "epoch": 6.752577319587629, "grad_norm": 0.4687197506427765, "learning_rate": 7.902949269500835e-05, "loss": 0.0222, "step": 3930 }, { "epoch": 6.769759450171821, "grad_norm": 0.3774946331977844, "learning_rate": 7.891369761823164e-05, "loss": 0.0245, "step": 3940 }, { "epoch": 6.786941580756014, "grad_norm": 0.43464595079421997, "learning_rate": 7.879766908438638e-05, "loss": 0.0238, "step": 3950 }, { "epoch": 6.804123711340206, "grad_norm": 0.338309109210968, "learning_rate": 7.868140803031853e-05, "loss": 0.0322, "step": 3960 }, { "epoch": 6.821305841924398, "grad_norm": 0.4015257954597473, "learning_rate": 7.85649153947515e-05, "loss": 0.0219, "step": 3970 }, { "epoch": 6.838487972508591, "grad_norm": 0.45906925201416016, "learning_rate": 7.844819211827861e-05, "loss": 0.0219, "step": 3980 }, { "epoch": 6.855670103092783, "grad_norm": 0.25680992007255554, "learning_rate": 7.83312391433553e-05, "loss": 0.0191, "step": 3990 }, { "epoch": 6.872852233676976, "grad_norm": 0.35143017768859863, "learning_rate": 7.821405741429179e-05, "loss": 0.0172, "step": 4000 }, { "epoch": 6.890034364261169, "grad_norm": 0.2712146043777466, "learning_rate": 7.809664787724527e-05, "loss": 0.0207, "step": 4010 }, { "epoch": 6.907216494845361, "grad_norm": 0.29727092385292053, "learning_rate": 7.79790114802123e-05, "loss": 0.0156, "step": 4020 }, { "epoch": 6.9243986254295535, "grad_norm": 0.2169165462255478, "learning_rate": 7.786114917302118e-05, "loss": 0.0213, "step": 4030 }, { "epoch": 6.941580756013746, "grad_norm": 0.27955862879753113, "learning_rate": 7.77430619073243e-05, "loss": 0.0205, "step": 4040 }, { "epoch": 6.958762886597938, "grad_norm": 0.1645563244819641, "learning_rate": 7.762475063659038e-05, "loss": 0.0233, "step": 4050 }, { "epoch": 6.975945017182131, "grad_norm": 0.22075164318084717, "learning_rate": 7.750621631609684e-05, "loss": 0.0229, "step": 4060 }, { "epoch": 6.993127147766323, "grad_norm": 0.19816423952579498, "learning_rate": 7.738745990292208e-05, "loss": 0.0269, "step": 4070 }, { "epoch": 7.010309278350515, "grad_norm": 0.2696300148963928, "learning_rate": 7.726848235593771e-05, "loss": 0.0209, "step": 4080 }, { "epoch": 7.027491408934708, "grad_norm": 0.3369583785533905, "learning_rate": 7.714928463580084e-05, "loss": 0.0192, "step": 4090 }, { "epoch": 7.0446735395189, "grad_norm": 0.20987503230571747, "learning_rate": 7.702986770494633e-05, "loss": 0.0203, "step": 4100 }, { "epoch": 7.061855670103093, "grad_norm": 0.220789834856987, "learning_rate": 7.691023252757901e-05, "loss": 0.0176, "step": 4110 }, { "epoch": 7.079037800687285, "grad_norm": 0.37767869234085083, "learning_rate": 7.679038006966587e-05, "loss": 0.0208, "step": 4120 }, { "epoch": 7.096219931271477, "grad_norm": 0.3345067799091339, "learning_rate": 7.66703112989283e-05, "loss": 0.0168, "step": 4130 }, { "epoch": 7.11340206185567, "grad_norm": 0.3052999973297119, "learning_rate": 7.655002718483424e-05, "loss": 0.0143, "step": 4140 }, { "epoch": 7.130584192439863, "grad_norm": 0.3365825414657593, "learning_rate": 7.64295286985904e-05, "loss": 0.0206, "step": 4150 }, { "epoch": 7.147766323024055, "grad_norm": 0.19445881247520447, "learning_rate": 7.630881681313436e-05, "loss": 0.0164, "step": 4160 }, { "epoch": 7.164948453608248, "grad_norm": 0.3136243224143982, "learning_rate": 7.618789250312675e-05, "loss": 0.0141, "step": 4170 }, { "epoch": 7.18213058419244, "grad_norm": 0.19267341494560242, "learning_rate": 7.606675674494341e-05, "loss": 0.0178, "step": 4180 }, { "epoch": 7.1993127147766325, "grad_norm": 0.1413758099079132, "learning_rate": 7.594541051666742e-05, "loss": 0.0179, "step": 4190 }, { "epoch": 7.216494845360825, "grad_norm": 0.19496262073516846, "learning_rate": 7.582385479808127e-05, "loss": 0.0141, "step": 4200 }, { "epoch": 7.233676975945017, "grad_norm": 0.27552464604377747, "learning_rate": 7.570209057065894e-05, "loss": 0.0184, "step": 4210 }, { "epoch": 7.25085910652921, "grad_norm": 0.19228124618530273, "learning_rate": 7.558011881755797e-05, "loss": 0.0144, "step": 4220 }, { "epoch": 7.268041237113402, "grad_norm": 0.29144996404647827, "learning_rate": 7.545794052361149e-05, "loss": 0.0172, "step": 4230 }, { "epoch": 7.285223367697594, "grad_norm": 0.21185532212257385, "learning_rate": 7.533555667532035e-05, "loss": 0.0126, "step": 4240 }, { "epoch": 7.302405498281787, "grad_norm": 0.4170054495334625, "learning_rate": 7.521296826084503e-05, "loss": 0.0231, "step": 4250 }, { "epoch": 7.319587628865979, "grad_norm": 0.26662755012512207, "learning_rate": 7.50901762699978e-05, "loss": 0.0161, "step": 4260 }, { "epoch": 7.3367697594501715, "grad_norm": 0.323034405708313, "learning_rate": 7.496718169423462e-05, "loss": 0.0172, "step": 4270 }, { "epoch": 7.353951890034364, "grad_norm": 0.22829285264015198, "learning_rate": 7.484398552664722e-05, "loss": 0.018, "step": 4280 }, { "epoch": 7.371134020618557, "grad_norm": 0.4607219099998474, "learning_rate": 7.472058876195496e-05, "loss": 0.0302, "step": 4290 }, { "epoch": 7.3883161512027495, "grad_norm": 0.3345796763896942, "learning_rate": 7.459699239649696e-05, "loss": 0.0177, "step": 4300 }, { "epoch": 7.405498281786942, "grad_norm": 0.35420554876327515, "learning_rate": 7.447319742822392e-05, "loss": 0.0166, "step": 4310 }, { "epoch": 7.422680412371134, "grad_norm": 0.2642367482185364, "learning_rate": 7.43492048566901e-05, "loss": 0.0186, "step": 4320 }, { "epoch": 7.439862542955327, "grad_norm": 0.3100736141204834, "learning_rate": 7.422501568304535e-05, "loss": 0.0242, "step": 4330 }, { "epoch": 7.457044673539519, "grad_norm": 0.34664222598075867, "learning_rate": 7.410063091002682e-05, "loss": 0.0139, "step": 4340 }, { "epoch": 7.474226804123711, "grad_norm": 0.2938918471336365, "learning_rate": 7.397605154195106e-05, "loss": 0.016, "step": 4350 }, { "epoch": 7.491408934707904, "grad_norm": 0.34358811378479004, "learning_rate": 7.385127858470582e-05, "loss": 0.0178, "step": 4360 }, { "epoch": 7.508591065292096, "grad_norm": 0.2687462568283081, "learning_rate": 7.372631304574194e-05, "loss": 0.0186, "step": 4370 }, { "epoch": 7.525773195876289, "grad_norm": 0.24897870421409607, "learning_rate": 7.36011559340652e-05, "loss": 0.0178, "step": 4380 }, { "epoch": 7.542955326460481, "grad_norm": 0.36554020643234253, "learning_rate": 7.347580826022821e-05, "loss": 0.0218, "step": 4390 }, { "epoch": 7.560137457044673, "grad_norm": 0.25389084219932556, "learning_rate": 7.335027103632223e-05, "loss": 0.016, "step": 4400 }, { "epoch": 7.577319587628866, "grad_norm": 0.3902638256549835, "learning_rate": 7.322454527596898e-05, "loss": 0.0179, "step": 4410 }, { "epoch": 7.594501718213058, "grad_norm": 0.220624178647995, "learning_rate": 7.30986319943125e-05, "loss": 0.0147, "step": 4420 }, { "epoch": 7.6116838487972505, "grad_norm": 0.3258158266544342, "learning_rate": 7.29725322080109e-05, "loss": 0.0179, "step": 4430 }, { "epoch": 7.628865979381443, "grad_norm": 0.34806087613105774, "learning_rate": 7.28462469352282e-05, "loss": 0.0218, "step": 4440 }, { "epoch": 7.646048109965636, "grad_norm": 0.28883498907089233, "learning_rate": 7.271977719562611e-05, "loss": 0.017, "step": 4450 }, { "epoch": 7.6632302405498285, "grad_norm": 0.17664246261119843, "learning_rate": 7.259312401035572e-05, "loss": 0.0154, "step": 4460 }, { "epoch": 7.680412371134021, "grad_norm": 0.3173231780529022, "learning_rate": 7.246628840204935e-05, "loss": 0.0209, "step": 4470 }, { "epoch": 7.697594501718213, "grad_norm": 0.34185221791267395, "learning_rate": 7.233927139481224e-05, "loss": 0.0174, "step": 4480 }, { "epoch": 7.714776632302406, "grad_norm": 0.3024695813655853, "learning_rate": 7.221207401421428e-05, "loss": 0.021, "step": 4490 }, { "epoch": 7.731958762886598, "grad_norm": 0.3330129086971283, "learning_rate": 7.208469728728178e-05, "loss": 0.0295, "step": 4500 }, { "epoch": 7.74914089347079, "grad_norm": 0.29602715373039246, "learning_rate": 7.195714224248912e-05, "loss": 0.0182, "step": 4510 }, { "epoch": 7.766323024054983, "grad_norm": 0.18014559149742126, "learning_rate": 7.182940990975048e-05, "loss": 0.0196, "step": 4520 }, { "epoch": 7.783505154639175, "grad_norm": 0.2823367714881897, "learning_rate": 7.170150132041146e-05, "loss": 0.0233, "step": 4530 }, { "epoch": 7.8006872852233675, "grad_norm": 0.31760045886039734, "learning_rate": 7.15734175072409e-05, "loss": 0.0155, "step": 4540 }, { "epoch": 7.81786941580756, "grad_norm": 0.2565371096134186, "learning_rate": 7.144515950442232e-05, "loss": 0.0221, "step": 4550 }, { "epoch": 7.835051546391752, "grad_norm": 0.39871808886528015, "learning_rate": 7.131672834754582e-05, "loss": 0.0184, "step": 4560 }, { "epoch": 7.852233676975945, "grad_norm": 0.3175216615200043, "learning_rate": 7.11881250735995e-05, "loss": 0.0184, "step": 4570 }, { "epoch": 7.869415807560138, "grad_norm": 0.37690746784210205, "learning_rate": 7.105935072096125e-05, "loss": 0.0198, "step": 4580 }, { "epoch": 7.88659793814433, "grad_norm": 0.24447882175445557, "learning_rate": 7.093040632939023e-05, "loss": 0.0151, "step": 4590 }, { "epoch": 7.903780068728523, "grad_norm": 0.2845030725002289, "learning_rate": 7.08012929400186e-05, "loss": 0.0168, "step": 4600 }, { "epoch": 7.920962199312715, "grad_norm": 0.2334176003932953, "learning_rate": 7.067201159534299e-05, "loss": 0.0238, "step": 4610 }, { "epoch": 7.938144329896907, "grad_norm": 0.2806495726108551, "learning_rate": 7.054256333921623e-05, "loss": 0.021, "step": 4620 }, { "epoch": 7.9553264604811, "grad_norm": 0.25240814685821533, "learning_rate": 7.041294921683876e-05, "loss": 0.0153, "step": 4630 }, { "epoch": 7.972508591065292, "grad_norm": 0.4571000337600708, "learning_rate": 7.02831702747503e-05, "loss": 0.0175, "step": 4640 }, { "epoch": 7.989690721649485, "grad_norm": 0.27207332849502563, "learning_rate": 7.01532275608214e-05, "loss": 0.0198, "step": 4650 }, { "epoch": 8.006872852233677, "grad_norm": 0.3235473930835724, "learning_rate": 7.002312212424488e-05, "loss": 0.0243, "step": 4660 }, { "epoch": 8.02405498281787, "grad_norm": 0.20169375836849213, "learning_rate": 6.989285501552751e-05, "loss": 0.0254, "step": 4670 }, { "epoch": 8.041237113402062, "grad_norm": 0.23877793550491333, "learning_rate": 6.976242728648137e-05, "loss": 0.0125, "step": 4680 }, { "epoch": 8.058419243986254, "grad_norm": 0.2380063384771347, "learning_rate": 6.963183999021546e-05, "loss": 0.0293, "step": 4690 }, { "epoch": 8.075601374570446, "grad_norm": 0.27434396743774414, "learning_rate": 6.95010941811272e-05, "loss": 0.0203, "step": 4700 }, { "epoch": 8.092783505154639, "grad_norm": 0.24492555856704712, "learning_rate": 6.93701909148938e-05, "loss": 0.0196, "step": 4710 }, { "epoch": 8.109965635738831, "grad_norm": 0.22814416885375977, "learning_rate": 6.923913124846397e-05, "loss": 0.0174, "step": 4720 }, { "epoch": 8.127147766323024, "grad_norm": 0.2595348358154297, "learning_rate": 6.910791624004907e-05, "loss": 0.0151, "step": 4730 }, { "epoch": 8.144329896907216, "grad_norm": 0.40572383999824524, "learning_rate": 6.897654694911486e-05, "loss": 0.021, "step": 4740 }, { "epoch": 8.161512027491408, "grad_norm": 0.36821913719177246, "learning_rate": 6.884502443637273e-05, "loss": 0.0167, "step": 4750 }, { "epoch": 8.1786941580756, "grad_norm": 0.2500125467777252, "learning_rate": 6.871334976377132e-05, "loss": 0.016, "step": 4760 }, { "epoch": 8.195876288659793, "grad_norm": 0.2473415732383728, "learning_rate": 6.858152399448773e-05, "loss": 0.0187, "step": 4770 }, { "epoch": 8.213058419243985, "grad_norm": 0.2067149579524994, "learning_rate": 6.844954819291918e-05, "loss": 0.0264, "step": 4780 }, { "epoch": 8.230240549828178, "grad_norm": 0.24544283747673035, "learning_rate": 6.831742342467418e-05, "loss": 0.0207, "step": 4790 }, { "epoch": 8.24742268041237, "grad_norm": 0.30843910574913025, "learning_rate": 6.818515075656412e-05, "loss": 0.017, "step": 4800 }, { "epoch": 8.264604810996564, "grad_norm": 0.3309854567050934, "learning_rate": 6.805273125659455e-05, "loss": 0.0179, "step": 4810 }, { "epoch": 8.281786941580757, "grad_norm": 0.21837979555130005, "learning_rate": 6.792016599395655e-05, "loss": 0.011, "step": 4820 }, { "epoch": 8.29896907216495, "grad_norm": 0.3258560597896576, "learning_rate": 6.778745603901817e-05, "loss": 0.0168, "step": 4830 }, { "epoch": 8.316151202749142, "grad_norm": 0.3291252553462982, "learning_rate": 6.765460246331573e-05, "loss": 0.0197, "step": 4840 }, { "epoch": 8.333333333333334, "grad_norm": 0.33732980489730835, "learning_rate": 6.752160633954515e-05, "loss": 0.0138, "step": 4850 }, { "epoch": 8.350515463917526, "grad_norm": 0.2825522720813751, "learning_rate": 6.73884687415534e-05, "loss": 0.0156, "step": 4860 }, { "epoch": 8.367697594501719, "grad_norm": 0.28338858485221863, "learning_rate": 6.725519074432965e-05, "loss": 0.0215, "step": 4870 }, { "epoch": 8.384879725085911, "grad_norm": 0.258777916431427, "learning_rate": 6.712177342399679e-05, "loss": 0.0197, "step": 4880 }, { "epoch": 8.402061855670103, "grad_norm": 0.3022059202194214, "learning_rate": 6.698821785780257e-05, "loss": 0.0177, "step": 4890 }, { "epoch": 8.419243986254296, "grad_norm": 0.23812155425548553, "learning_rate": 6.685452512411102e-05, "loss": 0.0179, "step": 4900 }, { "epoch": 8.436426116838488, "grad_norm": 0.1747688353061676, "learning_rate": 6.672069630239366e-05, "loss": 0.0207, "step": 4910 }, { "epoch": 8.45360824742268, "grad_norm": 0.38623926043510437, "learning_rate": 6.658673247322086e-05, "loss": 0.02, "step": 4920 }, { "epoch": 8.470790378006873, "grad_norm": 0.2657296657562256, "learning_rate": 6.645263471825303e-05, "loss": 0.0139, "step": 4930 }, { "epoch": 8.487972508591065, "grad_norm": 0.3186751902103424, "learning_rate": 6.631840412023201e-05, "loss": 0.0163, "step": 4940 }, { "epoch": 8.505154639175258, "grad_norm": 0.22730350494384766, "learning_rate": 6.618404176297217e-05, "loss": 0.015, "step": 4950 }, { "epoch": 8.52233676975945, "grad_norm": 0.4089230000972748, "learning_rate": 6.604954873135178e-05, "loss": 0.017, "step": 4960 }, { "epoch": 8.539518900343642, "grad_norm": 0.2689635753631592, "learning_rate": 6.591492611130421e-05, "loss": 0.0166, "step": 4970 }, { "epoch": 8.556701030927835, "grad_norm": 0.2454978972673416, "learning_rate": 6.578017498980913e-05, "loss": 0.0133, "step": 4980 }, { "epoch": 8.573883161512027, "grad_norm": 0.17714907228946686, "learning_rate": 6.564529645488383e-05, "loss": 0.018, "step": 4990 }, { "epoch": 8.59106529209622, "grad_norm": 0.26513901352882385, "learning_rate": 6.551029159557431e-05, "loss": 0.0194, "step": 5000 }, { "epoch": 8.608247422680412, "grad_norm": 0.20744717121124268, "learning_rate": 6.537516150194656e-05, "loss": 0.0187, "step": 5010 }, { "epoch": 8.625429553264604, "grad_norm": 0.21573315560817719, "learning_rate": 6.523990726507777e-05, "loss": 0.0178, "step": 5020 }, { "epoch": 8.642611683848797, "grad_norm": 0.1612836867570877, "learning_rate": 6.510452997704748e-05, "loss": 0.0195, "step": 5030 }, { "epoch": 8.65979381443299, "grad_norm": 0.23485371470451355, "learning_rate": 6.496903073092878e-05, "loss": 0.016, "step": 5040 }, { "epoch": 8.676975945017182, "grad_norm": 0.25484392046928406, "learning_rate": 6.483341062077948e-05, "loss": 0.0141, "step": 5050 }, { "epoch": 8.694158075601374, "grad_norm": 0.24695904552936554, "learning_rate": 6.46976707416333e-05, "loss": 0.0122, "step": 5060 }, { "epoch": 8.711340206185566, "grad_norm": 0.10241147875785828, "learning_rate": 6.456181218949096e-05, "loss": 0.0175, "step": 5070 }, { "epoch": 8.728522336769759, "grad_norm": 0.31217408180236816, "learning_rate": 6.442583606131143e-05, "loss": 0.0151, "step": 5080 }, { "epoch": 8.745704467353953, "grad_norm": 0.3693694472312927, "learning_rate": 6.428974345500299e-05, "loss": 0.0199, "step": 5090 }, { "epoch": 8.762886597938145, "grad_norm": 0.23745276033878326, "learning_rate": 6.415353546941441e-05, "loss": 0.0221, "step": 5100 }, { "epoch": 8.780068728522338, "grad_norm": 0.20179122686386108, "learning_rate": 6.401721320432604e-05, "loss": 0.0155, "step": 5110 }, { "epoch": 8.79725085910653, "grad_norm": 0.36349353194236755, "learning_rate": 6.388077776044102e-05, "loss": 0.0183, "step": 5120 }, { "epoch": 8.814432989690722, "grad_norm": 0.274783194065094, "learning_rate": 6.374423023937621e-05, "loss": 0.0153, "step": 5130 }, { "epoch": 8.831615120274915, "grad_norm": 0.29849973320961, "learning_rate": 6.360757174365355e-05, "loss": 0.0174, "step": 5140 }, { "epoch": 8.848797250859107, "grad_norm": 0.21367676556110382, "learning_rate": 6.34708033766909e-05, "loss": 0.0181, "step": 5150 }, { "epoch": 8.8659793814433, "grad_norm": 0.23595260083675385, "learning_rate": 6.333392624279333e-05, "loss": 0.0174, "step": 5160 }, { "epoch": 8.883161512027492, "grad_norm": 0.16049842536449432, "learning_rate": 6.319694144714407e-05, "loss": 0.0151, "step": 5170 }, { "epoch": 8.900343642611684, "grad_norm": 0.2062782198190689, "learning_rate": 6.30598500957957e-05, "loss": 0.0211, "step": 5180 }, { "epoch": 8.917525773195877, "grad_norm": 0.3576521873474121, "learning_rate": 6.292265329566108e-05, "loss": 0.0149, "step": 5190 }, { "epoch": 8.934707903780069, "grad_norm": 0.28101012110710144, "learning_rate": 6.278535215450458e-05, "loss": 0.0162, "step": 5200 }, { "epoch": 8.951890034364261, "grad_norm": 0.2609540522098541, "learning_rate": 6.264794778093297e-05, "loss": 0.0171, "step": 5210 }, { "epoch": 8.969072164948454, "grad_norm": 0.27727997303009033, "learning_rate": 6.25104412843866e-05, "loss": 0.0132, "step": 5220 }, { "epoch": 8.986254295532646, "grad_norm": 0.21067747473716736, "learning_rate": 6.237283377513036e-05, "loss": 0.0168, "step": 5230 }, { "epoch": 9.003436426116838, "grad_norm": 0.41480588912963867, "learning_rate": 6.223512636424478e-05, "loss": 0.0197, "step": 5240 }, { "epoch": 9.02061855670103, "grad_norm": 0.2617255449295044, "learning_rate": 6.209732016361696e-05, "loss": 0.0106, "step": 5250 }, { "epoch": 9.037800687285223, "grad_norm": 0.1343929022550583, "learning_rate": 6.19594162859317e-05, "loss": 0.0186, "step": 5260 }, { "epoch": 9.054982817869416, "grad_norm": 0.22022658586502075, "learning_rate": 6.182141584466247e-05, "loss": 0.0152, "step": 5270 }, { "epoch": 9.072164948453608, "grad_norm": 0.19647003710269928, "learning_rate": 6.168331995406244e-05, "loss": 0.0124, "step": 5280 }, { "epoch": 9.0893470790378, "grad_norm": 0.225993350148201, "learning_rate": 6.154512972915542e-05, "loss": 0.0182, "step": 5290 }, { "epoch": 9.106529209621993, "grad_norm": 0.2652854919433594, "learning_rate": 6.140684628572688e-05, "loss": 0.0203, "step": 5300 }, { "epoch": 9.123711340206185, "grad_norm": 0.18200494349002838, "learning_rate": 6.126847074031507e-05, "loss": 0.0241, "step": 5310 }, { "epoch": 9.140893470790378, "grad_norm": 0.24488599598407745, "learning_rate": 6.113000421020176e-05, "loss": 0.0178, "step": 5320 }, { "epoch": 9.15807560137457, "grad_norm": 0.28431186079978943, "learning_rate": 6.099144781340347e-05, "loss": 0.0231, "step": 5330 }, { "epoch": 9.175257731958762, "grad_norm": 0.2814132869243622, "learning_rate": 6.0852802668662256e-05, "loss": 0.0191, "step": 5340 }, { "epoch": 9.192439862542955, "grad_norm": 0.33205386996269226, "learning_rate": 6.071406989543678e-05, "loss": 0.0177, "step": 5350 }, { "epoch": 9.209621993127147, "grad_norm": 0.24390940368175507, "learning_rate": 6.057525061389324e-05, "loss": 0.0217, "step": 5360 }, { "epoch": 9.22680412371134, "grad_norm": 0.18197228014469147, "learning_rate": 6.04363459448963e-05, "loss": 0.0126, "step": 5370 }, { "epoch": 9.243986254295532, "grad_norm": 0.2006153017282486, "learning_rate": 6.0297357010000124e-05, "loss": 0.0171, "step": 5380 }, { "epoch": 9.261168384879726, "grad_norm": 0.199944868683815, "learning_rate": 6.0158284931439177e-05, "loss": 0.0165, "step": 5390 }, { "epoch": 9.278350515463918, "grad_norm": 0.1962256133556366, "learning_rate": 6.001913083211932e-05, "loss": 0.0198, "step": 5400 }, { "epoch": 9.29553264604811, "grad_norm": 0.2808385491371155, "learning_rate": 5.987989583560864e-05, "loss": 0.0164, "step": 5410 }, { "epoch": 9.312714776632303, "grad_norm": 0.24396586418151855, "learning_rate": 5.9740581066128435e-05, "loss": 0.0202, "step": 5420 }, { "epoch": 9.329896907216495, "grad_norm": 0.28668099641799927, "learning_rate": 5.9601187648544056e-05, "loss": 0.0156, "step": 5430 }, { "epoch": 9.347079037800688, "grad_norm": 0.25964459776878357, "learning_rate": 5.946171670835594e-05, "loss": 0.0197, "step": 5440 }, { "epoch": 9.36426116838488, "grad_norm": 0.3509371876716614, "learning_rate": 5.932216937169044e-05, "loss": 0.0229, "step": 5450 }, { "epoch": 9.381443298969073, "grad_norm": 0.29809918999671936, "learning_rate": 5.918254676529076e-05, "loss": 0.0134, "step": 5460 }, { "epoch": 9.398625429553265, "grad_norm": 0.20090153813362122, "learning_rate": 5.904285001650783e-05, "loss": 0.0184, "step": 5470 }, { "epoch": 9.415807560137457, "grad_norm": 0.3226790726184845, "learning_rate": 5.890308025329125e-05, "loss": 0.017, "step": 5480 }, { "epoch": 9.43298969072165, "grad_norm": 0.2159719467163086, "learning_rate": 5.876323860418016e-05, "loss": 0.0133, "step": 5490 }, { "epoch": 9.450171821305842, "grad_norm": 0.2575574219226837, "learning_rate": 5.8623326198294116e-05, "loss": 0.0156, "step": 5500 }, { "epoch": 9.467353951890034, "grad_norm": 0.2184896171092987, "learning_rate": 5.8483344165323975e-05, "loss": 0.0156, "step": 5510 }, { "epoch": 9.484536082474227, "grad_norm": 0.2843054533004761, "learning_rate": 5.834329363552279e-05, "loss": 0.0163, "step": 5520 }, { "epoch": 9.50171821305842, "grad_norm": 0.3006589710712433, "learning_rate": 5.820317573969669e-05, "loss": 0.0155, "step": 5530 }, { "epoch": 9.518900343642612, "grad_norm": 0.23060756921768188, "learning_rate": 5.806299160919573e-05, "loss": 0.0127, "step": 5540 }, { "epoch": 9.536082474226804, "grad_norm": 0.23474593460559845, "learning_rate": 5.792274237590471e-05, "loss": 0.0151, "step": 5550 }, { "epoch": 9.553264604810996, "grad_norm": 0.2775484621524811, "learning_rate": 5.7782429172234206e-05, "loss": 0.0194, "step": 5560 }, { "epoch": 9.570446735395189, "grad_norm": 0.20381006598472595, "learning_rate": 5.7642053131111186e-05, "loss": 0.0205, "step": 5570 }, { "epoch": 9.587628865979381, "grad_norm": 0.2642858028411865, "learning_rate": 5.7501615385970044e-05, "loss": 0.012, "step": 5580 }, { "epoch": 9.604810996563574, "grad_norm": 0.14698222279548645, "learning_rate": 5.7361117070743374e-05, "loss": 0.0151, "step": 5590 }, { "epoch": 9.621993127147766, "grad_norm": 0.2586089074611664, "learning_rate": 5.722055931985285e-05, "loss": 0.0173, "step": 5600 }, { "epoch": 9.639175257731958, "grad_norm": 0.2857683002948761, "learning_rate": 5.707994326820002e-05, "loss": 0.0173, "step": 5610 }, { "epoch": 9.65635738831615, "grad_norm": 0.25553369522094727, "learning_rate": 5.693927005115719e-05, "loss": 0.0193, "step": 5620 }, { "epoch": 9.673539518900343, "grad_norm": 0.2712913453578949, "learning_rate": 5.679854080455821e-05, "loss": 0.0104, "step": 5630 }, { "epoch": 9.690721649484535, "grad_norm": 0.2559773325920105, "learning_rate": 5.665775666468933e-05, "loss": 0.0144, "step": 5640 }, { "epoch": 9.707903780068728, "grad_norm": 0.3383992910385132, "learning_rate": 5.651691876828007e-05, "loss": 0.0144, "step": 5650 }, { "epoch": 9.72508591065292, "grad_norm": 0.3301098048686981, "learning_rate": 5.637602825249394e-05, "loss": 0.0157, "step": 5660 }, { "epoch": 9.742268041237114, "grad_norm": 0.22163395583629608, "learning_rate": 5.6235086254919324e-05, "loss": 0.0112, "step": 5670 }, { "epoch": 9.759450171821307, "grad_norm": 0.10947784781455994, "learning_rate": 5.609409391356031e-05, "loss": 0.0212, "step": 5680 }, { "epoch": 9.776632302405499, "grad_norm": 0.17621196806430817, "learning_rate": 5.595305236682743e-05, "loss": 0.0099, "step": 5690 }, { "epoch": 9.793814432989691, "grad_norm": 0.1827089488506317, "learning_rate": 5.581196275352858e-05, "loss": 0.018, "step": 5700 }, { "epoch": 9.810996563573884, "grad_norm": 0.20134034752845764, "learning_rate": 5.567082621285969e-05, "loss": 0.02, "step": 5710 }, { "epoch": 9.828178694158076, "grad_norm": 0.2766471803188324, "learning_rate": 5.5529643884395654e-05, "loss": 0.0125, "step": 5720 }, { "epoch": 9.845360824742269, "grad_norm": 0.1543634682893753, "learning_rate": 5.538841690808101e-05, "loss": 0.0166, "step": 5730 }, { "epoch": 9.862542955326461, "grad_norm": 0.2152809351682663, "learning_rate": 5.524714642422084e-05, "loss": 0.01, "step": 5740 }, { "epoch": 9.879725085910653, "grad_norm": 0.32943928241729736, "learning_rate": 5.510583357347149e-05, "loss": 0.0166, "step": 5750 }, { "epoch": 9.896907216494846, "grad_norm": 0.24659444391727448, "learning_rate": 5.4964479496831425e-05, "loss": 0.0173, "step": 5760 }, { "epoch": 9.914089347079038, "grad_norm": 0.173888698220253, "learning_rate": 5.482308533563193e-05, "loss": 0.0094, "step": 5770 }, { "epoch": 9.93127147766323, "grad_norm": 0.19505925476551056, "learning_rate": 5.468165223152798e-05, "loss": 0.0142, "step": 5780 }, { "epoch": 9.948453608247423, "grad_norm": 0.25433164834976196, "learning_rate": 5.454018132648897e-05, "loss": 0.015, "step": 5790 }, { "epoch": 9.965635738831615, "grad_norm": 0.26114964485168457, "learning_rate": 5.439867376278952e-05, "loss": 0.0136, "step": 5800 }, { "epoch": 9.982817869415808, "grad_norm": 0.36945995688438416, "learning_rate": 5.425713068300022e-05, "loss": 0.0213, "step": 5810 }, { "epoch": 10.0, "grad_norm": 0.4136284291744232, "learning_rate": 5.411555322997846e-05, "loss": 0.0235, "step": 5820 }, { "epoch": 10.017182130584192, "grad_norm": 0.2530066967010498, "learning_rate": 5.3973942546859145e-05, "loss": 0.0159, "step": 5830 }, { "epoch": 10.034364261168385, "grad_norm": 0.3279346227645874, "learning_rate": 5.3832299777045495e-05, "loss": 0.0123, "step": 5840 }, { "epoch": 10.051546391752577, "grad_norm": 0.2813730239868164, "learning_rate": 5.36906260641998e-05, "loss": 0.0152, "step": 5850 }, { "epoch": 10.06872852233677, "grad_norm": 0.2074098438024521, "learning_rate": 5.354892255223421e-05, "loss": 0.0134, "step": 5860 }, { "epoch": 10.085910652920962, "grad_norm": 0.2736356854438782, "learning_rate": 5.3407190385301456e-05, "loss": 0.0104, "step": 5870 }, { "epoch": 10.103092783505154, "grad_norm": 0.25040575861930847, "learning_rate": 5.3265430707785666e-05, "loss": 0.0172, "step": 5880 }, { "epoch": 10.120274914089347, "grad_norm": 0.3141660988330841, "learning_rate": 5.312364466429307e-05, "loss": 0.0125, "step": 5890 }, { "epoch": 10.137457044673539, "grad_norm": 0.16908888518810272, "learning_rate": 5.298183339964281e-05, "loss": 0.0117, "step": 5900 }, { "epoch": 10.154639175257731, "grad_norm": 0.2192607969045639, "learning_rate": 5.283999805885764e-05, "loss": 0.0212, "step": 5910 }, { "epoch": 10.171821305841924, "grad_norm": 0.14075499773025513, "learning_rate": 5.269813978715474e-05, "loss": 0.0171, "step": 5920 }, { "epoch": 10.189003436426116, "grad_norm": 0.15797455608844757, "learning_rate": 5.255625972993642e-05, "loss": 0.0143, "step": 5930 }, { "epoch": 10.206185567010309, "grad_norm": 0.22639349102973938, "learning_rate": 5.24143590327809e-05, "loss": 0.0174, "step": 5940 }, { "epoch": 10.223367697594501, "grad_norm": 0.22572936117649078, "learning_rate": 5.227243884143306e-05, "loss": 0.0123, "step": 5950 }, { "epoch": 10.240549828178693, "grad_norm": 0.24433186650276184, "learning_rate": 5.213050030179515e-05, "loss": 0.0152, "step": 5960 }, { "epoch": 10.257731958762886, "grad_norm": 0.2180275022983551, "learning_rate": 5.198854455991763e-05, "loss": 0.0136, "step": 5970 }, { "epoch": 10.27491408934708, "grad_norm": 0.1412176787853241, "learning_rate": 5.184657276198978e-05, "loss": 0.0083, "step": 5980 }, { "epoch": 10.292096219931272, "grad_norm": 0.23186911642551422, "learning_rate": 5.170458605433059e-05, "loss": 0.0128, "step": 5990 }, { "epoch": 10.309278350515465, "grad_norm": 0.2739560604095459, "learning_rate": 5.15625855833794e-05, "loss": 0.0212, "step": 6000 }, { "epoch": 10.326460481099657, "grad_norm": 0.2591661512851715, "learning_rate": 5.1420572495686646e-05, "loss": 0.0153, "step": 6010 }, { "epoch": 10.34364261168385, "grad_norm": 0.301039457321167, "learning_rate": 5.127854793790473e-05, "loss": 0.0128, "step": 6020 }, { "epoch": 10.360824742268042, "grad_norm": 0.30792465806007385, "learning_rate": 5.113651305677856e-05, "loss": 0.0206, "step": 6030 }, { "epoch": 10.378006872852234, "grad_norm": 0.20730407536029816, "learning_rate": 5.099446899913648e-05, "loss": 0.0184, "step": 6040 }, { "epoch": 10.395189003436426, "grad_norm": 0.2361646145582199, "learning_rate": 5.085241691188086e-05, "loss": 0.0142, "step": 6050 }, { "epoch": 10.412371134020619, "grad_norm": 0.15994442999362946, "learning_rate": 5.071035794197898e-05, "loss": 0.0128, "step": 6060 }, { "epoch": 10.429553264604811, "grad_norm": 0.1956380158662796, "learning_rate": 5.0568293236453614e-05, "loss": 0.0139, "step": 6070 }, { "epoch": 10.446735395189004, "grad_norm": 0.14793916046619415, "learning_rate": 5.042622394237391e-05, "loss": 0.01, "step": 6080 }, { "epoch": 10.463917525773196, "grad_norm": 0.23033088445663452, "learning_rate": 5.0284151206845996e-05, "loss": 0.0104, "step": 6090 }, { "epoch": 10.481099656357388, "grad_norm": 0.21595941483974457, "learning_rate": 5.014207617700388e-05, "loss": 0.0208, "step": 6100 }, { "epoch": 10.49828178694158, "grad_norm": 0.325511634349823, "learning_rate": 5e-05, "loss": 0.0226, "step": 6110 }, { "epoch": 10.515463917525773, "grad_norm": 0.24100159108638763, "learning_rate": 4.985792382299614e-05, "loss": 0.013, "step": 6120 }, { "epoch": 10.532646048109966, "grad_norm": 0.2464800477027893, "learning_rate": 4.9715848793154e-05, "loss": 0.0171, "step": 6130 }, { "epoch": 10.549828178694158, "grad_norm": 0.24693673849105835, "learning_rate": 4.957377605762611e-05, "loss": 0.015, "step": 6140 }, { "epoch": 10.56701030927835, "grad_norm": 0.13398700952529907, "learning_rate": 4.94317067635464e-05, "loss": 0.0126, "step": 6150 }, { "epoch": 10.584192439862543, "grad_norm": 0.2720285654067993, "learning_rate": 4.9289642058021043e-05, "loss": 0.0161, "step": 6160 }, { "epoch": 10.601374570446735, "grad_norm": 0.2861359119415283, "learning_rate": 4.914758308811913e-05, "loss": 0.0137, "step": 6170 }, { "epoch": 10.618556701030927, "grad_norm": 0.15878301858901978, "learning_rate": 4.900553100086353e-05, "loss": 0.0173, "step": 6180 }, { "epoch": 10.63573883161512, "grad_norm": 0.33061495423316956, "learning_rate": 4.886348694322145e-05, "loss": 0.013, "step": 6190 }, { "epoch": 10.652920962199312, "grad_norm": 0.30866488814353943, "learning_rate": 4.8721452062095294e-05, "loss": 0.0168, "step": 6200 }, { "epoch": 10.670103092783505, "grad_norm": 0.24568206071853638, "learning_rate": 4.8579427504313366e-05, "loss": 0.0152, "step": 6210 }, { "epoch": 10.687285223367697, "grad_norm": 0.24803771078586578, "learning_rate": 4.843741441662062e-05, "loss": 0.0178, "step": 6220 }, { "epoch": 10.70446735395189, "grad_norm": 0.17046746611595154, "learning_rate": 4.829541394566942e-05, "loss": 0.0124, "step": 6230 }, { "epoch": 10.721649484536082, "grad_norm": 0.22589251399040222, "learning_rate": 4.8153427238010227e-05, "loss": 0.014, "step": 6240 }, { "epoch": 10.738831615120276, "grad_norm": 0.24486307799816132, "learning_rate": 4.801145544008239e-05, "loss": 0.014, "step": 6250 }, { "epoch": 10.756013745704468, "grad_norm": 0.13196790218353271, "learning_rate": 4.7869499698204864e-05, "loss": 0.0144, "step": 6260 }, { "epoch": 10.77319587628866, "grad_norm": 0.20505741238594055, "learning_rate": 4.772756115856695e-05, "loss": 0.0146, "step": 6270 }, { "epoch": 10.790378006872853, "grad_norm": 0.22166849672794342, "learning_rate": 4.758564096721911e-05, "loss": 0.0143, "step": 6280 }, { "epoch": 10.807560137457045, "grad_norm": 0.27348771691322327, "learning_rate": 4.7443740270063584e-05, "loss": 0.0137, "step": 6290 }, { "epoch": 10.824742268041238, "grad_norm": 0.2516573667526245, "learning_rate": 4.7301860212845264e-05, "loss": 0.0142, "step": 6300 }, { "epoch": 10.84192439862543, "grad_norm": 0.28733956813812256, "learning_rate": 4.7160001941142365e-05, "loss": 0.0123, "step": 6310 }, { "epoch": 10.859106529209622, "grad_norm": 0.3413456678390503, "learning_rate": 4.7018166600357204e-05, "loss": 0.0142, "step": 6320 }, { "epoch": 10.876288659793815, "grad_norm": 0.3347049653530121, "learning_rate": 4.687635533570693e-05, "loss": 0.0137, "step": 6330 }, { "epoch": 10.893470790378007, "grad_norm": 0.3175305426120758, "learning_rate": 4.673456929221434e-05, "loss": 0.0205, "step": 6340 }, { "epoch": 10.9106529209622, "grad_norm": 0.1658443957567215, "learning_rate": 4.6592809614698556e-05, "loss": 0.013, "step": 6350 }, { "epoch": 10.927835051546392, "grad_norm": 0.12746182084083557, "learning_rate": 4.645107744776581e-05, "loss": 0.0126, "step": 6360 }, { "epoch": 10.945017182130584, "grad_norm": 0.20812661945819855, "learning_rate": 4.6309373935800205e-05, "loss": 0.0149, "step": 6370 }, { "epoch": 10.962199312714777, "grad_norm": 0.18740630149841309, "learning_rate": 4.616770022295451e-05, "loss": 0.0115, "step": 6380 }, { "epoch": 10.97938144329897, "grad_norm": 0.18948382139205933, "learning_rate": 4.602605745314087e-05, "loss": 0.0197, "step": 6390 }, { "epoch": 10.996563573883162, "grad_norm": 0.4297175109386444, "learning_rate": 4.5884446770021555e-05, "loss": 0.016, "step": 6400 }, { "epoch": 11.013745704467354, "grad_norm": 0.2623024880886078, "learning_rate": 4.574286931699978e-05, "loss": 0.0142, "step": 6410 }, { "epoch": 11.030927835051546, "grad_norm": 0.2243795096874237, "learning_rate": 4.560132623721049e-05, "loss": 0.0156, "step": 6420 }, { "epoch": 11.048109965635739, "grad_norm": 0.20103001594543457, "learning_rate": 4.545981867351104e-05, "loss": 0.0116, "step": 6430 }, { "epoch": 11.065292096219931, "grad_norm": 0.11890780925750732, "learning_rate": 4.5318347768472035e-05, "loss": 0.0081, "step": 6440 }, { "epoch": 11.082474226804123, "grad_norm": 0.26694929599761963, "learning_rate": 4.517691466436807e-05, "loss": 0.0155, "step": 6450 }, { "epoch": 11.099656357388316, "grad_norm": 0.18821591138839722, "learning_rate": 4.5035520503168586e-05, "loss": 0.0104, "step": 6460 }, { "epoch": 11.116838487972508, "grad_norm": 0.27548283338546753, "learning_rate": 4.4894166426528524e-05, "loss": 0.0114, "step": 6470 }, { "epoch": 11.1340206185567, "grad_norm": 0.1965043544769287, "learning_rate": 4.4752853575779185e-05, "loss": 0.0104, "step": 6480 }, { "epoch": 11.151202749140893, "grad_norm": 0.21741580963134766, "learning_rate": 4.4611583091919e-05, "loss": 0.0117, "step": 6490 }, { "epoch": 11.168384879725085, "grad_norm": 0.1215846836566925, "learning_rate": 4.4470356115604364e-05, "loss": 0.0093, "step": 6500 }, { "epoch": 11.185567010309278, "grad_norm": 0.1946978121995926, "learning_rate": 4.432917378714032e-05, "loss": 0.0194, "step": 6510 }, { "epoch": 11.20274914089347, "grad_norm": 0.22516775131225586, "learning_rate": 4.418803724647144e-05, "loss": 0.0149, "step": 6520 }, { "epoch": 11.219931271477662, "grad_norm": 0.22346094250679016, "learning_rate": 4.4046947633172566e-05, "loss": 0.0091, "step": 6530 }, { "epoch": 11.237113402061855, "grad_norm": 0.23928742110729218, "learning_rate": 4.3905906086439704e-05, "loss": 0.0164, "step": 6540 }, { "epoch": 11.254295532646047, "grad_norm": 0.34528031945228577, "learning_rate": 4.3764913745080695e-05, "loss": 0.0145, "step": 6550 }, { "epoch": 11.271477663230241, "grad_norm": 0.134693905711174, "learning_rate": 4.362397174750608e-05, "loss": 0.0076, "step": 6560 }, { "epoch": 11.288659793814434, "grad_norm": 0.35505372285842896, "learning_rate": 4.348308123171994e-05, "loss": 0.0138, "step": 6570 }, { "epoch": 11.305841924398626, "grad_norm": 0.17052118480205536, "learning_rate": 4.334224333531068e-05, "loss": 0.012, "step": 6580 }, { "epoch": 11.323024054982818, "grad_norm": 0.19103099405765533, "learning_rate": 4.32014591954418e-05, "loss": 0.013, "step": 6590 }, { "epoch": 11.34020618556701, "grad_norm": 0.20789751410484314, "learning_rate": 4.306072994884282e-05, "loss": 0.0091, "step": 6600 }, { "epoch": 11.357388316151203, "grad_norm": 0.2590029537677765, "learning_rate": 4.292005673179998e-05, "loss": 0.008, "step": 6610 }, { "epoch": 11.374570446735396, "grad_norm": 0.16030985116958618, "learning_rate": 4.277944068014716e-05, "loss": 0.0142, "step": 6620 }, { "epoch": 11.391752577319588, "grad_norm": 0.34259387850761414, "learning_rate": 4.263888292925664e-05, "loss": 0.0115, "step": 6630 }, { "epoch": 11.40893470790378, "grad_norm": 0.24973253905773163, "learning_rate": 4.249838461402997e-05, "loss": 0.0112, "step": 6640 }, { "epoch": 11.426116838487973, "grad_norm": 0.40062564611434937, "learning_rate": 4.235794686888882e-05, "loss": 0.0111, "step": 6650 }, { "epoch": 11.443298969072165, "grad_norm": 0.23818433284759521, "learning_rate": 4.22175708277658e-05, "loss": 0.0124, "step": 6660 }, { "epoch": 11.460481099656358, "grad_norm": 0.17521892488002777, "learning_rate": 4.207725762409529e-05, "loss": 0.0186, "step": 6670 }, { "epoch": 11.47766323024055, "grad_norm": 0.2232678085565567, "learning_rate": 4.19370083908043e-05, "loss": 0.012, "step": 6680 }, { "epoch": 11.494845360824742, "grad_norm": 0.1600189507007599, "learning_rate": 4.179682426030331e-05, "loss": 0.0107, "step": 6690 }, { "epoch": 11.512027491408935, "grad_norm": 0.3540445566177368, "learning_rate": 4.1656706364477214e-05, "loss": 0.0182, "step": 6700 }, { "epoch": 11.529209621993127, "grad_norm": 0.39657342433929443, "learning_rate": 4.151665583467604e-05, "loss": 0.0157, "step": 6710 }, { "epoch": 11.54639175257732, "grad_norm": 0.35762307047843933, "learning_rate": 4.137667380170591e-05, "loss": 0.0115, "step": 6720 }, { "epoch": 11.563573883161512, "grad_norm": 0.28293389081954956, "learning_rate": 4.123676139581984e-05, "loss": 0.0194, "step": 6730 }, { "epoch": 11.580756013745704, "grad_norm": 0.1835634410381317, "learning_rate": 4.1096919746708754e-05, "loss": 0.0143, "step": 6740 }, { "epoch": 11.597938144329897, "grad_norm": 0.1975705772638321, "learning_rate": 4.095714998349218e-05, "loss": 0.016, "step": 6750 }, { "epoch": 11.615120274914089, "grad_norm": 0.17618152499198914, "learning_rate": 4.081745323470926e-05, "loss": 0.0198, "step": 6760 }, { "epoch": 11.632302405498281, "grad_norm": 0.1503658890724182, "learning_rate": 4.067783062830955e-05, "loss": 0.0156, "step": 6770 }, { "epoch": 11.649484536082474, "grad_norm": 0.28605377674102783, "learning_rate": 4.053828329164407e-05, "loss": 0.0146, "step": 6780 }, { "epoch": 11.666666666666666, "grad_norm": 0.3132267892360687, "learning_rate": 4.0398812351455955e-05, "loss": 0.0102, "step": 6790 }, { "epoch": 11.683848797250858, "grad_norm": 0.2057536542415619, "learning_rate": 4.025941893387159e-05, "loss": 0.0176, "step": 6800 }, { "epoch": 11.70103092783505, "grad_norm": 0.2427815943956375, "learning_rate": 4.012010416439136e-05, "loss": 0.0132, "step": 6810 }, { "epoch": 11.718213058419243, "grad_norm": 0.2931414246559143, "learning_rate": 3.998086916788069e-05, "loss": 0.0108, "step": 6820 }, { "epoch": 11.735395189003437, "grad_norm": 0.2122270166873932, "learning_rate": 3.9841715068560835e-05, "loss": 0.0146, "step": 6830 }, { "epoch": 11.75257731958763, "grad_norm": 0.3742753565311432, "learning_rate": 3.970264298999991e-05, "loss": 0.0128, "step": 6840 }, { "epoch": 11.769759450171822, "grad_norm": 0.13350647687911987, "learning_rate": 3.956365405510369e-05, "loss": 0.0105, "step": 6850 }, { "epoch": 11.786941580756015, "grad_norm": 0.2694711685180664, "learning_rate": 3.942474938610677e-05, "loss": 0.0117, "step": 6860 }, { "epoch": 11.804123711340207, "grad_norm": 0.2818795144557953, "learning_rate": 3.9285930104563234e-05, "loss": 0.0086, "step": 6870 }, { "epoch": 11.8213058419244, "grad_norm": 0.2870750427246094, "learning_rate": 3.914719733133776e-05, "loss": 0.012, "step": 6880 }, { "epoch": 11.838487972508592, "grad_norm": 0.14688880741596222, "learning_rate": 3.900855218659655e-05, "loss": 0.0169, "step": 6890 }, { "epoch": 11.855670103092784, "grad_norm": 0.1673170030117035, "learning_rate": 3.886999578979824e-05, "loss": 0.011, "step": 6900 }, { "epoch": 11.872852233676976, "grad_norm": 0.3427187502384186, "learning_rate": 3.873152925968495e-05, "loss": 0.0172, "step": 6910 }, { "epoch": 11.890034364261169, "grad_norm": 0.32928958535194397, "learning_rate": 3.859315371427312e-05, "loss": 0.0157, "step": 6920 }, { "epoch": 11.907216494845361, "grad_norm": 0.2496093362569809, "learning_rate": 3.8454870270844593e-05, "loss": 0.0119, "step": 6930 }, { "epoch": 11.924398625429554, "grad_norm": 0.15401820838451385, "learning_rate": 3.831668004593756e-05, "loss": 0.0115, "step": 6940 }, { "epoch": 11.941580756013746, "grad_norm": 0.14115320146083832, "learning_rate": 3.8178584155337525e-05, "loss": 0.0106, "step": 6950 }, { "epoch": 11.958762886597938, "grad_norm": 0.20622394979000092, "learning_rate": 3.804058371406831e-05, "loss": 0.0138, "step": 6960 }, { "epoch": 11.97594501718213, "grad_norm": 0.11186587810516357, "learning_rate": 3.790267983638305e-05, "loss": 0.0152, "step": 6970 }, { "epoch": 11.993127147766323, "grad_norm": 0.18001288175582886, "learning_rate": 3.776487363575524e-05, "loss": 0.0098, "step": 6980 }, { "epoch": 12.010309278350515, "grad_norm": 0.3391369879245758, "learning_rate": 3.762716622486965e-05, "loss": 0.0234, "step": 6990 }, { "epoch": 12.027491408934708, "grad_norm": 0.19333554804325104, "learning_rate": 3.748955871561341e-05, "loss": 0.0127, "step": 7000 }, { "epoch": 12.0446735395189, "grad_norm": 0.2803151607513428, "learning_rate": 3.735205221906703e-05, "loss": 0.0141, "step": 7010 }, { "epoch": 12.061855670103093, "grad_norm": 0.28076592087745667, "learning_rate": 3.721464784549543e-05, "loss": 0.0116, "step": 7020 }, { "epoch": 12.079037800687285, "grad_norm": 0.3014523386955261, "learning_rate": 3.7077346704338935e-05, "loss": 0.014, "step": 7030 }, { "epoch": 12.096219931271477, "grad_norm": 0.15294674038887024, "learning_rate": 3.694014990420433e-05, "loss": 0.0133, "step": 7040 }, { "epoch": 12.11340206185567, "grad_norm": 0.21652719378471375, "learning_rate": 3.680305855285593e-05, "loss": 0.0106, "step": 7050 }, { "epoch": 12.130584192439862, "grad_norm": 0.24568617343902588, "learning_rate": 3.6666073757206686e-05, "loss": 0.0114, "step": 7060 }, { "epoch": 12.147766323024054, "grad_norm": 0.2690240144729614, "learning_rate": 3.6529196623309115e-05, "loss": 0.016, "step": 7070 }, { "epoch": 12.164948453608247, "grad_norm": 0.21522220969200134, "learning_rate": 3.6392428256346475e-05, "loss": 0.0136, "step": 7080 }, { "epoch": 12.18213058419244, "grad_norm": 0.25682464241981506, "learning_rate": 3.625576976062379e-05, "loss": 0.0119, "step": 7090 }, { "epoch": 12.199312714776632, "grad_norm": 0.15618295967578888, "learning_rate": 3.6119222239559e-05, "loss": 0.0131, "step": 7100 }, { "epoch": 12.216494845360824, "grad_norm": 0.21718665957450867, "learning_rate": 3.598278679567397e-05, "loss": 0.0153, "step": 7110 }, { "epoch": 12.233676975945016, "grad_norm": 0.17358386516571045, "learning_rate": 3.5846464530585624e-05, "loss": 0.0105, "step": 7120 }, { "epoch": 12.250859106529209, "grad_norm": 0.2519778907299042, "learning_rate": 3.571025654499702e-05, "loss": 0.0157, "step": 7130 }, { "epoch": 12.268041237113403, "grad_norm": 0.26433685421943665, "learning_rate": 3.557416393868859e-05, "loss": 0.0131, "step": 7140 }, { "epoch": 12.285223367697595, "grad_norm": 0.2645297646522522, "learning_rate": 3.543818781050906e-05, "loss": 0.0098, "step": 7150 }, { "epoch": 12.302405498281788, "grad_norm": 0.23010118305683136, "learning_rate": 3.530232925836673e-05, "loss": 0.018, "step": 7160 }, { "epoch": 12.31958762886598, "grad_norm": 0.08610416948795319, "learning_rate": 3.516658937922051e-05, "loss": 0.0095, "step": 7170 }, { "epoch": 12.336769759450172, "grad_norm": 0.14161959290504456, "learning_rate": 3.503096926907123e-05, "loss": 0.0153, "step": 7180 }, { "epoch": 12.353951890034365, "grad_norm": 0.3274645209312439, "learning_rate": 3.4895470022952536e-05, "loss": 0.0118, "step": 7190 }, { "epoch": 12.371134020618557, "grad_norm": 0.16021353006362915, "learning_rate": 3.476009273492225e-05, "loss": 0.0138, "step": 7200 }, { "epoch": 12.38831615120275, "grad_norm": 0.2030124068260193, "learning_rate": 3.462483849805346e-05, "loss": 0.0106, "step": 7210 }, { "epoch": 12.405498281786942, "grad_norm": 0.15385638177394867, "learning_rate": 3.4489708404425704e-05, "loss": 0.0102, "step": 7220 }, { "epoch": 12.422680412371134, "grad_norm": 0.10668976604938507, "learning_rate": 3.4354703545116185e-05, "loss": 0.0109, "step": 7230 }, { "epoch": 12.439862542955327, "grad_norm": 0.16402071714401245, "learning_rate": 3.421982501019087e-05, "loss": 0.0108, "step": 7240 }, { "epoch": 12.457044673539519, "grad_norm": 0.10426975041627884, "learning_rate": 3.4085073888695804e-05, "loss": 0.0103, "step": 7250 }, { "epoch": 12.474226804123711, "grad_norm": 0.23913106322288513, "learning_rate": 3.3950451268648235e-05, "loss": 0.0103, "step": 7260 }, { "epoch": 12.491408934707904, "grad_norm": 0.1630750596523285, "learning_rate": 3.381595823702784e-05, "loss": 0.018, "step": 7270 }, { "epoch": 12.508591065292096, "grad_norm": 0.3311632573604584, "learning_rate": 3.368159587976799e-05, "loss": 0.0089, "step": 7280 }, { "epoch": 12.525773195876289, "grad_norm": 0.45006489753723145, "learning_rate": 3.354736528174696e-05, "loss": 0.0124, "step": 7290 }, { "epoch": 12.542955326460481, "grad_norm": 0.23996764421463013, "learning_rate": 3.341326752677916e-05, "loss": 0.0179, "step": 7300 }, { "epoch": 12.560137457044673, "grad_norm": 0.14841718971729279, "learning_rate": 3.3279303697606354e-05, "loss": 0.0063, "step": 7310 }, { "epoch": 12.577319587628866, "grad_norm": 0.09983796626329422, "learning_rate": 3.314547487588901e-05, "loss": 0.0096, "step": 7320 }, { "epoch": 12.594501718213058, "grad_norm": 0.17602872848510742, "learning_rate": 3.301178214219744e-05, "loss": 0.009, "step": 7330 }, { "epoch": 12.61168384879725, "grad_norm": 0.24939224123954773, "learning_rate": 3.2878226576003225e-05, "loss": 0.013, "step": 7340 }, { "epoch": 12.628865979381443, "grad_norm": 0.17927074432373047, "learning_rate": 3.274480925567036e-05, "loss": 0.011, "step": 7350 }, { "epoch": 12.646048109965635, "grad_norm": 0.17862512171268463, "learning_rate": 3.261153125844663e-05, "loss": 0.0149, "step": 7360 }, { "epoch": 12.663230240549828, "grad_norm": 0.2447875738143921, "learning_rate": 3.247839366045485e-05, "loss": 0.0137, "step": 7370 }, { "epoch": 12.68041237113402, "grad_norm": 0.2494247555732727, "learning_rate": 3.2345397536684286e-05, "loss": 0.0126, "step": 7380 }, { "epoch": 12.697594501718212, "grad_norm": 0.1975736767053604, "learning_rate": 3.2212543960981845e-05, "loss": 0.0104, "step": 7390 }, { "epoch": 12.714776632302405, "grad_norm": 0.18755072355270386, "learning_rate": 3.207983400604347e-05, "loss": 0.009, "step": 7400 }, { "epoch": 12.731958762886597, "grad_norm": 0.2701716423034668, "learning_rate": 3.1947268743405457e-05, "loss": 0.0136, "step": 7410 }, { "epoch": 12.749140893470791, "grad_norm": 0.2599675953388214, "learning_rate": 3.1814849243435886e-05, "loss": 0.0217, "step": 7420 }, { "epoch": 12.766323024054984, "grad_norm": 0.25833481550216675, "learning_rate": 3.168257657532584e-05, "loss": 0.0135, "step": 7430 }, { "epoch": 12.783505154639176, "grad_norm": 0.33644336462020874, "learning_rate": 3.155045180708085e-05, "loss": 0.0098, "step": 7440 }, { "epoch": 12.800687285223368, "grad_norm": 0.12960873544216156, "learning_rate": 3.1418476005512265e-05, "loss": 0.0099, "step": 7450 }, { "epoch": 12.81786941580756, "grad_norm": 0.09624414145946503, "learning_rate": 3.1286650236228696e-05, "loss": 0.0084, "step": 7460 }, { "epoch": 12.835051546391753, "grad_norm": 0.175624817609787, "learning_rate": 3.115497556362727e-05, "loss": 0.0137, "step": 7470 }, { "epoch": 12.852233676975946, "grad_norm": 0.11060360819101334, "learning_rate": 3.102345305088516e-05, "loss": 0.0136, "step": 7480 }, { "epoch": 12.869415807560138, "grad_norm": 0.1332932859659195, "learning_rate": 3.089208375995092e-05, "loss": 0.0141, "step": 7490 }, { "epoch": 12.88659793814433, "grad_norm": 0.1730755716562271, "learning_rate": 3.0760868751536045e-05, "loss": 0.0111, "step": 7500 }, { "epoch": 12.903780068728523, "grad_norm": 0.16571182012557983, "learning_rate": 3.06298090851062e-05, "loss": 0.0078, "step": 7510 }, { "epoch": 12.920962199312715, "grad_norm": 0.2591513395309448, "learning_rate": 3.0498905818872836e-05, "loss": 0.0148, "step": 7520 }, { "epoch": 12.938144329896907, "grad_norm": 0.1701243668794632, "learning_rate": 3.036816000978455e-05, "loss": 0.0159, "step": 7530 }, { "epoch": 12.9553264604811, "grad_norm": 0.29323557019233704, "learning_rate": 3.0237572713518647e-05, "loss": 0.0127, "step": 7540 }, { "epoch": 12.972508591065292, "grad_norm": 0.2534872889518738, "learning_rate": 3.0107144984472502e-05, "loss": 0.0163, "step": 7550 }, { "epoch": 12.989690721649485, "grad_norm": 0.1676417738199234, "learning_rate": 2.9976877875755128e-05, "loss": 0.0083, "step": 7560 }, { "epoch": 13.006872852233677, "grad_norm": 0.11713390052318573, "learning_rate": 2.984677243917861e-05, "loss": 0.0082, "step": 7570 }, { "epoch": 13.02405498281787, "grad_norm": 0.35955625772476196, "learning_rate": 2.9716829725249707e-05, "loss": 0.0125, "step": 7580 }, { "epoch": 13.041237113402062, "grad_norm": 0.1874362677335739, "learning_rate": 2.9587050783161252e-05, "loss": 0.0112, "step": 7590 }, { "epoch": 13.058419243986254, "grad_norm": 0.06738214194774628, "learning_rate": 2.9457436660783784e-05, "loss": 0.0138, "step": 7600 }, { "epoch": 13.075601374570446, "grad_norm": 0.22004689276218414, "learning_rate": 2.9327988404657002e-05, "loss": 0.0105, "step": 7610 }, { "epoch": 13.092783505154639, "grad_norm": 0.11634822189807892, "learning_rate": 2.9198707059981413e-05, "loss": 0.0073, "step": 7620 }, { "epoch": 13.109965635738831, "grad_norm": 0.08798322826623917, "learning_rate": 2.9069593670609775e-05, "loss": 0.018, "step": 7630 }, { "epoch": 13.127147766323024, "grad_norm": 0.11149155348539352, "learning_rate": 2.8940649279038768e-05, "loss": 0.0091, "step": 7640 }, { "epoch": 13.144329896907216, "grad_norm": 0.1387196183204651, "learning_rate": 2.8811874926400483e-05, "loss": 0.0101, "step": 7650 }, { "epoch": 13.161512027491408, "grad_norm": 0.10784903913736343, "learning_rate": 2.868327165245419e-05, "loss": 0.0125, "step": 7660 }, { "epoch": 13.1786941580756, "grad_norm": 0.293300598859787, "learning_rate": 2.8554840495577682e-05, "loss": 0.0099, "step": 7670 }, { "epoch": 13.195876288659793, "grad_norm": 0.1339499056339264, "learning_rate": 2.8426582492759134e-05, "loss": 0.0089, "step": 7680 }, { "epoch": 13.213058419243985, "grad_norm": 0.1549367606639862, "learning_rate": 2.8298498679588525e-05, "loss": 0.0108, "step": 7690 }, { "epoch": 13.230240549828178, "grad_norm": 0.20458447933197021, "learning_rate": 2.817059009024953e-05, "loss": 0.0081, "step": 7700 }, { "epoch": 13.24742268041237, "grad_norm": 0.17270691692829132, "learning_rate": 2.8042857757510877e-05, "loss": 0.0094, "step": 7710 }, { "epoch": 13.264604810996564, "grad_norm": 0.17686305940151215, "learning_rate": 2.7915302712718227e-05, "loss": 0.0143, "step": 7720 }, { "epoch": 13.281786941580757, "grad_norm": 0.2391350120306015, "learning_rate": 2.7787925985785733e-05, "loss": 0.0127, "step": 7730 }, { "epoch": 13.29896907216495, "grad_norm": 0.21285896003246307, "learning_rate": 2.7660728605187776e-05, "loss": 0.0092, "step": 7740 }, { "epoch": 13.316151202749142, "grad_norm": 0.2621266841888428, "learning_rate": 2.753371159795065e-05, "loss": 0.0128, "step": 7750 }, { "epoch": 13.333333333333334, "grad_norm": 0.13718031346797943, "learning_rate": 2.740687598964429e-05, "loss": 0.0113, "step": 7760 }, { "epoch": 13.350515463917526, "grad_norm": 0.10009155422449112, "learning_rate": 2.7280222804373895e-05, "loss": 0.0088, "step": 7770 }, { "epoch": 13.367697594501719, "grad_norm": 0.18854975700378418, "learning_rate": 2.7153753064771792e-05, "loss": 0.0102, "step": 7780 }, { "epoch": 13.384879725085911, "grad_norm": 0.3908763825893402, "learning_rate": 2.702746779198912e-05, "loss": 0.0139, "step": 7790 }, { "epoch": 13.402061855670103, "grad_norm": 0.08939257264137268, "learning_rate": 2.690136800568752e-05, "loss": 0.0083, "step": 7800 }, { "epoch": 13.419243986254296, "grad_norm": 0.2188216745853424, "learning_rate": 2.6775454724031036e-05, "loss": 0.0114, "step": 7810 }, { "epoch": 13.436426116838488, "grad_norm": 0.13271217048168182, "learning_rate": 2.6649728963677783e-05, "loss": 0.0088, "step": 7820 }, { "epoch": 13.45360824742268, "grad_norm": 0.2332095205783844, "learning_rate": 2.6524191739771815e-05, "loss": 0.0105, "step": 7830 }, { "epoch": 13.470790378006873, "grad_norm": 0.11752445995807648, "learning_rate": 2.639884406593482e-05, "loss": 0.0099, "step": 7840 }, { "epoch": 13.487972508591065, "grad_norm": 0.22209575772285461, "learning_rate": 2.627368695425808e-05, "loss": 0.0098, "step": 7850 }, { "epoch": 13.505154639175258, "grad_norm": 0.18378068506717682, "learning_rate": 2.6148721415294186e-05, "loss": 0.0099, "step": 7860 }, { "epoch": 13.52233676975945, "grad_norm": 0.14152808487415314, "learning_rate": 2.6023948458048965e-05, "loss": 0.0102, "step": 7870 }, { "epoch": 13.539518900343642, "grad_norm": 0.17036782205104828, "learning_rate": 2.589936908997321e-05, "loss": 0.0096, "step": 7880 }, { "epoch": 13.556701030927835, "grad_norm": 0.16514594852924347, "learning_rate": 2.5774984316954676e-05, "loss": 0.0117, "step": 7890 }, { "epoch": 13.573883161512027, "grad_norm": 0.23391450941562653, "learning_rate": 2.5650795143309902e-05, "loss": 0.0136, "step": 7900 }, { "epoch": 13.59106529209622, "grad_norm": 0.22688448429107666, "learning_rate": 2.552680257177611e-05, "loss": 0.0099, "step": 7910 }, { "epoch": 13.608247422680412, "grad_norm": 0.15916913747787476, "learning_rate": 2.5403007603503053e-05, "loss": 0.0089, "step": 7920 }, { "epoch": 13.625429553264604, "grad_norm": 0.31960004568099976, "learning_rate": 2.527941123804504e-05, "loss": 0.0105, "step": 7930 }, { "epoch": 13.642611683848797, "grad_norm": 0.17471857368946075, "learning_rate": 2.5156014473352785e-05, "loss": 0.0133, "step": 7940 }, { "epoch": 13.65979381443299, "grad_norm": 0.16793015599250793, "learning_rate": 2.5032818305765383e-05, "loss": 0.0084, "step": 7950 }, { "epoch": 13.676975945017182, "grad_norm": 0.21041658520698547, "learning_rate": 2.4909823730002203e-05, "loss": 0.0088, "step": 7960 }, { "epoch": 13.694158075601374, "grad_norm": 0.2167925089597702, "learning_rate": 2.478703173915497e-05, "loss": 0.0097, "step": 7970 }, { "epoch": 13.711340206185566, "grad_norm": 0.16766490042209625, "learning_rate": 2.4664443324679653e-05, "loss": 0.0059, "step": 7980 }, { "epoch": 13.728522336769759, "grad_norm": 0.16163405776023865, "learning_rate": 2.454205947638852e-05, "loss": 0.0122, "step": 7990 }, { "epoch": 13.745704467353953, "grad_norm": 0.2345849871635437, "learning_rate": 2.4419881182442038e-05, "loss": 0.0115, "step": 8000 }, { "epoch": 13.762886597938145, "grad_norm": 0.10330498963594437, "learning_rate": 2.429790942934106e-05, "loss": 0.0097, "step": 8010 }, { "epoch": 13.780068728522338, "grad_norm": 0.1268969476222992, "learning_rate": 2.4176145201918726e-05, "loss": 0.0094, "step": 8020 }, { "epoch": 13.79725085910653, "grad_norm": 0.160488098859787, "learning_rate": 2.4054589483332597e-05, "loss": 0.0067, "step": 8030 }, { "epoch": 13.814432989690722, "grad_norm": 0.26570194959640503, "learning_rate": 2.3933243255056597e-05, "loss": 0.0092, "step": 8040 }, { "epoch": 13.831615120274915, "grad_norm": 0.3354252278804779, "learning_rate": 2.3812107496873248e-05, "loss": 0.0101, "step": 8050 }, { "epoch": 13.848797250859107, "grad_norm": 0.1483275294303894, "learning_rate": 2.3691183186865668e-05, "loss": 0.0101, "step": 8060 }, { "epoch": 13.8659793814433, "grad_norm": 0.26341909170150757, "learning_rate": 2.3570471301409618e-05, "loss": 0.0097, "step": 8070 }, { "epoch": 13.883161512027492, "grad_norm": 0.16232207417488098, "learning_rate": 2.3449972815165773e-05, "loss": 0.0154, "step": 8080 }, { "epoch": 13.900343642611684, "grad_norm": 0.19188156723976135, "learning_rate": 2.332968870107171e-05, "loss": 0.0069, "step": 8090 }, { "epoch": 13.917525773195877, "grad_norm": 0.14537520706653595, "learning_rate": 2.320961993033415e-05, "loss": 0.0079, "step": 8100 }, { "epoch": 13.934707903780069, "grad_norm": 0.10598124563694, "learning_rate": 2.3089767472421e-05, "loss": 0.0117, "step": 8110 }, { "epoch": 13.951890034364261, "grad_norm": 0.15896451473236084, "learning_rate": 2.297013229505367e-05, "loss": 0.0134, "step": 8120 }, { "epoch": 13.969072164948454, "grad_norm": 0.21453918516635895, "learning_rate": 2.285071536419916e-05, "loss": 0.0091, "step": 8130 }, { "epoch": 13.986254295532646, "grad_norm": 0.17623427510261536, "learning_rate": 2.2731517644062312e-05, "loss": 0.012, "step": 8140 }, { "epoch": 14.003436426116838, "grad_norm": 0.10579323023557663, "learning_rate": 2.2612540097077935e-05, "loss": 0.0067, "step": 8150 }, { "epoch": 14.02061855670103, "grad_norm": 0.1269347220659256, "learning_rate": 2.2493783683903185e-05, "loss": 0.0092, "step": 8160 }, { "epoch": 14.037800687285223, "grad_norm": 0.11808303743600845, "learning_rate": 2.237524936340963e-05, "loss": 0.0105, "step": 8170 }, { "epoch": 14.054982817869416, "grad_norm": 0.10431456565856934, "learning_rate": 2.2256938092675722e-05, "loss": 0.0069, "step": 8180 }, { "epoch": 14.072164948453608, "grad_norm": 0.3295063078403473, "learning_rate": 2.213885082697883e-05, "loss": 0.0107, "step": 8190 }, { "epoch": 14.0893470790378, "grad_norm": 0.19644266366958618, "learning_rate": 2.2020988519787733e-05, "loss": 0.0109, "step": 8200 }, { "epoch": 14.106529209621993, "grad_norm": 0.14295251667499542, "learning_rate": 2.1903352122754732e-05, "loss": 0.0095, "step": 8210 }, { "epoch": 14.123711340206185, "grad_norm": 0.1610773205757141, "learning_rate": 2.178594258570822e-05, "loss": 0.0092, "step": 8220 }, { "epoch": 14.140893470790378, "grad_norm": 0.18880592286586761, "learning_rate": 2.1668760856644703e-05, "loss": 0.0082, "step": 8230 }, { "epoch": 14.15807560137457, "grad_norm": 0.1384887844324112, "learning_rate": 2.1551807881721425e-05, "loss": 0.0087, "step": 8240 }, { "epoch": 14.175257731958762, "grad_norm": 0.19572345912456512, "learning_rate": 2.1435084605248484e-05, "loss": 0.0122, "step": 8250 }, { "epoch": 14.192439862542955, "grad_norm": 0.11073683947324753, "learning_rate": 2.131859196968149e-05, "loss": 0.0079, "step": 8260 }, { "epoch": 14.209621993127147, "grad_norm": 0.1309373676776886, "learning_rate": 2.1202330915613638e-05, "loss": 0.0089, "step": 8270 }, { "epoch": 14.22680412371134, "grad_norm": 0.11186233907938004, "learning_rate": 2.1086302381768385e-05, "loss": 0.0109, "step": 8280 }, { "epoch": 14.243986254295532, "grad_norm": 0.23258423805236816, "learning_rate": 2.0970507304991656e-05, "loss": 0.0145, "step": 8290 }, { "epoch": 14.261168384879726, "grad_norm": 0.195637047290802, "learning_rate": 2.0854946620244502e-05, "loss": 0.0054, "step": 8300 }, { "epoch": 14.278350515463918, "grad_norm": 0.17508986592292786, "learning_rate": 2.0739621260595315e-05, "loss": 0.0071, "step": 8310 }, { "epoch": 14.29553264604811, "grad_norm": 0.07197950035333633, "learning_rate": 2.0624532157212483e-05, "loss": 0.005, "step": 8320 }, { "epoch": 14.312714776632303, "grad_norm": 0.167429119348526, "learning_rate": 2.0509680239356728e-05, "loss": 0.0119, "step": 8330 }, { "epoch": 14.329896907216495, "grad_norm": 0.1402851641178131, "learning_rate": 2.0395066434373767e-05, "loss": 0.0142, "step": 8340 }, { "epoch": 14.347079037800688, "grad_norm": 0.08017238229513168, "learning_rate": 2.028069166768663e-05, "loss": 0.0086, "step": 8350 }, { "epoch": 14.36426116838488, "grad_norm": 0.3312987685203552, "learning_rate": 2.016655686278836e-05, "loss": 0.0123, "step": 8360 }, { "epoch": 14.381443298969073, "grad_norm": 0.1905941665172577, "learning_rate": 2.005266294123443e-05, "loss": 0.0119, "step": 8370 }, { "epoch": 14.398625429553265, "grad_norm": 0.17473655939102173, "learning_rate": 1.9939010822635384e-05, "loss": 0.0098, "step": 8380 }, { "epoch": 14.415807560137457, "grad_norm": 0.2841387689113617, "learning_rate": 1.982560142464939e-05, "loss": 0.007, "step": 8390 }, { "epoch": 14.43298969072165, "grad_norm": 0.15247130393981934, "learning_rate": 1.9712435662974816e-05, "loss": 0.008, "step": 8400 }, { "epoch": 14.450171821305842, "grad_norm": 0.11507634073495865, "learning_rate": 1.9599514451342816e-05, "loss": 0.0076, "step": 8410 }, { "epoch": 14.467353951890034, "grad_norm": 0.20362359285354614, "learning_rate": 1.9486838701510012e-05, "loss": 0.0072, "step": 8420 }, { "epoch": 14.484536082474227, "grad_norm": 0.30303680896759033, "learning_rate": 1.937440932325112e-05, "loss": 0.0061, "step": 8430 }, { "epoch": 14.50171821305842, "grad_norm": 0.18510393798351288, "learning_rate": 1.926222722435152e-05, "loss": 0.008, "step": 8440 }, { "epoch": 14.518900343642612, "grad_norm": 0.20415428280830383, "learning_rate": 1.9150293310600042e-05, "loss": 0.0094, "step": 8450 }, { "epoch": 14.536082474226804, "grad_norm": 0.17331425845623016, "learning_rate": 1.903860848578159e-05, "loss": 0.006, "step": 8460 }, { "epoch": 14.553264604810996, "grad_norm": 0.18844066560268402, "learning_rate": 1.8927173651669877e-05, "loss": 0.008, "step": 8470 }, { "epoch": 14.570446735395189, "grad_norm": 0.14389431476593018, "learning_rate": 1.8815989708020055e-05, "loss": 0.0097, "step": 8480 }, { "epoch": 14.587628865979381, "grad_norm": 0.10053612291812897, "learning_rate": 1.8705057552561595e-05, "loss": 0.0081, "step": 8490 }, { "epoch": 14.604810996563574, "grad_norm": 0.1731092631816864, "learning_rate": 1.8594378080990915e-05, "loss": 0.0077, "step": 8500 }, { "epoch": 14.621993127147766, "grad_norm": 0.1470867097377777, "learning_rate": 1.8483952186964237e-05, "loss": 0.0165, "step": 8510 }, { "epoch": 14.639175257731958, "grad_norm": 0.1546664535999298, "learning_rate": 1.8373780762090266e-05, "loss": 0.012, "step": 8520 }, { "epoch": 14.65635738831615, "grad_norm": 0.2409757524728775, "learning_rate": 1.82638646959231e-05, "loss": 0.0074, "step": 8530 }, { "epoch": 14.673539518900343, "grad_norm": 0.1771342009305954, "learning_rate": 1.8154204875955e-05, "loss": 0.013, "step": 8540 }, { "epoch": 14.690721649484535, "grad_norm": 0.11487053334712982, "learning_rate": 1.804480218760922e-05, "loss": 0.0074, "step": 8550 }, { "epoch": 14.707903780068728, "grad_norm": 0.18041536211967468, "learning_rate": 1.793565751423284e-05, "loss": 0.0074, "step": 8560 }, { "epoch": 14.72508591065292, "grad_norm": 0.1591220200061798, "learning_rate": 1.782677173708968e-05, "loss": 0.0066, "step": 8570 }, { "epoch": 14.742268041237114, "grad_norm": 0.21568642556667328, "learning_rate": 1.771814573535317e-05, "loss": 0.0079, "step": 8580 }, { "epoch": 14.759450171821307, "grad_norm": 0.12885995209217072, "learning_rate": 1.7609780386099234e-05, "loss": 0.0092, "step": 8590 }, { "epoch": 14.776632302405499, "grad_norm": 0.158493772149086, "learning_rate": 1.750167656429918e-05, "loss": 0.0073, "step": 8600 }, { "epoch": 14.793814432989691, "grad_norm": 0.14277073740959167, "learning_rate": 1.739383514281273e-05, "loss": 0.0096, "step": 8610 }, { "epoch": 14.810996563573884, "grad_norm": 0.1386091113090515, "learning_rate": 1.7286256992380888e-05, "loss": 0.0069, "step": 8620 }, { "epoch": 14.828178694158076, "grad_norm": 0.12749871611595154, "learning_rate": 1.7178942981618945e-05, "loss": 0.0127, "step": 8630 }, { "epoch": 14.845360824742269, "grad_norm": 0.18113838136196136, "learning_rate": 1.707189397700943e-05, "loss": 0.0057, "step": 8640 }, { "epoch": 14.862542955326461, "grad_norm": 0.2324298769235611, "learning_rate": 1.696511084289516e-05, "loss": 0.0078, "step": 8650 }, { "epoch": 14.879725085910653, "grad_norm": 0.09419187903404236, "learning_rate": 1.6858594441472242e-05, "loss": 0.0049, "step": 8660 }, { "epoch": 14.896907216494846, "grad_norm": 0.17267560958862305, "learning_rate": 1.6752345632783135e-05, "loss": 0.0081, "step": 8670 }, { "epoch": 14.914089347079038, "grad_norm": 0.24412667751312256, "learning_rate": 1.664636527470961e-05, "loss": 0.0075, "step": 8680 }, { "epoch": 14.93127147766323, "grad_norm": 0.10746461153030396, "learning_rate": 1.6540654222965973e-05, "loss": 0.0081, "step": 8690 }, { "epoch": 14.948453608247423, "grad_norm": 0.2459520846605301, "learning_rate": 1.6435213331092027e-05, "loss": 0.0083, "step": 8700 }, { "epoch": 14.965635738831615, "grad_norm": 0.26834601163864136, "learning_rate": 1.6330043450446265e-05, "loss": 0.0061, "step": 8710 }, { "epoch": 14.982817869415808, "grad_norm": 0.20477992296218872, "learning_rate": 1.6225145430198918e-05, "loss": 0.008, "step": 8720 }, { "epoch": 15.0, "grad_norm": 0.19616572558879852, "learning_rate": 1.6120520117325184e-05, "loss": 0.0074, "step": 8730 }, { "epoch": 15.017182130584192, "grad_norm": 0.14601653814315796, "learning_rate": 1.6016168356598343e-05, "loss": 0.005, "step": 8740 }, { "epoch": 15.034364261168385, "grad_norm": 0.11024006456136703, "learning_rate": 1.59120909905829e-05, "loss": 0.0068, "step": 8750 }, { "epoch": 15.051546391752577, "grad_norm": 0.21115554869174957, "learning_rate": 1.580828885962787e-05, "loss": 0.0092, "step": 8760 }, { "epoch": 15.06872852233677, "grad_norm": 0.23726613819599152, "learning_rate": 1.5704762801859916e-05, "loss": 0.0091, "step": 8770 }, { "epoch": 15.085910652920962, "grad_norm": 0.08648664504289627, "learning_rate": 1.560151365317665e-05, "loss": 0.0112, "step": 8780 }, { "epoch": 15.103092783505154, "grad_norm": 0.14515356719493866, "learning_rate": 1.549854224723978e-05, "loss": 0.0142, "step": 8790 }, { "epoch": 15.120274914089347, "grad_norm": 0.28362399339675903, "learning_rate": 1.5395849415468505e-05, "loss": 0.0138, "step": 8800 }, { "epoch": 15.137457044673539, "grad_norm": 0.16832397878170013, "learning_rate": 1.529343598703267e-05, "loss": 0.0146, "step": 8810 }, { "epoch": 15.154639175257731, "grad_norm": 0.16828805208206177, "learning_rate": 1.5191302788846256e-05, "loss": 0.0061, "step": 8820 }, { "epoch": 15.171821305841924, "grad_norm": 0.12565237283706665, "learning_rate": 1.508945064556047e-05, "loss": 0.0054, "step": 8830 }, { "epoch": 15.189003436426116, "grad_norm": 0.06071079894900322, "learning_rate": 1.498788037955728e-05, "loss": 0.0131, "step": 8840 }, { "epoch": 15.206185567010309, "grad_norm": 0.10003683716058731, "learning_rate": 1.4886592810942629e-05, "loss": 0.0128, "step": 8850 }, { "epoch": 15.223367697594501, "grad_norm": 0.30496570467948914, "learning_rate": 1.4785588757539991e-05, "loss": 0.0083, "step": 8860 }, { "epoch": 15.240549828178693, "grad_norm": 0.24895748496055603, "learning_rate": 1.4684869034883554e-05, "loss": 0.0106, "step": 8870 }, { "epoch": 15.257731958762886, "grad_norm": 0.21190665662288666, "learning_rate": 1.458443445621182e-05, "loss": 0.0116, "step": 8880 }, { "epoch": 15.27491408934708, "grad_norm": 0.29413625597953796, "learning_rate": 1.448428583246088e-05, "loss": 0.0078, "step": 8890 }, { "epoch": 15.292096219931272, "grad_norm": 0.11148206889629364, "learning_rate": 1.4384423972258055e-05, "loss": 0.0073, "step": 8900 }, { "epoch": 15.309278350515465, "grad_norm": 0.12171918898820877, "learning_rate": 1.4284849681915158e-05, "loss": 0.0073, "step": 8910 }, { "epoch": 15.326460481099657, "grad_norm": 0.13666512072086334, "learning_rate": 1.4185563765422155e-05, "loss": 0.0072, "step": 8920 }, { "epoch": 15.34364261168385, "grad_norm": 0.10317881405353546, "learning_rate": 1.4086567024440527e-05, "loss": 0.0059, "step": 8930 }, { "epoch": 15.360824742268042, "grad_norm": 0.12942399084568024, "learning_rate": 1.398786025829698e-05, "loss": 0.006, "step": 8940 }, { "epoch": 15.378006872852234, "grad_norm": 0.22688932716846466, "learning_rate": 1.3889444263976786e-05, "loss": 0.0074, "step": 8950 }, { "epoch": 15.395189003436426, "grad_norm": 0.184346541762352, "learning_rate": 1.3791319836117506e-05, "loss": 0.0094, "step": 8960 }, { "epoch": 15.412371134020619, "grad_norm": 0.3060619533061981, "learning_rate": 1.3693487767002445e-05, "loss": 0.0066, "step": 8970 }, { "epoch": 15.429553264604811, "grad_norm": 0.2948829233646393, "learning_rate": 1.3595948846554446e-05, "loss": 0.0098, "step": 8980 }, { "epoch": 15.446735395189004, "grad_norm": 0.22396834194660187, "learning_rate": 1.3498703862329254e-05, "loss": 0.0088, "step": 8990 }, { "epoch": 15.463917525773196, "grad_norm": 0.10084215551614761, "learning_rate": 1.3401753599509397e-05, "loss": 0.0075, "step": 9000 }, { "epoch": 15.481099656357388, "grad_norm": 0.24087756872177124, "learning_rate": 1.3305098840897646e-05, "loss": 0.0113, "step": 9010 }, { "epoch": 15.49828178694158, "grad_norm": 0.10929588228464127, "learning_rate": 1.3208740366910904e-05, "loss": 0.0079, "step": 9020 }, { "epoch": 15.515463917525773, "grad_norm": 0.10467175394296646, "learning_rate": 1.3112678955573693e-05, "loss": 0.0078, "step": 9030 }, { "epoch": 15.532646048109966, "grad_norm": 0.16169194877147675, "learning_rate": 1.3016915382512029e-05, "loss": 0.0073, "step": 9040 }, { "epoch": 15.549828178694158, "grad_norm": 0.23535805940628052, "learning_rate": 1.2921450420947057e-05, "loss": 0.009, "step": 9050 }, { "epoch": 15.56701030927835, "grad_norm": 0.14566457271575928, "learning_rate": 1.2826284841688885e-05, "loss": 0.0091, "step": 9060 }, { "epoch": 15.584192439862543, "grad_norm": 0.05960577726364136, "learning_rate": 1.2731419413130325e-05, "loss": 0.0077, "step": 9070 }, { "epoch": 15.601374570446735, "grad_norm": 0.12475630640983582, "learning_rate": 1.2636854901240681e-05, "loss": 0.0071, "step": 9080 }, { "epoch": 15.618556701030927, "grad_norm": 0.08590589463710785, "learning_rate": 1.2542592069559556e-05, "loss": 0.0058, "step": 9090 }, { "epoch": 15.63573883161512, "grad_norm": 0.13284145295619965, "learning_rate": 1.2448631679190736e-05, "loss": 0.0061, "step": 9100 }, { "epoch": 15.652920962199312, "grad_norm": 0.13162577152252197, "learning_rate": 1.2354974488796017e-05, "loss": 0.0059, "step": 9110 }, { "epoch": 15.670103092783505, "grad_norm": 0.09567181766033173, "learning_rate": 1.2261621254589022e-05, "loss": 0.0066, "step": 9120 }, { "epoch": 15.687285223367697, "grad_norm": 0.0930383950471878, "learning_rate": 1.2168572730329214e-05, "loss": 0.0095, "step": 9130 }, { "epoch": 15.70446735395189, "grad_norm": 0.09626749902963638, "learning_rate": 1.2075829667315708e-05, "loss": 0.0073, "step": 9140 }, { "epoch": 15.721649484536082, "grad_norm": 0.18719489872455597, "learning_rate": 1.1983392814381273e-05, "loss": 0.007, "step": 9150 }, { "epoch": 15.738831615120276, "grad_norm": 0.1957959532737732, "learning_rate": 1.1891262917886198e-05, "loss": 0.0083, "step": 9160 }, { "epoch": 15.756013745704468, "grad_norm": 0.21053771674633026, "learning_rate": 1.1799440721712368e-05, "loss": 0.0083, "step": 9170 }, { "epoch": 15.77319587628866, "grad_norm": 0.2002599984407425, "learning_rate": 1.170792696725721e-05, "loss": 0.0065, "step": 9180 }, { "epoch": 15.790378006872853, "grad_norm": 0.19797247648239136, "learning_rate": 1.1616722393427704e-05, "loss": 0.0062, "step": 9190 }, { "epoch": 15.807560137457045, "grad_norm": 0.24263660609722137, "learning_rate": 1.1525827736634398e-05, "loss": 0.0067, "step": 9200 }, { "epoch": 15.824742268041238, "grad_norm": 0.30493855476379395, "learning_rate": 1.1435243730785511e-05, "loss": 0.0128, "step": 9210 }, { "epoch": 15.84192439862543, "grad_norm": 0.09943480044603348, "learning_rate": 1.1344971107280978e-05, "loss": 0.0042, "step": 9220 }, { "epoch": 15.859106529209622, "grad_norm": 0.08174432069063187, "learning_rate": 1.125501059500656e-05, "loss": 0.0062, "step": 9230 }, { "epoch": 15.876288659793815, "grad_norm": 0.15180669724941254, "learning_rate": 1.1165362920327898e-05, "loss": 0.012, "step": 9240 }, { "epoch": 15.893470790378007, "grad_norm": 0.09688429534435272, "learning_rate": 1.1076028807084748e-05, "loss": 0.0074, "step": 9250 }, { "epoch": 15.9106529209622, "grad_norm": 0.0951455757021904, "learning_rate": 1.0987008976585073e-05, "loss": 0.0083, "step": 9260 }, { "epoch": 15.927835051546392, "grad_norm": 0.08295347541570663, "learning_rate": 1.0898304147599231e-05, "loss": 0.0066, "step": 9270 }, { "epoch": 15.945017182130584, "grad_norm": 0.20688819885253906, "learning_rate": 1.0809915036354152e-05, "loss": 0.0095, "step": 9280 }, { "epoch": 15.962199312714777, "grad_norm": 0.09953152388334274, "learning_rate": 1.0721842356527595e-05, "loss": 0.0052, "step": 9290 }, { "epoch": 15.97938144329897, "grad_norm": 0.27744606137275696, "learning_rate": 1.063408681924236e-05, "loss": 0.0105, "step": 9300 }, { "epoch": 15.996563573883162, "grad_norm": 0.217054545879364, "learning_rate": 1.0546649133060583e-05, "loss": 0.0076, "step": 9310 }, { "epoch": 16.013745704467354, "grad_norm": 0.15550248324871063, "learning_rate": 1.0459530003977908e-05, "loss": 0.0195, "step": 9320 }, { "epoch": 16.030927835051546, "grad_norm": 0.3384007513523102, "learning_rate": 1.0372730135417936e-05, "loss": 0.0066, "step": 9330 }, { "epoch": 16.04810996563574, "grad_norm": 0.23240487277507782, "learning_rate": 1.0286250228226434e-05, "loss": 0.0064, "step": 9340 }, { "epoch": 16.06529209621993, "grad_norm": 0.13849616050720215, "learning_rate": 1.0200090980665739e-05, "loss": 0.006, "step": 9350 }, { "epoch": 16.082474226804123, "grad_norm": 0.09946257621049881, "learning_rate": 1.0114253088409054e-05, "loss": 0.0058, "step": 9360 }, { "epoch": 16.099656357388316, "grad_norm": 0.17231334745883942, "learning_rate": 1.0028737244534914e-05, "loss": 0.0123, "step": 9370 }, { "epoch": 16.116838487972508, "grad_norm": 0.23527809977531433, "learning_rate": 9.943544139521521e-06, "loss": 0.0047, "step": 9380 }, { "epoch": 16.1340206185567, "grad_norm": 0.11022651195526123, "learning_rate": 9.858674461241229e-06, "loss": 0.0127, "step": 9390 }, { "epoch": 16.151202749140893, "grad_norm": 0.09669523686170578, "learning_rate": 9.774128894954904e-06, "loss": 0.0137, "step": 9400 }, { "epoch": 16.168384879725085, "grad_norm": 0.21540790796279907, "learning_rate": 9.68990812330648e-06, "loss": 0.0063, "step": 9410 }, { "epoch": 16.185567010309278, "grad_norm": 0.10020115226507187, "learning_rate": 9.606012826317417e-06, "loss": 0.0051, "step": 9420 }, { "epoch": 16.20274914089347, "grad_norm": 0.21819765865802765, "learning_rate": 9.522443681381172e-06, "loss": 0.0073, "step": 9430 }, { "epoch": 16.219931271477662, "grad_norm": 0.15638329088687897, "learning_rate": 9.439201363257778e-06, "loss": 0.008, "step": 9440 }, { "epoch": 16.237113402061855, "grad_norm": 0.23262475430965424, "learning_rate": 9.356286544068394e-06, "loss": 0.0093, "step": 9450 }, { "epoch": 16.254295532646047, "grad_norm": 0.05870979651808739, "learning_rate": 9.273699893289862e-06, "loss": 0.0041, "step": 9460 }, { "epoch": 16.27147766323024, "grad_norm": 0.1267542541027069, "learning_rate": 9.191442077749257e-06, "loss": 0.0068, "step": 9470 }, { "epoch": 16.288659793814432, "grad_norm": 0.11667685955762863, "learning_rate": 9.10951376161861e-06, "loss": 0.0037, "step": 9480 }, { "epoch": 16.305841924398624, "grad_norm": 0.13009728491306305, "learning_rate": 9.027915606409427e-06, "loss": 0.0088, "step": 9490 }, { "epoch": 16.323024054982817, "grad_norm": 0.2052125632762909, "learning_rate": 8.946648270967473e-06, "loss": 0.0058, "step": 9500 }, { "epoch": 16.34020618556701, "grad_norm": 0.06314068287611008, "learning_rate": 8.86571241146732e-06, "loss": 0.0074, "step": 9510 }, { "epoch": 16.3573883161512, "grad_norm": 0.1686325967311859, "learning_rate": 8.785108681407156e-06, "loss": 0.0075, "step": 9520 }, { "epoch": 16.374570446735394, "grad_norm": 0.07260609418153763, "learning_rate": 8.704837731603415e-06, "loss": 0.0055, "step": 9530 }, { "epoch": 16.391752577319586, "grad_norm": 0.1643422245979309, "learning_rate": 8.624900210185648e-06, "loss": 0.0089, "step": 9540 }, { "epoch": 16.40893470790378, "grad_norm": 0.20184507966041565, "learning_rate": 8.545296762591144e-06, "loss": 0.01, "step": 9550 }, { "epoch": 16.42611683848797, "grad_norm": 0.07979606091976166, "learning_rate": 8.466028031559836e-06, "loss": 0.0054, "step": 9560 }, { "epoch": 16.443298969072163, "grad_norm": 0.10305003076791763, "learning_rate": 8.387094657129013e-06, "loss": 0.0101, "step": 9570 }, { "epoch": 16.460481099656356, "grad_norm": 0.14057329297065735, "learning_rate": 8.308497276628279e-06, "loss": 0.0038, "step": 9580 }, { "epoch": 16.477663230240548, "grad_norm": 0.20156535506248474, "learning_rate": 8.230236524674256e-06, "loss": 0.0077, "step": 9590 }, { "epoch": 16.49484536082474, "grad_norm": 0.12216826528310776, "learning_rate": 8.152313033165592e-06, "loss": 0.0114, "step": 9600 }, { "epoch": 16.512027491408936, "grad_norm": 0.22437122464179993, "learning_rate": 8.074727431277745e-06, "loss": 0.006, "step": 9610 }, { "epoch": 16.52920962199313, "grad_norm": 0.21979045867919922, "learning_rate": 7.99748034545803e-06, "loss": 0.0063, "step": 9620 }, { "epoch": 16.54639175257732, "grad_norm": 0.13207381963729858, "learning_rate": 7.920572399420428e-06, "loss": 0.0069, "step": 9630 }, { "epoch": 16.563573883161514, "grad_norm": 0.3022846579551697, "learning_rate": 7.844004214140665e-06, "loss": 0.0051, "step": 9640 }, { "epoch": 16.580756013745706, "grad_norm": 0.16219815611839294, "learning_rate": 7.76777640785108e-06, "loss": 0.0094, "step": 9650 }, { "epoch": 16.5979381443299, "grad_norm": 0.2360786646604538, "learning_rate": 7.691889596035784e-06, "loss": 0.0075, "step": 9660 }, { "epoch": 16.61512027491409, "grad_norm": 0.2144838273525238, "learning_rate": 7.616344391425534e-06, "loss": 0.0158, "step": 9670 }, { "epoch": 16.632302405498283, "grad_norm": 0.17959503829479218, "learning_rate": 7.541141403992902e-06, "loss": 0.0168, "step": 9680 }, { "epoch": 16.649484536082475, "grad_norm": 0.06356091052293777, "learning_rate": 7.4662812409472705e-06, "loss": 0.0054, "step": 9690 }, { "epoch": 16.666666666666668, "grad_norm": 0.16825449466705322, "learning_rate": 7.391764506729992e-06, "loss": 0.0133, "step": 9700 }, { "epoch": 16.68384879725086, "grad_norm": 0.06855875998735428, "learning_rate": 7.317591803009472e-06, "loss": 0.0053, "step": 9710 }, { "epoch": 16.701030927835053, "grad_norm": 0.07593529671430588, "learning_rate": 7.243763728676328e-06, "loss": 0.0102, "step": 9720 }, { "epoch": 16.718213058419245, "grad_norm": 0.2639977037906647, "learning_rate": 7.170280879838515e-06, "loss": 0.0063, "step": 9730 }, { "epoch": 16.735395189003437, "grad_norm": 0.1285523623228073, "learning_rate": 7.097143849816584e-06, "loss": 0.0066, "step": 9740 }, { "epoch": 16.75257731958763, "grad_norm": 0.07977993786334991, "learning_rate": 7.024353229138836e-06, "loss": 0.0098, "step": 9750 }, { "epoch": 16.769759450171822, "grad_norm": 0.08578211069107056, "learning_rate": 6.951909605536544e-06, "loss": 0.0085, "step": 9760 }, { "epoch": 16.786941580756015, "grad_norm": 0.12852418422698975, "learning_rate": 6.879813563939269e-06, "loss": 0.0053, "step": 9770 }, { "epoch": 16.804123711340207, "grad_norm": 0.16339828073978424, "learning_rate": 6.808065686470083e-06, "loss": 0.0065, "step": 9780 }, { "epoch": 16.8213058419244, "grad_norm": 0.14644475281238556, "learning_rate": 6.736666552440896e-06, "loss": 0.0091, "step": 9790 }, { "epoch": 16.83848797250859, "grad_norm": 0.1779240071773529, "learning_rate": 6.665616738347741e-06, "loss": 0.0051, "step": 9800 }, { "epoch": 16.855670103092784, "grad_norm": 0.2106473445892334, "learning_rate": 6.5949168178661755e-06, "loss": 0.0063, "step": 9810 }, { "epoch": 16.872852233676976, "grad_norm": 0.09443774074316025, "learning_rate": 6.524567361846612e-06, "loss": 0.007, "step": 9820 }, { "epoch": 16.89003436426117, "grad_norm": 0.1403152197599411, "learning_rate": 6.454568938309724e-06, "loss": 0.0055, "step": 9830 }, { "epoch": 16.90721649484536, "grad_norm": 0.2038644254207611, "learning_rate": 6.384922112441821e-06, "loss": 0.0047, "step": 9840 }, { "epoch": 16.924398625429554, "grad_norm": 0.22020995616912842, "learning_rate": 6.315627446590367e-06, "loss": 0.0052, "step": 9850 }, { "epoch": 16.941580756013746, "grad_norm": 0.07635272294282913, "learning_rate": 6.246685500259352e-06, "loss": 0.0073, "step": 9860 }, { "epoch": 16.95876288659794, "grad_norm": 0.18454794585704803, "learning_rate": 6.1780968301048406e-06, "loss": 0.0072, "step": 9870 }, { "epoch": 16.97594501718213, "grad_norm": 0.3649043142795563, "learning_rate": 6.10986198993041e-06, "loss": 0.0048, "step": 9880 }, { "epoch": 16.993127147766323, "grad_norm": 0.09747838973999023, "learning_rate": 6.041981530682756e-06, "loss": 0.0071, "step": 9890 }, { "epoch": 17.010309278350515, "grad_norm": 0.170127272605896, "learning_rate": 5.9744560004471874e-06, "loss": 0.0089, "step": 9900 }, { "epoch": 17.027491408934708, "grad_norm": 0.11401594430208206, "learning_rate": 5.907285944443241e-06, "loss": 0.0064, "step": 9910 }, { "epoch": 17.0446735395189, "grad_norm": 0.09278935939073563, "learning_rate": 5.840471905020223e-06, "loss": 0.0038, "step": 9920 }, { "epoch": 17.061855670103093, "grad_norm": 0.09396050870418549, "learning_rate": 5.774014421652879e-06, "loss": 0.0088, "step": 9930 }, { "epoch": 17.079037800687285, "grad_norm": 0.22349518537521362, "learning_rate": 5.707914030937045e-06, "loss": 0.0116, "step": 9940 }, { "epoch": 17.096219931271477, "grad_norm": 0.11371111124753952, "learning_rate": 5.642171266585272e-06, "loss": 0.0071, "step": 9950 }, { "epoch": 17.11340206185567, "grad_norm": 0.11897526681423187, "learning_rate": 5.576786659422534e-06, "loss": 0.0124, "step": 9960 }, { "epoch": 17.130584192439862, "grad_norm": 0.11207327991724014, "learning_rate": 5.51176073738196e-06, "loss": 0.0056, "step": 9970 }, { "epoch": 17.147766323024054, "grad_norm": 0.11493603140115738, "learning_rate": 5.447094025500554e-06, "loss": 0.0057, "step": 9980 }, { "epoch": 17.164948453608247, "grad_norm": 0.15954163670539856, "learning_rate": 5.3827870459149665e-06, "loss": 0.0062, "step": 9990 }, { "epoch": 17.18213058419244, "grad_norm": 0.14759565889835358, "learning_rate": 5.318840317857248e-06, "loss": 0.0072, "step": 10000 }, { "epoch": 17.19931271477663, "grad_norm": 0.12291218340396881, "learning_rate": 5.2552543576506965e-06, "loss": 0.0049, "step": 10010 }, { "epoch": 17.216494845360824, "grad_norm": 0.15934151411056519, "learning_rate": 5.192029678705679e-06, "loss": 0.0117, "step": 10020 }, { "epoch": 17.233676975945016, "grad_norm": 0.06648825109004974, "learning_rate": 5.1291667915154774e-06, "loss": 0.0056, "step": 10030 }, { "epoch": 17.25085910652921, "grad_norm": 0.08082503825426102, "learning_rate": 5.066666203652148e-06, "loss": 0.0076, "step": 10040 }, { "epoch": 17.2680412371134, "grad_norm": 0.06852155178785324, "learning_rate": 5.004528419762455e-06, "loss": 0.0075, "step": 10050 }, { "epoch": 17.285223367697593, "grad_norm": 0.22112947702407837, "learning_rate": 4.9427539415638106e-06, "loss": 0.0064, "step": 10060 }, { "epoch": 17.302405498281786, "grad_norm": 0.11393307894468307, "learning_rate": 4.88134326784015e-06, "loss": 0.0054, "step": 10070 }, { "epoch": 17.31958762886598, "grad_norm": 0.07953692972660065, "learning_rate": 4.8202968944379865e-06, "loss": 0.0069, "step": 10080 }, { "epoch": 17.33676975945017, "grad_norm": 0.12307379394769669, "learning_rate": 4.759615314262361e-06, "loss": 0.0086, "step": 10090 }, { "epoch": 17.353951890034363, "grad_norm": 0.2868382930755615, "learning_rate": 4.6992990172728846e-06, "loss": 0.0067, "step": 10100 }, { "epoch": 17.371134020618555, "grad_norm": 0.13150477409362793, "learning_rate": 4.639348490479755e-06, "loss": 0.0052, "step": 10110 }, { "epoch": 17.388316151202748, "grad_norm": 0.13385729491710663, "learning_rate": 4.579764217939863e-06, "loss": 0.0085, "step": 10120 }, { "epoch": 17.40549828178694, "grad_norm": 0.05345318466424942, "learning_rate": 4.5205466807528294e-06, "loss": 0.0053, "step": 10130 }, { "epoch": 17.422680412371133, "grad_norm": 0.09881814569234848, "learning_rate": 4.4616963570572105e-06, "loss": 0.0037, "step": 10140 }, { "epoch": 17.439862542955325, "grad_norm": 0.15886837244033813, "learning_rate": 4.403213722026516e-06, "loss": 0.0094, "step": 10150 }, { "epoch": 17.457044673539517, "grad_norm": 0.046078938990831375, "learning_rate": 4.345099247865486e-06, "loss": 0.0066, "step": 10160 }, { "epoch": 17.47422680412371, "grad_norm": 0.31892022490501404, "learning_rate": 4.287353403806188e-06, "loss": 0.0071, "step": 10170 }, { "epoch": 17.491408934707902, "grad_norm": 0.23924608528614044, "learning_rate": 4.229976656104323e-06, "loss": 0.0065, "step": 10180 }, { "epoch": 17.508591065292094, "grad_norm": 0.10865090042352676, "learning_rate": 4.172969468035359e-06, "loss": 0.0059, "step": 10190 }, { "epoch": 17.52577319587629, "grad_norm": 0.10951688885688782, "learning_rate": 4.116332299890868e-06, "loss": 0.0114, "step": 10200 }, { "epoch": 17.542955326460483, "grad_norm": 0.08340981602668762, "learning_rate": 4.060065608974744e-06, "loss": 0.006, "step": 10210 }, { "epoch": 17.560137457044675, "grad_norm": 0.2836835980415344, "learning_rate": 4.0041698495996095e-06, "loss": 0.006, "step": 10220 }, { "epoch": 17.577319587628867, "grad_norm": 0.34195101261138916, "learning_rate": 3.948645473083018e-06, "loss": 0.0111, "step": 10230 }, { "epoch": 17.59450171821306, "grad_norm": 0.136323943734169, "learning_rate": 3.893492927743925e-06, "loss": 0.0095, "step": 10240 }, { "epoch": 17.611683848797252, "grad_norm": 0.19916993379592896, "learning_rate": 3.838712658898974e-06, "loss": 0.0065, "step": 10250 }, { "epoch": 17.628865979381445, "grad_norm": 0.10761203616857529, "learning_rate": 3.7843051088590153e-06, "loss": 0.0042, "step": 10260 }, { "epoch": 17.646048109965637, "grad_norm": 0.08006154000759125, "learning_rate": 3.730270716925394e-06, "loss": 0.0062, "step": 10270 }, { "epoch": 17.66323024054983, "grad_norm": 0.07104959338903427, "learning_rate": 3.67660991938652e-06, "loss": 0.005, "step": 10280 }, { "epoch": 17.68041237113402, "grad_norm": 0.19979846477508545, "learning_rate": 3.6233231495142626e-06, "loss": 0.0053, "step": 10290 }, { "epoch": 17.697594501718214, "grad_norm": 0.20565828680992126, "learning_rate": 3.5704108375605448e-06, "loss": 0.0055, "step": 10300 }, { "epoch": 17.714776632302407, "grad_norm": 0.17104719579219818, "learning_rate": 3.5178734107537637e-06, "loss": 0.0105, "step": 10310 }, { "epoch": 17.7319587628866, "grad_norm": 0.08832945674657822, "learning_rate": 3.4657112932954204e-06, "loss": 0.0056, "step": 10320 }, { "epoch": 17.74914089347079, "grad_norm": 0.053497232496738434, "learning_rate": 3.4139249063566415e-06, "loss": 0.0037, "step": 10330 }, { "epoch": 17.766323024054984, "grad_norm": 0.05350416153669357, "learning_rate": 3.36251466807484e-06, "loss": 0.0058, "step": 10340 }, { "epoch": 17.783505154639176, "grad_norm": 0.19712647795677185, "learning_rate": 3.311480993550259e-06, "loss": 0.0059, "step": 10350 }, { "epoch": 17.80068728522337, "grad_norm": 0.1699119359254837, "learning_rate": 3.2608242948427017e-06, "loss": 0.008, "step": 10360 }, { "epoch": 17.81786941580756, "grad_norm": 0.09976794570684433, "learning_rate": 3.2105449809681334e-06, "loss": 0.01, "step": 10370 }, { "epoch": 17.835051546391753, "grad_norm": 0.12520617246627808, "learning_rate": 3.160643457895435e-06, "loss": 0.0057, "step": 10380 }, { "epoch": 17.852233676975946, "grad_norm": 0.23560845851898193, "learning_rate": 3.111120128543088e-06, "loss": 0.0127, "step": 10390 }, { "epoch": 17.869415807560138, "grad_norm": 0.134457528591156, "learning_rate": 3.0619753927759565e-06, "loss": 0.0067, "step": 10400 }, { "epoch": 17.88659793814433, "grad_norm": 0.20700249075889587, "learning_rate": 3.013209647401999e-06, "loss": 0.0053, "step": 10410 }, { "epoch": 17.903780068728523, "grad_norm": 0.1971113532781601, "learning_rate": 2.964823286169133e-06, "loss": 0.0073, "step": 10420 }, { "epoch": 17.920962199312715, "grad_norm": 0.2505052089691162, "learning_rate": 2.9168166997620263e-06, "loss": 0.0066, "step": 10430 }, { "epoch": 17.938144329896907, "grad_norm": 0.1978893280029297, "learning_rate": 2.869190275798911e-06, "loss": 0.0067, "step": 10440 }, { "epoch": 17.9553264604811, "grad_norm": 0.09993084520101547, "learning_rate": 2.821944398828519e-06, "loss": 0.005, "step": 10450 }, { "epoch": 17.972508591065292, "grad_norm": 0.0836280807852745, "learning_rate": 2.775079450326917e-06, "loss": 0.0076, "step": 10460 }, { "epoch": 17.989690721649485, "grad_norm": 0.20113223791122437, "learning_rate": 2.7285958086944786e-06, "loss": 0.0039, "step": 10470 }, { "epoch": 18.006872852233677, "grad_norm": 0.10885003954172134, "learning_rate": 2.6824938492527595e-06, "loss": 0.0093, "step": 10480 }, { "epoch": 18.02405498281787, "grad_norm": 0.05460460111498833, "learning_rate": 2.636773944241555e-06, "loss": 0.003, "step": 10490 }, { "epoch": 18.04123711340206, "grad_norm": 0.08809126913547516, "learning_rate": 2.5914364628158217e-06, "loss": 0.0044, "step": 10500 }, { "epoch": 18.058419243986254, "grad_norm": 0.1788802295923233, "learning_rate": 2.5464817710427414e-06, "loss": 0.0072, "step": 10510 }, { "epoch": 18.075601374570446, "grad_norm": 0.08905819058418274, "learning_rate": 2.501910231898724e-06, "loss": 0.0066, "step": 10520 }, { "epoch": 18.09278350515464, "grad_norm": 0.08635038882493973, "learning_rate": 2.457722205266516e-06, "loss": 0.0048, "step": 10530 }, { "epoch": 18.10996563573883, "grad_norm": 0.3244889974594116, "learning_rate": 2.413918047932284e-06, "loss": 0.0084, "step": 10540 }, { "epoch": 18.127147766323024, "grad_norm": 0.1447770893573761, "learning_rate": 2.370498113582731e-06, "loss": 0.0054, "step": 10550 }, { "epoch": 18.144329896907216, "grad_norm": 0.13908503949642181, "learning_rate": 2.327462752802212e-06, "loss": 0.0045, "step": 10560 }, { "epoch": 18.16151202749141, "grad_norm": 0.15273341536521912, "learning_rate": 2.2848123130699562e-06, "loss": 0.0074, "step": 10570 }, { "epoch": 18.1786941580756, "grad_norm": 0.09530580788850784, "learning_rate": 2.2425471387572337e-06, "loss": 0.0054, "step": 10580 }, { "epoch": 18.195876288659793, "grad_norm": 0.0629500150680542, "learning_rate": 2.2006675711245818e-06, "loss": 0.0053, "step": 10590 }, { "epoch": 18.213058419243985, "grad_norm": 0.13536952435970306, "learning_rate": 2.15917394831901e-06, "loss": 0.0076, "step": 10600 }, { "epoch": 18.230240549828178, "grad_norm": 0.1433546394109726, "learning_rate": 2.118066605371344e-06, "loss": 0.0062, "step": 10610 }, { "epoch": 18.24742268041237, "grad_norm": 0.07788240164518356, "learning_rate": 2.0773458741934646e-06, "loss": 0.0043, "step": 10620 }, { "epoch": 18.264604810996563, "grad_norm": 0.2006695419549942, "learning_rate": 2.0370120835756513e-06, "loss": 0.0064, "step": 10630 }, { "epoch": 18.281786941580755, "grad_norm": 0.09646660089492798, "learning_rate": 1.9970655591838917e-06, "loss": 0.0066, "step": 10640 }, { "epoch": 18.298969072164947, "grad_norm": 0.08818981796503067, "learning_rate": 1.9575066235573205e-06, "loss": 0.0061, "step": 10650 }, { "epoch": 18.31615120274914, "grad_norm": 0.10929910838603973, "learning_rate": 1.918335596105553e-06, "loss": 0.0054, "step": 10660 }, { "epoch": 18.333333333333332, "grad_norm": 0.23234823346138, "learning_rate": 1.8795527931061374e-06, "loss": 0.0088, "step": 10670 }, { "epoch": 18.350515463917525, "grad_norm": 0.06758160889148712, "learning_rate": 1.841158527701975e-06, "loss": 0.0074, "step": 10680 }, { "epoch": 18.367697594501717, "grad_norm": 0.186759814620018, "learning_rate": 1.8031531098988252e-06, "loss": 0.0064, "step": 10690 }, { "epoch": 18.38487972508591, "grad_norm": 0.12347927689552307, "learning_rate": 1.765536846562782e-06, "loss": 0.0066, "step": 10700 }, { "epoch": 18.4020618556701, "grad_norm": 0.05850611999630928, "learning_rate": 1.7283100414178078e-06, "loss": 0.0071, "step": 10710 }, { "epoch": 18.419243986254294, "grad_norm": 0.22844308614730835, "learning_rate": 1.6914729950432474e-06, "loss": 0.0052, "step": 10720 }, { "epoch": 18.436426116838486, "grad_norm": 0.23077848553657532, "learning_rate": 1.6550260048714628e-06, "loss": 0.0094, "step": 10730 }, { "epoch": 18.45360824742268, "grad_norm": 0.04616402089595795, "learning_rate": 1.6189693651853687e-06, "loss": 0.0079, "step": 10740 }, { "epoch": 18.47079037800687, "grad_norm": 0.1235852912068367, "learning_rate": 1.58330336711609e-06, "loss": 0.0055, "step": 10750 }, { "epoch": 18.487972508591064, "grad_norm": 0.06739755719900131, "learning_rate": 1.5480282986406136e-06, "loss": 0.0088, "step": 10760 }, { "epoch": 18.50515463917526, "grad_norm": 0.08690838515758514, "learning_rate": 1.5131444445794506e-06, "loss": 0.004, "step": 10770 }, { "epoch": 18.522336769759452, "grad_norm": 0.12189892679452896, "learning_rate": 1.4786520865943344e-06, "loss": 0.0053, "step": 10780 }, { "epoch": 18.539518900343644, "grad_norm": 0.10416446626186371, "learning_rate": 1.4445515031859591e-06, "loss": 0.0042, "step": 10790 }, { "epoch": 18.556701030927837, "grad_norm": 0.13094037771224976, "learning_rate": 1.4108429696917225e-06, "loss": 0.0063, "step": 10800 }, { "epoch": 18.57388316151203, "grad_norm": 0.07218027114868164, "learning_rate": 1.3775267582834928e-06, "loss": 0.0068, "step": 10810 }, { "epoch": 18.59106529209622, "grad_norm": 0.10305195301771164, "learning_rate": 1.34460313796545e-06, "loss": 0.0059, "step": 10820 }, { "epoch": 18.608247422680414, "grad_norm": 0.18571843206882477, "learning_rate": 1.31207237457186e-06, "loss": 0.0036, "step": 10830 }, { "epoch": 18.625429553264606, "grad_norm": 0.09622086584568024, "learning_rate": 1.2799347307649756e-06, "loss": 0.0031, "step": 10840 }, { "epoch": 18.6426116838488, "grad_norm": 0.06238604336977005, "learning_rate": 1.248190466032867e-06, "loss": 0.0041, "step": 10850 }, { "epoch": 18.65979381443299, "grad_norm": 0.07316755503416061, "learning_rate": 1.2168398366873946e-06, "loss": 0.0048, "step": 10860 }, { "epoch": 18.676975945017183, "grad_norm": 0.28695550560951233, "learning_rate": 1.1858830958620559e-06, "loss": 0.0064, "step": 10870 }, { "epoch": 18.694158075601376, "grad_norm": 0.18788480758666992, "learning_rate": 1.155320493510026e-06, "loss": 0.008, "step": 10880 }, { "epoch": 18.711340206185568, "grad_norm": 0.06678231060504913, "learning_rate": 1.1251522764020638e-06, "loss": 0.0058, "step": 10890 }, { "epoch": 18.72852233676976, "grad_norm": 0.0773329809308052, "learning_rate": 1.095378688124582e-06, "loss": 0.0093, "step": 10900 }, { "epoch": 18.745704467353953, "grad_norm": 0.12073255330324173, "learning_rate": 1.0659999690776302e-06, "loss": 0.0047, "step": 10910 }, { "epoch": 18.762886597938145, "grad_norm": 0.09898494184017181, "learning_rate": 1.0370163564729974e-06, "loss": 0.007, "step": 10920 }, { "epoch": 18.780068728522338, "grad_norm": 0.0713253766298294, "learning_rate": 1.008428084332247e-06, "loss": 0.0061, "step": 10930 }, { "epoch": 18.79725085910653, "grad_norm": 0.1340664029121399, "learning_rate": 9.802353834848953e-07, "loss": 0.0045, "step": 10940 }, { "epoch": 18.814432989690722, "grad_norm": 0.08532940596342087, "learning_rate": 9.524384815664699e-07, "loss": 0.0038, "step": 10950 }, { "epoch": 18.831615120274915, "grad_norm": 0.12806546688079834, "learning_rate": 9.250376030167429e-07, "loss": 0.009, "step": 10960 }, { "epoch": 18.848797250859107, "grad_norm": 0.07002566009759903, "learning_rate": 8.980329690778499e-07, "loss": 0.0035, "step": 10970 }, { "epoch": 18.8659793814433, "grad_norm": 0.09975923597812653, "learning_rate": 8.714247977925749e-07, "loss": 0.004, "step": 10980 }, { "epoch": 18.883161512027492, "grad_norm": 0.1369011402130127, "learning_rate": 8.452133040025345e-07, "loss": 0.004, "step": 10990 }, { "epoch": 18.900343642611684, "grad_norm": 0.16670043766498566, "learning_rate": 8.193986993464686e-07, "loss": 0.0049, "step": 11000 } ], "logging_steps": 10, "max_steps": 11638, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.7686827385256666e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }