{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.999290780141844, "eval_steps": 500, "global_step": 7049, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0070921985815602835, "grad_norm": 5.779086589813232, "learning_rate": 5.6657223796034e-06, "loss": 0.8629, "step": 10 }, { "epoch": 0.014184397163120567, "grad_norm": 4.5390448570251465, "learning_rate": 1.13314447592068e-05, "loss": 0.6688, "step": 20 }, { "epoch": 0.02127659574468085, "grad_norm": 3.5783498287200928, "learning_rate": 1.69971671388102e-05, "loss": 0.4349, "step": 30 }, { "epoch": 0.028368794326241134, "grad_norm": 2.7119619846343994, "learning_rate": 2.26628895184136e-05, "loss": 0.3312, "step": 40 }, { "epoch": 0.03546099290780142, "grad_norm": 1.4443682432174683, "learning_rate": 2.8328611898017e-05, "loss": 0.275, "step": 50 }, { "epoch": 0.0425531914893617, "grad_norm": 1.6015079021453857, "learning_rate": 3.39943342776204e-05, "loss": 0.2297, "step": 60 }, { "epoch": 0.04964539007092199, "grad_norm": 1.588321566581726, "learning_rate": 3.96600566572238e-05, "loss": 0.1544, "step": 70 }, { "epoch": 0.05673758865248227, "grad_norm": 2.757812738418579, "learning_rate": 4.53257790368272e-05, "loss": 0.1773, "step": 80 }, { "epoch": 0.06382978723404255, "grad_norm": 2.5903780460357666, "learning_rate": 5.09915014164306e-05, "loss": 0.1427, "step": 90 }, { "epoch": 0.07092198581560284, "grad_norm": 0.9456828236579895, "learning_rate": 5.6657223796034e-05, "loss": 0.1267, "step": 100 }, { "epoch": 0.07801418439716312, "grad_norm": 1.6622958183288574, "learning_rate": 6.23229461756374e-05, "loss": 0.13, "step": 110 }, { "epoch": 0.0851063829787234, "grad_norm": 1.7187730073928833, "learning_rate": 6.79886685552408e-05, "loss": 0.1246, "step": 120 }, { "epoch": 0.09219858156028368, "grad_norm": 1.5915329456329346, "learning_rate": 7.36543909348442e-05, "loss": 0.0941, "step": 130 }, { "epoch": 0.09929078014184398, "grad_norm": 1.4188594818115234, "learning_rate": 7.93201133144476e-05, "loss": 0.0948, "step": 140 }, { "epoch": 0.10638297872340426, "grad_norm": 1.5538837909698486, "learning_rate": 8.4985835694051e-05, "loss": 0.108, "step": 150 }, { "epoch": 0.11347517730496454, "grad_norm": 1.3440709114074707, "learning_rate": 9.06515580736544e-05, "loss": 0.0956, "step": 160 }, { "epoch": 0.12056737588652482, "grad_norm": 0.9447225332260132, "learning_rate": 9.631728045325779e-05, "loss": 0.0904, "step": 170 }, { "epoch": 0.1276595744680851, "grad_norm": 0.77945876121521, "learning_rate": 0.0001019830028328612, "loss": 0.0892, "step": 180 }, { "epoch": 0.1347517730496454, "grad_norm": 1.136725664138794, "learning_rate": 0.00010764872521246458, "loss": 0.0899, "step": 190 }, { "epoch": 0.14184397163120568, "grad_norm": 1.6664468050003052, "learning_rate": 0.000113314447592068, "loss": 0.0944, "step": 200 }, { "epoch": 0.14893617021276595, "grad_norm": 0.7696484923362732, "learning_rate": 0.00011898016997167138, "loss": 0.0924, "step": 210 }, { "epoch": 0.15602836879432624, "grad_norm": 0.918065071105957, "learning_rate": 0.0001246458923512748, "loss": 0.0853, "step": 220 }, { "epoch": 0.16312056737588654, "grad_norm": 1.1535277366638184, "learning_rate": 0.0001303116147308782, "loss": 0.0739, "step": 230 }, { "epoch": 0.1702127659574468, "grad_norm": 1.3063223361968994, "learning_rate": 0.0001359773371104816, "loss": 0.0791, "step": 240 }, { "epoch": 0.1773049645390071, "grad_norm": 0.9549968838691711, "learning_rate": 0.00014164305949008498, "loss": 0.075, "step": 250 }, { "epoch": 0.18439716312056736, "grad_norm": 0.7243878841400146, "learning_rate": 0.0001473087818696884, "loss": 0.0802, "step": 260 }, { "epoch": 0.19148936170212766, "grad_norm": 0.5810666084289551, "learning_rate": 0.0001529745042492918, "loss": 0.0748, "step": 270 }, { "epoch": 0.19858156028368795, "grad_norm": 0.7565228343009949, "learning_rate": 0.0001586402266288952, "loss": 0.0841, "step": 280 }, { "epoch": 0.20567375886524822, "grad_norm": 0.8032036423683167, "learning_rate": 0.00016430594900849858, "loss": 0.0621, "step": 290 }, { "epoch": 0.2127659574468085, "grad_norm": 0.8412207365036011, "learning_rate": 0.000169971671388102, "loss": 0.0815, "step": 300 }, { "epoch": 0.2198581560283688, "grad_norm": 0.7461270093917847, "learning_rate": 0.00017563739376770537, "loss": 0.0836, "step": 310 }, { "epoch": 0.22695035460992907, "grad_norm": 0.686876118183136, "learning_rate": 0.0001813031161473088, "loss": 0.0799, "step": 320 }, { "epoch": 0.23404255319148937, "grad_norm": 1.616554617881775, "learning_rate": 0.0001869688385269122, "loss": 0.0819, "step": 330 }, { "epoch": 0.24113475177304963, "grad_norm": 1.098061203956604, "learning_rate": 0.00019263456090651558, "loss": 0.0649, "step": 340 }, { "epoch": 0.24822695035460993, "grad_norm": 0.9771316647529602, "learning_rate": 0.00019830028328611898, "loss": 0.0675, "step": 350 }, { "epoch": 0.2553191489361702, "grad_norm": 0.97850102186203, "learning_rate": 0.0001999994606948725, "loss": 0.0645, "step": 360 }, { "epoch": 0.2624113475177305, "grad_norm": 0.7112658023834229, "learning_rate": 0.0001999968192143738, "loss": 0.0694, "step": 370 }, { "epoch": 0.2695035460992908, "grad_norm": 0.708350419998169, "learning_rate": 0.00019999197656053288, "loss": 0.0624, "step": 380 }, { "epoch": 0.2765957446808511, "grad_norm": 0.5273390412330627, "learning_rate": 0.00019998493283994825, "loss": 0.074, "step": 390 }, { "epoch": 0.28368794326241137, "grad_norm": 1.1027687788009644, "learning_rate": 0.0001999756882076694, "loss": 0.0572, "step": 400 }, { "epoch": 0.2907801418439716, "grad_norm": 1.0205180644989014, "learning_rate": 0.00019996424286719317, "loss": 0.0654, "step": 410 }, { "epoch": 0.2978723404255319, "grad_norm": 0.5349808931350708, "learning_rate": 0.00019995059707045935, "loss": 0.0682, "step": 420 }, { "epoch": 0.3049645390070922, "grad_norm": 0.6187090277671814, "learning_rate": 0.00019993475111784514, "loss": 0.0611, "step": 430 }, { "epoch": 0.3120567375886525, "grad_norm": 0.5722566843032837, "learning_rate": 0.00019991670535815853, "loss": 0.0635, "step": 440 }, { "epoch": 0.3191489361702128, "grad_norm": 1.1226184368133545, "learning_rate": 0.00019989646018863058, "loss": 0.0632, "step": 450 }, { "epoch": 0.3262411347517731, "grad_norm": 0.814766526222229, "learning_rate": 0.00019987401605490676, "loss": 0.0546, "step": 460 }, { "epoch": 0.3333333333333333, "grad_norm": 0.8665221929550171, "learning_rate": 0.00019984937345103707, "loss": 0.0591, "step": 470 }, { "epoch": 0.3404255319148936, "grad_norm": 0.5462458729743958, "learning_rate": 0.0001998225329194652, "loss": 0.0543, "step": 480 }, { "epoch": 0.3475177304964539, "grad_norm": 0.3314288556575775, "learning_rate": 0.00019979349505101654, "loss": 0.0538, "step": 490 }, { "epoch": 0.3546099290780142, "grad_norm": 0.6016983985900879, "learning_rate": 0.00019976226048488526, "loss": 0.0417, "step": 500 }, { "epoch": 0.3617021276595745, "grad_norm": 0.5387014150619507, "learning_rate": 0.0001997288299086202, "loss": 0.0507, "step": 510 }, { "epoch": 0.36879432624113473, "grad_norm": 0.6287176609039307, "learning_rate": 0.00019969320405810966, "loss": 0.0497, "step": 520 }, { "epoch": 0.375886524822695, "grad_norm": 0.7354361414909363, "learning_rate": 0.0001996553837175653, "loss": 0.0556, "step": 530 }, { "epoch": 0.3829787234042553, "grad_norm": 0.5547359585762024, "learning_rate": 0.0001996153697195049, "loss": 0.056, "step": 540 }, { "epoch": 0.3900709219858156, "grad_norm": 0.612255871295929, "learning_rate": 0.00019957316294473389, "loss": 0.0504, "step": 550 }, { "epoch": 0.3971631205673759, "grad_norm": 0.7296609878540039, "learning_rate": 0.0001995287643223261, "loss": 0.0603, "step": 560 }, { "epoch": 0.40425531914893614, "grad_norm": 0.53786700963974, "learning_rate": 0.0001994821748296033, "loss": 0.0609, "step": 570 }, { "epoch": 0.41134751773049644, "grad_norm": 0.49047139286994934, "learning_rate": 0.00019943339549211359, "loss": 0.0581, "step": 580 }, { "epoch": 0.41843971631205673, "grad_norm": 0.4338454604148865, "learning_rate": 0.00019938242738360887, "loss": 0.0516, "step": 590 }, { "epoch": 0.425531914893617, "grad_norm": 0.7479408979415894, "learning_rate": 0.00019932927162602125, "loss": 0.0519, "step": 600 }, { "epoch": 0.4326241134751773, "grad_norm": 0.6909820437431335, "learning_rate": 0.00019927392938943835, "loss": 0.0577, "step": 610 }, { "epoch": 0.4397163120567376, "grad_norm": 0.44178247451782227, "learning_rate": 0.00019921640189207745, "loss": 0.0492, "step": 620 }, { "epoch": 0.44680851063829785, "grad_norm": 0.42631322145462036, "learning_rate": 0.00019915669040025872, "loss": 0.0442, "step": 630 }, { "epoch": 0.45390070921985815, "grad_norm": 0.36178159713745117, "learning_rate": 0.00019909479622837753, "loss": 0.0418, "step": 640 }, { "epoch": 0.46099290780141844, "grad_norm": 0.31890127062797546, "learning_rate": 0.00019903072073887507, "loss": 0.042, "step": 650 }, { "epoch": 0.46808510638297873, "grad_norm": 0.6327568292617798, "learning_rate": 0.00019896446534220893, "loss": 0.0519, "step": 660 }, { "epoch": 0.475177304964539, "grad_norm": 0.35806578397750854, "learning_rate": 0.00019889603149682156, "loss": 0.0396, "step": 670 }, { "epoch": 0.48226950354609927, "grad_norm": 0.7298017144203186, "learning_rate": 0.00019882542070910847, "loss": 0.049, "step": 680 }, { "epoch": 0.48936170212765956, "grad_norm": 0.497172474861145, "learning_rate": 0.00019875263453338494, "loss": 0.0426, "step": 690 }, { "epoch": 0.49645390070921985, "grad_norm": 0.7310464382171631, "learning_rate": 0.00019867767457185177, "loss": 0.0493, "step": 700 }, { "epoch": 0.5035460992907801, "grad_norm": 0.6526608467102051, "learning_rate": 0.0001986005424745602, "loss": 0.0516, "step": 710 }, { "epoch": 0.5106382978723404, "grad_norm": 0.6688535213470459, "learning_rate": 0.00019852123993937537, "loss": 0.0486, "step": 720 }, { "epoch": 0.5177304964539007, "grad_norm": 0.4904952049255371, "learning_rate": 0.0001984397687119391, "loss": 0.0528, "step": 730 }, { "epoch": 0.524822695035461, "grad_norm": 0.5391408801078796, "learning_rate": 0.00019835613058563132, "loss": 0.0492, "step": 740 }, { "epoch": 0.5319148936170213, "grad_norm": 0.6295509338378906, "learning_rate": 0.00019827032740153073, "loss": 0.0489, "step": 750 }, { "epoch": 0.5390070921985816, "grad_norm": 0.5825513005256653, "learning_rate": 0.0001981823610483743, "loss": 0.0476, "step": 760 }, { "epoch": 0.5460992907801419, "grad_norm": 0.45764580368995667, "learning_rate": 0.0001980922334625155, "loss": 0.0427, "step": 770 }, { "epoch": 0.5531914893617021, "grad_norm": 0.6424888372421265, "learning_rate": 0.00019799994662788177, "loss": 0.0507, "step": 780 }, { "epoch": 0.5602836879432624, "grad_norm": 0.5487046837806702, "learning_rate": 0.000197905502575931, "loss": 0.0396, "step": 790 }, { "epoch": 0.5673758865248227, "grad_norm": 0.4865322709083557, "learning_rate": 0.00019780890338560658, "loss": 0.0423, "step": 800 }, { "epoch": 0.574468085106383, "grad_norm": 0.3333698809146881, "learning_rate": 0.00019771015118329177, "loss": 0.0386, "step": 810 }, { "epoch": 0.5815602836879432, "grad_norm": 0.48208341002464294, "learning_rate": 0.0001976092481427629, "loss": 0.0626, "step": 820 }, { "epoch": 0.5886524822695035, "grad_norm": 0.41743791103363037, "learning_rate": 0.0001975061964851414, "loss": 0.0452, "step": 830 }, { "epoch": 0.5957446808510638, "grad_norm": 0.6599062085151672, "learning_rate": 0.00019740099847884507, "loss": 0.0375, "step": 840 }, { "epoch": 0.6028368794326241, "grad_norm": 0.33362510800361633, "learning_rate": 0.00019729365643953806, "loss": 0.0382, "step": 850 }, { "epoch": 0.6099290780141844, "grad_norm": 0.6667296886444092, "learning_rate": 0.00019718417273007982, "loss": 0.0541, "step": 860 }, { "epoch": 0.6170212765957447, "grad_norm": 0.3192714750766754, "learning_rate": 0.00019707254976047327, "loss": 0.0391, "step": 870 }, { "epoch": 0.624113475177305, "grad_norm": 0.6036109328269958, "learning_rate": 0.0001969587899878116, "loss": 0.0419, "step": 880 }, { "epoch": 0.6312056737588653, "grad_norm": 0.46062973141670227, "learning_rate": 0.0001968428959162243, "loss": 0.0399, "step": 890 }, { "epoch": 0.6382978723404256, "grad_norm": 0.45465371012687683, "learning_rate": 0.00019672487009682186, "loss": 0.0405, "step": 900 }, { "epoch": 0.6453900709219859, "grad_norm": 0.7598403692245483, "learning_rate": 0.00019660471512763982, "loss": 0.0467, "step": 910 }, { "epoch": 0.6524822695035462, "grad_norm": 0.5159363746643066, "learning_rate": 0.00019648243365358146, "loss": 0.05, "step": 920 }, { "epoch": 0.6595744680851063, "grad_norm": 0.6169264316558838, "learning_rate": 0.0001963580283663596, "loss": 0.0545, "step": 930 }, { "epoch": 0.6666666666666666, "grad_norm": 0.657867431640625, "learning_rate": 0.0001962315020044374, "loss": 0.0354, "step": 940 }, { "epoch": 0.6737588652482269, "grad_norm": 0.39899423718452454, "learning_rate": 0.00019610285735296797, "loss": 0.0432, "step": 950 }, { "epoch": 0.6808510638297872, "grad_norm": 0.39202943444252014, "learning_rate": 0.0001959720972437331, "loss": 0.0437, "step": 960 }, { "epoch": 0.6879432624113475, "grad_norm": 0.537214457988739, "learning_rate": 0.00019583922455508113, "loss": 0.0421, "step": 970 }, { "epoch": 0.6950354609929078, "grad_norm": 0.5166184902191162, "learning_rate": 0.0001957042422118632, "loss": 0.0377, "step": 980 }, { "epoch": 0.7021276595744681, "grad_norm": 0.6365172863006592, "learning_rate": 0.0001955671531853692, "loss": 0.0412, "step": 990 }, { "epoch": 0.7092198581560284, "grad_norm": 0.7188717126846313, "learning_rate": 0.00019542796049326223, "loss": 0.0437, "step": 1000 }, { "epoch": 0.7163120567375887, "grad_norm": 0.48798811435699463, "learning_rate": 0.00019528666719951222, "loss": 0.0594, "step": 1010 }, { "epoch": 0.723404255319149, "grad_norm": 0.7655798196792603, "learning_rate": 0.00019514327641432833, "loss": 0.0414, "step": 1020 }, { "epoch": 0.7304964539007093, "grad_norm": 0.5477094650268555, "learning_rate": 0.00019499779129409075, "loss": 0.0411, "step": 1030 }, { "epoch": 0.7375886524822695, "grad_norm": 0.5877737402915955, "learning_rate": 0.00019485021504128103, "loss": 0.0392, "step": 1040 }, { "epoch": 0.7446808510638298, "grad_norm": 0.42971503734588623, "learning_rate": 0.00019470055090441162, "loss": 0.0386, "step": 1050 }, { "epoch": 0.75177304964539, "grad_norm": 0.5418499708175659, "learning_rate": 0.00019454880217795441, "loss": 0.0467, "step": 1060 }, { "epoch": 0.7588652482269503, "grad_norm": 0.4252694845199585, "learning_rate": 0.00019439497220226818, "loss": 0.035, "step": 1070 }, { "epoch": 0.7659574468085106, "grad_norm": 0.39190033078193665, "learning_rate": 0.00019423906436352498, "loss": 0.043, "step": 1080 }, { "epoch": 0.7730496453900709, "grad_norm": 0.44122350215911865, "learning_rate": 0.0001940810820936358, "loss": 0.0448, "step": 1090 }, { "epoch": 0.7801418439716312, "grad_norm": 0.44722622632980347, "learning_rate": 0.0001939210288701749, "loss": 0.0377, "step": 1100 }, { "epoch": 0.7872340425531915, "grad_norm": 0.31242600083351135, "learning_rate": 0.00019375890821630315, "loss": 0.0443, "step": 1110 }, { "epoch": 0.7943262411347518, "grad_norm": 0.4650860130786896, "learning_rate": 0.0001935947237006908, "loss": 0.0361, "step": 1120 }, { "epoch": 0.8014184397163121, "grad_norm": 0.30178096890449524, "learning_rate": 0.00019342847893743847, "loss": 0.0366, "step": 1130 }, { "epoch": 0.8085106382978723, "grad_norm": 0.393303245306015, "learning_rate": 0.00019326017758599805, "loss": 0.0331, "step": 1140 }, { "epoch": 0.8156028368794326, "grad_norm": 0.48034635186195374, "learning_rate": 0.00019308982335109183, "loss": 0.042, "step": 1150 }, { "epoch": 0.8226950354609929, "grad_norm": 0.30449676513671875, "learning_rate": 0.00019291741998263108, "loss": 0.0355, "step": 1160 }, { "epoch": 0.8297872340425532, "grad_norm": 0.5498911738395691, "learning_rate": 0.00019274297127563355, "loss": 0.0495, "step": 1170 }, { "epoch": 0.8368794326241135, "grad_norm": 0.5336613059043884, "learning_rate": 0.00019256648107013978, "loss": 0.0415, "step": 1180 }, { "epoch": 0.8439716312056738, "grad_norm": 0.39611244201660156, "learning_rate": 0.0001923879532511287, "loss": 0.0387, "step": 1190 }, { "epoch": 0.851063829787234, "grad_norm": 0.4194435179233551, "learning_rate": 0.00019220739174843205, "loss": 0.0438, "step": 1200 }, { "epoch": 0.8581560283687943, "grad_norm": 0.2843916714191437, "learning_rate": 0.000192024800536648, "loss": 0.0448, "step": 1210 }, { "epoch": 0.8652482269503546, "grad_norm": 0.5037551522254944, "learning_rate": 0.00019184018363505335, "loss": 0.0393, "step": 1220 }, { "epoch": 0.8723404255319149, "grad_norm": 0.7623959183692932, "learning_rate": 0.00019165354510751548, "loss": 0.0302, "step": 1230 }, { "epoch": 0.8794326241134752, "grad_norm": 0.478982150554657, "learning_rate": 0.00019146488906240255, "loss": 0.043, "step": 1240 }, { "epoch": 0.8865248226950354, "grad_norm": 0.3730108141899109, "learning_rate": 0.00019127421965249323, "loss": 0.036, "step": 1250 }, { "epoch": 0.8936170212765957, "grad_norm": 0.32982558012008667, "learning_rate": 0.00019108154107488526, "loss": 0.0369, "step": 1260 }, { "epoch": 0.900709219858156, "grad_norm": 0.5964722633361816, "learning_rate": 0.000190886857570903, "loss": 0.0315, "step": 1270 }, { "epoch": 0.9078014184397163, "grad_norm": 0.4713937044143677, "learning_rate": 0.00019069017342600417, "loss": 0.0471, "step": 1280 }, { "epoch": 0.9148936170212766, "grad_norm": 0.19976341724395752, "learning_rate": 0.00019049149296968543, "loss": 0.0369, "step": 1290 }, { "epoch": 0.9219858156028369, "grad_norm": 0.5373408198356628, "learning_rate": 0.00019029082057538718, "loss": 0.04, "step": 1300 }, { "epoch": 0.9290780141843972, "grad_norm": 0.4725939631462097, "learning_rate": 0.0001900881606603971, "loss": 0.0384, "step": 1310 }, { "epoch": 0.9361702127659575, "grad_norm": 0.4426575303077698, "learning_rate": 0.0001898835176857532, "loss": 0.0357, "step": 1320 }, { "epoch": 0.9432624113475178, "grad_norm": 0.4588729441165924, "learning_rate": 0.0001896768961561453, "loss": 0.0397, "step": 1330 }, { "epoch": 0.950354609929078, "grad_norm": 0.5770094394683838, "learning_rate": 0.0001894683006198161, "loss": 0.0523, "step": 1340 }, { "epoch": 0.9574468085106383, "grad_norm": 0.5690935254096985, "learning_rate": 0.00018925773566846104, "loss": 0.0442, "step": 1350 }, { "epoch": 0.9645390070921985, "grad_norm": 0.415588915348053, "learning_rate": 0.0001890452059371271, "loss": 0.0465, "step": 1360 }, { "epoch": 0.9716312056737588, "grad_norm": 0.3456709682941437, "learning_rate": 0.00018883071610411082, "loss": 0.0388, "step": 1370 }, { "epoch": 0.9787234042553191, "grad_norm": 0.5673688650131226, "learning_rate": 0.00018861427089085552, "loss": 0.0426, "step": 1380 }, { "epoch": 0.9858156028368794, "grad_norm": 0.39077624678611755, "learning_rate": 0.00018839587506184699, "loss": 0.0458, "step": 1390 }, { "epoch": 0.9929078014184397, "grad_norm": 0.4722648561000824, "learning_rate": 0.00018817553342450897, "loss": 0.0353, "step": 1400 }, { "epoch": 1.0, "grad_norm": 0.42179620265960693, "learning_rate": 0.00018795325082909708, "loss": 0.0488, "step": 1410 }, { "epoch": 1.0070921985815602, "grad_norm": 0.4976104497909546, "learning_rate": 0.0001877290321685922, "loss": 0.0433, "step": 1420 }, { "epoch": 1.0141843971631206, "grad_norm": 0.3866007924079895, "learning_rate": 0.00018750288237859275, "loss": 0.0411, "step": 1430 }, { "epoch": 1.0212765957446808, "grad_norm": 0.4193371534347534, "learning_rate": 0.00018727480643720588, "loss": 0.0378, "step": 1440 }, { "epoch": 1.0283687943262412, "grad_norm": 0.4635421931743622, "learning_rate": 0.00018704480936493817, "loss": 0.0463, "step": 1450 }, { "epoch": 1.0354609929078014, "grad_norm": 0.47600266337394714, "learning_rate": 0.00018681289622458485, "loss": 0.0371, "step": 1460 }, { "epoch": 1.0425531914893618, "grad_norm": 0.4182673394680023, "learning_rate": 0.00018657907212111857, "loss": 0.0346, "step": 1470 }, { "epoch": 1.049645390070922, "grad_norm": 0.2784164547920227, "learning_rate": 0.00018634334220157684, "loss": 0.0401, "step": 1480 }, { "epoch": 1.0567375886524824, "grad_norm": 0.6221792697906494, "learning_rate": 0.00018610571165494889, "loss": 0.0355, "step": 1490 }, { "epoch": 1.0638297872340425, "grad_norm": 0.5726498961448669, "learning_rate": 0.00018586618571206134, "loss": 0.0396, "step": 1500 }, { "epoch": 1.070921985815603, "grad_norm": 0.30378204584121704, "learning_rate": 0.00018562476964546307, "loss": 0.03, "step": 1510 }, { "epoch": 1.0780141843971631, "grad_norm": 0.48525288701057434, "learning_rate": 0.00018538146876930924, "loss": 0.0342, "step": 1520 }, { "epoch": 1.0851063829787233, "grad_norm": 0.30012843012809753, "learning_rate": 0.00018513628843924425, "loss": 0.0348, "step": 1530 }, { "epoch": 1.0921985815602837, "grad_norm": 0.32308411598205566, "learning_rate": 0.00018488923405228378, "loss": 0.0298, "step": 1540 }, { "epoch": 1.099290780141844, "grad_norm": 0.3836841881275177, "learning_rate": 0.00018464031104669615, "loss": 0.0422, "step": 1550 }, { "epoch": 1.1063829787234043, "grad_norm": 0.5509107708930969, "learning_rate": 0.0001843895249018825, "loss": 0.0296, "step": 1560 }, { "epoch": 1.1134751773049645, "grad_norm": 0.4632577896118164, "learning_rate": 0.0001841368811382562, "loss": 0.0327, "step": 1570 }, { "epoch": 1.1205673758865249, "grad_norm": 0.39413148164749146, "learning_rate": 0.00018388238531712124, "loss": 0.0291, "step": 1580 }, { "epoch": 1.127659574468085, "grad_norm": 0.3808094561100006, "learning_rate": 0.00018362604304055007, "loss": 0.0336, "step": 1590 }, { "epoch": 1.1347517730496455, "grad_norm": 0.36897146701812744, "learning_rate": 0.00018336785995125998, "loss": 0.035, "step": 1600 }, { "epoch": 1.1418439716312057, "grad_norm": 0.3850401043891907, "learning_rate": 0.00018310784173248908, "loss": 0.0366, "step": 1610 }, { "epoch": 1.148936170212766, "grad_norm": 0.3824830949306488, "learning_rate": 0.0001828459941078712, "loss": 0.0372, "step": 1620 }, { "epoch": 1.1560283687943262, "grad_norm": 0.47498247027397156, "learning_rate": 0.0001825823228413097, "loss": 0.0385, "step": 1630 }, { "epoch": 1.1631205673758864, "grad_norm": 0.32731738686561584, "learning_rate": 0.0001823168337368509, "loss": 0.0289, "step": 1640 }, { "epoch": 1.1702127659574468, "grad_norm": 0.27296149730682373, "learning_rate": 0.00018204953263855603, "loss": 0.0268, "step": 1650 }, { "epoch": 1.177304964539007, "grad_norm": 0.3830318748950958, "learning_rate": 0.00018178042543037277, "loss": 0.0358, "step": 1660 }, { "epoch": 1.1843971631205674, "grad_norm": 0.4035814106464386, "learning_rate": 0.00018150951803600572, "loss": 0.0246, "step": 1670 }, { "epoch": 1.1914893617021276, "grad_norm": 0.39326614141464233, "learning_rate": 0.00018123681641878583, "loss": 0.0343, "step": 1680 }, { "epoch": 1.198581560283688, "grad_norm": 0.4689919352531433, "learning_rate": 0.0001809623265815394, "loss": 0.0488, "step": 1690 }, { "epoch": 1.2056737588652482, "grad_norm": 0.4357942044734955, "learning_rate": 0.00018068605456645574, "loss": 0.0336, "step": 1700 }, { "epoch": 1.2127659574468086, "grad_norm": 0.2753821611404419, "learning_rate": 0.0001804080064549542, "loss": 0.028, "step": 1710 }, { "epoch": 1.2198581560283688, "grad_norm": 0.33889490365982056, "learning_rate": 0.0001801281883675504, "loss": 0.0367, "step": 1720 }, { "epoch": 1.226950354609929, "grad_norm": 0.3855700194835663, "learning_rate": 0.0001798466064637214, "loss": 0.0275, "step": 1730 }, { "epoch": 1.2340425531914894, "grad_norm": 0.343795508146286, "learning_rate": 0.00017956326694177014, "loss": 0.0367, "step": 1740 }, { "epoch": 1.2411347517730495, "grad_norm": 0.30135342478752136, "learning_rate": 0.00017927817603868902, "loss": 0.0237, "step": 1750 }, { "epoch": 1.24822695035461, "grad_norm": 0.357537180185318, "learning_rate": 0.00017899134003002267, "loss": 0.0281, "step": 1760 }, { "epoch": 1.2553191489361701, "grad_norm": 0.47195494174957275, "learning_rate": 0.00017870276522972964, "loss": 0.0342, "step": 1770 }, { "epoch": 1.2624113475177305, "grad_norm": 0.2944539487361908, "learning_rate": 0.00017841245799004357, "loss": 0.0282, "step": 1780 }, { "epoch": 1.2695035460992907, "grad_norm": 0.5694823861122131, "learning_rate": 0.00017812042470133327, "loss": 0.0403, "step": 1790 }, { "epoch": 1.2765957446808511, "grad_norm": 0.3958476781845093, "learning_rate": 0.00017782667179196216, "loss": 0.0385, "step": 1800 }, { "epoch": 1.2836879432624113, "grad_norm": 0.40212109684944153, "learning_rate": 0.0001775312057281466, "loss": 0.0303, "step": 1810 }, { "epoch": 1.2907801418439715, "grad_norm": 0.36823150515556335, "learning_rate": 0.0001772340330138137, "loss": 0.0295, "step": 1820 }, { "epoch": 1.297872340425532, "grad_norm": 0.25558122992515564, "learning_rate": 0.00017693516019045817, "loss": 0.0312, "step": 1830 }, { "epoch": 1.3049645390070923, "grad_norm": 0.5120919346809387, "learning_rate": 0.0001766345938369981, "loss": 0.0338, "step": 1840 }, { "epoch": 1.3120567375886525, "grad_norm": 0.32636040449142456, "learning_rate": 0.00017633234056963049, "loss": 0.0339, "step": 1850 }, { "epoch": 1.3191489361702127, "grad_norm": 0.3285328447818756, "learning_rate": 0.0001760284070416852, "loss": 0.0315, "step": 1860 }, { "epoch": 1.326241134751773, "grad_norm": 0.40622803568840027, "learning_rate": 0.00017572279994347887, "loss": 0.0282, "step": 1870 }, { "epoch": 1.3333333333333333, "grad_norm": 0.4662201702594757, "learning_rate": 0.00017541552600216747, "loss": 0.0413, "step": 1880 }, { "epoch": 1.3404255319148937, "grad_norm": 0.345679372549057, "learning_rate": 0.00017510659198159822, "loss": 0.0322, "step": 1890 }, { "epoch": 1.3475177304964538, "grad_norm": 0.34959903359413147, "learning_rate": 0.00017479600468216073, "loss": 0.0453, "step": 1900 }, { "epoch": 1.3546099290780143, "grad_norm": 0.4221673607826233, "learning_rate": 0.00017448377094063733, "loss": 0.0332, "step": 1910 }, { "epoch": 1.3617021276595744, "grad_norm": 0.4349720776081085, "learning_rate": 0.00017416989763005246, "loss": 0.03, "step": 1920 }, { "epoch": 1.3687943262411348, "grad_norm": 0.33618539571762085, "learning_rate": 0.00017385439165952158, "loss": 0.0333, "step": 1930 }, { "epoch": 1.375886524822695, "grad_norm": 0.23778650164604187, "learning_rate": 0.0001735372599740989, "loss": 0.029, "step": 1940 }, { "epoch": 1.3829787234042552, "grad_norm": 0.5169985890388489, "learning_rate": 0.00017321850955462454, "loss": 0.0406, "step": 1950 }, { "epoch": 1.3900709219858156, "grad_norm": 0.41695886850357056, "learning_rate": 0.00017289814741757099, "loss": 0.0289, "step": 1960 }, { "epoch": 1.397163120567376, "grad_norm": 0.33444464206695557, "learning_rate": 0.00017257618061488855, "loss": 0.0341, "step": 1970 }, { "epoch": 1.4042553191489362, "grad_norm": 0.37334710359573364, "learning_rate": 0.00017225261623385, "loss": 0.0362, "step": 1980 }, { "epoch": 1.4113475177304964, "grad_norm": 0.35155797004699707, "learning_rate": 0.0001719274613968948, "loss": 0.0344, "step": 1990 }, { "epoch": 1.4184397163120568, "grad_norm": 0.3294832408428192, "learning_rate": 0.0001716007232614723, "loss": 0.0347, "step": 2000 }, { "epoch": 1.425531914893617, "grad_norm": 0.3127981722354889, "learning_rate": 0.00017127240901988392, "loss": 0.0308, "step": 2010 }, { "epoch": 1.4326241134751774, "grad_norm": 0.30865317583084106, "learning_rate": 0.00017094252589912513, "loss": 0.0279, "step": 2020 }, { "epoch": 1.4397163120567376, "grad_norm": 0.30139267444610596, "learning_rate": 0.00017061108116072618, "loss": 0.0275, "step": 2030 }, { "epoch": 1.4468085106382977, "grad_norm": 0.3612299859523773, "learning_rate": 0.00017027808210059238, "loss": 0.0329, "step": 2040 }, { "epoch": 1.4539007092198581, "grad_norm": 0.2499615103006363, "learning_rate": 0.0001699435360488435, "loss": 0.0288, "step": 2050 }, { "epoch": 1.4609929078014185, "grad_norm": 0.4052152931690216, "learning_rate": 0.00016960745036965222, "loss": 0.0321, "step": 2060 }, { "epoch": 1.4680851063829787, "grad_norm": 0.40873101353645325, "learning_rate": 0.00016926983246108219, "loss": 0.0352, "step": 2070 }, { "epoch": 1.475177304964539, "grad_norm": 0.2873539328575134, "learning_rate": 0.0001689306897549253, "loss": 0.0313, "step": 2080 }, { "epoch": 1.4822695035460993, "grad_norm": 0.41596364974975586, "learning_rate": 0.00016859002971653778, "loss": 0.0261, "step": 2090 }, { "epoch": 1.4893617021276595, "grad_norm": 0.38014939427375793, "learning_rate": 0.00016824785984467612, "loss": 0.0277, "step": 2100 }, { "epoch": 1.49645390070922, "grad_norm": 0.396859347820282, "learning_rate": 0.00016790418767133192, "loss": 0.0327, "step": 2110 }, { "epoch": 1.50354609929078, "grad_norm": 0.3488304018974304, "learning_rate": 0.00016755902076156604, "loss": 0.0392, "step": 2120 }, { "epoch": 1.5106382978723403, "grad_norm": 0.25539007782936096, "learning_rate": 0.00016721236671334217, "loss": 0.0358, "step": 2130 }, { "epoch": 1.5177304964539007, "grad_norm": 0.29417455196380615, "learning_rate": 0.00016686423315735954, "loss": 0.0236, "step": 2140 }, { "epoch": 1.524822695035461, "grad_norm": 0.23996444046497345, "learning_rate": 0.00016651462775688493, "loss": 0.0283, "step": 2150 }, { "epoch": 1.5319148936170213, "grad_norm": 0.3587987422943115, "learning_rate": 0.00016616355820758398, "loss": 0.0455, "step": 2160 }, { "epoch": 1.5390070921985815, "grad_norm": 0.2270328402519226, "learning_rate": 0.00016581103223735187, "loss": 0.0228, "step": 2170 }, { "epoch": 1.5460992907801419, "grad_norm": 0.2835662364959717, "learning_rate": 0.00016545705760614307, "loss": 0.0222, "step": 2180 }, { "epoch": 1.5531914893617023, "grad_norm": 0.3576505780220032, "learning_rate": 0.00016510164210580065, "loss": 0.0277, "step": 2190 }, { "epoch": 1.5602836879432624, "grad_norm": 0.4545815885066986, "learning_rate": 0.0001647447935598847, "loss": 0.0285, "step": 2200 }, { "epoch": 1.5673758865248226, "grad_norm": 0.26356783509254456, "learning_rate": 0.00016438651982350017, "loss": 0.0317, "step": 2210 }, { "epoch": 1.574468085106383, "grad_norm": 0.3527631163597107, "learning_rate": 0.00016402682878312387, "loss": 0.0313, "step": 2220 }, { "epoch": 1.5815602836879432, "grad_norm": 0.16718344390392303, "learning_rate": 0.00016366572835643095, "loss": 0.0299, "step": 2230 }, { "epoch": 1.5886524822695036, "grad_norm": 0.24409593641757965, "learning_rate": 0.00016330322649212053, "loss": 0.0246, "step": 2240 }, { "epoch": 1.5957446808510638, "grad_norm": 0.2794688642024994, "learning_rate": 0.00016293933116974087, "loss": 0.027, "step": 2250 }, { "epoch": 1.602836879432624, "grad_norm": 0.29633232951164246, "learning_rate": 0.0001625740503995136, "loss": 0.0333, "step": 2260 }, { "epoch": 1.6099290780141844, "grad_norm": 0.3844369053840637, "learning_rate": 0.00016220739222215738, "loss": 0.0331, "step": 2270 }, { "epoch": 1.6170212765957448, "grad_norm": 0.7053892612457275, "learning_rate": 0.000161839364708711, "loss": 0.0357, "step": 2280 }, { "epoch": 1.624113475177305, "grad_norm": 0.5903012156486511, "learning_rate": 0.00016146997596035569, "loss": 0.0367, "step": 2290 }, { "epoch": 1.6312056737588652, "grad_norm": 0.41250288486480713, "learning_rate": 0.00016109923410823667, "loss": 0.0318, "step": 2300 }, { "epoch": 1.6382978723404256, "grad_norm": 0.4475861191749573, "learning_rate": 0.0001607271473132844, "loss": 0.0262, "step": 2310 }, { "epoch": 1.645390070921986, "grad_norm": 0.45323246717453003, "learning_rate": 0.00016035372376603475, "loss": 0.0324, "step": 2320 }, { "epoch": 1.6524822695035462, "grad_norm": 0.2859094440937042, "learning_rate": 0.0001599789716864487, "loss": 0.0253, "step": 2330 }, { "epoch": 1.6595744680851063, "grad_norm": 0.44032302498817444, "learning_rate": 0.00015960289932373157, "loss": 0.0329, "step": 2340 }, { "epoch": 1.6666666666666665, "grad_norm": 0.3153149485588074, "learning_rate": 0.00015922551495615124, "loss": 0.0352, "step": 2350 }, { "epoch": 1.673758865248227, "grad_norm": 0.34591788053512573, "learning_rate": 0.000158846826890856, "loss": 0.0265, "step": 2360 }, { "epoch": 1.6808510638297873, "grad_norm": 0.3234579563140869, "learning_rate": 0.00015846684346369186, "loss": 0.0271, "step": 2370 }, { "epoch": 1.6879432624113475, "grad_norm": 0.3714563548564911, "learning_rate": 0.00015808557303901866, "loss": 0.0301, "step": 2380 }, { "epoch": 1.6950354609929077, "grad_norm": 0.2916625142097473, "learning_rate": 0.00015770302400952639, "loss": 0.0373, "step": 2390 }, { "epoch": 1.702127659574468, "grad_norm": 0.2942117154598236, "learning_rate": 0.00015731920479605008, "loss": 0.0268, "step": 2400 }, { "epoch": 1.7092198581560285, "grad_norm": 0.2918272316455841, "learning_rate": 0.0001569341238473847, "loss": 0.029, "step": 2410 }, { "epoch": 1.7163120567375887, "grad_norm": 0.37234899401664734, "learning_rate": 0.0001565477896400991, "loss": 0.0263, "step": 2420 }, { "epoch": 1.7234042553191489, "grad_norm": 0.291821151971817, "learning_rate": 0.0001561602106783493, "loss": 0.0294, "step": 2430 }, { "epoch": 1.7304964539007093, "grad_norm": 0.6754502654075623, "learning_rate": 0.0001557713954936915, "loss": 0.0283, "step": 2440 }, { "epoch": 1.7375886524822695, "grad_norm": 0.25559020042419434, "learning_rate": 0.00015538135264489412, "loss": 0.0342, "step": 2450 }, { "epoch": 1.7446808510638299, "grad_norm": 0.4409390091896057, "learning_rate": 0.00015499009071774945, "loss": 0.0293, "step": 2460 }, { "epoch": 1.75177304964539, "grad_norm": 0.39615398645401, "learning_rate": 0.00015459761832488473, "loss": 0.0265, "step": 2470 }, { "epoch": 1.7588652482269502, "grad_norm": 0.26630377769470215, "learning_rate": 0.0001542039441055724, "loss": 0.0313, "step": 2480 }, { "epoch": 1.7659574468085106, "grad_norm": 0.31450021266937256, "learning_rate": 0.00015380907672554007, "loss": 0.0301, "step": 2490 }, { "epoch": 1.773049645390071, "grad_norm": 0.5041805505752563, "learning_rate": 0.00015341302487677973, "loss": 0.0304, "step": 2500 }, { "epoch": 1.7801418439716312, "grad_norm": 0.46769461035728455, "learning_rate": 0.0001530157972773564, "loss": 0.032, "step": 2510 }, { "epoch": 1.7872340425531914, "grad_norm": 0.3055451214313507, "learning_rate": 0.00015261740267121621, "loss": 0.0237, "step": 2520 }, { "epoch": 1.7943262411347518, "grad_norm": 0.2608392834663391, "learning_rate": 0.00015221784982799404, "loss": 0.0252, "step": 2530 }, { "epoch": 1.8014184397163122, "grad_norm": 0.4476111829280853, "learning_rate": 0.0001518171475428202, "loss": 0.0382, "step": 2540 }, { "epoch": 1.8085106382978724, "grad_norm": 0.3846855163574219, "learning_rate": 0.00015141530463612724, "loss": 0.0255, "step": 2550 }, { "epoch": 1.8156028368794326, "grad_norm": 0.3949633538722992, "learning_rate": 0.0001510123299534554, "loss": 0.0393, "step": 2560 }, { "epoch": 1.8226950354609928, "grad_norm": 0.2800884246826172, "learning_rate": 0.00015060823236525813, "loss": 0.0314, "step": 2570 }, { "epoch": 1.8297872340425532, "grad_norm": 0.26943808794021606, "learning_rate": 0.0001502030207667068, "loss": 0.0361, "step": 2580 }, { "epoch": 1.8368794326241136, "grad_norm": 0.29443779587745667, "learning_rate": 0.00014979670407749477, "loss": 0.0259, "step": 2590 }, { "epoch": 1.8439716312056738, "grad_norm": 0.27418988943099976, "learning_rate": 0.00014938929124164117, "loss": 0.0393, "step": 2600 }, { "epoch": 1.851063829787234, "grad_norm": 0.34443700313568115, "learning_rate": 0.00014898079122729402, "loss": 0.0283, "step": 2610 }, { "epoch": 1.8581560283687943, "grad_norm": 0.3798273205757141, "learning_rate": 0.00014857121302653273, "loss": 0.0287, "step": 2620 }, { "epoch": 1.8652482269503547, "grad_norm": 0.5379789471626282, "learning_rate": 0.0001481605656551703, "loss": 0.0307, "step": 2630 }, { "epoch": 1.872340425531915, "grad_norm": 0.31364014744758606, "learning_rate": 0.00014774885815255464, "loss": 0.0293, "step": 2640 }, { "epoch": 1.8794326241134751, "grad_norm": 0.3163762092590332, "learning_rate": 0.00014733609958136987, "loss": 0.0336, "step": 2650 }, { "epoch": 1.8865248226950353, "grad_norm": 0.24602492153644562, "learning_rate": 0.00014692229902743662, "loss": 0.0213, "step": 2660 }, { "epoch": 1.8936170212765957, "grad_norm": 0.32489195466041565, "learning_rate": 0.00014650746559951205, "loss": 0.0321, "step": 2670 }, { "epoch": 1.900709219858156, "grad_norm": 0.40947791934013367, "learning_rate": 0.00014609160842908952, "loss": 0.025, "step": 2680 }, { "epoch": 1.9078014184397163, "grad_norm": 0.17635834217071533, "learning_rate": 0.00014567473667019735, "loss": 0.0207, "step": 2690 }, { "epoch": 1.9148936170212765, "grad_norm": 0.30374202132225037, "learning_rate": 0.00014525685949919745, "loss": 0.0294, "step": 2700 }, { "epoch": 1.9219858156028369, "grad_norm": 0.3767811357975006, "learning_rate": 0.00014483798611458326, "loss": 0.0235, "step": 2710 }, { "epoch": 1.9290780141843973, "grad_norm": 0.425862580537796, "learning_rate": 0.00014441812573677738, "loss": 0.0298, "step": 2720 }, { "epoch": 1.9361702127659575, "grad_norm": 0.28912046551704407, "learning_rate": 0.00014399728760792848, "loss": 0.0215, "step": 2730 }, { "epoch": 1.9432624113475176, "grad_norm": 0.4757988452911377, "learning_rate": 0.00014357548099170795, "loss": 0.0283, "step": 2740 }, { "epoch": 1.950354609929078, "grad_norm": 0.38002464175224304, "learning_rate": 0.0001431527151731059, "loss": 0.0242, "step": 2750 }, { "epoch": 1.9574468085106385, "grad_norm": 0.2170061618089676, "learning_rate": 0.0001427289994582268, "loss": 0.0251, "step": 2760 }, { "epoch": 1.9645390070921986, "grad_norm": 0.2710418105125427, "learning_rate": 0.00014230434317408474, "loss": 0.0276, "step": 2770 }, { "epoch": 1.9716312056737588, "grad_norm": 0.3072097599506378, "learning_rate": 0.00014187875566839793, "loss": 0.0241, "step": 2780 }, { "epoch": 1.978723404255319, "grad_norm": 0.28988635540008545, "learning_rate": 0.0001414522463093831, "loss": 0.0308, "step": 2790 }, { "epoch": 1.9858156028368794, "grad_norm": 0.2880018353462219, "learning_rate": 0.0001410248244855492, "loss": 0.0321, "step": 2800 }, { "epoch": 1.9929078014184398, "grad_norm": 0.4506809115409851, "learning_rate": 0.0001405964996054907, "loss": 0.0247, "step": 2810 }, { "epoch": 2.0, "grad_norm": 0.37970393896102905, "learning_rate": 0.00014016728109768066, "loss": 0.0239, "step": 2820 }, { "epoch": 2.00709219858156, "grad_norm": 0.34335339069366455, "learning_rate": 0.0001397371784102629, "loss": 0.0288, "step": 2830 }, { "epoch": 2.0141843971631204, "grad_norm": 0.2879555821418762, "learning_rate": 0.00013930620101084432, "loss": 0.0244, "step": 2840 }, { "epoch": 2.021276595744681, "grad_norm": 0.29170095920562744, "learning_rate": 0.0001388743583862862, "loss": 0.0298, "step": 2850 }, { "epoch": 2.028368794326241, "grad_norm": 0.4589764177799225, "learning_rate": 0.00013844166004249578, "loss": 0.0248, "step": 2860 }, { "epoch": 2.0354609929078014, "grad_norm": 0.32960548996925354, "learning_rate": 0.00013800811550421655, "loss": 0.0296, "step": 2870 }, { "epoch": 2.0425531914893615, "grad_norm": 0.3347654938697815, "learning_rate": 0.00013757373431481882, "loss": 0.0278, "step": 2880 }, { "epoch": 2.049645390070922, "grad_norm": 0.3965177536010742, "learning_rate": 0.00013713852603608977, "loss": 0.0296, "step": 2890 }, { "epoch": 2.0567375886524824, "grad_norm": 0.2053564339876175, "learning_rate": 0.00013670250024802266, "loss": 0.0279, "step": 2900 }, { "epoch": 2.0638297872340425, "grad_norm": 0.6006909012794495, "learning_rate": 0.0001362656665486062, "loss": 0.0342, "step": 2910 }, { "epoch": 2.0709219858156027, "grad_norm": 0.24472111463546753, "learning_rate": 0.00013582803455361323, "loss": 0.0244, "step": 2920 }, { "epoch": 2.078014184397163, "grad_norm": 0.2269747108221054, "learning_rate": 0.00013538961389638897, "loss": 0.0256, "step": 2930 }, { "epoch": 2.0851063829787235, "grad_norm": 0.31606826186180115, "learning_rate": 0.00013495041422763898, "loss": 0.0305, "step": 2940 }, { "epoch": 2.0921985815602837, "grad_norm": 0.2274671196937561, "learning_rate": 0.0001345104452152168, "loss": 0.0307, "step": 2950 }, { "epoch": 2.099290780141844, "grad_norm": 0.2778410017490387, "learning_rate": 0.00013406971654391115, "loss": 0.0274, "step": 2960 }, { "epoch": 2.106382978723404, "grad_norm": 0.545807421207428, "learning_rate": 0.00013362823791523266, "loss": 0.028, "step": 2970 }, { "epoch": 2.1134751773049647, "grad_norm": 0.41192397475242615, "learning_rate": 0.00013318601904720024, "loss": 0.0247, "step": 2980 }, { "epoch": 2.120567375886525, "grad_norm": 0.4247974157333374, "learning_rate": 0.00013274306967412736, "loss": 0.0308, "step": 2990 }, { "epoch": 2.127659574468085, "grad_norm": 0.49807772040367126, "learning_rate": 0.0001322993995464078, "loss": 0.0379, "step": 3000 }, { "epoch": 2.1347517730496453, "grad_norm": 0.2732253968715668, "learning_rate": 0.0001318550184303006, "loss": 0.0242, "step": 3010 }, { "epoch": 2.141843971631206, "grad_norm": 0.3375590145587921, "learning_rate": 0.00013140993610771573, "loss": 0.0281, "step": 3020 }, { "epoch": 2.148936170212766, "grad_norm": 0.24619796872138977, "learning_rate": 0.0001309641623759982, "loss": 0.0261, "step": 3030 }, { "epoch": 2.1560283687943262, "grad_norm": 0.39782339334487915, "learning_rate": 0.0001305177070477126, "loss": 0.0268, "step": 3040 }, { "epoch": 2.1631205673758864, "grad_norm": 0.5394360423088074, "learning_rate": 0.00013007057995042732, "loss": 0.0305, "step": 3050 }, { "epoch": 2.1702127659574466, "grad_norm": 0.3109723925590515, "learning_rate": 0.00012962279092649783, "loss": 0.0224, "step": 3060 }, { "epoch": 2.1773049645390072, "grad_norm": 0.24598295986652374, "learning_rate": 0.0001291743498328503, "loss": 0.0239, "step": 3070 }, { "epoch": 2.1843971631205674, "grad_norm": 0.37659743428230286, "learning_rate": 0.0001287252665407645, "loss": 0.0322, "step": 3080 }, { "epoch": 2.1914893617021276, "grad_norm": 0.4015694260597229, "learning_rate": 0.00012827555093565657, "loss": 0.0304, "step": 3090 }, { "epoch": 2.198581560283688, "grad_norm": 0.32140156626701355, "learning_rate": 0.00012782521291686147, "loss": 0.0302, "step": 3100 }, { "epoch": 2.2056737588652484, "grad_norm": 0.3436850905418396, "learning_rate": 0.00012737426239741484, "loss": 0.0265, "step": 3110 }, { "epoch": 2.2127659574468086, "grad_norm": 0.2246638983488083, "learning_rate": 0.00012692270930383508, "loss": 0.0253, "step": 3120 }, { "epoch": 2.219858156028369, "grad_norm": 0.25144413113594055, "learning_rate": 0.00012647056357590472, "loss": 0.0274, "step": 3130 }, { "epoch": 2.226950354609929, "grad_norm": 0.3174673020839691, "learning_rate": 0.0001260178351664515, "loss": 0.0288, "step": 3140 }, { "epoch": 2.2340425531914896, "grad_norm": 0.36320146918296814, "learning_rate": 0.0001255645340411295, "loss": 0.0294, "step": 3150 }, { "epoch": 2.2411347517730498, "grad_norm": 0.22592422366142273, "learning_rate": 0.00012511067017819964, "loss": 0.0245, "step": 3160 }, { "epoch": 2.24822695035461, "grad_norm": 0.27350878715515137, "learning_rate": 0.00012465625356831007, "loss": 0.0297, "step": 3170 }, { "epoch": 2.25531914893617, "grad_norm": 0.3462134003639221, "learning_rate": 0.00012420129421427623, "loss": 0.0295, "step": 3180 }, { "epoch": 2.2624113475177303, "grad_norm": 0.23078316450119019, "learning_rate": 0.00012374580213086065, "loss": 0.0236, "step": 3190 }, { "epoch": 2.269503546099291, "grad_norm": 0.41071340441703796, "learning_rate": 0.00012328978734455265, "loss": 0.0303, "step": 3200 }, { "epoch": 2.276595744680851, "grad_norm": 0.3645431697368622, "learning_rate": 0.00012283325989334736, "loss": 0.0215, "step": 3210 }, { "epoch": 2.2836879432624113, "grad_norm": 0.3068390488624573, "learning_rate": 0.00012237622982652497, "loss": 0.0216, "step": 3220 }, { "epoch": 2.2907801418439715, "grad_norm": 0.2963109314441681, "learning_rate": 0.00012191870720442953, "loss": 0.024, "step": 3230 }, { "epoch": 2.297872340425532, "grad_norm": 0.34965890645980835, "learning_rate": 0.00012146070209824736, "loss": 0.0208, "step": 3240 }, { "epoch": 2.3049645390070923, "grad_norm": 0.3760829269886017, "learning_rate": 0.00012100222458978541, "loss": 0.0231, "step": 3250 }, { "epoch": 2.3120567375886525, "grad_norm": 0.39412423968315125, "learning_rate": 0.00012054328477124943, "loss": 0.0263, "step": 3260 }, { "epoch": 2.3191489361702127, "grad_norm": 0.47879675030708313, "learning_rate": 0.0001200838927450217, "loss": 0.0253, "step": 3270 }, { "epoch": 2.326241134751773, "grad_norm": 0.32520684599876404, "learning_rate": 0.00011962405862343866, "loss": 0.0221, "step": 3280 }, { "epoch": 2.3333333333333335, "grad_norm": 0.2687060236930847, "learning_rate": 0.00011916379252856838, "loss": 0.0251, "step": 3290 }, { "epoch": 2.3404255319148937, "grad_norm": 0.246451273560524, "learning_rate": 0.00011870310459198772, "loss": 0.0301, "step": 3300 }, { "epoch": 2.347517730496454, "grad_norm": 0.2831878960132599, "learning_rate": 0.00011824200495455929, "loss": 0.0207, "step": 3310 }, { "epoch": 2.354609929078014, "grad_norm": 0.2363816499710083, "learning_rate": 0.00011778050376620822, "loss": 0.0199, "step": 3320 }, { "epoch": 2.3617021276595747, "grad_norm": 0.30800870060920715, "learning_rate": 0.0001173186111856988, "loss": 0.0234, "step": 3330 }, { "epoch": 2.368794326241135, "grad_norm": 0.3593873679637909, "learning_rate": 0.00011685633738041075, "loss": 0.0218, "step": 3340 }, { "epoch": 2.375886524822695, "grad_norm": 0.1976577192544937, "learning_rate": 0.00011639369252611552, "loss": 0.0248, "step": 3350 }, { "epoch": 2.382978723404255, "grad_norm": 0.21267811954021454, "learning_rate": 0.00011593068680675228, "loss": 0.0245, "step": 3360 }, { "epoch": 2.3900709219858154, "grad_norm": 0.3738497197628021, "learning_rate": 0.00011546733041420362, "loss": 0.0282, "step": 3370 }, { "epoch": 2.397163120567376, "grad_norm": 0.37859100103378296, "learning_rate": 0.0001150036335480714, "loss": 0.0189, "step": 3380 }, { "epoch": 2.404255319148936, "grad_norm": 0.32795801758766174, "learning_rate": 0.00011453960641545206, "loss": 0.0236, "step": 3390 }, { "epoch": 2.4113475177304964, "grad_norm": 0.2747984528541565, "learning_rate": 0.00011407525923071207, "loss": 0.0223, "step": 3400 }, { "epoch": 2.4184397163120566, "grad_norm": 0.22304099798202515, "learning_rate": 0.00011361060221526301, "loss": 0.0256, "step": 3410 }, { "epoch": 2.425531914893617, "grad_norm": 0.15323618054389954, "learning_rate": 0.00011314564559733654, "loss": 0.0235, "step": 3420 }, { "epoch": 2.4326241134751774, "grad_norm": 0.4732156991958618, "learning_rate": 0.00011268039961175936, "loss": 0.0278, "step": 3430 }, { "epoch": 2.4397163120567376, "grad_norm": 0.3939105272293091, "learning_rate": 0.0001122148744997278, "loss": 0.0365, "step": 3440 }, { "epoch": 2.4468085106382977, "grad_norm": 0.3410981595516205, "learning_rate": 0.00011174908050858251, "loss": 0.0229, "step": 3450 }, { "epoch": 2.453900709219858, "grad_norm": 0.34085023403167725, "learning_rate": 0.00011128302789158273, "loss": 0.0246, "step": 3460 }, { "epoch": 2.4609929078014185, "grad_norm": 0.3545391261577606, "learning_rate": 0.00011081672690768084, "loss": 0.0241, "step": 3470 }, { "epoch": 2.4680851063829787, "grad_norm": 0.3405749797821045, "learning_rate": 0.00011035018782129618, "loss": 0.0238, "step": 3480 }, { "epoch": 2.475177304964539, "grad_norm": 0.7175484895706177, "learning_rate": 0.00010988342090208945, "loss": 0.0303, "step": 3490 }, { "epoch": 2.482269503546099, "grad_norm": 0.4182647168636322, "learning_rate": 0.00010941643642473646, "loss": 0.0219, "step": 3500 }, { "epoch": 2.4893617021276597, "grad_norm": 0.2711605131626129, "learning_rate": 0.00010894924466870205, "loss": 0.0282, "step": 3510 }, { "epoch": 2.49645390070922, "grad_norm": 0.3410244584083557, "learning_rate": 0.00010848185591801365, "loss": 0.0263, "step": 3520 }, { "epoch": 2.50354609929078, "grad_norm": 0.2878762483596802, "learning_rate": 0.0001080142804610351, "loss": 0.0213, "step": 3530 }, { "epoch": 2.5106382978723403, "grad_norm": 0.33405497670173645, "learning_rate": 0.00010754652859024017, "loss": 0.0316, "step": 3540 }, { "epoch": 2.5177304964539005, "grad_norm": 0.2429051399230957, "learning_rate": 0.00010707861060198581, "loss": 0.0223, "step": 3550 }, { "epoch": 2.524822695035461, "grad_norm": 0.16325709223747253, "learning_rate": 0.00010661053679628561, "loss": 0.0254, "step": 3560 }, { "epoch": 2.5319148936170213, "grad_norm": 0.301996648311615, "learning_rate": 0.0001061423174765832, "loss": 0.0309, "step": 3570 }, { "epoch": 2.5390070921985815, "grad_norm": 0.16105884313583374, "learning_rate": 0.00010567396294952527, "loss": 0.025, "step": 3580 }, { "epoch": 2.546099290780142, "grad_norm": 0.2534509003162384, "learning_rate": 0.00010520548352473468, "loss": 0.026, "step": 3590 }, { "epoch": 2.5531914893617023, "grad_norm": 0.17970708012580872, "learning_rate": 0.00010473688951458369, "loss": 0.022, "step": 3600 }, { "epoch": 2.5602836879432624, "grad_norm": 0.2073034942150116, "learning_rate": 0.00010426819123396687, "loss": 0.0253, "step": 3610 }, { "epoch": 2.5673758865248226, "grad_norm": 0.2955438494682312, "learning_rate": 0.00010379939900007393, "loss": 0.0235, "step": 3620 }, { "epoch": 2.574468085106383, "grad_norm": 0.19330568611621857, "learning_rate": 0.00010333052313216281, "loss": 0.0211, "step": 3630 }, { "epoch": 2.581560283687943, "grad_norm": 0.20802323520183563, "learning_rate": 0.00010286157395133244, "loss": 0.0186, "step": 3640 }, { "epoch": 2.5886524822695036, "grad_norm": 0.27056363224983215, "learning_rate": 0.00010239256178029548, "loss": 0.0233, "step": 3650 }, { "epoch": 2.595744680851064, "grad_norm": 0.20938947796821594, "learning_rate": 0.00010192349694315124, "loss": 0.0202, "step": 3660 }, { "epoch": 2.602836879432624, "grad_norm": 0.22657611966133118, "learning_rate": 0.00010145438976515828, "loss": 0.0223, "step": 3670 }, { "epoch": 2.6099290780141846, "grad_norm": 0.32048580050468445, "learning_rate": 0.00010098525057250718, "loss": 0.0208, "step": 3680 }, { "epoch": 2.617021276595745, "grad_norm": 0.30489683151245117, "learning_rate": 0.0001005160896920933, "loss": 0.0197, "step": 3690 }, { "epoch": 2.624113475177305, "grad_norm": 0.339277058839798, "learning_rate": 0.00010004691745128933, "loss": 0.0252, "step": 3700 }, { "epoch": 2.631205673758865, "grad_norm": 0.3218172490596771, "learning_rate": 9.957774417771809e-05, "loss": 0.0267, "step": 3710 }, { "epoch": 2.6382978723404253, "grad_norm": 0.33065009117126465, "learning_rate": 9.91085801990251e-05, "loss": 0.0258, "step": 3720 }, { "epoch": 2.645390070921986, "grad_norm": 0.32526910305023193, "learning_rate": 9.863943584265125e-05, "loss": 0.025, "step": 3730 }, { "epoch": 2.652482269503546, "grad_norm": 0.3356036841869354, "learning_rate": 9.817032143560561e-05, "loss": 0.0249, "step": 3740 }, { "epoch": 2.6595744680851063, "grad_norm": 0.29925739765167236, "learning_rate": 9.77012473042378e-05, "loss": 0.0284, "step": 3750 }, { "epoch": 2.6666666666666665, "grad_norm": 0.19530752301216125, "learning_rate": 9.72322237740111e-05, "loss": 0.0241, "step": 3760 }, { "epoch": 2.673758865248227, "grad_norm": 0.30117693543434143, "learning_rate": 9.676326116927472e-05, "loss": 0.0216, "step": 3770 }, { "epoch": 2.6808510638297873, "grad_norm": 0.32530850172042847, "learning_rate": 9.629436981303696e-05, "loss": 0.0213, "step": 3780 }, { "epoch": 2.6879432624113475, "grad_norm": 0.26192381978034973, "learning_rate": 9.582556002673756e-05, "loss": 0.0229, "step": 3790 }, { "epoch": 2.6950354609929077, "grad_norm": 0.2634004056453705, "learning_rate": 9.535684213002087e-05, "loss": 0.0317, "step": 3800 }, { "epoch": 2.702127659574468, "grad_norm": 0.31830480694770813, "learning_rate": 9.488822644050844e-05, "loss": 0.0229, "step": 3810 }, { "epoch": 2.7092198581560285, "grad_norm": 0.26657938957214355, "learning_rate": 9.441972327357203e-05, "loss": 0.025, "step": 3820 }, { "epoch": 2.7163120567375887, "grad_norm": 0.24714654684066772, "learning_rate": 9.395134294210648e-05, "loss": 0.0197, "step": 3830 }, { "epoch": 2.723404255319149, "grad_norm": 0.2549240291118622, "learning_rate": 9.348309575630271e-05, "loss": 0.0255, "step": 3840 }, { "epoch": 2.7304964539007095, "grad_norm": 0.19039757549762726, "learning_rate": 9.301499202342086e-05, "loss": 0.0224, "step": 3850 }, { "epoch": 2.7375886524822697, "grad_norm": 0.24891236424446106, "learning_rate": 9.254704204756319e-05, "loss": 0.0252, "step": 3860 }, { "epoch": 2.74468085106383, "grad_norm": 0.30138468742370605, "learning_rate": 9.207925612944752e-05, "loss": 0.0233, "step": 3870 }, { "epoch": 2.75177304964539, "grad_norm": 0.24954111874103546, "learning_rate": 9.161164456618027e-05, "loss": 0.0262, "step": 3880 }, { "epoch": 2.7588652482269502, "grad_norm": 0.24573440849781036, "learning_rate": 9.114421765102999e-05, "loss": 0.0276, "step": 3890 }, { "epoch": 2.7659574468085104, "grad_norm": 0.2604183256626129, "learning_rate": 9.06769856732005e-05, "loss": 0.0235, "step": 3900 }, { "epoch": 2.773049645390071, "grad_norm": 0.30546581745147705, "learning_rate": 9.02099589176048e-05, "loss": 0.0178, "step": 3910 }, { "epoch": 2.780141843971631, "grad_norm": 0.4359723627567291, "learning_rate": 8.974314766463825e-05, "loss": 0.0178, "step": 3920 }, { "epoch": 2.7872340425531914, "grad_norm": 0.27847006916999817, "learning_rate": 8.927656218995261e-05, "loss": 0.0223, "step": 3930 }, { "epoch": 2.794326241134752, "grad_norm": 0.21782971918582916, "learning_rate": 8.88102127642296e-05, "loss": 0.0271, "step": 3940 }, { "epoch": 2.801418439716312, "grad_norm": 0.3402385711669922, "learning_rate": 8.834410965295506e-05, "loss": 0.0276, "step": 3950 }, { "epoch": 2.8085106382978724, "grad_norm": 0.2527099847793579, "learning_rate": 8.78782631161927e-05, "loss": 0.0352, "step": 3960 }, { "epoch": 2.8156028368794326, "grad_norm": 0.4070510268211365, "learning_rate": 8.741268340835848e-05, "loss": 0.0237, "step": 3970 }, { "epoch": 2.8226950354609928, "grad_norm": 0.4537751078605652, "learning_rate": 8.694738077799488e-05, "loss": 0.0248, "step": 3980 }, { "epoch": 2.829787234042553, "grad_norm": 0.25030770897865295, "learning_rate": 8.648236546754502e-05, "loss": 0.0216, "step": 3990 }, { "epoch": 2.8368794326241136, "grad_norm": 0.2886297404766083, "learning_rate": 8.601764771312765e-05, "loss": 0.019, "step": 4000 }, { "epoch": 2.8439716312056738, "grad_norm": 0.2288312315940857, "learning_rate": 8.555323774431143e-05, "loss": 0.0184, "step": 4010 }, { "epoch": 2.851063829787234, "grad_norm": 0.36999377608299255, "learning_rate": 8.508914578389e-05, "loss": 0.019, "step": 4020 }, { "epoch": 2.8581560283687946, "grad_norm": 0.27074334025382996, "learning_rate": 8.462538204765675e-05, "loss": 0.0201, "step": 4030 }, { "epoch": 2.8652482269503547, "grad_norm": 0.32663315534591675, "learning_rate": 8.416195674418017e-05, "loss": 0.0217, "step": 4040 }, { "epoch": 2.872340425531915, "grad_norm": 0.2872985601425171, "learning_rate": 8.369888007457895e-05, "loss": 0.0252, "step": 4050 }, { "epoch": 2.879432624113475, "grad_norm": 0.2996869683265686, "learning_rate": 8.323616223229761e-05, "loss": 0.0247, "step": 4060 }, { "epoch": 2.8865248226950353, "grad_norm": 0.2089969664812088, "learning_rate": 8.277381340288182e-05, "loss": 0.0277, "step": 4070 }, { "epoch": 2.8936170212765955, "grad_norm": 0.4416872262954712, "learning_rate": 8.231184376375451e-05, "loss": 0.0186, "step": 4080 }, { "epoch": 2.900709219858156, "grad_norm": 0.27074792981147766, "learning_rate": 8.185026348399174e-05, "loss": 0.0191, "step": 4090 }, { "epoch": 2.9078014184397163, "grad_norm": 0.17527180910110474, "learning_rate": 8.138908272409869e-05, "loss": 0.0206, "step": 4100 }, { "epoch": 2.9148936170212765, "grad_norm": 0.2025071531534195, "learning_rate": 8.092831163578625e-05, "loss": 0.0178, "step": 4110 }, { "epoch": 2.921985815602837, "grad_norm": 0.2685222625732422, "learning_rate": 8.046796036174738e-05, "loss": 0.0174, "step": 4120 }, { "epoch": 2.9290780141843973, "grad_norm": 0.34887775778770447, "learning_rate": 8.000803903543398e-05, "loss": 0.0158, "step": 4130 }, { "epoch": 2.9361702127659575, "grad_norm": 0.2305716872215271, "learning_rate": 7.95485577808336e-05, "loss": 0.0167, "step": 4140 }, { "epoch": 2.9432624113475176, "grad_norm": 0.4017210006713867, "learning_rate": 7.908952671224698e-05, "loss": 0.0246, "step": 4150 }, { "epoch": 2.950354609929078, "grad_norm": 0.16771471500396729, "learning_rate": 7.863095593406491e-05, "loss": 0.0169, "step": 4160 }, { "epoch": 2.9574468085106385, "grad_norm": 0.3576452136039734, "learning_rate": 7.817285554054628e-05, "loss": 0.0176, "step": 4170 }, { "epoch": 2.9645390070921986, "grad_norm": 0.2420264184474945, "learning_rate": 7.771523561559555e-05, "loss": 0.019, "step": 4180 }, { "epoch": 2.971631205673759, "grad_norm": 0.15221752226352692, "learning_rate": 7.725810623254088e-05, "loss": 0.0168, "step": 4190 }, { "epoch": 2.978723404255319, "grad_norm": 0.18496164679527283, "learning_rate": 7.680147745391255e-05, "loss": 0.0128, "step": 4200 }, { "epoch": 2.9858156028368796, "grad_norm": 0.25904592871665955, "learning_rate": 7.634535933122111e-05, "loss": 0.0194, "step": 4210 }, { "epoch": 2.99290780141844, "grad_norm": 0.19931963086128235, "learning_rate": 7.588976190473657e-05, "loss": 0.0133, "step": 4220 }, { "epoch": 3.0, "grad_norm": 0.5127366781234741, "learning_rate": 7.543469520326694e-05, "loss": 0.0178, "step": 4230 }, { "epoch": 3.00709219858156, "grad_norm": 0.19438482820987701, "learning_rate": 7.498016924393778e-05, "loss": 0.0133, "step": 4240 }, { "epoch": 3.0141843971631204, "grad_norm": 0.43314528465270996, "learning_rate": 7.45261940319716e-05, "loss": 0.0203, "step": 4250 }, { "epoch": 3.021276595744681, "grad_norm": 0.33415085077285767, "learning_rate": 7.407277956046763e-05, "loss": 0.0153, "step": 4260 }, { "epoch": 3.028368794326241, "grad_norm": 0.2561874985694885, "learning_rate": 7.361993581018173e-05, "loss": 0.0189, "step": 4270 }, { "epoch": 3.0354609929078014, "grad_norm": 0.2553478181362152, "learning_rate": 7.316767274930699e-05, "loss": 0.0178, "step": 4280 }, { "epoch": 3.0425531914893615, "grad_norm": 0.38511422276496887, "learning_rate": 7.271600033325393e-05, "loss": 0.033, "step": 4290 }, { "epoch": 3.049645390070922, "grad_norm": 0.27205201983451843, "learning_rate": 7.226492850443161e-05, "loss": 0.0156, "step": 4300 }, { "epoch": 3.0567375886524824, "grad_norm": 0.22120597958564758, "learning_rate": 7.181446719202873e-05, "loss": 0.0189, "step": 4310 }, { "epoch": 3.0638297872340425, "grad_norm": 0.28937768936157227, "learning_rate": 7.136462631179502e-05, "loss": 0.019, "step": 4320 }, { "epoch": 3.0709219858156027, "grad_norm": 0.24847429990768433, "learning_rate": 7.0915415765823e-05, "loss": 0.0205, "step": 4330 }, { "epoch": 3.078014184397163, "grad_norm": 0.16116857528686523, "learning_rate": 7.04668454423299e-05, "loss": 0.0166, "step": 4340 }, { "epoch": 3.0851063829787235, "grad_norm": 0.4110172688961029, "learning_rate": 7.001892521544027e-05, "loss": 0.0213, "step": 4350 }, { "epoch": 3.0921985815602837, "grad_norm": 0.3403964936733246, "learning_rate": 6.957166494496828e-05, "loss": 0.0179, "step": 4360 }, { "epoch": 3.099290780141844, "grad_norm": 0.3022840917110443, "learning_rate": 6.912507447620102e-05, "loss": 0.018, "step": 4370 }, { "epoch": 3.106382978723404, "grad_norm": 0.14554759860038757, "learning_rate": 6.867916363968143e-05, "loss": 0.0158, "step": 4380 }, { "epoch": 3.1134751773049647, "grad_norm": 0.2720215916633606, "learning_rate": 6.82339422509923e-05, "loss": 0.0221, "step": 4390 }, { "epoch": 3.120567375886525, "grad_norm": 0.23295775055885315, "learning_rate": 6.778942011053976e-05, "loss": 0.0149, "step": 4400 }, { "epoch": 3.127659574468085, "grad_norm": 0.24804311990737915, "learning_rate": 6.7345607003338e-05, "loss": 0.0161, "step": 4410 }, { "epoch": 3.1347517730496453, "grad_norm": 0.1949310600757599, "learning_rate": 6.690251269879355e-05, "loss": 0.0188, "step": 4420 }, { "epoch": 3.141843971631206, "grad_norm": 0.2794622778892517, "learning_rate": 6.64601469504903e-05, "loss": 0.0141, "step": 4430 }, { "epoch": 3.148936170212766, "grad_norm": 0.17811664938926697, "learning_rate": 6.601851949597497e-05, "loss": 0.0189, "step": 4440 }, { "epoch": 3.1560283687943262, "grad_norm": 0.248806893825531, "learning_rate": 6.557764005654254e-05, "loss": 0.0172, "step": 4450 }, { "epoch": 3.1631205673758864, "grad_norm": 0.2596416175365448, "learning_rate": 6.513751833702244e-05, "loss": 0.0198, "step": 4460 }, { "epoch": 3.1702127659574466, "grad_norm": 0.3255382776260376, "learning_rate": 6.469816402556475e-05, "loss": 0.0221, "step": 4470 }, { "epoch": 3.1773049645390072, "grad_norm": 0.3366486430168152, "learning_rate": 6.425958679342708e-05, "loss": 0.0145, "step": 4480 }, { "epoch": 3.1843971631205674, "grad_norm": 0.24371476471424103, "learning_rate": 6.382179629476161e-05, "loss": 0.02, "step": 4490 }, { "epoch": 3.1914893617021276, "grad_norm": 0.16515839099884033, "learning_rate": 6.338480216640271e-05, "loss": 0.025, "step": 4500 }, { "epoch": 3.198581560283688, "grad_norm": 0.2598879635334015, "learning_rate": 6.294861402765448e-05, "loss": 0.0139, "step": 4510 }, { "epoch": 3.2056737588652484, "grad_norm": 0.25636476278305054, "learning_rate": 6.251324148007944e-05, "loss": 0.0266, "step": 4520 }, { "epoch": 3.2127659574468086, "grad_norm": 0.23822833597660065, "learning_rate": 6.207869410728689e-05, "loss": 0.0158, "step": 4530 }, { "epoch": 3.219858156028369, "grad_norm": 0.24598778784275055, "learning_rate": 6.164498147472194e-05, "loss": 0.0134, "step": 4540 }, { "epoch": 3.226950354609929, "grad_norm": 0.21434803307056427, "learning_rate": 6.121211312945518e-05, "loss": 0.0223, "step": 4550 }, { "epoch": 3.2340425531914896, "grad_norm": 0.2982153296470642, "learning_rate": 6.078009859997227e-05, "loss": 0.0194, "step": 4560 }, { "epoch": 3.2411347517730498, "grad_norm": 0.29648178815841675, "learning_rate": 6.034894739596442e-05, "loss": 0.0188, "step": 4570 }, { "epoch": 3.24822695035461, "grad_norm": 0.27711746096611023, "learning_rate": 5.991866900811876e-05, "loss": 0.0182, "step": 4580 }, { "epoch": 3.25531914893617, "grad_norm": 0.23644205927848816, "learning_rate": 5.9489272907909845e-05, "loss": 0.0153, "step": 4590 }, { "epoch": 3.2624113475177303, "grad_norm": 0.2650356590747833, "learning_rate": 5.906076854739074e-05, "loss": 0.0154, "step": 4600 }, { "epoch": 3.269503546099291, "grad_norm": 0.3089609146118164, "learning_rate": 5.863316535898527e-05, "loss": 0.0225, "step": 4610 }, { "epoch": 3.276595744680851, "grad_norm": 0.29182925820350647, "learning_rate": 5.8206472755280206e-05, "loss": 0.025, "step": 4620 }, { "epoch": 3.2836879432624113, "grad_norm": 0.14777465164661407, "learning_rate": 5.778070012881813e-05, "loss": 0.0177, "step": 4630 }, { "epoch": 3.2907801418439715, "grad_norm": 0.29156795144081116, "learning_rate": 5.735585685189075e-05, "loss": 0.0157, "step": 4640 }, { "epoch": 3.297872340425532, "grad_norm": 0.3428608179092407, "learning_rate": 5.6931952276332436e-05, "loss": 0.0204, "step": 4650 }, { "epoch": 3.3049645390070923, "grad_norm": 0.14621183276176453, "learning_rate": 5.6508995733314576e-05, "loss": 0.0222, "step": 4660 }, { "epoch": 3.3120567375886525, "grad_norm": 0.2761157155036926, "learning_rate": 5.6086996533139894e-05, "loss": 0.027, "step": 4670 }, { "epoch": 3.3191489361702127, "grad_norm": 0.25750598311424255, "learning_rate": 5.566596396503785e-05, "loss": 0.0128, "step": 4680 }, { "epoch": 3.326241134751773, "grad_norm": 0.14341020584106445, "learning_rate": 5.5245907296959795e-05, "loss": 0.0227, "step": 4690 }, { "epoch": 3.3333333333333335, "grad_norm": 0.3929702043533325, "learning_rate": 5.4826835775375285e-05, "loss": 0.0211, "step": 4700 }, { "epoch": 3.3404255319148937, "grad_norm": 0.16511176526546478, "learning_rate": 5.440875862506838e-05, "loss": 0.0174, "step": 4710 }, { "epoch": 3.347517730496454, "grad_norm": 0.3370586335659027, "learning_rate": 5.3991685048934604e-05, "loss": 0.0154, "step": 4720 }, { "epoch": 3.354609929078014, "grad_norm": 0.2349589467048645, "learning_rate": 5.357562422777829e-05, "loss": 0.0158, "step": 4730 }, { "epoch": 3.3617021276595747, "grad_norm": 0.14333926141262054, "learning_rate": 5.316058532011068e-05, "loss": 0.0217, "step": 4740 }, { "epoch": 3.368794326241135, "grad_norm": 0.277540385723114, "learning_rate": 5.2746577461948166e-05, "loss": 0.0155, "step": 4750 }, { "epoch": 3.375886524822695, "grad_norm": 0.20732851326465607, "learning_rate": 5.233360976661117e-05, "loss": 0.0145, "step": 4760 }, { "epoch": 3.382978723404255, "grad_norm": 0.21173764765262604, "learning_rate": 5.1921691324523716e-05, "loss": 0.0166, "step": 4770 }, { "epoch": 3.3900709219858154, "grad_norm": 0.1785140335559845, "learning_rate": 5.151083120301314e-05, "loss": 0.0243, "step": 4780 }, { "epoch": 3.397163120567376, "grad_norm": 0.22786866128444672, "learning_rate": 5.110103844611064e-05, "loss": 0.0211, "step": 4790 }, { "epoch": 3.404255319148936, "grad_norm": 0.3462839424610138, "learning_rate": 5.069232207435196e-05, "loss": 0.0172, "step": 4800 }, { "epoch": 3.4113475177304964, "grad_norm": 0.35770383477211, "learning_rate": 5.0284691084579186e-05, "loss": 0.0178, "step": 4810 }, { "epoch": 3.4184397163120566, "grad_norm": 0.18918779492378235, "learning_rate": 4.987815444974233e-05, "loss": 0.0121, "step": 4820 }, { "epoch": 3.425531914893617, "grad_norm": 0.16905491054058075, "learning_rate": 4.94727211187021e-05, "loss": 0.0163, "step": 4830 }, { "epoch": 3.4326241134751774, "grad_norm": 0.15049125254154205, "learning_rate": 4.906840001603279e-05, "loss": 0.0183, "step": 4840 }, { "epoch": 3.4397163120567376, "grad_norm": 0.24548375606536865, "learning_rate": 4.866520004182583e-05, "loss": 0.0149, "step": 4850 }, { "epoch": 3.4468085106382977, "grad_norm": 0.2544180452823639, "learning_rate": 4.826313007149387e-05, "loss": 0.0167, "step": 4860 }, { "epoch": 3.453900709219858, "grad_norm": 0.29529479146003723, "learning_rate": 4.7862198955575366e-05, "loss": 0.0221, "step": 4870 }, { "epoch": 3.4609929078014185, "grad_norm": 0.37530678510665894, "learning_rate": 4.746241551953997e-05, "loss": 0.0188, "step": 4880 }, { "epoch": 3.4680851063829787, "grad_norm": 0.17742417752742767, "learning_rate": 4.7063788563593946e-05, "loss": 0.0173, "step": 4890 }, { "epoch": 3.475177304964539, "grad_norm": 0.1323331743478775, "learning_rate": 4.666632686248673e-05, "loss": 0.0139, "step": 4900 }, { "epoch": 3.482269503546099, "grad_norm": 0.2831038236618042, "learning_rate": 4.6270039165317605e-05, "loss": 0.0223, "step": 4910 }, { "epoch": 3.4893617021276597, "grad_norm": 0.31867706775665283, "learning_rate": 4.587493419534324e-05, "loss": 0.022, "step": 4920 }, { "epoch": 3.49645390070922, "grad_norm": 0.2805313766002655, "learning_rate": 4.5481020649785464e-05, "loss": 0.0188, "step": 4930 }, { "epoch": 3.50354609929078, "grad_norm": 0.23637081682682037, "learning_rate": 4.508830719964012e-05, "loss": 0.0153, "step": 4940 }, { "epoch": 3.5106382978723403, "grad_norm": 0.2689560055732727, "learning_rate": 4.469680248948589e-05, "loss": 0.0186, "step": 4950 }, { "epoch": 3.5177304964539005, "grad_norm": 0.26819270849227905, "learning_rate": 4.4306515137294243e-05, "loss": 0.0177, "step": 4960 }, { "epoch": 3.524822695035461, "grad_norm": 0.397055059671402, "learning_rate": 4.3917453734239566e-05, "loss": 0.0149, "step": 4970 }, { "epoch": 3.5319148936170213, "grad_norm": 0.2141522765159607, "learning_rate": 4.352962684451016e-05, "loss": 0.0142, "step": 4980 }, { "epoch": 3.5390070921985815, "grad_norm": 0.19512633979320526, "learning_rate": 4.314304300511975e-05, "loss": 0.0191, "step": 4990 }, { "epoch": 3.546099290780142, "grad_norm": 0.32705944776535034, "learning_rate": 4.275771072571932e-05, "loss": 0.018, "step": 5000 }, { "epoch": 3.5531914893617023, "grad_norm": 0.21208684146404266, "learning_rate": 4.237363848841018e-05, "loss": 0.02, "step": 5010 }, { "epoch": 3.5602836879432624, "grad_norm": 0.20367835462093353, "learning_rate": 4.1990834747556874e-05, "loss": 0.0173, "step": 5020 }, { "epoch": 3.5673758865248226, "grad_norm": 0.175355926156044, "learning_rate": 4.160930792960138e-05, "loss": 0.0158, "step": 5030 }, { "epoch": 3.574468085106383, "grad_norm": 0.2119171917438507, "learning_rate": 4.122906643287735e-05, "loss": 0.0142, "step": 5040 }, { "epoch": 3.581560283687943, "grad_norm": 0.20758385956287384, "learning_rate": 4.0850118627425615e-05, "loss": 0.0137, "step": 5050 }, { "epoch": 3.5886524822695036, "grad_norm": 0.292469322681427, "learning_rate": 4.0472472854809475e-05, "loss": 0.0125, "step": 5060 }, { "epoch": 3.595744680851064, "grad_norm": 0.26145458221435547, "learning_rate": 4.009613742793149e-05, "loss": 0.0193, "step": 5070 }, { "epoch": 3.602836879432624, "grad_norm": 0.21884608268737793, "learning_rate": 3.972112063085017e-05, "loss": 0.0195, "step": 5080 }, { "epoch": 3.6099290780141846, "grad_norm": 0.2939169406890869, "learning_rate": 3.934743071859794e-05, "loss": 0.0207, "step": 5090 }, { "epoch": 3.617021276595745, "grad_norm": 0.2042369693517685, "learning_rate": 3.8975075916999106e-05, "loss": 0.0229, "step": 5100 }, { "epoch": 3.624113475177305, "grad_norm": 0.2654969394207001, "learning_rate": 3.8604064422489036e-05, "loss": 0.0173, "step": 5110 }, { "epoch": 3.631205673758865, "grad_norm": 0.3279620110988617, "learning_rate": 3.8234404401933666e-05, "loss": 0.021, "step": 5120 }, { "epoch": 3.6382978723404253, "grad_norm": 0.2576730251312256, "learning_rate": 3.7866103992449566e-05, "loss": 0.0138, "step": 5130 }, { "epoch": 3.645390070921986, "grad_norm": 0.3340737223625183, "learning_rate": 3.749917130122511e-05, "loss": 0.0146, "step": 5140 }, { "epoch": 3.652482269503546, "grad_norm": 0.18983782827854156, "learning_rate": 3.713361440534173e-05, "loss": 0.0156, "step": 5150 }, { "epoch": 3.6595744680851063, "grad_norm": 0.21235822141170502, "learning_rate": 3.67694413515964e-05, "loss": 0.0159, "step": 5160 }, { "epoch": 3.6666666666666665, "grad_norm": 0.322186678647995, "learning_rate": 3.64066601563242e-05, "loss": 0.0184, "step": 5170 }, { "epoch": 3.673758865248227, "grad_norm": 0.30648860335350037, "learning_rate": 3.604527880522214e-05, "loss": 0.0175, "step": 5180 }, { "epoch": 3.6808510638297873, "grad_norm": 0.31347954273223877, "learning_rate": 3.56853052531732e-05, "loss": 0.0113, "step": 5190 }, { "epoch": 3.6879432624113475, "grad_norm": 0.2934507727622986, "learning_rate": 3.532674742407134e-05, "loss": 0.0184, "step": 5200 }, { "epoch": 3.6950354609929077, "grad_norm": 0.141712948679924, "learning_rate": 3.496961321064691e-05, "loss": 0.018, "step": 5210 }, { "epoch": 3.702127659574468, "grad_norm": 0.1768782138824463, "learning_rate": 3.461391047429304e-05, "loss": 0.0113, "step": 5220 }, { "epoch": 3.7092198581560285, "grad_norm": 0.1911313533782959, "learning_rate": 3.425964704489267e-05, "loss": 0.0157, "step": 5230 }, { "epoch": 3.7163120567375887, "grad_norm": 0.1564929336309433, "learning_rate": 3.390683072064594e-05, "loss": 0.013, "step": 5240 }, { "epoch": 3.723404255319149, "grad_norm": 0.3047170639038086, "learning_rate": 3.3555469267898784e-05, "loss": 0.0138, "step": 5250 }, { "epoch": 3.7304964539007095, "grad_norm": 0.10271797329187393, "learning_rate": 3.3205570420971855e-05, "loss": 0.0149, "step": 5260 }, { "epoch": 3.7375886524822697, "grad_norm": 0.2823294997215271, "learning_rate": 3.285714188199031e-05, "loss": 0.018, "step": 5270 }, { "epoch": 3.74468085106383, "grad_norm": 0.19177910685539246, "learning_rate": 3.251019132071415e-05, "loss": 0.0124, "step": 5280 }, { "epoch": 3.75177304964539, "grad_norm": 0.16738402843475342, "learning_rate": 3.2164726374369655e-05, "loss": 0.0106, "step": 5290 }, { "epoch": 3.7588652482269502, "grad_norm": 0.21252532303333282, "learning_rate": 3.182075464748094e-05, "loss": 0.0209, "step": 5300 }, { "epoch": 3.7659574468085104, "grad_norm": 0.21184132993221283, "learning_rate": 3.1478283711702825e-05, "loss": 0.0228, "step": 5310 }, { "epoch": 3.773049645390071, "grad_norm": 0.17201204597949982, "learning_rate": 3.1137321105654074e-05, "loss": 0.0175, "step": 5320 }, { "epoch": 3.780141843971631, "grad_norm": 0.1786358505487442, "learning_rate": 3.079787433475135e-05, "loss": 0.0191, "step": 5330 }, { "epoch": 3.7872340425531914, "grad_norm": 0.18210577964782715, "learning_rate": 3.0459950871044217e-05, "loss": 0.0186, "step": 5340 }, { "epoch": 3.794326241134752, "grad_norm": 0.2807255685329437, "learning_rate": 3.0123558153050423e-05, "loss": 0.0194, "step": 5350 }, { "epoch": 3.801418439716312, "grad_norm": 0.18478602170944214, "learning_rate": 2.9788703585592382e-05, "loss": 0.0159, "step": 5360 }, { "epoch": 3.8085106382978724, "grad_norm": 0.14877133071422577, "learning_rate": 2.9455394539633974e-05, "loss": 0.0123, "step": 5370 }, { "epoch": 3.8156028368794326, "grad_norm": 0.2453097105026245, "learning_rate": 2.9123638352118455e-05, "loss": 0.0163, "step": 5380 }, { "epoch": 3.8226950354609928, "grad_norm": 0.3394005596637726, "learning_rate": 2.8793442325806886e-05, "loss": 0.0157, "step": 5390 }, { "epoch": 3.829787234042553, "grad_norm": 0.21939441561698914, "learning_rate": 2.846481372911739e-05, "loss": 0.0191, "step": 5400 }, { "epoch": 3.8368794326241136, "grad_norm": 0.22955432534217834, "learning_rate": 2.8137759795965068e-05, "loss": 0.0145, "step": 5410 }, { "epoch": 3.8439716312056738, "grad_norm": 0.1423809826374054, "learning_rate": 2.781228772560297e-05, "loss": 0.016, "step": 5420 }, { "epoch": 3.851063829787234, "grad_norm": 0.2662895619869232, "learning_rate": 2.7488404682463353e-05, "loss": 0.0144, "step": 5430 }, { "epoch": 3.8581560283687946, "grad_norm": 0.26522767543792725, "learning_rate": 2.716611779600028e-05, "loss": 0.0196, "step": 5440 }, { "epoch": 3.8652482269503547, "grad_norm": 0.16856004297733307, "learning_rate": 2.6845434160532356e-05, "loss": 0.0161, "step": 5450 }, { "epoch": 3.872340425531915, "grad_norm": 0.24976351857185364, "learning_rate": 2.652636083508686e-05, "loss": 0.0128, "step": 5460 }, { "epoch": 3.879432624113475, "grad_norm": 0.18612882494926453, "learning_rate": 2.6208904843244186e-05, "loss": 0.0158, "step": 5470 }, { "epoch": 3.8865248226950353, "grad_norm": 0.2666490077972412, "learning_rate": 2.589307317298324e-05, "loss": 0.0174, "step": 5480 }, { "epoch": 3.8936170212765955, "grad_norm": 0.17859995365142822, "learning_rate": 2.5578872776527717e-05, "loss": 0.011, "step": 5490 }, { "epoch": 3.900709219858156, "grad_norm": 0.18200767040252686, "learning_rate": 2.526631057019294e-05, "loss": 0.0119, "step": 5500 }, { "epoch": 3.9078014184397163, "grad_norm": 0.15403611958026886, "learning_rate": 2.4955393434233754e-05, "loss": 0.0131, "step": 5510 }, { "epoch": 3.9148936170212765, "grad_norm": 0.13963736593723297, "learning_rate": 2.4646128212692952e-05, "loss": 0.0132, "step": 5520 }, { "epoch": 3.921985815602837, "grad_norm": 0.19664759933948517, "learning_rate": 2.433852171325072e-05, "loss": 0.016, "step": 5530 }, { "epoch": 3.9290780141843973, "grad_norm": 0.25922274589538574, "learning_rate": 2.4032580707074637e-05, "loss": 0.014, "step": 5540 }, { "epoch": 3.9361702127659575, "grad_norm": 0.09897946566343307, "learning_rate": 2.3728311928670845e-05, "loss": 0.0108, "step": 5550 }, { "epoch": 3.9432624113475176, "grad_norm": 0.25340279936790466, "learning_rate": 2.342572207573559e-05, "loss": 0.02, "step": 5560 }, { "epoch": 3.950354609929078, "grad_norm": 0.2628706395626068, "learning_rate": 2.3124817809007903e-05, "loss": 0.0161, "step": 5570 }, { "epoch": 3.9574468085106385, "grad_norm": 0.27684804797172546, "learning_rate": 2.282560575212298e-05, "loss": 0.0123, "step": 5580 }, { "epoch": 3.9645390070921986, "grad_norm": 0.14096932113170624, "learning_rate": 2.2528092491466358e-05, "loss": 0.0143, "step": 5590 }, { "epoch": 3.971631205673759, "grad_norm": 0.22769907116889954, "learning_rate": 2.223228457602897e-05, "loss": 0.0192, "step": 5600 }, { "epoch": 3.978723404255319, "grad_norm": 0.2954142391681671, "learning_rate": 2.1938188517262838e-05, "loss": 0.0118, "step": 5610 }, { "epoch": 3.9858156028368796, "grad_norm": 0.23976144194602966, "learning_rate": 2.1645810788937993e-05, "loss": 0.0161, "step": 5620 }, { "epoch": 3.99290780141844, "grad_norm": 0.4473976790904999, "learning_rate": 2.1355157826999704e-05, "loss": 0.0115, "step": 5630 }, { "epoch": 4.0, "grad_norm": 0.18849952518939972, "learning_rate": 2.1066236029427043e-05, "loss": 0.0134, "step": 5640 }, { "epoch": 4.00709219858156, "grad_norm": 0.3046376705169678, "learning_rate": 2.077905175609184e-05, "loss": 0.0145, "step": 5650 }, { "epoch": 4.01418439716312, "grad_norm": 0.2120821177959442, "learning_rate": 2.0493611328618876e-05, "loss": 0.0129, "step": 5660 }, { "epoch": 4.0212765957446805, "grad_norm": 0.14745740592479706, "learning_rate": 2.0209921030246613e-05, "loss": 0.0099, "step": 5670 }, { "epoch": 4.028368794326241, "grad_norm": 0.14253804087638855, "learning_rate": 1.9927987105688872e-05, "loss": 0.0212, "step": 5680 }, { "epoch": 4.035460992907802, "grad_norm": 0.19978418946266174, "learning_rate": 1.964781576099749e-05, "loss": 0.0093, "step": 5690 }, { "epoch": 4.042553191489362, "grad_norm": 0.16095592081546783, "learning_rate": 1.936941316342553e-05, "loss": 0.0133, "step": 5700 }, { "epoch": 4.049645390070922, "grad_norm": 0.2865225672721863, "learning_rate": 1.9092785441291738e-05, "loss": 0.0182, "step": 5710 }, { "epoch": 4.056737588652482, "grad_norm": 0.10646691173315048, "learning_rate": 1.8817938683845394e-05, "loss": 0.0103, "step": 5720 }, { "epoch": 4.0638297872340425, "grad_norm": 0.16811849176883698, "learning_rate": 1.8544878941132605e-05, "loss": 0.0151, "step": 5730 }, { "epoch": 4.070921985815603, "grad_norm": 0.23703442513942719, "learning_rate": 1.8273612223862734e-05, "loss": 0.0102, "step": 5740 }, { "epoch": 4.078014184397163, "grad_norm": 0.19330421090126038, "learning_rate": 1.8004144503276443e-05, "loss": 0.0152, "step": 5750 }, { "epoch": 4.085106382978723, "grad_norm": 0.20537878572940826, "learning_rate": 1.7736481711013984e-05, "loss": 0.0118, "step": 5760 }, { "epoch": 4.092198581560283, "grad_norm": 0.13630573451519012, "learning_rate": 1.7470629738984856e-05, "loss": 0.0102, "step": 5770 }, { "epoch": 4.099290780141844, "grad_norm": 0.38417133688926697, "learning_rate": 1.7206594439237865e-05, "loss": 0.0163, "step": 5780 }, { "epoch": 4.1063829787234045, "grad_norm": 0.3182227313518524, "learning_rate": 1.694438162383255e-05, "loss": 0.0149, "step": 5790 }, { "epoch": 4.113475177304965, "grad_norm": 0.16077764332294464, "learning_rate": 1.6683997064711132e-05, "loss": 0.0126, "step": 5800 }, { "epoch": 4.120567375886525, "grad_norm": 0.4243755340576172, "learning_rate": 1.6425446493571363e-05, "loss": 0.0187, "step": 5810 }, { "epoch": 4.127659574468085, "grad_norm": 0.22879810631275177, "learning_rate": 1.616873560174057e-05, "loss": 0.0213, "step": 5820 }, { "epoch": 4.134751773049645, "grad_norm": 0.1545029729604721, "learning_rate": 1.5913870040050162e-05, "loss": 0.0137, "step": 5830 }, { "epoch": 4.141843971631205, "grad_norm": 0.2279650866985321, "learning_rate": 1.566085541871145e-05, "loss": 0.0115, "step": 5840 }, { "epoch": 4.148936170212766, "grad_norm": 0.26870498061180115, "learning_rate": 1.540969730719193e-05, "loss": 0.011, "step": 5850 }, { "epoch": 4.156028368794326, "grad_norm": 0.17166127264499664, "learning_rate": 1.5160401234092868e-05, "loss": 0.0144, "step": 5860 }, { "epoch": 4.163120567375887, "grad_norm": 0.20089754462242126, "learning_rate": 1.4912972687027527e-05, "loss": 0.0122, "step": 5870 }, { "epoch": 4.170212765957447, "grad_norm": 0.14342001080513, "learning_rate": 1.4667417112500393e-05, "loss": 0.0176, "step": 5880 }, { "epoch": 4.177304964539007, "grad_norm": 0.2057809829711914, "learning_rate": 1.4423739915787183e-05, "loss": 0.0114, "step": 5890 }, { "epoch": 4.184397163120567, "grad_norm": 0.19354157149791718, "learning_rate": 1.4181946460816064e-05, "loss": 0.0114, "step": 5900 }, { "epoch": 4.191489361702128, "grad_norm": 0.23628641664981842, "learning_rate": 1.3942042070049389e-05, "loss": 0.0197, "step": 5910 }, { "epoch": 4.198581560283688, "grad_norm": 0.10524198412895203, "learning_rate": 1.3704032024366597e-05, "loss": 0.0135, "step": 5920 }, { "epoch": 4.205673758865248, "grad_norm": 0.19477611780166626, "learning_rate": 1.3467921562948039e-05, "loss": 0.0134, "step": 5930 }, { "epoch": 4.212765957446808, "grad_norm": 0.15423716604709625, "learning_rate": 1.3233715883159559e-05, "loss": 0.0169, "step": 5940 }, { "epoch": 4.219858156028369, "grad_norm": 0.18021738529205322, "learning_rate": 1.3001420140438158e-05, "loss": 0.0101, "step": 5950 }, { "epoch": 4.226950354609929, "grad_norm": 0.20674963295459747, "learning_rate": 1.27710394481784e-05, "loss": 0.012, "step": 5960 }, { "epoch": 4.23404255319149, "grad_norm": 0.2461792379617691, "learning_rate": 1.2542578877620026e-05, "loss": 0.0116, "step": 5970 }, { "epoch": 4.24113475177305, "grad_norm": 0.20844022929668427, "learning_rate": 1.2316043457736127e-05, "loss": 0.0091, "step": 5980 }, { "epoch": 4.24822695035461, "grad_norm": 0.16048000752925873, "learning_rate": 1.2091438175122593e-05, "loss": 0.0114, "step": 5990 }, { "epoch": 4.25531914893617, "grad_norm": 0.22476951777935028, "learning_rate": 1.1868767973888317e-05, "loss": 0.0193, "step": 6000 }, { "epoch": 4.26241134751773, "grad_norm": 0.1669793426990509, "learning_rate": 1.1648037755546325e-05, "loss": 0.0115, "step": 6010 }, { "epoch": 4.2695035460992905, "grad_norm": 0.07553170621395111, "learning_rate": 1.1429252378905875e-05, "loss": 0.0108, "step": 6020 }, { "epoch": 4.276595744680851, "grad_norm": 0.24128834903240204, "learning_rate": 1.1212416659965507e-05, "loss": 0.0181, "step": 6030 }, { "epoch": 4.283687943262412, "grad_norm": 0.24184225499629974, "learning_rate": 1.0997535371807143e-05, "loss": 0.0151, "step": 6040 }, { "epoch": 4.290780141843972, "grad_norm": 0.22201502323150635, "learning_rate": 1.0784613244490816e-05, "loss": 0.011, "step": 6050 }, { "epoch": 4.297872340425532, "grad_norm": 0.12740255892276764, "learning_rate": 1.057365496495073e-05, "loss": 0.013, "step": 6060 }, { "epoch": 4.304964539007092, "grad_norm": 0.2595222592353821, "learning_rate": 1.036466517689202e-05, "loss": 0.0116, "step": 6070 }, { "epoch": 4.3120567375886525, "grad_norm": 0.12973222136497498, "learning_rate": 1.015764848068852e-05, "loss": 0.0099, "step": 6080 }, { "epoch": 4.319148936170213, "grad_norm": 0.2121812403202057, "learning_rate": 9.95260943328148e-06, "loss": 0.0096, "step": 6090 }, { "epoch": 4.326241134751773, "grad_norm": 0.24669013917446136, "learning_rate": 9.749552548079344e-06, "loss": 0.0186, "step": 6100 }, { "epoch": 4.333333333333333, "grad_norm": 0.24903731048107147, "learning_rate": 9.548482294858285e-06, "loss": 0.0154, "step": 6110 }, { "epoch": 4.340425531914893, "grad_norm": 0.15923161804676056, "learning_rate": 9.349403099663923e-06, "loss": 0.0125, "step": 6120 }, { "epoch": 4.347517730496454, "grad_norm": 0.2829233407974243, "learning_rate": 9.1523193447138e-06, "loss": 0.0178, "step": 6130 }, { "epoch": 4.3546099290780145, "grad_norm": 0.27781230211257935, "learning_rate": 8.957235368300986e-06, "loss": 0.0133, "step": 6140 }, { "epoch": 4.361702127659575, "grad_norm": 0.14426802098751068, "learning_rate": 8.764155464698597e-06, "loss": 0.0106, "step": 6150 }, { "epoch": 4.368794326241135, "grad_norm": 0.17137634754180908, "learning_rate": 8.573083884065126e-06, "loss": 0.015, "step": 6160 }, { "epoch": 4.375886524822695, "grad_norm": 0.1472645103931427, "learning_rate": 8.384024832351112e-06, "loss": 0.0134, "step": 6170 }, { "epoch": 4.382978723404255, "grad_norm": 0.23173321783542633, "learning_rate": 8.196982471206326e-06, "loss": 0.0175, "step": 6180 }, { "epoch": 4.390070921985815, "grad_norm": 0.2875099182128906, "learning_rate": 8.011960917888339e-06, "loss": 0.0132, "step": 6190 }, { "epoch": 4.397163120567376, "grad_norm": 0.17926155030727386, "learning_rate": 7.82896424517181e-06, "loss": 0.0141, "step": 6200 }, { "epoch": 4.404255319148936, "grad_norm": 0.2918379306793213, "learning_rate": 7.647996481258868e-06, "loss": 0.0166, "step": 6210 }, { "epoch": 4.411347517730497, "grad_norm": 0.1396457999944687, "learning_rate": 7.469061609690353e-06, "loss": 0.0181, "step": 6220 }, { "epoch": 4.418439716312057, "grad_norm": 0.2047400027513504, "learning_rate": 7.292163569258281e-06, "loss": 0.0122, "step": 6230 }, { "epoch": 4.425531914893617, "grad_norm": 0.18662038445472717, "learning_rate": 7.117306253918999e-06, "loss": 0.0142, "step": 6240 }, { "epoch": 4.432624113475177, "grad_norm": 0.1682143360376358, "learning_rate": 6.94449351270754e-06, "loss": 0.0106, "step": 6250 }, { "epoch": 4.439716312056738, "grad_norm": 0.08281824737787247, "learning_rate": 6.773729149652907e-06, "loss": 0.0175, "step": 6260 }, { "epoch": 4.446808510638298, "grad_norm": 0.25101786851882935, "learning_rate": 6.605016923694307e-06, "loss": 0.0148, "step": 6270 }, { "epoch": 4.453900709219858, "grad_norm": 0.4814547598361969, "learning_rate": 6.43836054859841e-06, "loss": 0.0137, "step": 6280 }, { "epoch": 4.460992907801418, "grad_norm": 0.18742209672927856, "learning_rate": 6.273763692877588e-06, "loss": 0.0106, "step": 6290 }, { "epoch": 4.468085106382979, "grad_norm": 0.15110346674919128, "learning_rate": 6.11122997970921e-06, "loss": 0.0156, "step": 6300 }, { "epoch": 4.475177304964539, "grad_norm": 0.21841980516910553, "learning_rate": 5.9507629868558404e-06, "loss": 0.0087, "step": 6310 }, { "epoch": 4.4822695035460995, "grad_norm": 0.2773948013782501, "learning_rate": 5.792366246586511e-06, "loss": 0.0111, "step": 6320 }, { "epoch": 4.48936170212766, "grad_norm": 0.168336421251297, "learning_rate": 5.636043245598932e-06, "loss": 0.0122, "step": 6330 }, { "epoch": 4.49645390070922, "grad_norm": 0.22542957961559296, "learning_rate": 5.4817974249427914e-06, "loss": 0.0175, "step": 6340 }, { "epoch": 4.50354609929078, "grad_norm": 0.22203271090984344, "learning_rate": 5.329632179943977e-06, "loss": 0.0089, "step": 6350 }, { "epoch": 4.51063829787234, "grad_norm": 0.07676347345113754, "learning_rate": 5.179550860129823e-06, "loss": 0.0123, "step": 6360 }, { "epoch": 4.5177304964539005, "grad_norm": 0.1949254721403122, "learning_rate": 5.031556769155444e-06, "loss": 0.0132, "step": 6370 }, { "epoch": 4.524822695035461, "grad_norm": 0.2822635769844055, "learning_rate": 4.885653164730908e-06, "loss": 0.0128, "step": 6380 }, { "epoch": 4.531914893617021, "grad_norm": 0.22830314934253693, "learning_rate": 4.741843258549639e-06, "loss": 0.0169, "step": 6390 }, { "epoch": 4.539007092198582, "grad_norm": 0.3081783354282379, "learning_rate": 4.600130216217613e-06, "loss": 0.0134, "step": 6400 }, { "epoch": 4.546099290780142, "grad_norm": 0.2014082372188568, "learning_rate": 4.460517157183819e-06, "loss": 0.0139, "step": 6410 }, { "epoch": 4.553191489361702, "grad_norm": 0.2044125497341156, "learning_rate": 4.323007154671399e-06, "loss": 0.0134, "step": 6420 }, { "epoch": 4.560283687943262, "grad_norm": 0.17926239967346191, "learning_rate": 4.1876032356101825e-06, "loss": 0.0131, "step": 6430 }, { "epoch": 4.567375886524823, "grad_norm": 0.1999116986989975, "learning_rate": 4.054308380569927e-06, "loss": 0.0089, "step": 6440 }, { "epoch": 4.574468085106383, "grad_norm": 0.3048093020915985, "learning_rate": 3.9231255236947925e-06, "loss": 0.0126, "step": 6450 }, { "epoch": 4.581560283687943, "grad_norm": 0.11860720068216324, "learning_rate": 3.7940575526386857e-06, "loss": 0.0111, "step": 6460 }, { "epoch": 4.588652482269503, "grad_norm": 0.25165706872940063, "learning_rate": 3.667107308501749e-06, "loss": 0.0099, "step": 6470 }, { "epoch": 4.595744680851064, "grad_norm": 0.18574100732803345, "learning_rate": 3.542277585767828e-06, "loss": 0.0093, "step": 6480 }, { "epoch": 4.602836879432624, "grad_norm": 0.23088154196739197, "learning_rate": 3.4195711322428716e-06, "loss": 0.008, "step": 6490 }, { "epoch": 4.609929078014185, "grad_norm": 0.13895224034786224, "learning_rate": 3.298990648994571e-06, "loss": 0.0098, "step": 6500 }, { "epoch": 4.617021276595745, "grad_norm": 0.2849128246307373, "learning_rate": 3.1805387902927754e-06, "loss": 0.0225, "step": 6510 }, { "epoch": 4.624113475177305, "grad_norm": 0.2238398641347885, "learning_rate": 3.06421816355118e-06, "loss": 0.0158, "step": 6520 }, { "epoch": 4.631205673758865, "grad_norm": 0.2368791699409485, "learning_rate": 2.950031329269831e-06, "loss": 0.0104, "step": 6530 }, { "epoch": 4.638297872340425, "grad_norm": 0.2200658917427063, "learning_rate": 2.8379808009788677e-06, "loss": 0.0158, "step": 6540 }, { "epoch": 4.6453900709219855, "grad_norm": 0.17251890897750854, "learning_rate": 2.728069045183068e-06, "loss": 0.0139, "step": 6550 }, { "epoch": 4.652482269503546, "grad_norm": 0.13242077827453613, "learning_rate": 2.620298481307704e-06, "loss": 0.0083, "step": 6560 }, { "epoch": 4.659574468085106, "grad_norm": 0.2232428342103958, "learning_rate": 2.5146714816451033e-06, "loss": 0.0165, "step": 6570 }, { "epoch": 4.666666666666667, "grad_norm": 0.164382204413414, "learning_rate": 2.4111903713026294e-06, "loss": 0.0111, "step": 6580 }, { "epoch": 4.673758865248227, "grad_norm": 0.14152784645557404, "learning_rate": 2.3098574281513185e-06, "loss": 0.0114, "step": 6590 }, { "epoch": 4.680851063829787, "grad_norm": 0.2582566440105438, "learning_rate": 2.210674882775854e-06, "loss": 0.0103, "step": 6600 }, { "epoch": 4.6879432624113475, "grad_norm": 0.12841904163360596, "learning_rate": 2.1136449184254393e-06, "loss": 0.0075, "step": 6610 }, { "epoch": 4.695035460992908, "grad_norm": 0.14777736365795135, "learning_rate": 2.0187696709657145e-06, "loss": 0.0133, "step": 6620 }, { "epoch": 4.702127659574468, "grad_norm": 0.14513316750526428, "learning_rate": 1.9260512288317823e-06, "loss": 0.0108, "step": 6630 }, { "epoch": 4.709219858156028, "grad_norm": 0.21794749796390533, "learning_rate": 1.835491632982178e-06, "loss": 0.0105, "step": 6640 }, { "epoch": 4.716312056737589, "grad_norm": 0.31420379877090454, "learning_rate": 1.747092876853984e-06, "loss": 0.0108, "step": 6650 }, { "epoch": 4.723404255319149, "grad_norm": 0.1831444799900055, "learning_rate": 1.6608569063189638e-06, "loss": 0.012, "step": 6660 }, { "epoch": 4.7304964539007095, "grad_norm": 0.17518573999404907, "learning_rate": 1.5767856196406749e-06, "loss": 0.016, "step": 6670 }, { "epoch": 4.73758865248227, "grad_norm": 0.33270004391670227, "learning_rate": 1.4948808674327464e-06, "loss": 0.0137, "step": 6680 }, { "epoch": 4.74468085106383, "grad_norm": 0.14303705096244812, "learning_rate": 1.4151444526181112e-06, "loss": 0.0108, "step": 6690 }, { "epoch": 4.75177304964539, "grad_norm": 0.37515661120414734, "learning_rate": 1.337578130389272e-06, "loss": 0.0162, "step": 6700 }, { "epoch": 4.75886524822695, "grad_norm": 0.12578299641609192, "learning_rate": 1.2621836081697647e-06, "loss": 0.0093, "step": 6710 }, { "epoch": 4.76595744680851, "grad_norm": 0.23699671030044556, "learning_rate": 1.1889625455764997e-06, "loss": 0.0102, "step": 6720 }, { "epoch": 4.773049645390071, "grad_norm": 0.1644619107246399, "learning_rate": 1.117916554383258e-06, "loss": 0.0179, "step": 6730 }, { "epoch": 4.780141843971631, "grad_norm": 0.139451801776886, "learning_rate": 1.0490471984851868e-06, "loss": 0.0167, "step": 6740 }, { "epoch": 4.787234042553192, "grad_norm": 0.2277289628982544, "learning_rate": 9.823559938644255e-07, "loss": 0.0139, "step": 6750 }, { "epoch": 4.794326241134752, "grad_norm": 0.16943824291229248, "learning_rate": 9.178444085566895e-07, "loss": 0.0148, "step": 6760 }, { "epoch": 4.801418439716312, "grad_norm": 0.1669815182685852, "learning_rate": 8.555138626189618e-07, "loss": 0.0173, "step": 6770 }, { "epoch": 4.808510638297872, "grad_norm": 0.380924791097641, "learning_rate": 7.953657280982407e-07, "loss": 0.0131, "step": 6780 }, { "epoch": 4.815602836879433, "grad_norm": 0.14774571359157562, "learning_rate": 7.374013290013415e-07, "loss": 0.0108, "step": 6790 }, { "epoch": 4.822695035460993, "grad_norm": 0.26690518856048584, "learning_rate": 6.816219412657531e-07, "loss": 0.0191, "step": 6800 }, { "epoch": 4.829787234042553, "grad_norm": 0.34515464305877686, "learning_rate": 6.280287927315498e-07, "loss": 0.0138, "step": 6810 }, { "epoch": 4.836879432624113, "grad_norm": 0.1968574970960617, "learning_rate": 5.766230631143455e-07, "loss": 0.0125, "step": 6820 }, { "epoch": 4.843971631205674, "grad_norm": 0.24029354751110077, "learning_rate": 5.274058839793594e-07, "loss": 0.0134, "step": 6830 }, { "epoch": 4.851063829787234, "grad_norm": 0.19054192304611206, "learning_rate": 4.803783387164696e-07, "loss": 0.0156, "step": 6840 }, { "epoch": 4.858156028368795, "grad_norm": 0.08260553330183029, "learning_rate": 4.355414625163978e-07, "loss": 0.0097, "step": 6850 }, { "epoch": 4.865248226950355, "grad_norm": 0.16334979236125946, "learning_rate": 3.9289624234790656e-07, "loss": 0.0109, "step": 6860 }, { "epoch": 4.872340425531915, "grad_norm": 0.2816116213798523, "learning_rate": 3.524436169360601e-07, "loss": 0.0104, "step": 6870 }, { "epoch": 4.879432624113475, "grad_norm": 0.28883612155914307, "learning_rate": 3.141844767415969e-07, "loss": 0.0176, "step": 6880 }, { "epoch": 4.886524822695035, "grad_norm": 0.154693603515625, "learning_rate": 2.7811966394131195e-07, "loss": 0.0119, "step": 6890 }, { "epoch": 4.8936170212765955, "grad_norm": 0.1789465993642807, "learning_rate": 2.4424997240948266e-07, "loss": 0.0103, "step": 6900 }, { "epoch": 4.900709219858156, "grad_norm": 0.2560679018497467, "learning_rate": 2.1257614770046063e-07, "loss": 0.0171, "step": 6910 }, { "epoch": 4.907801418439716, "grad_norm": 0.14788945019245148, "learning_rate": 1.830988870321848e-07, "loss": 0.0129, "step": 6920 }, { "epoch": 4.914893617021277, "grad_norm": 0.2480207085609436, "learning_rate": 1.55818839270927e-07, "loss": 0.013, "step": 6930 }, { "epoch": 4.921985815602837, "grad_norm": 0.29826200008392334, "learning_rate": 1.3073660491691452e-07, "loss": 0.0105, "step": 6940 }, { "epoch": 4.929078014184397, "grad_norm": 0.1601097583770752, "learning_rate": 1.0785273609115188e-07, "loss": 0.0097, "step": 6950 }, { "epoch": 4.9361702127659575, "grad_norm": 0.15834511816501617, "learning_rate": 8.716773652330812e-08, "loss": 0.0147, "step": 6960 }, { "epoch": 4.943262411347518, "grad_norm": 0.06883599609136581, "learning_rate": 6.868206154054812e-08, "loss": 0.0109, "step": 6970 }, { "epoch": 4.950354609929078, "grad_norm": 0.13073433935642242, "learning_rate": 5.2396118057562724e-08, "loss": 0.0113, "step": 6980 }, { "epoch": 4.957446808510638, "grad_norm": 0.19870930910110474, "learning_rate": 3.831026456760922e-08, "loss": 0.0217, "step": 6990 }, { "epoch": 4.964539007092198, "grad_norm": 0.09622751176357269, "learning_rate": 2.6424811134584393e-08, "loss": 0.0155, "step": 7000 }, { "epoch": 4.971631205673759, "grad_norm": 0.16659928858280182, "learning_rate": 1.674001938626324e-08, "loss": 0.0162, "step": 7010 }, { "epoch": 4.9787234042553195, "grad_norm": 0.2666969299316406, "learning_rate": 9.256102508481412e-09, "loss": 0.0121, "step": 7020 }, { "epoch": 4.98581560283688, "grad_norm": 0.1449095904827118, "learning_rate": 3.973225240450073e-09, "loss": 0.0118, "step": 7030 }, { "epoch": 4.99290780141844, "grad_norm": 0.13001736998558044, "learning_rate": 8.915038711476697e-10, "loss": 0.0109, "step": 7040 }, { "epoch": 4.999290780141844, "step": 7049, "total_flos": 2.471599871280288e+17, "train_loss": 0.03227324262610191, "train_runtime": 3151.6449, "train_samples_per_second": 35.786, "train_steps_per_second": 2.237 } ], "logging_steps": 10, "max_steps": 7049, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.471599871280288e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }