{ "best_metric": 1.4804431200027466, "best_model_checkpoint": "./output/checkpoint-4650", "epoch": 0.7796503386360056, "eval_steps": 150, "global_step": 4950, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001575051189163648, "grad_norm": 1.292704463005066, "learning_rate": 1.25e-05, "loss": 2.3495, "step": 10 }, { "epoch": 0.003150102378327296, "grad_norm": 1.1147561073303223, "learning_rate": 2.5e-05, "loss": 2.2929, "step": 20 }, { "epoch": 0.004725153567490943, "grad_norm": 1.0415704250335693, "learning_rate": 3.75e-05, "loss": 2.2227, "step": 30 }, { "epoch": 0.006300204756654592, "grad_norm": 1.171570062637329, "learning_rate": 5e-05, "loss": 2.1635, "step": 40 }, { "epoch": 0.007875255945818239, "grad_norm": 1.2755579948425293, "learning_rate": 6.25e-05, "loss": 2.0523, "step": 50 }, { "epoch": 0.009450307134981887, "grad_norm": 1.2706725597381592, "learning_rate": 7.5e-05, "loss": 1.9675, "step": 60 }, { "epoch": 0.011025358324145534, "grad_norm": 1.2686223983764648, "learning_rate": 8.75e-05, "loss": 1.9002, "step": 70 }, { "epoch": 0.012600409513309183, "grad_norm": 1.3380385637283325, "learning_rate": 0.0001, "loss": 1.8681, "step": 80 }, { "epoch": 0.01417546070247283, "grad_norm": 1.2959766387939453, "learning_rate": 0.00011250000000000001, "loss": 1.8263, "step": 90 }, { "epoch": 0.015750511891636478, "grad_norm": 1.4983580112457275, "learning_rate": 0.000125, "loss": 1.7796, "step": 100 }, { "epoch": 0.017325563080800126, "grad_norm": 1.2883356809616089, "learning_rate": 0.00012499871543489787, "loss": 1.7754, "step": 110 }, { "epoch": 0.018900614269963773, "grad_norm": 1.2248905897140503, "learning_rate": 0.00012499486179239495, "loss": 1.7442, "step": 120 }, { "epoch": 0.02047566545912742, "grad_norm": 1.290252923965454, "learning_rate": 0.00012498843923089938, "loss": 1.7451, "step": 130 }, { "epoch": 0.022050716648291068, "grad_norm": 1.246947169303894, "learning_rate": 0.0001249794480144175, "loss": 1.7151, "step": 140 }, { "epoch": 0.023625767837454716, "grad_norm": 1.2474550008773804, "learning_rate": 0.000124967888512543, "loss": 1.6997, "step": 150 }, { "epoch": 0.023625767837454716, "eval_loss": 1.713956594467163, "eval_runtime": 35.1971, "eval_samples_per_second": 14.234, "eval_steps_per_second": 14.234, "step": 150 }, { "epoch": 0.025200819026618367, "grad_norm": 1.2170847654342651, "learning_rate": 0.00012495376120044173, "loss": 1.6881, "step": 160 }, { "epoch": 0.026775870215782014, "grad_norm": 1.2175121307373047, "learning_rate": 0.00012493706665883217, "loss": 1.7135, "step": 170 }, { "epoch": 0.02835092140494566, "grad_norm": 1.1301295757293701, "learning_rate": 0.00012491780557396154, "loss": 1.682, "step": 180 }, { "epoch": 0.02992597259410931, "grad_norm": 1.1549564599990845, "learning_rate": 0.00012489597873757756, "loss": 1.6887, "step": 190 }, { "epoch": 0.031501023783272956, "grad_norm": 1.1598905324935913, "learning_rate": 0.00012487158704689602, "loss": 1.6954, "step": 200 }, { "epoch": 0.03307607497243661, "grad_norm": 1.1747504472732544, "learning_rate": 0.0001248446315045638, "loss": 1.6835, "step": 210 }, { "epoch": 0.03465112616160025, "grad_norm": 1.1416676044464111, "learning_rate": 0.00012481511321861763, "loss": 1.6521, "step": 220 }, { "epoch": 0.0362261773507639, "grad_norm": 1.1543567180633545, "learning_rate": 0.00012478303340243864, "loss": 1.662, "step": 230 }, { "epoch": 0.037801228539927546, "grad_norm": 1.2603256702423096, "learning_rate": 0.00012474839337470246, "loss": 1.6678, "step": 240 }, { "epoch": 0.0393762797290912, "grad_norm": 1.2086098194122314, "learning_rate": 0.0001247111945593249, "loss": 1.6475, "step": 250 }, { "epoch": 0.04095133091825484, "grad_norm": 1.2282692193984985, "learning_rate": 0.00012467143848540359, "loss": 1.6491, "step": 260 }, { "epoch": 0.04252638210741849, "grad_norm": 1.1063305139541626, "learning_rate": 0.000124629126787155, "loss": 1.654, "step": 270 }, { "epoch": 0.044101433296582136, "grad_norm": 1.114211082458496, "learning_rate": 0.00012458426120384738, "loss": 1.6244, "step": 280 }, { "epoch": 0.04567648448574579, "grad_norm": 1.1069458723068237, "learning_rate": 0.00012453684357972906, "loss": 1.6388, "step": 290 }, { "epoch": 0.04725153567490943, "grad_norm": 1.0202927589416504, "learning_rate": 0.00012448687586395289, "loss": 1.6367, "step": 300 }, { "epoch": 0.04725153567490943, "eval_loss": 1.6362468004226685, "eval_runtime": 35.7423, "eval_samples_per_second": 14.017, "eval_steps_per_second": 14.017, "step": 300 }, { "epoch": 0.04882658686407308, "grad_norm": 1.0611218214035034, "learning_rate": 0.00012443436011049593, "loss": 1.6127, "step": 310 }, { "epoch": 0.05040163805323673, "grad_norm": 1.1300408840179443, "learning_rate": 0.0001243792984780751, "loss": 1.6293, "step": 320 }, { "epoch": 0.05197668924240038, "grad_norm": 1.0638223886489868, "learning_rate": 0.00012432169323005853, "loss": 1.6267, "step": 330 }, { "epoch": 0.05355174043156403, "grad_norm": 1.1265466213226318, "learning_rate": 0.00012426154673437223, "loss": 1.6315, "step": 340 }, { "epoch": 0.05512679162072767, "grad_norm": 1.088779330253601, "learning_rate": 0.00012419886146340314, "loss": 1.6207, "step": 350 }, { "epoch": 0.05670184280989132, "grad_norm": 1.0430299043655396, "learning_rate": 0.0001241336399938972, "loss": 1.6132, "step": 360 }, { "epoch": 0.05827689399905497, "grad_norm": 1.0889984369277954, "learning_rate": 0.00012406588500685355, "loss": 1.6416, "step": 370 }, { "epoch": 0.05985194518821862, "grad_norm": 1.0341376066207886, "learning_rate": 0.00012399559928741435, "loss": 1.6217, "step": 380 }, { "epoch": 0.06142699637738226, "grad_norm": 1.073559284210205, "learning_rate": 0.00012392278572475023, "loss": 1.6314, "step": 390 }, { "epoch": 0.06300204756654591, "grad_norm": 1.039914846420288, "learning_rate": 0.0001238474473119416, "loss": 1.6149, "step": 400 }, { "epoch": 0.06457709875570956, "grad_norm": 1.0189768075942993, "learning_rate": 0.00012376958714585545, "loss": 1.5755, "step": 410 }, { "epoch": 0.06615214994487321, "grad_norm": 1.0381276607513428, "learning_rate": 0.0001236892084270183, "loss": 1.6057, "step": 420 }, { "epoch": 0.06772720113403685, "grad_norm": 1.089646339416504, "learning_rate": 0.00012360631445948448, "loss": 1.6127, "step": 430 }, { "epoch": 0.0693022523232005, "grad_norm": 1.0117536783218384, "learning_rate": 0.00012352090865070026, "loss": 1.6043, "step": 440 }, { "epoch": 0.07087730351236415, "grad_norm": 1.0716177225112915, "learning_rate": 0.00012343299451136397, "loss": 1.5994, "step": 450 }, { "epoch": 0.07087730351236415, "eval_loss": 1.6039879322052002, "eval_runtime": 35.4687, "eval_samples_per_second": 14.125, "eval_steps_per_second": 14.125, "step": 450 }, { "epoch": 0.0724523547015278, "grad_norm": 1.0424439907073975, "learning_rate": 0.00012334257565528155, "loss": 1.6081, "step": 460 }, { "epoch": 0.07402740589069144, "grad_norm": 1.020411491394043, "learning_rate": 0.000123249655799218, "loss": 1.5901, "step": 470 }, { "epoch": 0.07560245707985509, "grad_norm": 1.0402997732162476, "learning_rate": 0.00012315423876274468, "loss": 1.605, "step": 480 }, { "epoch": 0.07717750826901874, "grad_norm": 1.0569404363632202, "learning_rate": 0.0001230563284680822, "loss": 1.6015, "step": 490 }, { "epoch": 0.0787525594581824, "grad_norm": 1.0378941297531128, "learning_rate": 0.00012295592893993935, "loss": 1.5875, "step": 500 }, { "epoch": 0.08032761064734605, "grad_norm": 0.9868465065956116, "learning_rate": 0.00012285304430534745, "loss": 1.5916, "step": 510 }, { "epoch": 0.08190266183650968, "grad_norm": 0.9732680916786194, "learning_rate": 0.00012274767879349083, "loss": 1.6066, "step": 520 }, { "epoch": 0.08347771302567333, "grad_norm": 0.9945887923240662, "learning_rate": 0.00012263983673553306, "loss": 1.56, "step": 530 }, { "epoch": 0.08505276421483698, "grad_norm": 0.9910768270492554, "learning_rate": 0.0001225295225644387, "loss": 1.5889, "step": 540 }, { "epoch": 0.08662781540400064, "grad_norm": 1.0420514345169067, "learning_rate": 0.0001224167408147913, "loss": 1.5801, "step": 550 }, { "epoch": 0.08820286659316427, "grad_norm": 1.0232083797454834, "learning_rate": 0.0001223014961226068, "loss": 1.5764, "step": 560 }, { "epoch": 0.08977791778232792, "grad_norm": 0.9651984572410583, "learning_rate": 0.00012218379322514317, "loss": 1.5833, "step": 570 }, { "epoch": 0.09135296897149157, "grad_norm": 0.9818013310432434, "learning_rate": 0.00012206363696070545, "loss": 1.5998, "step": 580 }, { "epoch": 0.09292802016065523, "grad_norm": 1.0015103816986084, "learning_rate": 0.0001219410322684471, "loss": 1.5818, "step": 590 }, { "epoch": 0.09450307134981886, "grad_norm": 1.0065380334854126, "learning_rate": 0.0001218159841881668, "loss": 1.5851, "step": 600 }, { "epoch": 0.09450307134981886, "eval_loss": 1.583275318145752, "eval_runtime": 35.3825, "eval_samples_per_second": 14.16, "eval_steps_per_second": 14.16, "step": 600 }, { "epoch": 0.09607812253898251, "grad_norm": 1.0160647630691528, "learning_rate": 0.00012168849786010133, "loss": 1.5837, "step": 610 }, { "epoch": 0.09765317372814616, "grad_norm": 0.9476320147514343, "learning_rate": 0.00012155857852471433, "loss": 1.6115, "step": 620 }, { "epoch": 0.09922822491730982, "grad_norm": 0.9994568228721619, "learning_rate": 0.0001214262315224808, "loss": 1.5922, "step": 630 }, { "epoch": 0.10080327610647347, "grad_norm": 0.9553513526916504, "learning_rate": 0.00012129146229366766, "loss": 1.5811, "step": 640 }, { "epoch": 0.1023783272956371, "grad_norm": 1.055202603340149, "learning_rate": 0.00012115427637811003, "loss": 1.5672, "step": 650 }, { "epoch": 0.10395337848480075, "grad_norm": 1.0420925617218018, "learning_rate": 0.00012101467941498357, "loss": 1.5647, "step": 660 }, { "epoch": 0.1055284296739644, "grad_norm": 0.9376361966133118, "learning_rate": 0.0001208726771425727, "loss": 1.5771, "step": 670 }, { "epoch": 0.10710348086312806, "grad_norm": 0.9151705503463745, "learning_rate": 0.00012072827539803463, "loss": 1.5432, "step": 680 }, { "epoch": 0.1086785320522917, "grad_norm": 1.0111474990844727, "learning_rate": 0.00012058148011715949, "loss": 1.5674, "step": 690 }, { "epoch": 0.11025358324145534, "grad_norm": 0.8788293600082397, "learning_rate": 0.00012043229733412636, "loss": 1.5723, "step": 700 }, { "epoch": 0.111828634430619, "grad_norm": 0.9642289876937866, "learning_rate": 0.0001202807331812551, "loss": 1.5819, "step": 710 }, { "epoch": 0.11340368561978265, "grad_norm": 1.0280308723449707, "learning_rate": 0.00012012679388875441, "loss": 1.565, "step": 720 }, { "epoch": 0.1149787368089463, "grad_norm": 1.0084924697875977, "learning_rate": 0.00011997048578446568, "loss": 1.5608, "step": 730 }, { "epoch": 0.11655378799810993, "grad_norm": 0.9293835163116455, "learning_rate": 0.00011981181529360282, "loss": 1.568, "step": 740 }, { "epoch": 0.11812883918727358, "grad_norm": 0.9751657247543335, "learning_rate": 0.00011965078893848828, "loss": 1.5488, "step": 750 }, { "epoch": 0.11812883918727358, "eval_loss": 1.567505121231079, "eval_runtime": 35.4792, "eval_samples_per_second": 14.121, "eval_steps_per_second": 14.121, "step": 750 }, { "epoch": 0.11970389037643724, "grad_norm": 1.0379984378814697, "learning_rate": 0.00011948741333828481, "loss": 1.5631, "step": 760 }, { "epoch": 0.12127894156560089, "grad_norm": 0.9595450758934021, "learning_rate": 0.00011932169520872344, "loss": 1.5837, "step": 770 }, { "epoch": 0.12285399275476452, "grad_norm": 0.9366925954818726, "learning_rate": 0.00011915364136182738, "loss": 1.5587, "step": 780 }, { "epoch": 0.12442904394392817, "grad_norm": 0.943828821182251, "learning_rate": 0.0001189832587056321, "loss": 1.534, "step": 790 }, { "epoch": 0.12600409513309183, "grad_norm": 0.9779319763183594, "learning_rate": 0.00011881055424390119, "loss": 1.5814, "step": 800 }, { "epoch": 0.12757914632225548, "grad_norm": 0.9054823517799377, "learning_rate": 0.00011863553507583869, "loss": 1.5527, "step": 810 }, { "epoch": 0.12915419751141913, "grad_norm": 0.8591505885124207, "learning_rate": 0.00011845820839579708, "loss": 1.5332, "step": 820 }, { "epoch": 0.13072924870058278, "grad_norm": 0.9473647475242615, "learning_rate": 0.00011827858149298162, "loss": 1.5583, "step": 830 }, { "epoch": 0.13230429988974643, "grad_norm": 0.9654921889305115, "learning_rate": 0.00011809666175115075, "loss": 1.5464, "step": 840 }, { "epoch": 0.13387935107891005, "grad_norm": 0.9148724675178528, "learning_rate": 0.00011791245664831251, "loss": 1.5665, "step": 850 }, { "epoch": 0.1354544022680737, "grad_norm": 0.9524568319320679, "learning_rate": 0.0001177259737564172, "loss": 1.553, "step": 860 }, { "epoch": 0.13702945345723735, "grad_norm": 1.016753911972046, "learning_rate": 0.00011753722074104613, "loss": 1.5546, "step": 870 }, { "epoch": 0.138604504646401, "grad_norm": 0.9564122557640076, "learning_rate": 0.00011734620536109644, "loss": 1.5667, "step": 880 }, { "epoch": 0.14017955583556466, "grad_norm": 0.9233742952346802, "learning_rate": 0.00011715293546846223, "loss": 1.5437, "step": 890 }, { "epoch": 0.1417546070247283, "grad_norm": 0.9245238900184631, "learning_rate": 0.00011695741900771184, "loss": 1.556, "step": 900 }, { "epoch": 0.1417546070247283, "eval_loss": 1.5552129745483398, "eval_runtime": 35.3017, "eval_samples_per_second": 14.192, "eval_steps_per_second": 14.192, "step": 900 }, { "epoch": 0.14332965821389196, "grad_norm": 0.9321339726448059, "learning_rate": 0.00011675966401576116, "loss": 1.576, "step": 910 }, { "epoch": 0.1449047094030556, "grad_norm": 0.9396321773529053, "learning_rate": 0.00011655967862154335, "loss": 1.5476, "step": 920 }, { "epoch": 0.14647976059221926, "grad_norm": 0.8809250593185425, "learning_rate": 0.0001163574710456747, "loss": 1.5297, "step": 930 }, { "epoch": 0.14805481178138288, "grad_norm": 0.9630320072174072, "learning_rate": 0.00011615304960011663, "loss": 1.5354, "step": 940 }, { "epoch": 0.14962986297054653, "grad_norm": 0.9173291325569153, "learning_rate": 0.00011594642268783415, "loss": 1.5397, "step": 950 }, { "epoch": 0.15120491415971019, "grad_norm": 0.892159640789032, "learning_rate": 0.00011573759880245027, "loss": 1.5665, "step": 960 }, { "epoch": 0.15277996534887384, "grad_norm": 0.8549361228942871, "learning_rate": 0.00011552658652789703, "loss": 1.5582, "step": 970 }, { "epoch": 0.1543550165380375, "grad_norm": 0.9306020736694336, "learning_rate": 0.00011531339453806258, "loss": 1.5418, "step": 980 }, { "epoch": 0.15593006772720114, "grad_norm": 0.9093769788742065, "learning_rate": 0.00011509803159643458, "loss": 1.5448, "step": 990 }, { "epoch": 0.1575051189163648, "grad_norm": 0.8876301050186157, "learning_rate": 0.00011488050655574003, "loss": 1.571, "step": 1000 }, { "epoch": 0.15908017010552844, "grad_norm": 0.9180966019630432, "learning_rate": 0.00011466082835758141, "loss": 1.5266, "step": 1010 }, { "epoch": 0.1606552212946921, "grad_norm": 0.9715962409973145, "learning_rate": 0.000114439006032069, "loss": 1.5336, "step": 1020 }, { "epoch": 0.16223027248385571, "grad_norm": 0.9360828995704651, "learning_rate": 0.00011421504869744978, "loss": 1.5513, "step": 1030 }, { "epoch": 0.16380532367301937, "grad_norm": 0.9322426319122314, "learning_rate": 0.0001139889655597326, "loss": 1.562, "step": 1040 }, { "epoch": 0.16538037486218302, "grad_norm": 0.9031831622123718, "learning_rate": 0.00011376076591230974, "loss": 1.5581, "step": 1050 }, { "epoch": 0.16538037486218302, "eval_loss": 1.547297716140747, "eval_runtime": 35.376, "eval_samples_per_second": 14.162, "eval_steps_per_second": 14.162, "step": 1050 }, { "epoch": 0.16695542605134667, "grad_norm": 0.9079886078834534, "learning_rate": 0.00011353045913557492, "loss": 1.5496, "step": 1060 }, { "epoch": 0.16853047724051032, "grad_norm": 0.9747688174247742, "learning_rate": 0.00011329805469653768, "loss": 1.5394, "step": 1070 }, { "epoch": 0.17010552842967397, "grad_norm": 0.9190161824226379, "learning_rate": 0.00011306356214843422, "loss": 1.5569, "step": 1080 }, { "epoch": 0.17168057961883762, "grad_norm": 0.9262961745262146, "learning_rate": 0.00011282699113033477, "loss": 1.5239, "step": 1090 }, { "epoch": 0.17325563080800127, "grad_norm": 0.933107852935791, "learning_rate": 0.00011258835136674729, "loss": 1.516, "step": 1100 }, { "epoch": 0.1748306819971649, "grad_norm": 0.8730500936508179, "learning_rate": 0.00011234765266721778, "loss": 1.5374, "step": 1110 }, { "epoch": 0.17640573318632854, "grad_norm": 0.8708956837654114, "learning_rate": 0.00011210490492592703, "loss": 1.5517, "step": 1120 }, { "epoch": 0.1779807843754922, "grad_norm": 0.9595373272895813, "learning_rate": 0.0001118601181212839, "loss": 1.5312, "step": 1130 }, { "epoch": 0.17955583556465585, "grad_norm": 0.9185504913330078, "learning_rate": 0.00011161330231551515, "loss": 1.5402, "step": 1140 }, { "epoch": 0.1811308867538195, "grad_norm": 0.965939462184906, "learning_rate": 0.00011136446765425187, "loss": 1.5496, "step": 1150 }, { "epoch": 0.18270593794298315, "grad_norm": 0.9474881291389465, "learning_rate": 0.00011111362436611234, "loss": 1.5365, "step": 1160 }, { "epoch": 0.1842809891321468, "grad_norm": 0.9285203218460083, "learning_rate": 0.00011086078276228167, "loss": 1.5372, "step": 1170 }, { "epoch": 0.18585604032131045, "grad_norm": 0.8643173575401306, "learning_rate": 0.00011060595323608789, "loss": 1.5364, "step": 1180 }, { "epoch": 0.1874310915104741, "grad_norm": 0.8951886296272278, "learning_rate": 0.00011034914626257467, "loss": 1.5162, "step": 1190 }, { "epoch": 0.18900614269963772, "grad_norm": 0.9068513512611389, "learning_rate": 0.0001100903723980709, "loss": 1.5573, "step": 1200 }, { "epoch": 0.18900614269963772, "eval_loss": 1.538773775100708, "eval_runtime": 35.125, "eval_samples_per_second": 14.263, "eval_steps_per_second": 14.263, "step": 1200 }, { "epoch": 0.19058119388880138, "grad_norm": 0.8557595610618591, "learning_rate": 0.00010982964227975658, "loss": 1.5192, "step": 1210 }, { "epoch": 0.19215624507796503, "grad_norm": 0.9791204333305359, "learning_rate": 0.00010956696662522569, "loss": 1.5512, "step": 1220 }, { "epoch": 0.19373129626712868, "grad_norm": 0.9069547057151794, "learning_rate": 0.00010930235623204551, "loss": 1.5318, "step": 1230 }, { "epoch": 0.19530634745629233, "grad_norm": 0.9511467218399048, "learning_rate": 0.00010903582197731294, "loss": 1.5302, "step": 1240 }, { "epoch": 0.19688139864545598, "grad_norm": 0.8480790257453918, "learning_rate": 0.00010876737481720722, "loss": 1.5382, "step": 1250 }, { "epoch": 0.19845644983461963, "grad_norm": 0.8755747675895691, "learning_rate": 0.0001084970257865397, "loss": 1.5282, "step": 1260 }, { "epoch": 0.20003150102378328, "grad_norm": 0.8885294198989868, "learning_rate": 0.00010822478599830008, "loss": 1.525, "step": 1270 }, { "epoch": 0.20160655221294693, "grad_norm": 0.8845210075378418, "learning_rate": 0.00010795066664319983, "loss": 1.5461, "step": 1280 }, { "epoch": 0.20318160340211056, "grad_norm": 0.8408452868461609, "learning_rate": 0.00010767467898921197, "loss": 1.5276, "step": 1290 }, { "epoch": 0.2047566545912742, "grad_norm": 0.8788501024246216, "learning_rate": 0.00010739683438110797, "loss": 1.5317, "step": 1300 }, { "epoch": 0.20633170578043786, "grad_norm": 0.9068732261657715, "learning_rate": 0.00010711714423999145, "loss": 1.522, "step": 1310 }, { "epoch": 0.2079067569696015, "grad_norm": 0.8549708724021912, "learning_rate": 0.00010683562006282861, "loss": 1.5366, "step": 1320 }, { "epoch": 0.20948180815876516, "grad_norm": 0.9036719799041748, "learning_rate": 0.00010655227342197574, "loss": 1.5298, "step": 1330 }, { "epoch": 0.2110568593479288, "grad_norm": 0.8768537044525146, "learning_rate": 0.00010626711596470343, "loss": 1.5454, "step": 1340 }, { "epoch": 0.21263191053709246, "grad_norm": 0.8741724491119385, "learning_rate": 0.0001059801594127179, "loss": 1.5262, "step": 1350 }, { "epoch": 0.21263191053709246, "eval_loss": 1.532226800918579, "eval_runtime": 35.9878, "eval_samples_per_second": 13.921, "eval_steps_per_second": 13.921, "step": 1350 }, { "epoch": 0.2142069617262561, "grad_norm": 0.8824599981307983, "learning_rate": 0.00010569141556167905, "loss": 1.5167, "step": 1360 }, { "epoch": 0.21578201291541976, "grad_norm": 0.8644326329231262, "learning_rate": 0.00010540089628071566, "loss": 1.5396, "step": 1370 }, { "epoch": 0.2173570641045834, "grad_norm": 0.8992419242858887, "learning_rate": 0.00010510861351193747, "loss": 1.552, "step": 1380 }, { "epoch": 0.21893211529374704, "grad_norm": 0.8747900724411011, "learning_rate": 0.00010481457926994435, "loss": 1.5247, "step": 1390 }, { "epoch": 0.2205071664829107, "grad_norm": 0.9142392873764038, "learning_rate": 0.0001045188056413323, "loss": 1.5624, "step": 1400 }, { "epoch": 0.22208221767207434, "grad_norm": 0.9147394299507141, "learning_rate": 0.00010422130478419676, "loss": 1.5079, "step": 1410 }, { "epoch": 0.223657268861238, "grad_norm": 0.8999921679496765, "learning_rate": 0.00010392208892763269, "loss": 1.5059, "step": 1420 }, { "epoch": 0.22523232005040164, "grad_norm": 0.9109930396080017, "learning_rate": 0.00010362117037123204, "loss": 1.5033, "step": 1430 }, { "epoch": 0.2268073712395653, "grad_norm": 0.8617811799049377, "learning_rate": 0.00010331856148457803, "loss": 1.542, "step": 1440 }, { "epoch": 0.22838242242872894, "grad_norm": 0.8684085011482239, "learning_rate": 0.00010301427470673678, "loss": 1.5227, "step": 1450 }, { "epoch": 0.2299574736178926, "grad_norm": 0.9092370271682739, "learning_rate": 0.00010270832254574588, "loss": 1.5159, "step": 1460 }, { "epoch": 0.23153252480705622, "grad_norm": 0.924177885055542, "learning_rate": 0.00010240071757810036, "loss": 1.5267, "step": 1470 }, { "epoch": 0.23310757599621987, "grad_norm": 0.8798425793647766, "learning_rate": 0.00010209147244823564, "loss": 1.5222, "step": 1480 }, { "epoch": 0.23468262718538352, "grad_norm": 0.9220207333564758, "learning_rate": 0.00010178059986800773, "loss": 1.5162, "step": 1490 }, { "epoch": 0.23625767837454717, "grad_norm": 0.8781149387359619, "learning_rate": 0.00010146811261617085, "loss": 1.5199, "step": 1500 }, { "epoch": 0.23625767837454717, "eval_loss": 1.5264769792556763, "eval_runtime": 36.7562, "eval_samples_per_second": 13.63, "eval_steps_per_second": 13.63, "step": 1500 }, { "epoch": 0.23783272956371082, "grad_norm": 0.878212571144104, "learning_rate": 0.00010115402353785197, "loss": 1.5316, "step": 1510 }, { "epoch": 0.23940778075287447, "grad_norm": 0.9163106679916382, "learning_rate": 0.00010083834554402292, "loss": 1.4983, "step": 1520 }, { "epoch": 0.24098283194203812, "grad_norm": 0.859491229057312, "learning_rate": 0.00010052109161096958, "loss": 1.5189, "step": 1530 }, { "epoch": 0.24255788313120177, "grad_norm": 0.8865215182304382, "learning_rate": 0.00010020227477975852, "loss": 1.5439, "step": 1540 }, { "epoch": 0.24413293432036542, "grad_norm": 0.8885824084281921, "learning_rate": 9.9881908155701e-05, "loss": 1.5338, "step": 1550 }, { "epoch": 0.24570798550952905, "grad_norm": 0.8940324783325195, "learning_rate": 9.956000490781411e-05, "loss": 1.4984, "step": 1560 }, { "epoch": 0.2472830366986927, "grad_norm": 0.9195623397827148, "learning_rate": 9.923657826827957e-05, "loss": 1.5041, "step": 1570 }, { "epoch": 0.24885808788785635, "grad_norm": 0.8722363114356995, "learning_rate": 9.891164153189976e-05, "loss": 1.5234, "step": 1580 }, { "epoch": 0.25043313907702003, "grad_norm": 0.8719931840896606, "learning_rate": 9.858520805555123e-05, "loss": 1.5178, "step": 1590 }, { "epoch": 0.25200819026618365, "grad_norm": 0.8836755156517029, "learning_rate": 9.825729125763561e-05, "loss": 1.5063, "step": 1600 }, { "epoch": 0.2535832414553473, "grad_norm": 0.9015933275222778, "learning_rate": 9.792790461752813e-05, "loss": 1.5021, "step": 1610 }, { "epoch": 0.25515829264451095, "grad_norm": 0.8438279628753662, "learning_rate": 9.759706167502343e-05, "loss": 1.5349, "step": 1620 }, { "epoch": 0.2567333438336746, "grad_norm": 0.8984500169754028, "learning_rate": 9.726477602977905e-05, "loss": 1.5106, "step": 1630 }, { "epoch": 0.25830839502283826, "grad_norm": 0.8898086547851562, "learning_rate": 9.69310613407564e-05, "loss": 1.5039, "step": 1640 }, { "epoch": 0.2598834462120019, "grad_norm": 0.9477494359016418, "learning_rate": 9.659593132565929e-05, "loss": 1.5283, "step": 1650 }, { "epoch": 0.2598834462120019, "eval_loss": 1.521310567855835, "eval_runtime": 36.1857, "eval_samples_per_second": 13.845, "eval_steps_per_second": 13.845, "step": 1650 }, { "epoch": 0.26145849740116556, "grad_norm": 0.8961330652236938, "learning_rate": 9.625939976037002e-05, "loss": 1.5148, "step": 1660 }, { "epoch": 0.2630335485903292, "grad_norm": 0.8740301132202148, "learning_rate": 9.59214804783831e-05, "loss": 1.522, "step": 1670 }, { "epoch": 0.26460859977949286, "grad_norm": 0.8525272607803345, "learning_rate": 9.558218737023671e-05, "loss": 1.5199, "step": 1680 }, { "epoch": 0.2661836509686565, "grad_norm": 0.8677767515182495, "learning_rate": 9.524153438294159e-05, "loss": 1.5203, "step": 1690 }, { "epoch": 0.2677587021578201, "grad_norm": 0.939079225063324, "learning_rate": 9.489953551940783e-05, "loss": 1.4987, "step": 1700 }, { "epoch": 0.2693337533469838, "grad_norm": 0.9007495045661926, "learning_rate": 9.455620483786914e-05, "loss": 1.5392, "step": 1710 }, { "epoch": 0.2709088045361474, "grad_norm": 0.8517370223999023, "learning_rate": 9.421155645130514e-05, "loss": 1.5233, "step": 1720 }, { "epoch": 0.2724838557253111, "grad_norm": 0.8468499779701233, "learning_rate": 9.38656045268611e-05, "loss": 1.5024, "step": 1730 }, { "epoch": 0.2740589069144747, "grad_norm": 0.8898671269416809, "learning_rate": 9.351836328526563e-05, "loss": 1.5207, "step": 1740 }, { "epoch": 0.2756339581036384, "grad_norm": 0.8659180402755737, "learning_rate": 9.316984700024612e-05, "loss": 1.539, "step": 1750 }, { "epoch": 0.277209009292802, "grad_norm": 0.9182498455047607, "learning_rate": 9.2820069997942e-05, "loss": 1.5113, "step": 1760 }, { "epoch": 0.2787840604819657, "grad_norm": 0.9074522852897644, "learning_rate": 9.246904665631588e-05, "loss": 1.4885, "step": 1770 }, { "epoch": 0.2803591116711293, "grad_norm": 0.8520182967185974, "learning_rate": 9.211679140456242e-05, "loss": 1.5234, "step": 1780 }, { "epoch": 0.28193416286029294, "grad_norm": 0.8538583517074585, "learning_rate": 9.176331872251536e-05, "loss": 1.5097, "step": 1790 }, { "epoch": 0.2835092140494566, "grad_norm": 0.8584926128387451, "learning_rate": 9.140864314005222e-05, "loss": 1.5267, "step": 1800 }, { "epoch": 0.2835092140494566, "eval_loss": 1.517116665840149, "eval_runtime": 36.7679, "eval_samples_per_second": 13.626, "eval_steps_per_second": 13.626, "step": 1800 }, { "epoch": 0.28508426523862024, "grad_norm": 0.8618518710136414, "learning_rate": 9.105277923649698e-05, "loss": 1.4953, "step": 1810 }, { "epoch": 0.2866593164277839, "grad_norm": 0.8893195390701294, "learning_rate": 9.06957416400209e-05, "loss": 1.5071, "step": 1820 }, { "epoch": 0.28823436761694754, "grad_norm": 0.8630818724632263, "learning_rate": 9.03375450270412e-05, "loss": 1.506, "step": 1830 }, { "epoch": 0.2898094188061112, "grad_norm": 0.9402779936790466, "learning_rate": 8.997820412161764e-05, "loss": 1.5244, "step": 1840 }, { "epoch": 0.29138446999527484, "grad_norm": 0.8658491373062134, "learning_rate": 8.961773369484738e-05, "loss": 1.5435, "step": 1850 }, { "epoch": 0.2929595211844385, "grad_norm": 0.849281907081604, "learning_rate": 8.925614856425786e-05, "loss": 1.493, "step": 1860 }, { "epoch": 0.29453457237360214, "grad_norm": 0.9385392069816589, "learning_rate": 8.88934635931975e-05, "loss": 1.4993, "step": 1870 }, { "epoch": 0.29610962356276577, "grad_norm": 0.8613683581352234, "learning_rate": 8.852969369022494e-05, "loss": 1.5085, "step": 1880 }, { "epoch": 0.29768467475192945, "grad_norm": 0.8535075783729553, "learning_rate": 8.816485380849613e-05, "loss": 1.5198, "step": 1890 }, { "epoch": 0.29925972594109307, "grad_norm": 0.8856109976768494, "learning_rate": 8.779895894514961e-05, "loss": 1.5259, "step": 1900 }, { "epoch": 0.30083477713025675, "grad_norm": 0.8867731690406799, "learning_rate": 8.743202414069011e-05, "loss": 1.5249, "step": 1910 }, { "epoch": 0.30240982831942037, "grad_norm": 0.8741790652275085, "learning_rate": 8.706406447837023e-05, "loss": 1.5271, "step": 1920 }, { "epoch": 0.30398487950858405, "grad_norm": 0.8374512195587158, "learning_rate": 8.669509508357052e-05, "loss": 1.5227, "step": 1930 }, { "epoch": 0.3055599306977477, "grad_norm": 0.8484550714492798, "learning_rate": 8.632513112317761e-05, "loss": 1.5222, "step": 1940 }, { "epoch": 0.30713498188691135, "grad_norm": 0.86808842420578, "learning_rate": 8.59541878049609e-05, "loss": 1.5055, "step": 1950 }, { "epoch": 0.30713498188691135, "eval_loss": 1.5131734609603882, "eval_runtime": 36.9625, "eval_samples_per_second": 13.554, "eval_steps_per_second": 13.554, "step": 1950 }, { "epoch": 0.308710033076075, "grad_norm": 0.847622275352478, "learning_rate": 8.558228037694728e-05, "loss": 1.4996, "step": 1960 }, { "epoch": 0.3102850842652386, "grad_norm": 0.910416841506958, "learning_rate": 8.520942412679447e-05, "loss": 1.508, "step": 1970 }, { "epoch": 0.3118601354544023, "grad_norm": 0.9068947434425354, "learning_rate": 8.483563438116257e-05, "loss": 1.508, "step": 1980 }, { "epoch": 0.3134351866435659, "grad_norm": 0.8562408685684204, "learning_rate": 8.446092650508393e-05, "loss": 1.5022, "step": 1990 }, { "epoch": 0.3150102378327296, "grad_norm": 0.9227258563041687, "learning_rate": 8.408531590133172e-05, "loss": 1.5213, "step": 2000 }, { "epoch": 0.3165852890218932, "grad_norm": 0.8598424196243286, "learning_rate": 8.370881800978673e-05, "loss": 1.5171, "step": 2010 }, { "epoch": 0.3181603402110569, "grad_norm": 0.8680636286735535, "learning_rate": 8.333144830680262e-05, "loss": 1.5223, "step": 2020 }, { "epoch": 0.3197353914002205, "grad_norm": 0.8430407047271729, "learning_rate": 8.29532223045698e-05, "loss": 1.4946, "step": 2030 }, { "epoch": 0.3213104425893842, "grad_norm": 0.901833713054657, "learning_rate": 8.257415555047785e-05, "loss": 1.5113, "step": 2040 }, { "epoch": 0.3228854937785478, "grad_norm": 0.8412112593650818, "learning_rate": 8.21942636264763e-05, "loss": 1.5253, "step": 2050 }, { "epoch": 0.32446054496771143, "grad_norm": 0.8819128274917603, "learning_rate": 8.181356214843422e-05, "loss": 1.4934, "step": 2060 }, { "epoch": 0.3260355961568751, "grad_norm": 0.8501127362251282, "learning_rate": 8.143206676549826e-05, "loss": 1.5007, "step": 2070 }, { "epoch": 0.32761064734603873, "grad_norm": 0.8944729566574097, "learning_rate": 8.10497931594494e-05, "loss": 1.5156, "step": 2080 }, { "epoch": 0.3291856985352024, "grad_norm": 0.8759516477584839, "learning_rate": 8.066675704405836e-05, "loss": 1.4965, "step": 2090 }, { "epoch": 0.33076074972436603, "grad_norm": 0.8777790069580078, "learning_rate": 8.028297416443952e-05, "loss": 1.4847, "step": 2100 }, { "epoch": 0.33076074972436603, "eval_loss": 1.508542776107788, "eval_runtime": 35.4369, "eval_samples_per_second": 14.138, "eval_steps_per_second": 14.138, "step": 2100 }, { "epoch": 0.3323358009135297, "grad_norm": 0.8308836221694946, "learning_rate": 7.989846029640397e-05, "loss": 1.4937, "step": 2110 }, { "epoch": 0.33391085210269333, "grad_norm": 0.8855435252189636, "learning_rate": 7.951323124581069e-05, "loss": 1.5021, "step": 2120 }, { "epoch": 0.335485903291857, "grad_norm": 0.8671780824661255, "learning_rate": 7.91273028479172e-05, "loss": 1.5045, "step": 2130 }, { "epoch": 0.33706095448102064, "grad_norm": 0.8830945491790771, "learning_rate": 7.874069096672831e-05, "loss": 1.5186, "step": 2140 }, { "epoch": 0.33863600567018426, "grad_norm": 0.8480697274208069, "learning_rate": 7.83534114943442e-05, "loss": 1.5123, "step": 2150 }, { "epoch": 0.34021105685934794, "grad_norm": 0.8749104142189026, "learning_rate": 7.796548035030715e-05, "loss": 1.4982, "step": 2160 }, { "epoch": 0.34178610804851156, "grad_norm": 0.872046947479248, "learning_rate": 7.757691348094703e-05, "loss": 1.4944, "step": 2170 }, { "epoch": 0.34336115923767524, "grad_norm": 0.8679631948471069, "learning_rate": 7.718772685872595e-05, "loss": 1.4991, "step": 2180 }, { "epoch": 0.34493621042683886, "grad_norm": 0.8403679728507996, "learning_rate": 7.679793648158159e-05, "loss": 1.4881, "step": 2190 }, { "epoch": 0.34651126161600254, "grad_norm": 0.8893296718597412, "learning_rate": 7.640755837226965e-05, "loss": 1.5111, "step": 2200 }, { "epoch": 0.34808631280516616, "grad_norm": 0.9288545846939087, "learning_rate": 7.601660857770522e-05, "loss": 1.5076, "step": 2210 }, { "epoch": 0.3496613639943298, "grad_norm": 0.8662068247795105, "learning_rate": 7.562510316830308e-05, "loss": 1.5162, "step": 2220 }, { "epoch": 0.35123641518349347, "grad_norm": 0.8514265418052673, "learning_rate": 7.523305823731723e-05, "loss": 1.4988, "step": 2230 }, { "epoch": 0.3528114663726571, "grad_norm": 0.866111695766449, "learning_rate": 7.484048990017919e-05, "loss": 1.5058, "step": 2240 }, { "epoch": 0.35438651756182077, "grad_norm": 0.8357524871826172, "learning_rate": 7.444741429383578e-05, "loss": 1.5061, "step": 2250 }, { "epoch": 0.35438651756182077, "eval_loss": 1.5059038400650024, "eval_runtime": 35.4488, "eval_samples_per_second": 14.133, "eval_steps_per_second": 14.133, "step": 2250 }, { "epoch": 0.3559615687509844, "grad_norm": 0.8725208640098572, "learning_rate": 7.405384757608555e-05, "loss": 1.4972, "step": 2260 }, { "epoch": 0.35753661994014807, "grad_norm": 0.9390593767166138, "learning_rate": 7.36598059249148e-05, "loss": 1.5082, "step": 2270 }, { "epoch": 0.3591116711293117, "grad_norm": 0.8906267881393433, "learning_rate": 7.326530553783243e-05, "loss": 1.5092, "step": 2280 }, { "epoch": 0.3606867223184754, "grad_norm": 0.8382712602615356, "learning_rate": 7.287036263120425e-05, "loss": 1.4973, "step": 2290 }, { "epoch": 0.362261773507639, "grad_norm": 0.8869454264640808, "learning_rate": 7.247499343958621e-05, "loss": 1.4954, "step": 2300 }, { "epoch": 0.3638368246968026, "grad_norm": 0.8357947468757629, "learning_rate": 7.207921421505724e-05, "loss": 1.5114, "step": 2310 }, { "epoch": 0.3654118758859663, "grad_norm": 0.8370199799537659, "learning_rate": 7.168304122655113e-05, "loss": 1.5215, "step": 2320 }, { "epoch": 0.3669869270751299, "grad_norm": 0.8827706575393677, "learning_rate": 7.128649075918768e-05, "loss": 1.4958, "step": 2330 }, { "epoch": 0.3685619782642936, "grad_norm": 0.8319584727287292, "learning_rate": 7.088957911360347e-05, "loss": 1.5004, "step": 2340 }, { "epoch": 0.3701370294534572, "grad_norm": 0.8250936269760132, "learning_rate": 7.049232260528163e-05, "loss": 1.5118, "step": 2350 }, { "epoch": 0.3717120806426209, "grad_norm": 0.8642278909683228, "learning_rate": 7.009473756388128e-05, "loss": 1.5154, "step": 2360 }, { "epoch": 0.3732871318317845, "grad_norm": 0.8907039165496826, "learning_rate": 6.969684033256622e-05, "loss": 1.4889, "step": 2370 }, { "epoch": 0.3748621830209482, "grad_norm": 0.8550412058830261, "learning_rate": 6.92986472673332e-05, "loss": 1.5254, "step": 2380 }, { "epoch": 0.3764372342101118, "grad_norm": 0.9388350248336792, "learning_rate": 6.890017473633946e-05, "loss": 1.5123, "step": 2390 }, { "epoch": 0.37801228539927545, "grad_norm": 0.860334038734436, "learning_rate": 6.850143911923011e-05, "loss": 1.513, "step": 2400 }, { "epoch": 0.37801228539927545, "eval_loss": 1.5027722120285034, "eval_runtime": 35.9341, "eval_samples_per_second": 13.942, "eval_steps_per_second": 13.942, "step": 2400 }, { "epoch": 0.37958733658843913, "grad_norm": 0.8568481206893921, "learning_rate": 6.81024568064646e-05, "loss": 1.5119, "step": 2410 }, { "epoch": 0.38116238777760275, "grad_norm": 0.8408715128898621, "learning_rate": 6.770324419864309e-05, "loss": 1.5051, "step": 2420 }, { "epoch": 0.38273743896676643, "grad_norm": 0.8416137099266052, "learning_rate": 6.73038177058323e-05, "loss": 1.508, "step": 2430 }, { "epoch": 0.38431249015593005, "grad_norm": 0.8409276604652405, "learning_rate": 6.690419374689087e-05, "loss": 1.4854, "step": 2440 }, { "epoch": 0.38588754134509373, "grad_norm": 0.8319918513298035, "learning_rate": 6.650438874879456e-05, "loss": 1.5115, "step": 2450 }, { "epoch": 0.38746259253425736, "grad_norm": 0.8504980802536011, "learning_rate": 6.61044191459609e-05, "loss": 1.5088, "step": 2460 }, { "epoch": 0.38903764372342103, "grad_norm": 0.9393388628959656, "learning_rate": 6.57043013795737e-05, "loss": 1.5088, "step": 2470 }, { "epoch": 0.39061269491258466, "grad_norm": 0.8659742474555969, "learning_rate": 6.530405189690719e-05, "loss": 1.4848, "step": 2480 }, { "epoch": 0.3921877461017483, "grad_norm": 0.8352620005607605, "learning_rate": 6.49036871506499e-05, "loss": 1.4941, "step": 2490 }, { "epoch": 0.39376279729091196, "grad_norm": 0.8595870733261108, "learning_rate": 6.450322359822846e-05, "loss": 1.5065, "step": 2500 }, { "epoch": 0.3953378484800756, "grad_norm": 0.9233240485191345, "learning_rate": 6.410267770113098e-05, "loss": 1.4957, "step": 2510 }, { "epoch": 0.39691289966923926, "grad_norm": 0.8546603322029114, "learning_rate": 6.370206592423045e-05, "loss": 1.4699, "step": 2520 }, { "epoch": 0.3984879508584029, "grad_norm": 0.8289804458618164, "learning_rate": 6.330140473510796e-05, "loss": 1.5038, "step": 2530 }, { "epoch": 0.40006300204756656, "grad_norm": 0.8960427641868591, "learning_rate": 6.29007106033757e-05, "loss": 1.4959, "step": 2540 }, { "epoch": 0.4016380532367302, "grad_norm": 0.8288776874542236, "learning_rate": 6.25e-05, "loss": 1.5089, "step": 2550 }, { "epoch": 0.4016380532367302, "eval_loss": 1.500240683555603, "eval_runtime": 36.1286, "eval_samples_per_second": 13.867, "eval_steps_per_second": 13.867, "step": 2550 }, { "epoch": 0.40321310442589386, "grad_norm": 0.8510929942131042, "learning_rate": 6.20992893966243e-05, "loss": 1.4884, "step": 2560 }, { "epoch": 0.4047881556150575, "grad_norm": 0.8261246681213379, "learning_rate": 6.169859526489204e-05, "loss": 1.5104, "step": 2570 }, { "epoch": 0.4063632068042211, "grad_norm": 0.8622429370880127, "learning_rate": 6.129793407576955e-05, "loss": 1.4953, "step": 2580 }, { "epoch": 0.4079382579933848, "grad_norm": 0.8522881865501404, "learning_rate": 6.089732229886904e-05, "loss": 1.4839, "step": 2590 }, { "epoch": 0.4095133091825484, "grad_norm": 0.891878068447113, "learning_rate": 6.049677640177155e-05, "loss": 1.5085, "step": 2600 }, { "epoch": 0.4110883603717121, "grad_norm": 0.8279666900634766, "learning_rate": 6.00963128493501e-05, "loss": 1.5266, "step": 2610 }, { "epoch": 0.4126634115608757, "grad_norm": 0.8490031361579895, "learning_rate": 5.969594810309284e-05, "loss": 1.5034, "step": 2620 }, { "epoch": 0.4142384627500394, "grad_norm": 0.8807093501091003, "learning_rate": 5.929569862042631e-05, "loss": 1.5154, "step": 2630 }, { "epoch": 0.415813513939203, "grad_norm": 0.8721035122871399, "learning_rate": 5.889558085403911e-05, "loss": 1.4948, "step": 2640 }, { "epoch": 0.4173885651283667, "grad_norm": 0.8579753041267395, "learning_rate": 5.849561125120545e-05, "loss": 1.5139, "step": 2650 }, { "epoch": 0.4189636163175303, "grad_norm": 0.8881575465202332, "learning_rate": 5.809580625310912e-05, "loss": 1.511, "step": 2660 }, { "epoch": 0.42053866750669394, "grad_norm": 0.8358275294303894, "learning_rate": 5.769618229416773e-05, "loss": 1.5035, "step": 2670 }, { "epoch": 0.4221137186958576, "grad_norm": 0.8375245332717896, "learning_rate": 5.7296755801356926e-05, "loss": 1.5117, "step": 2680 }, { "epoch": 0.42368876988502124, "grad_norm": 0.8418902158737183, "learning_rate": 5.6897543193535414e-05, "loss": 1.4985, "step": 2690 }, { "epoch": 0.4252638210741849, "grad_norm": 0.8790860772132874, "learning_rate": 5.649856088076989e-05, "loss": 1.497, "step": 2700 }, { "epoch": 0.4252638210741849, "eval_loss": 1.4972805976867676, "eval_runtime": 36.5785, "eval_samples_per_second": 13.697, "eval_steps_per_second": 13.697, "step": 2700 }, { "epoch": 0.42683887226334855, "grad_norm": 0.885306179523468, "learning_rate": 5.609982526366054e-05, "loss": 1.4825, "step": 2710 }, { "epoch": 0.4284139234525122, "grad_norm": 0.9426984786987305, "learning_rate": 5.570135273266683e-05, "loss": 1.5129, "step": 2720 }, { "epoch": 0.42998897464167585, "grad_norm": 0.8404316902160645, "learning_rate": 5.53031596674338e-05, "loss": 1.4818, "step": 2730 }, { "epoch": 0.4315640258308395, "grad_norm": 0.9347336292266846, "learning_rate": 5.490526243611873e-05, "loss": 1.5109, "step": 2740 }, { "epoch": 0.43313907702000315, "grad_norm": 0.9323931336402893, "learning_rate": 5.450767739471837e-05, "loss": 1.4973, "step": 2750 }, { "epoch": 0.4347141282091668, "grad_norm": 0.8658491373062134, "learning_rate": 5.411042088639655e-05, "loss": 1.4957, "step": 2760 }, { "epoch": 0.43628917939833045, "grad_norm": 0.8817048072814941, "learning_rate": 5.371350924081234e-05, "loss": 1.5003, "step": 2770 }, { "epoch": 0.4378642305874941, "grad_norm": 0.8737426996231079, "learning_rate": 5.331695877344888e-05, "loss": 1.4831, "step": 2780 }, { "epoch": 0.43943928177665775, "grad_norm": 0.8520674705505371, "learning_rate": 5.292078578494275e-05, "loss": 1.4753, "step": 2790 }, { "epoch": 0.4410143329658214, "grad_norm": 0.8379813432693481, "learning_rate": 5.2525006560413816e-05, "loss": 1.5039, "step": 2800 }, { "epoch": 0.44258938415498505, "grad_norm": 0.8281566500663757, "learning_rate": 5.212963736879578e-05, "loss": 1.4906, "step": 2810 }, { "epoch": 0.4441644353441487, "grad_norm": 0.8392037749290466, "learning_rate": 5.173469446216757e-05, "loss": 1.4762, "step": 2820 }, { "epoch": 0.44573948653331236, "grad_norm": 0.861897885799408, "learning_rate": 5.134019407508521e-05, "loss": 1.5046, "step": 2830 }, { "epoch": 0.447314537722476, "grad_norm": 0.8166014552116394, "learning_rate": 5.0946152423914456e-05, "loss": 1.491, "step": 2840 }, { "epoch": 0.4488895889116396, "grad_norm": 0.8650168776512146, "learning_rate": 5.0552585706164246e-05, "loss": 1.4942, "step": 2850 }, { "epoch": 0.4488895889116396, "eval_loss": 1.4944835901260376, "eval_runtime": 35.972, "eval_samples_per_second": 13.927, "eval_steps_per_second": 13.927, "step": 2850 }, { "epoch": 0.4504646401008033, "grad_norm": 0.8844559192657471, "learning_rate": 5.015951009982081e-05, "loss": 1.4926, "step": 2860 }, { "epoch": 0.4520396912899669, "grad_norm": 0.8611796498298645, "learning_rate": 4.976694176268278e-05, "loss": 1.4975, "step": 2870 }, { "epoch": 0.4536147424791306, "grad_norm": 0.8338538408279419, "learning_rate": 4.937489683169692e-05, "loss": 1.4958, "step": 2880 }, { "epoch": 0.4551897936682942, "grad_norm": 0.9034844040870667, "learning_rate": 4.8983391422294786e-05, "loss": 1.5194, "step": 2890 }, { "epoch": 0.4567648448574579, "grad_norm": 0.8422059416770935, "learning_rate": 4.8592441627730355e-05, "loss": 1.5007, "step": 2900 }, { "epoch": 0.4583398960466215, "grad_norm": 0.8396774530410767, "learning_rate": 4.820206351841842e-05, "loss": 1.4905, "step": 2910 }, { "epoch": 0.4599149472357852, "grad_norm": 0.8403754830360413, "learning_rate": 4.781227314127405e-05, "loss": 1.5033, "step": 2920 }, { "epoch": 0.4614899984249488, "grad_norm": 0.8683463931083679, "learning_rate": 4.7423086519052966e-05, "loss": 1.4917, "step": 2930 }, { "epoch": 0.46306504961411243, "grad_norm": 0.8298001289367676, "learning_rate": 4.703451964969287e-05, "loss": 1.511, "step": 2940 }, { "epoch": 0.4646401008032761, "grad_norm": 0.8793701529502869, "learning_rate": 4.66465885056558e-05, "loss": 1.4906, "step": 2950 }, { "epoch": 0.46621515199243974, "grad_norm": 0.8985243439674377, "learning_rate": 4.62593090332717e-05, "loss": 1.4873, "step": 2960 }, { "epoch": 0.4677902031816034, "grad_norm": 0.834176778793335, "learning_rate": 4.587269715208281e-05, "loss": 1.4886, "step": 2970 }, { "epoch": 0.46936525437076704, "grad_norm": 0.8913531303405762, "learning_rate": 4.5486768754189305e-05, "loss": 1.4952, "step": 2980 }, { "epoch": 0.4709403055599307, "grad_norm": 0.8446453213691711, "learning_rate": 4.510153970359606e-05, "loss": 1.5149, "step": 2990 }, { "epoch": 0.47251535674909434, "grad_norm": 0.9674009680747986, "learning_rate": 4.4717025835560476e-05, "loss": 1.4887, "step": 3000 }, { "epoch": 0.47251535674909434, "eval_loss": 1.4920191764831543, "eval_runtime": 35.8777, "eval_samples_per_second": 13.964, "eval_steps_per_second": 13.964, "step": 3000 }, { "epoch": 0.474090407938258, "grad_norm": 0.8722410202026367, "learning_rate": 4.433324295594166e-05, "loss": 1.4964, "step": 3010 }, { "epoch": 0.47566545912742164, "grad_norm": 0.9078059792518616, "learning_rate": 4.3950206840550585e-05, "loss": 1.4928, "step": 3020 }, { "epoch": 0.47724051031658526, "grad_norm": 0.8529486060142517, "learning_rate": 4.3567933234501746e-05, "loss": 1.5025, "step": 3030 }, { "epoch": 0.47881556150574894, "grad_norm": 0.848800003528595, "learning_rate": 4.318643785156579e-05, "loss": 1.4942, "step": 3040 }, { "epoch": 0.48039061269491257, "grad_norm": 0.844018816947937, "learning_rate": 4.280573637352371e-05, "loss": 1.4918, "step": 3050 }, { "epoch": 0.48196566388407625, "grad_norm": 0.8629136085510254, "learning_rate": 4.242584444952216e-05, "loss": 1.5129, "step": 3060 }, { "epoch": 0.48354071507323987, "grad_norm": 0.9015205502510071, "learning_rate": 4.204677769543019e-05, "loss": 1.4685, "step": 3070 }, { "epoch": 0.48511576626240355, "grad_norm": 0.8628337979316711, "learning_rate": 4.16685516931974e-05, "loss": 1.5157, "step": 3080 }, { "epoch": 0.48669081745156717, "grad_norm": 0.8275268077850342, "learning_rate": 4.1291181990213286e-05, "loss": 1.4864, "step": 3090 }, { "epoch": 0.48826586864073085, "grad_norm": 0.8484097719192505, "learning_rate": 4.0914684098668286e-05, "loss": 1.4861, "step": 3100 }, { "epoch": 0.48984091982989447, "grad_norm": 0.8497301936149597, "learning_rate": 4.053907349491608e-05, "loss": 1.4999, "step": 3110 }, { "epoch": 0.4914159710190581, "grad_norm": 0.8445069789886475, "learning_rate": 4.016436561883746e-05, "loss": 1.4845, "step": 3120 }, { "epoch": 0.4929910222082218, "grad_norm": 0.8241081833839417, "learning_rate": 3.979057587320554e-05, "loss": 1.4993, "step": 3130 }, { "epoch": 0.4945660733973854, "grad_norm": 0.8906779289245605, "learning_rate": 3.941771962305274e-05, "loss": 1.4864, "step": 3140 }, { "epoch": 0.4961411245865491, "grad_norm": 0.9039560556411743, "learning_rate": 3.9045812195039125e-05, "loss": 1.479, "step": 3150 }, { "epoch": 0.4961411245865491, "eval_loss": 1.490482211112976, "eval_runtime": 35.4913, "eval_samples_per_second": 14.116, "eval_steps_per_second": 14.116, "step": 3150 }, { "epoch": 0.4977161757757127, "grad_norm": 0.8476235270500183, "learning_rate": 3.8674868876822395e-05, "loss": 1.5039, "step": 3160 }, { "epoch": 0.4992912269648764, "grad_norm": 0.8413540720939636, "learning_rate": 3.83049049164295e-05, "loss": 1.5041, "step": 3170 }, { "epoch": 0.5008662781540401, "grad_norm": 0.8997570872306824, "learning_rate": 3.793593552162978e-05, "loss": 1.489, "step": 3180 }, { "epoch": 0.5024413293432036, "grad_norm": 0.8490859270095825, "learning_rate": 3.75679758593099e-05, "loss": 1.4897, "step": 3190 }, { "epoch": 0.5040163805323673, "grad_norm": 0.9092013835906982, "learning_rate": 3.720104105485039e-05, "loss": 1.5053, "step": 3200 }, { "epoch": 0.505591431721531, "grad_norm": 0.8679947853088379, "learning_rate": 3.6835146191503885e-05, "loss": 1.4871, "step": 3210 }, { "epoch": 0.5071664829106945, "grad_norm": 0.842693567276001, "learning_rate": 3.647030630977508e-05, "loss": 1.4863, "step": 3220 }, { "epoch": 0.5087415340998582, "grad_norm": 0.8691567182540894, "learning_rate": 3.6106536406802524e-05, "loss": 1.4866, "step": 3230 }, { "epoch": 0.5103165852890219, "grad_norm": 0.8479135036468506, "learning_rate": 3.5743851435742176e-05, "loss": 1.4759, "step": 3240 }, { "epoch": 0.5118916364781856, "grad_norm": 0.819887638092041, "learning_rate": 3.538226630515262e-05, "loss": 1.4833, "step": 3250 }, { "epoch": 0.5134666876673492, "grad_norm": 0.881779670715332, "learning_rate": 3.502179587838238e-05, "loss": 1.498, "step": 3260 }, { "epoch": 0.5150417388565128, "grad_norm": 0.8957091569900513, "learning_rate": 3.46624549729588e-05, "loss": 1.5042, "step": 3270 }, { "epoch": 0.5166167900456765, "grad_norm": 0.8583941459655762, "learning_rate": 3.430425835997908e-05, "loss": 1.5151, "step": 3280 }, { "epoch": 0.5181918412348401, "grad_norm": 0.8976061940193176, "learning_rate": 3.394722076350302e-05, "loss": 1.4841, "step": 3290 }, { "epoch": 0.5197668924240038, "grad_norm": 0.8954448699951172, "learning_rate": 3.359135685994781e-05, "loss": 1.4981, "step": 3300 }, { "epoch": 0.5197668924240038, "eval_loss": 1.4886995553970337, "eval_runtime": 36.2455, "eval_samples_per_second": 13.822, "eval_steps_per_second": 13.822, "step": 3300 }, { "epoch": 0.5213419436131674, "grad_norm": 0.8868695497512817, "learning_rate": 3.3236681277484654e-05, "loss": 1.4895, "step": 3310 }, { "epoch": 0.5229169948023311, "grad_norm": 0.8330621123313904, "learning_rate": 3.2883208595437584e-05, "loss": 1.5, "step": 3320 }, { "epoch": 0.5244920459914947, "grad_norm": 0.9283692240715027, "learning_rate": 3.2530953343684136e-05, "loss": 1.4892, "step": 3330 }, { "epoch": 0.5260670971806584, "grad_norm": 0.841469943523407, "learning_rate": 3.217993000205799e-05, "loss": 1.5036, "step": 3340 }, { "epoch": 0.527642148369822, "grad_norm": 0.9198566675186157, "learning_rate": 3.1830152999753903e-05, "loss": 1.4751, "step": 3350 }, { "epoch": 0.5292171995589857, "grad_norm": 0.8299483060836792, "learning_rate": 3.148163671473439e-05, "loss": 1.468, "step": 3360 }, { "epoch": 0.5307922507481493, "grad_norm": 0.858586311340332, "learning_rate": 3.113439547313892e-05, "loss": 1.469, "step": 3370 }, { "epoch": 0.532367301937313, "grad_norm": 0.8130866885185242, "learning_rate": 3.0788443548694874e-05, "loss": 1.481, "step": 3380 }, { "epoch": 0.5339423531264766, "grad_norm": 0.8363484740257263, "learning_rate": 3.0443795162130876e-05, "loss": 1.4855, "step": 3390 }, { "epoch": 0.5355174043156402, "grad_norm": 0.8633217811584473, "learning_rate": 3.0100464480592185e-05, "loss": 1.491, "step": 3400 }, { "epoch": 0.5370924555048039, "grad_norm": 0.8282185196876526, "learning_rate": 2.9758465617058404e-05, "loss": 1.4894, "step": 3410 }, { "epoch": 0.5386675066939676, "grad_norm": 0.8978461027145386, "learning_rate": 2.9417812629763285e-05, "loss": 1.4821, "step": 3420 }, { "epoch": 0.5402425578831312, "grad_norm": 0.8958382606506348, "learning_rate": 2.9078519521616894e-05, "loss": 1.5043, "step": 3430 }, { "epoch": 0.5418176090722948, "grad_norm": 0.8614096641540527, "learning_rate": 2.8740600239630002e-05, "loss": 1.4882, "step": 3440 }, { "epoch": 0.5433926602614585, "grad_norm": 0.846077024936676, "learning_rate": 2.8404068674340714e-05, "loss": 1.4775, "step": 3450 }, { "epoch": 0.5433926602614585, "eval_loss": 1.486568808555603, "eval_runtime": 35.5801, "eval_samples_per_second": 14.081, "eval_steps_per_second": 14.081, "step": 3450 }, { "epoch": 0.5449677114506222, "grad_norm": 0.8439909815788269, "learning_rate": 2.80689386592436e-05, "loss": 1.4801, "step": 3460 }, { "epoch": 0.5465427626397857, "grad_norm": 0.866398274898529, "learning_rate": 2.7735223970220955e-05, "loss": 1.4751, "step": 3470 }, { "epoch": 0.5481178138289494, "grad_norm": 0.8495659232139587, "learning_rate": 2.7402938324976576e-05, "loss": 1.4839, "step": 3480 }, { "epoch": 0.5496928650181131, "grad_norm": 0.8776513934135437, "learning_rate": 2.70720953824719e-05, "loss": 1.4803, "step": 3490 }, { "epoch": 0.5512679162072768, "grad_norm": 0.8576133251190186, "learning_rate": 2.674270874236441e-05, "loss": 1.4799, "step": 3500 }, { "epoch": 0.5528429673964403, "grad_norm": 0.8660094141960144, "learning_rate": 2.64147919444488e-05, "loss": 1.4933, "step": 3510 }, { "epoch": 0.554418018585604, "grad_norm": 0.8755300045013428, "learning_rate": 2.6088358468100247e-05, "loss": 1.5073, "step": 3520 }, { "epoch": 0.5559930697747677, "grad_norm": 0.8800572156906128, "learning_rate": 2.5763421731720435e-05, "loss": 1.5244, "step": 3530 }, { "epoch": 0.5575681209639314, "grad_norm": 0.8597940802574158, "learning_rate": 2.5439995092185892e-05, "loss": 1.4697, "step": 3540 }, { "epoch": 0.559143172153095, "grad_norm": 0.8852632641792297, "learning_rate": 2.5118091844299e-05, "loss": 1.4786, "step": 3550 }, { "epoch": 0.5607182233422586, "grad_norm": 0.84444659948349, "learning_rate": 2.479772522024147e-05, "loss": 1.4842, "step": 3560 }, { "epoch": 0.5622932745314223, "grad_norm": 0.8705389499664307, "learning_rate": 2.4478908389030427e-05, "loss": 1.4817, "step": 3570 }, { "epoch": 0.5638683257205859, "grad_norm": 0.8876937627792358, "learning_rate": 2.41616544559771e-05, "loss": 1.5087, "step": 3580 }, { "epoch": 0.5654433769097496, "grad_norm": 0.8541763424873352, "learning_rate": 2.3845976462148033e-05, "loss": 1.4908, "step": 3590 }, { "epoch": 0.5670184280989132, "grad_norm": 0.8582256436347961, "learning_rate": 2.3531887383829157e-05, "loss": 1.4732, "step": 3600 }, { "epoch": 0.5670184280989132, "eval_loss": 1.485674262046814, "eval_runtime": 37.1809, "eval_samples_per_second": 13.475, "eval_steps_per_second": 13.475, "step": 3600 }, { "epoch": 0.5685934792880769, "grad_norm": 0.8538834452629089, "learning_rate": 2.3219400131992273e-05, "loss": 1.4944, "step": 3610 }, { "epoch": 0.5701685304772405, "grad_norm": 0.8310463428497314, "learning_rate": 2.2908527551764404e-05, "loss": 1.4637, "step": 3620 }, { "epoch": 0.5717435816664042, "grad_norm": 0.9042952656745911, "learning_rate": 2.259928242189966e-05, "loss": 1.4798, "step": 3630 }, { "epoch": 0.5733186328555678, "grad_norm": 0.8902590274810791, "learning_rate": 2.2291677454254136e-05, "loss": 1.4934, "step": 3640 }, { "epoch": 0.5748936840447314, "grad_norm": 0.8976852297782898, "learning_rate": 2.1985725293263237e-05, "loss": 1.4773, "step": 3650 }, { "epoch": 0.5764687352338951, "grad_norm": 0.8807442784309387, "learning_rate": 2.1681438515421953e-05, "loss": 1.4888, "step": 3660 }, { "epoch": 0.5780437864230588, "grad_norm": 0.861380934715271, "learning_rate": 2.1378829628767965e-05, "loss": 1.4996, "step": 3670 }, { "epoch": 0.5796188376122224, "grad_norm": 0.9131867289543152, "learning_rate": 2.1077911072367317e-05, "loss": 1.5014, "step": 3680 }, { "epoch": 0.581193888801386, "grad_norm": 0.8426468968391418, "learning_rate": 2.077869521580325e-05, "loss": 1.4832, "step": 3690 }, { "epoch": 0.5827689399905497, "grad_norm": 0.8558875918388367, "learning_rate": 2.0481194358667695e-05, "loss": 1.4796, "step": 3700 }, { "epoch": 0.5843439911797134, "grad_norm": 0.8693471550941467, "learning_rate": 2.018542073005567e-05, "loss": 1.4933, "step": 3710 }, { "epoch": 0.585919042368877, "grad_norm": 0.9600664973258972, "learning_rate": 1.9891386488062538e-05, "loss": 1.5139, "step": 3720 }, { "epoch": 0.5874940935580406, "grad_norm": 0.8897980451583862, "learning_rate": 1.959910371928436e-05, "loss": 1.5171, "step": 3730 }, { "epoch": 0.5890691447472043, "grad_norm": 0.8953452110290527, "learning_rate": 1.930858443832096e-05, "loss": 1.4849, "step": 3740 }, { "epoch": 0.590644195936368, "grad_norm": 0.8558352589607239, "learning_rate": 1.90198405872821e-05, "loss": 1.4922, "step": 3750 }, { "epoch": 0.590644195936368, "eval_loss": 1.4844595193862915, "eval_runtime": 36.1848, "eval_samples_per_second": 13.846, "eval_steps_per_second": 13.846, "step": 3750 }, { "epoch": 0.5922192471255315, "grad_norm": 0.8563032150268555, "learning_rate": 1.8732884035296582e-05, "loss": 1.4821, "step": 3760 }, { "epoch": 0.5937942983146952, "grad_norm": 0.8764299154281616, "learning_rate": 1.844772657802428e-05, "loss": 1.4916, "step": 3770 }, { "epoch": 0.5953693495038589, "grad_norm": 0.8964916467666626, "learning_rate": 1.8164379937171382e-05, "loss": 1.4733, "step": 3780 }, { "epoch": 0.5969444006930226, "grad_norm": 0.9039140343666077, "learning_rate": 1.7882855760008547e-05, "loss": 1.5123, "step": 3790 }, { "epoch": 0.5985194518821861, "grad_norm": 0.8859601020812988, "learning_rate": 1.760316561889203e-05, "loss": 1.4859, "step": 3800 }, { "epoch": 0.6000945030713498, "grad_norm": 0.8582342267036438, "learning_rate": 1.7325321010788034e-05, "loss": 1.4856, "step": 3810 }, { "epoch": 0.6016695542605135, "grad_norm": 0.8299849629402161, "learning_rate": 1.7049333356800167e-05, "loss": 1.4852, "step": 3820 }, { "epoch": 0.6032446054496771, "grad_norm": 0.8748874068260193, "learning_rate": 1.6775214001699914e-05, "loss": 1.477, "step": 3830 }, { "epoch": 0.6048196566388407, "grad_norm": 0.9403565526008606, "learning_rate": 1.6502974213460316e-05, "loss": 1.4734, "step": 3840 }, { "epoch": 0.6063947078280044, "grad_norm": 0.8735081553459167, "learning_rate": 1.623262518279279e-05, "loss": 1.5099, "step": 3850 }, { "epoch": 0.6079697590171681, "grad_norm": 0.8424332141876221, "learning_rate": 1.596417802268707e-05, "loss": 1.4852, "step": 3860 }, { "epoch": 0.6095448102063317, "grad_norm": 0.8874250650405884, "learning_rate": 1.5697643767954488e-05, "loss": 1.4814, "step": 3870 }, { "epoch": 0.6111198613954953, "grad_norm": 0.9068039059638977, "learning_rate": 1.543303337477432e-05, "loss": 1.4903, "step": 3880 }, { "epoch": 0.612694912584659, "grad_norm": 0.9054557681083679, "learning_rate": 1.517035772024343e-05, "loss": 1.5071, "step": 3890 }, { "epoch": 0.6142699637738227, "grad_norm": 0.8765373229980469, "learning_rate": 1.49096276019291e-05, "loss": 1.4716, "step": 3900 }, { "epoch": 0.6142699637738227, "eval_loss": 1.4831758737564087, "eval_runtime": 37.0102, "eval_samples_per_second": 13.537, "eval_steps_per_second": 13.537, "step": 3900 }, { "epoch": 0.6158450149629863, "grad_norm": 0.9288606643676758, "learning_rate": 1.4650853737425327e-05, "loss": 1.4683, "step": 3910 }, { "epoch": 0.61742006615215, "grad_norm": 0.9101633429527283, "learning_rate": 1.4394046763912122e-05, "loss": 1.4706, "step": 3920 }, { "epoch": 0.6189951173413136, "grad_norm": 0.8837119936943054, "learning_rate": 1.413921723771832e-05, "loss": 1.4799, "step": 3930 }, { "epoch": 0.6205701685304772, "grad_norm": 0.962098240852356, "learning_rate": 1.3886375633887665e-05, "loss": 1.5042, "step": 3940 }, { "epoch": 0.6221452197196409, "grad_norm": 0.8238315582275391, "learning_rate": 1.3635532345748137e-05, "loss": 1.4932, "step": 3950 }, { "epoch": 0.6237202709088046, "grad_norm": 0.9068928360939026, "learning_rate": 1.3386697684484853e-05, "loss": 1.4743, "step": 3960 }, { "epoch": 0.6252953220979682, "grad_norm": 0.8136940598487854, "learning_rate": 1.3139881878716107e-05, "loss": 1.4727, "step": 3970 }, { "epoch": 0.6268703732871318, "grad_norm": 0.8796588182449341, "learning_rate": 1.2895095074072986e-05, "loss": 1.4681, "step": 3980 }, { "epoch": 0.6284454244762955, "grad_norm": 0.8927239775657654, "learning_rate": 1.2652347332782227e-05, "loss": 1.4871, "step": 3990 }, { "epoch": 0.6300204756654592, "grad_norm": 0.8667739629745483, "learning_rate": 1.2411648633252719e-05, "loss": 1.4784, "step": 4000 }, { "epoch": 0.6315955268546227, "grad_norm": 0.8735933303833008, "learning_rate": 1.2173008869665241e-05, "loss": 1.4659, "step": 4010 }, { "epoch": 0.6331705780437864, "grad_norm": 0.8920369148254395, "learning_rate": 1.1936437851565791e-05, "loss": 1.4711, "step": 4020 }, { "epoch": 0.6347456292329501, "grad_norm": 0.854413628578186, "learning_rate": 1.1701945303462337e-05, "loss": 1.4877, "step": 4030 }, { "epoch": 0.6363206804221138, "grad_norm": 0.8223803639411926, "learning_rate": 1.146954086442508e-05, "loss": 1.477, "step": 4040 }, { "epoch": 0.6378957316112773, "grad_norm": 0.8744681477546692, "learning_rate": 1.1239234087690252e-05, "loss": 1.4864, "step": 4050 }, { "epoch": 0.6378957316112773, "eval_loss": 1.4826009273529053, "eval_runtime": 36.4666, "eval_samples_per_second": 13.739, "eval_steps_per_second": 13.739, "step": 4050 }, { "epoch": 0.639470782800441, "grad_norm": 0.879377543926239, "learning_rate": 1.1011034440267395e-05, "loss": 1.4908, "step": 4060 }, { "epoch": 0.6410458339896047, "grad_norm": 0.8856979608535767, "learning_rate": 1.078495130255023e-05, "loss": 1.4768, "step": 4070 }, { "epoch": 0.6426208851787684, "grad_norm": 0.8525427579879761, "learning_rate": 1.0560993967931004e-05, "loss": 1.4824, "step": 4080 }, { "epoch": 0.6441959363679319, "grad_norm": 0.8850044012069702, "learning_rate": 1.0339171642418585e-05, "loss": 1.4694, "step": 4090 }, { "epoch": 0.6457709875570956, "grad_norm": 0.9544363021850586, "learning_rate": 1.0119493444259963e-05, "loss": 1.4793, "step": 4100 }, { "epoch": 0.6473460387462593, "grad_norm": 0.8178799152374268, "learning_rate": 9.901968403565428e-06, "loss": 1.4607, "step": 4110 }, { "epoch": 0.6489210899354229, "grad_norm": 0.8692899346351624, "learning_rate": 9.686605461937441e-06, "loss": 1.4893, "step": 4120 }, { "epoch": 0.6504961411245865, "grad_norm": 0.8424128293991089, "learning_rate": 9.473413472102982e-06, "loss": 1.4903, "step": 4130 }, { "epoch": 0.6520711923137502, "grad_norm": 0.8360965847969055, "learning_rate": 9.262401197549744e-06, "loss": 1.4823, "step": 4140 }, { "epoch": 0.6536462435029139, "grad_norm": 0.8790252804756165, "learning_rate": 9.05357731216587e-06, "loss": 1.4884, "step": 4150 }, { "epoch": 0.6552212946920775, "grad_norm": 0.8617748022079468, "learning_rate": 8.846950399883368e-06, "loss": 1.4971, "step": 4160 }, { "epoch": 0.6567963458812411, "grad_norm": 0.8690015077590942, "learning_rate": 8.64252895432531e-06, "loss": 1.4819, "step": 4170 }, { "epoch": 0.6583713970704048, "grad_norm": 0.8620494604110718, "learning_rate": 8.440321378456656e-06, "loss": 1.4722, "step": 4180 }, { "epoch": 0.6599464482595684, "grad_norm": 0.8735961318016052, "learning_rate": 8.240335984238844e-06, "loss": 1.4877, "step": 4190 }, { "epoch": 0.6615214994487321, "grad_norm": 0.8974801301956177, "learning_rate": 8.042580992288163e-06, "loss": 1.4891, "step": 4200 }, { "epoch": 0.6615214994487321, "eval_loss": 1.4817373752593994, "eval_runtime": 35.5168, "eval_samples_per_second": 14.106, "eval_steps_per_second": 14.106, "step": 4200 }, { "epoch": 0.6630965506378957, "grad_norm": 0.8691059350967407, "learning_rate": 7.847064531537774e-06, "loss": 1.4908, "step": 4210 }, { "epoch": 0.6646716018270594, "grad_norm": 0.8714816570281982, "learning_rate": 7.653794638903574e-06, "loss": 1.4883, "step": 4220 }, { "epoch": 0.666246653016223, "grad_norm": 0.842909574508667, "learning_rate": 7.462779258953875e-06, "loss": 1.4752, "step": 4230 }, { "epoch": 0.6678217042053867, "grad_norm": 0.9124504327774048, "learning_rate": 7.274026243582796e-06, "loss": 1.4909, "step": 4240 }, { "epoch": 0.6693967553945503, "grad_norm": 0.9474470615386963, "learning_rate": 7.087543351687493e-06, "loss": 1.4917, "step": 4250 }, { "epoch": 0.670971806583714, "grad_norm": 0.8785407543182373, "learning_rate": 6.903338248849269e-06, "loss": 1.4958, "step": 4260 }, { "epoch": 0.6725468577728776, "grad_norm": 0.8614616990089417, "learning_rate": 6.7214185070183925e-06, "loss": 1.498, "step": 4270 }, { "epoch": 0.6741219089620413, "grad_norm": 0.8453344106674194, "learning_rate": 6.541791604202936e-06, "loss": 1.5008, "step": 4280 }, { "epoch": 0.675696960151205, "grad_norm": 0.8798742890357971, "learning_rate": 6.364464924161311e-06, "loss": 1.4749, "step": 4290 }, { "epoch": 0.6772720113403685, "grad_norm": 0.8620926737785339, "learning_rate": 6.1894457560988106e-06, "loss": 1.476, "step": 4300 }, { "epoch": 0.6788470625295322, "grad_norm": 0.8776394128799438, "learning_rate": 6.016741294367911e-06, "loss": 1.4864, "step": 4310 }, { "epoch": 0.6804221137186959, "grad_norm": 0.9075036644935608, "learning_rate": 5.846358638172615e-06, "loss": 1.4901, "step": 4320 }, { "epoch": 0.6819971649078596, "grad_norm": 0.8943276405334473, "learning_rate": 5.678304791276567e-06, "loss": 1.4913, "step": 4330 }, { "epoch": 0.6835722160970231, "grad_norm": 0.8560464382171631, "learning_rate": 5.51258666171519e-06, "loss": 1.4909, "step": 4340 }, { "epoch": 0.6851472672861868, "grad_norm": 0.9168397784233093, "learning_rate": 5.349211061511726e-06, "loss": 1.4596, "step": 4350 }, { "epoch": 0.6851472672861868, "eval_loss": 1.4812487363815308, "eval_runtime": 35.7079, "eval_samples_per_second": 14.031, "eval_steps_per_second": 14.031, "step": 4350 }, { "epoch": 0.6867223184753505, "grad_norm": 0.9016767144203186, "learning_rate": 5.188184706397182e-06, "loss": 1.4719, "step": 4360 }, { "epoch": 0.688297369664514, "grad_norm": 0.893745481967926, "learning_rate": 5.029514215534339e-06, "loss": 1.4829, "step": 4370 }, { "epoch": 0.6898724208536777, "grad_norm": 0.8733708262443542, "learning_rate": 4.873206111245594e-06, "loss": 1.4942, "step": 4380 }, { "epoch": 0.6914474720428414, "grad_norm": 0.8753581047058105, "learning_rate": 4.719266818744912e-06, "loss": 1.4839, "step": 4390 }, { "epoch": 0.6930225232320051, "grad_norm": 0.8269835710525513, "learning_rate": 4.567702665873648e-06, "loss": 1.4615, "step": 4400 }, { "epoch": 0.6945975744211687, "grad_norm": 0.9257881045341492, "learning_rate": 4.418519882840505e-06, "loss": 1.4939, "step": 4410 }, { "epoch": 0.6961726256103323, "grad_norm": 0.8763148188591003, "learning_rate": 4.271724601965371e-06, "loss": 1.4771, "step": 4420 }, { "epoch": 0.697747676799496, "grad_norm": 0.8695856332778931, "learning_rate": 4.127322857427306e-06, "loss": 1.4985, "step": 4430 }, { "epoch": 0.6993227279886596, "grad_norm": 0.8823320269584656, "learning_rate": 3.985320585016425e-06, "loss": 1.4825, "step": 4440 }, { "epoch": 0.7008977791778233, "grad_norm": 0.8587324619293213, "learning_rate": 3.845723621889973e-06, "loss": 1.4541, "step": 4450 }, { "epoch": 0.7024728303669869, "grad_norm": 0.8840261101722717, "learning_rate": 3.7085377063323447e-06, "loss": 1.4696, "step": 4460 }, { "epoch": 0.7040478815561506, "grad_norm": 0.9190268516540527, "learning_rate": 3.5737684775191887e-06, "loss": 1.4842, "step": 4470 }, { "epoch": 0.7056229327453142, "grad_norm": 0.8872419595718384, "learning_rate": 3.441421475285679e-06, "loss": 1.4892, "step": 4480 }, { "epoch": 0.7071979839344779, "grad_norm": 0.8445181250572205, "learning_rate": 3.3115021398986768e-06, "loss": 1.4875, "step": 4490 }, { "epoch": 0.7087730351236415, "grad_norm": 0.83912193775177, "learning_rate": 3.18401581183321e-06, "loss": 1.4774, "step": 4500 }, { "epoch": 0.7087730351236415, "eval_loss": 1.480732798576355, "eval_runtime": 37.196, "eval_samples_per_second": 13.469, "eval_steps_per_second": 13.469, "step": 4500 }, { "epoch": 0.7103480863128052, "grad_norm": 0.8629345893859863, "learning_rate": 3.0589677315529044e-06, "loss": 1.4749, "step": 4510 }, { "epoch": 0.7119231375019688, "grad_norm": 0.8621689677238464, "learning_rate": 2.9363630392945513e-06, "loss": 1.4772, "step": 4520 }, { "epoch": 0.7134981886911325, "grad_norm": 0.873957633972168, "learning_rate": 2.816206774856854e-06, "loss": 1.4937, "step": 4530 }, { "epoch": 0.7150732398802961, "grad_norm": 0.9086722135543823, "learning_rate": 2.6985038773932046e-06, "loss": 1.4512, "step": 4540 }, { "epoch": 0.7166482910694597, "grad_norm": 0.8475430607795715, "learning_rate": 2.583259185208714e-06, "loss": 1.4621, "step": 4550 }, { "epoch": 0.7182233422586234, "grad_norm": 0.8581358790397644, "learning_rate": 2.4704774355612943e-06, "loss": 1.4746, "step": 4560 }, { "epoch": 0.7197983934477871, "grad_norm": 0.8703014254570007, "learning_rate": 2.3601632644669536e-06, "loss": 1.4906, "step": 4570 }, { "epoch": 0.7213734446369507, "grad_norm": 0.9130226373672485, "learning_rate": 2.2523212065091723e-06, "loss": 1.4825, "step": 4580 }, { "epoch": 0.7229484958261143, "grad_norm": 0.8566045165061951, "learning_rate": 2.1469556946525706e-06, "loss": 1.4732, "step": 4590 }, { "epoch": 0.724523547015278, "grad_norm": 0.8569677472114563, "learning_rate": 2.0440710600606595e-06, "loss": 1.4725, "step": 4600 }, { "epoch": 0.7260985982044417, "grad_norm": 0.9196586012840271, "learning_rate": 1.9436715319177956e-06, "loss": 1.4832, "step": 4610 }, { "epoch": 0.7276736493936052, "grad_norm": 0.8641564249992371, "learning_rate": 1.8457612372553348e-06, "loss": 1.4994, "step": 4620 }, { "epoch": 0.7292487005827689, "grad_norm": 0.8218653798103333, "learning_rate": 1.75034420078201e-06, "loss": 1.4748, "step": 4630 }, { "epoch": 0.7308237517719326, "grad_norm": 0.8673424124717712, "learning_rate": 1.6574243447184597e-06, "loss": 1.4779, "step": 4640 }, { "epoch": 0.7323988029610963, "grad_norm": 0.8800205588340759, "learning_rate": 1.567005488636024e-06, "loss": 1.4927, "step": 4650 }, { "epoch": 0.7323988029610963, "eval_loss": 1.4804431200027466, "eval_runtime": 37.6922, "eval_samples_per_second": 13.292, "eval_steps_per_second": 13.292, "step": 4650 }, { "epoch": 0.7339738541502598, "grad_norm": 0.8832563757896423, "learning_rate": 1.4790913492997438e-06, "loss": 1.5031, "step": 4660 }, { "epoch": 0.7355489053394235, "grad_norm": 0.8596001863479614, "learning_rate": 1.3936855405155408e-06, "loss": 1.4974, "step": 4670 }, { "epoch": 0.7371239565285872, "grad_norm": 0.8389458656311035, "learning_rate": 1.3107915729816954e-06, "loss": 1.4873, "step": 4680 }, { "epoch": 0.7386990077177509, "grad_norm": 0.8755994439125061, "learning_rate": 1.230412854144547e-06, "loss": 1.4986, "step": 4690 }, { "epoch": 0.7402740589069144, "grad_norm": 0.8368931412696838, "learning_rate": 1.15255268805841e-06, "loss": 1.482, "step": 4700 }, { "epoch": 0.7418491100960781, "grad_norm": 0.853042721748352, "learning_rate": 1.0772142752497604e-06, "loss": 1.4824, "step": 4710 }, { "epoch": 0.7434241612852418, "grad_norm": 0.8514347672462463, "learning_rate": 1.004400712585646e-06, "loss": 1.4737, "step": 4720 }, { "epoch": 0.7449992124744054, "grad_norm": 0.8187866806983948, "learning_rate": 9.341149931464537e-07, "loss": 1.5147, "step": 4730 }, { "epoch": 0.746574263663569, "grad_norm": 0.8992719054222107, "learning_rate": 8.663600061028162e-07, "loss": 1.487, "step": 4740 }, { "epoch": 0.7481493148527327, "grad_norm": 0.834426999092102, "learning_rate": 8.011385365968641e-07, "loss": 1.4814, "step": 4750 }, { "epoch": 0.7497243660418964, "grad_norm": 0.8676533699035645, "learning_rate": 7.384532656277698e-07, "loss": 1.4703, "step": 4760 }, { "epoch": 0.75129941723106, "grad_norm": 0.8450558185577393, "learning_rate": 6.783067699414891e-07, "loss": 1.4648, "step": 4770 }, { "epoch": 0.7528744684202237, "grad_norm": 0.846495509147644, "learning_rate": 6.207015219248866e-07, "loss": 1.4792, "step": 4780 }, { "epoch": 0.7544495196093873, "grad_norm": 0.8761216998100281, "learning_rate": 5.656398895040813e-07, "loss": 1.4552, "step": 4790 }, { "epoch": 0.7560245707985509, "grad_norm": 0.8825677037239075, "learning_rate": 5.131241360471217e-07, "loss": 1.4959, "step": 4800 }, { "epoch": 0.7560245707985509, "eval_loss": 1.480470061302185, "eval_runtime": 35.9266, "eval_samples_per_second": 13.945, "eval_steps_per_second": 13.945, "step": 4800 }, { "epoch": 0.7575996219877146, "grad_norm": 0.8354045748710632, "learning_rate": 4.631564202709354e-07, "loss": 1.4743, "step": 4810 }, { "epoch": 0.7591746731768783, "grad_norm": 0.8703035712242126, "learning_rate": 4.1573879615262185e-07, "loss": 1.4686, "step": 4820 }, { "epoch": 0.7607497243660419, "grad_norm": 0.929268479347229, "learning_rate": 3.708732128449785e-07, "loss": 1.4695, "step": 4830 }, { "epoch": 0.7623247755552055, "grad_norm": 0.8741073608398438, "learning_rate": 3.2856151459641216e-07, "loss": 1.492, "step": 4840 }, { "epoch": 0.7638998267443692, "grad_norm": 0.8539476990699768, "learning_rate": 2.888054406751106e-07, "loss": 1.4913, "step": 4850 }, { "epoch": 0.7654748779335329, "grad_norm": 0.8607631325721741, "learning_rate": 2.5160662529755823e-07, "loss": 1.4712, "step": 4860 }, { "epoch": 0.7670499291226965, "grad_norm": 0.856203556060791, "learning_rate": 2.169665975613605e-07, "loss": 1.4829, "step": 4870 }, { "epoch": 0.7686249803118601, "grad_norm": 0.8610185384750366, "learning_rate": 1.8488678138238456e-07, "loss": 1.4983, "step": 4880 }, { "epoch": 0.7702000315010238, "grad_norm": 0.8597480058670044, "learning_rate": 1.5536849543621584e-07, "loss": 1.4763, "step": 4890 }, { "epoch": 0.7717750826901875, "grad_norm": 0.8443917632102966, "learning_rate": 1.2841295310397905e-07, "loss": 1.4961, "step": 4900 }, { "epoch": 0.773350133879351, "grad_norm": 0.8755921125411987, "learning_rate": 1.0402126242244764e-07, "loss": 1.4802, "step": 4910 }, { "epoch": 0.7749251850685147, "grad_norm": 0.888160765171051, "learning_rate": 8.219442603847605e-08, "loss": 1.4561, "step": 4920 }, { "epoch": 0.7765002362576784, "grad_norm": 0.842492938041687, "learning_rate": 6.293334116783817e-08, "loss": 1.4527, "step": 4930 }, { "epoch": 0.7780752874468421, "grad_norm": 0.886648952960968, "learning_rate": 4.623879955827082e-08, "loss": 1.489, "step": 4940 }, { "epoch": 0.7796503386360056, "grad_norm": 0.8370558023452759, "learning_rate": 3.211148745700665e-08, "loss": 1.4815, "step": 4950 }, { "epoch": 0.7796503386360056, "eval_loss": 1.4804972410202026, "eval_runtime": 35.7603, "eval_samples_per_second": 14.01, "eval_steps_per_second": 14.01, "step": 4950 } ], "logging_steps": 10, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 150, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.149189940332462e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }