{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.8669603524229075, "eval_steps": 500, "global_step": 2440, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011747430249632892, "grad_norm": 1.5699902772903442, "learning_rate": 0.00027, "loss": 3.0983, "step": 10 }, { "epoch": 0.023494860499265784, "grad_norm": 1.6029695272445679, "learning_rate": 0.00029991523567092526, "loss": 2.062, "step": 20 }, { "epoch": 0.03524229074889868, "grad_norm": 1.593436360359192, "learning_rate": 0.00029962234616583063, "loss": 1.2074, "step": 30 }, { "epoch": 0.04698972099853157, "grad_norm": 0.5851414799690247, "learning_rate": 0.00029912069357315393, "loss": 0.888, "step": 40 }, { "epoch": 0.05873715124816446, "grad_norm": 0.25992292165756226, "learning_rate": 0.0002984109778320875, "loss": 0.7685, "step": 50 }, { "epoch": 0.07048458149779736, "grad_norm": 0.21082307398319244, "learning_rate": 0.00029749418918542057, "loss": 0.7096, "step": 60 }, { "epoch": 0.08223201174743025, "grad_norm": 0.16843102872371674, "learning_rate": 0.0002963716067978866, "loss": 0.6901, "step": 70 }, { "epoch": 0.09397944199706314, "grad_norm": 0.12076722830533981, "learning_rate": 0.000295044796971387, "loss": 0.6702, "step": 80 }, { "epoch": 0.10572687224669604, "grad_norm": 0.21371866762638092, "learning_rate": 0.000293515610959582, "loss": 0.6353, "step": 90 }, { "epoch": 0.11747430249632893, "grad_norm": 0.13458965718746185, "learning_rate": 0.0002917861823848985, "loss": 0.6479, "step": 100 }, { "epoch": 0.12922173274596183, "grad_norm": 0.265765517950058, "learning_rate": 0.0002898589242615568, "loss": 0.6244, "step": 110 }, { "epoch": 0.14096916299559473, "grad_norm": 0.1473032385110855, "learning_rate": 0.0002877365256287728, "loss": 0.6217, "step": 120 }, { "epoch": 0.1527165932452276, "grad_norm": 0.1591167151927948, "learning_rate": 0.00028542194779883047, "loss": 0.6022, "step": 130 }, { "epoch": 0.1644640234948605, "grad_norm": 0.13270772993564606, "learning_rate": 0.00028291842022526133, "loss": 0.6098, "step": 140 }, { "epoch": 0.1762114537444934, "grad_norm": 0.1444919854402542, "learning_rate": 0.0002802294359968954, "loss": 0.5971, "step": 150 }, { "epoch": 0.18795888399412627, "grad_norm": 0.1571902334690094, "learning_rate": 0.0002773587469640702, "loss": 0.5937, "step": 160 }, { "epoch": 0.19970631424375918, "grad_norm": 0.11585285514593124, "learning_rate": 0.0002743103585037989, "loss": 0.6054, "step": 170 }, { "epoch": 0.21145374449339208, "grad_norm": 0.10303252190351486, "learning_rate": 0.0002710885239312008, "loss": 0.5708, "step": 180 }, { "epoch": 0.22320117474302498, "grad_norm": 0.09355439245700836, "learning_rate": 0.00026769773856499167, "loss": 0.5806, "step": 190 }, { "epoch": 0.23494860499265785, "grad_norm": 0.09288550913333893, "learning_rate": 0.0002641427334553158, "loss": 0.5747, "step": 200 }, { "epoch": 0.24669603524229075, "grad_norm": 0.10875760763883591, "learning_rate": 0.00026042846878266795, "loss": 0.5879, "step": 210 }, { "epoch": 0.25844346549192365, "grad_norm": 0.09756477177143097, "learning_rate": 0.0002565601269371192, "loss": 0.5852, "step": 220 }, { "epoch": 0.2701908957415565, "grad_norm": 0.10926368832588196, "learning_rate": 0.0002525431052874997, "loss": 0.5605, "step": 230 }, { "epoch": 0.28193832599118945, "grad_norm": 0.09802033007144928, "learning_rate": 0.00024838300865062966, "loss": 0.5738, "step": 240 }, { "epoch": 0.2936857562408223, "grad_norm": 0.10284294933080673, "learning_rate": 0.00024408564147110443, "loss": 0.5847, "step": 250 }, { "epoch": 0.3054331864904552, "grad_norm": 0.09890703111886978, "learning_rate": 0.00023965699972254602, "loss": 0.5736, "step": 260 }, { "epoch": 0.31718061674008813, "grad_norm": 0.09091509878635406, "learning_rate": 0.00023510326254162027, "loss": 0.5577, "step": 270 }, { "epoch": 0.328928046989721, "grad_norm": 0.0930003970861435, "learning_rate": 0.00023043078360649285, "loss": 0.5651, "step": 280 }, { "epoch": 0.3406754772393539, "grad_norm": 0.0988878533244133, "learning_rate": 0.00022564608227175316, "loss": 0.548, "step": 290 }, { "epoch": 0.3524229074889868, "grad_norm": 0.11749754101037979, "learning_rate": 0.0002207558344721757, "loss": 0.5587, "step": 300 }, { "epoch": 0.3641703377386197, "grad_norm": 0.10936658829450607, "learning_rate": 0.00021576686340800985, "loss": 0.5694, "step": 310 }, { "epoch": 0.37591776798825255, "grad_norm": 0.15082670748233795, "learning_rate": 0.00021068613002479553, "loss": 0.5688, "step": 320 }, { "epoch": 0.3876651982378855, "grad_norm": 0.10255635529756546, "learning_rate": 0.00020552072330098716, "loss": 0.56, "step": 330 }, { "epoch": 0.39941262848751835, "grad_norm": 0.10504507273435593, "learning_rate": 0.0002002778503569374, "loss": 0.557, "step": 340 }, { "epoch": 0.4111600587371512, "grad_norm": 0.1146383211016655, "learning_rate": 0.00019496482639904194, "loss": 0.5497, "step": 350 }, { "epoch": 0.42290748898678415, "grad_norm": 0.09596443176269531, "learning_rate": 0.00018958906451307489, "loss": 0.556, "step": 360 }, { "epoch": 0.434654919236417, "grad_norm": 0.10395421087741852, "learning_rate": 0.0001841580653209565, "loss": 0.5634, "step": 370 }, { "epoch": 0.44640234948604995, "grad_norm": 0.08797866106033325, "learning_rate": 0.00017867940651538483, "loss": 0.5544, "step": 380 }, { "epoch": 0.4581497797356828, "grad_norm": 0.1006847620010376, "learning_rate": 0.0001731607322869329, "loss": 0.5562, "step": 390 }, { "epoch": 0.4698972099853157, "grad_norm": 0.09849337488412857, "learning_rate": 0.00016760974265836331, "loss": 0.5477, "step": 400 }, { "epoch": 0.48164464023494863, "grad_norm": 0.0988384336233139, "learning_rate": 0.00016203418274104278, "loss": 0.5459, "step": 410 }, { "epoch": 0.4933920704845815, "grad_norm": 0.09625212848186493, "learning_rate": 0.0001564418319284454, "loss": 0.5516, "step": 420 }, { "epoch": 0.5051395007342144, "grad_norm": 0.09705183655023575, "learning_rate": 0.00015084049304182347, "loss": 0.5375, "step": 430 }, { "epoch": 0.5168869309838473, "grad_norm": 0.17180472612380981, "learning_rate": 0.00014523798144319027, "loss": 0.539, "step": 440 }, { "epoch": 0.5286343612334802, "grad_norm": 0.09553302824497223, "learning_rate": 0.00013964211413080522, "loss": 0.5418, "step": 450 }, { "epoch": 0.540381791483113, "grad_norm": 0.10648112744092941, "learning_rate": 0.0001340606988323758, "loss": 0.5414, "step": 460 }, { "epoch": 0.5521292217327459, "grad_norm": 0.09907692670822144, "learning_rate": 0.00012850152311119498, "loss": 0.5353, "step": 470 }, { "epoch": 0.5638766519823789, "grad_norm": 0.11162377148866653, "learning_rate": 0.00012297234350041228, "loss": 0.528, "step": 480 }, { "epoch": 0.5756240822320118, "grad_norm": 0.10550152510404587, "learning_rate": 0.00011748087468060128, "loss": 0.533, "step": 490 }, { "epoch": 0.5873715124816447, "grad_norm": 0.09718377143144608, "learning_rate": 0.0001120347787157222, "loss": 0.5409, "step": 500 }, { "epoch": 0.5991189427312775, "grad_norm": 0.09185861796140671, "learning_rate": 0.0001066416543624984, "loss": 0.5354, "step": 510 }, { "epoch": 0.6108663729809104, "grad_norm": 0.0927920788526535, "learning_rate": 0.00010130902646812369, "loss": 0.5454, "step": 520 }, { "epoch": 0.6226138032305433, "grad_norm": 0.087093785405159, "learning_rate": 9.604433547109344e-05, "loss": 0.5295, "step": 530 }, { "epoch": 0.6343612334801763, "grad_norm": 0.09994326531887054, "learning_rate": 9.085492701980751e-05, "loss": 0.5322, "step": 540 }, { "epoch": 0.6461086637298091, "grad_norm": 0.09507084637880325, "learning_rate": 8.574804172343134e-05, "loss": 0.5224, "step": 550 }, { "epoch": 0.657856093979442, "grad_norm": 0.08571015298366547, "learning_rate": 8.07308050493148e-05, "loss": 0.5378, "step": 560 }, { "epoch": 0.6696035242290749, "grad_norm": 0.08876761794090271, "learning_rate": 7.581021738106408e-05, "loss": 0.5265, "step": 570 }, { "epoch": 0.6813509544787077, "grad_norm": 0.09467241168022156, "learning_rate": 7.099314425113907e-05, "loss": 0.5392, "step": 580 }, { "epoch": 0.6930983847283406, "grad_norm": 0.08804601430892944, "learning_rate": 6.628630676160445e-05, "loss": 0.5365, "step": 590 }, { "epoch": 0.7048458149779736, "grad_norm": 0.08877623081207275, "learning_rate": 6.169627220639871e-05, "loss": 0.5354, "step": 600 }, { "epoch": 0.7165932452276065, "grad_norm": 0.09122662246227264, "learning_rate": 5.722944490820774e-05, "loss": 0.5356, "step": 610 }, { "epoch": 0.7283406754772394, "grad_norm": 0.08744510263204575, "learning_rate": 5.289205728272586e-05, "loss": 0.5424, "step": 620 }, { "epoch": 0.7400881057268722, "grad_norm": 0.08927814662456512, "learning_rate": 4.869016114277345e-05, "loss": 0.5268, "step": 630 }, { "epoch": 0.7518355359765051, "grad_norm": 0.09256933629512787, "learning_rate": 4.462961925440341e-05, "loss": 0.5414, "step": 640 }, { "epoch": 0.7635829662261381, "grad_norm": 0.08703339844942093, "learning_rate": 4.071609715677899e-05, "loss": 0.5376, "step": 650 }, { "epoch": 0.775330396475771, "grad_norm": 0.08876251429319382, "learning_rate": 3.695505525723465e-05, "loss": 0.5307, "step": 660 }, { "epoch": 0.7870778267254038, "grad_norm": 0.08702490478754044, "learning_rate": 3.3351741212551595e-05, "loss": 0.5307, "step": 670 }, { "epoch": 0.7988252569750367, "grad_norm": 0.08601511269807816, "learning_rate": 2.9911182607076516e-05, "loss": 0.5372, "step": 680 }, { "epoch": 0.8105726872246696, "grad_norm": 0.0857272818684578, "learning_rate": 2.663817993790021e-05, "loss": 0.528, "step": 690 }, { "epoch": 0.8223201174743024, "grad_norm": 0.08725214004516602, "learning_rate": 2.3537299916883512e-05, "loss": 0.5378, "step": 700 }, { "epoch": 0.8340675477239354, "grad_norm": 0.0845843032002449, "learning_rate": 2.0612869098875988e-05, "loss": 0.5389, "step": 710 }, { "epoch": 0.8458149779735683, "grad_norm": 0.08480172604322433, "learning_rate": 1.786896784501778e-05, "loss": 0.5244, "step": 720 }, { "epoch": 0.8575624082232012, "grad_norm": 0.09265288710594177, "learning_rate": 1.5309424629547164e-05, "loss": 0.5403, "step": 730 }, { "epoch": 0.869309838472834, "grad_norm": 0.08523637801408768, "learning_rate": 1.2937810698057921e-05, "loss": 0.5332, "step": 740 }, { "epoch": 0.8810572687224669, "grad_norm": 0.08431612700223923, "learning_rate": 1.0757435084658694e-05, "loss": 0.5198, "step": 750 }, { "epoch": 0.8928046989720999, "grad_norm": 0.08998807519674301, "learning_rate": 8.771339994987953e-06, "loss": 0.5251, "step": 760 }, { "epoch": 0.9045521292217328, "grad_norm": 0.08884080499410629, "learning_rate": 6.98229656152543e-06, "loss": 0.5449, "step": 770 }, { "epoch": 0.9162995594713657, "grad_norm": 0.08583056926727295, "learning_rate": 5.392800977123047e-06, "loss": 0.5264, "step": 780 }, { "epoch": 0.9280469897209985, "grad_norm": 0.08824951946735382, "learning_rate": 4.005071012149952e-06, "loss": 0.5306, "step": 790 }, { "epoch": 0.9397944199706314, "grad_norm": 0.08726619184017181, "learning_rate": 2.821042920111427e-06, "loss": 0.5205, "step": 800 }, { "epoch": 0.9515418502202643, "grad_norm": 0.08729498088359833, "learning_rate": 1.8423687360584137e-06, "loss": 0.5217, "step": 810 }, { "epoch": 0.9632892804698973, "grad_norm": 0.08497074991464615, "learning_rate": 1.070413971558115e-06, "loss": 0.534, "step": 820 }, { "epoch": 0.9750367107195301, "grad_norm": 0.08497001975774765, "learning_rate": 5.062557094410058e-07, "loss": 0.5241, "step": 830 }, { "epoch": 0.986784140969163, "grad_norm": 0.08295251429080963, "learning_rate": 1.5068110098296338e-07, "loss": 0.5337, "step": 840 }, { "epoch": 0.9985315712187959, "grad_norm": 0.10860061645507812, "learning_rate": 0.00022638651575377874, "loss": 0.5227, "step": 850 }, { "epoch": 1.0105726872246696, "grad_norm": 0.11796294897794724, "learning_rate": 0.00022478592280680777, "loss": 0.588, "step": 860 }, { "epoch": 1.0223201174743024, "grad_norm": 0.1594405323266983, "learning_rate": 0.0002231739162937319, "loss": 0.5307, "step": 870 }, { "epoch": 1.0340675477239354, "grad_norm": 0.10787333548069, "learning_rate": 0.0002215507422333499, "loss": 0.5359, "step": 880 }, { "epoch": 1.0458149779735684, "grad_norm": 0.10763130336999893, "learning_rate": 0.0002199166483488127, "loss": 0.5407, "step": 890 }, { "epoch": 1.0575624082232011, "grad_norm": 0.13658902049064636, "learning_rate": 0.00021827188402981652, "loss": 0.5255, "step": 900 }, { "epoch": 1.0693098384728341, "grad_norm": 0.10522827506065369, "learning_rate": 0.00021661670029454207, "loss": 0.5276, "step": 910 }, { "epoch": 1.0810572687224669, "grad_norm": 0.1422538459300995, "learning_rate": 0.0002149513497513448, "loss": 0.5245, "step": 920 }, { "epoch": 1.0928046989720999, "grad_norm": 0.10326780378818512, "learning_rate": 0.00021327608656020305, "loss": 0.5294, "step": 930 }, { "epoch": 1.1045521292217328, "grad_norm": 0.11100132018327713, "learning_rate": 0.00021159116639392868, "loss": 0.52, "step": 940 }, { "epoch": 1.1162995594713656, "grad_norm": 0.09583411365747452, "learning_rate": 0.00020989684639914738, "loss": 0.5247, "step": 950 }, { "epoch": 1.1280469897209986, "grad_norm": 0.10812857002019882, "learning_rate": 0.00020819338515705378, "loss": 0.5236, "step": 960 }, { "epoch": 1.1397944199706314, "grad_norm": 0.12208293378353119, "learning_rate": 0.00020648104264394784, "loss": 0.5217, "step": 970 }, { "epoch": 1.1515418502202643, "grad_norm": 0.11540035158395767, "learning_rate": 0.00020476008019155794, "loss": 0.5387, "step": 980 }, { "epoch": 1.1632892804698973, "grad_norm": 0.10755149275064468, "learning_rate": 0.00020303076044715738, "loss": 0.5057, "step": 990 }, { "epoch": 1.17503671071953, "grad_norm": 0.10145018994808197, "learning_rate": 0.0002012933473334804, "loss": 0.5202, "step": 1000 }, { "epoch": 1.186784140969163, "grad_norm": 0.11095395684242249, "learning_rate": 0.00019954810600844277, "loss": 0.5314, "step": 1010 }, { "epoch": 1.1985315712187958, "grad_norm": 0.097834512591362, "learning_rate": 0.00019779530282467456, "loss": 0.5178, "step": 1020 }, { "epoch": 1.2102790014684288, "grad_norm": 0.09915532171726227, "learning_rate": 0.00019603520528887027, "loss": 0.5205, "step": 1030 }, { "epoch": 1.2220264317180616, "grad_norm": 0.1107698306441307, "learning_rate": 0.00019426808202096298, "loss": 0.5268, "step": 1040 }, { "epoch": 1.2337738619676946, "grad_norm": 0.11669424921274185, "learning_rate": 0.0001924942027131284, "loss": 0.53, "step": 1050 }, { "epoch": 1.2455212922173275, "grad_norm": 0.11590099334716797, "learning_rate": 0.00019071383808862534, "loss": 0.5085, "step": 1060 }, { "epoch": 1.2572687224669603, "grad_norm": 0.1027660220861435, "learning_rate": 0.00018892725986047917, "loss": 0.5193, "step": 1070 }, { "epoch": 1.2690161527165933, "grad_norm": 0.09436651319265366, "learning_rate": 0.00018713474069001354, "loss": 0.5002, "step": 1080 }, { "epoch": 1.280763582966226, "grad_norm": 0.11379121989011765, "learning_rate": 0.00018533655414523808, "loss": 0.5212, "step": 1090 }, { "epoch": 1.292511013215859, "grad_norm": 0.09809733927249908, "learning_rate": 0.00018353297465909717, "loss": 0.5124, "step": 1100 }, { "epoch": 1.3042584434654918, "grad_norm": 0.1027405858039856, "learning_rate": 0.00018172427748758713, "loss": 0.5177, "step": 1110 }, { "epoch": 1.3160058737151248, "grad_norm": 0.10089763253927231, "learning_rate": 0.0001799107386677475, "loss": 0.4969, "step": 1120 }, { "epoch": 1.3277533039647578, "grad_norm": 0.09994267672300339, "learning_rate": 0.0001780926349755332, "loss": 0.516, "step": 1130 }, { "epoch": 1.3395007342143905, "grad_norm": 0.10974204540252686, "learning_rate": 0.00017627024388357416, "loss": 0.5035, "step": 1140 }, { "epoch": 1.3512481644640235, "grad_norm": 0.09834876656532288, "learning_rate": 0.00017444384351882817, "loss": 0.5121, "step": 1150 }, { "epoch": 1.3629955947136563, "grad_norm": 0.09756341576576233, "learning_rate": 0.0001726137126201342, "loss": 0.5289, "step": 1160 }, { "epoch": 1.3747430249632893, "grad_norm": 0.09796813875436783, "learning_rate": 0.0001707801304956723, "loss": 0.5054, "step": 1170 }, { "epoch": 1.3864904552129222, "grad_norm": 0.09447074681520462, "learning_rate": 0.00016894337698033663, "loss": 0.5067, "step": 1180 }, { "epoch": 1.398237885462555, "grad_norm": 0.10086411237716675, "learning_rate": 0.00016710373239302772, "loss": 0.5191, "step": 1190 }, { "epoch": 1.409985315712188, "grad_norm": 0.10048293322324753, "learning_rate": 0.00016526147749387155, "loss": 0.5073, "step": 1200 }, { "epoch": 1.4217327459618208, "grad_norm": 0.09760654717683792, "learning_rate": 0.00016341689344137088, "loss": 0.5254, "step": 1210 }, { "epoch": 1.4334801762114537, "grad_norm": 0.09539603441953659, "learning_rate": 0.00016157026174949538, "loss": 0.5116, "step": 1220 }, { "epoch": 1.4452276064610867, "grad_norm": 0.09473835676908493, "learning_rate": 0.00015972186424471855, "loss": 0.5011, "step": 1230 }, { "epoch": 1.4569750367107195, "grad_norm": 0.10754521191120148, "learning_rate": 0.0001578719830230061, "loss": 0.5116, "step": 1240 }, { "epoch": 1.4687224669603525, "grad_norm": 0.09202101826667786, "learning_rate": 0.00015602090040676324, "loss": 0.4964, "step": 1250 }, { "epoch": 1.4804698972099852, "grad_norm": 0.1060076653957367, "learning_rate": 0.00015416889890174792, "loss": 0.505, "step": 1260 }, { "epoch": 1.4922173274596182, "grad_norm": 0.10437231510877609, "learning_rate": 0.0001523162611539557, "loss": 0.5065, "step": 1270 }, { "epoch": 1.5039647577092512, "grad_norm": 0.14216673374176025, "learning_rate": 0.0001504632699064833, "loss": 0.5221, "step": 1280 }, { "epoch": 1.515712187958884, "grad_norm": 0.10676784813404083, "learning_rate": 0.00014861020795637716, "loss": 0.5057, "step": 1290 }, { "epoch": 1.5274596182085167, "grad_norm": 0.09706170856952667, "learning_rate": 0.00014675735811147444, "loss": 0.5054, "step": 1300 }, { "epoch": 1.5392070484581497, "grad_norm": 0.09725037962198257, "learning_rate": 0.00014490500314724117, "loss": 0.5083, "step": 1310 }, { "epoch": 1.5509544787077827, "grad_norm": 0.09938537329435349, "learning_rate": 0.0001430534257636167, "loss": 0.5157, "step": 1320 }, { "epoch": 1.5627019089574157, "grad_norm": 0.10331527143716812, "learning_rate": 0.00014120290854186863, "loss": 0.5151, "step": 1330 }, { "epoch": 1.5744493392070484, "grad_norm": 0.09629181027412415, "learning_rate": 0.00013935373390146634, "loss": 0.507, "step": 1340 }, { "epoch": 1.5861967694566812, "grad_norm": 0.10021866858005524, "learning_rate": 0.00013750618405697912, "loss": 0.4973, "step": 1350 }, { "epoch": 1.5979441997063142, "grad_norm": 0.10414128750562668, "learning_rate": 0.0001356605409750058, "loss": 0.5033, "step": 1360 }, { "epoch": 1.6096916299559472, "grad_norm": 0.11649428308010101, "learning_rate": 0.0001340013252947644, "loss": 0.5259, "step": 1370 }, { "epoch": 1.6214390602055802, "grad_norm": 0.09817013144493103, "learning_rate": 0.00013216008080267535, "loss": 0.5164, "step": 1380 }, { "epoch": 1.633186490455213, "grad_norm": 0.10239794105291367, "learning_rate": 0.0001303215589766901, "loss": 0.5011, "step": 1390 }, { "epoch": 1.6449339207048457, "grad_norm": 0.09663370996713638, "learning_rate": 0.00012848604040558272, "loss": 0.5096, "step": 1400 }, { "epoch": 1.6566813509544787, "grad_norm": 0.10224564373493195, "learning_rate": 0.0001266538052197809, "loss": 0.5055, "step": 1410 }, { "epoch": 1.6684287812041116, "grad_norm": 0.09594379365444183, "learning_rate": 0.00012482513304861364, "loss": 0.5051, "step": 1420 }, { "epoch": 1.6801762114537446, "grad_norm": 0.10660111159086227, "learning_rate": 0.00012300030297763518, "loss": 0.5076, "step": 1430 }, { "epoch": 1.6919236417033774, "grad_norm": 0.10405760258436203, "learning_rate": 0.0001211795935060317, "loss": 0.5089, "step": 1440 }, { "epoch": 1.7036710719530102, "grad_norm": 0.10634606331586838, "learning_rate": 0.00011936328250411801, "loss": 0.504, "step": 1450 }, { "epoch": 1.7154185022026431, "grad_norm": 0.1119045689702034, "learning_rate": 0.00011755164717092988, "loss": 0.5105, "step": 1460 }, { "epoch": 1.7271659324522761, "grad_norm": 0.10727712512016296, "learning_rate": 0.00011574496399191876, "loss": 0.5185, "step": 1470 }, { "epoch": 1.738913362701909, "grad_norm": 0.09993914514780045, "learning_rate": 0.00011394350869675567, "loss": 0.5004, "step": 1480 }, { "epoch": 1.7506607929515419, "grad_norm": 0.10068885236978531, "learning_rate": 0.00011214755621725042, "loss": 0.5091, "step": 1490 }, { "epoch": 1.7624082232011746, "grad_norm": 0.09819114953279495, "learning_rate": 0.00011035738064539201, "loss": 0.496, "step": 1500 }, { "epoch": 1.7741556534508076, "grad_norm": 0.09929963946342468, "learning_rate": 0.00010857325519151842, "loss": 0.5033, "step": 1510 }, { "epoch": 1.7859030837004406, "grad_norm": 0.09696891903877258, "learning_rate": 0.00010679545214261935, "loss": 0.5133, "step": 1520 }, { "epoch": 1.7976505139500736, "grad_norm": 0.1030985563993454, "learning_rate": 0.0001050242428207814, "loss": 0.5142, "step": 1530 }, { "epoch": 1.8093979441997063, "grad_norm": 0.10666483640670776, "learning_rate": 0.0001032598975417796, "loss": 0.5205, "step": 1540 }, { "epoch": 1.821145374449339, "grad_norm": 0.1053660586476326, "learning_rate": 0.00010150268557382262, "loss": 0.498, "step": 1550 }, { "epoch": 1.832892804698972, "grad_norm": 0.1028640866279602, "learning_rate": 9.975287509645826e-05, "loss": 0.5096, "step": 1560 }, { "epoch": 1.844640234948605, "grad_norm": 0.10187330096960068, "learning_rate": 9.801073315964465e-05, "loss": 0.4961, "step": 1570 }, { "epoch": 1.8563876651982378, "grad_norm": 0.0993318185210228, "learning_rate": 9.627652564299405e-05, "loss": 0.5028, "step": 1580 }, { "epoch": 1.8681350954478708, "grad_norm": 0.11013616621494293, "learning_rate": 9.455051721519528e-05, "loss": 0.5011, "step": 1590 }, { "epoch": 1.8798825256975036, "grad_norm": 0.10433095693588257, "learning_rate": 9.283297129362094e-05, "loss": 0.4977, "step": 1600 }, { "epoch": 1.8916299559471366, "grad_norm": 0.10458213835954666, "learning_rate": 9.112415000412531e-05, "loss": 0.5107, "step": 1610 }, { "epoch": 1.9033773861967695, "grad_norm": 0.10259649157524109, "learning_rate": 8.942431414104001e-05, "loss": 0.4994, "step": 1620 }, { "epoch": 1.9151248164464023, "grad_norm": 0.09932565689086914, "learning_rate": 8.773372312737238e-05, "loss": 0.499, "step": 1630 }, { "epoch": 1.9268722466960353, "grad_norm": 0.10920233279466629, "learning_rate": 8.605263497521283e-05, "loss": 0.5061, "step": 1640 }, { "epoch": 1.938619676945668, "grad_norm": 0.0942125990986824, "learning_rate": 8.438130624635852e-05, "loss": 0.4941, "step": 1650 }, { "epoch": 1.950367107195301, "grad_norm": 0.12091836333274841, "learning_rate": 8.271999201315755e-05, "loss": 0.499, "step": 1660 }, { "epoch": 1.962114537444934, "grad_norm": 0.10680700093507767, "learning_rate": 8.106894581958054e-05, "loss": 0.4949, "step": 1670 }, { "epoch": 1.9738619676945668, "grad_norm": 0.10140874981880188, "learning_rate": 7.942841964252586e-05, "loss": 0.5032, "step": 1680 }, { "epoch": 1.9856093979441996, "grad_norm": 0.10470426827669144, "learning_rate": 7.779866385336391e-05, "loss": 0.5072, "step": 1690 }, { "epoch": 1.9973568281938325, "grad_norm": 0.10490316152572632, "learning_rate": 7.617992717972585e-05, "loss": 0.5024, "step": 1700 }, { "epoch": 2.0093979441997063, "grad_norm": 0.10185109823942184, "learning_rate": 7.457245666754417e-05, "loss": 0.541, "step": 1710 }, { "epoch": 2.0211453744493393, "grad_norm": 0.10151582956314087, "learning_rate": 7.297649764334912e-05, "loss": 0.4831, "step": 1720 }, { "epoch": 2.0328928046989723, "grad_norm": 0.10989069938659668, "learning_rate": 7.139229367682778e-05, "loss": 0.486, "step": 1730 }, { "epoch": 2.044640234948605, "grad_norm": 0.11440616101026535, "learning_rate": 6.982008654365156e-05, "loss": 0.4842, "step": 1740 }, { "epoch": 2.056387665198238, "grad_norm": 0.1018548235297203, "learning_rate": 6.82601161885771e-05, "loss": 0.4911, "step": 1750 }, { "epoch": 2.0681350954478708, "grad_norm": 0.1053592786192894, "learning_rate": 6.671262068882665e-05, "loss": 0.4924, "step": 1760 }, { "epoch": 2.0798825256975038, "grad_norm": 0.10619944334030151, "learning_rate": 6.517783621775382e-05, "loss": 0.4736, "step": 1770 }, { "epoch": 2.0916299559471367, "grad_norm": 0.11708024144172668, "learning_rate": 6.36559970087992e-05, "loss": 0.4824, "step": 1780 }, { "epoch": 2.1033773861967693, "grad_norm": 0.12601934373378754, "learning_rate": 6.214733531974292e-05, "loss": 0.4834, "step": 1790 }, { "epoch": 2.1151248164464023, "grad_norm": 0.10728344321250916, "learning_rate": 6.065208139725811e-05, "loss": 0.4889, "step": 1800 }, { "epoch": 2.1268722466960353, "grad_norm": 0.10394187271595001, "learning_rate": 5.917046344177123e-05, "loss": 0.4893, "step": 1810 }, { "epoch": 2.1386196769456682, "grad_norm": 0.11126961559057236, "learning_rate": 5.770270757263536e-05, "loss": 0.4876, "step": 1820 }, { "epoch": 2.150367107195301, "grad_norm": 0.10413071513175964, "learning_rate": 5.624903779362031e-05, "loss": 0.4764, "step": 1830 }, { "epoch": 2.1621145374449338, "grad_norm": 0.10565336793661118, "learning_rate": 5.480967595872602e-05, "loss": 0.4781, "step": 1840 }, { "epoch": 2.1738619676945667, "grad_norm": 0.10836539417505264, "learning_rate": 5.338484173832413e-05, "loss": 0.4854, "step": 1850 }, { "epoch": 2.1856093979441997, "grad_norm": 0.11080804467201233, "learning_rate": 5.197475258563249e-05, "loss": 0.4815, "step": 1860 }, { "epoch": 2.1973568281938327, "grad_norm": 0.11599951237440109, "learning_rate": 5.057962370352815e-05, "loss": 0.4878, "step": 1870 }, { "epoch": 2.2091042584434657, "grad_norm": 0.10745177417993546, "learning_rate": 4.91996680117041e-05, "loss": 0.4737, "step": 1880 }, { "epoch": 2.2208516886930982, "grad_norm": 0.10236770659685135, "learning_rate": 4.783509611417409e-05, "loss": 0.4759, "step": 1890 }, { "epoch": 2.232599118942731, "grad_norm": 0.11330056935548782, "learning_rate": 4.648611626713082e-05, "loss": 0.4725, "step": 1900 }, { "epoch": 2.244346549192364, "grad_norm": 0.10368051379919052, "learning_rate": 4.515293434716279e-05, "loss": 0.4872, "step": 1910 }, { "epoch": 2.256093979441997, "grad_norm": 0.10511163622140884, "learning_rate": 4.38357538198343e-05, "loss": 0.4839, "step": 1920 }, { "epoch": 2.2678414096916297, "grad_norm": 0.09937173873186111, "learning_rate": 4.253477570863275e-05, "loss": 0.4768, "step": 1930 }, { "epoch": 2.2795888399412627, "grad_norm": 0.1072772666811943, "learning_rate": 4.1250198564289644e-05, "loss": 0.4915, "step": 1940 }, { "epoch": 2.2913362701908957, "grad_norm": 0.11077064275741577, "learning_rate": 3.998221843447808e-05, "loss": 0.4773, "step": 1950 }, { "epoch": 2.3030837004405287, "grad_norm": 0.10051790624856949, "learning_rate": 3.8731028833892955e-05, "loss": 0.4728, "step": 1960 }, { "epoch": 2.3148311306901617, "grad_norm": 0.11899662017822266, "learning_rate": 3.749682071471727e-05, "loss": 0.4978, "step": 1970 }, { "epoch": 2.3265785609397946, "grad_norm": 0.1075495257973671, "learning_rate": 3.627978243747965e-05, "loss": 0.491, "step": 1980 }, { "epoch": 2.338325991189427, "grad_norm": 0.10509738326072693, "learning_rate": 3.5080099742307495e-05, "loss": 0.4672, "step": 1990 }, { "epoch": 2.35007342143906, "grad_norm": 0.10816201567649841, "learning_rate": 3.3897955720579985e-05, "loss": 0.4856, "step": 2000 }, { "epoch": 2.361820851688693, "grad_norm": 0.11207477003335953, "learning_rate": 3.2733530786985124e-05, "loss": 0.4893, "step": 2010 }, { "epoch": 2.373568281938326, "grad_norm": 0.10209004580974579, "learning_rate": 3.1587002651985776e-05, "loss": 0.4737, "step": 2020 }, { "epoch": 2.3853157121879587, "grad_norm": 0.11698783189058304, "learning_rate": 3.0458546294697954e-05, "loss": 0.4883, "step": 2030 }, { "epoch": 2.3970631424375917, "grad_norm": 0.09947340935468674, "learning_rate": 2.9348333936186003e-05, "loss": 0.4761, "step": 2040 }, { "epoch": 2.4088105726872246, "grad_norm": 0.1026497632265091, "learning_rate": 2.82565350131791e-05, "loss": 0.4867, "step": 2050 }, { "epoch": 2.4205580029368576, "grad_norm": 0.10487735271453857, "learning_rate": 2.718331615221218e-05, "loss": 0.4759, "step": 2060 }, { "epoch": 2.4323054331864906, "grad_norm": 0.10684232413768768, "learning_rate": 2.61288411441961e-05, "loss": 0.4881, "step": 2070 }, { "epoch": 2.444052863436123, "grad_norm": 0.104823999106884, "learning_rate": 2.5093270919420383e-05, "loss": 0.4973, "step": 2080 }, { "epoch": 2.455800293685756, "grad_norm": 0.10103822499513626, "learning_rate": 2.4076763522992665e-05, "loss": 0.4887, "step": 2090 }, { "epoch": 2.467547723935389, "grad_norm": 0.10180474817752838, "learning_rate": 2.307947409071825e-05, "loss": 0.4791, "step": 2100 }, { "epoch": 2.479295154185022, "grad_norm": 0.10081729292869568, "learning_rate": 2.210155482542402e-05, "loss": 0.4822, "step": 2110 }, { "epoch": 2.491042584434655, "grad_norm": 0.10296090692281723, "learning_rate": 2.1143154973729735e-05, "loss": 0.4776, "step": 2120 }, { "epoch": 2.5027900146842876, "grad_norm": 0.10592051595449448, "learning_rate": 2.0204420803270327e-05, "loss": 0.484, "step": 2130 }, { "epoch": 2.5145374449339206, "grad_norm": 0.10436718910932541, "learning_rate": 1.9285495580373362e-05, "loss": 0.4741, "step": 2140 }, { "epoch": 2.5262848751835536, "grad_norm": 0.10932262241840363, "learning_rate": 1.8386519548193994e-05, "loss": 0.4846, "step": 2150 }, { "epoch": 2.5380323054331866, "grad_norm": 0.10435889661312103, "learning_rate": 1.7507629905311644e-05, "loss": 0.4984, "step": 2160 }, { "epoch": 2.5497797356828196, "grad_norm": 0.11018865555524826, "learning_rate": 1.664896078479126e-05, "loss": 0.4728, "step": 2170 }, { "epoch": 2.561527165932452, "grad_norm": 0.10020755231380463, "learning_rate": 1.581064323371225e-05, "loss": 0.4808, "step": 2180 }, { "epoch": 2.573274596182085, "grad_norm": 0.10242980718612671, "learning_rate": 1.4992805193168717e-05, "loss": 0.483, "step": 2190 }, { "epoch": 2.585022026431718, "grad_norm": 0.10522596538066864, "learning_rate": 1.4195571478743495e-05, "loss": 0.4798, "step": 2200 }, { "epoch": 2.596769456681351, "grad_norm": 0.10111811757087708, "learning_rate": 1.3419063761459025e-05, "loss": 0.4758, "step": 2210 }, { "epoch": 2.6085168869309836, "grad_norm": 0.10785708576440811, "learning_rate": 1.2663400549208741e-05, "loss": 0.4785, "step": 2220 }, { "epoch": 2.6202643171806166, "grad_norm": 0.10563918203115463, "learning_rate": 1.1928697168670465e-05, "loss": 0.4838, "step": 2230 }, { "epoch": 2.6320117474302496, "grad_norm": 0.10474205017089844, "learning_rate": 1.1215065747705742e-05, "loss": 0.4752, "step": 2240 }, { "epoch": 2.6437591776798826, "grad_norm": 0.1004628837108612, "learning_rate": 1.0522615198247364e-05, "loss": 0.4812, "step": 2250 }, { "epoch": 2.6555066079295155, "grad_norm": 0.10973802208900452, "learning_rate": 9.851451199677573e-06, "loss": 0.4984, "step": 2260 }, { "epoch": 2.6672540381791485, "grad_norm": 0.0998314619064331, "learning_rate": 9.201676182699558e-06, "loss": 0.4908, "step": 2270 }, { "epoch": 2.679001468428781, "grad_norm": 0.10704085975885391, "learning_rate": 8.573389313704981e-06, "loss": 0.4659, "step": 2280 }, { "epoch": 2.690748898678414, "grad_norm": 0.1085141971707344, "learning_rate": 7.966686479639428e-06, "loss": 0.4846, "step": 2290 }, { "epoch": 2.702496328928047, "grad_norm": 0.10413148999214172, "learning_rate": 7.381660273368572e-06, "loss": 0.4787, "step": 2300 }, { "epoch": 2.71424375917768, "grad_norm": 0.10362162441015244, "learning_rate": 6.818399979546885e-06, "loss": 0.4793, "step": 2310 }, { "epoch": 2.7259911894273126, "grad_norm": 0.11292777210474014, "learning_rate": 6.276991560991395e-06, "loss": 0.4861, "step": 2320 }, { "epoch": 2.7377386196769455, "grad_norm": 0.10381820797920227, "learning_rate": 5.7575176455622764e-06, "loss": 0.4834, "step": 2330 }, { "epoch": 2.7494860499265785, "grad_norm": 0.10142537951469421, "learning_rate": 5.260057513552573e-06, "loss": 0.4812, "step": 2340 }, { "epoch": 2.7612334801762115, "grad_norm": 0.10647214204072952, "learning_rate": 4.78468708558864e-06, "loss": 0.4782, "step": 2350 }, { "epoch": 2.7729809104258445, "grad_norm": 0.10398197919130325, "learning_rate": 4.3314789110433675e-06, "loss": 0.4849, "step": 2360 }, { "epoch": 2.7847283406754775, "grad_norm": 0.10818745195865631, "learning_rate": 3.90050215696408e-06, "loss": 0.477, "step": 2370 }, { "epoch": 2.79647577092511, "grad_norm": 0.10164166986942291, "learning_rate": 3.491822597516375e-06, "loss": 0.4788, "step": 2380 }, { "epoch": 2.808223201174743, "grad_norm": 0.1044260635972023, "learning_rate": 3.1055026039459863e-06, "loss": 0.482, "step": 2390 }, { "epoch": 2.819970631424376, "grad_norm": 0.09673094749450684, "learning_rate": 2.741601135059851e-06, "loss": 0.4799, "step": 2400 }, { "epoch": 2.831718061674009, "grad_norm": 0.1022416204214096, "learning_rate": 2.4001737282280055e-06, "loss": 0.4966, "step": 2410 }, { "epoch": 2.8434654919236415, "grad_norm": 0.10257957875728607, "learning_rate": 2.081272490907765e-06, "loss": 0.4785, "step": 2420 }, { "epoch": 2.8552129221732745, "grad_norm": 0.10026570409536362, "learning_rate": 1.784946092691153e-06, "loss": 0.4849, "step": 2430 }, { "epoch": 2.8669603524229075, "grad_norm": 0.10606394708156586, "learning_rate": 1.5112397578771585e-06, "loss": 0.4804, "step": 2440 } ], "logging_steps": 10, "max_steps": 2553, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 40, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.171596814335345e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }