{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 17216, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.023234200743494422, "grad_norm": 0.33875614404678345, "learning_rate": 1.1149825783972125e-06, "loss": 3.2044, "step": 100 }, { "epoch": 0.046468401486988845, "grad_norm": 0.09059225022792816, "learning_rate": 2.2764227642276426e-06, "loss": 3.1582, "step": 200 }, { "epoch": 0.06970260223048327, "grad_norm": 0.24917162954807281, "learning_rate": 3.4378629500580724e-06, "loss": 3.1608, "step": 300 }, { "epoch": 0.09293680297397769, "grad_norm": 0.465605229139328, "learning_rate": 4.599303135888502e-06, "loss": 3.1981, "step": 400 }, { "epoch": 0.11617100371747212, "grad_norm": 0.27495619654655457, "learning_rate": 5.7607433217189324e-06, "loss": 3.1815, "step": 500 }, { "epoch": 0.13940520446096655, "grad_norm": 0.19188807904720306, "learning_rate": 6.922183507549362e-06, "loss": 3.1294, "step": 600 }, { "epoch": 0.16263940520446096, "grad_norm": 0.5246957540512085, "learning_rate": 8.083623693379791e-06, "loss": 3.0677, "step": 700 }, { "epoch": 0.18587360594795538, "grad_norm": 0.258408784866333, "learning_rate": 9.24506387921022e-06, "loss": 2.9832, "step": 800 }, { "epoch": 0.20910780669144982, "grad_norm": 0.31014084815979004, "learning_rate": 1.0406504065040652e-05, "loss": 2.9743, "step": 900 }, { "epoch": 0.23234200743494424, "grad_norm": 0.4873325824737549, "learning_rate": 1.1567944250871081e-05, "loss": 2.8721, "step": 1000 }, { "epoch": 0.2555762081784387, "grad_norm": 0.7442412972450256, "learning_rate": 1.272938443670151e-05, "loss": 2.7949, "step": 1100 }, { "epoch": 0.2788104089219331, "grad_norm": 0.6129536628723145, "learning_rate": 1.389082462253194e-05, "loss": 2.6381, "step": 1200 }, { "epoch": 0.3020446096654275, "grad_norm": 0.5687291026115417, "learning_rate": 1.5052264808362371e-05, "loss": 2.4031, "step": 1300 }, { "epoch": 0.3252788104089219, "grad_norm": 0.6154528856277466, "learning_rate": 1.62137049941928e-05, "loss": 2.107, "step": 1400 }, { "epoch": 0.34851301115241634, "grad_norm": 0.8730382323265076, "learning_rate": 1.7375145180023228e-05, "loss": 1.981, "step": 1500 }, { "epoch": 0.37174721189591076, "grad_norm": 0.6668545603752136, "learning_rate": 1.8536585365853663e-05, "loss": 1.9311, "step": 1600 }, { "epoch": 0.3949814126394052, "grad_norm": 0.6021186709403992, "learning_rate": 1.969802555168409e-05, "loss": 1.8733, "step": 1700 }, { "epoch": 0.41821561338289964, "grad_norm": 0.8815124034881592, "learning_rate": 1.9904479153220604e-05, "loss": 1.8434, "step": 1800 }, { "epoch": 0.44144981412639406, "grad_norm": 1.1727079153060913, "learning_rate": 1.9775396927843037e-05, "loss": 1.8051, "step": 1900 }, { "epoch": 0.4646840148698885, "grad_norm": 1.1215996742248535, "learning_rate": 1.964631470246547e-05, "loss": 1.733, "step": 2000 }, { "epoch": 0.4879182156133829, "grad_norm": 1.1965365409851074, "learning_rate": 1.9517232477087907e-05, "loss": 1.6994, "step": 2100 }, { "epoch": 0.5111524163568774, "grad_norm": 1.2489936351776123, "learning_rate": 1.938815025171034e-05, "loss": 1.6529, "step": 2200 }, { "epoch": 0.5343866171003717, "grad_norm": 1.5988222360610962, "learning_rate": 1.9259068026332776e-05, "loss": 1.5897, "step": 2300 }, { "epoch": 0.5576208178438662, "grad_norm": 0.6558517217636108, "learning_rate": 1.912998580095521e-05, "loss": 1.5099, "step": 2400 }, { "epoch": 0.5808550185873605, "grad_norm": 0.7629631757736206, "learning_rate": 1.900219439783142e-05, "loss": 1.4466, "step": 2500 }, { "epoch": 0.604089219330855, "grad_norm": 0.9707331657409668, "learning_rate": 1.8873112172453855e-05, "loss": 1.3949, "step": 2600 }, { "epoch": 0.6273234200743495, "grad_norm": 0.849176287651062, "learning_rate": 1.874402994707629e-05, "loss": 1.3449, "step": 2700 }, { "epoch": 0.6505576208178439, "grad_norm": 0.460151731967926, "learning_rate": 1.8614947721698724e-05, "loss": 1.3182, "step": 2800 }, { "epoch": 0.6737918215613383, "grad_norm": 0.652923047542572, "learning_rate": 1.8485865496321157e-05, "loss": 1.2623, "step": 2900 }, { "epoch": 0.6970260223048327, "grad_norm": 0.5269683599472046, "learning_rate": 1.8356783270943594e-05, "loss": 1.2059, "step": 3000 }, { "epoch": 0.7202602230483272, "grad_norm": 0.6761623024940491, "learning_rate": 1.8227701045566027e-05, "loss": 1.1477, "step": 3100 }, { "epoch": 0.7434944237918215, "grad_norm": 0.4611155390739441, "learning_rate": 1.809861882018846e-05, "loss": 1.1063, "step": 3200 }, { "epoch": 0.766728624535316, "grad_norm": 1.20090913772583, "learning_rate": 1.7969536594810896e-05, "loss": 1.0812, "step": 3300 }, { "epoch": 0.7899628252788105, "grad_norm": 0.5198754072189331, "learning_rate": 1.7840454369433332e-05, "loss": 1.0637, "step": 3400 }, { "epoch": 0.8131970260223048, "grad_norm": 0.7287588119506836, "learning_rate": 1.7711372144055765e-05, "loss": 1.0311, "step": 3500 }, { "epoch": 0.8364312267657993, "grad_norm": 0.850121021270752, "learning_rate": 1.75822899186782e-05, "loss": 0.9687, "step": 3600 }, { "epoch": 0.8596654275092936, "grad_norm": 0.5256717801094055, "learning_rate": 1.7453207693300635e-05, "loss": 0.8706, "step": 3700 }, { "epoch": 0.8828996282527881, "grad_norm": 0.6515185236930847, "learning_rate": 1.7324125467923068e-05, "loss": 0.8474, "step": 3800 }, { "epoch": 0.9061338289962825, "grad_norm": 0.8604176640510559, "learning_rate": 1.7195043242545504e-05, "loss": 0.8302, "step": 3900 }, { "epoch": 0.929368029739777, "grad_norm": 0.3369189202785492, "learning_rate": 1.7065961017167937e-05, "loss": 0.7959, "step": 4000 }, { "epoch": 0.9526022304832714, "grad_norm": 0.4804532527923584, "learning_rate": 1.6936878791790373e-05, "loss": 0.7945, "step": 4100 }, { "epoch": 0.9758364312267658, "grad_norm": 0.3839660882949829, "learning_rate": 1.6807796566412806e-05, "loss": 0.7975, "step": 4200 }, { "epoch": 0.9990706319702602, "grad_norm": 0.31136325001716614, "learning_rate": 1.667871434103524e-05, "loss": 0.7804, "step": 4300 }, { "epoch": 1.0223048327137547, "grad_norm": 0.2822754681110382, "learning_rate": 1.6549632115657676e-05, "loss": 0.7502, "step": 4400 }, { "epoch": 1.045539033457249, "grad_norm": 0.3364527225494385, "learning_rate": 1.6420549890280112e-05, "loss": 0.747, "step": 4500 }, { "epoch": 1.0687732342007434, "grad_norm": 0.45242545008659363, "learning_rate": 1.6291467664902545e-05, "loss": 0.7263, "step": 4600 }, { "epoch": 1.092007434944238, "grad_norm": 0.2541595995426178, "learning_rate": 1.6162385439524978e-05, "loss": 0.7311, "step": 4700 }, { "epoch": 1.1152416356877324, "grad_norm": 0.32410866022109985, "learning_rate": 1.6033303214147415e-05, "loss": 0.7213, "step": 4800 }, { "epoch": 1.1384758364312269, "grad_norm": 0.28702208399772644, "learning_rate": 1.5904220988769848e-05, "loss": 0.7103, "step": 4900 }, { "epoch": 1.161710037174721, "grad_norm": 0.2637524902820587, "learning_rate": 1.577513876339228e-05, "loss": 0.7033, "step": 5000 }, { "epoch": 1.1849442379182156, "grad_norm": 0.38048645853996277, "learning_rate": 1.5646056538014717e-05, "loss": 0.7111, "step": 5100 }, { "epoch": 1.20817843866171, "grad_norm": 0.22926197946071625, "learning_rate": 1.5516974312637153e-05, "loss": 0.7053, "step": 5200 }, { "epoch": 1.2314126394052045, "grad_norm": 0.2666023373603821, "learning_rate": 1.5387892087259586e-05, "loss": 0.6915, "step": 5300 }, { "epoch": 1.2546468401486988, "grad_norm": 0.2618410587310791, "learning_rate": 1.525880986188202e-05, "loss": 0.6843, "step": 5400 }, { "epoch": 1.2778810408921932, "grad_norm": 0.24479706585407257, "learning_rate": 1.5129727636504454e-05, "loss": 0.6775, "step": 5500 }, { "epoch": 1.3011152416356877, "grad_norm": 0.19555561244487762, "learning_rate": 1.5000645411126889e-05, "loss": 0.6601, "step": 5600 }, { "epoch": 1.3243494423791822, "grad_norm": 0.2121550738811493, "learning_rate": 1.4871563185749323e-05, "loss": 0.6625, "step": 5700 }, { "epoch": 1.3475836431226766, "grad_norm": 0.36492133140563965, "learning_rate": 1.474248096037176e-05, "loss": 0.6567, "step": 5800 }, { "epoch": 1.370817843866171, "grad_norm": 0.28411343693733215, "learning_rate": 1.4613398734994193e-05, "loss": 0.6424, "step": 5900 }, { "epoch": 1.3940520446096654, "grad_norm": 0.3487832248210907, "learning_rate": 1.4484316509616627e-05, "loss": 0.6508, "step": 6000 }, { "epoch": 1.4172862453531598, "grad_norm": 0.4025629758834839, "learning_rate": 1.4355234284239062e-05, "loss": 0.6374, "step": 6100 }, { "epoch": 1.4405204460966543, "grad_norm": 0.31936919689178467, "learning_rate": 1.4226152058861495e-05, "loss": 0.6462, "step": 6200 }, { "epoch": 1.4637546468401488, "grad_norm": 0.27360206842422485, "learning_rate": 1.409706983348393e-05, "loss": 0.6382, "step": 6300 }, { "epoch": 1.486988847583643, "grad_norm": 0.35483697056770325, "learning_rate": 1.3967987608106366e-05, "loss": 0.6274, "step": 6400 }, { "epoch": 1.5102230483271375, "grad_norm": 0.30311813950538635, "learning_rate": 1.38389053827288e-05, "loss": 0.6258, "step": 6500 }, { "epoch": 1.533457249070632, "grad_norm": 0.3184954524040222, "learning_rate": 1.3709823157351234e-05, "loss": 0.6313, "step": 6600 }, { "epoch": 1.5566914498141264, "grad_norm": 0.2632908821105957, "learning_rate": 1.3580740931973668e-05, "loss": 0.6217, "step": 6700 }, { "epoch": 1.579925650557621, "grad_norm": 0.22145096957683563, "learning_rate": 1.3451658706596103e-05, "loss": 0.6245, "step": 6800 }, { "epoch": 1.6031598513011152, "grad_norm": 0.5008528828620911, "learning_rate": 1.3322576481218536e-05, "loss": 0.6187, "step": 6900 }, { "epoch": 1.6263940520446096, "grad_norm": 0.25452372431755066, "learning_rate": 1.3193494255840972e-05, "loss": 0.6084, "step": 7000 }, { "epoch": 1.649628252788104, "grad_norm": 0.3917735815048218, "learning_rate": 1.3064412030463407e-05, "loss": 0.6088, "step": 7100 }, { "epoch": 1.6728624535315983, "grad_norm": 0.28736940026283264, "learning_rate": 1.2935329805085842e-05, "loss": 0.6084, "step": 7200 }, { "epoch": 1.696096654275093, "grad_norm": 0.3900860548019409, "learning_rate": 1.2807538401962051e-05, "loss": 0.6017, "step": 7300 }, { "epoch": 1.7193308550185873, "grad_norm": 0.2482582926750183, "learning_rate": 1.2678456176584486e-05, "loss": 0.5964, "step": 7400 }, { "epoch": 1.7425650557620818, "grad_norm": 0.2464774250984192, "learning_rate": 1.254937395120692e-05, "loss": 0.5929, "step": 7500 }, { "epoch": 1.7657992565055762, "grad_norm": 0.36112162470817566, "learning_rate": 1.2420291725829354e-05, "loss": 0.5913, "step": 7600 }, { "epoch": 1.7890334572490705, "grad_norm": 0.30204829573631287, "learning_rate": 1.2291209500451788e-05, "loss": 0.5804, "step": 7700 }, { "epoch": 1.8122676579925652, "grad_norm": 0.2731075584888458, "learning_rate": 1.2162127275074223e-05, "loss": 0.5881, "step": 7800 }, { "epoch": 1.8355018587360594, "grad_norm": 0.24604862928390503, "learning_rate": 1.2033045049696656e-05, "loss": 0.5679, "step": 7900 }, { "epoch": 1.858736059479554, "grad_norm": 0.3449194133281708, "learning_rate": 1.1903962824319092e-05, "loss": 0.582, "step": 8000 }, { "epoch": 1.8819702602230484, "grad_norm": 0.310375452041626, "learning_rate": 1.1774880598941527e-05, "loss": 0.575, "step": 8100 }, { "epoch": 1.9052044609665426, "grad_norm": 0.28315114974975586, "learning_rate": 1.1645798373563962e-05, "loss": 0.5722, "step": 8200 }, { "epoch": 1.9284386617100373, "grad_norm": 0.3091906011104584, "learning_rate": 1.1516716148186395e-05, "loss": 0.5533, "step": 8300 }, { "epoch": 1.9516728624535316, "grad_norm": 0.28990840911865234, "learning_rate": 1.138763392280883e-05, "loss": 0.5724, "step": 8400 }, { "epoch": 1.974907063197026, "grad_norm": 0.44591304659843445, "learning_rate": 1.1258551697431264e-05, "loss": 0.5701, "step": 8500 }, { "epoch": 1.9981412639405205, "grad_norm": 0.26404786109924316, "learning_rate": 1.11294694720537e-05, "loss": 0.553, "step": 8600 }, { "epoch": 2.0213754646840147, "grad_norm": 0.2843058705329895, "learning_rate": 1.1000387246676133e-05, "loss": 0.5631, "step": 8700 }, { "epoch": 2.0446096654275094, "grad_norm": 0.20029422640800476, "learning_rate": 1.0871305021298568e-05, "loss": 0.5495, "step": 8800 }, { "epoch": 2.0678438661710037, "grad_norm": 0.26215997338294983, "learning_rate": 1.0742222795921003e-05, "loss": 0.5562, "step": 8900 }, { "epoch": 2.091078066914498, "grad_norm": 0.29611942172050476, "learning_rate": 1.0613140570543436e-05, "loss": 0.5541, "step": 9000 }, { "epoch": 2.1143122676579926, "grad_norm": 0.2809213697910309, "learning_rate": 1.048405834516587e-05, "loss": 0.5429, "step": 9100 }, { "epoch": 2.137546468401487, "grad_norm": 0.4684973657131195, "learning_rate": 1.0354976119788307e-05, "loss": 0.5518, "step": 9200 }, { "epoch": 2.1607806691449816, "grad_norm": 0.2790776193141937, "learning_rate": 1.0225893894410741e-05, "loss": 0.5485, "step": 9300 }, { "epoch": 2.184014869888476, "grad_norm": 0.24624982476234436, "learning_rate": 1.0096811669033174e-05, "loss": 0.5434, "step": 9400 }, { "epoch": 2.20724907063197, "grad_norm": 0.27161070704460144, "learning_rate": 9.967729443655609e-06, "loss": 0.5503, "step": 9500 }, { "epoch": 2.2304832713754648, "grad_norm": 0.2635902166366577, "learning_rate": 9.838647218278044e-06, "loss": 0.538, "step": 9600 }, { "epoch": 2.253717472118959, "grad_norm": 0.35729700326919556, "learning_rate": 9.709564992900478e-06, "loss": 0.5376, "step": 9700 }, { "epoch": 2.2769516728624537, "grad_norm": 0.224281907081604, "learning_rate": 9.580482767522913e-06, "loss": 0.5423, "step": 9800 }, { "epoch": 2.300185873605948, "grad_norm": 0.2016523778438568, "learning_rate": 9.451400542145348e-06, "loss": 0.54, "step": 9900 }, { "epoch": 2.323420074349442, "grad_norm": 0.3719424605369568, "learning_rate": 9.322318316767782e-06, "loss": 0.5326, "step": 10000 }, { "epoch": 2.346654275092937, "grad_norm": 0.22268572449684143, "learning_rate": 9.193236091390217e-06, "loss": 0.5379, "step": 10100 }, { "epoch": 2.369888475836431, "grad_norm": 0.3181590735912323, "learning_rate": 9.06415386601265e-06, "loss": 0.5328, "step": 10200 }, { "epoch": 2.393122676579926, "grad_norm": 0.2703763246536255, "learning_rate": 8.935071640635087e-06, "loss": 0.5276, "step": 10300 }, { "epoch": 2.41635687732342, "grad_norm": 0.2698732912540436, "learning_rate": 8.80598941525752e-06, "loss": 0.5338, "step": 10400 }, { "epoch": 2.4395910780669143, "grad_norm": 0.2765790820121765, "learning_rate": 8.676907189879954e-06, "loss": 0.5418, "step": 10500 }, { "epoch": 2.462825278810409, "grad_norm": 0.36516493558883667, "learning_rate": 8.547824964502389e-06, "loss": 0.5249, "step": 10600 }, { "epoch": 2.4860594795539033, "grad_norm": 0.23371903598308563, "learning_rate": 8.418742739124824e-06, "loss": 0.5318, "step": 10700 }, { "epoch": 2.5092936802973975, "grad_norm": 0.23883387446403503, "learning_rate": 8.289660513747258e-06, "loss": 0.5336, "step": 10800 }, { "epoch": 2.532527881040892, "grad_norm": 0.23600026965141296, "learning_rate": 8.160578288369693e-06, "loss": 0.5207, "step": 10900 }, { "epoch": 2.5557620817843865, "grad_norm": 0.22283987700939178, "learning_rate": 8.031496062992128e-06, "loss": 0.5261, "step": 11000 }, { "epoch": 2.578996282527881, "grad_norm": 0.3077383041381836, "learning_rate": 7.90241383761456e-06, "loss": 0.5117, "step": 11100 }, { "epoch": 2.6022304832713754, "grad_norm": 0.24372899532318115, "learning_rate": 7.773331612236995e-06, "loss": 0.5251, "step": 11200 }, { "epoch": 2.6254646840148697, "grad_norm": 0.3168962001800537, "learning_rate": 7.64424938685943e-06, "loss": 0.5238, "step": 11300 }, { "epoch": 2.6486988847583643, "grad_norm": 0.2522094249725342, "learning_rate": 7.515167161481865e-06, "loss": 0.5141, "step": 11400 }, { "epoch": 2.6719330855018586, "grad_norm": 0.4139024317264557, "learning_rate": 7.3860849361042984e-06, "loss": 0.5185, "step": 11500 }, { "epoch": 2.6951672862453533, "grad_norm": 0.2781153619289398, "learning_rate": 7.257002710726734e-06, "loss": 0.5121, "step": 11600 }, { "epoch": 2.7184014869888475, "grad_norm": 0.38515913486480713, "learning_rate": 7.127920485349168e-06, "loss": 0.5178, "step": 11700 }, { "epoch": 2.741635687732342, "grad_norm": 0.33289971947669983, "learning_rate": 6.998838259971602e-06, "loss": 0.5124, "step": 11800 }, { "epoch": 2.7648698884758365, "grad_norm": 0.36876046657562256, "learning_rate": 6.871046856847813e-06, "loss": 0.5137, "step": 11900 }, { "epoch": 2.7881040892193307, "grad_norm": 0.28098130226135254, "learning_rate": 6.7419646314702466e-06, "loss": 0.509, "step": 12000 }, { "epoch": 2.8113382899628254, "grad_norm": 0.32521939277648926, "learning_rate": 6.612882406092681e-06, "loss": 0.512, "step": 12100 }, { "epoch": 2.8345724907063197, "grad_norm": 0.23627902567386627, "learning_rate": 6.483800180715116e-06, "loss": 0.5084, "step": 12200 }, { "epoch": 2.857806691449814, "grad_norm": 0.23111554980278015, "learning_rate": 6.354717955337551e-06, "loss": 0.517, "step": 12300 }, { "epoch": 2.8810408921933086, "grad_norm": 0.3062553107738495, "learning_rate": 6.2256357299599844e-06, "loss": 0.5063, "step": 12400 }, { "epoch": 2.904275092936803, "grad_norm": 0.3274383842945099, "learning_rate": 6.09655350458242e-06, "loss": 0.5066, "step": 12500 }, { "epoch": 2.9275092936802976, "grad_norm": 0.25803956389427185, "learning_rate": 5.967471279204854e-06, "loss": 0.5064, "step": 12600 }, { "epoch": 2.950743494423792, "grad_norm": 0.29026666283607483, "learning_rate": 5.838389053827288e-06, "loss": 0.5088, "step": 12700 }, { "epoch": 2.973977695167286, "grad_norm": 0.36228805780410767, "learning_rate": 5.709306828449723e-06, "loss": 0.507, "step": 12800 }, { "epoch": 2.9972118959107807, "grad_norm": 0.2669726014137268, "learning_rate": 5.580224603072157e-06, "loss": 0.4934, "step": 12900 }, { "epoch": 3.020446096654275, "grad_norm": 0.24396216869354248, "learning_rate": 5.451142377694592e-06, "loss": 0.5099, "step": 13000 }, { "epoch": 3.0436802973977697, "grad_norm": 0.25540581345558167, "learning_rate": 5.322060152317027e-06, "loss": 0.5037, "step": 13100 }, { "epoch": 3.066914498141264, "grad_norm": 0.1964583396911621, "learning_rate": 5.192977926939461e-06, "loss": 0.5055, "step": 13200 }, { "epoch": 3.090148698884758, "grad_norm": 0.2318154275417328, "learning_rate": 5.063895701561895e-06, "loss": 0.5041, "step": 13300 }, { "epoch": 3.113382899628253, "grad_norm": 0.28110265731811523, "learning_rate": 4.9348134761843295e-06, "loss": 0.5043, "step": 13400 }, { "epoch": 3.136617100371747, "grad_norm": 0.3360753357410431, "learning_rate": 4.805731250806764e-06, "loss": 0.4915, "step": 13500 }, { "epoch": 3.159851301115242, "grad_norm": 0.3044135868549347, "learning_rate": 4.676649025429199e-06, "loss": 0.499, "step": 13600 }, { "epoch": 3.183085501858736, "grad_norm": 0.28163620829582214, "learning_rate": 4.547566800051634e-06, "loss": 0.4996, "step": 13700 }, { "epoch": 3.2063197026022303, "grad_norm": 0.23853909969329834, "learning_rate": 4.418484574674068e-06, "loss": 0.5073, "step": 13800 }, { "epoch": 3.229553903345725, "grad_norm": 0.25510174036026, "learning_rate": 4.289402349296502e-06, "loss": 0.4988, "step": 13900 }, { "epoch": 3.2527881040892193, "grad_norm": 0.650174081325531, "learning_rate": 4.160320123918937e-06, "loss": 0.5024, "step": 14000 }, { "epoch": 3.276022304832714, "grad_norm": 0.36293137073516846, "learning_rate": 4.0312378985413715e-06, "loss": 0.4913, "step": 14100 }, { "epoch": 3.299256505576208, "grad_norm": 0.35399818420410156, "learning_rate": 3.902155673163805e-06, "loss": 0.4993, "step": 14200 }, { "epoch": 3.3224907063197024, "grad_norm": 0.2553289830684662, "learning_rate": 3.7730734477862404e-06, "loss": 0.5017, "step": 14300 }, { "epoch": 3.345724907063197, "grad_norm": 0.25535061955451965, "learning_rate": 3.643991222408675e-06, "loss": 0.4895, "step": 14400 }, { "epoch": 3.3689591078066914, "grad_norm": 0.2772742509841919, "learning_rate": 3.514908997031109e-06, "loss": 0.4954, "step": 14500 }, { "epoch": 3.392193308550186, "grad_norm": 0.26105812191963196, "learning_rate": 3.387117593907319e-06, "loss": 0.4964, "step": 14600 }, { "epoch": 3.4154275092936803, "grad_norm": 0.2538992166519165, "learning_rate": 3.258035368529754e-06, "loss": 0.4985, "step": 14700 }, { "epoch": 3.4386617100371746, "grad_norm": 0.2889178693294525, "learning_rate": 3.128953143152188e-06, "loss": 0.4969, "step": 14800 }, { "epoch": 3.4618959107806693, "grad_norm": 0.28792130947113037, "learning_rate": 2.9998709177746228e-06, "loss": 0.4985, "step": 14900 }, { "epoch": 3.4851301115241635, "grad_norm": 0.36826494336128235, "learning_rate": 2.8707886923970575e-06, "loss": 0.4937, "step": 15000 }, { "epoch": 3.508364312267658, "grad_norm": 0.24432937800884247, "learning_rate": 2.7417064670194917e-06, "loss": 0.4892, "step": 15100 }, { "epoch": 3.5315985130111525, "grad_norm": 0.36436623334884644, "learning_rate": 2.6126242416419264e-06, "loss": 0.5029, "step": 15200 }, { "epoch": 3.5548327137546467, "grad_norm": 0.3257830739021301, "learning_rate": 2.4835420162643606e-06, "loss": 0.484, "step": 15300 }, { "epoch": 3.5780669144981414, "grad_norm": 0.20910651981830597, "learning_rate": 2.354459790886795e-06, "loss": 0.4934, "step": 15400 }, { "epoch": 3.6013011152416357, "grad_norm": 0.27706313133239746, "learning_rate": 2.2253775655092296e-06, "loss": 0.4972, "step": 15500 }, { "epoch": 3.6245353159851303, "grad_norm": 0.28043028712272644, "learning_rate": 2.0962953401316643e-06, "loss": 0.4878, "step": 15600 }, { "epoch": 3.6477695167286246, "grad_norm": 0.34835153818130493, "learning_rate": 1.9672131147540985e-06, "loss": 0.4954, "step": 15700 }, { "epoch": 3.671003717472119, "grad_norm": 0.3561202585697174, "learning_rate": 1.838130889376533e-06, "loss": 0.4992, "step": 15800 }, { "epoch": 3.6942379182156135, "grad_norm": 0.2767621576786041, "learning_rate": 1.7090486639989677e-06, "loss": 0.4982, "step": 15900 }, { "epoch": 3.717472118959108, "grad_norm": 0.22851090133190155, "learning_rate": 1.579966438621402e-06, "loss": 0.498, "step": 16000 }, { "epoch": 3.7407063197026025, "grad_norm": 0.28282201290130615, "learning_rate": 1.4508842132438364e-06, "loss": 0.4898, "step": 16100 }, { "epoch": 3.7639405204460967, "grad_norm": 0.24474182724952698, "learning_rate": 1.3218019878662709e-06, "loss": 0.501, "step": 16200 }, { "epoch": 3.787174721189591, "grad_norm": 0.27427938580513, "learning_rate": 1.1927197624887055e-06, "loss": 0.4966, "step": 16300 }, { "epoch": 3.8104089219330852, "grad_norm": 0.38391393423080444, "learning_rate": 1.0636375371111398e-06, "loss": 0.4941, "step": 16400 }, { "epoch": 3.83364312267658, "grad_norm": 0.3098974823951721, "learning_rate": 9.345553117335744e-07, "loss": 0.4879, "step": 16500 }, { "epoch": 3.8568773234200746, "grad_norm": 0.2817577123641968, "learning_rate": 8.054730863560088e-07, "loss": 0.4925, "step": 16600 }, { "epoch": 3.880111524163569, "grad_norm": 0.3037372827529907, "learning_rate": 6.763908609784433e-07, "loss": 0.4927, "step": 16700 }, { "epoch": 3.903345724907063, "grad_norm": 0.2850995659828186, "learning_rate": 5.473086356008779e-07, "loss": 0.4909, "step": 16800 }, { "epoch": 3.9265799256505574, "grad_norm": 0.25115731358528137, "learning_rate": 4.182264102233123e-07, "loss": 0.5, "step": 16900 }, { "epoch": 3.949814126394052, "grad_norm": 0.4323899745941162, "learning_rate": 2.8914418484574677e-07, "loss": 0.4861, "step": 17000 }, { "epoch": 3.9730483271375467, "grad_norm": 0.30076873302459717, "learning_rate": 1.6006195946818127e-07, "loss": 0.4855, "step": 17100 }, { "epoch": 3.996282527881041, "grad_norm": 0.2874129116535187, "learning_rate": 3.097973409061573e-08, "loss": 0.4957, "step": 17200 } ], "logging_steps": 100, "max_steps": 17216, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.805111076121907e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }