{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 1470, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06802721088435375, "grad_norm": 8.727604866027832, "learning_rate": 1.3513513513513515e-05, "loss": 1.5031, "step": 10 }, { "epoch": 0.1360544217687075, "grad_norm": 1.7814197540283203, "learning_rate": 2.702702702702703e-05, "loss": 0.7658, "step": 20 }, { "epoch": 0.20408163265306123, "grad_norm": 1.980387568473816, "learning_rate": 4.0540540540540545e-05, "loss": 0.3154, "step": 30 }, { "epoch": 0.272108843537415, "grad_norm": 0.6935164928436279, "learning_rate": 5.405405405405406e-05, "loss": 0.1928, "step": 40 }, { "epoch": 0.3401360544217687, "grad_norm": 0.9458510875701904, "learning_rate": 6.756756756756757e-05, "loss": 0.1309, "step": 50 }, { "epoch": 0.40816326530612246, "grad_norm": 0.7734609842300415, "learning_rate": 8.108108108108109e-05, "loss": 0.1017, "step": 60 }, { "epoch": 0.47619047619047616, "grad_norm": 0.6477052569389343, "learning_rate": 9.45945945945946e-05, "loss": 0.0922, "step": 70 }, { "epoch": 0.54421768707483, "grad_norm": 0.36809656023979187, "learning_rate": 9.999544209971299e-05, "loss": 0.077, "step": 80 }, { "epoch": 0.6122448979591837, "grad_norm": 0.5711944103240967, "learning_rate": 9.996759127387258e-05, "loss": 0.0734, "step": 90 }, { "epoch": 0.6802721088435374, "grad_norm": 0.7092326879501343, "learning_rate": 9.991443587612567e-05, "loss": 0.0723, "step": 100 }, { "epoch": 0.7482993197278912, "grad_norm": 0.48586952686309814, "learning_rate": 9.983600282541213e-05, "loss": 0.0633, "step": 110 }, { "epoch": 0.8163265306122449, "grad_norm": 0.5356522798538208, "learning_rate": 9.97323318417729e-05, "loss": 0.0533, "step": 120 }, { "epoch": 0.8843537414965986, "grad_norm": 0.5567297339439392, "learning_rate": 9.960347542623504e-05, "loss": 0.0482, "step": 130 }, { "epoch": 0.9523809523809523, "grad_norm": 0.27179425954818726, "learning_rate": 9.944949883422408e-05, "loss": 0.0485, "step": 140 }, { "epoch": 1.0204081632653061, "grad_norm": 0.3694400191307068, "learning_rate": 9.927048004251747e-05, "loss": 0.0434, "step": 150 }, { "epoch": 1.08843537414966, "grad_norm": 0.49855223298072815, "learning_rate": 9.906650970975558e-05, "loss": 0.0412, "step": 160 }, { "epoch": 1.1564625850340136, "grad_norm": 0.34035682678222656, "learning_rate": 9.883769113053039e-05, "loss": 0.0349, "step": 170 }, { "epoch": 1.2244897959183674, "grad_norm": 0.3545050621032715, "learning_rate": 9.858414018307503e-05, "loss": 0.0378, "step": 180 }, { "epoch": 1.2925170068027212, "grad_norm": 0.2835242450237274, "learning_rate": 9.830598527058082e-05, "loss": 0.0336, "step": 190 }, { "epoch": 1.3605442176870748, "grad_norm": 0.5776814222335815, "learning_rate": 9.800336725617135e-05, "loss": 0.0368, "step": 200 }, { "epoch": 1.4285714285714286, "grad_norm": 0.32552891969680786, "learning_rate": 9.767643939156658e-05, "loss": 0.0363, "step": 210 }, { "epoch": 1.4965986394557822, "grad_norm": 0.3568095266819, "learning_rate": 9.732536723947321e-05, "loss": 0.0343, "step": 220 }, { "epoch": 1.564625850340136, "grad_norm": 0.291451632976532, "learning_rate": 9.695032858974042e-05, "loss": 0.0304, "step": 230 }, { "epoch": 1.6326530612244898, "grad_norm": 0.46555641293525696, "learning_rate": 9.655151336932362e-05, "loss": 0.0295, "step": 240 }, { "epoch": 1.7006802721088436, "grad_norm": 0.4064323306083679, "learning_rate": 9.612912354610171e-05, 
"loss": 0.0313, "step": 250 }, { "epoch": 1.7687074829931972, "grad_norm": 0.4117746353149414, "learning_rate": 9.568337302659651e-05, "loss": 0.0308, "step": 260 }, { "epoch": 1.836734693877551, "grad_norm": 0.38040801882743835, "learning_rate": 9.521448754764639e-05, "loss": 0.0283, "step": 270 }, { "epoch": 1.9047619047619047, "grad_norm": 0.3420761227607727, "learning_rate": 9.472270456208855e-05, "loss": 0.0268, "step": 280 }, { "epoch": 1.9727891156462585, "grad_norm": 0.39420321583747864, "learning_rate": 9.420827311850836e-05, "loss": 0.0299, "step": 290 }, { "epoch": 2.0408163265306123, "grad_norm": 0.5203326940536499, "learning_rate": 9.367145373511619e-05, "loss": 0.0285, "step": 300 }, { "epoch": 2.108843537414966, "grad_norm": 0.2856321334838867, "learning_rate": 9.311251826781587e-05, "loss": 0.0297, "step": 310 }, { "epoch": 2.17687074829932, "grad_norm": 0.39213433861732483, "learning_rate": 9.25317497725315e-05, "loss": 0.0297, "step": 320 }, { "epoch": 2.2448979591836733, "grad_norm": 0.3779311776161194, "learning_rate": 9.192944236186236e-05, "loss": 0.0267, "step": 330 }, { "epoch": 2.312925170068027, "grad_norm": 0.18286894261837006, "learning_rate": 9.130590105613854e-05, "loss": 0.0287, "step": 340 }, { "epoch": 2.380952380952381, "grad_norm": 0.29343509674072266, "learning_rate": 9.066144162895258e-05, "loss": 0.0281, "step": 350 }, { "epoch": 2.4489795918367347, "grad_norm": 0.4211502969264984, "learning_rate": 8.999639044724555e-05, "loss": 0.0278, "step": 360 }, { "epoch": 2.5170068027210886, "grad_norm": 0.3153688311576843, "learning_rate": 8.931108430602834e-05, "loss": 0.026, "step": 370 }, { "epoch": 2.5850340136054424, "grad_norm": 0.5064184069633484, "learning_rate": 8.860587025782214e-05, "loss": 0.0279, "step": 380 }, { "epoch": 2.6530612244897958, "grad_norm": 0.48021095991134644, "learning_rate": 8.788110543690416e-05, "loss": 0.0275, "step": 390 }, { "epoch": 2.7210884353741496, "grad_norm": 0.3844946622848511, "learning_rate": 8.713715687844772e-05, "loss": 0.0262, "step": 400 }, { "epoch": 2.7891156462585034, "grad_norm": 0.40209805965423584, "learning_rate": 8.637440133264858e-05, "loss": 0.0234, "step": 410 }, { "epoch": 2.857142857142857, "grad_norm": 0.21443378925323486, "learning_rate": 8.55932250739311e-05, "loss": 0.0257, "step": 420 }, { "epoch": 2.925170068027211, "grad_norm": 0.33836257457733154, "learning_rate": 8.479402370533127e-05, "loss": 0.0256, "step": 430 }, { "epoch": 2.9931972789115644, "grad_norm": 0.4585645794868469, "learning_rate": 8.397720195815562e-05, "loss": 0.0263, "step": 440 }, { "epoch": 3.061224489795918, "grad_norm": 0.31649190187454224, "learning_rate": 8.314317348701723e-05, "loss": 0.0278, "step": 450 }, { "epoch": 3.129251700680272, "grad_norm": 0.2884267270565033, "learning_rate": 8.2292360660353e-05, "loss": 0.0252, "step": 460 }, { "epoch": 3.197278911564626, "grad_norm": 0.3036350607872009, "learning_rate": 8.142519434652782e-05, "loss": 0.0263, "step": 470 }, { "epoch": 3.2653061224489797, "grad_norm": 0.3131781816482544, "learning_rate": 8.054211369563447e-05, "loss": 0.0235, "step": 480 }, { "epoch": 3.3333333333333335, "grad_norm": 0.25176069140434265, "learning_rate": 7.96435659170993e-05, "loss": 0.0226, "step": 490 }, { "epoch": 3.4013605442176873, "grad_norm": 0.19175677001476288, "learning_rate": 7.873000605320659e-05, "loss": 0.0227, "step": 500 }, { "epoch": 3.4693877551020407, "grad_norm": 0.24616192281246185, "learning_rate": 7.780189674865616e-05, "loss": 0.0215, "step": 510 }, { "epoch": 
3.5374149659863945, "grad_norm": 0.3125239610671997, "learning_rate": 7.685970801627108e-05, "loss": 0.0228, "step": 520 }, { "epoch": 3.6054421768707483, "grad_norm": 0.2894313633441925, "learning_rate": 7.590391699897375e-05, "loss": 0.0227, "step": 530 }, { "epoch": 3.673469387755102, "grad_norm": 0.2937842905521393, "learning_rate": 7.493500772815149e-05, "loss": 0.0229, "step": 540 }, { "epoch": 3.741496598639456, "grad_norm": 0.24282997846603394, "learning_rate": 7.395347087853349e-05, "loss": 0.0202, "step": 550 }, { "epoch": 3.8095238095238093, "grad_norm": 0.23689059913158417, "learning_rate": 7.29598035197033e-05, "loss": 0.023, "step": 560 }, { "epoch": 3.877551020408163, "grad_norm": 0.2780821919441223, "learning_rate": 7.195450886437334e-05, "loss": 0.0209, "step": 570 }, { "epoch": 3.945578231292517, "grad_norm": 0.21368242800235748, "learning_rate": 7.093809601354769e-05, "loss": 0.0223, "step": 580 }, { "epoch": 4.01360544217687, "grad_norm": 0.2794683873653412, "learning_rate": 6.991107969870363e-05, "loss": 0.0217, "step": 590 }, { "epoch": 4.081632653061225, "grad_norm": 0.3907700181007385, "learning_rate": 6.887398002112129e-05, "loss": 0.0244, "step": 600 }, { "epoch": 4.149659863945578, "grad_norm": 0.25769805908203125, "learning_rate": 6.782732218849424e-05, "loss": 0.0218, "step": 610 }, { "epoch": 4.217687074829932, "grad_norm": 0.3626869320869446, "learning_rate": 6.677163624895393e-05, "loss": 0.0232, "step": 620 }, { "epoch": 4.285714285714286, "grad_norm": 0.31296688318252563, "learning_rate": 6.570745682264288e-05, "loss": 0.0195, "step": 630 }, { "epoch": 4.35374149659864, "grad_norm": 0.2178487330675125, "learning_rate": 6.463532283097247e-05, "loss": 0.0185, "step": 640 }, { "epoch": 4.421768707482993, "grad_norm": 0.3192322552204132, "learning_rate": 6.355577722370264e-05, "loss": 0.0205, "step": 650 }, { "epoch": 4.489795918367347, "grad_norm": 0.2512325346469879, "learning_rate": 6.246936670398136e-05, "loss": 0.019, "step": 660 }, { "epoch": 4.557823129251701, "grad_norm": 0.23534470796585083, "learning_rate": 6.137664145148339e-05, "loss": 0.0194, "step": 670 }, { "epoch": 4.625850340136054, "grad_norm": 0.25171300768852234, "learning_rate": 6.027815484378848e-05, "loss": 0.0206, "step": 680 }, { "epoch": 4.6938775510204085, "grad_norm": 0.2704395055770874, "learning_rate": 5.9174463176140115e-05, "loss": 0.0203, "step": 690 }, { "epoch": 4.761904761904762, "grad_norm": 0.23156875371932983, "learning_rate": 5.8066125379726576e-05, "loss": 0.0224, "step": 700 }, { "epoch": 4.829931972789115, "grad_norm": 0.2349930703639984, "learning_rate": 5.695370273862721e-05, "loss": 0.0183, "step": 710 }, { "epoch": 4.8979591836734695, "grad_norm": 0.18136943876743317, "learning_rate": 5.583775860556717e-05, "loss": 0.0186, "step": 720 }, { "epoch": 4.965986394557823, "grad_norm": 0.16748912632465363, "learning_rate": 5.4718858116624416e-05, "loss": 0.0178, "step": 730 }, { "epoch": 5.034013605442177, "grad_norm": 0.24847835302352905, "learning_rate": 5.359756790503375e-05, "loss": 0.0198, "step": 740 }, { "epoch": 5.1020408163265305, "grad_norm": 0.2634091377258301, "learning_rate": 5.247445581423257e-05, "loss": 0.0194, "step": 750 }, { "epoch": 5.170068027210885, "grad_norm": 0.24373231828212738, "learning_rate": 5.1350090610293765e-05, "loss": 0.02, "step": 760 }, { "epoch": 5.238095238095238, "grad_norm": 0.25535136461257935, "learning_rate": 5.0225041693891396e-05, "loss": 0.0206, "step": 770 }, { "epoch": 5.3061224489795915, "grad_norm": 0.19376784563064575, 
"learning_rate": 4.909987881194497e-05, "loss": 0.0171, "step": 780 }, { "epoch": 5.374149659863946, "grad_norm": 0.3071853220462799, "learning_rate": 4.797517176908836e-05, "loss": 0.0169, "step": 790 }, { "epoch": 5.442176870748299, "grad_norm": 0.3157635033130646, "learning_rate": 4.685149013910961e-05, "loss": 0.0203, "step": 800 }, { "epoch": 5.510204081632653, "grad_norm": 0.4202154874801636, "learning_rate": 4.572940297650747e-05, "loss": 0.0209, "step": 810 }, { "epoch": 5.578231292517007, "grad_norm": 0.22340843081474304, "learning_rate": 4.460947852831096e-05, "loss": 0.0188, "step": 820 }, { "epoch": 5.646258503401361, "grad_norm": 0.3132137358188629, "learning_rate": 4.349228394630808e-05, "loss": 0.0174, "step": 830 }, { "epoch": 5.714285714285714, "grad_norm": 0.3128272294998169, "learning_rate": 4.2378384999828736e-05, "loss": 0.0165, "step": 840 }, { "epoch": 5.782312925170068, "grad_norm": 0.2383180409669876, "learning_rate": 4.1268345789228155e-05, "loss": 0.0173, "step": 850 }, { "epoch": 5.850340136054422, "grad_norm": 0.206215962767601, "learning_rate": 4.0162728460215346e-05, "loss": 0.0173, "step": 860 }, { "epoch": 5.918367346938775, "grad_norm": 0.2179701030254364, "learning_rate": 3.9062092919171414e-05, "loss": 0.0189, "step": 870 }, { "epoch": 5.986394557823129, "grad_norm": 0.1265663504600525, "learning_rate": 3.796699654960197e-05, "loss": 0.0189, "step": 880 }, { "epoch": 6.054421768707483, "grad_norm": 0.22085952758789062, "learning_rate": 3.687799392986714e-05, "loss": 0.0171, "step": 890 }, { "epoch": 6.122448979591836, "grad_norm": 0.269628643989563, "learning_rate": 3.57956365523322e-05, "loss": 0.0165, "step": 900 }, { "epoch": 6.190476190476191, "grad_norm": 0.17897725105285645, "learning_rate": 3.4720472544080905e-05, "loss": 0.0186, "step": 910 }, { "epoch": 6.258503401360544, "grad_norm": 0.20135192573070526, "learning_rate": 3.365304638933322e-05, "loss": 0.0166, "step": 920 }, { "epoch": 6.326530612244898, "grad_norm": 0.13800643384456635, "learning_rate": 3.2593898653707775e-05, "loss": 0.0178, "step": 930 }, { "epoch": 6.394557823129252, "grad_norm": 0.20002074539661407, "learning_rate": 3.1543565710468744e-05, "loss": 0.018, "step": 940 }, { "epoch": 6.462585034013605, "grad_norm": 0.21418657898902893, "learning_rate": 3.0502579468895943e-05, "loss": 0.0165, "step": 950 }, { "epoch": 6.530612244897959, "grad_norm": 0.14232315123081207, "learning_rate": 2.9471467104915453e-05, "loss": 0.0174, "step": 960 }, { "epoch": 6.598639455782313, "grad_norm": 0.24349236488342285, "learning_rate": 2.8450750794127312e-05, "loss": 0.0161, "step": 970 }, { "epoch": 6.666666666666667, "grad_norm": 0.21397683024406433, "learning_rate": 2.744094744736566e-05, "loss": 0.0157, "step": 980 }, { "epoch": 6.73469387755102, "grad_norm": 0.16351279616355896, "learning_rate": 2.6442568448924754e-05, "loss": 0.0149, "step": 990 }, { "epoch": 6.802721088435375, "grad_norm": 0.22943644225597382, "learning_rate": 2.5456119397583923e-05, "loss": 0.0151, "step": 1000 }, { "epoch": 6.870748299319728, "grad_norm": 0.23005446791648865, "learning_rate": 2.4482099850562494e-05, "loss": 0.0158, "step": 1010 }, { "epoch": 6.938775510204081, "grad_norm": 0.23477675020694733, "learning_rate": 2.3521003070534065e-05, "loss": 0.0177, "step": 1020 }, { "epoch": 7.006802721088436, "grad_norm": 0.1673477590084076, "learning_rate": 2.257331577582865e-05, "loss": 0.0173, "step": 1030 }, { "epoch": 7.074829931972789, "grad_norm": 0.2435804158449173, "learning_rate": 2.1639517893948925e-05, 
"loss": 0.017, "step": 1040 }, { "epoch": 7.142857142857143, "grad_norm": 0.1946263313293457, "learning_rate": 2.07200823185254e-05, "loss": 0.0156, "step": 1050 }, { "epoch": 7.210884353741497, "grad_norm": 0.14593501389026642, "learning_rate": 1.9815474669833982e-05, "loss": 0.0158, "step": 1060 }, { "epoch": 7.27891156462585, "grad_norm": 0.22885189950466156, "learning_rate": 1.8926153058996448e-05, "loss": 0.0164, "step": 1070 }, { "epoch": 7.346938775510204, "grad_norm": 0.27215731143951416, "learning_rate": 1.8052567855984158e-05, "loss": 0.0159, "step": 1080 }, { "epoch": 7.414965986394558, "grad_norm": 0.2108229249715805, "learning_rate": 1.719516146154169e-05, "loss": 0.018, "step": 1090 }, { "epoch": 7.482993197278912, "grad_norm": 0.19546398520469666, "learning_rate": 1.6354368083146532e-05, "loss": 0.015, "step": 1100 }, { "epoch": 7.551020408163265, "grad_norm": 0.19744373857975006, "learning_rate": 1.553061351511772e-05, "loss": 0.0161, "step": 1110 }, { "epoch": 7.619047619047619, "grad_norm": 0.15858300030231476, "learning_rate": 1.4724314922985339e-05, "loss": 0.0162, "step": 1120 }, { "epoch": 7.687074829931973, "grad_norm": 0.23708970844745636, "learning_rate": 1.3935880632229614e-05, "loss": 0.0163, "step": 1130 }, { "epoch": 7.755102040816326, "grad_norm": 0.23019392788410187, "learning_rate": 1.3165709921496872e-05, "loss": 0.0165, "step": 1140 }, { "epoch": 7.8231292517006805, "grad_norm": 0.1643388718366623, "learning_rate": 1.2414192820396985e-05, "loss": 0.0151, "step": 1150 }, { "epoch": 7.891156462585034, "grad_norm": 0.16524362564086914, "learning_rate": 1.168170991198464e-05, "loss": 0.0138, "step": 1160 }, { "epoch": 7.959183673469388, "grad_norm": 0.17709355056285858, "learning_rate": 1.0968632140024681e-05, "loss": 0.0148, "step": 1170 }, { "epoch": 8.02721088435374, "grad_norm": 0.12948837876319885, "learning_rate": 1.027532062113879e-05, "loss": 0.0153, "step": 1180 }, { "epoch": 8.095238095238095, "grad_norm": 0.2600167989730835, "learning_rate": 9.602126461929001e-06, "loss": 0.016, "step": 1190 }, { "epoch": 8.16326530612245, "grad_norm": 0.144463911652565, "learning_rate": 8.949390581170342e-06, "loss": 0.0149, "step": 1200 }, { "epoch": 8.231292517006803, "grad_norm": 0.20120234787464142, "learning_rate": 8.317443537162922e-06, "loss": 0.0148, "step": 1210 }, { "epoch": 8.299319727891156, "grad_norm": 0.19846169650554657, "learning_rate": 7.706605360330593e-06, "loss": 0.0151, "step": 1220 }, { "epoch": 8.36734693877551, "grad_norm": 0.2220635712146759, "learning_rate": 7.117185391151371e-06, "loss": 0.0155, "step": 1230 }, { "epoch": 8.435374149659864, "grad_norm": 0.1322047859430313, "learning_rate": 6.54948212350125e-06, "loss": 0.0157, "step": 1240 }, { "epoch": 8.503401360544217, "grad_norm": 0.16075831651687622, "learning_rate": 6.003783053491024e-06, "loss": 0.0132, "step": 1250 }, { "epoch": 8.571428571428571, "grad_norm": 0.2842198610305786, "learning_rate": 5.480364533872651e-06, "loss": 0.0161, "step": 1260 }, { "epoch": 8.639455782312925, "grad_norm": 0.1375337690114975, "learning_rate": 4.979491634088712e-06, "loss": 0.0149, "step": 1270 }, { "epoch": 8.70748299319728, "grad_norm": 0.2565004527568817, "learning_rate": 4.5014180060360846e-06, "loss": 0.0149, "step": 1280 }, { "epoch": 8.775510204081632, "grad_norm": 0.33463218808174133, "learning_rate": 4.0463857556115925e-06, "loss": 0.0157, "step": 1290 }, { "epoch": 8.843537414965986, "grad_norm": 0.1563730388879776, "learning_rate": 3.614625320104831e-06, "loss": 0.0131, "step": 1300 
}, { "epoch": 8.91156462585034, "grad_norm": 0.1622951179742813, "learning_rate": 3.2063553515001842e-06, "loss": 0.0125, "step": 1310 }, { "epoch": 8.979591836734693, "grad_norm": 0.2716583013534546, "learning_rate": 2.821782605747142e-06, "loss": 0.016, "step": 1320 }, { "epoch": 9.047619047619047, "grad_norm": 0.30831456184387207, "learning_rate": 2.4611018380550298e-06, "loss": 0.017, "step": 1330 }, { "epoch": 9.115646258503402, "grad_norm": 0.15048925578594208, "learning_rate": 2.1244957042651393e-06, "loss": 0.0155, "step": 1340 }, { "epoch": 9.183673469387756, "grad_norm": 0.19770754873752594, "learning_rate": 1.8121346683502182e-06, "loss": 0.0127, "step": 1350 }, { "epoch": 9.251700680272108, "grad_norm": 0.1272617131471634, "learning_rate": 1.5241769160881103e-06, "loss": 0.0152, "step": 1360 }, { "epoch": 9.319727891156463, "grad_norm": 0.20454399287700653, "learning_rate": 1.2607682749534721e-06, "loss": 0.0124, "step": 1370 }, { "epoch": 9.387755102040817, "grad_norm": 0.1954611837863922, "learning_rate": 1.022042140267726e-06, "loss": 0.016, "step": 1380 }, { "epoch": 9.45578231292517, "grad_norm": 0.09543488174676895, "learning_rate": 8.081194076451748e-07, "loss": 0.015, "step": 1390 }, { "epoch": 9.523809523809524, "grad_norm": 0.28707054257392883, "learning_rate": 6.191084117689872e-07, "loss": 0.0162, "step": 1400 }, { "epoch": 9.591836734693878, "grad_norm": 0.11492811143398285, "learning_rate": 4.551048715284445e-07, "loss": 0.0149, "step": 1410 }, { "epoch": 9.65986394557823, "grad_norm": 0.15751147270202637, "learning_rate": 3.1619184154496605e-07, "loss": 0.017, "step": 1420 }, { "epoch": 9.727891156462585, "grad_norm": 0.1487809270620346, "learning_rate": 2.0243967011164267e-07, "loss": 0.0119, "step": 1430 }, { "epoch": 9.795918367346939, "grad_norm": 0.19853556156158447, "learning_rate": 1.139059635674733e-07, "loss": 0.0152, "step": 1440 }, { "epoch": 9.863945578231293, "grad_norm": 0.1522453874349594, "learning_rate": 5.063555712436219e-08, "loss": 0.0138, "step": 1450 }, { "epoch": 9.931972789115646, "grad_norm": 0.28007861971855164, "learning_rate": 1.2660492161709059e-08, "loss": 0.0162, "step": 1460 }, { "epoch": 10.0, "grad_norm": 0.4790020287036896, "learning_rate": 0.0, "loss": 0.0152, "step": 1470 }, { "epoch": 10.0, "step": 1470, "total_flos": 1.7829168111230112e+17, "train_loss": 0.04274284860410658, "train_runtime": 1631.2922, "train_samples_per_second": 95.893, "train_steps_per_second": 0.901 } ], "logging_steps": 10, "max_steps": 1470, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.7829168111230112e+17, "train_batch_size": 107, "trial_name": null, "trial_params": null }