{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.998849252013809, "eval_steps": 500, "global_step": 217, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004602991944764097, "grad_norm": 1.9528553485870361, "learning_rate": 7.142857142857143e-07, "loss": 0.5319, "step": 1 }, { "epoch": 0.009205983889528193, "grad_norm": 1.8612775802612305, "learning_rate": 1.4285714285714286e-06, "loss": 0.5107, "step": 2 }, { "epoch": 0.01380897583429229, "grad_norm": 1.9053367376327515, "learning_rate": 2.1428571428571427e-06, "loss": 0.5124, "step": 3 }, { "epoch": 0.018411967779056387, "grad_norm": 1.76083505153656, "learning_rate": 2.8571428571428573e-06, "loss": 0.536, "step": 4 }, { "epoch": 0.023014959723820484, "grad_norm": 1.5051020383834839, "learning_rate": 3.5714285714285718e-06, "loss": 0.492, "step": 5 }, { "epoch": 0.02761795166858458, "grad_norm": 1.4977186918258667, "learning_rate": 4.2857142857142855e-06, "loss": 0.5108, "step": 6 }, { "epoch": 0.03222094361334868, "grad_norm": 1.1002470254898071, "learning_rate": 5e-06, "loss": 0.4918, "step": 7 }, { "epoch": 0.03682393555811277, "grad_norm": 1.1534695625305176, "learning_rate": 4.999720254525684e-06, "loss": 0.4705, "step": 8 }, { "epoch": 0.04142692750287687, "grad_norm": 1.756179928779602, "learning_rate": 4.998881080708759e-06, "loss": 0.4622, "step": 9 }, { "epoch": 0.04602991944764097, "grad_norm": 1.7668858766555786, "learning_rate": 4.997482666353287e-06, "loss": 0.4538, "step": 10 }, { "epoch": 0.05063291139240506, "grad_norm": 1.736067295074463, "learning_rate": 4.995525324419338e-06, "loss": 0.4832, "step": 11 }, { "epoch": 0.05523590333716916, "grad_norm": 1.1255316734313965, "learning_rate": 4.993009492952951e-06, "loss": 0.443, "step": 12 }, { "epoch": 0.05983889528193326, "grad_norm": 0.9772641658782959, "learning_rate": 4.989935734988098e-06, "loss": 0.4184, "step": 13 }, { "epoch": 0.06444188722669736, "grad_norm": 0.7824466228485107, "learning_rate": 4.986304738420684e-06, "loss": 0.4166, "step": 14 }, { "epoch": 0.06904487917146145, "grad_norm": 0.7207390666007996, "learning_rate": 4.982117315854594e-06, "loss": 0.4344, "step": 15 }, { "epoch": 0.07364787111622555, "grad_norm": 0.7086092829704285, "learning_rate": 4.977374404419838e-06, "loss": 0.4281, "step": 16 }, { "epoch": 0.07825086306098965, "grad_norm": 0.791322648525238, "learning_rate": 4.9720770655628216e-06, "loss": 0.4428, "step": 17 }, { "epoch": 0.08285385500575373, "grad_norm": 0.674933671951294, "learning_rate": 4.966226484808804e-06, "loss": 0.4129, "step": 18 }, { "epoch": 0.08745684695051784, "grad_norm": 0.615633487701416, "learning_rate": 4.959823971496575e-06, "loss": 0.4362, "step": 19 }, { "epoch": 0.09205983889528194, "grad_norm": 0.656779944896698, "learning_rate": 4.9528709584854316e-06, "loss": 0.3944, "step": 20 }, { "epoch": 0.09666283084004602, "grad_norm": 0.6281305551528931, "learning_rate": 4.9453690018345144e-06, "loss": 0.4247, "step": 21 }, { "epoch": 0.10126582278481013, "grad_norm": 0.6116930842399597, "learning_rate": 4.937319780454559e-06, "loss": 0.452, "step": 22 }, { "epoch": 0.10586881472957423, "grad_norm": 0.5711112022399902, "learning_rate": 4.9287250957321685e-06, "loss": 0.4176, "step": 23 }, { "epoch": 0.11047180667433831, "grad_norm": 0.5996497273445129, "learning_rate": 4.919586871126667e-06, "loss": 0.4235, "step": 24 }, { "epoch": 0.11507479861910241, "grad_norm": 0.6450675129890442, "learning_rate": 4.909907151739634e-06, "loss": 0.4128, "step": 25 }, { "epoch": 0.11967779056386652, "grad_norm": 0.6167646050453186, "learning_rate": 4.899688103857223e-06, "loss": 0.3971, "step": 26 }, { "epoch": 0.12428078250863062, "grad_norm": 0.5306232571601868, "learning_rate": 4.8889320144653525e-06, "loss": 0.3675, "step": 27 }, { "epoch": 0.12888377445339472, "grad_norm": 0.5794817209243774, "learning_rate": 4.8776412907378845e-06, "loss": 0.4189, "step": 28 }, { "epoch": 0.1334867663981588, "grad_norm": 0.5581746697425842, "learning_rate": 4.865818459497911e-06, "loss": 0.4309, "step": 29 }, { "epoch": 0.1380897583429229, "grad_norm": 0.5573146939277649, "learning_rate": 4.853466166652259e-06, "loss": 0.4159, "step": 30 }, { "epoch": 0.142692750287687, "grad_norm": 0.5848842859268188, "learning_rate": 4.8405871765993435e-06, "loss": 0.4043, "step": 31 }, { "epoch": 0.1472957422324511, "grad_norm": 0.5522124767303467, "learning_rate": 4.827184371610511e-06, "loss": 0.4331, "step": 32 }, { "epoch": 0.1518987341772152, "grad_norm": 0.542629063129425, "learning_rate": 4.813260751184992e-06, "loss": 0.4027, "step": 33 }, { "epoch": 0.1565017261219793, "grad_norm": 0.6027759313583374, "learning_rate": 4.7988194313786275e-06, "loss": 0.4286, "step": 34 }, { "epoch": 0.1611047180667434, "grad_norm": 0.5948846936225891, "learning_rate": 4.783863644106502e-06, "loss": 0.4334, "step": 35 }, { "epoch": 0.16570771001150747, "grad_norm": 0.5355780720710754, "learning_rate": 4.7683967364196624e-06, "loss": 0.4004, "step": 36 }, { "epoch": 0.17031070195627157, "grad_norm": 0.5100039839744568, "learning_rate": 4.752422169756048e-06, "loss": 0.4058, "step": 37 }, { "epoch": 0.17491369390103567, "grad_norm": 0.5452431440353394, "learning_rate": 4.735943519165843e-06, "loss": 0.4199, "step": 38 }, { "epoch": 0.17951668584579977, "grad_norm": 0.5305486917495728, "learning_rate": 4.718964472511386e-06, "loss": 0.3845, "step": 39 }, { "epoch": 0.18411967779056387, "grad_norm": 0.5316929817199707, "learning_rate": 4.701488829641845e-06, "loss": 0.385, "step": 40 }, { "epoch": 0.18872266973532797, "grad_norm": 0.5721151232719421, "learning_rate": 4.683520501542825e-06, "loss": 0.4199, "step": 41 }, { "epoch": 0.19332566168009205, "grad_norm": 0.5623852610588074, "learning_rate": 4.665063509461098e-06, "loss": 0.393, "step": 42 }, { "epoch": 0.19792865362485615, "grad_norm": 0.5426440238952637, "learning_rate": 4.646121984004666e-06, "loss": 0.4143, "step": 43 }, { "epoch": 0.20253164556962025, "grad_norm": 0.5393860936164856, "learning_rate": 4.626700164218349e-06, "loss": 0.419, "step": 44 }, { "epoch": 0.20713463751438435, "grad_norm": 0.5237355828285217, "learning_rate": 4.606802396635098e-06, "loss": 0.3802, "step": 45 }, { "epoch": 0.21173762945914845, "grad_norm": 0.5436042547225952, "learning_rate": 4.586433134303257e-06, "loss": 0.4001, "step": 46 }, { "epoch": 0.21634062140391255, "grad_norm": 0.5216776132583618, "learning_rate": 4.565596935789987e-06, "loss": 0.3838, "step": 47 }, { "epoch": 0.22094361334867663, "grad_norm": 0.5988544821739197, "learning_rate": 4.544298464161079e-06, "loss": 0.4193, "step": 48 }, { "epoch": 0.22554660529344073, "grad_norm": 0.5307754278182983, "learning_rate": 4.522542485937369e-06, "loss": 0.4008, "step": 49 }, { "epoch": 0.23014959723820483, "grad_norm": 0.532762885093689, "learning_rate": 4.500333870028017e-06, "loss": 0.4135, "step": 50 }, { "epoch": 0.23475258918296893, "grad_norm": 0.5370959639549255, "learning_rate": 4.477677586640854e-06, "loss": 0.4008, "step": 51 }, { "epoch": 0.23935558112773303, "grad_norm": 0.5499489307403564, "learning_rate": 4.454578706170075e-06, "loss": 0.423, "step": 52 }, { "epoch": 0.24395857307249713, "grad_norm": 0.5250299572944641, "learning_rate": 4.431042398061499e-06, "loss": 0.3918, "step": 53 }, { "epoch": 0.24856156501726123, "grad_norm": 0.5530039668083191, "learning_rate": 4.4070739296556665e-06, "loss": 0.4121, "step": 54 }, { "epoch": 0.25316455696202533, "grad_norm": 0.5588434934616089, "learning_rate": 4.382678665009028e-06, "loss": 0.4127, "step": 55 }, { "epoch": 0.25776754890678943, "grad_norm": 0.5205167531967163, "learning_rate": 4.357862063693486e-06, "loss": 0.4041, "step": 56 }, { "epoch": 0.26237054085155354, "grad_norm": 0.5239887833595276, "learning_rate": 4.332629679574566e-06, "loss": 0.3856, "step": 57 }, { "epoch": 0.2669735327963176, "grad_norm": 0.527583122253418, "learning_rate": 4.3069871595684795e-06, "loss": 0.3868, "step": 58 }, { "epoch": 0.2715765247410817, "grad_norm": 0.5135255455970764, "learning_rate": 4.280940242378363e-06, "loss": 0.3801, "step": 59 }, { "epoch": 0.2761795166858458, "grad_norm": 0.5258612036705017, "learning_rate": 4.2544947572099795e-06, "loss": 0.3916, "step": 60 }, { "epoch": 0.2807825086306099, "grad_norm": 0.5070046186447144, "learning_rate": 4.227656622467162e-06, "loss": 0.3729, "step": 61 }, { "epoch": 0.285385500575374, "grad_norm": 0.5091351866722107, "learning_rate": 4.200431844427299e-06, "loss": 0.3743, "step": 62 }, { "epoch": 0.2899884925201381, "grad_norm": 0.534995436668396, "learning_rate": 4.172826515897146e-06, "loss": 0.3639, "step": 63 }, { "epoch": 0.2945914844649022, "grad_norm": 0.5401779413223267, "learning_rate": 4.144846814849282e-06, "loss": 0.3813, "step": 64 }, { "epoch": 0.2991944764096663, "grad_norm": 0.5409132242202759, "learning_rate": 4.116499003039499e-06, "loss": 0.3792, "step": 65 }, { "epoch": 0.3037974683544304, "grad_norm": 0.595007061958313, "learning_rate": 4.087789424605447e-06, "loss": 0.4266, "step": 66 }, { "epoch": 0.3084004602991945, "grad_norm": 0.5434048771858215, "learning_rate": 4.058724504646834e-06, "loss": 0.3893, "step": 67 }, { "epoch": 0.3130034522439586, "grad_norm": 0.5291695594787598, "learning_rate": 4.029310747787516e-06, "loss": 0.3922, "step": 68 }, { "epoch": 0.3176064441887227, "grad_norm": 0.4903387129306793, "learning_rate": 3.999554736719785e-06, "loss": 0.3626, "step": 69 }, { "epoch": 0.3222094361334868, "grad_norm": 0.5239253044128418, "learning_rate": 3.969463130731183e-06, "loss": 0.3702, "step": 70 }, { "epoch": 0.32681242807825084, "grad_norm": 0.534202516078949, "learning_rate": 3.939042664214185e-06, "loss": 0.3826, "step": 71 }, { "epoch": 0.33141542002301494, "grad_norm": 0.5841035842895508, "learning_rate": 3.908300145159055e-06, "loss": 0.4251, "step": 72 }, { "epoch": 0.33601841196777904, "grad_norm": 0.5424667000770569, "learning_rate": 3.8772424536302565e-06, "loss": 0.391, "step": 73 }, { "epoch": 0.34062140391254314, "grad_norm": 0.5215575098991394, "learning_rate": 3.845876540226707e-06, "loss": 0.4103, "step": 74 }, { "epoch": 0.34522439585730724, "grad_norm": 0.5513401031494141, "learning_rate": 3.8142094245262617e-06, "loss": 0.3976, "step": 75 }, { "epoch": 0.34982738780207134, "grad_norm": 0.5887762308120728, "learning_rate": 3.782248193514766e-06, "loss": 0.407, "step": 76 }, { "epoch": 0.35443037974683544, "grad_norm": 0.533482015132904, "learning_rate": 3.7500000000000005e-06, "loss": 0.3929, "step": 77 }, { "epoch": 0.35903337169159955, "grad_norm": 0.5082557201385498, "learning_rate": 3.7174720610109184e-06, "loss": 0.3859, "step": 78 }, { "epoch": 0.36363636363636365, "grad_norm": 0.538962721824646, "learning_rate": 3.684671656182497e-06, "loss": 0.3987, "step": 79 }, { "epoch": 0.36823935558112775, "grad_norm": 0.5121095180511475, "learning_rate": 3.6516061261265813e-06, "loss": 0.3847, "step": 80 }, { "epoch": 0.37284234752589185, "grad_norm": 0.5776516199111938, "learning_rate": 3.6182828707890816e-06, "loss": 0.3771, "step": 81 }, { "epoch": 0.37744533947065595, "grad_norm": 0.5664410591125488, "learning_rate": 3.5847093477938955e-06, "loss": 0.382, "step": 82 }, { "epoch": 0.38204833141542005, "grad_norm": 0.5202717781066895, "learning_rate": 3.5508930707739143e-06, "loss": 0.3608, "step": 83 }, { "epoch": 0.3866513233601841, "grad_norm": 0.5437837243080139, "learning_rate": 3.516841607689501e-06, "loss": 0.4171, "step": 84 }, { "epoch": 0.3912543153049482, "grad_norm": 0.5397577881813049, "learning_rate": 3.4825625791348093e-06, "loss": 0.3906, "step": 85 }, { "epoch": 0.3958573072497123, "grad_norm": 0.5708394646644592, "learning_rate": 3.4480636566323215e-06, "loss": 0.3802, "step": 86 }, { "epoch": 0.4004602991944764, "grad_norm": 0.5787578821182251, "learning_rate": 3.4133525609159883e-06, "loss": 0.3996, "step": 87 }, { "epoch": 0.4050632911392405, "grad_norm": 0.596613347530365, "learning_rate": 3.3784370602033572e-06, "loss": 0.3938, "step": 88 }, { "epoch": 0.4096662830840046, "grad_norm": 0.5240587592124939, "learning_rate": 3.3433249684570757e-06, "loss": 0.3798, "step": 89 }, { "epoch": 0.4142692750287687, "grad_norm": 0.5174093842506409, "learning_rate": 3.3080241436361505e-06, "loss": 0.3804, "step": 90 }, { "epoch": 0.4188722669735328, "grad_norm": 0.5353110432624817, "learning_rate": 3.272542485937369e-06, "loss": 0.3529, "step": 91 }, { "epoch": 0.4234752589182969, "grad_norm": 0.5501390695571899, "learning_rate": 3.236887936027261e-06, "loss": 0.4038, "step": 92 }, { "epoch": 0.428078250863061, "grad_norm": 0.5099984407424927, "learning_rate": 3.201068473265007e-06, "loss": 0.3755, "step": 93 }, { "epoch": 0.4326812428078251, "grad_norm": 0.5678063035011292, "learning_rate": 3.165092113916688e-06, "loss": 0.386, "step": 94 }, { "epoch": 0.4372842347525892, "grad_norm": 0.5758556723594666, "learning_rate": 3.128966909361272e-06, "loss": 0.413, "step": 95 }, { "epoch": 0.44188722669735325, "grad_norm": 0.5315706729888916, "learning_rate": 3.092700944288744e-06, "loss": 0.3824, "step": 96 }, { "epoch": 0.44649021864211735, "grad_norm": 0.5057752728462219, "learning_rate": 3.056302334890786e-06, "loss": 0.3582, "step": 97 }, { "epoch": 0.45109321058688145, "grad_norm": 0.5494191646575928, "learning_rate": 3.019779227044398e-06, "loss": 0.389, "step": 98 }, { "epoch": 0.45569620253164556, "grad_norm": 0.5015504360198975, "learning_rate": 2.9831397944888833e-06, "loss": 0.4077, "step": 99 }, { "epoch": 0.46029919447640966, "grad_norm": 0.5424397587776184, "learning_rate": 2.946392236996592e-06, "loss": 0.4167, "step": 100 }, { "epoch": 0.46490218642117376, "grad_norm": 0.526132345199585, "learning_rate": 2.9095447785378446e-06, "loss": 0.3941, "step": 101 }, { "epoch": 0.46950517836593786, "grad_norm": 0.5486814379692078, "learning_rate": 2.872605665440436e-06, "loss": 0.3957, "step": 102 }, { "epoch": 0.47410817031070196, "grad_norm": 0.5627570152282715, "learning_rate": 2.835583164544139e-06, "loss": 0.3898, "step": 103 }, { "epoch": 0.47871116225546606, "grad_norm": 0.5162888765335083, "learning_rate": 2.7984855613506107e-06, "loss": 0.3785, "step": 104 }, { "epoch": 0.48331415420023016, "grad_norm": 0.5262818932533264, "learning_rate": 2.761321158169134e-06, "loss": 0.3767, "step": 105 }, { "epoch": 0.48791714614499426, "grad_norm": 0.5049371123313904, "learning_rate": 2.724098272258584e-06, "loss": 0.3817, "step": 106 }, { "epoch": 0.49252013808975836, "grad_norm": 0.5545706748962402, "learning_rate": 2.686825233966061e-06, "loss": 0.4088, "step": 107 }, { "epoch": 0.49712313003452246, "grad_norm": 0.5535576939582825, "learning_rate": 2.649510384862586e-06, "loss": 0.4014, "step": 108 }, { "epoch": 0.5017261219792866, "grad_norm": 0.500004231929779, "learning_rate": 2.6121620758762877e-06, "loss": 0.3753, "step": 109 }, { "epoch": 0.5063291139240507, "grad_norm": 0.5142732858657837, "learning_rate": 2.5747886654234967e-06, "loss": 0.3788, "step": 110 }, { "epoch": 0.5109321058688148, "grad_norm": 0.5009371042251587, "learning_rate": 2.5373985175381595e-06, "loss": 0.3713, "step": 111 }, { "epoch": 0.5155350978135789, "grad_norm": 0.5388939380645752, "learning_rate": 2.5e-06, "loss": 0.3991, "step": 112 }, { "epoch": 0.520138089758343, "grad_norm": 0.5310930609703064, "learning_rate": 2.4626014824618418e-06, "loss": 0.3934, "step": 113 }, { "epoch": 0.5247410817031071, "grad_norm": 0.522955596446991, "learning_rate": 2.4252113345765045e-06, "loss": 0.384, "step": 114 }, { "epoch": 0.5293440736478712, "grad_norm": 0.5394883751869202, "learning_rate": 2.3878379241237136e-06, "loss": 0.4068, "step": 115 }, { "epoch": 0.5339470655926352, "grad_norm": 0.5207949876785278, "learning_rate": 2.3504896151374145e-06, "loss": 0.3903, "step": 116 }, { "epoch": 0.5385500575373993, "grad_norm": 0.544926643371582, "learning_rate": 2.3131747660339396e-06, "loss": 0.3914, "step": 117 }, { "epoch": 0.5431530494821634, "grad_norm": 0.5148684978485107, "learning_rate": 2.2759017277414165e-06, "loss": 0.3669, "step": 118 }, { "epoch": 0.5477560414269275, "grad_norm": 0.54156893491745, "learning_rate": 2.238678841830867e-06, "loss": 0.404, "step": 119 }, { "epoch": 0.5523590333716916, "grad_norm": 0.5111318230628967, "learning_rate": 2.2015144386493898e-06, "loss": 0.3735, "step": 120 }, { "epoch": 0.5569620253164557, "grad_norm": 0.5289490818977356, "learning_rate": 2.1644168354558623e-06, "loss": 0.3711, "step": 121 }, { "epoch": 0.5615650172612198, "grad_norm": 0.5365771055221558, "learning_rate": 2.1273943345595637e-06, "loss": 0.4074, "step": 122 }, { "epoch": 0.5661680092059839, "grad_norm": 0.5566303730010986, "learning_rate": 2.090455221462156e-06, "loss": 0.4354, "step": 123 }, { "epoch": 0.570771001150748, "grad_norm": 0.5099798440933228, "learning_rate": 2.053607763003409e-06, "loss": 0.3841, "step": 124 }, { "epoch": 0.5753739930955121, "grad_norm": 0.5596857070922852, "learning_rate": 2.0168602055111175e-06, "loss": 0.3855, "step": 125 }, { "epoch": 0.5799769850402762, "grad_norm": 0.5068130493164062, "learning_rate": 1.9802207729556023e-06, "loss": 0.3629, "step": 126 }, { "epoch": 0.5845799769850403, "grad_norm": 0.5307357907295227, "learning_rate": 1.9436976651092143e-06, "loss": 0.3741, "step": 127 }, { "epoch": 0.5891829689298044, "grad_norm": 0.5442883372306824, "learning_rate": 1.9072990557112567e-06, "loss": 0.3913, "step": 128 }, { "epoch": 0.5937859608745685, "grad_norm": 0.5069390535354614, "learning_rate": 1.8710330906387288e-06, "loss": 0.3686, "step": 129 }, { "epoch": 0.5983889528193326, "grad_norm": 0.5523373484611511, "learning_rate": 1.8349078860833125e-06, "loss": 0.3757, "step": 130 }, { "epoch": 0.6029919447640967, "grad_norm": 0.5090730786323547, "learning_rate": 1.7989315267349936e-06, "loss": 0.3639, "step": 131 }, { "epoch": 0.6075949367088608, "grad_norm": 0.5404998660087585, "learning_rate": 1.7631120639727396e-06, "loss": 0.3849, "step": 132 }, { "epoch": 0.6121979286536249, "grad_norm": 0.5424637794494629, "learning_rate": 1.7274575140626318e-06, "loss": 0.3775, "step": 133 }, { "epoch": 0.616800920598389, "grad_norm": 0.5275416374206543, "learning_rate": 1.6919758563638506e-06, "loss": 0.3715, "step": 134 }, { "epoch": 0.6214039125431531, "grad_norm": 0.5071358680725098, "learning_rate": 1.6566750315429254e-06, "loss": 0.3685, "step": 135 }, { "epoch": 0.6260069044879172, "grad_norm": 0.4969387352466583, "learning_rate": 1.6215629397966432e-06, "loss": 0.3526, "step": 136 }, { "epoch": 0.6306098964326813, "grad_norm": 0.5176503658294678, "learning_rate": 1.5866474390840126e-06, "loss": 0.3862, "step": 137 }, { "epoch": 0.6352128883774454, "grad_norm": 0.5372660160064697, "learning_rate": 1.5519363433676794e-06, "loss": 0.3885, "step": 138 }, { "epoch": 0.6398158803222095, "grad_norm": 0.5269474387168884, "learning_rate": 1.5174374208651913e-06, "loss": 0.3759, "step": 139 }, { "epoch": 0.6444188722669736, "grad_norm": 0.5645244121551514, "learning_rate": 1.4831583923105e-06, "loss": 0.4084, "step": 140 }, { "epoch": 0.6490218642117376, "grad_norm": 0.5266773104667664, "learning_rate": 1.4491069292260867e-06, "loss": 0.3878, "step": 141 }, { "epoch": 0.6536248561565017, "grad_norm": 0.5369369983673096, "learning_rate": 1.415290652206105e-06, "loss": 0.4166, "step": 142 }, { "epoch": 0.6582278481012658, "grad_norm": 0.5402231216430664, "learning_rate": 1.3817171292109182e-06, "loss": 0.3803, "step": 143 }, { "epoch": 0.6628308400460299, "grad_norm": 0.5421996116638184, "learning_rate": 1.3483938738734197e-06, "loss": 0.3696, "step": 144 }, { "epoch": 0.667433831990794, "grad_norm": 0.5073113441467285, "learning_rate": 1.3153283438175036e-06, "loss": 0.3721, "step": 145 }, { "epoch": 0.6720368239355581, "grad_norm": 0.5329331755638123, "learning_rate": 1.2825279389890818e-06, "loss": 0.3657, "step": 146 }, { "epoch": 0.6766398158803222, "grad_norm": 0.5001095533370972, "learning_rate": 1.2500000000000007e-06, "loss": 0.3682, "step": 147 }, { "epoch": 0.6812428078250863, "grad_norm": 0.5310607552528381, "learning_rate": 1.2177518064852348e-06, "loss": 0.392, "step": 148 }, { "epoch": 0.6858457997698504, "grad_norm": 0.5475272536277771, "learning_rate": 1.185790575473738e-06, "loss": 0.4002, "step": 149 }, { "epoch": 0.6904487917146145, "grad_norm": 0.5163525342941284, "learning_rate": 1.1541234597732947e-06, "loss": 0.3925, "step": 150 }, { "epoch": 0.6950517836593786, "grad_norm": 0.4944920539855957, "learning_rate": 1.122757546369744e-06, "loss": 0.3788, "step": 151 }, { "epoch": 0.6996547756041427, "grad_norm": 0.5034787058830261, "learning_rate": 1.0916998548409449e-06, "loss": 0.3696, "step": 152 }, { "epoch": 0.7042577675489068, "grad_norm": 0.5238296985626221, "learning_rate": 1.0609573357858166e-06, "loss": 0.3842, "step": 153 }, { "epoch": 0.7088607594936709, "grad_norm": 0.5249406099319458, "learning_rate": 1.0305368692688175e-06, "loss": 0.3879, "step": 154 }, { "epoch": 0.713463751438435, "grad_norm": 0.5139413475990295, "learning_rate": 1.0004452632802158e-06, "loss": 0.3795, "step": 155 }, { "epoch": 0.7180667433831991, "grad_norm": 0.49963629245758057, "learning_rate": 9.70689252212484e-07, "loss": 0.3702, "step": 156 }, { "epoch": 0.7226697353279632, "grad_norm": 0.5272103548049927, "learning_rate": 9.412754953531664e-07, "loss": 0.3726, "step": 157 }, { "epoch": 0.7272727272727273, "grad_norm": 0.4988826513290405, "learning_rate": 9.122105753945532e-07, "loss": 0.3764, "step": 158 }, { "epoch": 0.7318757192174914, "grad_norm": 0.542015552520752, "learning_rate": 8.835009969605013e-07, "loss": 0.4064, "step": 159 }, { "epoch": 0.7364787111622555, "grad_norm": 0.5293359756469727, "learning_rate": 8.551531851507186e-07, "loss": 0.3943, "step": 160 }, { "epoch": 0.7410817031070196, "grad_norm": 0.4968125820159912, "learning_rate": 8.271734841028553e-07, "loss": 0.3727, "step": 161 }, { "epoch": 0.7456846950517837, "grad_norm": 0.5024369955062866, "learning_rate": 7.995681555727011e-07, "loss": 0.3721, "step": 162 }, { "epoch": 0.7502876869965478, "grad_norm": 0.5272569060325623, "learning_rate": 7.723433775328385e-07, "loss": 0.4127, "step": 163 }, { "epoch": 0.7548906789413119, "grad_norm": 0.5185663104057312, "learning_rate": 7.455052427900214e-07, "loss": 0.3832, "step": 164 }, { "epoch": 0.759493670886076, "grad_norm": 0.5261924266815186, "learning_rate": 7.190597576216385e-07, "loss": 0.3804, "step": 165 }, { "epoch": 0.7640966628308401, "grad_norm": 0.5177868008613586, "learning_rate": 6.930128404315214e-07, "loss": 0.3873, "step": 166 }, { "epoch": 0.7686996547756041, "grad_norm": 0.5173397660255432, "learning_rate": 6.673703204254348e-07, "loss": 0.367, "step": 167 }, { "epoch": 0.7733026467203682, "grad_norm": 0.5059581995010376, "learning_rate": 6.421379363065142e-07, "loss": 0.3738, "step": 168 }, { "epoch": 0.7779056386651323, "grad_norm": 0.6127777695655823, "learning_rate": 6.17321334990973e-07, "loss": 0.3857, "step": 169 }, { "epoch": 0.7825086306098964, "grad_norm": 0.5311411023139954, "learning_rate": 5.929260703443337e-07, "loss": 0.3903, "step": 170 }, { "epoch": 0.7871116225546605, "grad_norm": 0.5173105597496033, "learning_rate": 5.689576019385015e-07, "loss": 0.3747, "step": 171 }, { "epoch": 0.7917146144994246, "grad_norm": 0.5185012817382812, "learning_rate": 5.454212938299256e-07, "loss": 0.3778, "step": 172 }, { "epoch": 0.7963176064441887, "grad_norm": 0.5253241658210754, "learning_rate": 5.223224133591475e-07, "loss": 0.3962, "step": 173 }, { "epoch": 0.8009205983889528, "grad_norm": 0.5264940857887268, "learning_rate": 4.996661299719846e-07, "loss": 0.3891, "step": 174 }, { "epoch": 0.8055235903337169, "grad_norm": 0.5051758289337158, "learning_rate": 4.774575140626317e-07, "loss": 0.3573, "step": 175 }, { "epoch": 0.810126582278481, "grad_norm": 0.5200861096382141, "learning_rate": 4.5570153583892165e-07, "loss": 0.3924, "step": 176 }, { "epoch": 0.8147295742232451, "grad_norm": 0.5002381205558777, "learning_rate": 4.344030642100133e-07, "loss": 0.3797, "step": 177 }, { "epoch": 0.8193325661680092, "grad_norm": 0.5222682952880859, "learning_rate": 4.1356686569674344e-07, "loss": 0.3688, "step": 178 }, { "epoch": 0.8239355581127733, "grad_norm": 0.5002561807632446, "learning_rate": 3.931976033649021e-07, "loss": 0.3858, "step": 179 }, { "epoch": 0.8285385500575374, "grad_norm": 0.5189615488052368, "learning_rate": 3.732998357816514e-07, "loss": 0.3782, "step": 180 }, { "epoch": 0.8331415420023015, "grad_norm": 0.5137009620666504, "learning_rate": 3.538780159953348e-07, "loss": 0.3729, "step": 181 }, { "epoch": 0.8377445339470656, "grad_norm": 0.533768892288208, "learning_rate": 3.3493649053890325e-07, "loss": 0.3887, "step": 182 }, { "epoch": 0.8423475258918297, "grad_norm": 0.5303562879562378, "learning_rate": 3.164794984571759e-07, "loss": 0.3734, "step": 183 }, { "epoch": 0.8469505178365938, "grad_norm": 0.5147203803062439, "learning_rate": 2.98511170358155e-07, "loss": 0.3757, "step": 184 }, { "epoch": 0.8515535097813579, "grad_norm": 0.487350732088089, "learning_rate": 2.810355274886148e-07, "loss": 0.3692, "step": 185 }, { "epoch": 0.856156501726122, "grad_norm": 0.4838736057281494, "learning_rate": 2.6405648083415835e-07, "loss": 0.3469, "step": 186 }, { "epoch": 0.8607594936708861, "grad_norm": 0.5008881092071533, "learning_rate": 2.4757783024395244e-07, "loss": 0.3714, "step": 187 }, { "epoch": 0.8653624856156502, "grad_norm": 0.4887051582336426, "learning_rate": 2.316032635803378e-07, "loss": 0.3681, "step": 188 }, { "epoch": 0.8699654775604143, "grad_norm": 0.521045446395874, "learning_rate": 2.1613635589349756e-07, "loss": 0.3828, "step": 189 }, { "epoch": 0.8745684695051784, "grad_norm": 0.5309224724769592, "learning_rate": 2.0118056862137358e-07, "loss": 0.3827, "step": 190 }, { "epoch": 0.8791714614499425, "grad_norm": 0.5095206499099731, "learning_rate": 1.8673924881500826e-07, "loss": 0.3707, "step": 191 }, { "epoch": 0.8837744533947065, "grad_norm": 0.49568819999694824, "learning_rate": 1.7281562838948968e-07, "loss": 0.3698, "step": 192 }, { "epoch": 0.8883774453394706, "grad_norm": 0.48604169487953186, "learning_rate": 1.59412823400657e-07, "loss": 0.3597, "step": 193 }, { "epoch": 0.8929804372842347, "grad_norm": 0.4996773898601532, "learning_rate": 1.465338333477423e-07, "loss": 0.3825, "step": 194 }, { "epoch": 0.8975834292289988, "grad_norm": 0.4933512508869171, "learning_rate": 1.3418154050208937e-07, "loss": 0.3477, "step": 195 }, { "epoch": 0.9021864211737629, "grad_norm": 0.5096411108970642, "learning_rate": 1.223587092621162e-07, "loss": 0.3609, "step": 196 }, { "epoch": 0.906789413118527, "grad_norm": 0.5132354497909546, "learning_rate": 1.1106798553464804e-07, "loss": 0.3804, "step": 197 }, { "epoch": 0.9113924050632911, "grad_norm": 0.5068391561508179, "learning_rate": 1.0031189614277765e-07, "loss": 0.372, "step": 198 }, { "epoch": 0.9159953970080552, "grad_norm": 0.4996238350868225, "learning_rate": 9.00928482603669e-08, "loss": 0.3798, "step": 199 }, { "epoch": 0.9205983889528193, "grad_norm": 0.5047898888587952, "learning_rate": 8.041312887333396e-08, "loss": 0.372, "step": 200 }, { "epoch": 0.9252013808975834, "grad_norm": 0.500909686088562, "learning_rate": 7.127490426783124e-08, "loss": 0.3909, "step": 201 }, { "epoch": 0.9298043728423475, "grad_norm": 0.5062527060508728, "learning_rate": 6.268021954544095e-08, "loss": 0.3921, "step": 202 }, { "epoch": 0.9344073647871116, "grad_norm": 0.5464354753494263, "learning_rate": 5.463099816548578e-08, "loss": 0.4175, "step": 203 }, { "epoch": 0.9390103567318757, "grad_norm": 0.4842507839202881, "learning_rate": 4.712904151456865e-08, "loss": 0.3589, "step": 204 }, { "epoch": 0.9436133486766398, "grad_norm": 0.5234150290489197, "learning_rate": 4.017602850342584e-08, "loss": 0.3839, "step": 205 }, { "epoch": 0.9482163406214039, "grad_norm": 0.5067636370658875, "learning_rate": 3.377351519119665e-08, "loss": 0.3901, "step": 206 }, { "epoch": 0.952819332566168, "grad_norm": 0.48845621943473816, "learning_rate": 2.7922934437178695e-08, "loss": 0.3694, "step": 207 }, { "epoch": 0.9574223245109321, "grad_norm": 0.5254659652709961, "learning_rate": 2.262559558016325e-08, "loss": 0.3935, "step": 208 }, { "epoch": 0.9620253164556962, "grad_norm": 0.5244017839431763, "learning_rate": 1.7882684145406616e-08, "loss": 0.3797, "step": 209 }, { "epoch": 0.9666283084004603, "grad_norm": 0.5155873894691467, "learning_rate": 1.3695261579316776e-08, "loss": 0.371, "step": 210 }, { "epoch": 0.9712313003452244, "grad_norm": 0.4954945147037506, "learning_rate": 1.006426501190233e-08, "loss": 0.3671, "step": 211 }, { "epoch": 0.9758342922899885, "grad_norm": 0.5454512238502502, "learning_rate": 6.990507047049677e-09, "loss": 0.3847, "step": 212 }, { "epoch": 0.9804372842347526, "grad_norm": 0.5421791076660156, "learning_rate": 4.474675580662113e-09, "loss": 0.3857, "step": 213 }, { "epoch": 0.9850402761795167, "grad_norm": 0.5033833980560303, "learning_rate": 2.5173336467135266e-09, "loss": 0.3734, "step": 214 }, { "epoch": 0.9896432681242808, "grad_norm": 0.5605362057685852, "learning_rate": 1.1189192912416935e-09, "loss": 0.3936, "step": 215 }, { "epoch": 0.9942462600690449, "grad_norm": 0.5034700632095337, "learning_rate": 2.797454743164174e-10, "loss": 0.3813, "step": 216 }, { "epoch": 0.998849252013809, "grad_norm": 0.5011968016624451, "learning_rate": 0.0, "loss": 0.4095, "step": 217 }, { "epoch": 0.998849252013809, "step": 217, "total_flos": 2.9080852357930025e+18, "train_loss": 0.39522024055230454, "train_runtime": 4203.0638, "train_samples_per_second": 13.229, "train_steps_per_second": 0.052 } ], "logging_steps": 1.0, "max_steps": 217, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.9080852357930025e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }