{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7036059806508356, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003518029903254178, "grad_norm": 1.4669007062911987, "learning_rate": 0.0001, "loss": 2.9718, "step": 1 }, { "epoch": 0.007036059806508356, "grad_norm": 1.5240416526794434, "learning_rate": 9.949748743718594e-05, "loss": 3.0249, "step": 2 }, { "epoch": 0.010554089709762533, "grad_norm": 1.3310328722000122, "learning_rate": 9.899497487437186e-05, "loss": 2.7545, "step": 3 }, { "epoch": 0.014072119613016711, "grad_norm": 1.4892698526382446, "learning_rate": 9.84924623115578e-05, "loss": 2.6703, "step": 4 }, { "epoch": 0.01759014951627089, "grad_norm": 1.4727792739868164, "learning_rate": 9.798994974874372e-05, "loss": 2.4731, "step": 5 }, { "epoch": 0.021108179419525065, "grad_norm": 1.4451979398727417, "learning_rate": 9.748743718592965e-05, "loss": 2.2243, "step": 6 }, { "epoch": 0.024626209322779244, "grad_norm": 1.3103245496749878, "learning_rate": 9.698492462311559e-05, "loss": 2.0194, "step": 7 }, { "epoch": 0.028144239226033423, "grad_norm": 1.4852089881896973, "learning_rate": 9.64824120603015e-05, "loss": 1.9349, "step": 8 }, { "epoch": 0.0316622691292876, "grad_norm": 1.5170249938964844, "learning_rate": 9.597989949748745e-05, "loss": 1.7582, "step": 9 }, { "epoch": 0.03518029903254178, "grad_norm": 1.3428442478179932, "learning_rate": 9.547738693467337e-05, "loss": 1.6313, "step": 10 }, { "epoch": 0.03869832893579595, "grad_norm": 1.0400348901748657, "learning_rate": 9.49748743718593e-05, "loss": 1.4358, "step": 11 }, { "epoch": 0.04221635883905013, "grad_norm": 0.9891974329948425, "learning_rate": 9.447236180904523e-05, "loss": 1.3738, "step": 12 }, { "epoch": 0.04573438874230431, "grad_norm": 0.6980912685394287, "learning_rate": 9.396984924623115e-05, "loss": 1.425, "step": 13 }, { "epoch": 0.04925241864555849, "grad_norm": 0.6836680769920349, "learning_rate": 9.34673366834171e-05, "loss": 1.4575, "step": 14 }, { "epoch": 0.052770448548812667, "grad_norm": 0.9314870238304138, "learning_rate": 9.296482412060302e-05, "loss": 1.3206, "step": 15 }, { "epoch": 0.056288478452066845, "grad_norm": 0.6797922253608704, "learning_rate": 9.246231155778895e-05, "loss": 1.3724, "step": 16 }, { "epoch": 0.05980650835532102, "grad_norm": 0.6958814263343811, "learning_rate": 9.195979899497488e-05, "loss": 1.3661, "step": 17 }, { "epoch": 0.0633245382585752, "grad_norm": 0.7188398241996765, "learning_rate": 9.14572864321608e-05, "loss": 1.3242, "step": 18 }, { "epoch": 0.06684256816182937, "grad_norm": 0.8997742533683777, "learning_rate": 9.095477386934675e-05, "loss": 1.4049, "step": 19 }, { "epoch": 0.07036059806508356, "grad_norm": 0.8283623456954956, "learning_rate": 9.045226130653267e-05, "loss": 1.3121, "step": 20 }, { "epoch": 0.07387862796833773, "grad_norm": 0.8064684867858887, "learning_rate": 8.99497487437186e-05, "loss": 1.3451, "step": 21 }, { "epoch": 0.0773966578715919, "grad_norm": 0.8180544972419739, "learning_rate": 8.944723618090453e-05, "loss": 1.2111, "step": 22 }, { "epoch": 0.08091468777484609, "grad_norm": 0.8000004887580872, "learning_rate": 8.894472361809045e-05, "loss": 1.2933, "step": 23 }, { "epoch": 0.08443271767810026, "grad_norm": 0.8804137706756592, "learning_rate": 8.84422110552764e-05, "loss": 1.3173, "step": 24 }, { "epoch": 0.08795074758135445, "grad_norm": 0.8556327819824219, "learning_rate": 8.793969849246232e-05, 
"loss": 1.321, "step": 25 }, { "epoch": 0.09146877748460862, "grad_norm": 0.827410876750946, "learning_rate": 8.743718592964825e-05, "loss": 1.2195, "step": 26 }, { "epoch": 0.09498680738786279, "grad_norm": 0.9081262946128845, "learning_rate": 8.693467336683418e-05, "loss": 1.2451, "step": 27 }, { "epoch": 0.09850483729111698, "grad_norm": 0.9331269860267639, "learning_rate": 8.64321608040201e-05, "loss": 1.2204, "step": 28 }, { "epoch": 0.10202286719437115, "grad_norm": 1.0290558338165283, "learning_rate": 8.592964824120603e-05, "loss": 1.2379, "step": 29 }, { "epoch": 0.10554089709762533, "grad_norm": 1.1296031475067139, "learning_rate": 8.542713567839196e-05, "loss": 1.2412, "step": 30 }, { "epoch": 0.1090589270008795, "grad_norm": 1.1690081357955933, "learning_rate": 8.49246231155779e-05, "loss": 1.1888, "step": 31 }, { "epoch": 0.11257695690413369, "grad_norm": 1.1313647031784058, "learning_rate": 8.442211055276383e-05, "loss": 1.2961, "step": 32 }, { "epoch": 0.11609498680738786, "grad_norm": 1.1976656913757324, "learning_rate": 8.391959798994975e-05, "loss": 1.2387, "step": 33 }, { "epoch": 0.11961301671064203, "grad_norm": 1.20232355594635, "learning_rate": 8.341708542713568e-05, "loss": 1.3125, "step": 34 }, { "epoch": 0.12313104661389622, "grad_norm": 1.2482579946517944, "learning_rate": 8.291457286432161e-05, "loss": 1.322, "step": 35 }, { "epoch": 0.1266490765171504, "grad_norm": 1.0197736024856567, "learning_rate": 8.241206030150754e-05, "loss": 1.1192, "step": 36 }, { "epoch": 0.13016710642040458, "grad_norm": 0.9190375208854675, "learning_rate": 8.190954773869348e-05, "loss": 1.2522, "step": 37 }, { "epoch": 0.13368513632365875, "grad_norm": 0.7511453032493591, "learning_rate": 8.14070351758794e-05, "loss": 1.0525, "step": 38 }, { "epoch": 0.13720316622691292, "grad_norm": 0.7151877880096436, "learning_rate": 8.090452261306533e-05, "loss": 1.1839, "step": 39 }, { "epoch": 0.14072119613016712, "grad_norm": 0.6375951766967773, "learning_rate": 8.040201005025126e-05, "loss": 1.2203, "step": 40 }, { "epoch": 0.1442392260334213, "grad_norm": 0.6267354488372803, "learning_rate": 7.989949748743719e-05, "loss": 1.1996, "step": 41 }, { "epoch": 0.14775725593667546, "grad_norm": 0.5620112419128418, "learning_rate": 7.939698492462313e-05, "loss": 1.1745, "step": 42 }, { "epoch": 0.15127528583992964, "grad_norm": 0.6898969411849976, "learning_rate": 7.889447236180904e-05, "loss": 1.2377, "step": 43 }, { "epoch": 0.1547933157431838, "grad_norm": 0.5548388957977295, "learning_rate": 7.839195979899498e-05, "loss": 1.1654, "step": 44 }, { "epoch": 0.158311345646438, "grad_norm": 0.5869529843330383, "learning_rate": 7.788944723618091e-05, "loss": 1.1669, "step": 45 }, { "epoch": 0.16182937554969218, "grad_norm": 0.6272417902946472, "learning_rate": 7.738693467336684e-05, "loss": 1.132, "step": 46 }, { "epoch": 0.16534740545294635, "grad_norm": 0.6158267855644226, "learning_rate": 7.688442211055277e-05, "loss": 1.0767, "step": 47 }, { "epoch": 0.16886543535620052, "grad_norm": 0.661561906337738, "learning_rate": 7.638190954773869e-05, "loss": 1.1867, "step": 48 }, { "epoch": 0.1723834652594547, "grad_norm": 0.5605206489562988, "learning_rate": 7.587939698492463e-05, "loss": 1.1243, "step": 49 }, { "epoch": 0.1759014951627089, "grad_norm": 0.6338799595832825, "learning_rate": 7.537688442211056e-05, "loss": 1.1635, "step": 50 }, { "epoch": 0.17941952506596306, "grad_norm": 0.7251884937286377, "learning_rate": 7.487437185929649e-05, "loss": 1.1462, "step": 51 }, { "epoch": 0.18293755496921724, 
"grad_norm": 0.5688169598579407, "learning_rate": 7.437185929648241e-05, "loss": 1.1351, "step": 52 }, { "epoch": 0.1864555848724714, "grad_norm": 0.6056070923805237, "learning_rate": 7.386934673366834e-05, "loss": 1.1352, "step": 53 }, { "epoch": 0.18997361477572558, "grad_norm": 0.8283679485321045, "learning_rate": 7.336683417085427e-05, "loss": 1.2222, "step": 54 }, { "epoch": 0.19349164467897978, "grad_norm": 0.6316900253295898, "learning_rate": 7.28643216080402e-05, "loss": 1.2023, "step": 55 }, { "epoch": 0.19700967458223395, "grad_norm": 0.6092143058776855, "learning_rate": 7.236180904522614e-05, "loss": 1.0762, "step": 56 }, { "epoch": 0.20052770448548812, "grad_norm": 0.5600019097328186, "learning_rate": 7.185929648241206e-05, "loss": 1.0127, "step": 57 }, { "epoch": 0.2040457343887423, "grad_norm": 0.6157863736152649, "learning_rate": 7.135678391959799e-05, "loss": 1.1016, "step": 58 }, { "epoch": 0.2075637642919965, "grad_norm": 0.6391822099685669, "learning_rate": 7.085427135678392e-05, "loss": 1.2009, "step": 59 }, { "epoch": 0.21108179419525067, "grad_norm": 0.5637600421905518, "learning_rate": 7.035175879396985e-05, "loss": 1.1419, "step": 60 }, { "epoch": 0.21459982409850484, "grad_norm": 0.6826542019844055, "learning_rate": 6.984924623115579e-05, "loss": 1.1084, "step": 61 }, { "epoch": 0.218117854001759, "grad_norm": 0.6475107073783875, "learning_rate": 6.93467336683417e-05, "loss": 1.2033, "step": 62 }, { "epoch": 0.22163588390501318, "grad_norm": 0.5701493620872498, "learning_rate": 6.884422110552764e-05, "loss": 1.1425, "step": 63 }, { "epoch": 0.22515391380826738, "grad_norm": 0.5416231155395508, "learning_rate": 6.834170854271357e-05, "loss": 1.0869, "step": 64 }, { "epoch": 0.22867194371152155, "grad_norm": 0.611254870891571, "learning_rate": 6.78391959798995e-05, "loss": 1.1344, "step": 65 }, { "epoch": 0.23218997361477572, "grad_norm": 0.5644116401672363, "learning_rate": 6.733668341708544e-05, "loss": 1.0655, "step": 66 }, { "epoch": 0.2357080035180299, "grad_norm": 0.5953249931335449, "learning_rate": 6.683417085427135e-05, "loss": 1.1267, "step": 67 }, { "epoch": 0.23922603342128407, "grad_norm": 0.5902895331382751, "learning_rate": 6.633165829145729e-05, "loss": 1.1207, "step": 68 }, { "epoch": 0.24274406332453827, "grad_norm": 0.571882426738739, "learning_rate": 6.582914572864322e-05, "loss": 1.0945, "step": 69 }, { "epoch": 0.24626209322779244, "grad_norm": 0.6372458934783936, "learning_rate": 6.532663316582915e-05, "loss": 1.1933, "step": 70 }, { "epoch": 0.2497801231310466, "grad_norm": 0.6739147901535034, "learning_rate": 6.482412060301508e-05, "loss": 1.1202, "step": 71 }, { "epoch": 0.2532981530343008, "grad_norm": 0.6515147686004639, "learning_rate": 6.4321608040201e-05, "loss": 1.1685, "step": 72 }, { "epoch": 0.256816182937555, "grad_norm": 0.5706716775894165, "learning_rate": 6.381909547738694e-05, "loss": 1.1084, "step": 73 }, { "epoch": 0.26033421284080915, "grad_norm": 0.595585286617279, "learning_rate": 6.331658291457287e-05, "loss": 1.1218, "step": 74 }, { "epoch": 0.2638522427440633, "grad_norm": 0.6020475625991821, "learning_rate": 6.28140703517588e-05, "loss": 1.1282, "step": 75 }, { "epoch": 0.2673702726473175, "grad_norm": 0.628376305103302, "learning_rate": 6.231155778894473e-05, "loss": 1.1067, "step": 76 }, { "epoch": 0.27088830255057167, "grad_norm": 0.6371076107025146, "learning_rate": 6.180904522613065e-05, "loss": 1.1466, "step": 77 }, { "epoch": 0.27440633245382584, "grad_norm": 0.6206318140029907, "learning_rate": 
6.130653266331658e-05, "loss": 1.0801, "step": 78 }, { "epoch": 0.27792436235708, "grad_norm": 0.6293841600418091, "learning_rate": 6.080402010050251e-05, "loss": 1.1644, "step": 79 }, { "epoch": 0.28144239226033424, "grad_norm": 0.6434080600738525, "learning_rate": 6.030150753768844e-05, "loss": 1.0589, "step": 80 }, { "epoch": 0.2849604221635884, "grad_norm": 0.5857638120651245, "learning_rate": 5.979899497487438e-05, "loss": 1.1711, "step": 81 }, { "epoch": 0.2884784520668426, "grad_norm": 0.6163449883460999, "learning_rate": 5.929648241206031e-05, "loss": 1.1627, "step": 82 }, { "epoch": 0.29199648197009676, "grad_norm": 0.6543634533882141, "learning_rate": 5.879396984924623e-05, "loss": 1.0909, "step": 83 }, { "epoch": 0.2955145118733509, "grad_norm": 0.6609559059143066, "learning_rate": 5.829145728643216e-05, "loss": 1.1505, "step": 84 }, { "epoch": 0.2990325417766051, "grad_norm": 0.5798302292823792, "learning_rate": 5.778894472361809e-05, "loss": 1.0834, "step": 85 }, { "epoch": 0.30255057167985927, "grad_norm": 0.6974066495895386, "learning_rate": 5.728643216080403e-05, "loss": 1.0965, "step": 86 }, { "epoch": 0.30606860158311344, "grad_norm": 0.67149817943573, "learning_rate": 5.6783919597989955e-05, "loss": 1.09, "step": 87 }, { "epoch": 0.3095866314863676, "grad_norm": 0.5761735439300537, "learning_rate": 5.628140703517588e-05, "loss": 1.1436, "step": 88 }, { "epoch": 0.3131046613896218, "grad_norm": 0.6142584681510925, "learning_rate": 5.577889447236181e-05, "loss": 1.0489, "step": 89 }, { "epoch": 0.316622691292876, "grad_norm": 0.6407614946365356, "learning_rate": 5.527638190954774e-05, "loss": 1.1449, "step": 90 }, { "epoch": 0.3201407211961302, "grad_norm": 0.6835021376609802, "learning_rate": 5.477386934673368e-05, "loss": 1.1332, "step": 91 }, { "epoch": 0.32365875109938436, "grad_norm": 0.5755856037139893, "learning_rate": 5.4271356783919604e-05, "loss": 1.1195, "step": 92 }, { "epoch": 0.32717678100263853, "grad_norm": 0.6232398748397827, "learning_rate": 5.376884422110553e-05, "loss": 1.1696, "step": 93 }, { "epoch": 0.3306948109058927, "grad_norm": 0.6193405389785767, "learning_rate": 5.3266331658291455e-05, "loss": 1.1106, "step": 94 }, { "epoch": 0.33421284080914687, "grad_norm": 0.6834057569503784, "learning_rate": 5.276381909547739e-05, "loss": 1.1349, "step": 95 }, { "epoch": 0.33773087071240104, "grad_norm": 0.7168384790420532, "learning_rate": 5.226130653266332e-05, "loss": 1.2054, "step": 96 }, { "epoch": 0.3412489006156552, "grad_norm": 0.6553971767425537, "learning_rate": 5.175879396984925e-05, "loss": 1.0975, "step": 97 }, { "epoch": 0.3447669305189094, "grad_norm": 0.6329600811004639, "learning_rate": 5.125628140703518e-05, "loss": 1.1212, "step": 98 }, { "epoch": 0.3482849604221636, "grad_norm": 0.6656339764595032, "learning_rate": 5.0753768844221104e-05, "loss": 1.1451, "step": 99 }, { "epoch": 0.3518029903254178, "grad_norm": 0.6817747950553894, "learning_rate": 5.0251256281407036e-05, "loss": 1.084, "step": 100 }, { "epoch": 0.35532102022867196, "grad_norm": 0.6384849548339844, "learning_rate": 4.974874371859297e-05, "loss": 1.047, "step": 101 }, { "epoch": 0.35883905013192613, "grad_norm": 0.6342082023620605, "learning_rate": 4.92462311557789e-05, "loss": 1.1122, "step": 102 }, { "epoch": 0.3623570800351803, "grad_norm": 0.6114000082015991, "learning_rate": 4.874371859296483e-05, "loss": 1.1094, "step": 103 }, { "epoch": 0.3658751099384345, "grad_norm": 0.6310352683067322, "learning_rate": 4.824120603015075e-05, "loss": 1.1508, "step": 104 }, { 
"epoch": 0.36939313984168864, "grad_norm": 0.6773234605789185, "learning_rate": 4.7738693467336685e-05, "loss": 1.0511, "step": 105 }, { "epoch": 0.3729111697449428, "grad_norm": 0.6625077724456787, "learning_rate": 4.723618090452262e-05, "loss": 1.1422, "step": 106 }, { "epoch": 0.376429199648197, "grad_norm": 0.6125949025154114, "learning_rate": 4.673366834170855e-05, "loss": 1.1189, "step": 107 }, { "epoch": 0.37994722955145116, "grad_norm": 0.684280514717102, "learning_rate": 4.6231155778894475e-05, "loss": 1.2249, "step": 108 }, { "epoch": 0.3834652594547054, "grad_norm": 0.8305927515029907, "learning_rate": 4.57286432160804e-05, "loss": 1.1758, "step": 109 }, { "epoch": 0.38698328935795956, "grad_norm": 0.6081312894821167, "learning_rate": 4.522613065326633e-05, "loss": 1.0853, "step": 110 }, { "epoch": 0.39050131926121373, "grad_norm": 0.716929018497467, "learning_rate": 4.4723618090452266e-05, "loss": 1.1903, "step": 111 }, { "epoch": 0.3940193491644679, "grad_norm": 0.5968315005302429, "learning_rate": 4.42211055276382e-05, "loss": 1.0717, "step": 112 }, { "epoch": 0.3975373790677221, "grad_norm": 0.6502510905265808, "learning_rate": 4.3718592964824124e-05, "loss": 1.0629, "step": 113 }, { "epoch": 0.40105540897097625, "grad_norm": 0.6408775448799133, "learning_rate": 4.321608040201005e-05, "loss": 1.0937, "step": 114 }, { "epoch": 0.4045734388742304, "grad_norm": 0.6137213110923767, "learning_rate": 4.271356783919598e-05, "loss": 1.0853, "step": 115 }, { "epoch": 0.4080914687774846, "grad_norm": 0.6401947736740112, "learning_rate": 4.2211055276381914e-05, "loss": 1.1542, "step": 116 }, { "epoch": 0.41160949868073876, "grad_norm": 0.6332412362098694, "learning_rate": 4.170854271356784e-05, "loss": 1.0731, "step": 117 }, { "epoch": 0.415127528583993, "grad_norm": 0.6274076700210571, "learning_rate": 4.120603015075377e-05, "loss": 1.0707, "step": 118 }, { "epoch": 0.41864555848724716, "grad_norm": 0.632633626461029, "learning_rate": 4.07035175879397e-05, "loss": 1.108, "step": 119 }, { "epoch": 0.42216358839050133, "grad_norm": 0.6979479193687439, "learning_rate": 4.020100502512563e-05, "loss": 1.1483, "step": 120 }, { "epoch": 0.4256816182937555, "grad_norm": 0.7355033755302429, "learning_rate": 3.969849246231156e-05, "loss": 1.1358, "step": 121 }, { "epoch": 0.4291996481970097, "grad_norm": 0.6254828572273254, "learning_rate": 3.919597989949749e-05, "loss": 1.1753, "step": 122 }, { "epoch": 0.43271767810026385, "grad_norm": 0.6851824522018433, "learning_rate": 3.869346733668342e-05, "loss": 1.0128, "step": 123 }, { "epoch": 0.436235708003518, "grad_norm": 0.6097928285598755, "learning_rate": 3.8190954773869346e-05, "loss": 1.1235, "step": 124 }, { "epoch": 0.4397537379067722, "grad_norm": 0.6748325824737549, "learning_rate": 3.768844221105528e-05, "loss": 1.0452, "step": 125 }, { "epoch": 0.44327176781002636, "grad_norm": 0.6666128039360046, "learning_rate": 3.7185929648241204e-05, "loss": 1.1075, "step": 126 }, { "epoch": 0.4467897977132806, "grad_norm": 0.7474984526634216, "learning_rate": 3.668341708542714e-05, "loss": 1.0695, "step": 127 }, { "epoch": 0.45030782761653476, "grad_norm": 0.6925339698791504, "learning_rate": 3.618090452261307e-05, "loss": 1.1024, "step": 128 }, { "epoch": 0.45382585751978893, "grad_norm": 0.6140123009681702, "learning_rate": 3.5678391959798995e-05, "loss": 1.0788, "step": 129 }, { "epoch": 0.4573438874230431, "grad_norm": 0.6771907806396484, "learning_rate": 3.517587939698493e-05, "loss": 1.0913, "step": 130 }, { "epoch": 0.4608619173262973, 
"grad_norm": 0.6700430512428284, "learning_rate": 3.467336683417085e-05, "loss": 1.0566, "step": 131 }, { "epoch": 0.46437994722955145, "grad_norm": 0.6931480169296265, "learning_rate": 3.4170854271356785e-05, "loss": 1.059, "step": 132 }, { "epoch": 0.4678979771328056, "grad_norm": 0.6608771085739136, "learning_rate": 3.366834170854272e-05, "loss": 1.119, "step": 133 }, { "epoch": 0.4714160070360598, "grad_norm": 0.6470663547515869, "learning_rate": 3.3165829145728643e-05, "loss": 1.0662, "step": 134 }, { "epoch": 0.47493403693931396, "grad_norm": 0.5729122757911682, "learning_rate": 3.2663316582914576e-05, "loss": 0.9999, "step": 135 }, { "epoch": 0.47845206684256814, "grad_norm": 0.6993862390518188, "learning_rate": 3.21608040201005e-05, "loss": 1.1819, "step": 136 }, { "epoch": 0.48197009674582236, "grad_norm": 0.6929494738578796, "learning_rate": 3.1658291457286434e-05, "loss": 1.1719, "step": 137 }, { "epoch": 0.48548812664907653, "grad_norm": 0.6951282620429993, "learning_rate": 3.1155778894472366e-05, "loss": 1.0716, "step": 138 }, { "epoch": 0.4890061565523307, "grad_norm": 0.6766693592071533, "learning_rate": 3.065326633165829e-05, "loss": 1.1589, "step": 139 }, { "epoch": 0.4925241864555849, "grad_norm": 0.6500269174575806, "learning_rate": 3.015075376884422e-05, "loss": 1.1122, "step": 140 }, { "epoch": 0.49604221635883905, "grad_norm": 0.7741857171058655, "learning_rate": 2.9648241206030153e-05, "loss": 1.1594, "step": 141 }, { "epoch": 0.4995602462620932, "grad_norm": 0.6630749106407166, "learning_rate": 2.914572864321608e-05, "loss": 1.0615, "step": 142 }, { "epoch": 0.5030782761653474, "grad_norm": 0.7230671048164368, "learning_rate": 2.8643216080402015e-05, "loss": 1.1521, "step": 143 }, { "epoch": 0.5065963060686016, "grad_norm": 0.6624138355255127, "learning_rate": 2.814070351758794e-05, "loss": 1.0347, "step": 144 }, { "epoch": 0.5101143359718557, "grad_norm": 0.6560067534446716, "learning_rate": 2.763819095477387e-05, "loss": 1.1214, "step": 145 }, { "epoch": 0.51363236587511, "grad_norm": 0.6742956638336182, "learning_rate": 2.7135678391959802e-05, "loss": 1.0956, "step": 146 }, { "epoch": 0.5171503957783641, "grad_norm": 0.706284761428833, "learning_rate": 2.6633165829145728e-05, "loss": 1.1058, "step": 147 }, { "epoch": 0.5206684256816183, "grad_norm": 0.6924006938934326, "learning_rate": 2.613065326633166e-05, "loss": 1.186, "step": 148 }, { "epoch": 0.5241864555848724, "grad_norm": 0.6287305951118469, "learning_rate": 2.562814070351759e-05, "loss": 1.0422, "step": 149 }, { "epoch": 0.5277044854881267, "grad_norm": 0.6957104206085205, "learning_rate": 2.5125628140703518e-05, "loss": 1.0896, "step": 150 }, { "epoch": 0.5312225153913809, "grad_norm": 0.7039506435394287, "learning_rate": 2.462311557788945e-05, "loss": 1.0818, "step": 151 }, { "epoch": 0.534740545294635, "grad_norm": 0.6502148509025574, "learning_rate": 2.4120603015075376e-05, "loss": 1.112, "step": 152 }, { "epoch": 0.5382585751978892, "grad_norm": 0.6823992133140564, "learning_rate": 2.361809045226131e-05, "loss": 1.0298, "step": 153 }, { "epoch": 0.5417766051011433, "grad_norm": 0.7539629936218262, "learning_rate": 2.3115577889447238e-05, "loss": 1.0618, "step": 154 }, { "epoch": 0.5452946350043976, "grad_norm": 0.6974697113037109, "learning_rate": 2.2613065326633167e-05, "loss": 1.1702, "step": 155 }, { "epoch": 0.5488126649076517, "grad_norm": 0.7035180330276489, "learning_rate": 2.21105527638191e-05, "loss": 1.0714, "step": 156 }, { "epoch": 0.5523306948109059, "grad_norm": 0.9007865786552429, 
"learning_rate": 2.1608040201005025e-05, "loss": 1.0565, "step": 157 }, { "epoch": 0.55584872471416, "grad_norm": 0.7083996534347534, "learning_rate": 2.1105527638190957e-05, "loss": 1.1425, "step": 158 }, { "epoch": 0.5593667546174143, "grad_norm": 0.7241733074188232, "learning_rate": 2.0603015075376886e-05, "loss": 1.1211, "step": 159 }, { "epoch": 0.5628847845206685, "grad_norm": 0.7474963068962097, "learning_rate": 2.0100502512562815e-05, "loss": 1.0546, "step": 160 }, { "epoch": 0.5664028144239226, "grad_norm": 0.7051181793212891, "learning_rate": 1.9597989949748744e-05, "loss": 0.9878, "step": 161 }, { "epoch": 0.5699208443271768, "grad_norm": 0.7359694242477417, "learning_rate": 1.9095477386934673e-05, "loss": 1.1283, "step": 162 }, { "epoch": 0.5734388742304309, "grad_norm": 0.6908060908317566, "learning_rate": 1.8592964824120602e-05, "loss": 1.1287, "step": 163 }, { "epoch": 0.5769569041336852, "grad_norm": 0.7220682501792908, "learning_rate": 1.8090452261306535e-05, "loss": 1.0424, "step": 164 }, { "epoch": 0.5804749340369393, "grad_norm": 0.7415404319763184, "learning_rate": 1.7587939698492464e-05, "loss": 1.0749, "step": 165 }, { "epoch": 0.5839929639401935, "grad_norm": 0.7168678641319275, "learning_rate": 1.7085427135678393e-05, "loss": 1.1308, "step": 166 }, { "epoch": 0.5875109938434476, "grad_norm": 0.653301477432251, "learning_rate": 1.6582914572864322e-05, "loss": 1.0777, "step": 167 }, { "epoch": 0.5910290237467019, "grad_norm": 0.7567819952964783, "learning_rate": 1.608040201005025e-05, "loss": 1.1476, "step": 168 }, { "epoch": 0.594547053649956, "grad_norm": 0.7353144288063049, "learning_rate": 1.5577889447236183e-05, "loss": 1.0961, "step": 169 }, { "epoch": 0.5980650835532102, "grad_norm": 0.6990388035774231, "learning_rate": 1.507537688442211e-05, "loss": 1.1619, "step": 170 }, { "epoch": 0.6015831134564644, "grad_norm": 0.7032533288002014, "learning_rate": 1.457286432160804e-05, "loss": 1.0619, "step": 171 }, { "epoch": 0.6051011433597185, "grad_norm": 0.6197975873947144, "learning_rate": 1.407035175879397e-05, "loss": 1.0953, "step": 172 }, { "epoch": 0.6086191732629728, "grad_norm": 0.746258556842804, "learning_rate": 1.3567839195979901e-05, "loss": 1.1201, "step": 173 }, { "epoch": 0.6121372031662269, "grad_norm": 0.6444905996322632, "learning_rate": 1.306532663316583e-05, "loss": 1.0241, "step": 174 }, { "epoch": 0.6156552330694811, "grad_norm": 0.7037890553474426, "learning_rate": 1.2562814070351759e-05, "loss": 1.0739, "step": 175 }, { "epoch": 0.6191732629727352, "grad_norm": 0.7138697504997253, "learning_rate": 1.2060301507537688e-05, "loss": 1.1102, "step": 176 }, { "epoch": 0.6226912928759895, "grad_norm": 0.7358911037445068, "learning_rate": 1.1557788944723619e-05, "loss": 1.1945, "step": 177 }, { "epoch": 0.6262093227792436, "grad_norm": 0.7306352853775024, "learning_rate": 1.105527638190955e-05, "loss": 1.0887, "step": 178 }, { "epoch": 0.6297273526824978, "grad_norm": 0.7626399993896484, "learning_rate": 1.0552763819095479e-05, "loss": 1.0918, "step": 179 }, { "epoch": 0.633245382585752, "grad_norm": 0.7157562375068665, "learning_rate": 1.0050251256281408e-05, "loss": 1.0794, "step": 180 }, { "epoch": 0.6367634124890061, "grad_norm": 0.674655556678772, "learning_rate": 9.547738693467337e-06, "loss": 1.1632, "step": 181 }, { "epoch": 0.6402814423922604, "grad_norm": 0.7276845574378967, "learning_rate": 9.045226130653267e-06, "loss": 1.0664, "step": 182 }, { "epoch": 0.6437994722955145, "grad_norm": 0.7614260315895081, "learning_rate": 
8.542713567839196e-06, "loss": 1.1185, "step": 183 }, { "epoch": 0.6473175021987687, "grad_norm": 0.691209614276886, "learning_rate": 8.040201005025125e-06, "loss": 1.0648, "step": 184 }, { "epoch": 0.6508355321020228, "grad_norm": 0.6736161708831787, "learning_rate": 7.537688442211055e-06, "loss": 1.11, "step": 185 }, { "epoch": 0.6543535620052771, "grad_norm": 0.6875973343849182, "learning_rate": 7.035175879396985e-06, "loss": 1.1085, "step": 186 }, { "epoch": 0.6578715919085312, "grad_norm": 0.6715053915977478, "learning_rate": 6.532663316582915e-06, "loss": 1.1391, "step": 187 }, { "epoch": 0.6613896218117854, "grad_norm": 0.7241913080215454, "learning_rate": 6.030150753768844e-06, "loss": 1.193, "step": 188 }, { "epoch": 0.6649076517150396, "grad_norm": 0.722939133644104, "learning_rate": 5.527638190954775e-06, "loss": 1.1218, "step": 189 }, { "epoch": 0.6684256816182937, "grad_norm": 0.7348630428314209, "learning_rate": 5.025125628140704e-06, "loss": 1.0771, "step": 190 }, { "epoch": 0.671943711521548, "grad_norm": 0.72852623462677, "learning_rate": 4.522613065326634e-06, "loss": 1.1196, "step": 191 }, { "epoch": 0.6754617414248021, "grad_norm": 0.7617117762565613, "learning_rate": 4.020100502512563e-06, "loss": 1.1313, "step": 192 }, { "epoch": 0.6789797713280563, "grad_norm": 0.8029654622077942, "learning_rate": 3.5175879396984926e-06, "loss": 1.1405, "step": 193 }, { "epoch": 0.6824978012313104, "grad_norm": 0.6885625123977661, "learning_rate": 3.015075376884422e-06, "loss": 1.0565, "step": 194 }, { "epoch": 0.6860158311345647, "grad_norm": 0.7057883143424988, "learning_rate": 2.512562814070352e-06, "loss": 1.1625, "step": 195 }, { "epoch": 0.6895338610378188, "grad_norm": 0.7429342269897461, "learning_rate": 2.0100502512562813e-06, "loss": 1.044, "step": 196 }, { "epoch": 0.693051890941073, "grad_norm": 0.7036694884300232, "learning_rate": 1.507537688442211e-06, "loss": 1.0991, "step": 197 }, { "epoch": 0.6965699208443272, "grad_norm": 0.6950182318687439, "learning_rate": 1.0050251256281407e-06, "loss": 1.1014, "step": 198 }, { "epoch": 0.7000879507475813, "grad_norm": 0.7009806632995605, "learning_rate": 5.025125628140703e-07, "loss": 1.1108, "step": 199 }, { "epoch": 0.7036059806508356, "grad_norm": 0.6382765769958496, "learning_rate": 0.0, "loss": 1.0479, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5191482454605824e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }
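The state above appears to be a Hugging Face Trainer `trainer_state.json` checkpoint: a single epoch, 200 optimizer steps, loss logged every step, and a linearly decaying learning rate from 1e-4 to 0. A minimal sketch for reading the logged curves back out, assuming the file is saved as `trainer_state.json` (the path and the optional matplotlib plot are illustrative, not part of the original run):

# Minimal sketch: parse the trainer state and extract loss / learning-rate curves.
# Assumptions: the JSON above is stored at "trainer_state.json"; matplotlib is optional.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only log entries that carry a training loss (here, all 200 of them).
entries = [e for e in state["log_history"] if "loss" in e]
steps   = [e["step"] for e in entries]
losses  = [e["loss"] for e in entries]
lrs     = [e["learning_rate"] for e in entries]

print(f"logged steps: {len(steps)}, first loss: {losses[0]:.4f}, final loss: {losses[-1]:.4f}")

# Optional visualization, skipped if matplotlib is not installed.
try:
    import matplotlib.pyplot as plt
    fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
    ax1.plot(steps, losses)
    ax1.set_ylabel("training loss")
    ax2.plot(steps, lrs)
    ax2.set_ylabel("learning rate")
    ax2.set_xlabel("step")
    fig.savefig("training_curves.png")
except ImportError:
    pass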