{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 200, "global_step": 762, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003940886699507389, "grad_norm": 1.046875, "learning_rate": 0.0, "loss": 1.9491, "step": 1 }, { "epoch": 0.019704433497536946, "grad_norm": 1.0546875, "learning_rate": 1.038961038961039e-05, "loss": 1.9959, "step": 5 }, { "epoch": 0.03940886699507389, "grad_norm": 0.92578125, "learning_rate": 2.3376623376623376e-05, "loss": 2.0041, "step": 10 }, { "epoch": 0.059113300492610835, "grad_norm": 0.81640625, "learning_rate": 3.6363636363636364e-05, "loss": 1.9683, "step": 15 }, { "epoch": 0.07881773399014778, "grad_norm": 0.6796875, "learning_rate": 4.9350649350649355e-05, "loss": 1.9528, "step": 20 }, { "epoch": 0.09852216748768473, "grad_norm": 0.5859375, "learning_rate": 6.233766233766233e-05, "loss": 1.9353, "step": 25 }, { "epoch": 0.11822660098522167, "grad_norm": 0.5390625, "learning_rate": 7.532467532467533e-05, "loss": 1.9095, "step": 30 }, { "epoch": 0.13793103448275862, "grad_norm": 0.455078125, "learning_rate": 8.831168831168831e-05, "loss": 1.8699, "step": 35 }, { "epoch": 0.15763546798029557, "grad_norm": 0.443359375, "learning_rate": 0.0001012987012987013, "loss": 1.865, "step": 40 }, { "epoch": 0.17733990147783252, "grad_norm": 0.384765625, "learning_rate": 0.00011428571428571428, "loss": 1.8171, "step": 45 }, { "epoch": 0.19704433497536947, "grad_norm": 0.328125, "learning_rate": 0.00012727272727272728, "loss": 1.8046, "step": 50 }, { "epoch": 0.21674876847290642, "grad_norm": 0.271484375, "learning_rate": 0.00014025974025974028, "loss": 1.7805, "step": 55 }, { "epoch": 0.23645320197044334, "grad_norm": 0.220703125, "learning_rate": 0.00015324675324675325, "loss": 1.7589, "step": 60 }, { "epoch": 0.2561576354679803, "grad_norm": 0.1875, "learning_rate": 0.00016623376623376625, "loss": 1.7481, "step": 65 }, { "epoch": 0.27586206896551724, "grad_norm": 0.1767578125, "learning_rate": 0.00017922077922077922, "loss": 1.7414, "step": 70 }, { "epoch": 0.2955665024630542, "grad_norm": 0.1533203125, "learning_rate": 0.00019220779220779222, "loss": 1.7252, "step": 75 }, { "epoch": 0.31527093596059114, "grad_norm": 0.150390625, "learning_rate": 0.00019999579326114222, "loss": 1.7274, "step": 80 }, { "epoch": 0.33497536945812806, "grad_norm": 0.1357421875, "learning_rate": 0.00019994847151359784, "loss": 1.6936, "step": 85 }, { "epoch": 0.35467980295566504, "grad_norm": 0.1396484375, "learning_rate": 0.0001998485945607536, "loss": 1.7096, "step": 90 }, { "epoch": 0.37438423645320196, "grad_norm": 0.1357421875, "learning_rate": 0.00019969621492020869, "loss": 1.6763, "step": 95 }, { "epoch": 0.39408866995073893, "grad_norm": 0.134765625, "learning_rate": 0.00019949141271668306, "loss": 1.69, "step": 100 }, { "epoch": 0.41379310344827586, "grad_norm": 0.123046875, "learning_rate": 0.00019923429563988614, "loss": 1.6758, "step": 105 }, { "epoch": 0.43349753694581283, "grad_norm": 0.12060546875, "learning_rate": 0.00019892499888789098, "loss": 1.688, "step": 110 }, { "epoch": 0.45320197044334976, "grad_norm": 0.125, "learning_rate": 0.00019856368509604412, "loss": 1.6867, "step": 115 }, { "epoch": 0.4729064039408867, "grad_norm": 0.1220703125, "learning_rate": 0.00019815054425144815, "loss": 1.663, "step": 120 }, { "epoch": 0.49261083743842365, "grad_norm": 0.111328125, "learning_rate": 0.00019768579359306205, "loss": 1.6621, "step": 125 }, { "epoch": 0.5123152709359606, "grad_norm": 0.10986328125, "learning_rate": 0.00019716967749747207, "loss": 1.6559, "step": 130 }, { "epoch": 0.5320197044334976, "grad_norm": 0.10888671875, "learning_rate": 0.00019660246735039266, "loss": 1.6891, "step": 135 }, { "epoch": 0.5517241379310345, "grad_norm": 0.11865234375, "learning_rate": 0.00019598446140396605, "loss": 1.6542, "step": 140 }, { "epoch": 0.5714285714285714, "grad_norm": 0.11767578125, "learning_rate": 0.00019531598461993392, "loss": 1.6707, "step": 145 }, { "epoch": 0.5911330049261084, "grad_norm": 0.107421875, "learning_rate": 0.00019459738849876543, "loss": 1.6433, "step": 150 }, { "epoch": 0.6108374384236454, "grad_norm": 0.11181640625, "learning_rate": 0.00019382905089482995, "loss": 1.6751, "step": 155 }, { "epoch": 0.6305418719211823, "grad_norm": 0.10595703125, "learning_rate": 0.00019301137581771266, "loss": 1.6452, "step": 160 }, { "epoch": 0.6502463054187192, "grad_norm": 0.109375, "learning_rate": 0.00019214479321977697, "loss": 1.6555, "step": 165 }, { "epoch": 0.6699507389162561, "grad_norm": 0.107421875, "learning_rate": 0.00019122975877008567, "loss": 1.6574, "step": 170 }, { "epoch": 0.6896551724137931, "grad_norm": 0.1181640625, "learning_rate": 0.00019026675361479969, "loss": 1.6703, "step": 175 }, { "epoch": 0.7093596059113301, "grad_norm": 0.1083984375, "learning_rate": 0.0001892562841241804, "loss": 1.651, "step": 180 }, { "epoch": 0.729064039408867, "grad_norm": 0.10888671875, "learning_rate": 0.00018819888162632838, "loss": 1.6543, "step": 185 }, { "epoch": 0.7487684729064039, "grad_norm": 0.1201171875, "learning_rate": 0.00018709510212779903, "loss": 1.6613, "step": 190 }, { "epoch": 0.7684729064039408, "grad_norm": 0.10791015625, "learning_rate": 0.0001859455260212414, "loss": 1.6542, "step": 195 }, { "epoch": 0.7881773399014779, "grad_norm": 0.11083984375, "learning_rate": 0.00018475075778021438, "loss": 1.6642, "step": 200 }, { "epoch": 0.7881773399014779, "eval_loss": 1.663087010383606, "eval_runtime": 89.4786, "eval_samples_per_second": 80.634, "eval_steps_per_second": 2.526, "step": 200 }, { "epoch": 0.8078817733990148, "grad_norm": 0.10986328125, "learning_rate": 0.00018351142564134078, "loss": 1.6655, "step": 205 }, { "epoch": 0.8275862068965517, "grad_norm": 0.109375, "learning_rate": 0.0001822281812739659, "loss": 1.6505, "step": 210 }, { "epoch": 0.8472906403940886, "grad_norm": 0.1044921875, "learning_rate": 0.00018090169943749476, "loss": 1.6417, "step": 215 }, { "epoch": 0.8669950738916257, "grad_norm": 0.11474609375, "learning_rate": 0.00017953267762658827, "loss": 1.6399, "step": 220 }, { "epoch": 0.8866995073891626, "grad_norm": 0.1025390625, "learning_rate": 0.00017812183570440428, "loss": 1.6497, "step": 225 }, { "epoch": 0.9064039408866995, "grad_norm": 0.11376953125, "learning_rate": 0.00017666991552407724, "loss": 1.6401, "step": 230 }, { "epoch": 0.9261083743842364, "grad_norm": 0.10546875, "learning_rate": 0.0001751776805386344, "loss": 1.6324, "step": 235 }, { "epoch": 0.9458128078817734, "grad_norm": 0.1044921875, "learning_rate": 0.000173645915399555, "loss": 1.6243, "step": 240 }, { "epoch": 0.9655172413793104, "grad_norm": 0.1123046875, "learning_rate": 0.00017207542554418227, "loss": 1.6299, "step": 245 }, { "epoch": 0.9852216748768473, "grad_norm": 0.10595703125, "learning_rate": 0.000170467036772206, "loss": 1.6373, "step": 250 }, { "epoch": 1.0039408866995074, "grad_norm": 0.10205078125, "learning_rate": 0.00016882159481143802, "loss": 1.6341, "step": 255 }, { "epoch": 1.0236453201970444, "grad_norm": 0.1044921875, "learning_rate": 0.00016713996487310916, "loss": 1.6373, "step": 260 }, { "epoch": 1.0433497536945813, "grad_norm": 0.10546875, "learning_rate": 0.00016542303119692129, "loss": 1.6343, "step": 265 }, { "epoch": 1.0630541871921182, "grad_norm": 0.115234375, "learning_rate": 0.00016367169658609355, "loss": 1.6345, "step": 270 }, { "epoch": 1.0827586206896551, "grad_norm": 0.10107421875, "learning_rate": 0.0001618868819326479, "loss": 1.6256, "step": 275 }, { "epoch": 1.102463054187192, "grad_norm": 0.1142578125, "learning_rate": 0.00016006952573318278, "loss": 1.627, "step": 280 }, { "epoch": 1.1221674876847292, "grad_norm": 0.1044921875, "learning_rate": 0.00015822058359539002, "loss": 1.6313, "step": 285 }, { "epoch": 1.141871921182266, "grad_norm": 0.10693359375, "learning_rate": 0.0001563410277355743, "loss": 1.6384, "step": 290 }, { "epoch": 1.161576354679803, "grad_norm": 0.10986328125, "learning_rate": 0.0001544318464674397, "loss": 1.615, "step": 295 }, { "epoch": 1.18128078817734, "grad_norm": 0.1015625, "learning_rate": 0.00015249404368241116, "loss": 1.6239, "step": 300 }, { "epoch": 1.2009852216748769, "grad_norm": 0.10302734375, "learning_rate": 0.0001505286383217657, "loss": 1.6183, "step": 305 }, { "epoch": 1.2206896551724138, "grad_norm": 0.1083984375, "learning_rate": 0.0001485366638408496, "loss": 1.6164, "step": 310 }, { "epoch": 1.2403940886699507, "grad_norm": 0.10986328125, "learning_rate": 0.0001465191676656634, "loss": 1.6268, "step": 315 }, { "epoch": 1.2600985221674876, "grad_norm": 0.10205078125, "learning_rate": 0.00014447721064210186, "loss": 1.6222, "step": 320 }, { "epoch": 1.2798029556650246, "grad_norm": 0.099609375, "learning_rate": 0.00014241186647813626, "loss": 1.6309, "step": 325 }, { "epoch": 1.2995073891625615, "grad_norm": 0.09814453125, "learning_rate": 0.00014032422117923426, "loss": 1.612, "step": 330 }, { "epoch": 1.3192118226600984, "grad_norm": 0.1015625, "learning_rate": 0.00013821537247731336, "loss": 1.6265, "step": 335 }, { "epoch": 1.3389162561576355, "grad_norm": 0.10205078125, "learning_rate": 0.00013608642925352793, "loss": 1.6414, "step": 340 }, { "epoch": 1.3586206896551725, "grad_norm": 0.10107421875, "learning_rate": 0.00013393851095519423, "loss": 1.6311, "step": 345 }, { "epoch": 1.3783251231527094, "grad_norm": 0.10595703125, "learning_rate": 0.00013177274700715914, "loss": 1.6226, "step": 350 }, { "epoch": 1.3980295566502463, "grad_norm": 0.10205078125, "learning_rate": 0.00012959027621792265, "loss": 1.6294, "step": 355 }, { "epoch": 1.4177339901477832, "grad_norm": 0.1015625, "learning_rate": 0.00012739224618082612, "loss": 1.6382, "step": 360 }, { "epoch": 1.4374384236453202, "grad_norm": 0.1064453125, "learning_rate": 0.00012517981267062134, "loss": 1.6221, "step": 365 }, { "epoch": 1.457142857142857, "grad_norm": 0.10986328125, "learning_rate": 0.00012295413903573756, "loss": 1.6241, "step": 370 }, { "epoch": 1.4768472906403942, "grad_norm": 0.10302734375, "learning_rate": 0.00012071639558656614, "loss": 1.631, "step": 375 }, { "epoch": 1.4965517241379311, "grad_norm": 0.1025390625, "learning_rate": 0.00011846775898008438, "loss": 1.6187, "step": 380 }, { "epoch": 1.516256157635468, "grad_norm": 0.103515625, "learning_rate": 0.00011620941160114229, "loss": 1.6244, "step": 385 }, { "epoch": 1.535960591133005, "grad_norm": 0.1005859375, "learning_rate": 0.0001139425409407374, "loss": 1.6291, "step": 390 }, { "epoch": 1.555665024630542, "grad_norm": 0.09814453125, "learning_rate": 0.00011166833897160465, "loss": 1.6324, "step": 395 }, { "epoch": 1.5753694581280788, "grad_norm": 0.1025390625, "learning_rate": 0.00010938800152144984, "loss": 1.6064, "step": 400 }, { "epoch": 1.5753694581280788, "eval_loss": 1.6445980072021484, "eval_runtime": 89.4599, "eval_samples_per_second": 80.651, "eval_steps_per_second": 2.526, "step": 400 }, { "epoch": 1.5950738916256157, "grad_norm": 0.099609375, "learning_rate": 0.00010710272764415566, "loss": 1.63, "step": 405 }, { "epoch": 1.6147783251231527, "grad_norm": 0.1025390625, "learning_rate": 0.00010481371898929186, "loss": 1.6147, "step": 410 }, { "epoch": 1.6344827586206896, "grad_norm": 0.10205078125, "learning_rate": 0.0001025221791702601, "loss": 1.6396, "step": 415 }, { "epoch": 1.6541871921182265, "grad_norm": 0.10107421875, "learning_rate": 0.00010022931313140638, "loss": 1.6197, "step": 420 }, { "epoch": 1.6738916256157634, "grad_norm": 0.103515625, "learning_rate": 9.793632651443357e-05, "loss": 1.6212, "step": 425 }, { "epoch": 1.6935960591133004, "grad_norm": 0.095703125, "learning_rate": 9.564442502444735e-05, "loss": 1.6193, "step": 430 }, { "epoch": 1.7133004926108373, "grad_norm": 0.09912109375, "learning_rate": 9.33548137959686e-05, "loss": 1.6423, "step": 435 }, { "epoch": 1.7330049261083744, "grad_norm": 0.10595703125, "learning_rate": 9.106869675924605e-05, "loss": 1.6217, "step": 440 }, { "epoch": 1.7527093596059113, "grad_norm": 0.1005859375, "learning_rate": 8.878727600720207e-05, "loss": 1.6264, "step": 445 }, { "epoch": 1.7724137931034483, "grad_norm": 0.09765625, "learning_rate": 8.651175116334443e-05, "loss": 1.6167, "step": 450 }, { "epoch": 1.7921182266009852, "grad_norm": 0.103515625, "learning_rate": 8.424331875097688e-05, "loss": 1.634, "step": 455 }, { "epoch": 1.8118226600985223, "grad_norm": 0.10400390625, "learning_rate": 8.19831715640394e-05, "loss": 1.6366, "step": 460 }, { "epoch": 1.8315270935960593, "grad_norm": 0.1005859375, "learning_rate": 7.973249803991006e-05, "loss": 1.6169, "step": 465 }, { "epoch": 1.8512315270935962, "grad_norm": 0.10693359375, "learning_rate": 7.749248163449693e-05, "loss": 1.6167, "step": 470 }, { "epoch": 1.870935960591133, "grad_norm": 0.10205078125, "learning_rate": 7.526430019995001e-05, "loss": 1.6429, "step": 475 }, { "epoch": 1.89064039408867, "grad_norm": 0.0986328125, "learning_rate": 7.304912536531944e-05, "loss": 1.6252, "step": 480 }, { "epoch": 1.910344827586207, "grad_norm": 0.0966796875, "learning_rate": 7.084812192048594e-05, "loss": 1.6298, "step": 485 }, { "epoch": 1.9300492610837439, "grad_norm": 0.095703125, "learning_rate": 6.866244720368737e-05, "loss": 1.6283, "step": 490 }, { "epoch": 1.9497536945812808, "grad_norm": 0.099609375, "learning_rate": 6.6493250492964e-05, "loss": 1.6139, "step": 495 }, { "epoch": 1.9694581280788177, "grad_norm": 0.10595703125, "learning_rate": 6.434167240184135e-05, "loss": 1.6419, "step": 500 }, { "epoch": 1.9891625615763546, "grad_norm": 0.09912109375, "learning_rate": 6.220884427956953e-05, "loss": 1.6255, "step": 505 }, { "epoch": 2.007881773399015, "grad_norm": 0.09765625, "learning_rate": 6.0095887616233796e-05, "loss": 1.6173, "step": 510 }, { "epoch": 2.027586206896552, "grad_norm": 0.09765625, "learning_rate": 5.800391345304914e-05, "loss": 1.6155, "step": 515 }, { "epoch": 2.0472906403940887, "grad_norm": 0.09375, "learning_rate": 5.593402179814944e-05, "loss": 1.6179, "step": 520 }, { "epoch": 2.0669950738916256, "grad_norm": 0.09619140625, "learning_rate": 5.388730104817769e-05, "loss": 1.6202, "step": 525 }, { "epoch": 2.0866995073891625, "grad_norm": 0.0986328125, "learning_rate": 5.18648274159821e-05, "loss": 1.6166, "step": 530 }, { "epoch": 2.1064039408866995, "grad_norm": 0.10205078125, "learning_rate": 4.9867664364718725e-05, "loss": 1.6219, "step": 535 }, { "epoch": 2.1261083743842364, "grad_norm": 0.09521484375, "learning_rate": 4.7896862048657965e-05, "loss": 1.6201, "step": 540 }, { "epoch": 2.1458128078817733, "grad_norm": 0.10693359375, "learning_rate": 4.595345676098923e-05, "loss": 1.6175, "step": 545 }, { "epoch": 2.1655172413793102, "grad_norm": 0.10009765625, "learning_rate": 4.403847038891424e-05, "loss": 1.6131, "step": 550 }, { "epoch": 2.185221674876847, "grad_norm": 0.0947265625, "learning_rate": 4.2152909876315316e-05, "loss": 1.6232, "step": 555 }, { "epoch": 2.204926108374384, "grad_norm": 0.10205078125, "learning_rate": 4.0297766694280915e-05, "loss": 1.6291, "step": 560 }, { "epoch": 2.224630541871921, "grad_norm": 0.09423828125, "learning_rate": 3.8474016319767435e-05, "loss": 1.6178, "step": 565 }, { "epoch": 2.2443349753694584, "grad_norm": 0.1044921875, "learning_rate": 3.6682617722671096e-05, "loss": 1.6215, "step": 570 }, { "epoch": 2.264039408866995, "grad_norm": 0.095703125, "learning_rate": 3.4924512861579315e-05, "loss": 1.615, "step": 575 }, { "epoch": 2.283743842364532, "grad_norm": 0.09814453125, "learning_rate": 3.3200626188467344e-05, "loss": 1.6251, "step": 580 }, { "epoch": 2.303448275862069, "grad_norm": 0.09912109375, "learning_rate": 3.151186416260006e-05, "loss": 1.6425, "step": 585 }, { "epoch": 2.323152709359606, "grad_norm": 0.09423828125, "learning_rate": 2.9859114773895025e-05, "loss": 1.6162, "step": 590 }, { "epoch": 2.342857142857143, "grad_norm": 0.107421875, "learning_rate": 2.8243247075996693e-05, "loss": 1.6128, "step": 595 }, { "epoch": 2.36256157635468, "grad_norm": 0.09912109375, "learning_rate": 2.6665110729308263e-05, "loss": 1.6249, "step": 600 }, { "epoch": 2.36256157635468, "eval_loss": 1.6416531801223755, "eval_runtime": 89.4559, "eval_samples_per_second": 80.654, "eval_steps_per_second": 2.526, "step": 600 }, { "epoch": 2.382266009852217, "grad_norm": 0.1025390625, "learning_rate": 2.5125535554220482e-05, "loss": 1.6283, "step": 605 }, { "epoch": 2.4019704433497537, "grad_norm": 0.1005859375, "learning_rate": 2.3625331094773206e-05, "loss": 1.6113, "step": 610 }, { "epoch": 2.4216748768472907, "grad_norm": 0.0966796875, "learning_rate": 2.2165286192978342e-05, "loss": 1.6162, "step": 615 }, { "epoch": 2.4413793103448276, "grad_norm": 0.09619140625, "learning_rate": 2.074616857402867e-05, "loss": 1.6098, "step": 620 }, { "epoch": 2.4610837438423645, "grad_norm": 0.1044921875, "learning_rate": 1.936872444261022e-05, "loss": 1.6156, "step": 625 }, { "epoch": 2.4807881773399014, "grad_norm": 0.0947265625, "learning_rate": 1.8033678090530813e-05, "loss": 1.6227, "step": 630 }, { "epoch": 2.5004926108374383, "grad_norm": 0.0966796875, "learning_rate": 1.6741731515870594e-05, "loss": 1.6173, "step": 635 }, { "epoch": 2.5201970443349753, "grad_norm": 0.09716796875, "learning_rate": 1.549356405385538e-05, "loss": 1.6022, "step": 640 }, { "epoch": 2.539901477832512, "grad_norm": 0.09814453125, "learning_rate": 1.428983201964662e-05, "loss": 1.6261, "step": 645 }, { "epoch": 2.559605911330049, "grad_norm": 0.0966796875, "learning_rate": 1.313116836323568e-05, "loss": 1.6238, "step": 650 }, { "epoch": 2.5793103448275865, "grad_norm": 0.10009765625, "learning_rate": 1.2018182336624273e-05, "loss": 1.6201, "step": 655 }, { "epoch": 2.599014778325123, "grad_norm": 0.095703125, "learning_rate": 1.0951459173465629e-05, "loss": 1.6343, "step": 660 }, { "epoch": 2.6187192118226603, "grad_norm": 0.09814453125, "learning_rate": 9.93155978133541e-06, "loss": 1.6194, "step": 665 }, { "epoch": 2.638423645320197, "grad_norm": 0.09814453125, "learning_rate": 8.959020446793288e-06, "loss": 1.6181, "step": 670 }, { "epoch": 2.658128078817734, "grad_norm": 0.0966796875, "learning_rate": 8.034352553391367e-06, "loss": 1.6321, "step": 675 }, { "epoch": 2.677832512315271, "grad_norm": 0.10546875, "learning_rate": 7.158042312776847e-06, "loss": 1.6124, "step": 680 }, { "epoch": 2.697536945812808, "grad_norm": 0.0966796875, "learning_rate": 6.330550509030852e-06, "loss": 1.6087, "step": 685 }, { "epoch": 2.717241379310345, "grad_norm": 0.095703125, "learning_rate": 5.552312256377423e-06, "loss": 1.609, "step": 690 }, { "epoch": 2.736945812807882, "grad_norm": 0.10107421875, "learning_rate": 4.823736770390552e-06, "loss": 1.6194, "step": 695 }, { "epoch": 2.7566502463054188, "grad_norm": 0.10205078125, "learning_rate": 4.14520715281923e-06, "loss": 1.6287, "step": 700 }, { "epoch": 2.7763546798029557, "grad_norm": 0.109375, "learning_rate": 3.517080190143629e-06, "loss": 1.6165, "step": 705 }, { "epoch": 2.7960591133004926, "grad_norm": 0.0986328125, "learning_rate": 2.9396861659686915e-06, "loss": 1.6165, "step": 710 }, { "epoch": 2.8157635467980295, "grad_norm": 0.0966796875, "learning_rate": 2.4133286873533112e-06, "loss": 1.6024, "step": 715 }, { "epoch": 2.8354679802955665, "grad_norm": 0.09912109375, "learning_rate": 1.9382845251668335e-06, "loss": 1.6149, "step": 720 }, { "epoch": 2.8551724137931034, "grad_norm": 0.095703125, "learning_rate": 1.514803468556547e-06, "loss": 1.6083, "step": 725 }, { "epoch": 2.8748768472906403, "grad_norm": 0.09716796875, "learning_rate": 1.14310819360276e-06, "loss": 1.6267, "step": 730 }, { "epoch": 2.8945812807881772, "grad_norm": 0.1005859375, "learning_rate": 8.233941462306271e-07, "loss": 1.6094, "step": 735 }, { "epoch": 2.914285714285714, "grad_norm": 0.09619140625, "learning_rate": 5.558294394402253e-07, "loss": 1.6445, "step": 740 }, { "epoch": 2.933990147783251, "grad_norm": 0.09765625, "learning_rate": 3.405547649087959e-07, "loss": 1.643, "step": 745 }, { "epoch": 2.9536945812807884, "grad_norm": 0.1044921875, "learning_rate": 1.7768331901187875e-07, "loss": 1.6242, "step": 750 }, { "epoch": 2.973399014778325, "grad_norm": 0.099609375, "learning_rate": 6.730074330203451e-08, "loss": 1.6259, "step": 755 }, { "epoch": 2.9931034482758623, "grad_norm": 0.09912109375, "learning_rate": 9.46507947655606e-09, "loss": 1.5984, "step": 760 }, { "epoch": 3.0, "step": 762, "total_flos": 2.543043044567941e+17, "train_loss": 1.6552943881415318, "train_runtime": 7255.878, "train_samples_per_second": 26.856, "train_steps_per_second": 0.105 } ], "logging_steps": 5, "max_steps": 762, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.543043044567941e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }