{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9921259842519685, "eval_steps": 500, "global_step": 380, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005249343832020997, "grad_norm": 4.262586140207107, "learning_rate": 1.2500000000000002e-07, "loss": 1.2143, "step": 1 }, { "epoch": 0.010498687664041995, "grad_norm": 4.1559742669756154, "learning_rate": 2.5000000000000004e-07, "loss": 1.2307, "step": 2 }, { "epoch": 0.015748031496062992, "grad_norm": 4.2196647284049895, "learning_rate": 3.75e-07, "loss": 1.2286, "step": 3 }, { "epoch": 0.02099737532808399, "grad_norm": 4.13634077943981, "learning_rate": 5.000000000000001e-07, "loss": 1.2002, "step": 4 }, { "epoch": 0.026246719160104987, "grad_norm": 4.015668455829927, "learning_rate": 6.25e-07, "loss": 1.1672, "step": 5 }, { "epoch": 0.031496062992125984, "grad_norm": 3.832855314884781, "learning_rate": 7.5e-07, "loss": 1.1993, "step": 6 }, { "epoch": 0.03674540682414698, "grad_norm": 3.8323407788221733, "learning_rate": 8.75e-07, "loss": 1.1554, "step": 7 }, { "epoch": 0.04199475065616798, "grad_norm": 3.7465244180174917, "learning_rate": 1.0000000000000002e-06, "loss": 1.1672, "step": 8 }, { "epoch": 0.047244094488188976, "grad_norm": 3.7827251172961986, "learning_rate": 1.125e-06, "loss": 1.1755, "step": 9 }, { "epoch": 0.05249343832020997, "grad_norm": 3.470602675526565, "learning_rate": 1.25e-06, "loss": 1.1419, "step": 10 }, { "epoch": 0.05774278215223097, "grad_norm": 3.556221853917274, "learning_rate": 1.3750000000000002e-06, "loss": 1.194, "step": 11 }, { "epoch": 0.06299212598425197, "grad_norm": 3.324934060085957, "learning_rate": 1.5e-06, "loss": 1.1336, "step": 12 }, { "epoch": 0.06824146981627296, "grad_norm": 2.965981688480075, "learning_rate": 1.6250000000000001e-06, "loss": 1.1349, "step": 13 }, { "epoch": 0.07349081364829396, "grad_norm": 2.8658973663115046, "learning_rate": 1.75e-06, "loss": 1.1776, "step": 14 }, { "epoch": 0.07874015748031496, "grad_norm": 2.720689909744523, "learning_rate": 1.8750000000000003e-06, "loss": 1.1549, "step": 15 }, { "epoch": 0.08398950131233596, "grad_norm": 2.439062154183451, "learning_rate": 2.0000000000000003e-06, "loss": 1.141, "step": 16 }, { "epoch": 0.08923884514435695, "grad_norm": 2.1353279033918002, "learning_rate": 2.125e-06, "loss": 1.139, "step": 17 }, { "epoch": 0.09448818897637795, "grad_norm": 1.2092875477650313, "learning_rate": 2.25e-06, "loss": 1.0516, "step": 18 }, { "epoch": 0.09973753280839895, "grad_norm": 1.0763847439987342, "learning_rate": 2.375e-06, "loss": 1.0802, "step": 19 }, { "epoch": 0.10498687664041995, "grad_norm": 1.0340250902529846, "learning_rate": 2.5e-06, "loss": 1.0607, "step": 20 }, { "epoch": 0.11023622047244094, "grad_norm": 0.8630354040462489, "learning_rate": 2.6250000000000003e-06, "loss": 1.0496, "step": 21 }, { "epoch": 0.11548556430446194, "grad_norm": 0.8072735722523627, "learning_rate": 2.7500000000000004e-06, "loss": 1.08, "step": 22 }, { "epoch": 0.12073490813648294, "grad_norm": 0.7305262800316248, "learning_rate": 2.875e-06, "loss": 1.0539, "step": 23 }, { "epoch": 0.12598425196850394, "grad_norm": 0.7406394208995156, "learning_rate": 3e-06, "loss": 1.0529, "step": 24 }, { "epoch": 0.13123359580052493, "grad_norm": 0.7903255346265977, "learning_rate": 3.125e-06, "loss": 0.9914, "step": 25 }, { "epoch": 0.13648293963254593, "grad_norm": 0.8017049075586423, "learning_rate": 3.2500000000000002e-06, "loss": 1.0313, "step": 26 }, { "epoch": 0.14173228346456693, "grad_norm": 0.7764417012146556, "learning_rate": 3.3750000000000003e-06, "loss": 1.0422, "step": 27 }, { "epoch": 0.14698162729658792, "grad_norm": 0.7112571206699242, "learning_rate": 3.5e-06, "loss": 1.0187, "step": 28 }, { "epoch": 0.15223097112860892, "grad_norm": 0.6069574236656299, "learning_rate": 3.625e-06, "loss": 0.9958, "step": 29 }, { "epoch": 0.15748031496062992, "grad_norm": 0.5747553586770598, "learning_rate": 3.7500000000000005e-06, "loss": 0.9858, "step": 30 }, { "epoch": 0.16272965879265092, "grad_norm": 0.590510044443011, "learning_rate": 3.875e-06, "loss": 0.9841, "step": 31 }, { "epoch": 0.1679790026246719, "grad_norm": 0.5240510507345066, "learning_rate": 4.000000000000001e-06, "loss": 1.0171, "step": 32 }, { "epoch": 0.1732283464566929, "grad_norm": 0.4913378134702597, "learning_rate": 4.125e-06, "loss": 1.0218, "step": 33 }, { "epoch": 0.1784776902887139, "grad_norm": 0.47909247638813307, "learning_rate": 4.25e-06, "loss": 1.02, "step": 34 }, { "epoch": 0.1837270341207349, "grad_norm": 0.4949722185401137, "learning_rate": 4.3750000000000005e-06, "loss": 1.0206, "step": 35 }, { "epoch": 0.1889763779527559, "grad_norm": 0.48456911030733396, "learning_rate": 4.5e-06, "loss": 1.0321, "step": 36 }, { "epoch": 0.1942257217847769, "grad_norm": 0.49965866704889284, "learning_rate": 4.625000000000001e-06, "loss": 1.0184, "step": 37 }, { "epoch": 0.1994750656167979, "grad_norm": 0.4898360279427832, "learning_rate": 4.75e-06, "loss": 1.0165, "step": 38 }, { "epoch": 0.2047244094488189, "grad_norm": 0.5039246597121783, "learning_rate": 4.875e-06, "loss": 0.9811, "step": 39 }, { "epoch": 0.2099737532808399, "grad_norm": 0.462371012430454, "learning_rate": 5e-06, "loss": 0.9999, "step": 40 }, { "epoch": 0.2152230971128609, "grad_norm": 0.4564370676360458, "learning_rate": 4.99989327925842e-06, "loss": 1.0056, "step": 41 }, { "epoch": 0.2204724409448819, "grad_norm": 0.4838548423542603, "learning_rate": 4.999573126145132e-06, "loss": 1.0099, "step": 42 }, { "epoch": 0.22572178477690288, "grad_norm": 0.47657399375831033, "learning_rate": 4.999039567993719e-06, "loss": 1.0059, "step": 43 }, { "epoch": 0.23097112860892388, "grad_norm": 0.4609228968128241, "learning_rate": 4.998292650357558e-06, "loss": 0.9613, "step": 44 }, { "epoch": 0.23622047244094488, "grad_norm": 0.5379441335347738, "learning_rate": 4.997332437005932e-06, "loss": 0.9912, "step": 45 }, { "epoch": 0.24146981627296588, "grad_norm": 0.4488276134846175, "learning_rate": 4.996159009918586e-06, "loss": 0.9623, "step": 46 }, { "epoch": 0.24671916010498687, "grad_norm": 0.7899879742350473, "learning_rate": 4.994772469278726e-06, "loss": 0.9373, "step": 47 }, { "epoch": 0.25196850393700787, "grad_norm": 0.45351107525432893, "learning_rate": 4.99317293346447e-06, "loss": 0.9312, "step": 48 }, { "epoch": 0.2572178477690289, "grad_norm": 0.4145057356223518, "learning_rate": 4.991360539038737e-06, "loss": 0.9133, "step": 49 }, { "epoch": 0.26246719160104987, "grad_norm": 0.4371226648356658, "learning_rate": 4.989335440737587e-06, "loss": 0.9763, "step": 50 }, { "epoch": 0.2677165354330709, "grad_norm": 0.49987252416920314, "learning_rate": 4.987097811457015e-06, "loss": 0.9753, "step": 51 }, { "epoch": 0.27296587926509186, "grad_norm": 0.4737378597917066, "learning_rate": 4.984647842238185e-06, "loss": 0.9509, "step": 52 }, { "epoch": 0.2782152230971129, "grad_norm": 0.4803218242006868, "learning_rate": 4.981985742251123e-06, "loss": 1.0008, "step": 53 }, { "epoch": 0.28346456692913385, "grad_norm": 0.4238793960297473, "learning_rate": 4.9791117387768575e-06, "loss": 1.0024, "step": 54 }, { "epoch": 0.2887139107611549, "grad_norm": 0.4135698693902407, "learning_rate": 4.976026077188013e-06, "loss": 0.9208, "step": 55 }, { "epoch": 0.29396325459317585, "grad_norm": 0.49893312330659967, "learning_rate": 4.972729020927866e-06, "loss": 0.9771, "step": 56 }, { "epoch": 0.2992125984251969, "grad_norm": 0.43878390791709027, "learning_rate": 4.9692208514878445e-06, "loss": 0.937, "step": 57 }, { "epoch": 0.30446194225721784, "grad_norm": 0.42250251333050837, "learning_rate": 4.965501868383507e-06, "loss": 0.9287, "step": 58 }, { "epoch": 0.30971128608923887, "grad_norm": 0.42426620742455357, "learning_rate": 4.961572389128959e-06, "loss": 0.9374, "step": 59 }, { "epoch": 0.31496062992125984, "grad_norm": 0.4622588583575654, "learning_rate": 4.957432749209755e-06, "loss": 0.99, "step": 60 }, { "epoch": 0.32020997375328086, "grad_norm": 0.4324798534582787, "learning_rate": 4.953083302054247e-06, "loss": 1.0035, "step": 61 }, { "epoch": 0.32545931758530183, "grad_norm": 0.4303590460079348, "learning_rate": 4.948524419003415e-06, "loss": 0.9585, "step": 62 }, { "epoch": 0.33070866141732286, "grad_norm": 0.42861048906851473, "learning_rate": 4.943756489279164e-06, "loss": 0.9772, "step": 63 }, { "epoch": 0.3359580052493438, "grad_norm": 0.4115697149677722, "learning_rate": 4.938779919951092e-06, "loss": 0.9426, "step": 64 }, { "epoch": 0.34120734908136485, "grad_norm": 0.417300147661056, "learning_rate": 4.933595135901733e-06, "loss": 0.9447, "step": 65 }, { "epoch": 0.3464566929133858, "grad_norm": 0.4040601468423496, "learning_rate": 4.928202579790285e-06, "loss": 0.966, "step": 66 }, { "epoch": 0.35170603674540685, "grad_norm": 0.3677161548087925, "learning_rate": 4.9226027120148195e-06, "loss": 0.941, "step": 67 }, { "epoch": 0.3569553805774278, "grad_norm": 0.3832286188469758, "learning_rate": 4.916796010672969e-06, "loss": 0.9822, "step": 68 }, { "epoch": 0.36220472440944884, "grad_norm": 0.4345291089971557, "learning_rate": 4.910782971521112e-06, "loss": 0.9687, "step": 69 }, { "epoch": 0.3674540682414698, "grad_norm": 0.4133843702108161, "learning_rate": 4.904564107932048e-06, "loss": 0.9283, "step": 70 }, { "epoch": 0.37270341207349084, "grad_norm": 0.38465081625519515, "learning_rate": 4.898139950851163e-06, "loss": 0.9479, "step": 71 }, { "epoch": 0.3779527559055118, "grad_norm": 0.424340283488392, "learning_rate": 4.891511048751102e-06, "loss": 0.9593, "step": 72 }, { "epoch": 0.38320209973753283, "grad_norm": 0.40638719244646565, "learning_rate": 4.884677967584945e-06, "loss": 0.9264, "step": 73 }, { "epoch": 0.3884514435695538, "grad_norm": 0.4168684939745582, "learning_rate": 4.8776412907378845e-06, "loss": 0.9399, "step": 74 }, { "epoch": 0.3937007874015748, "grad_norm": 0.3928820462579945, "learning_rate": 4.870401618977415e-06, "loss": 0.9736, "step": 75 }, { "epoch": 0.3989501312335958, "grad_norm": 0.48005159652101537, "learning_rate": 4.86295957040205e-06, "loss": 0.9935, "step": 76 }, { "epoch": 0.4041994750656168, "grad_norm": 0.4229896582271501, "learning_rate": 4.855315780388541e-06, "loss": 0.9358, "step": 77 }, { "epoch": 0.4094488188976378, "grad_norm": 0.4258651800976577, "learning_rate": 4.847470901537642e-06, "loss": 0.9583, "step": 78 }, { "epoch": 0.4146981627296588, "grad_norm": 0.4069975102427097, "learning_rate": 4.839425603618382e-06, "loss": 0.9237, "step": 79 }, { "epoch": 0.4199475065616798, "grad_norm": 0.3853018091795304, "learning_rate": 4.83118057351089e-06, "loss": 0.9528, "step": 80 }, { "epoch": 0.4251968503937008, "grad_norm": 0.41640976933906576, "learning_rate": 4.822736515147748e-06, "loss": 0.9281, "step": 81 }, { "epoch": 0.4304461942257218, "grad_norm": 0.4341414623727107, "learning_rate": 4.814094149453891e-06, "loss": 0.983, "step": 82 }, { "epoch": 0.4356955380577428, "grad_norm": 0.41893835966431175, "learning_rate": 4.805254214285061e-06, "loss": 0.9691, "step": 83 }, { "epoch": 0.4409448818897638, "grad_norm": 0.3778203088676148, "learning_rate": 4.796217464364808e-06, "loss": 0.9386, "step": 84 }, { "epoch": 0.4461942257217848, "grad_norm": 0.4049971835755209, "learning_rate": 4.786984671220053e-06, "loss": 0.9604, "step": 85 }, { "epoch": 0.45144356955380577, "grad_norm": 0.39430021496671025, "learning_rate": 4.7775566231152216e-06, "loss": 0.9625, "step": 86 }, { "epoch": 0.4566929133858268, "grad_norm": 0.4303135030837173, "learning_rate": 4.767934124984941e-06, "loss": 0.9421, "step": 87 }, { "epoch": 0.46194225721784776, "grad_norm": 0.4037289734626591, "learning_rate": 4.7581179983653224e-06, "loss": 0.9395, "step": 88 }, { "epoch": 0.4671916010498688, "grad_norm": 0.389977925556389, "learning_rate": 4.7481090813238145e-06, "loss": 0.9494, "step": 89 }, { "epoch": 0.47244094488188976, "grad_norm": 0.4221791671439025, "learning_rate": 4.737908228387656e-06, "loss": 0.9236, "step": 90 }, { "epoch": 0.4776902887139108, "grad_norm": 0.38571752654505004, "learning_rate": 4.72751631047092e-06, "loss": 0.9836, "step": 91 }, { "epoch": 0.48293963254593175, "grad_norm": 0.3756683416227555, "learning_rate": 4.716934214800155e-06, "loss": 0.9847, "step": 92 }, { "epoch": 0.4881889763779528, "grad_norm": 0.43805821160946656, "learning_rate": 4.70616284483864e-06, "loss": 0.9759, "step": 93 }, { "epoch": 0.49343832020997375, "grad_norm": 0.42186430904629324, "learning_rate": 4.695203120209245e-06, "loss": 0.9381, "step": 94 }, { "epoch": 0.49868766404199477, "grad_norm": 0.3816672833085737, "learning_rate": 4.684055976615924e-06, "loss": 0.9381, "step": 95 }, { "epoch": 0.5039370078740157, "grad_norm": 0.4151475033902527, "learning_rate": 4.672722365763821e-06, "loss": 0.9449, "step": 96 }, { "epoch": 0.5091863517060368, "grad_norm": 0.39782263887391656, "learning_rate": 4.66120325527802e-06, "loss": 0.9617, "step": 97 }, { "epoch": 0.5144356955380578, "grad_norm": 0.40910931809868095, "learning_rate": 4.649499628620931e-06, "loss": 0.9334, "step": 98 }, { "epoch": 0.5196850393700787, "grad_norm": 0.3910070221772484, "learning_rate": 4.637612485008328e-06, "loss": 0.9344, "step": 99 }, { "epoch": 0.5249343832020997, "grad_norm": 0.3983780342477285, "learning_rate": 4.625542839324036e-06, "loss": 0.9201, "step": 100 }, { "epoch": 0.5301837270341208, "grad_norm": 0.41440216503056093, "learning_rate": 4.613291722033285e-06, "loss": 0.9854, "step": 101 }, { "epoch": 0.5354330708661418, "grad_norm": 0.399617487921351, "learning_rate": 4.600860179094732e-06, "loss": 0.9317, "step": 102 }, { "epoch": 0.5406824146981627, "grad_norm": 0.38269422826680943, "learning_rate": 4.588249271871164e-06, "loss": 0.9026, "step": 103 }, { "epoch": 0.5459317585301837, "grad_norm": 0.4161308186315819, "learning_rate": 4.575460077038877e-06, "loss": 0.9402, "step": 104 }, { "epoch": 0.5511811023622047, "grad_norm": 0.38768080702286994, "learning_rate": 4.562493686495756e-06, "loss": 0.9276, "step": 105 }, { "epoch": 0.5564304461942258, "grad_norm": 0.4454064598197059, "learning_rate": 4.5493512072680535e-06, "loss": 0.9452, "step": 106 }, { "epoch": 0.5616797900262467, "grad_norm": 0.3695333796660741, "learning_rate": 4.536033761415871e-06, "loss": 0.9493, "step": 107 }, { "epoch": 0.5669291338582677, "grad_norm": 0.4111826761670116, "learning_rate": 4.522542485937369e-06, "loss": 0.9109, "step": 108 }, { "epoch": 0.5721784776902887, "grad_norm": 0.384299356893817, "learning_rate": 4.508878532671684e-06, "loss": 0.938, "step": 109 }, { "epoch": 0.5774278215223098, "grad_norm": 0.39579785919606036, "learning_rate": 4.4950430682005995e-06, "loss": 0.924, "step": 110 }, { "epoch": 0.5826771653543307, "grad_norm": 0.40856237862457445, "learning_rate": 4.481037273748935e-06, "loss": 0.9092, "step": 111 }, { "epoch": 0.5879265091863517, "grad_norm": 0.38651275760437465, "learning_rate": 4.4668623450837085e-06, "loss": 0.9311, "step": 112 }, { "epoch": 0.5931758530183727, "grad_norm": 0.378464413540759, "learning_rate": 4.452519492412039e-06, "loss": 0.9255, "step": 113 }, { "epoch": 0.5984251968503937, "grad_norm": 0.3779607392785478, "learning_rate": 4.438009940277825e-06, "loss": 0.9024, "step": 114 }, { "epoch": 0.6036745406824147, "grad_norm": 0.410919812420786, "learning_rate": 4.423334927457198e-06, "loss": 0.9105, "step": 115 }, { "epoch": 0.6089238845144357, "grad_norm": 0.41911457133998964, "learning_rate": 4.408495706852758e-06, "loss": 0.9483, "step": 116 }, { "epoch": 0.6141732283464567, "grad_norm": 0.39458100856937656, "learning_rate": 4.393493545386607e-06, "loss": 0.9388, "step": 117 }, { "epoch": 0.6194225721784777, "grad_norm": 0.40200990450921853, "learning_rate": 4.378329723892184e-06, "loss": 0.927, "step": 118 }, { "epoch": 0.6246719160104987, "grad_norm": 0.42346347209583185, "learning_rate": 4.3630055370049065e-06, "loss": 0.9439, "step": 119 }, { "epoch": 0.6299212598425197, "grad_norm": 0.40678132773256936, "learning_rate": 4.3475222930516484e-06, "loss": 0.994, "step": 120 }, { "epoch": 0.6351706036745407, "grad_norm": 0.4215857320252809, "learning_rate": 4.3318813139390295e-06, "loss": 0.8946, "step": 121 }, { "epoch": 0.6404199475065617, "grad_norm": 0.39311078218513845, "learning_rate": 4.316083935040561e-06, "loss": 0.9129, "step": 122 }, { "epoch": 0.6456692913385826, "grad_norm": 0.38855202294392815, "learning_rate": 4.300131505082637e-06, "loss": 0.9229, "step": 123 }, { "epoch": 0.6509186351706037, "grad_norm": 0.437029643792513, "learning_rate": 4.284025386029381e-06, "loss": 0.9151, "step": 124 }, { "epoch": 0.6561679790026247, "grad_norm": 0.41975844377898647, "learning_rate": 4.267766952966369e-06, "loss": 0.9224, "step": 125 }, { "epoch": 0.6614173228346457, "grad_norm": 0.39675424131353054, "learning_rate": 4.251357593983228e-06, "loss": 0.9371, "step": 126 }, { "epoch": 0.6666666666666666, "grad_norm": 0.3980469727389334, "learning_rate": 4.234798710055124e-06, "loss": 0.928, "step": 127 }, { "epoch": 0.6719160104986877, "grad_norm": 0.3868709015712275, "learning_rate": 4.218091714923157e-06, "loss": 0.925, "step": 128 }, { "epoch": 0.6771653543307087, "grad_norm": 0.4150283358784954, "learning_rate": 4.2012380349736544e-06, "loss": 0.9632, "step": 129 }, { "epoch": 0.6824146981627297, "grad_norm": 0.3965769285465639, "learning_rate": 4.184239109116393e-06, "loss": 0.916, "step": 130 }, { "epoch": 0.6876640419947506, "grad_norm": 0.4040770175389451, "learning_rate": 4.167096388661754e-06, "loss": 0.9013, "step": 131 }, { "epoch": 0.6929133858267716, "grad_norm": 0.4065859256340288, "learning_rate": 4.149811337196808e-06, "loss": 0.9115, "step": 132 }, { "epoch": 0.6981627296587927, "grad_norm": 0.39095752489168806, "learning_rate": 4.132385430460361e-06, "loss": 0.9401, "step": 133 }, { "epoch": 0.7034120734908137, "grad_norm": 0.37457240781741025, "learning_rate": 4.114820156216969e-06, "loss": 0.9178, "step": 134 }, { "epoch": 0.7086614173228346, "grad_norm": 0.399797934048477, "learning_rate": 4.097117014129903e-06, "loss": 0.9579, "step": 135 }, { "epoch": 0.7139107611548556, "grad_norm": 0.4477976289426918, "learning_rate": 4.079277515633127e-06, "loss": 0.9448, "step": 136 }, { "epoch": 0.7191601049868767, "grad_norm": 0.38322925064614577, "learning_rate": 4.061303183802248e-06, "loss": 0.9192, "step": 137 }, { "epoch": 0.7244094488188977, "grad_norm": 0.40594305556063015, "learning_rate": 4.043195553224482e-06, "loss": 0.9254, "step": 138 }, { "epoch": 0.7296587926509186, "grad_norm": 0.36120175115175723, "learning_rate": 4.024956169867642e-06, "loss": 0.9591, "step": 139 }, { "epoch": 0.7349081364829396, "grad_norm": 0.38003421628995726, "learning_rate": 4.006586590948141e-06, "loss": 0.9154, "step": 140 }, { "epoch": 0.7401574803149606, "grad_norm": 0.3779196146369032, "learning_rate": 3.9880883847980475e-06, "loss": 0.9412, "step": 141 }, { "epoch": 0.7454068241469817, "grad_norm": 0.3908051818324559, "learning_rate": 3.969463130731183e-06, "loss": 0.9037, "step": 142 }, { "epoch": 0.7506561679790026, "grad_norm": 0.4043989800878751, "learning_rate": 3.95071241890829e-06, "loss": 0.9032, "step": 143 }, { "epoch": 0.7559055118110236, "grad_norm": 0.396255171092691, "learning_rate": 3.9318378502012636e-06, "loss": 0.9288, "step": 144 }, { "epoch": 0.7611548556430446, "grad_norm": 0.40757200005039956, "learning_rate": 3.91284103605648e-06, "loss": 0.9179, "step": 145 }, { "epoch": 0.7664041994750657, "grad_norm": 0.3862826523020415, "learning_rate": 3.893723598357214e-06, "loss": 0.8894, "step": 146 }, { "epoch": 0.7716535433070866, "grad_norm": 0.3770664118848218, "learning_rate": 3.874487169285168e-06, "loss": 0.8898, "step": 147 }, { "epoch": 0.7769028871391076, "grad_norm": 0.3854873052422692, "learning_rate": 3.855133391181124e-06, "loss": 0.9135, "step": 148 }, { "epoch": 0.7821522309711286, "grad_norm": 0.38997928953459443, "learning_rate": 3.835663916404721e-06, "loss": 0.8843, "step": 149 }, { "epoch": 0.7874015748031497, "grad_norm": 0.45712052623884775, "learning_rate": 3.81608040719339e-06, "loss": 0.9694, "step": 150 }, { "epoch": 0.7926509186351706, "grad_norm": 0.41182630339038373, "learning_rate": 3.7963845355204303e-06, "loss": 0.9194, "step": 151 }, { "epoch": 0.7979002624671916, "grad_norm": 0.4065794661619882, "learning_rate": 3.7765779829522674e-06, "loss": 0.9278, "step": 152 }, { "epoch": 0.8031496062992126, "grad_norm": 0.38122636394786497, "learning_rate": 3.7566624405048847e-06, "loss": 0.892, "step": 153 }, { "epoch": 0.8083989501312336, "grad_norm": 0.4026809828765533, "learning_rate": 3.736639608499448e-06, "loss": 0.9246, "step": 154 }, { "epoch": 0.8136482939632546, "grad_norm": 0.3884561226623423, "learning_rate": 3.7165111964171407e-06, "loss": 0.9438, "step": 155 }, { "epoch": 0.8188976377952756, "grad_norm": 0.3844815855908215, "learning_rate": 3.6962789227532165e-06, "loss": 0.9316, "step": 156 }, { "epoch": 0.8241469816272966, "grad_norm": 0.3715029261772477, "learning_rate": 3.675944514870274e-06, "loss": 0.924, "step": 157 }, { "epoch": 0.8293963254593176, "grad_norm": 0.4062567635907239, "learning_rate": 3.6555097088507837e-06, "loss": 0.9616, "step": 158 }, { "epoch": 0.8346456692913385, "grad_norm": 0.40370287012056855, "learning_rate": 3.634976249348867e-06, "loss": 0.9526, "step": 159 }, { "epoch": 0.8398950131233596, "grad_norm": 0.3899308479039462, "learning_rate": 3.6143458894413463e-06, "loss": 0.9215, "step": 160 }, { "epoch": 0.8451443569553806, "grad_norm": 0.3813874729696962, "learning_rate": 3.5936203904780665e-06, "loss": 0.953, "step": 161 }, { "epoch": 0.8503937007874016, "grad_norm": 0.38224710953621555, "learning_rate": 3.5728015219315226e-06, "loss": 0.894, "step": 162 }, { "epoch": 0.8556430446194225, "grad_norm": 0.4038493099805114, "learning_rate": 3.5518910612457885e-06, "loss": 0.9614, "step": 163 }, { "epoch": 0.8608923884514436, "grad_norm": 0.3976228258032158, "learning_rate": 3.530890793684759e-06, "loss": 0.9364, "step": 164 }, { "epoch": 0.8661417322834646, "grad_norm": 0.4141980981190029, "learning_rate": 3.5098025121797375e-06, "loss": 0.9316, "step": 165 }, { "epoch": 0.8713910761154856, "grad_norm": 0.4156903411116242, "learning_rate": 3.4886280171763563e-06, "loss": 0.923, "step": 166 }, { "epoch": 0.8766404199475065, "grad_norm": 0.38250613966133334, "learning_rate": 3.467369116480864e-06, "loss": 0.9153, "step": 167 }, { "epoch": 0.8818897637795275, "grad_norm": 0.4070344049084728, "learning_rate": 3.446027625105776e-06, "loss": 0.9347, "step": 168 }, { "epoch": 0.8871391076115486, "grad_norm": 0.3844877252304378, "learning_rate": 3.424605365114923e-06, "loss": 0.9214, "step": 169 }, { "epoch": 0.8923884514435696, "grad_norm": 0.37585915847896717, "learning_rate": 3.403104165467883e-06, "loss": 0.9133, "step": 170 }, { "epoch": 0.8976377952755905, "grad_norm": 0.6300998021233689, "learning_rate": 3.3815258618638316e-06, "loss": 0.9395, "step": 171 }, { "epoch": 0.9028871391076115, "grad_norm": 0.3994171061317929, "learning_rate": 3.359872296584821e-06, "loss": 0.917, "step": 172 }, { "epoch": 0.9081364829396326, "grad_norm": 0.3806190962949758, "learning_rate": 3.338145318338485e-06, "loss": 0.9408, "step": 173 }, { "epoch": 0.9133858267716536, "grad_norm": 0.36777162273585867, "learning_rate": 3.3163467821002082e-06, "loss": 0.9346, "step": 174 }, { "epoch": 0.9186351706036745, "grad_norm": 0.3927948155295108, "learning_rate": 3.2944785489547544e-06, "loss": 0.9121, "step": 175 }, { "epoch": 0.9238845144356955, "grad_norm": 0.3958182964876464, "learning_rate": 3.272542485937369e-06, "loss": 0.9318, "step": 176 }, { "epoch": 0.9291338582677166, "grad_norm": 0.39754399123912254, "learning_rate": 3.250540465874382e-06, "loss": 0.9244, "step": 177 }, { "epoch": 0.9343832020997376, "grad_norm": 0.38861358420887904, "learning_rate": 3.228474367223312e-06, "loss": 0.9051, "step": 178 }, { "epoch": 0.9396325459317585, "grad_norm": 0.3926071787199394, "learning_rate": 3.206346073912488e-06, "loss": 0.9409, "step": 179 }, { "epoch": 0.9448818897637795, "grad_norm": 0.4008739814562732, "learning_rate": 3.184157475180208e-06, "loss": 0.9222, "step": 180 }, { "epoch": 0.9501312335958005, "grad_norm": 0.3665607732753151, "learning_rate": 3.1619104654134397e-06, "loss": 0.913, "step": 181 }, { "epoch": 0.9553805774278216, "grad_norm": 0.38983081632202093, "learning_rate": 3.1396069439860894e-06, "loss": 0.9297, "step": 182 }, { "epoch": 0.9606299212598425, "grad_norm": 0.3645800135143814, "learning_rate": 3.117248815096833e-06, "loss": 0.8883, "step": 183 }, { "epoch": 0.9658792650918635, "grad_norm": 0.3720108758786826, "learning_rate": 3.094837987606547e-06, "loss": 0.9204, "step": 184 }, { "epoch": 0.9711286089238845, "grad_norm": 0.3818234182095755, "learning_rate": 3.0723763748753354e-06, "loss": 0.8814, "step": 185 }, { "epoch": 0.9763779527559056, "grad_norm": 0.3847151463563777, "learning_rate": 3.049865894599172e-06, "loss": 0.9133, "step": 186 }, { "epoch": 0.9816272965879265, "grad_norm": 0.37971775342950864, "learning_rate": 3.027308468646175e-06, "loss": 0.8906, "step": 187 }, { "epoch": 0.9868766404199475, "grad_norm": 0.41572124992250203, "learning_rate": 3.0047060228925256e-06, "loss": 0.9672, "step": 188 }, { "epoch": 0.9921259842519685, "grad_norm": 0.40219608509658256, "learning_rate": 2.9820604870580426e-06, "loss": 0.9011, "step": 189 }, { "epoch": 0.9973753280839895, "grad_norm": 0.397234257384178, "learning_rate": 2.9593737945414264e-06, "loss": 0.9174, "step": 190 }, { "epoch": 1.0, "grad_norm": 0.397234257384178, "learning_rate": 2.9366478822551973e-06, "loss": 0.9204, "step": 191 }, { "epoch": 1.005249343832021, "grad_norm": 0.6279719871788059, "learning_rate": 2.913884690460325e-06, "loss": 0.8376, "step": 192 }, { "epoch": 1.010498687664042, "grad_norm": 0.41242697308859033, "learning_rate": 2.8910861626005774e-06, "loss": 0.8369, "step": 193 }, { "epoch": 1.015748031496063, "grad_norm": 0.38981163388237194, "learning_rate": 2.8682542451365943e-06, "loss": 0.8493, "step": 194 }, { "epoch": 1.020997375328084, "grad_norm": 0.42318354204075453, "learning_rate": 2.845390887379706e-06, "loss": 0.8618, "step": 195 }, { "epoch": 1.026246719160105, "grad_norm": 0.5052809667467608, "learning_rate": 2.822498041325509e-06, "loss": 0.8644, "step": 196 }, { "epoch": 1.031496062992126, "grad_norm": 0.36960238923766053, "learning_rate": 2.7995776614872083e-06, "loss": 0.8484, "step": 197 }, { "epoch": 1.036745406824147, "grad_norm": 0.3606472920225704, "learning_rate": 2.776631704728752e-06, "loss": 0.8413, "step": 198 }, { "epoch": 1.041994750656168, "grad_norm": 0.38778609923815943, "learning_rate": 2.753662130097758e-06, "loss": 0.8266, "step": 199 }, { "epoch": 1.047244094488189, "grad_norm": 0.3636856280047818, "learning_rate": 2.730670898658255e-06, "loss": 0.8285, "step": 200 }, { "epoch": 1.05249343832021, "grad_norm": 0.3886565787437705, "learning_rate": 2.70765997332326e-06, "loss": 0.8628, "step": 201 }, { "epoch": 1.057742782152231, "grad_norm": 0.41378173196429036, "learning_rate": 2.684631318687185e-06, "loss": 0.8549, "step": 202 }, { "epoch": 1.0629921259842519, "grad_norm": 0.3657527566283362, "learning_rate": 2.661586900858111e-06, "loss": 0.8472, "step": 203 }, { "epoch": 1.068241469816273, "grad_norm": 0.3666340150026852, "learning_rate": 2.638528687289925e-06, "loss": 0.8331, "step": 204 }, { "epoch": 1.073490813648294, "grad_norm": 0.3661321106045701, "learning_rate": 2.6154586466143495e-06, "loss": 0.8706, "step": 205 }, { "epoch": 1.078740157480315, "grad_norm": 0.3796204270047528, "learning_rate": 2.592378748472863e-06, "loss": 0.8329, "step": 206 }, { "epoch": 1.083989501312336, "grad_norm": 0.4003268539729557, "learning_rate": 2.5692909633485414e-06, "loss": 0.8762, "step": 207 }, { "epoch": 1.0892388451443569, "grad_norm": 0.3816091507612548, "learning_rate": 2.546197262397825e-06, "loss": 0.8499, "step": 208 }, { "epoch": 1.094488188976378, "grad_norm": 0.4135433735229758, "learning_rate": 2.5230996172822274e-06, "loss": 0.8191, "step": 209 }, { "epoch": 1.099737532808399, "grad_norm": 0.38519176727175336, "learning_rate": 2.5e-06, "loss": 0.8164, "step": 210 }, { "epoch": 1.10498687664042, "grad_norm": 0.397224934471037, "learning_rate": 2.4769003827177735e-06, "loss": 0.8373, "step": 211 }, { "epoch": 1.110236220472441, "grad_norm": 0.37696141546345585, "learning_rate": 2.453802737602176e-06, "loss": 0.8575, "step": 212 }, { "epoch": 1.1154855643044619, "grad_norm": 0.38562550508165394, "learning_rate": 2.4307090366514594e-06, "loss": 0.8372, "step": 213 }, { "epoch": 1.120734908136483, "grad_norm": 0.392610655726213, "learning_rate": 2.4076212515271384e-06, "loss": 0.8561, "step": 214 }, { "epoch": 1.125984251968504, "grad_norm": 0.3752434088251031, "learning_rate": 2.3845413533856517e-06, "loss": 0.8539, "step": 215 }, { "epoch": 1.1312335958005248, "grad_norm": 0.3769283394800778, "learning_rate": 2.3614713127100752e-06, "loss": 0.8134, "step": 216 }, { "epoch": 1.136482939632546, "grad_norm": 0.39533971296250825, "learning_rate": 2.3384130991418896e-06, "loss": 0.8608, "step": 217 }, { "epoch": 1.141732283464567, "grad_norm": 0.5791866799791944, "learning_rate": 2.3153686813128153e-06, "loss": 0.8334, "step": 218 }, { "epoch": 1.1469816272965878, "grad_norm": 0.3810153033409976, "learning_rate": 2.2923400266767406e-06, "loss": 0.8472, "step": 219 }, { "epoch": 1.152230971128609, "grad_norm": 0.3728143509479016, "learning_rate": 2.269329101341745e-06, "loss": 0.8376, "step": 220 }, { "epoch": 1.1574803149606299, "grad_norm": 0.40305096887095054, "learning_rate": 2.246337869902243e-06, "loss": 0.8275, "step": 221 }, { "epoch": 1.162729658792651, "grad_norm": 0.3692851271916895, "learning_rate": 2.2233682952712484e-06, "loss": 0.8413, "step": 222 }, { "epoch": 1.167979002624672, "grad_norm": 0.3963527736664121, "learning_rate": 2.2004223385127925e-06, "loss": 0.8295, "step": 223 }, { "epoch": 1.1732283464566928, "grad_norm": 0.3587695306587395, "learning_rate": 2.1775019586744924e-06, "loss": 0.8547, "step": 224 }, { "epoch": 1.178477690288714, "grad_norm": 0.3614709935886563, "learning_rate": 2.1546091126202955e-06, "loss": 0.813, "step": 225 }, { "epoch": 1.1837270341207349, "grad_norm": 0.4140010983968987, "learning_rate": 2.131745754863406e-06, "loss": 0.8398, "step": 226 }, { "epoch": 1.188976377952756, "grad_norm": 0.45996885685295197, "learning_rate": 2.1089138373994226e-06, "loss": 0.8518, "step": 227 }, { "epoch": 1.194225721784777, "grad_norm": 0.36254259181474985, "learning_rate": 2.086115309539675e-06, "loss": 0.8647, "step": 228 }, { "epoch": 1.1994750656167978, "grad_norm": 0.4122269000782737, "learning_rate": 2.063352117744803e-06, "loss": 0.8881, "step": 229 }, { "epoch": 1.204724409448819, "grad_norm": 0.40615776242804974, "learning_rate": 2.040626205458574e-06, "loss": 0.8328, "step": 230 }, { "epoch": 1.20997375328084, "grad_norm": 0.41826600558691523, "learning_rate": 2.017939512941958e-06, "loss": 0.8281, "step": 231 }, { "epoch": 1.2152230971128608, "grad_norm": 0.4153271729906844, "learning_rate": 1.995293977107475e-06, "loss": 0.8693, "step": 232 }, { "epoch": 1.220472440944882, "grad_norm": 0.3654074693662248, "learning_rate": 1.972691531353826e-06, "loss": 0.821, "step": 233 }, { "epoch": 1.2257217847769029, "grad_norm": 0.3981178374801672, "learning_rate": 1.9501341054008292e-06, "loss": 0.8962, "step": 234 }, { "epoch": 1.2309711286089238, "grad_norm": 0.4049324135862524, "learning_rate": 1.9276236251246655e-06, "loss": 0.7905, "step": 235 }, { "epoch": 1.236220472440945, "grad_norm": 0.33631867862267323, "learning_rate": 1.9051620123934538e-06, "loss": 0.8284, "step": 236 }, { "epoch": 1.2414698162729658, "grad_norm": 0.37393386640045784, "learning_rate": 1.882751184903167e-06, "loss": 0.8405, "step": 237 }, { "epoch": 1.246719160104987, "grad_norm": 0.3462880519364805, "learning_rate": 1.860393056013911e-06, "loss": 0.7939, "step": 238 }, { "epoch": 1.2519685039370079, "grad_norm": 0.3848360075044728, "learning_rate": 1.8380895345865603e-06, "loss": 0.8375, "step": 239 }, { "epoch": 1.257217847769029, "grad_norm": 0.3617106402936481, "learning_rate": 1.8158425248197931e-06, "loss": 0.8162, "step": 240 }, { "epoch": 1.26246719160105, "grad_norm": 0.34772753646733273, "learning_rate": 1.7936539260875125e-06, "loss": 0.8408, "step": 241 }, { "epoch": 1.2677165354330708, "grad_norm": 0.36406532287683085, "learning_rate": 1.7715256327766887e-06, "loss": 0.8103, "step": 242 }, { "epoch": 1.272965879265092, "grad_norm": 0.3765574533307364, "learning_rate": 1.7494595341256185e-06, "loss": 0.8461, "step": 243 }, { "epoch": 1.2782152230971129, "grad_norm": 0.3859435308304487, "learning_rate": 1.7274575140626318e-06, "loss": 0.8761, "step": 244 }, { "epoch": 1.2834645669291338, "grad_norm": 0.3936691957558663, "learning_rate": 1.7055214510452462e-06, "loss": 0.9159, "step": 245 }, { "epoch": 1.288713910761155, "grad_norm": 0.5107365379023212, "learning_rate": 1.6836532178997922e-06, "loss": 0.8649, "step": 246 }, { "epoch": 1.2939632545931758, "grad_norm": 0.388972979502565, "learning_rate": 1.6618546816615162e-06, "loss": 0.8734, "step": 247 }, { "epoch": 1.2992125984251968, "grad_norm": 0.4410810635653612, "learning_rate": 1.6401277034151798e-06, "loss": 0.8405, "step": 248 }, { "epoch": 1.304461942257218, "grad_norm": 0.38502463729196623, "learning_rate": 1.6184741381361684e-06, "loss": 0.8546, "step": 249 }, { "epoch": 1.3097112860892388, "grad_norm": 0.38032181472927906, "learning_rate": 1.5968958345321178e-06, "loss": 0.8253, "step": 250 }, { "epoch": 1.3149606299212597, "grad_norm": 0.36970392450888717, "learning_rate": 1.5753946348850774e-06, "loss": 0.8558, "step": 251 }, { "epoch": 1.3202099737532809, "grad_norm": 0.3934484470589474, "learning_rate": 1.5539723748942246e-06, "loss": 0.8299, "step": 252 }, { "epoch": 1.3254593175853018, "grad_norm": 0.3602834073019444, "learning_rate": 1.5326308835191372e-06, "loss": 0.8476, "step": 253 }, { "epoch": 1.330708661417323, "grad_norm": 0.3482676328694225, "learning_rate": 1.5113719828236439e-06, "loss": 0.858, "step": 254 }, { "epoch": 1.3359580052493438, "grad_norm": 0.44035367856347457, "learning_rate": 1.490197487820263e-06, "loss": 0.8351, "step": 255 }, { "epoch": 1.341207349081365, "grad_norm": 0.371715281121202, "learning_rate": 1.4691092063152417e-06, "loss": 0.827, "step": 256 }, { "epoch": 1.3464566929133859, "grad_norm": 0.35070038567136974, "learning_rate": 1.4481089387542134e-06, "loss": 0.8492, "step": 257 }, { "epoch": 1.3517060367454068, "grad_norm": 0.3499117772233927, "learning_rate": 1.4271984780684778e-06, "loss": 0.8399, "step": 258 }, { "epoch": 1.356955380577428, "grad_norm": 0.3811483044855955, "learning_rate": 1.4063796095219345e-06, "loss": 0.8688, "step": 259 }, { "epoch": 1.3622047244094488, "grad_norm": 0.3714007196954483, "learning_rate": 1.3856541105586545e-06, "loss": 0.813, "step": 260 }, { "epoch": 1.3674540682414698, "grad_norm": 0.3577252388966486, "learning_rate": 1.3650237506511333e-06, "loss": 0.8506, "step": 261 }, { "epoch": 1.372703412073491, "grad_norm": 0.3670679328461459, "learning_rate": 1.3444902911492174e-06, "loss": 0.8267, "step": 262 }, { "epoch": 1.3779527559055118, "grad_norm": 0.3521888751612076, "learning_rate": 1.324055485129727e-06, "loss": 0.8079, "step": 263 }, { "epoch": 1.3832020997375327, "grad_norm": 0.3751986141403746, "learning_rate": 1.303721077246784e-06, "loss": 0.8491, "step": 264 }, { "epoch": 1.3884514435695539, "grad_norm": 0.3589588384505892, "learning_rate": 1.2834888035828597e-06, "loss": 0.8768, "step": 265 }, { "epoch": 1.3937007874015748, "grad_norm": 0.372280310354174, "learning_rate": 1.2633603915005535e-06, "loss": 0.8335, "step": 266 }, { "epoch": 1.3989501312335957, "grad_norm": 0.40534475247811924, "learning_rate": 1.2433375594951166e-06, "loss": 0.8719, "step": 267 }, { "epoch": 1.4041994750656168, "grad_norm": 0.40997625659212245, "learning_rate": 1.2234220170477332e-06, "loss": 0.8269, "step": 268 }, { "epoch": 1.4094488188976377, "grad_norm": 0.3668404917652971, "learning_rate": 1.2036154644795697e-06, "loss": 0.7913, "step": 269 }, { "epoch": 1.4146981627296589, "grad_norm": 0.3641040286199539, "learning_rate": 1.1839195928066101e-06, "loss": 0.8469, "step": 270 }, { "epoch": 1.4199475065616798, "grad_norm": 0.36110674154533795, "learning_rate": 1.164336083595279e-06, "loss": 0.8698, "step": 271 }, { "epoch": 1.425196850393701, "grad_norm": 0.376323216718325, "learning_rate": 1.1448666088188766e-06, "loss": 0.8355, "step": 272 }, { "epoch": 1.4304461942257218, "grad_norm": 0.37039873117700245, "learning_rate": 1.1255128307148319e-06, "loss": 0.8219, "step": 273 }, { "epoch": 1.4356955380577427, "grad_norm": 0.3612230977145218, "learning_rate": 1.1062764016427864e-06, "loss": 0.8568, "step": 274 }, { "epoch": 1.4409448818897639, "grad_norm": 0.3962177187198973, "learning_rate": 1.0871589639435204e-06, "loss": 0.8115, "step": 275 }, { "epoch": 1.4461942257217848, "grad_norm": 0.38303094180829605, "learning_rate": 1.068162149798737e-06, "loss": 0.818, "step": 276 }, { "epoch": 1.4514435695538057, "grad_norm": 0.364448052838668, "learning_rate": 1.049287581091711e-06, "loss": 0.803, "step": 277 }, { "epoch": 1.4566929133858268, "grad_norm": 0.37746023670056045, "learning_rate": 1.0305368692688175e-06, "loss": 0.8495, "step": 278 }, { "epoch": 1.4619422572178478, "grad_norm": 0.40984945067844814, "learning_rate": 1.0119116152019535e-06, "loss": 0.8631, "step": 279 }, { "epoch": 1.4671916010498687, "grad_norm": 0.3840327770579671, "learning_rate": 9.934134090518593e-07, "loss": 0.8318, "step": 280 }, { "epoch": 1.4724409448818898, "grad_norm": 0.3575557636540703, "learning_rate": 9.750438301323584e-07, "loss": 0.8244, "step": 281 }, { "epoch": 1.4776902887139107, "grad_norm": 0.363642443404998, "learning_rate": 9.56804446775518e-07, "loss": 0.8109, "step": 282 }, { "epoch": 1.4829396325459316, "grad_norm": 0.3410295671236108, "learning_rate": 9.386968161977528e-07, "loss": 0.8227, "step": 283 }, { "epoch": 1.4881889763779528, "grad_norm": 0.3693264048471325, "learning_rate": 9.207224843668733e-07, "loss": 0.8349, "step": 284 }, { "epoch": 1.4934383202099737, "grad_norm": 0.3912247837861982, "learning_rate": 9.028829858700974e-07, "loss": 0.8218, "step": 285 }, { "epoch": 1.4986876640419948, "grad_norm": 0.3771536484006796, "learning_rate": 8.851798437830323e-07, "loss": 0.8328, "step": 286 }, { "epoch": 1.5039370078740157, "grad_norm": 0.3956914054034924, "learning_rate": 8.676145695396399e-07, "loss": 0.8779, "step": 287 }, { "epoch": 1.5091863517060369, "grad_norm": 0.34503047792761804, "learning_rate": 8.501886628031941e-07, "loss": 0.8265, "step": 288 }, { "epoch": 1.5144356955380578, "grad_norm": 0.3888965654379534, "learning_rate": 8.329036113382474e-07, "loss": 0.8354, "step": 289 }, { "epoch": 1.5196850393700787, "grad_norm": 0.38477861699293, "learning_rate": 8.157608908836071e-07, "loss": 0.8109, "step": 290 }, { "epoch": 1.5249343832020998, "grad_norm": 0.378643274506092, "learning_rate": 7.987619650263462e-07, "loss": 0.8507, "step": 291 }, { "epoch": 1.5301837270341208, "grad_norm": 0.3619947185774947, "learning_rate": 7.819082850768433e-07, "loss": 0.8321, "step": 292 }, { "epoch": 1.5354330708661417, "grad_norm": 0.3793890733316615, "learning_rate": 7.652012899448761e-07, "loss": 0.846, "step": 293 }, { "epoch": 1.5406824146981628, "grad_norm": 0.37104232865461917, "learning_rate": 7.486424060167726e-07, "loss": 0.8113, "step": 294 }, { "epoch": 1.5459317585301837, "grad_norm": 2.1051715634012864, "learning_rate": 7.322330470336314e-07, "loss": 0.8174, "step": 295 }, { "epoch": 1.5511811023622046, "grad_norm": 0.45390059444688674, "learning_rate": 7.159746139706194e-07, "loss": 0.8414, "step": 296 }, { "epoch": 1.5564304461942258, "grad_norm": 0.39115421496994207, "learning_rate": 6.99868494917364e-07, "loss": 0.8085, "step": 297 }, { "epoch": 1.5616797900262467, "grad_norm": 0.37250458373292433, "learning_rate": 6.839160649594401e-07, "loss": 0.7906, "step": 298 }, { "epoch": 1.5669291338582676, "grad_norm": 0.3746838213451273, "learning_rate": 6.68118686060972e-07, "loss": 0.8314, "step": 299 }, { "epoch": 1.5721784776902887, "grad_norm": 0.5365749531577879, "learning_rate": 6.524777069483526e-07, "loss": 0.8332, "step": 300 }, { "epoch": 1.5774278215223099, "grad_norm": 0.3788491507367419, "learning_rate": 6.369944629950933e-07, "loss": 0.8611, "step": 301 }, { "epoch": 1.5826771653543306, "grad_norm": 0.3690009715495826, "learning_rate": 6.216702761078167e-07, "loss": 0.8099, "step": 302 }, { "epoch": 1.5879265091863517, "grad_norm": 0.34643553497540036, "learning_rate": 6.06506454613393e-07, "loss": 0.8255, "step": 303 }, { "epoch": 1.5931758530183728, "grad_norm": 0.37384009617772274, "learning_rate": 5.915042931472426e-07, "loss": 0.8024, "step": 304 }, { "epoch": 1.5984251968503937, "grad_norm": 0.34112911330910745, "learning_rate": 5.766650725428027e-07, "loss": 0.8172, "step": 305 }, { "epoch": 1.6036745406824147, "grad_norm": 0.3675924179740712, "learning_rate": 5.619900597221753e-07, "loss": 0.8195, "step": 306 }, { "epoch": 1.6089238845144358, "grad_norm": 0.36296097997746496, "learning_rate": 5.474805075879616e-07, "loss": 0.808, "step": 307 }, { "epoch": 1.6141732283464567, "grad_norm": 0.37901033900237063, "learning_rate": 5.33137654916292e-07, "loss": 0.8068, "step": 308 }, { "epoch": 1.6194225721784776, "grad_norm": 0.3662350700773784, "learning_rate": 5.189627262510655e-07, "loss": 0.8695, "step": 309 }, { "epoch": 1.6246719160104988, "grad_norm": 0.36967015418887056, "learning_rate": 5.049569317994013e-07, "loss": 0.8448, "step": 310 }, { "epoch": 1.6299212598425197, "grad_norm": 0.3579476966594129, "learning_rate": 4.911214673283157e-07, "loss": 0.8375, "step": 311 }, { "epoch": 1.6351706036745406, "grad_norm": 0.3611024965491316, "learning_rate": 4.774575140626317e-07, "loss": 0.8519, "step": 312 }, { "epoch": 1.6404199475065617, "grad_norm": 0.3485470659035558, "learning_rate": 4.639662385841293e-07, "loss": 0.8217, "step": 313 }, { "epoch": 1.6456692913385826, "grad_norm": 0.37287096492671606, "learning_rate": 4.506487927319475e-07, "loss": 0.8413, "step": 314 }, { "epoch": 1.6509186351706036, "grad_norm": 0.38705573070810245, "learning_rate": 4.3750631350424456e-07, "loss": 0.8499, "step": 315 }, { "epoch": 1.6561679790026247, "grad_norm": 0.3582574621798544, "learning_rate": 4.2453992296112384e-07, "loss": 0.8422, "step": 316 }, { "epoch": 1.6614173228346458, "grad_norm": 0.36916485545424343, "learning_rate": 4.117507281288366e-07, "loss": 0.8312, "step": 317 }, { "epoch": 1.6666666666666665, "grad_norm": 0.36367894508342596, "learning_rate": 3.991398209052685e-07, "loss": 0.8342, "step": 318 }, { "epoch": 1.6719160104986877, "grad_norm": 0.39384159873675784, "learning_rate": 3.8670827796671637e-07, "loss": 0.8006, "step": 319 }, { "epoch": 1.6771653543307088, "grad_norm": 0.39604366934016394, "learning_rate": 3.7445716067596506e-07, "loss": 0.8556, "step": 320 }, { "epoch": 1.6824146981627297, "grad_norm": 0.3672901516398384, "learning_rate": 3.623875149916725e-07, "loss": 0.8621, "step": 321 }, { "epoch": 1.6876640419947506, "grad_norm": 0.38389464950775926, "learning_rate": 3.505003713790689e-07, "loss": 0.8343, "step": 322 }, { "epoch": 1.6929133858267718, "grad_norm": 0.36767539320644105, "learning_rate": 3.387967447219803e-07, "loss": 0.842, "step": 323 }, { "epoch": 1.6981627296587927, "grad_norm": 0.3907752041858798, "learning_rate": 3.2727763423617915e-07, "loss": 0.8767, "step": 324 }, { "epoch": 1.7034120734908136, "grad_norm": 0.37713933030901214, "learning_rate": 3.1594402338407633e-07, "loss": 0.8326, "step": 325 }, { "epoch": 1.7086614173228347, "grad_norm": 0.38192173068300844, "learning_rate": 3.047968797907552e-07, "loss": 0.8168, "step": 326 }, { "epoch": 1.7139107611548556, "grad_norm": 0.35632236660696437, "learning_rate": 2.9383715516136083e-07, "loss": 0.8344, "step": 327 }, { "epoch": 1.7191601049868765, "grad_norm": 0.39281712398945773, "learning_rate": 2.8306578519984526e-07, "loss": 0.8057, "step": 328 }, { "epoch": 1.7244094488188977, "grad_norm": 0.3713265865015316, "learning_rate": 2.7248368952908055e-07, "loss": 0.8128, "step": 329 }, { "epoch": 1.7296587926509186, "grad_norm": 0.3704631773575278, "learning_rate": 2.620917716123444e-07, "loss": 0.8593, "step": 330 }, { "epoch": 1.7349081364829395, "grad_norm": 0.3833563384401533, "learning_rate": 2.5189091867618615e-07, "loss": 0.8353, "step": 331 }, { "epoch": 1.7401574803149606, "grad_norm": 0.39756909736931284, "learning_rate": 2.418820016346779e-07, "loss": 0.8596, "step": 332 }, { "epoch": 1.7454068241469818, "grad_norm": 0.36212623044011794, "learning_rate": 2.3206587501505866e-07, "loss": 0.8311, "step": 333 }, { "epoch": 1.7506561679790025, "grad_norm": 0.36098218836000906, "learning_rate": 2.224433768847789e-07, "loss": 0.8091, "step": 334 }, { "epoch": 1.7559055118110236, "grad_norm": 0.3713293844163563, "learning_rate": 2.1301532877994747e-07, "loss": 0.8147, "step": 335 }, { "epoch": 1.7611548556430447, "grad_norm": 0.34562968172819897, "learning_rate": 2.0378253563519247e-07, "loss": 0.8284, "step": 336 }, { "epoch": 1.7664041994750657, "grad_norm": 0.39461152581923564, "learning_rate": 1.9474578571493874e-07, "loss": 0.8632, "step": 337 }, { "epoch": 1.7716535433070866, "grad_norm": 0.3689417081438894, "learning_rate": 1.859058505461095e-07, "loss": 0.8259, "step": 338 }, { "epoch": 1.7769028871391077, "grad_norm": 0.3926353728632124, "learning_rate": 1.7726348485225337e-07, "loss": 0.8364, "step": 339 }, { "epoch": 1.7821522309711286, "grad_norm": 0.354022004999706, "learning_rate": 1.6881942648911077e-07, "loss": 0.8773, "step": 340 }, { "epoch": 1.7874015748031495, "grad_norm": 0.3876142010040814, "learning_rate": 1.6057439638161891e-07, "loss": 0.82, "step": 341 }, { "epoch": 1.7926509186351707, "grad_norm": 0.35474834193863947, "learning_rate": 1.5252909846235898e-07, "loss": 0.8193, "step": 342 }, { "epoch": 1.7979002624671916, "grad_norm": 0.35916164129360717, "learning_rate": 1.4468421961145924e-07, "loss": 0.8398, "step": 343 }, { "epoch": 1.8031496062992125, "grad_norm": 0.3592392678728242, "learning_rate": 1.3704042959795132e-07, "loss": 0.8384, "step": 344 }, { "epoch": 1.8083989501312336, "grad_norm": 0.3717050715996407, "learning_rate": 1.2959838102258537e-07, "loss": 0.827, "step": 345 }, { "epoch": 1.8136482939632546, "grad_norm": 0.3714676540830701, "learning_rate": 1.223587092621162e-07, "loss": 0.811, "step": 346 }, { "epoch": 1.8188976377952755, "grad_norm": 0.36420858424939995, "learning_rate": 1.1532203241505474e-07, "loss": 0.8769, "step": 347 }, { "epoch": 1.8241469816272966, "grad_norm": 0.3993728488042469, "learning_rate": 1.0848895124889819e-07, "loss": 0.8437, "step": 348 }, { "epoch": 1.8293963254593177, "grad_norm": 0.36458761423411645, "learning_rate": 1.0186004914883779e-07, "loss": 0.8378, "step": 349 }, { "epoch": 1.8346456692913384, "grad_norm": 0.3874822926899934, "learning_rate": 9.54358920679524e-08, "loss": 0.8284, "step": 350 }, { "epoch": 1.8398950131233596, "grad_norm": 0.38256154200443654, "learning_rate": 8.921702847888791e-08, "loss": 0.8602, "step": 351 }, { "epoch": 1.8451443569553807, "grad_norm": 0.36179666269244404, "learning_rate": 8.320398932703145e-08, "loss": 0.8274, "step": 352 }, { "epoch": 1.8503937007874016, "grad_norm": 0.365807165050502, "learning_rate": 7.739728798518115e-08, "loss": 0.8709, "step": 353 }, { "epoch": 1.8556430446194225, "grad_norm": 0.35458800620057285, "learning_rate": 7.17974202097152e-08, "loss": 0.8111, "step": 354 }, { "epoch": 1.8608923884514437, "grad_norm": 0.37957333795063364, "learning_rate": 6.640486409826785e-08, "loss": 0.7994, "step": 355 }, { "epoch": 1.8661417322834646, "grad_norm": 0.3527171222570775, "learning_rate": 6.12200800489085e-08, "loss": 0.853, "step": 356 }, { "epoch": 1.8713910761154855, "grad_norm": 0.39242714191446193, "learning_rate": 5.624351072083561e-08, "loss": 0.8203, "step": 357 }, { "epoch": 1.8766404199475066, "grad_norm": 0.35707175993984824, "learning_rate": 5.1475580996585285e-08, "loss": 0.7958, "step": 358 }, { "epoch": 1.8818897637795275, "grad_norm": 0.3335686552470708, "learning_rate": 4.691669794575388e-08, "loss": 0.8433, "step": 359 }, { "epoch": 1.8871391076115485, "grad_norm": 0.38462069858291403, "learning_rate": 4.256725079024554e-08, "loss": 0.7975, "step": 360 }, { "epoch": 1.8923884514435696, "grad_norm": 0.34131290146851945, "learning_rate": 3.8427610871041024e-08, "loss": 0.8223, "step": 361 }, { "epoch": 1.8976377952755905, "grad_norm": 0.35781280354041417, "learning_rate": 3.449813161649357e-08, "loss": 0.9063, "step": 362 }, { "epoch": 1.9028871391076114, "grad_norm": 0.33992014890280325, "learning_rate": 3.077914851215585e-08, "loss": 0.8081, "step": 363 }, { "epoch": 1.9081364829396326, "grad_norm": 0.3582145429279079, "learning_rate": 2.7270979072135106e-08, "loss": 0.8487, "step": 364 }, { "epoch": 1.9133858267716537, "grad_norm": 0.36355378789809917, "learning_rate": 2.3973922811987295e-08, "loss": 0.8128, "step": 365 }, { "epoch": 1.9186351706036744, "grad_norm": 0.36811282667120626, "learning_rate": 2.0888261223143136e-08, "loss": 0.8442, "step": 366 }, { "epoch": 1.9238845144356955, "grad_norm": 0.36592765014539297, "learning_rate": 1.8014257748877606e-08, "loss": 0.8385, "step": 367 }, { "epoch": 1.9291338582677167, "grad_norm": 0.3695353834129384, "learning_rate": 1.5352157761815978e-08, "loss": 0.809, "step": 368 }, { "epoch": 1.9343832020997376, "grad_norm": 0.3907850295180158, "learning_rate": 1.2902188542986139e-08, "loss": 0.8295, "step": 369 }, { "epoch": 1.9396325459317585, "grad_norm": 0.34933462418116523, "learning_rate": 1.0664559262413831e-08, "loss": 0.8238, "step": 370 }, { "epoch": 1.9448818897637796, "grad_norm": 0.3537875486215043, "learning_rate": 8.639460961263612e-09, "loss": 0.8377, "step": 371 }, { "epoch": 1.9501312335958005, "grad_norm": 0.35563927647325777, "learning_rate": 6.827066535529947e-09, "loss": 0.7943, "step": 372 }, { "epoch": 1.9553805774278215, "grad_norm": 0.3474400330562922, "learning_rate": 5.2275307212742986e-09, "loss": 0.8511, "step": 373 }, { "epoch": 1.9606299212598426, "grad_norm": 0.3593397542649248, "learning_rate": 3.840990081415141e-09, "loss": 0.8186, "step": 374 }, { "epoch": 1.9658792650918635, "grad_norm": 0.3612580709923566, "learning_rate": 2.6675629940689508e-09, "loss": 0.848, "step": 375 }, { "epoch": 1.9711286089238844, "grad_norm": 0.3614238458155997, "learning_rate": 1.707349642442735e-09, "loss": 0.7933, "step": 376 }, { "epoch": 1.9763779527559056, "grad_norm": 0.39289720311660936, "learning_rate": 9.604320062814309e-10, "loss": 0.8323, "step": 377 }, { "epoch": 1.9816272965879265, "grad_norm": 0.36167265662489084, "learning_rate": 4.268738548682261e-10, "loss": 0.8512, "step": 378 }, { "epoch": 1.9868766404199474, "grad_norm": 0.3690750146631276, "learning_rate": 1.0672074158030176e-10, "loss": 0.832, "step": 379 }, { "epoch": 1.9921259842519685, "grad_norm": 0.35408134628230764, "learning_rate": 0.0, "loss": 0.8172, "step": 380 } ], "logging_steps": 1, "max_steps": 380, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 95, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.333503833071944e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }