{ "best_metric": 10.745776176452637, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.043215211754537596, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000216076058772688, "grad_norm": 0.200728178024292, "learning_rate": 3.3333333333333333e-06, "loss": 10.8463, "step": 1 }, { "epoch": 0.000216076058772688, "eval_loss": 10.84037971496582, "eval_runtime": 27.2248, "eval_samples_per_second": 286.32, "eval_steps_per_second": 143.178, "step": 1 }, { "epoch": 0.000432152117545376, "grad_norm": 0.23658832907676697, "learning_rate": 6.666666666666667e-06, "loss": 10.8472, "step": 2 }, { "epoch": 0.0006482281763180639, "grad_norm": 0.20040327310562134, "learning_rate": 1e-05, "loss": 10.8415, "step": 3 }, { "epoch": 0.000864304235090752, "grad_norm": 0.2336120903491974, "learning_rate": 1.3333333333333333e-05, "loss": 10.828, "step": 4 }, { "epoch": 0.00108038029386344, "grad_norm": 0.20422425866127014, "learning_rate": 1.6666666666666667e-05, "loss": 10.8437, "step": 5 }, { "epoch": 0.0012964563526361278, "grad_norm": 0.2417161911725998, "learning_rate": 2e-05, "loss": 10.8422, "step": 6 }, { "epoch": 0.001512532411408816, "grad_norm": 0.22463224828243256, "learning_rate": 2.3333333333333336e-05, "loss": 10.8167, "step": 7 }, { "epoch": 0.001728608470181504, "grad_norm": 0.22320792078971863, "learning_rate": 2.6666666666666667e-05, "loss": 10.8394, "step": 8 }, { "epoch": 0.0019446845289541918, "grad_norm": 0.22033357620239258, "learning_rate": 3e-05, "loss": 10.8351, "step": 9 }, { "epoch": 0.00216076058772688, "grad_norm": 0.20691785216331482, "learning_rate": 3.3333333333333335e-05, "loss": 10.8433, "step": 10 }, { "epoch": 0.0023768366464995676, "grad_norm": 0.21809300780296326, "learning_rate": 3.6666666666666666e-05, "loss": 10.8364, "step": 11 }, { "epoch": 0.0025929127052722557, "grad_norm": 0.2736707031726837, "learning_rate": 4e-05, "loss": 10.8364, "step": 12 }, { "epoch": 0.0028089887640449437, "grad_norm": 0.24762022495269775, "learning_rate": 4.3333333333333334e-05, "loss": 10.8266, "step": 13 }, { "epoch": 0.003025064822817632, "grad_norm": 0.24276190996170044, "learning_rate": 4.666666666666667e-05, "loss": 10.8375, "step": 14 }, { "epoch": 0.00324114088159032, "grad_norm": 0.23069171607494354, "learning_rate": 5e-05, "loss": 10.8393, "step": 15 }, { "epoch": 0.003457216940363008, "grad_norm": 0.24690602719783783, "learning_rate": 5.333333333333333e-05, "loss": 10.8348, "step": 16 }, { "epoch": 0.003673292999135696, "grad_norm": 0.23763756453990936, "learning_rate": 5.666666666666667e-05, "loss": 10.8294, "step": 17 }, { "epoch": 0.0038893690579083835, "grad_norm": 0.23151656985282898, "learning_rate": 6e-05, "loss": 10.8329, "step": 18 }, { "epoch": 0.004105445116681072, "grad_norm": 0.23155605792999268, "learning_rate": 6.333333333333333e-05, "loss": 10.8299, "step": 19 }, { "epoch": 0.00432152117545376, "grad_norm": 0.2375253140926361, "learning_rate": 6.666666666666667e-05, "loss": 10.8228, "step": 20 }, { "epoch": 0.004537597234226448, "grad_norm": 0.31918394565582275, "learning_rate": 7e-05, "loss": 10.8364, "step": 21 }, { "epoch": 0.004753673292999135, "grad_norm": 0.2590275704860687, "learning_rate": 7.333333333333333e-05, "loss": 10.8279, "step": 22 }, { "epoch": 0.004969749351771823, "grad_norm": 0.3064877986907959, "learning_rate": 7.666666666666667e-05, "loss": 10.8388, "step": 23 }, { "epoch": 0.005185825410544511, "grad_norm": 0.3065643310546875, "learning_rate": 8e-05, "loss": 10.8247, "step": 24 }, { "epoch": 0.0054019014693171994, "grad_norm": 0.2851259708404541, "learning_rate": 8.333333333333334e-05, "loss": 10.8244, "step": 25 }, { "epoch": 0.0056179775280898875, "grad_norm": 0.30654624104499817, "learning_rate": 8.666666666666667e-05, "loss": 10.8216, "step": 26 }, { "epoch": 0.0058340535868625755, "grad_norm": 0.2787514328956604, "learning_rate": 9e-05, "loss": 10.8265, "step": 27 }, { "epoch": 0.006050129645635264, "grad_norm": 0.314116895198822, "learning_rate": 9.333333333333334e-05, "loss": 10.8232, "step": 28 }, { "epoch": 0.006266205704407952, "grad_norm": 0.28672200441360474, "learning_rate": 9.666666666666667e-05, "loss": 10.8145, "step": 29 }, { "epoch": 0.00648228176318064, "grad_norm": 0.27016451954841614, "learning_rate": 0.0001, "loss": 10.8084, "step": 30 }, { "epoch": 0.006698357821953328, "grad_norm": 0.2747277617454529, "learning_rate": 9.999146252290264e-05, "loss": 10.81, "step": 31 }, { "epoch": 0.006914433880726016, "grad_norm": 0.29148223996162415, "learning_rate": 9.996585300715116e-05, "loss": 10.813, "step": 32 }, { "epoch": 0.007130509939498704, "grad_norm": 0.3096551299095154, "learning_rate": 9.99231801983717e-05, "loss": 10.802, "step": 33 }, { "epoch": 0.007346585998271392, "grad_norm": 0.3342970609664917, "learning_rate": 9.986345866928941e-05, "loss": 10.8117, "step": 34 }, { "epoch": 0.00756266205704408, "grad_norm": 0.32004502415657043, "learning_rate": 9.978670881475172e-05, "loss": 10.7982, "step": 35 }, { "epoch": 0.007778738115816767, "grad_norm": 0.3316425383090973, "learning_rate": 9.96929568447637e-05, "loss": 10.8037, "step": 36 }, { "epoch": 0.007994814174589455, "grad_norm": 0.32472503185272217, "learning_rate": 9.958223477553714e-05, "loss": 10.7961, "step": 37 }, { "epoch": 0.008210890233362144, "grad_norm": 0.3180193603038788, "learning_rate": 9.94545804185573e-05, "loss": 10.7925, "step": 38 }, { "epoch": 0.008426966292134831, "grad_norm": 0.34710487723350525, "learning_rate": 9.931003736767013e-05, "loss": 10.7994, "step": 39 }, { "epoch": 0.00864304235090752, "grad_norm": 0.34599560499191284, "learning_rate": 9.91486549841951e-05, "loss": 10.7997, "step": 40 }, { "epoch": 0.008859118409680207, "grad_norm": 0.4090355634689331, "learning_rate": 9.89704883800683e-05, "loss": 10.7918, "step": 41 }, { "epoch": 0.009075194468452896, "grad_norm": 0.38032764196395874, "learning_rate": 9.877559839902184e-05, "loss": 10.7877, "step": 42 }, { "epoch": 0.009291270527225583, "grad_norm": 0.3663756847381592, "learning_rate": 9.85640515958057e-05, "loss": 10.7845, "step": 43 }, { "epoch": 0.00950734658599827, "grad_norm": 0.38913989067077637, "learning_rate": 9.833592021345937e-05, "loss": 10.7775, "step": 44 }, { "epoch": 0.00972342264477096, "grad_norm": 0.4066934585571289, "learning_rate": 9.809128215864097e-05, "loss": 10.7805, "step": 45 }, { "epoch": 0.009939498703543647, "grad_norm": 0.40702107548713684, "learning_rate": 9.783022097502204e-05, "loss": 10.766, "step": 46 }, { "epoch": 0.010155574762316336, "grad_norm": 0.5095567107200623, "learning_rate": 9.755282581475769e-05, "loss": 10.7686, "step": 47 }, { "epoch": 0.010371650821089023, "grad_norm": 0.5398594737052917, "learning_rate": 9.725919140804099e-05, "loss": 10.7674, "step": 48 }, { "epoch": 0.010587726879861712, "grad_norm": 0.5338243246078491, "learning_rate": 9.694941803075283e-05, "loss": 10.7631, "step": 49 }, { "epoch": 0.010803802938634399, "grad_norm": 0.7691417932510376, "learning_rate": 9.662361147021779e-05, "loss": 10.7645, "step": 50 }, { "epoch": 0.010803802938634399, "eval_loss": 10.766061782836914, "eval_runtime": 27.2467, "eval_samples_per_second": 286.089, "eval_steps_per_second": 143.063, "step": 50 }, { "epoch": 0.011019878997407088, "grad_norm": 0.31955963373184204, "learning_rate": 9.628188298907782e-05, "loss": 10.7833, "step": 51 }, { "epoch": 0.011235955056179775, "grad_norm": 0.2600715458393097, "learning_rate": 9.592434928729616e-05, "loss": 10.7863, "step": 52 }, { "epoch": 0.011452031114952464, "grad_norm": 0.25826898217201233, "learning_rate": 9.555113246230442e-05, "loss": 10.7734, "step": 53 }, { "epoch": 0.011668107173725151, "grad_norm": 0.33920153975486755, "learning_rate": 9.516235996730645e-05, "loss": 10.7821, "step": 54 }, { "epoch": 0.01188418323249784, "grad_norm": 0.2760711908340454, "learning_rate": 9.475816456775313e-05, "loss": 10.7771, "step": 55 }, { "epoch": 0.012100259291270527, "grad_norm": 0.2651015818119049, "learning_rate": 9.43386842960031e-05, "loss": 10.7812, "step": 56 }, { "epoch": 0.012316335350043216, "grad_norm": 0.3452089726924896, "learning_rate": 9.39040624041849e-05, "loss": 10.7577, "step": 57 }, { "epoch": 0.012532411408815903, "grad_norm": 0.24794746935367584, "learning_rate": 9.345444731527642e-05, "loss": 10.7618, "step": 58 }, { "epoch": 0.01274848746758859, "grad_norm": 0.26710182428359985, "learning_rate": 9.298999257241863e-05, "loss": 10.7622, "step": 59 }, { "epoch": 0.01296456352636128, "grad_norm": 0.2514508068561554, "learning_rate": 9.251085678648072e-05, "loss": 10.7781, "step": 60 }, { "epoch": 0.013180639585133967, "grad_norm": 0.2650161385536194, "learning_rate": 9.201720358189464e-05, "loss": 10.7779, "step": 61 }, { "epoch": 0.013396715643906655, "grad_norm": 0.2767448127269745, "learning_rate": 9.150920154077754e-05, "loss": 10.7757, "step": 62 }, { "epoch": 0.013612791702679343, "grad_norm": 0.29482796788215637, "learning_rate": 9.098702414536107e-05, "loss": 10.771, "step": 63 }, { "epoch": 0.013828867761452032, "grad_norm": 0.274749755859375, "learning_rate": 9.045084971874738e-05, "loss": 10.7587, "step": 64 }, { "epoch": 0.014044943820224719, "grad_norm": 0.21438376605510712, "learning_rate": 8.9900861364012e-05, "loss": 10.773, "step": 65 }, { "epoch": 0.014261019878997408, "grad_norm": 0.2738189995288849, "learning_rate": 8.933724690167417e-05, "loss": 10.7707, "step": 66 }, { "epoch": 0.014477095937770095, "grad_norm": 0.2180059552192688, "learning_rate": 8.876019880555649e-05, "loss": 10.7649, "step": 67 }, { "epoch": 0.014693171996542784, "grad_norm": 0.2755647301673889, "learning_rate": 8.816991413705516e-05, "loss": 10.7556, "step": 68 }, { "epoch": 0.014909248055315471, "grad_norm": 0.2628243863582611, "learning_rate": 8.756659447784368e-05, "loss": 10.7485, "step": 69 }, { "epoch": 0.01512532411408816, "grad_norm": 0.27653780579566956, "learning_rate": 8.695044586103296e-05, "loss": 10.7742, "step": 70 }, { "epoch": 0.015341400172860847, "grad_norm": 0.292253315448761, "learning_rate": 8.632167870081121e-05, "loss": 10.7542, "step": 71 }, { "epoch": 0.015557476231633534, "grad_norm": 0.21869610249996185, "learning_rate": 8.568050772058762e-05, "loss": 10.7724, "step": 72 }, { "epoch": 0.015773552290406223, "grad_norm": 0.20132863521575928, "learning_rate": 8.502715187966455e-05, "loss": 10.7621, "step": 73 }, { "epoch": 0.01598962834917891, "grad_norm": 0.2435745745897293, "learning_rate": 8.436183429846313e-05, "loss": 10.7661, "step": 74 }, { "epoch": 0.016205704407951597, "grad_norm": 0.23909196257591248, "learning_rate": 8.368478218232787e-05, "loss": 10.7694, "step": 75 }, { "epoch": 0.016421780466724288, "grad_norm": 0.22482334077358246, "learning_rate": 8.299622674393614e-05, "loss": 10.7642, "step": 76 }, { "epoch": 0.016637856525496975, "grad_norm": 0.2314310371875763, "learning_rate": 8.229640312433937e-05, "loss": 10.7584, "step": 77 }, { "epoch": 0.016853932584269662, "grad_norm": 0.23524075746536255, "learning_rate": 8.158555031266254e-05, "loss": 10.7632, "step": 78 }, { "epoch": 0.01707000864304235, "grad_norm": 0.20763476192951202, "learning_rate": 8.086391106448965e-05, "loss": 10.7533, "step": 79 }, { "epoch": 0.01728608470181504, "grad_norm": 0.1953965127468109, "learning_rate": 8.013173181896283e-05, "loss": 10.7633, "step": 80 }, { "epoch": 0.017502160760587727, "grad_norm": 0.1644563525915146, "learning_rate": 7.938926261462366e-05, "loss": 10.764, "step": 81 }, { "epoch": 0.017718236819360415, "grad_norm": 0.2148687094449997, "learning_rate": 7.863675700402526e-05, "loss": 10.7566, "step": 82 }, { "epoch": 0.017934312878133102, "grad_norm": 0.2654571831226349, "learning_rate": 7.787447196714427e-05, "loss": 10.7444, "step": 83 }, { "epoch": 0.018150388936905792, "grad_norm": 0.231702521443367, "learning_rate": 7.710266782362247e-05, "loss": 10.753, "step": 84 }, { "epoch": 0.01836646499567848, "grad_norm": 0.26304373145103455, "learning_rate": 7.63216081438678e-05, "loss": 10.7559, "step": 85 }, { "epoch": 0.018582541054451167, "grad_norm": 0.19173169136047363, "learning_rate": 7.553155965904535e-05, "loss": 10.7675, "step": 86 }, { "epoch": 0.018798617113223854, "grad_norm": 0.2556131184101105, "learning_rate": 7.473279216998895e-05, "loss": 10.7589, "step": 87 }, { "epoch": 0.01901469317199654, "grad_norm": 0.22893981635570526, "learning_rate": 7.392557845506432e-05, "loss": 10.7599, "step": 88 }, { "epoch": 0.019230769230769232, "grad_norm": 0.21988166868686676, "learning_rate": 7.311019417701566e-05, "loss": 10.7564, "step": 89 }, { "epoch": 0.01944684528954192, "grad_norm": 0.24298734962940216, "learning_rate": 7.228691778882693e-05, "loss": 10.7532, "step": 90 }, { "epoch": 0.019662921348314606, "grad_norm": 0.21422868967056274, "learning_rate": 7.145603043863045e-05, "loss": 10.7588, "step": 91 }, { "epoch": 0.019878997407087293, "grad_norm": 0.18029530346393585, "learning_rate": 7.061781587369519e-05, "loss": 10.7508, "step": 92 }, { "epoch": 0.020095073465859984, "grad_norm": 0.29323095083236694, "learning_rate": 6.977256034352712e-05, "loss": 10.7534, "step": 93 }, { "epoch": 0.02031114952463267, "grad_norm": 0.24313682317733765, "learning_rate": 6.892055250211552e-05, "loss": 10.753, "step": 94 }, { "epoch": 0.02052722558340536, "grad_norm": 0.30919376015663147, "learning_rate": 6.806208330935766e-05, "loss": 10.7502, "step": 95 }, { "epoch": 0.020743301642178046, "grad_norm": 0.3091332018375397, "learning_rate": 6.719744593169641e-05, "loss": 10.7492, "step": 96 }, { "epoch": 0.020959377700950736, "grad_norm": 0.31597936153411865, "learning_rate": 6.632693564200416e-05, "loss": 10.7353, "step": 97 }, { "epoch": 0.021175453759723423, "grad_norm": 0.41740843653678894, "learning_rate": 6.545084971874738e-05, "loss": 10.753, "step": 98 }, { "epoch": 0.02139152981849611, "grad_norm": 0.3725014328956604, "learning_rate": 6.456948734446624e-05, "loss": 10.72, "step": 99 }, { "epoch": 0.021607605877268798, "grad_norm": 0.6240119934082031, "learning_rate": 6.368314950360415e-05, "loss": 10.7379, "step": 100 }, { "epoch": 0.021607605877268798, "eval_loss": 10.751439094543457, "eval_runtime": 27.1534, "eval_samples_per_second": 287.073, "eval_steps_per_second": 143.555, "step": 100 }, { "epoch": 0.021823681936041485, "grad_norm": 0.21080626547336578, "learning_rate": 6.279213887972179e-05, "loss": 10.7706, "step": 101 }, { "epoch": 0.022039757994814176, "grad_norm": 0.20775455236434937, "learning_rate": 6.189675975213094e-05, "loss": 10.7617, "step": 102 }, { "epoch": 0.022255834053586863, "grad_norm": 0.1883164793252945, "learning_rate": 6.099731789198344e-05, "loss": 10.7622, "step": 103 }, { "epoch": 0.02247191011235955, "grad_norm": 0.22989457845687866, "learning_rate": 6.009412045785051e-05, "loss": 10.767, "step": 104 }, { "epoch": 0.022687986171132237, "grad_norm": 0.21413500607013702, "learning_rate": 5.918747589082853e-05, "loss": 10.7754, "step": 105 }, { "epoch": 0.022904062229904928, "grad_norm": 0.20524542033672333, "learning_rate": 5.82776938092065e-05, "loss": 10.7728, "step": 106 }, { "epoch": 0.023120138288677615, "grad_norm": 0.276962012052536, "learning_rate": 5.736508490273188e-05, "loss": 10.7504, "step": 107 }, { "epoch": 0.023336214347450302, "grad_norm": 0.2452148199081421, "learning_rate": 5.644996082651017e-05, "loss": 10.7616, "step": 108 }, { "epoch": 0.02355229040622299, "grad_norm": 0.25284409523010254, "learning_rate": 5.553263409457504e-05, "loss": 10.7604, "step": 109 }, { "epoch": 0.02376836646499568, "grad_norm": 0.247440367937088, "learning_rate": 5.4613417973165106e-05, "loss": 10.7641, "step": 110 }, { "epoch": 0.023984442523768367, "grad_norm": 0.18067599833011627, "learning_rate": 5.3692626373743706e-05, "loss": 10.7746, "step": 111 }, { "epoch": 0.024200518582541054, "grad_norm": 0.19137042760849, "learning_rate": 5.27705737457985e-05, "loss": 10.7714, "step": 112 }, { "epoch": 0.02441659464131374, "grad_norm": 0.2149994820356369, "learning_rate": 5.184757496945726e-05, "loss": 10.7733, "step": 113 }, { "epoch": 0.024632670700086432, "grad_norm": 0.20906798541545868, "learning_rate": 5.092394524795649e-05, "loss": 10.7726, "step": 114 }, { "epoch": 0.02484874675885912, "grad_norm": 0.24666734039783478, "learning_rate": 5e-05, "loss": 10.7536, "step": 115 }, { "epoch": 0.025064822817631807, "grad_norm": 0.23656940460205078, "learning_rate": 4.907605475204352e-05, "loss": 10.7602, "step": 116 }, { "epoch": 0.025280898876404494, "grad_norm": 0.25157082080841064, "learning_rate": 4.8152425030542766e-05, "loss": 10.7619, "step": 117 }, { "epoch": 0.02549697493517718, "grad_norm": 0.2460283637046814, "learning_rate": 4.72294262542015e-05, "loss": 10.7613, "step": 118 }, { "epoch": 0.02571305099394987, "grad_norm": 0.1980646699666977, "learning_rate": 4.6307373626256306e-05, "loss": 10.7639, "step": 119 }, { "epoch": 0.02592912705272256, "grad_norm": 0.20079004764556885, "learning_rate": 4.5386582026834906e-05, "loss": 10.7587, "step": 120 }, { "epoch": 0.026145203111495246, "grad_norm": 0.22045651078224182, "learning_rate": 4.446736590542497e-05, "loss": 10.7678, "step": 121 }, { "epoch": 0.026361279170267933, "grad_norm": 0.22736740112304688, "learning_rate": 4.3550039173489845e-05, "loss": 10.7542, "step": 122 }, { "epoch": 0.026577355229040624, "grad_norm": 0.17214606702327728, "learning_rate": 4.2634915097268115e-05, "loss": 10.7608, "step": 123 }, { "epoch": 0.02679343128781331, "grad_norm": 0.18513911962509155, "learning_rate": 4.1722306190793495e-05, "loss": 10.7495, "step": 124 }, { "epoch": 0.027009507346585998, "grad_norm": 0.2161649465560913, "learning_rate": 4.0812524109171476e-05, "loss": 10.7578, "step": 125 }, { "epoch": 0.027225583405358685, "grad_norm": 0.2477547526359558, "learning_rate": 3.99058795421495e-05, "loss": 10.7566, "step": 126 }, { "epoch": 0.027441659464131376, "grad_norm": 0.19045643508434296, "learning_rate": 3.9002682108016585e-05, "loss": 10.753, "step": 127 }, { "epoch": 0.027657735522904063, "grad_norm": 0.2026197761297226, "learning_rate": 3.8103240247869075e-05, "loss": 10.7643, "step": 128 }, { "epoch": 0.02787381158167675, "grad_norm": 0.23443067073822021, "learning_rate": 3.720786112027822e-05, "loss": 10.7545, "step": 129 }, { "epoch": 0.028089887640449437, "grad_norm": 0.21837399899959564, "learning_rate": 3.631685049639586e-05, "loss": 10.7466, "step": 130 }, { "epoch": 0.028305963699222125, "grad_norm": 0.228903129696846, "learning_rate": 3.543051265553377e-05, "loss": 10.7553, "step": 131 }, { "epoch": 0.028522039757994815, "grad_norm": 0.22999686002731323, "learning_rate": 3.4549150281252636e-05, "loss": 10.7365, "step": 132 }, { "epoch": 0.028738115816767502, "grad_norm": 0.2587832808494568, "learning_rate": 3.367306435799584e-05, "loss": 10.7495, "step": 133 }, { "epoch": 0.02895419187554019, "grad_norm": 0.2272672802209854, "learning_rate": 3.2802554068303596e-05, "loss": 10.7485, "step": 134 }, { "epoch": 0.029170267934312877, "grad_norm": 0.1977323740720749, "learning_rate": 3.1937916690642356e-05, "loss": 10.7478, "step": 135 }, { "epoch": 0.029386343993085567, "grad_norm": 0.20933398604393005, "learning_rate": 3.107944749788449e-05, "loss": 10.759, "step": 136 }, { "epoch": 0.029602420051858255, "grad_norm": 0.2519747316837311, "learning_rate": 3.0227439656472877e-05, "loss": 10.7503, "step": 137 }, { "epoch": 0.029818496110630942, "grad_norm": 0.2388688176870346, "learning_rate": 2.9382184126304834e-05, "loss": 10.7351, "step": 138 }, { "epoch": 0.03003457216940363, "grad_norm": 0.29849356412887573, "learning_rate": 2.8543969561369556e-05, "loss": 10.7364, "step": 139 }, { "epoch": 0.03025064822817632, "grad_norm": 0.2356027066707611, "learning_rate": 2.771308221117309e-05, "loss": 10.7399, "step": 140 }, { "epoch": 0.030466724286949007, "grad_norm": 0.25474485754966736, "learning_rate": 2.688980582298435e-05, "loss": 10.74, "step": 141 }, { "epoch": 0.030682800345721694, "grad_norm": 0.23734347522258759, "learning_rate": 2.607442154493568e-05, "loss": 10.7393, "step": 142 }, { "epoch": 0.03089887640449438, "grad_norm": 0.28821271657943726, "learning_rate": 2.5267207830011068e-05, "loss": 10.7336, "step": 143 }, { "epoch": 0.03111495246326707, "grad_norm": 0.2773192822933197, "learning_rate": 2.446844034095466e-05, "loss": 10.7414, "step": 144 }, { "epoch": 0.031331028522039756, "grad_norm": 0.3964485824108124, "learning_rate": 2.3678391856132204e-05, "loss": 10.742, "step": 145 }, { "epoch": 0.031547104580812446, "grad_norm": 0.33878329396247864, "learning_rate": 2.2897332176377528e-05, "loss": 10.7489, "step": 146 }, { "epoch": 0.03176318063958514, "grad_norm": 0.3486817479133606, "learning_rate": 2.2125528032855724e-05, "loss": 10.7379, "step": 147 }, { "epoch": 0.03197925669835782, "grad_norm": 0.38555988669395447, "learning_rate": 2.136324299597474e-05, "loss": 10.7374, "step": 148 }, { "epoch": 0.03219533275713051, "grad_norm": 0.521436333656311, "learning_rate": 2.061073738537635e-05, "loss": 10.7291, "step": 149 }, { "epoch": 0.032411408815903195, "grad_norm": 0.8618770241737366, "learning_rate": 1.9868268181037185e-05, "loss": 10.7151, "step": 150 }, { "epoch": 0.032411408815903195, "eval_loss": 10.746877670288086, "eval_runtime": 27.2327, "eval_samples_per_second": 286.237, "eval_steps_per_second": 143.137, "step": 150 }, { "epoch": 0.032627484874675886, "grad_norm": 0.2572970688343048, "learning_rate": 1.9136088935510362e-05, "loss": 10.7659, "step": 151 }, { "epoch": 0.032843560933448576, "grad_norm": 0.23141218721866608, "learning_rate": 1.8414449687337464e-05, "loss": 10.7649, "step": 152 }, { "epoch": 0.03305963699222126, "grad_norm": 0.2419702559709549, "learning_rate": 1.7703596875660645e-05, "loss": 10.7615, "step": 153 }, { "epoch": 0.03327571305099395, "grad_norm": 0.19218285381793976, "learning_rate": 1.700377325606388e-05, "loss": 10.7609, "step": 154 }, { "epoch": 0.03349178910976664, "grad_norm": 0.2367192953824997, "learning_rate": 1.631521781767214e-05, "loss": 10.7647, "step": 155 }, { "epoch": 0.033707865168539325, "grad_norm": 0.21695514023303986, "learning_rate": 1.5638165701536868e-05, "loss": 10.765, "step": 156 }, { "epoch": 0.033923941227312016, "grad_norm": 0.2253107875585556, "learning_rate": 1.4972848120335453e-05, "loss": 10.7699, "step": 157 }, { "epoch": 0.0341400172860847, "grad_norm": 0.19992785155773163, "learning_rate": 1.4319492279412388e-05, "loss": 10.7592, "step": 158 }, { "epoch": 0.03435609334485739, "grad_norm": 0.21194936335086823, "learning_rate": 1.3678321299188801e-05, "loss": 10.7629, "step": 159 }, { "epoch": 0.03457216940363008, "grad_norm": 0.26007580757141113, "learning_rate": 1.3049554138967051e-05, "loss": 10.7648, "step": 160 }, { "epoch": 0.034788245462402764, "grad_norm": 0.22751381993293762, "learning_rate": 1.2433405522156332e-05, "loss": 10.7603, "step": 161 }, { "epoch": 0.035004321521175455, "grad_norm": 0.2545979619026184, "learning_rate": 1.183008586294485e-05, "loss": 10.7617, "step": 162 }, { "epoch": 0.03522039757994814, "grad_norm": 0.20070402324199677, "learning_rate": 1.1239801194443506e-05, "loss": 10.7547, "step": 163 }, { "epoch": 0.03543647363872083, "grad_norm": 0.20730213820934296, "learning_rate": 1.066275309832584e-05, "loss": 10.761, "step": 164 }, { "epoch": 0.03565254969749352, "grad_norm": 0.2222391963005066, "learning_rate": 1.0099138635988026e-05, "loss": 10.7707, "step": 165 }, { "epoch": 0.035868625756266204, "grad_norm": 0.21017976105213165, "learning_rate": 9.549150281252633e-06, "loss": 10.7582, "step": 166 }, { "epoch": 0.036084701815038894, "grad_norm": 0.21349570155143738, "learning_rate": 9.012975854638949e-06, "loss": 10.7534, "step": 167 }, { "epoch": 0.036300777873811585, "grad_norm": 0.19311493635177612, "learning_rate": 8.490798459222476e-06, "loss": 10.7536, "step": 168 }, { "epoch": 0.03651685393258427, "grad_norm": 0.18329201638698578, "learning_rate": 7.982796418105371e-06, "loss": 10.7595, "step": 169 }, { "epoch": 0.03673292999135696, "grad_norm": 0.24617359042167664, "learning_rate": 7.489143213519301e-06, "loss": 10.7562, "step": 170 }, { "epoch": 0.03694900605012964, "grad_norm": 0.24574460089206696, "learning_rate": 7.010007427581378e-06, "loss": 10.7496, "step": 171 }, { "epoch": 0.037165082108902334, "grad_norm": 0.2094903290271759, "learning_rate": 6.5455526847235825e-06, "loss": 10.7454, "step": 172 }, { "epoch": 0.037381158167675024, "grad_norm": 0.23481670022010803, "learning_rate": 6.0959375958151045e-06, "loss": 10.7503, "step": 173 }, { "epoch": 0.03759723422644771, "grad_norm": 0.20529882609844208, "learning_rate": 5.6613157039969055e-06, "loss": 10.7551, "step": 174 }, { "epoch": 0.0378133102852204, "grad_norm": 0.2087268829345703, "learning_rate": 5.241835432246889e-06, "loss": 10.7512, "step": 175 }, { "epoch": 0.03802938634399308, "grad_norm": 0.24301747977733612, "learning_rate": 4.837640032693558e-06, "loss": 10.7587, "step": 176 }, { "epoch": 0.03824546240276577, "grad_norm": 0.24005451798439026, "learning_rate": 4.448867537695578e-06, "loss": 10.7615, "step": 177 }, { "epoch": 0.038461538461538464, "grad_norm": 0.2205115705728531, "learning_rate": 4.075650712703849e-06, "loss": 10.7405, "step": 178 }, { "epoch": 0.03867761452031115, "grad_norm": 0.22657622396945953, "learning_rate": 3.71811701092219e-06, "loss": 10.7544, "step": 179 }, { "epoch": 0.03889369057908384, "grad_norm": 0.24251677095890045, "learning_rate": 3.376388529782215e-06, "loss": 10.7535, "step": 180 }, { "epoch": 0.03910976663785653, "grad_norm": 0.22308377921581268, "learning_rate": 3.0505819692471792e-06, "loss": 10.7536, "step": 181 }, { "epoch": 0.03932584269662921, "grad_norm": 0.2320922166109085, "learning_rate": 2.7408085919590264e-06, "loss": 10.7482, "step": 182 }, { "epoch": 0.0395419187554019, "grad_norm": 0.23464159667491913, "learning_rate": 2.4471741852423237e-06, "loss": 10.7496, "step": 183 }, { "epoch": 0.03975799481417459, "grad_norm": 0.21937261521816254, "learning_rate": 2.1697790249779636e-06, "loss": 10.7502, "step": 184 }, { "epoch": 0.03997407087294728, "grad_norm": 0.23580560088157654, "learning_rate": 1.908717841359048e-06, "loss": 10.7532, "step": 185 }, { "epoch": 0.04019014693171997, "grad_norm": 0.2424219399690628, "learning_rate": 1.6640797865406288e-06, "loss": 10.745, "step": 186 }, { "epoch": 0.04040622299049265, "grad_norm": 0.2548362612724304, "learning_rate": 1.4359484041943038e-06, "loss": 10.7458, "step": 187 }, { "epoch": 0.04062229904926534, "grad_norm": 0.31250232458114624, "learning_rate": 1.2244016009781701e-06, "loss": 10.744, "step": 188 }, { "epoch": 0.040838375108038026, "grad_norm": 0.2745141088962555, "learning_rate": 1.0295116199317057e-06, "loss": 10.7386, "step": 189 }, { "epoch": 0.04105445116681072, "grad_norm": 0.2804180383682251, "learning_rate": 8.513450158049108e-07, "loss": 10.7314, "step": 190 }, { "epoch": 0.04127052722558341, "grad_norm": 0.2679109573364258, "learning_rate": 6.899626323298713e-07, "loss": 10.7439, "step": 191 }, { "epoch": 0.04148660328435609, "grad_norm": 0.22815419733524323, "learning_rate": 5.454195814427021e-07, "loss": 10.7384, "step": 192 }, { "epoch": 0.04170267934312878, "grad_norm": 0.2827376127243042, "learning_rate": 4.177652244628627e-07, "loss": 10.7385, "step": 193 }, { "epoch": 0.04191875540190147, "grad_norm": 0.3146594762802124, "learning_rate": 3.0704315523631953e-07, "loss": 10.7451, "step": 194 }, { "epoch": 0.042134831460674156, "grad_norm": 0.30587294697761536, "learning_rate": 2.1329118524827662e-07, "loss": 10.7321, "step": 195 }, { "epoch": 0.04235090751944685, "grad_norm": 0.26626572012901306, "learning_rate": 1.3654133071059893e-07, "loss": 10.7262, "step": 196 }, { "epoch": 0.04256698357821953, "grad_norm": 0.3111208975315094, "learning_rate": 7.681980162830282e-08, "loss": 10.7239, "step": 197 }, { "epoch": 0.04278305963699222, "grad_norm": 0.4671022891998291, "learning_rate": 3.4146992848854695e-08, "loss": 10.7375, "step": 198 }, { "epoch": 0.04299913569576491, "grad_norm": 0.520592987537384, "learning_rate": 8.537477097364522e-09, "loss": 10.7301, "step": 199 }, { "epoch": 0.043215211754537596, "grad_norm": 0.5407875776290894, "learning_rate": 0.0, "loss": 10.6871, "step": 200 }, { "epoch": 0.043215211754537596, "eval_loss": 10.745776176452637, "eval_runtime": 27.2162, "eval_samples_per_second": 286.41, "eval_steps_per_second": 143.223, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 139523405119488.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }