{ "best_metric": 1.6292288303375244, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.9956382669380633, "eval_steps": 50, "global_step": 214, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004652515266065717, "grad_norm": 3.1542959213256836, "learning_rate": 2e-05, "loss": 5.9362, "step": 1 }, { "epoch": 0.004652515266065717, "eval_loss": 5.9479522705078125, "eval_runtime": 20.8376, "eval_samples_per_second": 34.745, "eval_steps_per_second": 8.686, "step": 1 }, { "epoch": 0.009305030532131433, "grad_norm": 3.2570960521698, "learning_rate": 4e-05, "loss": 5.9508, "step": 2 }, { "epoch": 0.01395754579819715, "grad_norm": 2.7825522422790527, "learning_rate": 6e-05, "loss": 5.7132, "step": 3 }, { "epoch": 0.018610061064262867, "grad_norm": 3.4090566635131836, "learning_rate": 8e-05, "loss": 5.9836, "step": 4 }, { "epoch": 0.023262576330328584, "grad_norm": 3.439707040786743, "learning_rate": 0.0001, "loss": 5.7045, "step": 5 }, { "epoch": 0.0279150915963943, "grad_norm": 3.6563377380371094, "learning_rate": 0.00012, "loss": 5.8168, "step": 6 }, { "epoch": 0.03256760686246002, "grad_norm": 4.006112098693848, "learning_rate": 0.00014, "loss": 4.9581, "step": 7 }, { "epoch": 0.03722012212852573, "grad_norm": 4.275187015533447, "learning_rate": 0.00016, "loss": 4.7639, "step": 8 }, { "epoch": 0.041872637394591454, "grad_norm": 4.293889999389648, "learning_rate": 0.00018, "loss": 4.3398, "step": 9 }, { "epoch": 0.04652515266065717, "grad_norm": 4.2498321533203125, "learning_rate": 0.0002, "loss": 4.2132, "step": 10 }, { "epoch": 0.05117766792672288, "grad_norm": 3.1846446990966797, "learning_rate": 0.0001999881422898077, "loss": 4.0895, "step": 11 }, { "epoch": 0.0558301831927886, "grad_norm": 4.165881633758545, "learning_rate": 0.0001999525719713366, "loss": 3.5577, "step": 12 }, { "epoch": 0.06048269845885432, "grad_norm": 4.896974563598633, "learning_rate": 0.00019989329748023725, "loss": 3.7735, "step": 13 }, { "epoch": 0.06513521372492004, "grad_norm": 4.080206394195557, "learning_rate": 0.00019981033287370443, "loss": 3.4187, "step": 14 }, { "epoch": 0.06978772899098575, "grad_norm": 3.606748580932617, "learning_rate": 0.0001997036978271433, "loss": 3.1502, "step": 15 }, { "epoch": 0.07444024425705147, "grad_norm": 2.8589611053466797, "learning_rate": 0.00019957341762950344, "loss": 2.8054, "step": 16 }, { "epoch": 0.07909275952311719, "grad_norm": 2.743023633956909, "learning_rate": 0.00019941952317728147, "loss": 2.774, "step": 17 }, { "epoch": 0.08374527478918291, "grad_norm": 2.8571105003356934, "learning_rate": 0.0001992420509671936, "loss": 2.902, "step": 18 }, { "epoch": 0.08839779005524862, "grad_norm": 2.666266679763794, "learning_rate": 0.0001990410430875205, "loss": 2.8819, "step": 19 }, { "epoch": 0.09305030532131434, "grad_norm": 2.3131182193756104, "learning_rate": 0.00019881654720812594, "loss": 2.7994, "step": 20 }, { "epoch": 0.09770282058738006, "grad_norm": 2.048189640045166, "learning_rate": 0.00019856861656915143, "loss": 2.5718, "step": 21 }, { "epoch": 0.10235533585344576, "grad_norm": 1.92734694480896, "learning_rate": 0.0001982973099683902, "loss": 2.6741, "step": 22 }, { "epoch": 0.10700785111951149, "grad_norm": 1.5441073179244995, "learning_rate": 0.0001980026917473432, "loss": 2.4727, "step": 23 }, { "epoch": 0.1116603663855772, "grad_norm": 1.761895775794983, "learning_rate": 0.0001976848317759601, "loss": 2.3773, "step": 24 }, { "epoch": 0.11631288165164291, "grad_norm": 1.940789818763733, "learning_rate": 0.0001973438054360693, "loss": 2.4848, "step": 25 }, { "epoch": 0.12096539691770863, "grad_norm": 1.9712603092193604, "learning_rate": 0.00019697969360350098, "loss": 2.4821, "step": 26 }, { "epoch": 0.12561791218377436, "grad_norm": 1.70182466506958, "learning_rate": 0.00019659258262890683, "loss": 2.238, "step": 27 }, { "epoch": 0.13027042744984008, "grad_norm": 1.6575437784194946, "learning_rate": 0.00019618256431728194, "loss": 2.6379, "step": 28 }, { "epoch": 0.1349229427159058, "grad_norm": 1.502398133277893, "learning_rate": 0.00019574973590619243, "loss": 2.2527, "step": 29 }, { "epoch": 0.1395754579819715, "grad_norm": 1.6205648183822632, "learning_rate": 0.00019529420004271567, "loss": 2.2776, "step": 30 }, { "epoch": 0.1442279732480372, "grad_norm": 1.4457286596298218, "learning_rate": 0.0001948160647590966, "loss": 2.2438, "step": 31 }, { "epoch": 0.14888048851410293, "grad_norm": 1.5694420337677002, "learning_rate": 0.00019431544344712776, "loss": 2.1488, "step": 32 }, { "epoch": 0.15353300378016865, "grad_norm": 1.890031337738037, "learning_rate": 0.00019379245483125784, "loss": 2.0873, "step": 33 }, { "epoch": 0.15818551904623437, "grad_norm": 1.6246261596679688, "learning_rate": 0.00019324722294043558, "loss": 2.163, "step": 34 }, { "epoch": 0.1628380343123001, "grad_norm": 1.498121976852417, "learning_rate": 0.00019267987707869606, "loss": 2.0497, "step": 35 }, { "epoch": 0.16749054957836582, "grad_norm": 1.6934443712234497, "learning_rate": 0.0001920905517944954, "loss": 2.2254, "step": 36 }, { "epoch": 0.1721430648444315, "grad_norm": 2.14957594871521, "learning_rate": 0.0001914793868488021, "loss": 2.4021, "step": 37 }, { "epoch": 0.17679558011049723, "grad_norm": 1.7338439226150513, "learning_rate": 0.00019084652718195238, "loss": 2.5293, "step": 38 }, { "epoch": 0.18144809537656295, "grad_norm": 1.5174095630645752, "learning_rate": 0.00019019212287927663, "loss": 2.1819, "step": 39 }, { "epoch": 0.18610061064262867, "grad_norm": 1.547797679901123, "learning_rate": 0.00018951632913550626, "loss": 2.0053, "step": 40 }, { "epoch": 0.1907531259086944, "grad_norm": 1.5773917436599731, "learning_rate": 0.00018881930621796847, "loss": 2.273, "step": 41 }, { "epoch": 0.19540564117476011, "grad_norm": 1.5535215139389038, "learning_rate": 0.00018810121942857845, "loss": 2.0164, "step": 42 }, { "epoch": 0.20005815644082584, "grad_norm": 1.773815393447876, "learning_rate": 0.00018736223906463696, "loss": 2.2051, "step": 43 }, { "epoch": 0.20471067170689153, "grad_norm": 1.91315758228302, "learning_rate": 0.00018660254037844388, "loss": 2.0598, "step": 44 }, { "epoch": 0.20936318697295725, "grad_norm": 1.5708611011505127, "learning_rate": 0.00018582230353573627, "loss": 2.1581, "step": 45 }, { "epoch": 0.21401570223902297, "grad_norm": 1.4507153034210205, "learning_rate": 0.00018502171357296144, "loss": 2.0607, "step": 46 }, { "epoch": 0.2186682175050887, "grad_norm": 1.6080292463302612, "learning_rate": 0.00018420096035339452, "loss": 2.2465, "step": 47 }, { "epoch": 0.2233207327711544, "grad_norm": 1.536863923072815, "learning_rate": 0.00018336023852211195, "loss": 2.1152, "step": 48 }, { "epoch": 0.22797324803722013, "grad_norm": 1.4842332601547241, "learning_rate": 0.00018249974745983023, "loss": 2.0389, "step": 49 }, { "epoch": 0.23262576330328583, "grad_norm": 1.4118927717208862, "learning_rate": 0.0001816196912356222, "loss": 2.1003, "step": 50 }, { "epoch": 0.23262576330328583, "eval_loss": 2.0300514698028564, "eval_runtime": 20.9081, "eval_samples_per_second": 34.628, "eval_steps_per_second": 8.657, "step": 50 }, { "epoch": 0.23727827856935155, "grad_norm": 1.608886957168579, "learning_rate": 0.00018072027855852097, "loss": 2.2955, "step": 51 }, { "epoch": 0.24193079383541727, "grad_norm": 1.4855011701583862, "learning_rate": 0.000179801722728024, "loss": 1.9307, "step": 52 }, { "epoch": 0.246583309101483, "grad_norm": 1.465166449546814, "learning_rate": 0.00017886424158350782, "loss": 1.9173, "step": 53 }, { "epoch": 0.2512358243675487, "grad_norm": 1.79639732837677, "learning_rate": 0.00017790805745256704, "loss": 2.0639, "step": 54 }, { "epoch": 0.2558883396336144, "grad_norm": 1.683768630027771, "learning_rate": 0.00017693339709828792, "loss": 1.9898, "step": 55 }, { "epoch": 0.26054085489968015, "grad_norm": 1.7183611392974854, "learning_rate": 0.00017594049166547073, "loss": 2.0649, "step": 56 }, { "epoch": 0.26519337016574585, "grad_norm": 1.4495972394943237, "learning_rate": 0.00017492957662581295, "loss": 2.0176, "step": 57 }, { "epoch": 0.2698458854318116, "grad_norm": 1.500927448272705, "learning_rate": 0.00017390089172206592, "loss": 2.0764, "step": 58 }, { "epoch": 0.2744984006978773, "grad_norm": 1.5012840032577515, "learning_rate": 0.00017285468091117904, "loss": 1.8248, "step": 59 }, { "epoch": 0.279150915963943, "grad_norm": 1.5936579704284668, "learning_rate": 0.0001717911923064442, "loss": 2.0497, "step": 60 }, { "epoch": 0.28380343123000873, "grad_norm": 1.5540063381195068, "learning_rate": 0.00017071067811865476, "loss": 1.833, "step": 61 }, { "epoch": 0.2884559464960744, "grad_norm": 1.512662649154663, "learning_rate": 0.0001696133945962927, "loss": 1.9966, "step": 62 }, { "epoch": 0.29310846176214017, "grad_norm": 1.5647892951965332, "learning_rate": 0.00016849960196475806, "loss": 2.0752, "step": 63 }, { "epoch": 0.29776097702820586, "grad_norm": 1.5723665952682495, "learning_rate": 0.00016736956436465573, "loss": 1.9368, "step": 64 }, { "epoch": 0.3024134922942716, "grad_norm": 1.7973029613494873, "learning_rate": 0.00016622354978915304, "loss": 1.9789, "step": 65 }, { "epoch": 0.3070660075603373, "grad_norm": 1.3695393800735474, "learning_rate": 0.0001650618300204242, "loss": 1.7856, "step": 66 }, { "epoch": 0.311718522826403, "grad_norm": 1.7126473188400269, "learning_rate": 0.00016388468056519612, "loss": 1.8566, "step": 67 }, { "epoch": 0.31637103809246875, "grad_norm": 1.408624529838562, "learning_rate": 0.0001626923805894107, "loss": 1.6383, "step": 68 }, { "epoch": 0.32102355335853444, "grad_norm": 1.6226450204849243, "learning_rate": 0.00016148521285201927, "loss": 1.8517, "step": 69 }, { "epoch": 0.3256760686246002, "grad_norm": 1.6124584674835205, "learning_rate": 0.00016026346363792567, "loss": 1.7469, "step": 70 }, { "epoch": 0.3303285838906659, "grad_norm": 1.7233102321624756, "learning_rate": 0.00015902742269009197, "loss": 1.7507, "step": 71 }, { "epoch": 0.33498109915673163, "grad_norm": 1.468773603439331, "learning_rate": 0.00015777738314082514, "loss": 1.7616, "step": 72 }, { "epoch": 0.3396336144227973, "grad_norm": 1.9513323307037354, "learning_rate": 0.0001565136414422592, "loss": 2.0448, "step": 73 }, { "epoch": 0.344286129688863, "grad_norm": 1.96356999874115, "learning_rate": 0.0001552364972960506, "loss": 1.9519, "step": 74 }, { "epoch": 0.34893864495492877, "grad_norm": 1.7340761423110962, "learning_rate": 0.0001539462535823025, "loss": 1.9677, "step": 75 }, { "epoch": 0.35359116022099446, "grad_norm": 1.6672484874725342, "learning_rate": 0.0001526432162877356, "loss": 1.9008, "step": 76 }, { "epoch": 0.3582436754870602, "grad_norm": 1.7562329769134521, "learning_rate": 0.00015132769443312207, "loss": 1.834, "step": 77 }, { "epoch": 0.3628961907531259, "grad_norm": 1.6138277053833008, "learning_rate": 0.00015000000000000001, "loss": 2.0238, "step": 78 }, { "epoch": 0.36754870601919165, "grad_norm": 1.873212456703186, "learning_rate": 0.00014866044785668563, "loss": 1.8391, "step": 79 }, { "epoch": 0.37220122128525734, "grad_norm": 1.608636736869812, "learning_rate": 0.00014730935568360102, "loss": 1.6384, "step": 80 }, { "epoch": 0.37685373655132304, "grad_norm": 1.6646443605422974, "learning_rate": 0.00014594704389793477, "loss": 1.8669, "step": 81 }, { "epoch": 0.3815062518173888, "grad_norm": 1.522123098373413, "learning_rate": 0.00014457383557765386, "loss": 1.9653, "step": 82 }, { "epoch": 0.3861587670834545, "grad_norm": 1.595712423324585, "learning_rate": 0.0001431900563848841, "loss": 1.8904, "step": 83 }, { "epoch": 0.39081128234952023, "grad_norm": 1.62184739112854, "learning_rate": 0.00014179603448867835, "loss": 1.7053, "step": 84 }, { "epoch": 0.3954637976155859, "grad_norm": 1.5826489925384521, "learning_rate": 0.00014039210048718949, "loss": 1.9059, "step": 85 }, { "epoch": 0.40011631288165167, "grad_norm": 1.608913540840149, "learning_rate": 0.00013897858732926793, "loss": 1.8071, "step": 86 }, { "epoch": 0.40476882814771736, "grad_norm": 1.504594326019287, "learning_rate": 0.00013755583023550126, "loss": 1.6661, "step": 87 }, { "epoch": 0.40942134341378306, "grad_norm": 1.6623263359069824, "learning_rate": 0.00013612416661871533, "loss": 1.8381, "step": 88 }, { "epoch": 0.4140738586798488, "grad_norm": 1.5866388082504272, "learning_rate": 0.00013468393600395525, "loss": 1.7955, "step": 89 }, { "epoch": 0.4187263739459145, "grad_norm": 1.596091389656067, "learning_rate": 0.00013323547994796597, "loss": 1.7176, "step": 90 }, { "epoch": 0.42337888921198025, "grad_norm": 1.6838603019714355, "learning_rate": 0.00013177914195819016, "loss": 1.7935, "step": 91 }, { "epoch": 0.42803140447804594, "grad_norm": 1.6236923933029175, "learning_rate": 0.00013031526741130435, "loss": 1.7203, "step": 92 }, { "epoch": 0.43268391974411163, "grad_norm": 1.6118321418762207, "learning_rate": 0.00012884420347131123, "loss": 1.851, "step": 93 }, { "epoch": 0.4373364350101774, "grad_norm": 1.6473873853683472, "learning_rate": 0.0001273662990072083, "loss": 1.8701, "step": 94 }, { "epoch": 0.4419889502762431, "grad_norm": 1.7001062631607056, "learning_rate": 0.00012588190451025207, "loss": 1.7731, "step": 95 }, { "epoch": 0.4466414655423088, "grad_norm": 1.6646205186843872, "learning_rate": 0.00012439137201083773, "loss": 2.0302, "step": 96 }, { "epoch": 0.4512939808083745, "grad_norm": 1.8173680305480957, "learning_rate": 0.0001228950549950134, "loss": 1.8191, "step": 97 }, { "epoch": 0.45594649607444027, "grad_norm": 1.6642239093780518, "learning_rate": 0.00012139330832064974, "loss": 1.7608, "step": 98 }, { "epoch": 0.46059901134050596, "grad_norm": 1.5319768190383911, "learning_rate": 0.00011988648813328367, "loss": 1.7914, "step": 99 }, { "epoch": 0.46525152660657165, "grad_norm": 1.56061851978302, "learning_rate": 0.00011837495178165706, "loss": 1.98, "step": 100 }, { "epoch": 0.46525152660657165, "eval_loss": 1.8041728734970093, "eval_runtime": 20.9031, "eval_samples_per_second": 34.636, "eval_steps_per_second": 8.659, "step": 100 }, { "epoch": 0.4699040418726374, "grad_norm": 1.6357735395431519, "learning_rate": 0.00011685905773296992, "loss": 1.7414, "step": 101 }, { "epoch": 0.4745565571387031, "grad_norm": 1.675748348236084, "learning_rate": 0.00011533916548786857, "loss": 1.8028, "step": 102 }, { "epoch": 0.47920907240476884, "grad_norm": 1.55147385597229, "learning_rate": 0.00011381563549518823, "loss": 1.5612, "step": 103 }, { "epoch": 0.48386158767083454, "grad_norm": 1.6434597969055176, "learning_rate": 0.00011228882906647142, "loss": 1.8869, "step": 104 }, { "epoch": 0.4885141029369003, "grad_norm": 1.544256567955017, "learning_rate": 0.00011075910829028115, "loss": 1.6021, "step": 105 }, { "epoch": 0.493166618202966, "grad_norm": 1.5358442068099976, "learning_rate": 0.00010922683594633021, "loss": 1.8826, "step": 106 }, { "epoch": 0.49781913346903167, "grad_norm": 1.6584852933883667, "learning_rate": 0.0001076923754194464, "loss": 1.5318, "step": 107 }, { "epoch": 0.5024716487350974, "grad_norm": 1.5296037197113037, "learning_rate": 0.00010615609061339432, "loss": 1.5062, "step": 108 }, { "epoch": 0.5071241640011631, "grad_norm": 1.638400912284851, "learning_rate": 0.00010461834586457398, "loss": 1.5932, "step": 109 }, { "epoch": 0.5117766792672288, "grad_norm": 1.72173011302948, "learning_rate": 0.00010307950585561706, "loss": 1.7414, "step": 110 }, { "epoch": 0.5164291945332946, "grad_norm": 1.6859912872314453, "learning_rate": 0.00010153993552890069, "loss": 1.8711, "step": 111 }, { "epoch": 0.5210817097993603, "grad_norm": 1.7842934131622314, "learning_rate": 0.0001, "loss": 1.6803, "step": 112 }, { "epoch": 0.525734225065426, "grad_norm": 1.5940512418746948, "learning_rate": 9.846006447109933e-05, "loss": 1.5046, "step": 113 }, { "epoch": 0.5303867403314917, "grad_norm": 1.9007431268692017, "learning_rate": 9.692049414438299e-05, "loss": 1.9239, "step": 114 }, { "epoch": 0.5350392555975574, "grad_norm": 1.940891146659851, "learning_rate": 9.538165413542607e-05, "loss": 1.8778, "step": 115 }, { "epoch": 0.5396917708636232, "grad_norm": 1.5742617845535278, "learning_rate": 9.384390938660572e-05, "loss": 1.5219, "step": 116 }, { "epoch": 0.5443442861296889, "grad_norm": 1.632001519203186, "learning_rate": 9.230762458055363e-05, "loss": 1.7918, "step": 117 }, { "epoch": 0.5489968013957546, "grad_norm": 1.7606934309005737, "learning_rate": 9.077316405366981e-05, "loss": 1.6797, "step": 118 }, { "epoch": 0.5536493166618203, "grad_norm": 1.8048553466796875, "learning_rate": 8.924089170971887e-05, "loss": 1.7574, "step": 119 }, { "epoch": 0.558301831927886, "grad_norm": 1.7053390741348267, "learning_rate": 8.77111709335286e-05, "loss": 1.6776, "step": 120 }, { "epoch": 0.5629543471939518, "grad_norm": 1.7866935729980469, "learning_rate": 8.61843645048118e-05, "loss": 1.676, "step": 121 }, { "epoch": 0.5676068624600175, "grad_norm": 1.7857139110565186, "learning_rate": 8.466083451213144e-05, "loss": 1.9838, "step": 122 }, { "epoch": 0.5722593777260832, "grad_norm": 1.7687913179397583, "learning_rate": 8.314094226703007e-05, "loss": 1.9557, "step": 123 }, { "epoch": 0.5769118929921488, "grad_norm": 1.9080584049224854, "learning_rate": 8.162504821834295e-05, "loss": 1.6932, "step": 124 }, { "epoch": 0.5815644082582146, "grad_norm": 1.7965078353881836, "learning_rate": 8.011351186671637e-05, "loss": 1.8605, "step": 125 }, { "epoch": 0.5862169235242803, "grad_norm": 1.7524250745773315, "learning_rate": 7.860669167935028e-05, "loss": 1.7335, "step": 126 }, { "epoch": 0.590869438790346, "grad_norm": 1.721587061882019, "learning_rate": 7.710494500498662e-05, "loss": 1.7249, "step": 127 }, { "epoch": 0.5955219540564117, "grad_norm": 1.7121999263763428, "learning_rate": 7.560862798916228e-05, "loss": 1.7491, "step": 128 }, { "epoch": 0.6001744693224774, "grad_norm": 1.5758925676345825, "learning_rate": 7.411809548974792e-05, "loss": 1.709, "step": 129 }, { "epoch": 0.6048269845885432, "grad_norm": 1.5831133127212524, "learning_rate": 7.263370099279172e-05, "loss": 1.4999, "step": 130 }, { "epoch": 0.6094794998546089, "grad_norm": 1.7787184715270996, "learning_rate": 7.115579652868878e-05, "loss": 1.7472, "step": 131 }, { "epoch": 0.6141320151206746, "grad_norm": 1.6525014638900757, "learning_rate": 6.968473258869566e-05, "loss": 1.7104, "step": 132 }, { "epoch": 0.6187845303867403, "grad_norm": 1.8373721837997437, "learning_rate": 6.822085804180984e-05, "loss": 1.8997, "step": 133 }, { "epoch": 0.623437045652806, "grad_norm": 1.6728414297103882, "learning_rate": 6.676452005203406e-05, "loss": 1.5062, "step": 134 }, { "epoch": 0.6280895609188718, "grad_norm": 1.663562297821045, "learning_rate": 6.531606399604473e-05, "loss": 1.7005, "step": 135 }, { "epoch": 0.6327420761849375, "grad_norm": 1.7047648429870605, "learning_rate": 6.387583338128471e-05, "loss": 1.6651, "step": 136 }, { "epoch": 0.6373945914510032, "grad_norm": 1.8546326160430908, "learning_rate": 6.244416976449875e-05, "loss": 1.7414, "step": 137 }, { "epoch": 0.6420471067170689, "grad_norm": 1.8506556749343872, "learning_rate": 6.102141267073207e-05, "loss": 1.5764, "step": 138 }, { "epoch": 0.6466996219831347, "grad_norm": 1.863836407661438, "learning_rate": 5.960789951281052e-05, "loss": 1.8554, "step": 139 }, { "epoch": 0.6513521372492004, "grad_norm": 1.5789958238601685, "learning_rate": 5.82039655113217e-05, "loss": 1.7332, "step": 140 }, { "epoch": 0.6560046525152661, "grad_norm": 1.8007911443710327, "learning_rate": 5.680994361511591e-05, "loss": 1.7878, "step": 141 }, { "epoch": 0.6606571677813318, "grad_norm": 1.7672853469848633, "learning_rate": 5.542616442234618e-05, "loss": 1.59, "step": 142 }, { "epoch": 0.6653096830473975, "grad_norm": 1.6216075420379639, "learning_rate": 5.4052956102065246e-05, "loss": 1.4842, "step": 143 }, { "epoch": 0.6699621983134633, "grad_norm": 1.8410775661468506, "learning_rate": 5.269064431639901e-05, "loss": 1.6629, "step": 144 }, { "epoch": 0.674614713579529, "grad_norm": 1.8751968145370483, "learning_rate": 5.1339552143314384e-05, "loss": 1.6441, "step": 145 }, { "epoch": 0.6792672288455947, "grad_norm": 1.913163661956787, "learning_rate": 5.000000000000002e-05, "loss": 1.6159, "step": 146 }, { "epoch": 0.6839197441116603, "grad_norm": 1.6089226007461548, "learning_rate": 4.8672305566877964e-05, "loss": 1.6247, "step": 147 }, { "epoch": 0.688572259377726, "grad_norm": 1.820172667503357, "learning_rate": 4.735678371226441e-05, "loss": 1.8709, "step": 148 }, { "epoch": 0.6932247746437918, "grad_norm": 1.7547824382781982, "learning_rate": 4.605374641769752e-05, "loss": 1.4424, "step": 149 }, { "epoch": 0.6978772899098575, "grad_norm": 1.677914023399353, "learning_rate": 4.476350270394942e-05, "loss": 1.7515, "step": 150 }, { "epoch": 0.6978772899098575, "eval_loss": 1.6791523694992065, "eval_runtime": 20.9234, "eval_samples_per_second": 34.602, "eval_steps_per_second": 8.651, "step": 150 }, { "epoch": 0.7025298051759232, "grad_norm": 1.8747222423553467, "learning_rate": 4.3486358557740814e-05, "loss": 1.8838, "step": 151 }, { "epoch": 0.7071823204419889, "grad_norm": 1.7761543989181519, "learning_rate": 4.222261685917489e-05, "loss": 1.6168, "step": 152 }, { "epoch": 0.7118348357080546, "grad_norm": 1.6737406253814697, "learning_rate": 4.0972577309908056e-05, "loss": 1.6401, "step": 153 }, { "epoch": 0.7164873509741204, "grad_norm": 1.9486554861068726, "learning_rate": 3.973653636207437e-05, "loss": 1.8098, "step": 154 }, { "epoch": 0.7211398662401861, "grad_norm": 1.7520830631256104, "learning_rate": 3.851478714798076e-05, "loss": 1.7116, "step": 155 }, { "epoch": 0.7257923815062518, "grad_norm": 1.7359018325805664, "learning_rate": 3.7307619410589376e-05, "loss": 1.8196, "step": 156 }, { "epoch": 0.7304448967723175, "grad_norm": 1.5514014959335327, "learning_rate": 3.6115319434803894e-05, "loss": 1.3944, "step": 157 }, { "epoch": 0.7350974120383833, "grad_norm": 1.8193638324737549, "learning_rate": 3.493816997957582e-05, "loss": 1.8875, "step": 158 }, { "epoch": 0.739749927304449, "grad_norm": 1.7850615978240967, "learning_rate": 3.377645021084701e-05, "loss": 1.551, "step": 159 }, { "epoch": 0.7444024425705147, "grad_norm": 1.8897948265075684, "learning_rate": 3.263043563534428e-05, "loss": 1.6699, "step": 160 }, { "epoch": 0.7490549578365804, "grad_norm": 1.7700817584991455, "learning_rate": 3.150039803524194e-05, "loss": 1.5546, "step": 161 }, { "epoch": 0.7537074731026461, "grad_norm": 1.7360986471176147, "learning_rate": 3.0386605403707346e-05, "loss": 1.8158, "step": 162 }, { "epoch": 0.7583599883687119, "grad_norm": 1.7811108827590942, "learning_rate": 2.9289321881345254e-05, "loss": 1.7055, "step": 163 }, { "epoch": 0.7630125036347776, "grad_norm": 1.7199565172195435, "learning_rate": 2.8208807693555818e-05, "loss": 1.7042, "step": 164 }, { "epoch": 0.7676650189008433, "grad_norm": 1.774168610572815, "learning_rate": 2.7145319088820987e-05, "loss": 1.622, "step": 165 }, { "epoch": 0.772317534166909, "grad_norm": 1.7940188646316528, "learning_rate": 2.6099108277934103e-05, "loss": 1.5445, "step": 166 }, { "epoch": 0.7769700494329747, "grad_norm": 1.5857652425765991, "learning_rate": 2.507042337418707e-05, "loss": 1.5953, "step": 167 }, { "epoch": 0.7816225646990405, "grad_norm": 1.7121484279632568, "learning_rate": 2.405950833452928e-05, "loss": 1.5087, "step": 168 }, { "epoch": 0.7862750799651061, "grad_norm": 1.9980762004852295, "learning_rate": 2.3066602901712108e-05, "loss": 1.8725, "step": 169 }, { "epoch": 0.7909275952311718, "grad_norm": 1.7364083528518677, "learning_rate": 2.2091942547432955e-05, "loss": 1.7714, "step": 170 }, { "epoch": 0.7955801104972375, "grad_norm": 1.6725072860717773, "learning_rate": 2.113575841649217e-05, "loss": 1.4332, "step": 171 }, { "epoch": 0.8002326257633033, "grad_norm": 1.9727587699890137, "learning_rate": 2.0198277271976052e-05, "loss": 1.6559, "step": 172 }, { "epoch": 0.804885141029369, "grad_norm": 1.7346326112747192, "learning_rate": 1.927972144147905e-05, "loss": 1.5057, "step": 173 }, { "epoch": 0.8095376562954347, "grad_norm": 1.7290587425231934, "learning_rate": 1.8380308764377842e-05, "loss": 1.6469, "step": 174 }, { "epoch": 0.8141901715615004, "grad_norm": 1.8323500156402588, "learning_rate": 1.750025254016978e-05, "loss": 1.4771, "step": 175 }, { "epoch": 0.8188426868275661, "grad_norm": 1.9538246393203735, "learning_rate": 1.663976147788806e-05, "loss": 1.7257, "step": 176 }, { "epoch": 0.8234952020936319, "grad_norm": 1.7197211980819702, "learning_rate": 1.5799039646605486e-05, "loss": 1.6527, "step": 177 }, { "epoch": 0.8281477173596976, "grad_norm": 1.805812120437622, "learning_rate": 1.4978286427038601e-05, "loss": 1.5049, "step": 178 }, { "epoch": 0.8328002326257633, "grad_norm": 1.8770802021026611, "learning_rate": 1.4177696464263723e-05, "loss": 1.7229, "step": 179 }, { "epoch": 0.837452747891829, "grad_norm": 1.6201673746109009, "learning_rate": 1.339745962155613e-05, "loss": 1.4748, "step": 180 }, { "epoch": 0.8421052631578947, "grad_norm": 1.7318499088287354, "learning_rate": 1.2637760935363053e-05, "loss": 1.5424, "step": 181 }, { "epoch": 0.8467577784239605, "grad_norm": 1.6393368244171143, "learning_rate": 1.1898780571421552e-05, "loss": 1.5458, "step": 182 }, { "epoch": 0.8514102936900262, "grad_norm": 1.7201930284500122, "learning_rate": 1.1180693782031516e-05, "loss": 1.7075, "step": 183 }, { "epoch": 0.8560628089560919, "grad_norm": 1.825761079788208, "learning_rate": 1.0483670864493778e-05, "loss": 1.8075, "step": 184 }, { "epoch": 0.8607153242221576, "grad_norm": 1.7473479509353638, "learning_rate": 9.807877120723396e-06, "loss": 1.5959, "step": 185 }, { "epoch": 0.8653678394882233, "grad_norm": 1.7737841606140137, "learning_rate": 9.153472818047625e-06, "loss": 1.4923, "step": 186 }, { "epoch": 0.8700203547542891, "grad_norm": 1.8927415609359741, "learning_rate": 8.520613151197898e-06, "loss": 1.5957, "step": 187 }, { "epoch": 0.8746728700203548, "grad_norm": 1.8504620790481567, "learning_rate": 7.909448205504632e-06, "loss": 1.7199, "step": 188 }, { "epoch": 0.8793253852864205, "grad_norm": 1.9773368835449219, "learning_rate": 7.320122921303962e-06, "loss": 1.673, "step": 189 }, { "epoch": 0.8839779005524862, "grad_norm": 1.729998230934143, "learning_rate": 6.75277705956443e-06, "loss": 1.6096, "step": 190 }, { "epoch": 0.888630415818552, "grad_norm": 1.628357172012329, "learning_rate": 6.2075451687422124e-06, "loss": 1.4744, "step": 191 }, { "epoch": 0.8932829310846176, "grad_norm": 1.7737938165664673, "learning_rate": 5.684556552872256e-06, "loss": 1.8465, "step": 192 }, { "epoch": 0.8979354463506833, "grad_norm": 1.7610527276992798, "learning_rate": 5.183935240903414e-06, "loss": 1.5833, "step": 193 }, { "epoch": 0.902587961616749, "grad_norm": 1.850711703300476, "learning_rate": 4.705799957284351e-06, "loss": 1.6079, "step": 194 }, { "epoch": 0.9072404768828147, "grad_norm": 1.5661609172821045, "learning_rate": 4.250264093807565e-06, "loss": 1.4349, "step": 195 }, { "epoch": 0.9118929921488805, "grad_norm": 1.8847752809524536, "learning_rate": 3.817435682718096e-06, "loss": 1.6189, "step": 196 }, { "epoch": 0.9165455074149462, "grad_norm": 1.8437247276306152, "learning_rate": 3.40741737109318e-06, "loss": 1.6144, "step": 197 }, { "epoch": 0.9211980226810119, "grad_norm": 1.9048221111297607, "learning_rate": 3.0203063964990617e-06, "loss": 1.6744, "step": 198 }, { "epoch": 0.9258505379470776, "grad_norm": 1.8907570838928223, "learning_rate": 2.656194563930714e-06, "loss": 1.7463, "step": 199 }, { "epoch": 0.9305030532131433, "grad_norm": 1.9094120264053345, "learning_rate": 2.315168224039932e-06, "loss": 1.8705, "step": 200 }, { "epoch": 0.9305030532131433, "eval_loss": 1.6292288303375244, "eval_runtime": 20.9189, "eval_samples_per_second": 34.61, "eval_steps_per_second": 8.652, "step": 200 }, { "epoch": 0.9351555684792091, "grad_norm": 1.598107933998108, "learning_rate": 1.9973082526568154e-06, "loss": 1.4567, "step": 201 }, { "epoch": 0.9398080837452748, "grad_norm": 2.187147855758667, "learning_rate": 1.7026900316098215e-06, "loss": 1.6859, "step": 202 }, { "epoch": 0.9444605990113405, "grad_norm": 1.804359793663025, "learning_rate": 1.4313834308486097e-06, "loss": 1.614, "step": 203 }, { "epoch": 0.9491131142774062, "grad_norm": 1.7503759860992432, "learning_rate": 1.1834527918740623e-06, "loss": 1.68, "step": 204 }, { "epoch": 0.953765629543472, "grad_norm": 2.0529308319091797, "learning_rate": 9.589569124794916e-07, "loss": 1.7563, "step": 205 }, { "epoch": 0.9584181448095377, "grad_norm": 1.7820945978164673, "learning_rate": 7.579490328064265e-07, "loss": 1.516, "step": 206 }, { "epoch": 0.9630706600756034, "grad_norm": 1.8575091361999512, "learning_rate": 5.804768227185565e-07, "loss": 1.6248, "step": 207 }, { "epoch": 0.9677231753416691, "grad_norm": 1.8180886507034302, "learning_rate": 4.2658237049655323e-07, "loss": 1.6101, "step": 208 }, { "epoch": 0.9723756906077348, "grad_norm": 1.6702853441238403, "learning_rate": 2.963021728567106e-07, "loss": 1.5597, "step": 209 }, { "epoch": 0.9770282058738006, "grad_norm": 1.678638219833374, "learning_rate": 1.8966712629558957e-07, "loss": 1.5329, "step": 210 }, { "epoch": 0.9816807211398663, "grad_norm": 1.6849240064620972, "learning_rate": 1.0670251976275803e-07, "loss": 1.5622, "step": 211 }, { "epoch": 0.986333236405932, "grad_norm": 1.7889765501022339, "learning_rate": 4.74280286634099e-08, "loss": 1.4413, "step": 212 }, { "epoch": 0.9909857516719977, "grad_norm": 1.7154433727264404, "learning_rate": 1.1857710192308969e-08, "loss": 1.5593, "step": 213 }, { "epoch": 0.9956382669380633, "grad_norm": 1.744558572769165, "learning_rate": 0.0, "loss": 1.5847, "step": 214 } ], "logging_steps": 1, "max_steps": 214, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 30, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1181661243205222e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }