{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.016675351745700884, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 8.337675872850443e-05, "grad_norm": 2.534695863723755, "learning_rate": 2e-05, "loss": 6.996, "step": 1 }, { "epoch": 8.337675872850443e-05, "eval_loss": 2.2557170391082764, "eval_runtime": 69.6462, "eval_samples_per_second": 72.509, "eval_steps_per_second": 36.255, "step": 1 }, { "epoch": 0.00016675351745700886, "grad_norm": 4.112269878387451, "learning_rate": 4e-05, "loss": 10.7356, "step": 2 }, { "epoch": 0.0002501302761855133, "grad_norm": 4.108906269073486, "learning_rate": 6e-05, "loss": 9.3082, "step": 3 }, { "epoch": 0.0003335070349140177, "grad_norm": 3.1692538261413574, "learning_rate": 8e-05, "loss": 9.1058, "step": 4 }, { "epoch": 0.00041688379364252214, "grad_norm": 5.433767318725586, "learning_rate": 0.0001, "loss": 13.4658, "step": 5 }, { "epoch": 0.0005002605523710266, "grad_norm": 3.1914710998535156, "learning_rate": 0.00012, "loss": 8.0176, "step": 6 }, { "epoch": 0.000583637311099531, "grad_norm": 3.6264748573303223, "learning_rate": 0.00014, "loss": 9.5218, "step": 7 }, { "epoch": 0.0006670140698280355, "grad_norm": 3.173428773880005, "learning_rate": 0.00016, "loss": 8.7686, "step": 8 }, { "epoch": 0.0007503908285565399, "grad_norm": 3.8662312030792236, "learning_rate": 0.00018, "loss": 9.6705, "step": 9 }, { "epoch": 0.0008337675872850443, "grad_norm": 2.918992280960083, "learning_rate": 0.0002, "loss": 7.5198, "step": 10 }, { "epoch": 0.0009171443460135487, "grad_norm": 2.782639741897583, "learning_rate": 0.0001999863304992469, "loss": 8.8055, "step": 11 }, { "epoch": 0.0010005211047420532, "grad_norm": 2.9841339588165283, "learning_rate": 0.00019994532573409262, "loss": 8.0065, "step": 12 }, { "epoch": 0.0010838978634705575, "grad_norm": 4.018250942230225, "learning_rate": 0.00019987699691483048, "loss": 8.9573, "step": 13 }, { "epoch": 0.001167274622199062, "grad_norm": 3.4327125549316406, "learning_rate": 0.00019978136272187747, "loss": 9.973, "step": 14 }, { "epoch": 0.0012506513809275664, "grad_norm": 3.299730062484741, "learning_rate": 0.000199658449300667, "loss": 7.8683, "step": 15 }, { "epoch": 0.001334028139656071, "grad_norm": 3.753506898880005, "learning_rate": 0.00019950829025450114, "loss": 7.2567, "step": 16 }, { "epoch": 0.0014174048983845752, "grad_norm": 4.432242393493652, "learning_rate": 0.00019933092663536382, "loss": 9.6064, "step": 17 }, { "epoch": 0.0015007816571130797, "grad_norm": 6.0634684562683105, "learning_rate": 0.00019912640693269752, "loss": 10.0913, "step": 18 }, { "epoch": 0.0015841584158415843, "grad_norm": 5.204840660095215, "learning_rate": 0.00019889478706014687, "loss": 8.775, "step": 19 }, { "epoch": 0.0016675351745700886, "grad_norm": 5.777037620544434, "learning_rate": 0.00019863613034027224, "loss": 9.278, "step": 20 }, { "epoch": 0.001750911933298593, "grad_norm": 4.261403560638428, "learning_rate": 0.00019835050748723824, "loss": 9.6213, "step": 21 }, { "epoch": 0.0018342886920270974, "grad_norm": 4.248034954071045, "learning_rate": 0.00019803799658748094, "loss": 9.3568, "step": 22 }, { "epoch": 0.001917665450755602, "grad_norm": 3.803323268890381, "learning_rate": 0.00019769868307835994, "loss": 8.48, "step": 23 }, { "epoch": 0.0020010422094841065, "grad_norm": 3.443775177001953, "learning_rate": 0.0001973326597248006, "loss": 8.5166, "step": 24 }, { "epoch": 0.0020844189682126106, "grad_norm": 3.10636568069458, "learning_rate": 0.00019694002659393305, "loss": 6.9142, "step": 25 }, { "epoch": 0.002167795726941115, "grad_norm": 7.100295543670654, "learning_rate": 0.00019652089102773488, "loss": 11.6384, "step": 26 }, { "epoch": 0.0022511724856696196, "grad_norm": 5.785402774810791, "learning_rate": 0.00019607536761368484, "loss": 10.7454, "step": 27 }, { "epoch": 0.002334549244398124, "grad_norm": 3.018549680709839, "learning_rate": 0.00019560357815343577, "loss": 7.6228, "step": 28 }, { "epoch": 0.0024179260031266287, "grad_norm": 4.010953426361084, "learning_rate": 0.00019510565162951537, "loss": 9.4401, "step": 29 }, { "epoch": 0.0025013027618551328, "grad_norm": 4.524446964263916, "learning_rate": 0.00019458172417006347, "loss": 10.1375, "step": 30 }, { "epoch": 0.0025846795205836373, "grad_norm": 3.461047887802124, "learning_rate": 0.00019403193901161613, "loss": 8.033, "step": 31 }, { "epoch": 0.002668056279312142, "grad_norm": 3.3522675037384033, "learning_rate": 0.0001934564464599461, "loss": 8.2424, "step": 32 }, { "epoch": 0.0027514330380406463, "grad_norm": 2.893357515335083, "learning_rate": 0.00019285540384897073, "loss": 7.1057, "step": 33 }, { "epoch": 0.0028348097967691504, "grad_norm": 3.0926976203918457, "learning_rate": 0.00019222897549773848, "loss": 8.0821, "step": 34 }, { "epoch": 0.002918186555497655, "grad_norm": 3.8426332473754883, "learning_rate": 0.00019157733266550575, "loss": 10.5508, "step": 35 }, { "epoch": 0.0030015633142261595, "grad_norm": 4.10872220993042, "learning_rate": 0.00019090065350491626, "loss": 10.314, "step": 36 }, { "epoch": 0.003084940072954664, "grad_norm": 2.7273058891296387, "learning_rate": 0.00019019912301329592, "loss": 8.1275, "step": 37 }, { "epoch": 0.0031683168316831685, "grad_norm": 3.5605056285858154, "learning_rate": 0.00018947293298207635, "loss": 8.6723, "step": 38 }, { "epoch": 0.0032516935904116726, "grad_norm": 5.416788578033447, "learning_rate": 0.0001887222819443612, "loss": 9.7495, "step": 39 }, { "epoch": 0.003335070349140177, "grad_norm": 3.8840010166168213, "learning_rate": 0.0001879473751206489, "loss": 8.8682, "step": 40 }, { "epoch": 0.0034184471078686817, "grad_norm": 3.141036033630371, "learning_rate": 0.00018714842436272773, "loss": 7.8402, "step": 41 }, { "epoch": 0.003501823866597186, "grad_norm": 3.415287971496582, "learning_rate": 0.00018632564809575742, "loss": 7.4874, "step": 42 }, { "epoch": 0.0035852006253256903, "grad_norm": 6.077539443969727, "learning_rate": 0.0001854792712585539, "loss": 10.3711, "step": 43 }, { "epoch": 0.003668577384054195, "grad_norm": 4.345938682556152, "learning_rate": 0.00018460952524209355, "loss": 10.1496, "step": 44 }, { "epoch": 0.0037519541427826993, "grad_norm": 4.29965353012085, "learning_rate": 0.00018371664782625287, "loss": 9.2361, "step": 45 }, { "epoch": 0.003835330901511204, "grad_norm": 4.321191787719727, "learning_rate": 0.00018280088311480201, "loss": 8.8282, "step": 46 }, { "epoch": 0.003918707660239708, "grad_norm": 3.4017069339752197, "learning_rate": 0.00018186248146866927, "loss": 8.8426, "step": 47 }, { "epoch": 0.004002084418968213, "grad_norm": 3.803649425506592, "learning_rate": 0.00018090169943749476, "loss": 7.1382, "step": 48 }, { "epoch": 0.004085461177696717, "grad_norm": 5.015726566314697, "learning_rate": 0.0001799187996894925, "loss": 9.7273, "step": 49 }, { "epoch": 0.004168837936425221, "grad_norm": 5.0045037269592285, "learning_rate": 0.00017891405093963938, "loss": 10.0517, "step": 50 }, { "epoch": 0.004168837936425221, "eval_loss": 2.041832447052002, "eval_runtime": 69.7121, "eval_samples_per_second": 72.441, "eval_steps_per_second": 36.22, "step": 50 }, { "epoch": 0.004252214695153726, "grad_norm": 5.273454666137695, "learning_rate": 0.00017788772787621126, "loss": 7.7355, "step": 51 }, { "epoch": 0.00433559145388223, "grad_norm": 4.412261962890625, "learning_rate": 0.00017684011108568592, "loss": 6.7413, "step": 52 }, { "epoch": 0.004418968212610735, "grad_norm": 3.984342098236084, "learning_rate": 0.0001757714869760335, "loss": 8.8219, "step": 53 }, { "epoch": 0.004502344971339239, "grad_norm": 3.3683133125305176, "learning_rate": 0.0001746821476984154, "loss": 7.4397, "step": 54 }, { "epoch": 0.004585721730067743, "grad_norm": 4.496111869812012, "learning_rate": 0.00017357239106731317, "loss": 6.8065, "step": 55 }, { "epoch": 0.004669098488796248, "grad_norm": 4.162015438079834, "learning_rate": 0.00017244252047910892, "loss": 8.1048, "step": 56 }, { "epoch": 0.004752475247524752, "grad_norm": 3.4676620960235596, "learning_rate": 0.00017129284482913972, "loss": 6.8288, "step": 57 }, { "epoch": 0.004835852006253257, "grad_norm": 4.051041603088379, "learning_rate": 0.00017012367842724887, "loss": 7.3718, "step": 58 }, { "epoch": 0.004919228764981761, "grad_norm": 5.180737018585205, "learning_rate": 0.0001689353409118566, "loss": 9.8836, "step": 59 }, { "epoch": 0.0050026055237102655, "grad_norm": 5.89562463760376, "learning_rate": 0.00016772815716257412, "loss": 8.8932, "step": 60 }, { "epoch": 0.0050859822824387705, "grad_norm": 4.652097702026367, "learning_rate": 0.0001665024572113848, "loss": 7.3464, "step": 61 }, { "epoch": 0.0051693590411672746, "grad_norm": 3.2940993309020996, "learning_rate": 0.00016525857615241687, "loss": 6.9377, "step": 62 }, { "epoch": 0.005252735799895779, "grad_norm": 4.341916561126709, "learning_rate": 0.00016399685405033167, "loss": 9.9174, "step": 63 }, { "epoch": 0.005336112558624284, "grad_norm": 6.677371501922607, "learning_rate": 0.0001627176358473537, "loss": 10.8274, "step": 64 }, { "epoch": 0.005419489317352788, "grad_norm": 3.8812875747680664, "learning_rate": 0.0001614212712689668, "loss": 7.6091, "step": 65 }, { "epoch": 0.005502866076081293, "grad_norm": 5.111429691314697, "learning_rate": 0.00016010811472830252, "loss": 6.8907, "step": 66 }, { "epoch": 0.005586242834809797, "grad_norm": 3.911635398864746, "learning_rate": 0.00015877852522924732, "loss": 7.5746, "step": 67 }, { "epoch": 0.005669619593538301, "grad_norm": 4.402072906494141, "learning_rate": 0.00015743286626829437, "loss": 7.3162, "step": 68 }, { "epoch": 0.005752996352266806, "grad_norm": 4.469682216644287, "learning_rate": 0.0001560715057351673, "loss": 8.1245, "step": 69 }, { "epoch": 0.00583637311099531, "grad_norm": 5.504849910736084, "learning_rate": 0.00015469481581224272, "loss": 10.3321, "step": 70 }, { "epoch": 0.005919749869723815, "grad_norm": 5.990438938140869, "learning_rate": 0.0001533031728727994, "loss": 8.4619, "step": 71 }, { "epoch": 0.006003126628452319, "grad_norm": 3.5529396533966064, "learning_rate": 0.00015189695737812152, "loss": 7.236, "step": 72 }, { "epoch": 0.006086503387180823, "grad_norm": 5.105240345001221, "learning_rate": 0.0001504765537734844, "loss": 11.3329, "step": 73 }, { "epoch": 0.006169880145909328, "grad_norm": 3.823227882385254, "learning_rate": 0.00014904235038305083, "loss": 8.5633, "step": 74 }, { "epoch": 0.006253256904637832, "grad_norm": 4.1277971267700195, "learning_rate": 0.00014759473930370736, "loss": 8.9986, "step": 75 }, { "epoch": 0.006336633663366337, "grad_norm": 5.1016740798950195, "learning_rate": 0.0001461341162978688, "loss": 6.7809, "step": 76 }, { "epoch": 0.006420010422094841, "grad_norm": 5.213894367218018, "learning_rate": 0.00014466088068528068, "loss": 8.1234, "step": 77 }, { "epoch": 0.006503387180823345, "grad_norm": 3.9442667961120605, "learning_rate": 0.00014317543523384928, "loss": 6.803, "step": 78 }, { "epoch": 0.00658676393955185, "grad_norm": 4.572739601135254, "learning_rate": 0.00014167818604952906, "loss": 9.0664, "step": 79 }, { "epoch": 0.006670140698280354, "grad_norm": 5.51597785949707, "learning_rate": 0.00014016954246529696, "loss": 8.7253, "step": 80 }, { "epoch": 0.006753517457008858, "grad_norm": 6.085229396820068, "learning_rate": 0.00013864991692924523, "loss": 7.5132, "step": 81 }, { "epoch": 0.006836894215737363, "grad_norm": 5.175082683563232, "learning_rate": 0.00013711972489182208, "loss": 6.8016, "step": 82 }, { "epoch": 0.0069202709744658674, "grad_norm": 3.689929723739624, "learning_rate": 0.00013557938469225167, "loss": 8.3562, "step": 83 }, { "epoch": 0.007003647733194372, "grad_norm": 3.6427206993103027, "learning_rate": 0.00013402931744416433, "loss": 7.3571, "step": 84 }, { "epoch": 0.0070870244919228765, "grad_norm": 4.614701271057129, "learning_rate": 0.00013246994692046836, "loss": 8.9938, "step": 85 }, { "epoch": 0.007170401250651381, "grad_norm": 4.688872337341309, "learning_rate": 0.00013090169943749476, "loss": 7.112, "step": 86 }, { "epoch": 0.0072537780093798855, "grad_norm": 3.395167350769043, "learning_rate": 0.0001293250037384465, "loss": 9.9337, "step": 87 }, { "epoch": 0.00733715476810839, "grad_norm": 4.059149265289307, "learning_rate": 0.00012774029087618446, "loss": 8.5826, "step": 88 }, { "epoch": 0.007420531526836895, "grad_norm": 3.590693473815918, "learning_rate": 0.00012614799409538198, "loss": 6.5018, "step": 89 }, { "epoch": 0.007503908285565399, "grad_norm": 4.000766277313232, "learning_rate": 0.00012454854871407994, "loss": 7.8792, "step": 90 }, { "epoch": 0.007587285044293903, "grad_norm": 5.370626449584961, "learning_rate": 0.00012294239200467516, "loss": 9.4348, "step": 91 }, { "epoch": 0.007670661803022408, "grad_norm": 4.013746738433838, "learning_rate": 0.0001213299630743747, "loss": 7.533, "step": 92 }, { "epoch": 0.007754038561750912, "grad_norm": 5.8706583976745605, "learning_rate": 0.00011971170274514802, "loss": 8.9689, "step": 93 }, { "epoch": 0.007837415320479416, "grad_norm": 5.318309783935547, "learning_rate": 0.000118088053433211, "loss": 11.444, "step": 94 }, { "epoch": 0.007920792079207921, "grad_norm": 5.957444667816162, "learning_rate": 0.00011645945902807341, "loss": 9.4652, "step": 95 }, { "epoch": 0.008004168837936426, "grad_norm": 4.867214679718018, "learning_rate": 0.0001148263647711842, "loss": 8.7648, "step": 96 }, { "epoch": 0.008087545596664929, "grad_norm": 4.044260501861572, "learning_rate": 0.00011318921713420691, "loss": 9.4336, "step": 97 }, { "epoch": 0.008170922355393434, "grad_norm": 5.8006205558776855, "learning_rate": 0.00011154846369695863, "loss": 8.8013, "step": 98 }, { "epoch": 0.008254299114121939, "grad_norm": 3.970647096633911, "learning_rate": 0.0001099045530250463, "loss": 8.509, "step": 99 }, { "epoch": 0.008337675872850442, "grad_norm": 2.7641403675079346, "learning_rate": 0.00010825793454723325, "loss": 5.7705, "step": 100 }, { "epoch": 0.008337675872850442, "eval_loss": 1.9884008169174194, "eval_runtime": 70.3896, "eval_samples_per_second": 71.744, "eval_steps_per_second": 35.872, "step": 100 }, { "epoch": 0.008421052631578947, "grad_norm": 4.176263332366943, "learning_rate": 0.00010660905843256994, "loss": 7.0963, "step": 101 }, { "epoch": 0.008504429390307452, "grad_norm": 3.327336549758911, "learning_rate": 0.00010495837546732224, "loss": 7.0791, "step": 102 }, { "epoch": 0.008587806149035955, "grad_norm": 5.326552391052246, "learning_rate": 0.00010330633693173082, "loss": 8.5557, "step": 103 }, { "epoch": 0.00867118290776446, "grad_norm": 3.137587785720825, "learning_rate": 0.00010165339447663587, "loss": 7.6346, "step": 104 }, { "epoch": 0.008754559666492965, "grad_norm": 4.366918087005615, "learning_rate": 0.0001, "loss": 7.8156, "step": 105 }, { "epoch": 0.00883793642522147, "grad_norm": 4.763668060302734, "learning_rate": 9.834660552336415e-05, "loss": 8.0528, "step": 106 }, { "epoch": 0.008921313183949973, "grad_norm": 3.8986947536468506, "learning_rate": 9.669366306826919e-05, "loss": 6.3366, "step": 107 }, { "epoch": 0.009004689942678478, "grad_norm": 5.50061559677124, "learning_rate": 9.504162453267777e-05, "loss": 7.7455, "step": 108 }, { "epoch": 0.009088066701406983, "grad_norm": 3.723641872406006, "learning_rate": 9.339094156743007e-05, "loss": 6.2653, "step": 109 }, { "epoch": 0.009171443460135487, "grad_norm": 3.923414945602417, "learning_rate": 9.174206545276677e-05, "loss": 7.2898, "step": 110 }, { "epoch": 0.009254820218863992, "grad_norm": 4.864738464355469, "learning_rate": 9.009544697495374e-05, "loss": 9.1495, "step": 111 }, { "epoch": 0.009338196977592497, "grad_norm": 4.508790016174316, "learning_rate": 8.845153630304139e-05, "loss": 7.5381, "step": 112 }, { "epoch": 0.009421573736321, "grad_norm": 5.183701515197754, "learning_rate": 8.681078286579311e-05, "loss": 8.099, "step": 113 }, { "epoch": 0.009504950495049505, "grad_norm": 5.300266742706299, "learning_rate": 8.517363522881579e-05, "loss": 9.3539, "step": 114 }, { "epoch": 0.00958832725377801, "grad_norm": 4.453273773193359, "learning_rate": 8.35405409719266e-05, "loss": 9.0837, "step": 115 }, { "epoch": 0.009671704012506515, "grad_norm": 5.1587910652160645, "learning_rate": 8.191194656678904e-05, "loss": 10.6395, "step": 116 }, { "epoch": 0.009755080771235018, "grad_norm": 4.783015727996826, "learning_rate": 8.028829725485199e-05, "loss": 7.0859, "step": 117 }, { "epoch": 0.009838457529963523, "grad_norm": 5.205554485321045, "learning_rate": 7.867003692562534e-05, "loss": 6.7606, "step": 118 }, { "epoch": 0.009921834288692028, "grad_norm": 3.5442142486572266, "learning_rate": 7.705760799532485e-05, "loss": 5.8093, "step": 119 }, { "epoch": 0.010005211047420531, "grad_norm": 4.767430305480957, "learning_rate": 7.54514512859201e-05, "loss": 8.8166, "step": 120 }, { "epoch": 0.010088587806149036, "grad_norm": 4.638835430145264, "learning_rate": 7.385200590461803e-05, "loss": 7.7852, "step": 121 }, { "epoch": 0.010171964564877541, "grad_norm": 3.8365132808685303, "learning_rate": 7.225970912381556e-05, "loss": 7.4408, "step": 122 }, { "epoch": 0.010255341323606044, "grad_norm": 3.970036506652832, "learning_rate": 7.067499626155354e-05, "loss": 8.0905, "step": 123 }, { "epoch": 0.010338718082334549, "grad_norm": 3.594309091567993, "learning_rate": 6.909830056250527e-05, "loss": 9.6643, "step": 124 }, { "epoch": 0.010422094841063054, "grad_norm": 4.544534206390381, "learning_rate": 6.753005307953167e-05, "loss": 7.2089, "step": 125 }, { "epoch": 0.010505471599791557, "grad_norm": 4.061487674713135, "learning_rate": 6.59706825558357e-05, "loss": 6.8838, "step": 126 }, { "epoch": 0.010588848358520062, "grad_norm": 4.6268510818481445, "learning_rate": 6.442061530774834e-05, "loss": 8.0996, "step": 127 }, { "epoch": 0.010672225117248567, "grad_norm": 4.591619491577148, "learning_rate": 6.28802751081779e-05, "loss": 9.0279, "step": 128 }, { "epoch": 0.010755601875977072, "grad_norm": 3.5984318256378174, "learning_rate": 6.135008307075481e-05, "loss": 6.8998, "step": 129 }, { "epoch": 0.010838978634705575, "grad_norm": 4.134429931640625, "learning_rate": 5.983045753470308e-05, "loss": 7.7342, "step": 130 }, { "epoch": 0.01092235539343408, "grad_norm": 4.544061660766602, "learning_rate": 5.832181395047098e-05, "loss": 6.6566, "step": 131 }, { "epoch": 0.011005732152162585, "grad_norm": 4.653716087341309, "learning_rate": 5.6824564766150726e-05, "loss": 8.1876, "step": 132 }, { "epoch": 0.011089108910891089, "grad_norm": 4.810211181640625, "learning_rate": 5.533911931471936e-05, "loss": 9.1589, "step": 133 }, { "epoch": 0.011172485669619594, "grad_norm": 4.453709125518799, "learning_rate": 5.386588370213124e-05, "loss": 8.7409, "step": 134 }, { "epoch": 0.011255862428348098, "grad_norm": 5.478562831878662, "learning_rate": 5.240526069629265e-05, "loss": 10.2675, "step": 135 }, { "epoch": 0.011339239187076602, "grad_norm": 4.585888862609863, "learning_rate": 5.095764961694922e-05, "loss": 7.5847, "step": 136 }, { "epoch": 0.011422615945805107, "grad_norm": 4.79464054107666, "learning_rate": 4.952344622651566e-05, "loss": 8.9001, "step": 137 }, { "epoch": 0.011505992704533612, "grad_norm": 5.097901344299316, "learning_rate": 4.810304262187852e-05, "loss": 9.4132, "step": 138 }, { "epoch": 0.011589369463262115, "grad_norm": 3.9933440685272217, "learning_rate": 4.669682712720065e-05, "loss": 7.8629, "step": 139 }, { "epoch": 0.01167274622199062, "grad_norm": 4.931108474731445, "learning_rate": 4.530518418775733e-05, "loss": 8.0418, "step": 140 }, { "epoch": 0.011756122980719125, "grad_norm": 4.090979099273682, "learning_rate": 4.392849426483274e-05, "loss": 7.5149, "step": 141 }, { "epoch": 0.01183949973944763, "grad_norm": 4.233511447906494, "learning_rate": 4.256713373170564e-05, "loss": 6.9281, "step": 142 }, { "epoch": 0.011922876498176133, "grad_norm": 3.975322723388672, "learning_rate": 4.12214747707527e-05, "loss": 7.2225, "step": 143 }, { "epoch": 0.012006253256904638, "grad_norm": 5.84738302230835, "learning_rate": 3.9891885271697496e-05, "loss": 9.5679, "step": 144 }, { "epoch": 0.012089630015633143, "grad_norm": 3.988117218017578, "learning_rate": 3.857872873103322e-05, "loss": 6.1582, "step": 145 }, { "epoch": 0.012173006774361646, "grad_norm": 3.905040740966797, "learning_rate": 3.7282364152646297e-05, "loss": 7.2591, "step": 146 }, { "epoch": 0.012256383533090151, "grad_norm": 5.4452924728393555, "learning_rate": 3.600314594966834e-05, "loss": 9.7908, "step": 147 }, { "epoch": 0.012339760291818656, "grad_norm": 5.016132354736328, "learning_rate": 3.4741423847583134e-05, "loss": 7.3294, "step": 148 }, { "epoch": 0.01242313705054716, "grad_norm": 4.367519855499268, "learning_rate": 3.349754278861517e-05, "loss": 9.6115, "step": 149 }, { "epoch": 0.012506513809275664, "grad_norm": 4.329085350036621, "learning_rate": 3.227184283742591e-05, "loss": 7.6718, "step": 150 }, { "epoch": 0.012506513809275664, "eval_loss": 1.9661880731582642, "eval_runtime": 70.4704, "eval_samples_per_second": 71.661, "eval_steps_per_second": 35.831, "step": 150 }, { "epoch": 0.01258989056800417, "grad_norm": 3.8532228469848633, "learning_rate": 3.106465908814342e-05, "loss": 9.0608, "step": 151 }, { "epoch": 0.012673267326732674, "grad_norm": 4.6140456199646, "learning_rate": 2.9876321572751144e-05, "loss": 7.3028, "step": 152 }, { "epoch": 0.012756644085461177, "grad_norm": 4.772500514984131, "learning_rate": 2.87071551708603e-05, "loss": 10.0175, "step": 153 }, { "epoch": 0.012840020844189682, "grad_norm": 4.909071445465088, "learning_rate": 2.7557479520891104e-05, "loss": 9.2292, "step": 154 }, { "epoch": 0.012923397602918187, "grad_norm": 4.347821235656738, "learning_rate": 2.6427608932686843e-05, "loss": 7.6785, "step": 155 }, { "epoch": 0.01300677436164669, "grad_norm": 5.958515644073486, "learning_rate": 2.5317852301584643e-05, "loss": 10.8612, "step": 156 }, { "epoch": 0.013090151120375195, "grad_norm": 4.036256790161133, "learning_rate": 2.422851302396655e-05, "loss": 6.5377, "step": 157 }, { "epoch": 0.0131735278791037, "grad_norm": 4.0767436027526855, "learning_rate": 2.315988891431412e-05, "loss": 6.9137, "step": 158 }, { "epoch": 0.013256904637832204, "grad_norm": 4.559920787811279, "learning_rate": 2.2112272123788768e-05, "loss": 7.8971, "step": 159 }, { "epoch": 0.013340281396560709, "grad_norm": 3.1472551822662354, "learning_rate": 2.1085949060360654e-05, "loss": 5.2795, "step": 160 }, { "epoch": 0.013423658155289214, "grad_norm": 4.643826961517334, "learning_rate": 2.008120031050753e-05, "loss": 10.4631, "step": 161 }, { "epoch": 0.013507034914017717, "grad_norm": 5.551768779754639, "learning_rate": 1.9098300562505266e-05, "loss": 8.5777, "step": 162 }, { "epoch": 0.013590411672746222, "grad_norm": 3.418463706970215, "learning_rate": 1.8137518531330767e-05, "loss": 5.9437, "step": 163 }, { "epoch": 0.013673788431474727, "grad_norm": 4.048105239868164, "learning_rate": 1.7199116885197995e-05, "loss": 7.9703, "step": 164 }, { "epoch": 0.013757165190203232, "grad_norm": 3.720338821411133, "learning_rate": 1.6283352173747145e-05, "loss": 7.5605, "step": 165 }, { "epoch": 0.013840541948931735, "grad_norm": 4.476045608520508, "learning_rate": 1.5390474757906446e-05, "loss": 7.0048, "step": 166 }, { "epoch": 0.01392391870766024, "grad_norm": 4.96762228012085, "learning_rate": 1.4520728741446089e-05, "loss": 10.7742, "step": 167 }, { "epoch": 0.014007295466388745, "grad_norm": 4.895275592803955, "learning_rate": 1.3674351904242611e-05, "loss": 6.773, "step": 168 }, { "epoch": 0.014090672225117248, "grad_norm": 5.70258903503418, "learning_rate": 1.2851575637272262e-05, "loss": 11.6797, "step": 169 }, { "epoch": 0.014174048983845753, "grad_norm": 3.3568615913391113, "learning_rate": 1.2052624879351104e-05, "loss": 6.4714, "step": 170 }, { "epoch": 0.014257425742574258, "grad_norm": 4.252028465270996, "learning_rate": 1.1277718055638819e-05, "loss": 7.7119, "step": 171 }, { "epoch": 0.014340802501302761, "grad_norm": 4.521555423736572, "learning_rate": 1.0527067017923654e-05, "loss": 7.6881, "step": 172 }, { "epoch": 0.014424179260031266, "grad_norm": 4.935907363891602, "learning_rate": 9.80087698670411e-06, "loss": 8.6496, "step": 173 }, { "epoch": 0.014507556018759771, "grad_norm": 3.695267915725708, "learning_rate": 9.09934649508375e-06, "loss": 7.9681, "step": 174 }, { "epoch": 0.014590932777488274, "grad_norm": 4.178027629852295, "learning_rate": 8.422667334494249e-06, "loss": 8.4342, "step": 175 }, { "epoch": 0.01467430953621678, "grad_norm": 4.292721748352051, "learning_rate": 7.771024502261526e-06, "loss": 8.0612, "step": 176 }, { "epoch": 0.014757686294945284, "grad_norm": 5.052070140838623, "learning_rate": 7.144596151029303e-06, "loss": 7.641, "step": 177 }, { "epoch": 0.01484106305367379, "grad_norm": 5.070157527923584, "learning_rate": 6.543553540053926e-06, "loss": 7.9055, "step": 178 }, { "epoch": 0.014924439812402292, "grad_norm": 4.510176658630371, "learning_rate": 5.968060988383883e-06, "loss": 6.882, "step": 179 }, { "epoch": 0.015007816571130797, "grad_norm": 4.591582775115967, "learning_rate": 5.418275829936537e-06, "loss": 7.9564, "step": 180 }, { "epoch": 0.015091193329859302, "grad_norm": 5.847330570220947, "learning_rate": 4.8943483704846475e-06, "loss": 9.3527, "step": 181 }, { "epoch": 0.015174570088587806, "grad_norm": 4.053606033325195, "learning_rate": 4.3964218465642355e-06, "loss": 8.5674, "step": 182 }, { "epoch": 0.01525794684731631, "grad_norm": 4.323848247528076, "learning_rate": 3.924632386315186e-06, "loss": 8.7715, "step": 183 }, { "epoch": 0.015341323606044815, "grad_norm": 4.508976459503174, "learning_rate": 3.4791089722651436e-06, "loss": 9.1552, "step": 184 }, { "epoch": 0.015424700364773319, "grad_norm": 3.246110439300537, "learning_rate": 3.059973406066963e-06, "loss": 8.1741, "step": 185 }, { "epoch": 0.015508077123501824, "grad_norm": 4.064354419708252, "learning_rate": 2.667340275199426e-06, "loss": 9.1723, "step": 186 }, { "epoch": 0.015591453882230329, "grad_norm": 4.16738748550415, "learning_rate": 2.3013169216400733e-06, "loss": 7.7564, "step": 187 }, { "epoch": 0.015674830640958832, "grad_norm": 3.58866810798645, "learning_rate": 1.9620034125190644e-06, "loss": 8.4719, "step": 188 }, { "epoch": 0.01575820739968734, "grad_norm": 4.435379505157471, "learning_rate": 1.6494925127617634e-06, "loss": 8.5871, "step": 189 }, { "epoch": 0.015841584158415842, "grad_norm": 4.594212532043457, "learning_rate": 1.3638696597277679e-06, "loss": 8.6829, "step": 190 }, { "epoch": 0.015924960917144345, "grad_norm": 5.113223552703857, "learning_rate": 1.1052129398531507e-06, "loss": 7.9665, "step": 191 }, { "epoch": 0.01600833767587285, "grad_norm": 3.7875256538391113, "learning_rate": 8.735930673024806e-07, "loss": 6.4576, "step": 192 }, { "epoch": 0.016091714434601355, "grad_norm": 3.661071300506592, "learning_rate": 6.690733646361857e-07, "loss": 6.8199, "step": 193 }, { "epoch": 0.016175091193329858, "grad_norm": 4.465787887573242, "learning_rate": 4.917097454988584e-07, "loss": 8.3244, "step": 194 }, { "epoch": 0.016258467952058365, "grad_norm": 4.833078861236572, "learning_rate": 3.415506993330153e-07, "loss": 8.3167, "step": 195 }, { "epoch": 0.016341844710786868, "grad_norm": 3.7651824951171875, "learning_rate": 2.1863727812254653e-07, "loss": 6.5716, "step": 196 }, { "epoch": 0.01642522146951537, "grad_norm": 4.892351150512695, "learning_rate": 1.230030851695263e-07, "loss": 7.0042, "step": 197 }, { "epoch": 0.016508598228243878, "grad_norm": 3.952244758605957, "learning_rate": 5.467426590739511e-08, "loss": 7.8461, "step": 198 }, { "epoch": 0.01659197498697238, "grad_norm": 3.891923427581787, "learning_rate": 1.3669500753099585e-08, "loss": 7.6043, "step": 199 }, { "epoch": 0.016675351745700884, "grad_norm": 5.590742588043213, "learning_rate": 0.0, "loss": 10.1178, "step": 200 }, { "epoch": 0.016675351745700884, "eval_loss": 1.9614673852920532, "eval_runtime": 69.1641, "eval_samples_per_second": 73.015, "eval_steps_per_second": 36.507, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1821610856153088.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }