{ "best_metric": 0.3242824375629425, "best_model_checkpoint": "/workspace/plateer_classifier_v0.1_result/checkpoint-110000", "epoch": 0.6441270979878347, "eval_steps": 55000, "global_step": 110000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014640195241643742, "grad_norm": 50.05304718017578, "learning_rate": 4.880000000000001e-06, "loss": 4.3958, "step": 250 }, { "epoch": 0.0029280390483287485, "grad_norm": 48.363304138183594, "learning_rate": 9.88e-06, "loss": 1.6496, "step": 500 }, { "epoch": 0.004392058572493123, "grad_norm": 54.546974182128906, "learning_rate": 1.488e-05, "loss": 0.8787, "step": 750 }, { "epoch": 0.005856078096657497, "grad_norm": 50.317874908447266, "learning_rate": 1.9880000000000003e-05, "loss": 0.7721, "step": 1000 }, { "epoch": 0.007320097620821872, "grad_norm": 62.48823928833008, "learning_rate": 2.488e-05, "loss": 0.7047, "step": 1250 }, { "epoch": 0.008784117144986246, "grad_norm": 44.35001754760742, "learning_rate": 2.9880000000000002e-05, "loss": 0.6749, "step": 1500 }, { "epoch": 0.01024813666915062, "grad_norm": 36.486793518066406, "learning_rate": 3.4880000000000005e-05, "loss": 0.6409, "step": 1750 }, { "epoch": 0.011712156193314994, "grad_norm": 47.03588104248047, "learning_rate": 3.988e-05, "loss": 0.6406, "step": 2000 }, { "epoch": 0.013176175717479368, "grad_norm": 31.227832794189453, "learning_rate": 4.488e-05, "loss": 0.6149, "step": 2250 }, { "epoch": 0.014640195241643743, "grad_norm": 39.8408317565918, "learning_rate": 4.9880000000000004e-05, "loss": 0.5956, "step": 2500 }, { "epoch": 0.016104214765808117, "grad_norm": 41.118736267089844, "learning_rate": 5.4879999999999996e-05, "loss": 0.5905, "step": 2750 }, { "epoch": 0.017568234289972492, "grad_norm": 29.624338150024414, "learning_rate": 5.988e-05, "loss": 0.5608, "step": 3000 }, { "epoch": 0.019032253814136865, "grad_norm": 22.993818283081055, "learning_rate": 6.488e-05, "loss": 0.5614, "step": 3250 }, { "epoch": 0.02049627333830124, "grad_norm": 19.964269638061523, "learning_rate": 6.988e-05, "loss": 0.5569, "step": 3500 }, { "epoch": 0.021960292862465612, "grad_norm": 36.538047790527344, "learning_rate": 7.488e-05, "loss": 0.5316, "step": 3750 }, { "epoch": 0.023424312386629988, "grad_norm": 37.63505935668945, "learning_rate": 7.988e-05, "loss": 0.5364, "step": 4000 }, { "epoch": 0.024888331910794363, "grad_norm": 25.934967041015625, "learning_rate": 8.486000000000001e-05, "loss": 0.5234, "step": 4250 }, { "epoch": 0.026352351434958735, "grad_norm": 24.810028076171875, "learning_rate": 8.986e-05, "loss": 0.5155, "step": 4500 }, { "epoch": 0.02781637095912311, "grad_norm": 32.76811981201172, "learning_rate": 9.484e-05, "loss": 0.5022, "step": 4750 }, { "epoch": 0.029280390483287486, "grad_norm": 27.094772338867188, "learning_rate": 9.984e-05, "loss": 0.5023, "step": 5000 }, { "epoch": 0.029280390483287486, "eval_accuracy": 0.8572352668691132, "eval_loss": 0.5044249296188354, "eval_runtime": 11541.1431, "eval_samples_per_second": 210.432, "eval_steps_per_second": 6.576, "step": 5000 }, { "epoch": 0.03074441000745186, "grad_norm": 24.74563217163086, "learning_rate": 0.00010484, "loss": 0.5073, "step": 5250 }, { "epoch": 0.032208429531616234, "grad_norm": 17.229019165039062, "learning_rate": 0.00010984, "loss": 0.4932, "step": 5500 }, { "epoch": 0.03367244905578061, "grad_norm": 23.318979263305664, "learning_rate": 0.00011484000000000002, "loss": 0.504, "step": 5750 }, { "epoch": 0.035136468579944985, "grad_norm": 22.271846771240234, "learning_rate": 0.00011983999999999999, "loss": 0.4817, "step": 6000 }, { "epoch": 0.036600488104109354, "grad_norm": 24.304887771606445, "learning_rate": 0.00012484, "loss": 0.4966, "step": 6250 }, { "epoch": 0.03806450762827373, "grad_norm": 23.76158905029297, "learning_rate": 0.00012984000000000002, "loss": 0.4899, "step": 6500 }, { "epoch": 0.039528527152438105, "grad_norm": 20.765274047851562, "learning_rate": 0.00013484, "loss": 0.4773, "step": 6750 }, { "epoch": 0.04099254667660248, "grad_norm": 12.793950080871582, "learning_rate": 0.00013982000000000003, "loss": 0.4781, "step": 7000 }, { "epoch": 0.042456566200766856, "grad_norm": 14.128210067749023, "learning_rate": 0.00014482, "loss": 0.4687, "step": 7250 }, { "epoch": 0.043920585724931224, "grad_norm": 22.348928451538086, "learning_rate": 0.00014982, "loss": 0.4722, "step": 7500 }, { "epoch": 0.0453846052490956, "grad_norm": 17.29800796508789, "learning_rate": 0.00015480000000000002, "loss": 0.4692, "step": 7750 }, { "epoch": 0.046848624773259975, "grad_norm": 11.0147066116333, "learning_rate": 0.0001598, "loss": 0.4689, "step": 8000 }, { "epoch": 0.04831264429742435, "grad_norm": 11.713265419006348, "learning_rate": 0.0001648, "loss": 0.4788, "step": 8250 }, { "epoch": 0.049776663821588726, "grad_norm": 12.367693901062012, "learning_rate": 0.0001698, "loss": 0.4697, "step": 8500 }, { "epoch": 0.0512406833457531, "grad_norm": 8.11889934539795, "learning_rate": 0.00017480000000000002, "loss": 0.4696, "step": 8750 }, { "epoch": 0.05270470286991747, "grad_norm": 12.321019172668457, "learning_rate": 0.0001798, "loss": 0.461, "step": 9000 }, { "epoch": 0.054168722394081846, "grad_norm": 15.612183570861816, "learning_rate": 0.00018480000000000002, "loss": 0.4646, "step": 9250 }, { "epoch": 0.05563274191824622, "grad_norm": 10.72978687286377, "learning_rate": 0.0001898, "loss": 0.4673, "step": 9500 }, { "epoch": 0.0570967614424106, "grad_norm": 8.815441131591797, "learning_rate": 0.0001948, "loss": 0.4472, "step": 9750 }, { "epoch": 0.05856078096657497, "grad_norm": 8.681705474853516, "learning_rate": 0.0001998, "loss": 0.4629, "step": 10000 }, { "epoch": 0.05856078096657497, "eval_accuracy": 0.8688706572649133, "eval_loss": 0.457188218832016, "eval_runtime": 11537.8227, "eval_samples_per_second": 210.492, "eval_steps_per_second": 6.578, "step": 10000 }, { "epoch": 0.06002480049073934, "grad_norm": 13.643828392028809, "learning_rate": 0.0001997014219778306, "loss": 0.456, "step": 10250 }, { "epoch": 0.06148882001490372, "grad_norm": 13.211404800415039, "learning_rate": 0.00019939040320473745, "loss": 0.4666, "step": 10500 }, { "epoch": 0.06295283953906809, "grad_norm": 11.1001615524292, "learning_rate": 0.00019907938443164432, "loss": 0.4495, "step": 10750 }, { "epoch": 0.06441685906323247, "grad_norm": 8.222249984741211, "learning_rate": 0.00019876836565855117, "loss": 0.4483, "step": 11000 }, { "epoch": 0.06588087858739684, "grad_norm": 13.589752197265625, "learning_rate": 0.0001984585909605504, "loss": 0.4438, "step": 11250 }, { "epoch": 0.06734489811156122, "grad_norm": 9.988068580627441, "learning_rate": 0.00019814757218745724, "loss": 0.447, "step": 11500 }, { "epoch": 0.0688089176357256, "grad_norm": 8.311960220336914, "learning_rate": 0.0001978365534143641, "loss": 0.4476, "step": 11750 }, { "epoch": 0.07027293715988997, "grad_norm": 8.099685668945312, "learning_rate": 0.00019752553464127094, "loss": 0.4477, "step": 12000 }, { "epoch": 0.07173695668405435, "grad_norm": 8.23130989074707, "learning_rate": 0.00019721451586817782, "loss": 0.4385, "step": 12250 }, { "epoch": 0.07320097620821871, "grad_norm": 10.875362396240234, "learning_rate": 0.00019690349709508467, "loss": 0.4345, "step": 12500 }, { "epoch": 0.07466499573238308, "grad_norm": 9.479572296142578, "learning_rate": 0.00019659247832199152, "loss": 0.4345, "step": 12750 }, { "epoch": 0.07612901525654746, "grad_norm": 11.883151054382324, "learning_rate": 0.0001962814595488984, "loss": 0.4241, "step": 13000 }, { "epoch": 0.07759303478071183, "grad_norm": 8.15208911895752, "learning_rate": 0.00019597044077580524, "loss": 0.4335, "step": 13250 }, { "epoch": 0.07905705430487621, "grad_norm": 9.323240280151367, "learning_rate": 0.0001956594220027121, "loss": 0.4396, "step": 13500 }, { "epoch": 0.08052107382904058, "grad_norm": 7.250824928283691, "learning_rate": 0.00019534840322961897, "loss": 0.4376, "step": 13750 }, { "epoch": 0.08198509335320496, "grad_norm": 12.220071792602539, "learning_rate": 0.0001950373844565258, "loss": 0.4323, "step": 14000 }, { "epoch": 0.08344911287736934, "grad_norm": 8.460916519165039, "learning_rate": 0.00019472636568343266, "loss": 0.4271, "step": 14250 }, { "epoch": 0.08491313240153371, "grad_norm": 6.110500812530518, "learning_rate": 0.0001944153469103395, "loss": 0.4253, "step": 14500 }, { "epoch": 0.08637715192569809, "grad_norm": 10.618386268615723, "learning_rate": 0.00019410432813724636, "loss": 0.427, "step": 14750 }, { "epoch": 0.08784117144986245, "grad_norm": 9.827556610107422, "learning_rate": 0.00019379330936415324, "loss": 0.4254, "step": 15000 }, { "epoch": 0.08784117144986245, "eval_accuracy": 0.877075711565186, "eval_loss": 0.4201970100402832, "eval_runtime": 11537.2443, "eval_samples_per_second": 210.503, "eval_steps_per_second": 6.578, "step": 15000 }, { "epoch": 0.0892994385846771, "grad_norm": 10.84184455871582, "learning_rate": 0.00019349020046898423, "loss": 0.4211, "step": 15250 }, { "epoch": 0.09076336380737672, "grad_norm": 7.9568657875061035, "learning_rate": 0.00019317920297562402, "loss": 0.4203, "step": 15500 }, { "epoch": 0.09222728903007635, "grad_norm": 12.237702369689941, "learning_rate": 0.00019286820548226384, "loss": 0.4181, "step": 15750 }, { "epoch": 0.09369121425277596, "grad_norm": 25.739120483398438, "learning_rate": 0.00019255720798890363, "loss": 0.4143, "step": 16000 }, { "epoch": 0.09515513947547559, "grad_norm": 8.341870307922363, "learning_rate": 0.00019224621049554342, "loss": 0.4171, "step": 16250 }, { "epoch": 0.09661906469817522, "grad_norm": 10.707802772521973, "learning_rate": 0.0001919352130021832, "loss": 0.4058, "step": 16500 }, { "epoch": 0.09808298992087484, "grad_norm": 7.021149158477783, "learning_rate": 0.00019162421550882302, "loss": 0.4211, "step": 16750 }, { "epoch": 0.09954691514357447, "grad_norm": 11.840470314025879, "learning_rate": 0.0001913132180154628, "loss": 0.4093, "step": 17000 }, { "epoch": 0.10101084036627409, "grad_norm": 7.401727676391602, "learning_rate": 0.0001910022205221026, "loss": 0.4281, "step": 17250 }, { "epoch": 0.10247476558897371, "grad_norm": 7.601231575012207, "learning_rate": 0.00019069246701871584, "loss": 0.4044, "step": 17500 }, { "epoch": 0.10393869081167334, "grad_norm": 6.85632848739624, "learning_rate": 0.00019038146952535563, "loss": 0.4244, "step": 17750 }, { "epoch": 0.10540261603437297, "grad_norm": 10.810693740844727, "learning_rate": 0.00019007171602196887, "loss": 0.4216, "step": 18000 }, { "epoch": 0.1068665412570726, "grad_norm": 9.758743286132812, "learning_rate": 0.00018976071852860865, "loss": 0.417, "step": 18250 }, { "epoch": 0.10833046647977221, "grad_norm": 10.75692367553711, "learning_rate": 0.00018944972103524847, "loss": 0.4143, "step": 18500 }, { "epoch": 0.10979439170247184, "grad_norm": 10.375711441040039, "learning_rate": 0.00018913872354188826, "loss": 0.4075, "step": 18750 }, { "epoch": 0.11125831692517146, "grad_norm": 8.414403915405273, "learning_rate": 0.00018882772604852805, "loss": 0.4148, "step": 19000 }, { "epoch": 0.11272224214787109, "grad_norm": 9.86490249633789, "learning_rate": 0.00018851672855516786, "loss": 0.4074, "step": 19250 }, { "epoch": 0.11418616737057072, "grad_norm": 7.522060394287109, "learning_rate": 0.00018820573106180765, "loss": 0.4106, "step": 19500 }, { "epoch": 0.11565009259327033, "grad_norm": 7.423270225524902, "learning_rate": 0.00018789473356844744, "loss": 0.4034, "step": 19750 }, { "epoch": 0.11711401781596996, "grad_norm": 8.761688232421875, "learning_rate": 0.00018758373607508723, "loss": 0.4025, "step": 20000 }, { "epoch": 0.11711401781596996, "eval_accuracy": 0.8823756104911845, "eval_loss": 0.4016551673412323, "eval_runtime": 11547.1595, "eval_samples_per_second": 210.336, "eval_steps_per_second": 6.573, "step": 20000 }, { "epoch": 0.11857794303866959, "grad_norm": 9.6015043258667, "learning_rate": 0.0001872802025215677, "loss": 0.4087, "step": 20250 }, { "epoch": 0.12004186826136921, "grad_norm": 6.658656120300293, "learning_rate": 0.00018696920502820748, "loss": 0.408, "step": 20500 }, { "epoch": 0.12150579348406883, "grad_norm": 6.935655117034912, "learning_rate": 0.00018665820753484727, "loss": 0.3983, "step": 20750 }, { "epoch": 0.12296971870676845, "grad_norm": 7.918155193328857, "learning_rate": 0.00018634721004148706, "loss": 0.3994, "step": 21000 }, { "epoch": 0.12443364392946808, "grad_norm": 7.246758937835693, "learning_rate": 0.00018603621254812688, "loss": 0.4111, "step": 21250 }, { "epoch": 0.1258975691521677, "grad_norm": 8.375380516052246, "learning_rate": 0.00018572521505476667, "loss": 0.4006, "step": 21500 }, { "epoch": 0.12736149437486732, "grad_norm": 6.993825435638428, "learning_rate": 0.0001854154615513799, "loss": 0.4113, "step": 21750 }, { "epoch": 0.12882541959756696, "grad_norm": 8.703255653381348, "learning_rate": 0.00018510446405801972, "loss": 0.3977, "step": 22000 }, { "epoch": 0.13028934482026658, "grad_norm": 6.940033912658691, "learning_rate": 0.0001847934665646595, "loss": 0.4005, "step": 22250 }, { "epoch": 0.1317532700429662, "grad_norm": 6.712055683135986, "learning_rate": 0.0001844824690712993, "loss": 0.41, "step": 22500 }, { "epoch": 0.13321719526566583, "grad_norm": 6.171209812164307, "learning_rate": 0.0001841714715779391, "loss": 0.3971, "step": 22750 }, { "epoch": 0.13468112048836545, "grad_norm": 10.764921188354492, "learning_rate": 0.0001838604740845789, "loss": 0.4105, "step": 23000 }, { "epoch": 0.1361450457110651, "grad_norm": 8.0676908493042, "learning_rate": 0.0001835494765912187, "loss": 0.3958, "step": 23250 }, { "epoch": 0.1376089709337647, "grad_norm": 5.20599365234375, "learning_rate": 0.00018323847909785848, "loss": 0.3946, "step": 23500 }, { "epoch": 0.13907289615646432, "grad_norm": 5.9439239501953125, "learning_rate": 0.0001829274816044983, "loss": 0.3951, "step": 23750 }, { "epoch": 0.14053682137916396, "grad_norm": 9.821541786193848, "learning_rate": 0.0001826164841111381, "loss": 0.3906, "step": 24000 }, { "epoch": 0.14200074660186357, "grad_norm": 6.659691333770752, "learning_rate": 0.00018230673060775133, "loss": 0.4009, "step": 24250 }, { "epoch": 0.1434646718245632, "grad_norm": 6.624240398406982, "learning_rate": 0.00018199573311439112, "loss": 0.3975, "step": 24500 }, { "epoch": 0.14492859704726282, "grad_norm": 7.993641376495361, "learning_rate": 0.0001816847356210309, "loss": 0.3925, "step": 24750 }, { "epoch": 0.14639252226996244, "grad_norm": 6.6386613845825195, "learning_rate": 0.0001813737381276707, "loss": 0.3975, "step": 25000 }, { "epoch": 0.14785644749266208, "grad_norm": 9.204560279846191, "learning_rate": 0.0001810627406343105, "loss": 0.3997, "step": 25250 }, { "epoch": 0.1493203727153617, "grad_norm": 8.072566986083984, "learning_rate": 0.0001807517431409503, "loss": 0.4022, "step": 25500 }, { "epoch": 0.15078429793806133, "grad_norm": 10.15225601196289, "learning_rate": 0.0001804407456475901, "loss": 0.392, "step": 25750 }, { "epoch": 0.15224822316076095, "grad_norm": 7.751401901245117, "learning_rate": 0.0001801297481542299, "loss": 0.3946, "step": 26000 }, { "epoch": 0.15371214838346056, "grad_norm": 8.481501579284668, "learning_rate": 0.0001798187506608697, "loss": 0.3883, "step": 26250 }, { "epoch": 0.1551760736061602, "grad_norm": 9.861278533935547, "learning_rate": 0.00017950775316750948, "loss": 0.3824, "step": 26500 }, { "epoch": 0.15663999882885982, "grad_norm": 6.405235290527344, "learning_rate": 0.0001791967556741493, "loss": 0.4006, "step": 26750 }, { "epoch": 0.15810392405155946, "grad_norm": 9.90355110168457, "learning_rate": 0.00017888575818078909, "loss": 0.3881, "step": 27000 }, { "epoch": 0.15956784927425907, "grad_norm": 9.354215621948242, "learning_rate": 0.00017857476068742887, "loss": 0.3965, "step": 27250 }, { "epoch": 0.16103177449695869, "grad_norm": 9.162219047546387, "learning_rate": 0.00017826376319406866, "loss": 0.3933, "step": 27500 }, { "epoch": 0.16249569971965833, "grad_norm": 6.755202770233154, "learning_rate": 0.00017795276570070848, "loss": 0.3874, "step": 27750 }, { "epoch": 0.16395962494235794, "grad_norm": 8.385200500488281, "learning_rate": 0.00017764176820734827, "loss": 0.3873, "step": 28000 }, { "epoch": 0.16542355016505758, "grad_norm": 6.508645057678223, "learning_rate": 0.00017733077071398806, "loss": 0.3895, "step": 28250 }, { "epoch": 0.1668874753877572, "grad_norm": 8.241129875183105, "learning_rate": 0.00017702226120057472, "loss": 0.3912, "step": 28500 }, { "epoch": 0.1683514006104568, "grad_norm": 7.879597187042236, "learning_rate": 0.00017671126370721454, "loss": 0.3929, "step": 28750 }, { "epoch": 0.16981532583315645, "grad_norm": 12.0702486038208, "learning_rate": 0.00017640026621385432, "loss": 0.404, "step": 29000 }, { "epoch": 0.17127925105585606, "grad_norm": 8.789772033691406, "learning_rate": 0.0001760892687204941, "loss": 0.3823, "step": 29250 }, { "epoch": 0.1727431762785557, "grad_norm": 11.022305488586426, "learning_rate": 0.00017577827122713393, "loss": 0.3887, "step": 29500 }, { "epoch": 0.17420710150125532, "grad_norm": 7.665167331695557, "learning_rate": 0.00017546727373377372, "loss": 0.394, "step": 29750 }, { "epoch": 0.17567102672395493, "grad_norm": 11.05783748626709, "learning_rate": 0.0001751562762404135, "loss": 0.3938, "step": 30000 }, { "epoch": 0.17713495194665457, "grad_norm": 8.389631271362305, "learning_rate": 0.0001748452787470533, "loss": 0.39, "step": 30250 }, { "epoch": 0.1785988771693542, "grad_norm": 8.158947944641113, "learning_rate": 0.0001745342812536931, "loss": 0.3818, "step": 30500 }, { "epoch": 0.1800628023920538, "grad_norm": 7.684356689453125, "learning_rate": 0.0001742232837603329, "loss": 0.3905, "step": 30750 }, { "epoch": 0.18152672761475344, "grad_norm": 10.129668235778809, "learning_rate": 0.00017391353025694614, "loss": 0.3886, "step": 31000 }, { "epoch": 0.18299065283745305, "grad_norm": 6.924737453460693, "learning_rate": 0.00017360253276358593, "loss": 0.3892, "step": 31250 }, { "epoch": 0.1844545780601527, "grad_norm": 5.863354206085205, "learning_rate": 0.00017329153527022572, "loss": 0.3822, "step": 31500 }, { "epoch": 0.1859185032828523, "grad_norm": 9.10240650177002, "learning_rate": 0.00017298053777686553, "loss": 0.3895, "step": 31750 }, { "epoch": 0.18738242850555192, "grad_norm": 9.565494537353516, "learning_rate": 0.00017266954028350532, "loss": 0.383, "step": 32000 }, { "epoch": 0.18884635372825156, "grad_norm": 8.238012313842773, "learning_rate": 0.0001723585427901451, "loss": 0.3854, "step": 32250 }, { "epoch": 0.19031027895095118, "grad_norm": 9.350130081176758, "learning_rate": 0.0001720475452967849, "loss": 0.3922, "step": 32500 }, { "epoch": 0.19177420417365082, "grad_norm": 6.337550163269043, "learning_rate": 0.00017173654780342472, "loss": 0.3778, "step": 32750 }, { "epoch": 0.19323812939635043, "grad_norm": 8.421921730041504, "learning_rate": 0.00017142679430003793, "loss": 0.3929, "step": 33000 }, { "epoch": 0.19470205461905005, "grad_norm": 8.888238906860352, "learning_rate": 0.00017111579680667774, "loss": 0.3844, "step": 33250 }, { "epoch": 0.1961659798417497, "grad_norm": 10.774327278137207, "learning_rate": 0.00017080479931331753, "loss": 0.3804, "step": 33500 }, { "epoch": 0.1976299050644493, "grad_norm": 7.07879114151001, "learning_rate": 0.00017049380181995732, "loss": 0.3954, "step": 33750 }, { "epoch": 0.19909383028714894, "grad_norm": 7.102870941162109, "learning_rate": 0.00017018280432659714, "loss": 0.3815, "step": 34000 }, { "epoch": 0.20055775550984856, "grad_norm": 5.815110206604004, "learning_rate": 0.00016987180683323693, "loss": 0.3907, "step": 34250 }, { "epoch": 0.20202168073254817, "grad_norm": 7.749156475067139, "learning_rate": 0.00016956080933987672, "loss": 0.3798, "step": 34500 }, { "epoch": 0.2034856059552478, "grad_norm": 7.0530476570129395, "learning_rate": 0.0001692498118465165, "loss": 0.3947, "step": 34750 }, { "epoch": 0.20494953117794742, "grad_norm": 6.623088836669922, "learning_rate": 0.00016893881435315632, "loss": 0.3816, "step": 35000 }, { "epoch": 0.20641345640064707, "grad_norm": 8.431561470031738, "learning_rate": 0.0001686278168597961, "loss": 0.3815, "step": 35250 }, { "epoch": 0.20787738162334668, "grad_norm": 11.600255012512207, "learning_rate": 0.00016831806335640935, "loss": 0.3782, "step": 35500 }, { "epoch": 0.2093413068460463, "grad_norm": 5.186095237731934, "learning_rate": 0.00016800706586304914, "loss": 0.3828, "step": 35750 }, { "epoch": 0.21080523206874593, "grad_norm": 12.819711685180664, "learning_rate": 0.00016769606836968895, "loss": 0.3902, "step": 36000 }, { "epoch": 0.21226915729144555, "grad_norm": 7.843264579772949, "learning_rate": 0.00016738507087632874, "loss": 0.3716, "step": 36250 }, { "epoch": 0.2137330825141452, "grad_norm": 8.602349281311035, "learning_rate": 0.00016707407338296853, "loss": 0.3791, "step": 36500 }, { "epoch": 0.2151970077368448, "grad_norm": 7.939485549926758, "learning_rate": 0.00016676307588960832, "loss": 0.3752, "step": 36750 }, { "epoch": 0.21666093295954442, "grad_norm": 6.328729629516602, "learning_rate": 0.00016645207839624814, "loss": 0.3761, "step": 37000 }, { "epoch": 0.21812485818224406, "grad_norm": 6.196065902709961, "learning_rate": 0.00016614108090288793, "loss": 0.3817, "step": 37250 }, { "epoch": 0.21958878340494367, "grad_norm": 10.096115112304688, "learning_rate": 0.00016583008340952771, "loss": 0.3828, "step": 37500 }, { "epoch": 0.2210527086276433, "grad_norm": 6.120075702667236, "learning_rate": 0.0001655190859161675, "loss": 0.3774, "step": 37750 }, { "epoch": 0.22251663385034293, "grad_norm": 6.575611114501953, "learning_rate": 0.00016520808842280732, "loss": 0.3823, "step": 38000 }, { "epoch": 0.22398055907304254, "grad_norm": 7.636918067932129, "learning_rate": 0.0001648970909294471, "loss": 0.3846, "step": 38250 }, { "epoch": 0.22544448429574218, "grad_norm": 15.759072303771973, "learning_rate": 0.00016458733742606037, "loss": 0.3842, "step": 38500 }, { "epoch": 0.2269084095184418, "grad_norm": 10.398168563842773, "learning_rate": 0.0001642775839226736, "loss": 0.3794, "step": 38750 }, { "epoch": 0.22837233474114144, "grad_norm": 6.939914703369141, "learning_rate": 0.0001639665864293134, "loss": 0.3763, "step": 39000 }, { "epoch": 0.22983625996384105, "grad_norm": 11.021454811096191, "learning_rate": 0.0001636555889359532, "loss": 0.368, "step": 39250 }, { "epoch": 0.23130018518654066, "grad_norm": 7.381429195404053, "learning_rate": 0.00016334459144259298, "loss": 0.3783, "step": 39500 }, { "epoch": 0.2327641104092403, "grad_norm": 9.803789138793945, "learning_rate": 0.0001630335939492328, "loss": 0.3828, "step": 39750 }, { "epoch": 0.23422803563193992, "grad_norm": 7.722465991973877, "learning_rate": 0.00016272259645587259, "loss": 0.3764, "step": 40000 }, { "epoch": 0.23569196085463953, "grad_norm": 8.471487998962402, "learning_rate": 0.00016241159896251237, "loss": 0.3879, "step": 40250 }, { "epoch": 0.23715588607733917, "grad_norm": 9.46483039855957, "learning_rate": 0.00016210060146915216, "loss": 0.3772, "step": 40500 }, { "epoch": 0.2386198113000388, "grad_norm": 11.850425720214844, "learning_rate": 0.00016178960397579198, "loss": 0.3688, "step": 40750 }, { "epoch": 0.24008373652273843, "grad_norm": 7.718139171600342, "learning_rate": 0.00016147860648243177, "loss": 0.3728, "step": 41000 }, { "epoch": 0.24154766174543804, "grad_norm": 7.039102077484131, "learning_rate": 0.00016116760898907156, "loss": 0.3718, "step": 41250 }, { "epoch": 0.24301158696813766, "grad_norm": 6.891547679901123, "learning_rate": 0.00016085661149571137, "loss": 0.3713, "step": 41500 }, { "epoch": 0.2444755121908373, "grad_norm": 8.54554271697998, "learning_rate": 0.00016054561400235116, "loss": 0.3818, "step": 41750 }, { "epoch": 0.2459394374135369, "grad_norm": 6.554268836975098, "learning_rate": 0.00016023461650899095, "loss": 0.3706, "step": 42000 }, { "epoch": 0.24740336263623655, "grad_norm": 6.389885902404785, "learning_rate": 0.00015992361901563074, "loss": 0.3577, "step": 42250 }, { "epoch": 0.24886728785893616, "grad_norm": 6.833805561065674, "learning_rate": 0.00015961262152227056, "loss": 0.3722, "step": 42500 }, { "epoch": 0.2503312130816358, "grad_norm": 9.135841369628906, "learning_rate": 0.00015930162402891034, "loss": 0.3747, "step": 42750 }, { "epoch": 0.2517951383043354, "grad_norm": 7.466910362243652, "learning_rate": 0.00015899187052552358, "loss": 0.378, "step": 43000 }, { "epoch": 0.25325906352703503, "grad_norm": 14.597432136535645, "learning_rate": 0.00015868087303216337, "loss": 0.3743, "step": 43250 }, { "epoch": 0.25472298874973465, "grad_norm": 6.523279190063477, "learning_rate": 0.00015836987553880316, "loss": 0.3728, "step": 43500 }, { "epoch": 0.25618691397243426, "grad_norm": 5.352029800415039, "learning_rate": 0.00015805887804544298, "loss": 0.367, "step": 43750 }, { "epoch": 0.25765083919513393, "grad_norm": 8.408788681030273, "learning_rate": 0.00015774788055208277, "loss": 0.3694, "step": 44000 }, { "epoch": 0.25911476441783354, "grad_norm": 7.64408016204834, "learning_rate": 0.00015743688305872256, "loss": 0.3664, "step": 44250 }, { "epoch": 0.26057868964053316, "grad_norm": 4.888110637664795, "learning_rate": 0.00015712588556536234, "loss": 0.3637, "step": 44500 }, { "epoch": 0.26204261486323277, "grad_norm": 5.068843841552734, "learning_rate": 0.00015681488807200216, "loss": 0.369, "step": 44750 }, { "epoch": 0.2635065400859324, "grad_norm": 6.427637577056885, "learning_rate": 0.00015650389057864195, "loss": 0.3788, "step": 45000 }, { "epoch": 0.26497046530863205, "grad_norm": 8.00766658782959, "learning_rate": 0.00015619289308528174, "loss": 0.3638, "step": 45250 }, { "epoch": 0.26643439053133167, "grad_norm": 8.729680061340332, "learning_rate": 0.00015588189559192155, "loss": 0.3736, "step": 45500 }, { "epoch": 0.2678983157540313, "grad_norm": 10.317773818969727, "learning_rate": 0.00015557089809856134, "loss": 0.3618, "step": 45750 }, { "epoch": 0.2693622409767309, "grad_norm": 7.715869903564453, "learning_rate": 0.00015525990060520113, "loss": 0.3741, "step": 46000 }, { "epoch": 0.2708261661994305, "grad_norm": 5.711330890655518, "learning_rate": 0.00015494890311184092, "loss": 0.3745, "step": 46250 }, { "epoch": 0.2722900914221302, "grad_norm": 9.835432052612305, "learning_rate": 0.00015463790561848074, "loss": 0.3693, "step": 46500 }, { "epoch": 0.2737540166448298, "grad_norm": 6.019217014312744, "learning_rate": 0.00015432815211509395, "loss": 0.3674, "step": 46750 }, { "epoch": 0.2752179418675294, "grad_norm": 7.813283443450928, "learning_rate": 0.00015401715462173376, "loss": 0.3674, "step": 47000 }, { "epoch": 0.276681867090229, "grad_norm": 7.319979190826416, "learning_rate": 0.00015370615712837355, "loss": 0.3675, "step": 47250 }, { "epoch": 0.27814579231292863, "grad_norm": 8.74886703491211, "learning_rate": 0.00015339515963501334, "loss": 0.3633, "step": 47500 }, { "epoch": 0.2796097175356283, "grad_norm": 9.456360816955566, "learning_rate": 0.00015308416214165316, "loss": 0.379, "step": 47750 }, { "epoch": 0.2810736427583279, "grad_norm": 10.024221420288086, "learning_rate": 0.00015277316464829295, "loss": 0.375, "step": 48000 }, { "epoch": 0.2825375679810275, "grad_norm": 6.477073669433594, "learning_rate": 0.00015246216715493274, "loss": 0.3634, "step": 48250 }, { "epoch": 0.28400149320372714, "grad_norm": 8.587589263916016, "learning_rate": 0.00015215116966157255, "loss": 0.3693, "step": 48500 }, { "epoch": 0.28546541842642675, "grad_norm": 10.675822257995605, "learning_rate": 0.00015184017216821234, "loss": 0.3668, "step": 48750 }, { "epoch": 0.2869293436491264, "grad_norm": 10.77786636352539, "learning_rate": 0.00015153041866482558, "loss": 0.3711, "step": 49000 }, { "epoch": 0.28839326887182604, "grad_norm": 7.768797874450684, "learning_rate": 0.00015121942117146537, "loss": 0.3692, "step": 49250 }, { "epoch": 0.28985719409452565, "grad_norm": 6.11573600769043, "learning_rate": 0.00015090842367810516, "loss": 0.3618, "step": 49500 }, { "epoch": 0.29132111931722526, "grad_norm": 7.369346618652344, "learning_rate": 0.00015059742618474495, "loss": 0.365, "step": 49750 }, { "epoch": 0.2927850445399249, "grad_norm": 10.559876441955566, "learning_rate": 0.00015028642869138476, "loss": 0.369, "step": 50000 }, { "epoch": 0.29424896976262455, "grad_norm": 6.763681888580322, "learning_rate": 0.00014997543119802455, "loss": 0.3723, "step": 50250 }, { "epoch": 0.29571289498532416, "grad_norm": 14.075911521911621, "learning_rate": 0.00014966443370466434, "loss": 0.3656, "step": 50500 }, { "epoch": 0.2971768202080238, "grad_norm": 7.817617893218994, "learning_rate": 0.00014935343621130416, "loss": 0.3745, "step": 50750 }, { "epoch": 0.2986407454307234, "grad_norm": 5.018287181854248, "learning_rate": 0.00014904243871794395, "loss": 0.3664, "step": 51000 }, { "epoch": 0.300104670653423, "grad_norm": 9.846301078796387, "learning_rate": 0.00014873144122458373, "loss": 0.3644, "step": 51250 }, { "epoch": 0.30156859587612267, "grad_norm": 8.65786361694336, "learning_rate": 0.00014842044373122352, "loss": 0.3698, "step": 51500 }, { "epoch": 0.3030325210988223, "grad_norm": 6.303979873657227, "learning_rate": 0.00014810944623786334, "loss": 0.3707, "step": 51750 }, { "epoch": 0.3044964463215219, "grad_norm": 39.32520294189453, "learning_rate": 0.00014779844874450313, "loss": 0.3617, "step": 52000 }, { "epoch": 0.3059603715442215, "grad_norm": 6.535865306854248, "learning_rate": 0.00014748869524111637, "loss": 0.3642, "step": 52250 }, { "epoch": 0.3074242967669211, "grad_norm": 6.031300067901611, "learning_rate": 0.00014717769774775616, "loss": 0.363, "step": 52500 }, { "epoch": 0.3088882219896208, "grad_norm": 7.255093097686768, "learning_rate": 0.00014686670025439595, "loss": 0.3594, "step": 52750 }, { "epoch": 0.3103521472123204, "grad_norm": 7.491271018981934, "learning_rate": 0.00014655570276103576, "loss": 0.3697, "step": 53000 }, { "epoch": 0.31181607243502, "grad_norm": 8.154767036437988, "learning_rate": 0.00014624470526767555, "loss": 0.3667, "step": 53250 }, { "epoch": 0.31327999765771963, "grad_norm": 7.7836384773254395, "learning_rate": 0.00014593370777431534, "loss": 0.3756, "step": 53500 }, { "epoch": 0.31474392288041925, "grad_norm": 7.439420223236084, "learning_rate": 0.00014562271028095513, "loss": 0.3734, "step": 53750 }, { "epoch": 0.3162078481031189, "grad_norm": 7.654810428619385, "learning_rate": 0.00014531171278759494, "loss": 0.3689, "step": 54000 }, { "epoch": 0.31767177332581853, "grad_norm": 4.918389320373535, "learning_rate": 0.00014500195928420816, "loss": 0.3688, "step": 54250 }, { "epoch": 0.31913569854851814, "grad_norm": 6.2310895919799805, "learning_rate": 0.00014469096179084797, "loss": 0.3711, "step": 54500 }, { "epoch": 0.32059962377121776, "grad_norm": 7.458713054656982, "learning_rate": 0.00014437996429748776, "loss": 0.3614, "step": 54750 }, { "epoch": 0.32206354899391737, "grad_norm": 6.790125370025635, "learning_rate": 0.00014406896680412755, "loss": 0.3635, "step": 55000 }, { "epoch": 0.32206354899391737, "eval_accuracy": 0.8905084935576763, "eval_loss": 0.362331748008728, "eval_runtime": 11551.2138, "eval_samples_per_second": 210.262, "eval_steps_per_second": 6.571, "step": 55000 }, { "epoch": 0.32352747421661704, "grad_norm": 7.128218650817871, "learning_rate": 0.00014375796931076737, "loss": 0.357, "step": 55250 }, { "epoch": 0.32499139943931665, "grad_norm": 4.943136692047119, "learning_rate": 0.00014344697181740715, "loss": 0.3576, "step": 55500 }, { "epoch": 0.32645532466201627, "grad_norm": 7.633016109466553, "learning_rate": 0.00014313597432404694, "loss": 0.3655, "step": 55750 }, { "epoch": 0.3279192498847159, "grad_norm": 9.49149227142334, "learning_rate": 0.00014282497683068673, "loss": 0.3687, "step": 56000 }, { "epoch": 0.3293831751074155, "grad_norm": 7.4215521812438965, "learning_rate": 0.00014251397933732655, "loss": 0.3705, "step": 56250 }, { "epoch": 0.33084710033011516, "grad_norm": 5.638499736785889, "learning_rate": 0.00014220298184396634, "loss": 0.3709, "step": 56500 }, { "epoch": 0.3323110255528148, "grad_norm": 9.440450668334961, "learning_rate": 0.00014189198435060613, "loss": 0.35, "step": 56750 }, { "epoch": 0.3337749507755144, "grad_norm": 7.706991195678711, "learning_rate": 0.00014158098685724594, "loss": 0.3601, "step": 57000 }, { "epoch": 0.335238875998214, "grad_norm": 8.154605865478516, "learning_rate": 0.00014126998936388573, "loss": 0.3625, "step": 57250 }, { "epoch": 0.3367028012209136, "grad_norm": 7.608438491821289, "learning_rate": 0.00014095899187052552, "loss": 0.3588, "step": 57500 }, { "epoch": 0.3381667264436133, "grad_norm": 5.466573715209961, "learning_rate": 0.00014064799437716534, "loss": 0.3528, "step": 57750 }, { "epoch": 0.3396306516663129, "grad_norm": 7.514803409576416, "learning_rate": 0.00014033699688380512, "loss": 0.3624, "step": 58000 }, { "epoch": 0.3410945768890125, "grad_norm": 4.846391677856445, "learning_rate": 0.00014002599939044491, "loss": 0.3525, "step": 58250 }, { "epoch": 0.3425585021117121, "grad_norm": 6.116271018981934, "learning_rate": 0.0001397150018970847, "loss": 0.3556, "step": 58500 }, { "epoch": 0.34402242733441174, "grad_norm": 7.234938621520996, "learning_rate": 0.00013940400440372452, "loss": 0.3723, "step": 58750 }, { "epoch": 0.3454863525571114, "grad_norm": 8.690266609191895, "learning_rate": 0.0001390930069103643, "loss": 0.3671, "step": 59000 }, { "epoch": 0.346950277779811, "grad_norm": 5.558066368103027, "learning_rate": 0.0001387820094170041, "loss": 0.3563, "step": 59250 }, { "epoch": 0.34841420300251064, "grad_norm": 5.277857303619385, "learning_rate": 0.0001384710119236439, "loss": 0.3633, "step": 59500 }, { "epoch": 0.34987812822521025, "grad_norm": 4.810859680175781, "learning_rate": 0.00013816125842025712, "loss": 0.3615, "step": 59750 }, { "epoch": 0.35134205344790986, "grad_norm": 6.860721111297607, "learning_rate": 0.00013785026092689694, "loss": 0.3561, "step": 60000 }, { "epoch": 0.35280597867060953, "grad_norm": 6.673612117767334, "learning_rate": 0.00013753926343353673, "loss": 0.3513, "step": 60250 }, { "epoch": 0.35426990389330915, "grad_norm": 6.9296956062316895, "learning_rate": 0.00013722826594017652, "loss": 0.3563, "step": 60500 }, { "epoch": 0.35573382911600876, "grad_norm": 6.235531806945801, "learning_rate": 0.0001369172684468163, "loss": 0.3586, "step": 60750 }, { "epoch": 0.3571977543387084, "grad_norm": 6.549998760223389, "learning_rate": 0.00013660627095345612, "loss": 0.3572, "step": 61000 }, { "epoch": 0.358661679561408, "grad_norm": 6.800797939300537, "learning_rate": 0.0001362952734600959, "loss": 0.3687, "step": 61250 }, { "epoch": 0.3601256047841076, "grad_norm": 5.545276641845703, "learning_rate": 0.0001359842759667357, "loss": 0.3539, "step": 61500 }, { "epoch": 0.36158953000680727, "grad_norm": 8.63070011138916, "learning_rate": 0.00013567327847337552, "loss": 0.3605, "step": 61750 }, { "epoch": 0.3630534552295069, "grad_norm": 5.199543476104736, "learning_rate": 0.0001353622809800153, "loss": 0.3559, "step": 62000 }, { "epoch": 0.3645173804522065, "grad_norm": 27.297420501708984, "learning_rate": 0.0001350512834866551, "loss": 0.3676, "step": 62250 }, { "epoch": 0.3659813056749061, "grad_norm": 8.235854148864746, "learning_rate": 0.00013474152998326833, "loss": 0.3583, "step": 62500 }, { "epoch": 0.3674452308976057, "grad_norm": 6.224372386932373, "learning_rate": 0.00013443053248990812, "loss": 0.3623, "step": 62750 }, { "epoch": 0.3689091561203054, "grad_norm": 8.013957977294922, "learning_rate": 0.0001341195349965479, "loss": 0.3619, "step": 63000 }, { "epoch": 0.370373081343005, "grad_norm": 6.442314147949219, "learning_rate": 0.00013380853750318773, "loss": 0.3586, "step": 63250 }, { "epoch": 0.3718370065657046, "grad_norm": 6.883063793182373, "learning_rate": 0.00013349754000982752, "loss": 0.3635, "step": 63500 }, { "epoch": 0.37330093178840423, "grad_norm": 5.502562999725342, "learning_rate": 0.0001331865425164673, "loss": 0.3525, "step": 63750 }, { "epoch": 0.37476485701110385, "grad_norm": 6.841543197631836, "learning_rate": 0.00013287554502310712, "loss": 0.3564, "step": 64000 }, { "epoch": 0.3762287822338035, "grad_norm": 6.850903034210205, "learning_rate": 0.0001325645475297469, "loss": 0.3549, "step": 64250 }, { "epoch": 0.37769270745650313, "grad_norm": 5.823826313018799, "learning_rate": 0.00013225479402636015, "loss": 0.3488, "step": 64500 }, { "epoch": 0.37915663267920274, "grad_norm": 9.849250793457031, "learning_rate": 0.00013194379653299997, "loss": 0.3526, "step": 64750 }, { "epoch": 0.38062055790190236, "grad_norm": 7.8498992919921875, "learning_rate": 0.00013163279903963975, "loss": 0.3596, "step": 65000 }, { "epoch": 0.38208448312460197, "grad_norm": 7.845436096191406, "learning_rate": 0.00013132180154627954, "loss": 0.3497, "step": 65250 }, { "epoch": 0.38354840834730164, "grad_norm": 10.533845901489258, "learning_rate": 0.00013101080405291933, "loss": 0.3523, "step": 65500 }, { "epoch": 0.38501233357000125, "grad_norm": 9.09399127960205, "learning_rate": 0.00013069980655955912, "loss": 0.347, "step": 65750 }, { "epoch": 0.38647625879270087, "grad_norm": 7.205333232879639, "learning_rate": 0.00013038880906619894, "loss": 0.355, "step": 66000 }, { "epoch": 0.3879401840154005, "grad_norm": 6.770249843597412, "learning_rate": 0.00013007781157283873, "loss": 0.3549, "step": 66250 }, { "epoch": 0.3894041092381001, "grad_norm": 8.14482593536377, "learning_rate": 0.00012976681407947851, "loss": 0.3537, "step": 66500 }, { "epoch": 0.39086803446079976, "grad_norm": 5.998184680938721, "learning_rate": 0.0001294558165861183, "loss": 0.3562, "step": 66750 }, { "epoch": 0.3923319596834994, "grad_norm": 5.583696365356445, "learning_rate": 0.00012914481909275812, "loss": 0.3499, "step": 67000 }, { "epoch": 0.393795884906199, "grad_norm": 6.899207592010498, "learning_rate": 0.0001288338215993979, "loss": 0.3506, "step": 67250 }, { "epoch": 0.3952598101288986, "grad_norm": 6.205395221710205, "learning_rate": 0.0001285228241060377, "loss": 0.3512, "step": 67500 }, { "epoch": 0.3967237353515982, "grad_norm": 9.125551223754883, "learning_rate": 0.0001282118266126775, "loss": 0.3585, "step": 67750 }, { "epoch": 0.3981876605742979, "grad_norm": 6.943772792816162, "learning_rate": 0.0001279008291193173, "loss": 0.362, "step": 68000 }, { "epoch": 0.3996515857969975, "grad_norm": 6.106304168701172, "learning_rate": 0.0001275898316259571, "loss": 0.3545, "step": 68250 }, { "epoch": 0.4011155110196971, "grad_norm": 6.197811126708984, "learning_rate": 0.00012728007812257036, "loss": 0.3524, "step": 68500 }, { "epoch": 0.4025794362423967, "grad_norm": 8.07652759552002, "learning_rate": 0.00012696908062921015, "loss": 0.3467, "step": 68750 }, { "epoch": 0.40404336146509634, "grad_norm": 7.444363117218018, "learning_rate": 0.00012665808313584994, "loss": 0.3541, "step": 69000 }, { "epoch": 0.405507286687796, "grad_norm": 6.2395782470703125, "learning_rate": 0.00012634708564248972, "loss": 0.3488, "step": 69250 }, { "epoch": 0.4069712119104956, "grad_norm": 7.489956378936768, "learning_rate": 0.00012603608814912954, "loss": 0.3595, "step": 69500 }, { "epoch": 0.40843513713319524, "grad_norm": 6.762283802032471, "learning_rate": 0.00012572509065576933, "loss": 0.3555, "step": 69750 }, { "epoch": 0.40989906235589485, "grad_norm": 10.423229217529297, "learning_rate": 0.00012541409316240912, "loss": 0.3474, "step": 70000 }, { "epoch": 0.41136298757859446, "grad_norm": 7.812709331512451, "learning_rate": 0.0001251030956690489, "loss": 0.3588, "step": 70250 }, { "epoch": 0.41282691280129413, "grad_norm": 8.506246566772461, "learning_rate": 0.00012479334216566215, "loss": 0.3473, "step": 70500 }, { "epoch": 0.41429083802399375, "grad_norm": 6.0005784034729, "learning_rate": 0.00012448234467230196, "loss": 0.3423, "step": 70750 }, { "epoch": 0.41575476324669336, "grad_norm": 7.6112494468688965, "learning_rate": 0.00012417134717894175, "loss": 0.3469, "step": 71000 }, { "epoch": 0.417218688469393, "grad_norm": 6.460068225860596, "learning_rate": 0.00012386034968558154, "loss": 0.3514, "step": 71250 }, { "epoch": 0.4186826136920926, "grad_norm": 25.509037017822266, "learning_rate": 0.00012354935219222136, "loss": 0.3538, "step": 71500 }, { "epoch": 0.42014653891479226, "grad_norm": 5.778562068939209, "learning_rate": 0.00012323835469886114, "loss": 0.3409, "step": 71750 }, { "epoch": 0.42161046413749187, "grad_norm": 10.19543170928955, "learning_rate": 0.00012292735720550093, "loss": 0.3487, "step": 72000 }, { "epoch": 0.4230743893601915, "grad_norm": 7.6341633796691895, "learning_rate": 0.00012261635971214072, "loss": 0.3477, "step": 72250 }, { "epoch": 0.4245383145828911, "grad_norm": 5.656210422515869, "learning_rate": 0.00012230536221878054, "loss": 0.353, "step": 72500 }, { "epoch": 0.4260022398055907, "grad_norm": 7.81094217300415, "learning_rate": 0.00012199436472542031, "loss": 0.3589, "step": 72750 }, { "epoch": 0.4274661650282904, "grad_norm": 5.924116611480713, "learning_rate": 0.0001216833672320601, "loss": 0.346, "step": 73000 }, { "epoch": 0.42893009025099, "grad_norm": 6.293444633483887, "learning_rate": 0.00012137236973869992, "loss": 0.3496, "step": 73250 }, { "epoch": 0.4303940154736896, "grad_norm": 9.766921997070312, "learning_rate": 0.00012106137224533971, "loss": 0.347, "step": 73500 }, { "epoch": 0.4318579406963892, "grad_norm": 5.998900890350342, "learning_rate": 0.0001207503747519795, "loss": 0.3465, "step": 73750 }, { "epoch": 0.43332186591908883, "grad_norm": 8.364704132080078, "learning_rate": 0.00012043937725861929, "loss": 0.3429, "step": 74000 }, { "epoch": 0.4347857911417885, "grad_norm": 5.508989334106445, "learning_rate": 0.0001201283797652591, "loss": 0.355, "step": 74250 }, { "epoch": 0.4362497163644881, "grad_norm": 6.357595443725586, "learning_rate": 0.00011981738227189889, "loss": 0.3504, "step": 74500 }, { "epoch": 0.43771364158718773, "grad_norm": 8.691376686096191, "learning_rate": 0.00011950762876851213, "loss": 0.3471, "step": 74750 }, { "epoch": 0.43917756680988734, "grad_norm": 11.246256828308105, "learning_rate": 0.00011919663127515193, "loss": 0.3487, "step": 75000 }, { "epoch": 0.44064149203258696, "grad_norm": 6.3526811599731445, "learning_rate": 0.00011888563378179172, "loss": 0.3414, "step": 75250 }, { "epoch": 0.4421054172552866, "grad_norm": 9.6268310546875, "learning_rate": 0.00011857463628843152, "loss": 0.3457, "step": 75500 }, { "epoch": 0.44356934247798624, "grad_norm": 8.093045234680176, "learning_rate": 0.00011826363879507131, "loss": 0.3515, "step": 75750 }, { "epoch": 0.44503326770068585, "grad_norm": 7.497385025024414, "learning_rate": 0.00011795264130171111, "loss": 0.3361, "step": 76000 }, { "epoch": 0.44649719292338547, "grad_norm": 8.374622344970703, "learning_rate": 0.00011764164380835092, "loss": 0.3552, "step": 76250 }, { "epoch": 0.4479611181460851, "grad_norm": 8.583603858947754, "learning_rate": 0.0001173306463149907, "loss": 0.3395, "step": 76500 }, { "epoch": 0.44942504336878475, "grad_norm": 5.933279991149902, "learning_rate": 0.0001170196488216305, "loss": 0.3539, "step": 76750 }, { "epoch": 0.45088896859148436, "grad_norm": 7.1400556564331055, "learning_rate": 0.00011670989531824375, "loss": 0.3556, "step": 77000 }, { "epoch": 0.452352893814184, "grad_norm": 6.4177374839782715, "learning_rate": 0.00011639889782488354, "loss": 0.34, "step": 77250 }, { "epoch": 0.4538168190368836, "grad_norm": 8.248872756958008, "learning_rate": 0.00011608790033152333, "loss": 0.3454, "step": 77500 }, { "epoch": 0.4552807442595832, "grad_norm": 6.789691925048828, "learning_rate": 0.00011577690283816314, "loss": 0.3506, "step": 77750 }, { "epoch": 0.4567446694822829, "grad_norm": 7.519604206085205, "learning_rate": 0.00011546590534480293, "loss": 0.3438, "step": 78000 }, { "epoch": 0.4582085947049825, "grad_norm": 11.287620544433594, "learning_rate": 0.00011515490785144272, "loss": 0.3536, "step": 78250 }, { "epoch": 0.4596725199276821, "grad_norm": 5.6864914894104, "learning_rate": 0.00011484391035808254, "loss": 0.348, "step": 78500 }, { "epoch": 0.4611364451503817, "grad_norm": 7.405890941619873, "learning_rate": 0.00011453291286472232, "loss": 0.3395, "step": 78750 }, { "epoch": 0.4626003703730813, "grad_norm": 5.379487991333008, "learning_rate": 0.00011422315936133556, "loss": 0.3463, "step": 79000 }, { "epoch": 0.46406429559578094, "grad_norm": 7.769617080688477, "learning_rate": 0.00011391216186797535, "loss": 0.3458, "step": 79250 }, { "epoch": 0.4655282208184806, "grad_norm": 9.26171875, "learning_rate": 0.00011360116437461514, "loss": 0.3394, "step": 79500 }, { "epoch": 0.4669921460411802, "grad_norm": 9.037941932678223, "learning_rate": 0.00011329016688125493, "loss": 0.349, "step": 79750 }, { "epoch": 0.46845607126387984, "grad_norm": 8.776792526245117, "learning_rate": 0.00011297916938789475, "loss": 0.3384, "step": 80000 }, { "epoch": 0.46991999648657945, "grad_norm": 6.737313270568848, "learning_rate": 0.00011266817189453454, "loss": 0.3472, "step": 80250 }, { "epoch": 0.47138392170927906, "grad_norm": 7.2374114990234375, "learning_rate": 0.00011235717440117432, "loss": 0.3434, "step": 80500 }, { "epoch": 0.47284784693197873, "grad_norm": 6.939677715301514, "learning_rate": 0.00011204617690781414, "loss": 0.3451, "step": 80750 }, { "epoch": 0.47431177215467835, "grad_norm": 4.702803611755371, "learning_rate": 0.00011173517941445393, "loss": 0.3508, "step": 81000 }, { "epoch": 0.47577569737737796, "grad_norm": 7.359582901000977, "learning_rate": 0.00011142418192109372, "loss": 0.3415, "step": 81250 }, { "epoch": 0.4772396226000776, "grad_norm": 8.404651641845703, "learning_rate": 0.00011111442841770696, "loss": 0.3438, "step": 81500 }, { "epoch": 0.4787035478227772, "grad_norm": 6.176925182342529, "learning_rate": 0.00011080343092434675, "loss": 0.3484, "step": 81750 }, { "epoch": 0.48016747304547686, "grad_norm": 8.614276885986328, "learning_rate": 0.00011049243343098655, "loss": 0.3525, "step": 82000 }, { "epoch": 0.48163139826817647, "grad_norm": 5.756929874420166, "learning_rate": 0.00011018143593762635, "loss": 0.3432, "step": 82250 }, { "epoch": 0.4830953234908761, "grad_norm": 7.686267852783203, "learning_rate": 0.00010987043844426614, "loss": 0.3508, "step": 82500 }, { "epoch": 0.4845592487135757, "grad_norm": 6.590146541595459, "learning_rate": 0.00010955944095090593, "loss": 0.3357, "step": 82750 }, { "epoch": 0.4860231739362753, "grad_norm": 7.363981246948242, "learning_rate": 0.00010924968744751918, "loss": 0.3469, "step": 83000 }, { "epoch": 0.487487099158975, "grad_norm": 5.942411422729492, "learning_rate": 0.00010893868995415897, "loss": 0.3464, "step": 83250 }, { "epoch": 0.4889510243816746, "grad_norm": 8.531744003295898, "learning_rate": 0.00010862769246079879, "loss": 0.3349, "step": 83500 }, { "epoch": 0.4904149496043742, "grad_norm": 20.821125030517578, "learning_rate": 0.00010831669496743858, "loss": 0.3434, "step": 83750 }, { "epoch": 0.4918788748270738, "grad_norm": 9.569067001342773, "learning_rate": 0.00010800569747407836, "loss": 0.3421, "step": 84000 }, { "epoch": 0.49334280004977343, "grad_norm": 7.6851725578308105, "learning_rate": 0.00010769469998071815, "loss": 0.3407, "step": 84250 }, { "epoch": 0.4948067252724731, "grad_norm": 9.591890335083008, "learning_rate": 0.00010738370248735797, "loss": 0.347, "step": 84500 }, { "epoch": 0.4962706504951727, "grad_norm": 5.16259765625, "learning_rate": 0.00010707270499399776, "loss": 0.3383, "step": 84750 }, { "epoch": 0.49773457571787233, "grad_norm": 4.6993794441223145, "learning_rate": 0.00010676170750063755, "loss": 0.3392, "step": 85000 }, { "epoch": 0.49919850094057194, "grad_norm": 6.331507682800293, "learning_rate": 0.00010645071000727735, "loss": 0.351, "step": 85250 }, { "epoch": 0.5006624261632716, "grad_norm": 7.329137325286865, "learning_rate": 0.00010613971251391714, "loss": 0.3486, "step": 85500 }, { "epoch": 0.5021263513859712, "grad_norm": 6.907947540283203, "learning_rate": 0.00010582871502055694, "loss": 0.3443, "step": 85750 }, { "epoch": 0.5035902766086708, "grad_norm": 4.780885696411133, "learning_rate": 0.00010551771752719674, "loss": 0.3401, "step": 86000 }, { "epoch": 0.5050542018313705, "grad_norm": 9.042526245117188, "learning_rate": 0.00010520672003383653, "loss": 0.3402, "step": 86250 }, { "epoch": 0.5065181270540701, "grad_norm": 5.397533416748047, "learning_rate": 0.00010489572254047632, "loss": 0.3392, "step": 86500 }, { "epoch": 0.5079820522767697, "grad_norm": 7.72251033782959, "learning_rate": 0.00010458472504711612, "loss": 0.3337, "step": 86750 }, { "epoch": 0.5094459774994693, "grad_norm": 7.379674434661865, "learning_rate": 0.00010427497154372936, "loss": 0.3457, "step": 87000 }, { "epoch": 0.510909902722169, "grad_norm": 7.123027801513672, "learning_rate": 0.00010396397405036915, "loss": 0.3311, "step": 87250 }, { "epoch": 0.5123738279448685, "grad_norm": 6.388451099395752, "learning_rate": 0.00010365297655700897, "loss": 0.3386, "step": 87500 }, { "epoch": 0.5138377531675682, "grad_norm": 8.933717727661133, "learning_rate": 0.00010334197906364876, "loss": 0.3377, "step": 87750 }, { "epoch": 0.5153016783902679, "grad_norm": 5.813757419586182, "learning_rate": 0.000103032225560262, "loss": 0.3368, "step": 88000 }, { "epoch": 0.5167656036129674, "grad_norm": 10.707741737365723, "learning_rate": 0.00010272122806690178, "loss": 0.3429, "step": 88250 }, { "epoch": 0.5182295288356671, "grad_norm": 7.433245658874512, "learning_rate": 0.00010241023057354157, "loss": 0.3457, "step": 88500 }, { "epoch": 0.5196934540583666, "grad_norm": 6.408331394195557, "learning_rate": 0.00010209923308018139, "loss": 0.3409, "step": 88750 }, { "epoch": 0.5211573792810663, "grad_norm": 7.5843987464904785, "learning_rate": 0.00010178823558682118, "loss": 0.3347, "step": 89000 }, { "epoch": 0.522621304503766, "grad_norm": 9.049858093261719, "learning_rate": 0.00010147723809346097, "loss": 0.3392, "step": 89250 }, { "epoch": 0.5240852297264655, "grad_norm": 8.207107543945312, "learning_rate": 0.00010116624060010076, "loss": 0.334, "step": 89500 }, { "epoch": 0.5255491549491652, "grad_norm": 6.511790752410889, "learning_rate": 0.00010085648709671401, "loss": 0.3462, "step": 89750 }, { "epoch": 0.5270130801718648, "grad_norm": 5.541443824768066, "learning_rate": 0.0001005454896033538, "loss": 0.3318, "step": 90000 }, { "epoch": 0.5284770053945644, "grad_norm": 6.216821670532227, "learning_rate": 0.0001002344921099936, "loss": 0.338, "step": 90250 }, { "epoch": 0.5299409306172641, "grad_norm": 5.138360977172852, "learning_rate": 9.992349461663339e-05, "loss": 0.3457, "step": 90500 }, { "epoch": 0.5314048558399637, "grad_norm": 8.401073455810547, "learning_rate": 9.961249712327319e-05, "loss": 0.3523, "step": 90750 }, { "epoch": 0.5328687810626633, "grad_norm": 8.749157905578613, "learning_rate": 9.930149962991298e-05, "loss": 0.3391, "step": 91000 }, { "epoch": 0.5343327062853629, "grad_norm": 7.809004783630371, "learning_rate": 9.899050213655278e-05, "loss": 0.3422, "step": 91250 }, { "epoch": 0.5357966315080626, "grad_norm": 7.649618148803711, "learning_rate": 9.867950464319257e-05, "loss": 0.3512, "step": 91500 }, { "epoch": 0.5372605567307622, "grad_norm": 8.770468711853027, "learning_rate": 9.836850714983237e-05, "loss": 0.3367, "step": 91750 }, { "epoch": 0.5387244819534618, "grad_norm": 8.32112979888916, "learning_rate": 9.805750965647216e-05, "loss": 0.3384, "step": 92000 }, { "epoch": 0.5401884071761615, "grad_norm": 9.602888107299805, "learning_rate": 9.774651216311197e-05, "loss": 0.3344, "step": 92250 }, { "epoch": 0.541652332398861, "grad_norm": 3.2295093536376953, "learning_rate": 9.743551466975177e-05, "loss": 0.3314, "step": 92500 }, { "epoch": 0.5431162576215607, "grad_norm": 5.456012725830078, "learning_rate": 9.712451717639156e-05, "loss": 0.3313, "step": 92750 }, { "epoch": 0.5445801828442604, "grad_norm": 7.777164936065674, "learning_rate": 9.681351968303136e-05, "loss": 0.3417, "step": 93000 }, { "epoch": 0.5460441080669599, "grad_norm": 10.10175895690918, "learning_rate": 9.650252218967115e-05, "loss": 0.3357, "step": 93250 }, { "epoch": 0.5475080332896596, "grad_norm": 8.296233177185059, "learning_rate": 9.619152469631095e-05, "loss": 0.3368, "step": 93500 }, { "epoch": 0.5489719585123591, "grad_norm": 5.55683708190918, "learning_rate": 9.588052720295075e-05, "loss": 0.3338, "step": 93750 }, { "epoch": 0.5504358837350588, "grad_norm": 5.92700719833374, "learning_rate": 9.556952970959054e-05, "loss": 0.3431, "step": 94000 }, { "epoch": 0.5518998089577585, "grad_norm": 5.411899089813232, "learning_rate": 9.525853221623034e-05, "loss": 0.3393, "step": 94250 }, { "epoch": 0.553363734180458, "grad_norm": 6.517271995544434, "learning_rate": 9.494753472287013e-05, "loss": 0.3332, "step": 94500 }, { "epoch": 0.5548276594031577, "grad_norm": 9.099715232849121, "learning_rate": 9.463653722950994e-05, "loss": 0.3343, "step": 94750 }, { "epoch": 0.5562915846258573, "grad_norm": 4.845067501068115, "learning_rate": 9.432553973614972e-05, "loss": 0.3344, "step": 95000 }, { "epoch": 0.5577555098485569, "grad_norm": 8.56153392791748, "learning_rate": 9.401454224278953e-05, "loss": 0.33, "step": 95250 }, { "epoch": 0.5592194350712566, "grad_norm": 7.1542439460754395, "learning_rate": 9.370354474942933e-05, "loss": 0.3186, "step": 95500 }, { "epoch": 0.5606833602939562, "grad_norm": 7.00217342376709, "learning_rate": 9.339254725606912e-05, "loss": 0.335, "step": 95750 }, { "epoch": 0.5621472855166558, "grad_norm": 7.365664482116699, "learning_rate": 9.308279375268236e-05, "loss": 0.3303, "step": 96000 }, { "epoch": 0.5636112107393554, "grad_norm": 8.063042640686035, "learning_rate": 9.277179625932215e-05, "loss": 0.3441, "step": 96250 }, { "epoch": 0.565075135962055, "grad_norm": 5.403791904449463, "learning_rate": 9.246079876596195e-05, "loss": 0.3318, "step": 96500 }, { "epoch": 0.5665390611847547, "grad_norm": 5.911950588226318, "learning_rate": 9.215104526257519e-05, "loss": 0.3327, "step": 96750 }, { "epoch": 0.5680029864074543, "grad_norm": 5.484018802642822, "learning_rate": 9.184004776921499e-05, "loss": 0.3384, "step": 97000 }, { "epoch": 0.569466911630154, "grad_norm": 4.785627365112305, "learning_rate": 9.152905027585478e-05, "loss": 0.3437, "step": 97250 }, { "epoch": 0.5709308368528535, "grad_norm": 7.17230749130249, "learning_rate": 9.121805278249458e-05, "loss": 0.3331, "step": 97500 }, { "epoch": 0.5723947620755532, "grad_norm": 7.777104377746582, "learning_rate": 9.090705528913437e-05, "loss": 0.3371, "step": 97750 }, { "epoch": 0.5738586872982528, "grad_norm": 6.8572001457214355, "learning_rate": 9.059605779577417e-05, "loss": 0.3397, "step": 98000 }, { "epoch": 0.5753226125209524, "grad_norm": 9.132293701171875, "learning_rate": 9.028506030241398e-05, "loss": 0.3421, "step": 98250 }, { "epoch": 0.5767865377436521, "grad_norm": 7.351444244384766, "learning_rate": 8.997406280905376e-05, "loss": 0.3315, "step": 98500 }, { "epoch": 0.5782504629663516, "grad_norm": 5.444695949554443, "learning_rate": 8.966306531569357e-05, "loss": 0.3313, "step": 98750 }, { "epoch": 0.5797143881890513, "grad_norm": 6.229501724243164, "learning_rate": 8.935206782233336e-05, "loss": 0.3321, "step": 99000 }, { "epoch": 0.581178313411751, "grad_norm": 4.431236743927002, "learning_rate": 8.904107032897316e-05, "loss": 0.3326, "step": 99250 }, { "epoch": 0.5826422386344505, "grad_norm": 4.78348445892334, "learning_rate": 8.873007283561296e-05, "loss": 0.3362, "step": 99500 }, { "epoch": 0.5841061638571502, "grad_norm": 5.964051723480225, "learning_rate": 8.841907534225275e-05, "loss": 0.3408, "step": 99750 }, { "epoch": 0.5855700890798498, "grad_norm": 5.310559272766113, "learning_rate": 8.810807784889255e-05, "loss": 0.3328, "step": 100000 }, { "epoch": 0.5870340143025494, "grad_norm": 4.985818862915039, "learning_rate": 8.779708035553234e-05, "loss": 0.337, "step": 100250 }, { "epoch": 0.5884979395252491, "grad_norm": 4.851356506347656, "learning_rate": 8.748608286217213e-05, "loss": 0.3314, "step": 100500 }, { "epoch": 0.5899618647479486, "grad_norm": 6.863201141357422, "learning_rate": 8.717508536881193e-05, "loss": 0.3231, "step": 100750 }, { "epoch": 0.5914257899706483, "grad_norm": 6.387337684631348, "learning_rate": 8.686533186542517e-05, "loss": 0.322, "step": 101000 }, { "epoch": 0.5928897151933479, "grad_norm": 7.897363662719727, "learning_rate": 8.655433437206496e-05, "loss": 0.3361, "step": 101250 }, { "epoch": 0.5943536404160475, "grad_norm": 5.876019477844238, "learning_rate": 8.624333687870476e-05, "loss": 0.3211, "step": 101500 }, { "epoch": 0.5958175656387472, "grad_norm": 4.175768852233887, "learning_rate": 8.593233938534457e-05, "loss": 0.3317, "step": 101750 }, { "epoch": 0.5972814908614468, "grad_norm": 6.496226787567139, "learning_rate": 8.562134189198435e-05, "loss": 0.3289, "step": 102000 }, { "epoch": 0.5987454160841464, "grad_norm": 7.092103004455566, "learning_rate": 8.531034439862416e-05, "loss": 0.3329, "step": 102250 }, { "epoch": 0.600209341306846, "grad_norm": 7.335963726043701, "learning_rate": 8.499934690526395e-05, "loss": 0.3305, "step": 102500 }, { "epoch": 0.6016732665295457, "grad_norm": 6.620415687561035, "learning_rate": 8.468834941190375e-05, "loss": 0.3324, "step": 102750 }, { "epoch": 0.6031371917522453, "grad_norm": 6.866759777069092, "learning_rate": 8.437735191854355e-05, "loss": 0.3395, "step": 103000 }, { "epoch": 0.6046011169749449, "grad_norm": 7.7242045402526855, "learning_rate": 8.406759841515678e-05, "loss": 0.3368, "step": 103250 }, { "epoch": 0.6060650421976446, "grad_norm": 6.402958869934082, "learning_rate": 8.375660092179658e-05, "loss": 0.3366, "step": 103500 }, { "epoch": 0.6075289674203441, "grad_norm": 6.456150531768799, "learning_rate": 8.344560342843637e-05, "loss": 0.3372, "step": 103750 }, { "epoch": 0.6089928926430438, "grad_norm": 7.6825971603393555, "learning_rate": 8.313460593507617e-05, "loss": 0.3331, "step": 104000 }, { "epoch": 0.6104568178657435, "grad_norm": 11.974824905395508, "learning_rate": 8.282360844171596e-05, "loss": 0.3317, "step": 104250 }, { "epoch": 0.611920743088443, "grad_norm": 5.445409774780273, "learning_rate": 8.251261094835576e-05, "loss": 0.3303, "step": 104500 }, { "epoch": 0.6133846683111427, "grad_norm": 8.099034309387207, "learning_rate": 8.220161345499555e-05, "loss": 0.3317, "step": 104750 }, { "epoch": 0.6148485935338422, "grad_norm": 21.789043426513672, "learning_rate": 8.189061596163535e-05, "loss": 0.3146, "step": 105000 }, { "epoch": 0.6163125187565419, "grad_norm": 6.879361152648926, "learning_rate": 8.158086245824859e-05, "loss": 0.3346, "step": 105250 }, { "epoch": 0.6177764439792416, "grad_norm": 5.477085113525391, "learning_rate": 8.126986496488838e-05, "loss": 0.3274, "step": 105500 }, { "epoch": 0.6192403692019411, "grad_norm": 6.2816667556762695, "learning_rate": 8.095886747152818e-05, "loss": 0.3271, "step": 105750 }, { "epoch": 0.6207042944246408, "grad_norm": 9.089285850524902, "learning_rate": 8.064786997816797e-05, "loss": 0.3351, "step": 106000 }, { "epoch": 0.6221682196473404, "grad_norm": 6.114886283874512, "learning_rate": 8.033687248480777e-05, "loss": 0.3296, "step": 106250 }, { "epoch": 0.62363214487004, "grad_norm": 7.2542548179626465, "learning_rate": 8.002587499144756e-05, "loss": 0.3246, "step": 106500 }, { "epoch": 0.6250960700927397, "grad_norm": 5.58528995513916, "learning_rate": 7.971487749808737e-05, "loss": 0.3327, "step": 106750 }, { "epoch": 0.6265599953154393, "grad_norm": 3.898178815841675, "learning_rate": 7.940388000472715e-05, "loss": 0.3291, "step": 107000 }, { "epoch": 0.6280239205381389, "grad_norm": 5.644820690155029, "learning_rate": 7.909288251136696e-05, "loss": 0.3281, "step": 107250 }, { "epoch": 0.6294878457608385, "grad_norm": 6.363776206970215, "learning_rate": 7.878188501800676e-05, "loss": 0.3304, "step": 107500 }, { "epoch": 0.6309517709835382, "grad_norm": 5.209687232971191, "learning_rate": 7.847213151462e-05, "loss": 0.3224, "step": 107750 }, { "epoch": 0.6324156962062378, "grad_norm": 6.911553382873535, "learning_rate": 7.81611340212598e-05, "loss": 0.3246, "step": 108000 }, { "epoch": 0.6338796214289374, "grad_norm": 7.6557111740112305, "learning_rate": 7.785013652789959e-05, "loss": 0.322, "step": 108250 }, { "epoch": 0.6353435466516371, "grad_norm": 7.857481002807617, "learning_rate": 7.753913903453939e-05, "loss": 0.3318, "step": 108500 }, { "epoch": 0.6368074718743366, "grad_norm": 5.911120891571045, "learning_rate": 7.722814154117918e-05, "loss": 0.325, "step": 108750 }, { "epoch": 0.6382713970970363, "grad_norm": 8.592209815979004, "learning_rate": 7.691714404781898e-05, "loss": 0.3209, "step": 109000 }, { "epoch": 0.639735322319736, "grad_norm": 6.824602127075195, "learning_rate": 7.660614655445879e-05, "loss": 0.3331, "step": 109250 }, { "epoch": 0.6411992475424355, "grad_norm": 6.813981056213379, "learning_rate": 7.629514906109858e-05, "loss": 0.3313, "step": 109500 }, { "epoch": 0.6426631727651352, "grad_norm": 5.7169671058654785, "learning_rate": 7.598539555771181e-05, "loss": 0.3206, "step": 109750 }, { "epoch": 0.6441270979878347, "grad_norm": 5.429720401763916, "learning_rate": 7.56743980643516e-05, "loss": 0.3192, "step": 110000 }, { "epoch": 0.6441270979878347, "eval_accuracy": 0.8997983351325891, "eval_loss": 0.3242824375629425, "eval_runtime": 11546.6804, "eval_samples_per_second": 210.345, "eval_steps_per_second": 6.573, "step": 110000 } ], "logging_steps": 250, "max_steps": 170773, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 55000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.8505890873482936e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }