{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 50, "global_step": 8124, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00036927621861152144, "grad_norm": 0.8675795197486877, "learning_rate": 4e-05, "loss": 2.3283, "step": 1 }, { "epoch": 0.0007385524372230429, "grad_norm": 0.825646698474884, "learning_rate": 8e-05, "loss": 2.4218, "step": 2 }, { "epoch": 0.0011078286558345643, "grad_norm": 0.9647901654243469, "learning_rate": 0.00012, "loss": 2.4091, "step": 3 }, { "epoch": 0.0014771048744460858, "grad_norm": 0.9235298037528992, "learning_rate": 0.00016, "loss": 2.4576, "step": 4 }, { "epoch": 0.001846381093057607, "grad_norm": 0.7539268732070923, "learning_rate": 0.0002, "loss": 2.1086, "step": 5 }, { "epoch": 0.0022156573116691287, "grad_norm": 0.897127091884613, "learning_rate": 0.0001999753664244365, "loss": 1.8471, "step": 6 }, { "epoch": 0.00258493353028065, "grad_norm": 1.0578811168670654, "learning_rate": 0.000199950732848873, "loss": 1.4978, "step": 7 }, { "epoch": 0.0029542097488921715, "grad_norm": 1.1795432567596436, "learning_rate": 0.00019992609927330953, "loss": 1.2179, "step": 8 }, { "epoch": 0.0033234859675036928, "grad_norm": 1.1029233932495117, "learning_rate": 0.00019990146569774604, "loss": 1.0425, "step": 9 }, { "epoch": 0.003692762186115214, "grad_norm": 0.9418121576309204, "learning_rate": 0.00019987683212218253, "loss": 0.7962, "step": 10 }, { "epoch": 0.004062038404726735, "grad_norm": 0.5863303542137146, "learning_rate": 0.00019985219854661904, "loss": 0.8584, "step": 11 }, { "epoch": 0.004431314623338257, "grad_norm": 0.5713561177253723, "learning_rate": 0.00019982756497105556, "loss": 0.7218, "step": 12 }, { "epoch": 0.0048005908419497785, "grad_norm": 0.39308950304985046, "learning_rate": 0.00019980293139549207, "loss": 0.6724, "step": 13 }, { "epoch": 0.0051698670605613, "grad_norm": 0.43338122963905334, "learning_rate": 0.00019977829781992856, "loss": 0.6111, "step": 14 }, { "epoch": 0.005539143279172821, "grad_norm": 0.38551896810531616, "learning_rate": 0.00019975366424436507, "loss": 0.6062, "step": 15 }, { "epoch": 0.005908419497784343, "grad_norm": 0.39907607436180115, "learning_rate": 0.00019972903066880156, "loss": 0.5282, "step": 16 }, { "epoch": 0.006277695716395864, "grad_norm": 0.4794689416885376, "learning_rate": 0.0001997043970932381, "loss": 0.5971, "step": 17 }, { "epoch": 0.0066469719350073855, "grad_norm": 0.8089517951011658, "learning_rate": 0.0001996797635176746, "loss": 0.6808, "step": 18 }, { "epoch": 0.007016248153618907, "grad_norm": 0.5165871977806091, "learning_rate": 0.0001996551299421111, "loss": 0.585, "step": 19 }, { "epoch": 0.007385524372230428, "grad_norm": 0.5364779829978943, "learning_rate": 0.0001996304963665476, "loss": 0.5776, "step": 20 }, { "epoch": 0.00775480059084195, "grad_norm": 0.509082019329071, "learning_rate": 0.0001996058627909841, "loss": 0.6039, "step": 21 }, { "epoch": 0.00812407680945347, "grad_norm": 0.5860568881034851, "learning_rate": 0.00019958122921542062, "loss": 0.5506, "step": 22 }, { "epoch": 0.008493353028064993, "grad_norm": 0.5222316384315491, "learning_rate": 0.00019955659563985714, "loss": 0.5856, "step": 23 }, { "epoch": 0.008862629246676515, "grad_norm": 0.5385368466377258, "learning_rate": 0.00019953196206429363, "loss": 0.5881, "step": 24 }, { "epoch": 0.009231905465288036, "grad_norm": 0.5340928435325623, "learning_rate": 0.00019950732848873014, "loss": 0.4904, "step": 25 }, { "epoch": 0.009601181683899557, "grad_norm": 0.5609019994735718, "learning_rate": 0.00019948269491316665, "loss": 0.4813, "step": 26 }, { "epoch": 0.009970457902511078, "grad_norm": 0.5047788023948669, "learning_rate": 0.00019945806133760317, "loss": 0.5821, "step": 27 }, { "epoch": 0.0103397341211226, "grad_norm": 0.4963848888874054, "learning_rate": 0.00019943342776203966, "loss": 0.6033, "step": 28 }, { "epoch": 0.01070901033973412, "grad_norm": 0.39464226365089417, "learning_rate": 0.00019940879418647617, "loss": 0.4476, "step": 29 }, { "epoch": 0.011078286558345642, "grad_norm": 0.3563077449798584, "learning_rate": 0.00019938416061091269, "loss": 0.4926, "step": 30 }, { "epoch": 0.011447562776957163, "grad_norm": 0.4271666407585144, "learning_rate": 0.0001993595270353492, "loss": 0.5853, "step": 31 }, { "epoch": 0.011816838995568686, "grad_norm": 0.40097787976264954, "learning_rate": 0.0001993348934597857, "loss": 0.5056, "step": 32 }, { "epoch": 0.012186115214180207, "grad_norm": 0.49196794629096985, "learning_rate": 0.0001993102598842222, "loss": 0.4884, "step": 33 }, { "epoch": 0.012555391432791729, "grad_norm": 0.3631584346294403, "learning_rate": 0.0001992856263086587, "loss": 0.4672, "step": 34 }, { "epoch": 0.01292466765140325, "grad_norm": 0.43581250309944153, "learning_rate": 0.00019926099273309523, "loss": 0.4092, "step": 35 }, { "epoch": 0.013293943870014771, "grad_norm": 0.3988689184188843, "learning_rate": 0.00019923635915753172, "loss": 0.5401, "step": 36 }, { "epoch": 0.013663220088626292, "grad_norm": 0.36282041668891907, "learning_rate": 0.00019921172558196823, "loss": 0.4813, "step": 37 }, { "epoch": 0.014032496307237814, "grad_norm": 0.4121813178062439, "learning_rate": 0.00019918709200640472, "loss": 0.4544, "step": 38 }, { "epoch": 0.014401772525849335, "grad_norm": 0.3579091727733612, "learning_rate": 0.00019916245843084124, "loss": 0.4316, "step": 39 }, { "epoch": 0.014771048744460856, "grad_norm": 0.4818468689918518, "learning_rate": 0.00019913782485527775, "loss": 0.5688, "step": 40 }, { "epoch": 0.015140324963072379, "grad_norm": 0.444948673248291, "learning_rate": 0.00019911319127971427, "loss": 0.5442, "step": 41 }, { "epoch": 0.0155096011816839, "grad_norm": 0.35346049070358276, "learning_rate": 0.00019908855770415075, "loss": 0.3857, "step": 42 }, { "epoch": 0.01587887740029542, "grad_norm": 0.4043765664100647, "learning_rate": 0.00019906392412858727, "loss": 0.5172, "step": 43 }, { "epoch": 0.01624815361890694, "grad_norm": 0.47345271706581116, "learning_rate": 0.00019903929055302378, "loss": 0.491, "step": 44 }, { "epoch": 0.016617429837518464, "grad_norm": 0.3552895784378052, "learning_rate": 0.0001990146569774603, "loss": 0.5305, "step": 45 }, { "epoch": 0.016986706056129987, "grad_norm": 0.36031079292297363, "learning_rate": 0.00019899002340189678, "loss": 0.4335, "step": 46 }, { "epoch": 0.017355982274741506, "grad_norm": 0.37877610325813293, "learning_rate": 0.0001989653898263333, "loss": 0.4634, "step": 47 }, { "epoch": 0.01772525849335303, "grad_norm": 0.3502812385559082, "learning_rate": 0.0001989407562507698, "loss": 0.4983, "step": 48 }, { "epoch": 0.01809453471196455, "grad_norm": 0.3954726457595825, "learning_rate": 0.00019891612267520633, "loss": 0.5041, "step": 49 }, { "epoch": 0.01846381093057607, "grad_norm": 0.35697075724601746, "learning_rate": 0.00019889148909964282, "loss": 0.5117, "step": 50 }, { "epoch": 0.01846381093057607, "eval_loss": 0.45282307267189026, "eval_runtime": 6.8472, "eval_samples_per_second": 7.302, "eval_steps_per_second": 1.022, "step": 50 }, { "epoch": 0.01883308714918759, "grad_norm": 0.36884281039237976, "learning_rate": 0.00019886685552407933, "loss": 0.4023, "step": 51 }, { "epoch": 0.019202363367799114, "grad_norm": 0.4169233739376068, "learning_rate": 0.00019884222194851582, "loss": 0.5357, "step": 52 }, { "epoch": 0.019571639586410634, "grad_norm": 0.45588135719299316, "learning_rate": 0.00019881758837295233, "loss": 0.5044, "step": 53 }, { "epoch": 0.019940915805022157, "grad_norm": 0.34777161478996277, "learning_rate": 0.00019879295479738885, "loss": 0.4362, "step": 54 }, { "epoch": 0.02031019202363368, "grad_norm": 0.29894888401031494, "learning_rate": 0.00019876832122182536, "loss": 0.395, "step": 55 }, { "epoch": 0.0206794682422452, "grad_norm": 0.34574243426322937, "learning_rate": 0.00019874368764626185, "loss": 0.4749, "step": 56 }, { "epoch": 0.021048744460856722, "grad_norm": 0.33335548639297485, "learning_rate": 0.00019871905407069836, "loss": 0.4977, "step": 57 }, { "epoch": 0.02141802067946824, "grad_norm": 0.3539446294307709, "learning_rate": 0.00019869442049513488, "loss": 0.4267, "step": 58 }, { "epoch": 0.021787296898079764, "grad_norm": 0.42208486795425415, "learning_rate": 0.0001986697869195714, "loss": 0.5042, "step": 59 }, { "epoch": 0.022156573116691284, "grad_norm": 0.740729808807373, "learning_rate": 0.00019864515334400788, "loss": 0.5147, "step": 60 }, { "epoch": 0.022525849335302807, "grad_norm": 0.40567144751548767, "learning_rate": 0.0001986205197684444, "loss": 0.4998, "step": 61 }, { "epoch": 0.022895125553914326, "grad_norm": 0.3556930720806122, "learning_rate": 0.0001985958861928809, "loss": 0.4965, "step": 62 }, { "epoch": 0.02326440177252585, "grad_norm": 0.3750048875808716, "learning_rate": 0.00019857125261731743, "loss": 0.5568, "step": 63 }, { "epoch": 0.023633677991137372, "grad_norm": 0.29892468452453613, "learning_rate": 0.0001985466190417539, "loss": 0.431, "step": 64 }, { "epoch": 0.024002954209748892, "grad_norm": 0.4254045784473419, "learning_rate": 0.00019852198546619043, "loss": 0.4617, "step": 65 }, { "epoch": 0.024372230428360415, "grad_norm": 0.37175410985946655, "learning_rate": 0.00019849735189062691, "loss": 0.4988, "step": 66 }, { "epoch": 0.024741506646971934, "grad_norm": 0.37746497988700867, "learning_rate": 0.00019847271831506346, "loss": 0.5082, "step": 67 }, { "epoch": 0.025110782865583457, "grad_norm": 0.32295873761177063, "learning_rate": 0.00019844808473949994, "loss": 0.3769, "step": 68 }, { "epoch": 0.025480059084194977, "grad_norm": 0.34700700640678406, "learning_rate": 0.00019842345116393646, "loss": 0.4392, "step": 69 }, { "epoch": 0.0258493353028065, "grad_norm": 0.47979068756103516, "learning_rate": 0.00019839881758837295, "loss": 0.5096, "step": 70 }, { "epoch": 0.02621861152141802, "grad_norm": 0.3435942828655243, "learning_rate": 0.00019837418401280946, "loss": 0.4908, "step": 71 }, { "epoch": 0.026587887740029542, "grad_norm": 0.43033188581466675, "learning_rate": 0.00019834955043724598, "loss": 0.4926, "step": 72 }, { "epoch": 0.026957163958641065, "grad_norm": 0.32151058316230774, "learning_rate": 0.0001983249168616825, "loss": 0.4244, "step": 73 }, { "epoch": 0.027326440177252585, "grad_norm": 0.37814804911613464, "learning_rate": 0.00019830028328611898, "loss": 0.55, "step": 74 }, { "epoch": 0.027695716395864108, "grad_norm": 0.39102527499198914, "learning_rate": 0.0001982756497105555, "loss": 0.5034, "step": 75 }, { "epoch": 0.028064992614475627, "grad_norm": 0.4471779763698578, "learning_rate": 0.000198251016134992, "loss": 0.543, "step": 76 }, { "epoch": 0.02843426883308715, "grad_norm": 0.4095175266265869, "learning_rate": 0.00019822638255942852, "loss": 0.5311, "step": 77 }, { "epoch": 0.02880354505169867, "grad_norm": 0.3491657078266144, "learning_rate": 0.000198201748983865, "loss": 0.5068, "step": 78 }, { "epoch": 0.029172821270310192, "grad_norm": 0.3270619511604309, "learning_rate": 0.00019817711540830152, "loss": 0.44, "step": 79 }, { "epoch": 0.029542097488921712, "grad_norm": 0.30704453587532043, "learning_rate": 0.000198152481832738, "loss": 0.4399, "step": 80 }, { "epoch": 0.029911373707533235, "grad_norm": 0.34257006645202637, "learning_rate": 0.00019812784825717455, "loss": 0.4092, "step": 81 }, { "epoch": 0.030280649926144758, "grad_norm": 0.3198525011539459, "learning_rate": 0.00019810321468161104, "loss": 0.4801, "step": 82 }, { "epoch": 0.030649926144756277, "grad_norm": 0.30465447902679443, "learning_rate": 0.00019807858110604756, "loss": 0.416, "step": 83 }, { "epoch": 0.0310192023633678, "grad_norm": 0.32561299204826355, "learning_rate": 0.00019805394753048404, "loss": 0.514, "step": 84 }, { "epoch": 0.03138847858197932, "grad_norm": 0.3592464327812195, "learning_rate": 0.00019802931395492056, "loss": 0.4441, "step": 85 }, { "epoch": 0.03175775480059084, "grad_norm": 0.36717188358306885, "learning_rate": 0.00019800468037935707, "loss": 0.4374, "step": 86 }, { "epoch": 0.03212703101920236, "grad_norm": 0.2966742217540741, "learning_rate": 0.0001979800468037936, "loss": 0.38, "step": 87 }, { "epoch": 0.03249630723781388, "grad_norm": 0.35723623633384705, "learning_rate": 0.00019795541322823007, "loss": 0.4626, "step": 88 }, { "epoch": 0.03286558345642541, "grad_norm": 0.31849217414855957, "learning_rate": 0.0001979307796526666, "loss": 0.387, "step": 89 }, { "epoch": 0.03323485967503693, "grad_norm": 0.32004714012145996, "learning_rate": 0.0001979061460771031, "loss": 0.4019, "step": 90 }, { "epoch": 0.03360413589364845, "grad_norm": 0.3383117616176605, "learning_rate": 0.00019788151250153962, "loss": 0.4433, "step": 91 }, { "epoch": 0.033973412112259974, "grad_norm": 0.3656468093395233, "learning_rate": 0.0001978568789259761, "loss": 0.4419, "step": 92 }, { "epoch": 0.03434268833087149, "grad_norm": 0.3365080654621124, "learning_rate": 0.00019783224535041262, "loss": 0.431, "step": 93 }, { "epoch": 0.03471196454948301, "grad_norm": 0.4304644465446472, "learning_rate": 0.00019780761177484913, "loss": 0.4476, "step": 94 }, { "epoch": 0.03508124076809453, "grad_norm": 0.43384629487991333, "learning_rate": 0.00019778297819928565, "loss": 0.4424, "step": 95 }, { "epoch": 0.03545051698670606, "grad_norm": 0.3593868315219879, "learning_rate": 0.00019775834462372214, "loss": 0.462, "step": 96 }, { "epoch": 0.03581979320531758, "grad_norm": 0.307407945394516, "learning_rate": 0.00019773371104815865, "loss": 0.4392, "step": 97 }, { "epoch": 0.0361890694239291, "grad_norm": 0.36906832456588745, "learning_rate": 0.00019770907747259514, "loss": 0.5025, "step": 98 }, { "epoch": 0.03655834564254062, "grad_norm": 0.28905150294303894, "learning_rate": 0.00019768444389703168, "loss": 0.4284, "step": 99 }, { "epoch": 0.03692762186115214, "grad_norm": 0.272885262966156, "learning_rate": 0.00019765981032146817, "loss": 0.3671, "step": 100 }, { "epoch": 0.03692762186115214, "eval_loss": 0.419514000415802, "eval_runtime": 5.8696, "eval_samples_per_second": 8.518, "eval_steps_per_second": 1.193, "step": 100 }, { "epoch": 0.03729689807976366, "grad_norm": 0.33638831973075867, "learning_rate": 0.00019763517674590468, "loss": 0.501, "step": 101 }, { "epoch": 0.03766617429837518, "grad_norm": 0.2842804789543152, "learning_rate": 0.00019761054317034117, "loss": 0.3818, "step": 102 }, { "epoch": 0.03803545051698671, "grad_norm": 0.36060819029808044, "learning_rate": 0.00019758590959477769, "loss": 0.389, "step": 103 }, { "epoch": 0.03840472673559823, "grad_norm": 0.3600110709667206, "learning_rate": 0.0001975612760192142, "loss": 0.4301, "step": 104 }, { "epoch": 0.03877400295420975, "grad_norm": 0.35801422595977783, "learning_rate": 0.00019753664244365071, "loss": 0.4298, "step": 105 }, { "epoch": 0.03914327917282127, "grad_norm": 0.43014129996299744, "learning_rate": 0.0001975120088680872, "loss": 0.4976, "step": 106 }, { "epoch": 0.039512555391432794, "grad_norm": 0.3420620560646057, "learning_rate": 0.00019748737529252372, "loss": 0.3749, "step": 107 }, { "epoch": 0.03988183161004431, "grad_norm": 0.3296439051628113, "learning_rate": 0.00019746274171696023, "loss": 0.4317, "step": 108 }, { "epoch": 0.04025110782865583, "grad_norm": 0.29001107811927795, "learning_rate": 0.00019743810814139675, "loss": 0.416, "step": 109 }, { "epoch": 0.04062038404726736, "grad_norm": 0.37443405389785767, "learning_rate": 0.00019741347456583323, "loss": 0.3803, "step": 110 }, { "epoch": 0.04098966026587888, "grad_norm": 0.3694715201854706, "learning_rate": 0.00019738884099026975, "loss": 0.4312, "step": 111 }, { "epoch": 0.0413589364844904, "grad_norm": 0.34137406945228577, "learning_rate": 0.00019736420741470624, "loss": 0.5141, "step": 112 }, { "epoch": 0.04172821270310192, "grad_norm": 0.3892885744571686, "learning_rate": 0.00019733957383914278, "loss": 0.4073, "step": 113 }, { "epoch": 0.042097488921713444, "grad_norm": 0.3243370056152344, "learning_rate": 0.00019731494026357927, "loss": 0.4549, "step": 114 }, { "epoch": 0.042466765140324964, "grad_norm": 0.31876978278160095, "learning_rate": 0.00019729030668801578, "loss": 0.4027, "step": 115 }, { "epoch": 0.04283604135893648, "grad_norm": 0.27553582191467285, "learning_rate": 0.00019726567311245227, "loss": 0.326, "step": 116 }, { "epoch": 0.043205317577548, "grad_norm": 0.30736929178237915, "learning_rate": 0.00019724103953688878, "loss": 0.4841, "step": 117 }, { "epoch": 0.04357459379615953, "grad_norm": 0.41499844193458557, "learning_rate": 0.0001972164059613253, "loss": 0.5411, "step": 118 }, { "epoch": 0.04394387001477105, "grad_norm": 0.3472537398338318, "learning_rate": 0.0001971917723857618, "loss": 0.3876, "step": 119 }, { "epoch": 0.04431314623338257, "grad_norm": 0.33110174536705017, "learning_rate": 0.0001971671388101983, "loss": 0.4174, "step": 120 }, { "epoch": 0.044682422451994094, "grad_norm": 0.27718454599380493, "learning_rate": 0.0001971425052346348, "loss": 0.339, "step": 121 }, { "epoch": 0.045051698670605614, "grad_norm": 0.2659473717212677, "learning_rate": 0.00019711787165907133, "loss": 0.3273, "step": 122 }, { "epoch": 0.04542097488921713, "grad_norm": 0.37134623527526855, "learning_rate": 0.00019709323808350784, "loss": 0.4924, "step": 123 }, { "epoch": 0.04579025110782865, "grad_norm": 0.32371461391448975, "learning_rate": 0.00019706860450794433, "loss": 0.384, "step": 124 }, { "epoch": 0.04615952732644018, "grad_norm": 0.31927135586738586, "learning_rate": 0.00019704397093238084, "loss": 0.4308, "step": 125 }, { "epoch": 0.0465288035450517, "grad_norm": 0.3606109619140625, "learning_rate": 0.00019701933735681736, "loss": 0.392, "step": 126 }, { "epoch": 0.04689807976366322, "grad_norm": 0.39174655079841614, "learning_rate": 0.00019699470378125387, "loss": 0.4944, "step": 127 }, { "epoch": 0.047267355982274745, "grad_norm": 0.379129022359848, "learning_rate": 0.00019697007020569036, "loss": 0.422, "step": 128 }, { "epoch": 0.047636632200886264, "grad_norm": 0.31061556935310364, "learning_rate": 0.00019694543663012688, "loss": 0.4159, "step": 129 }, { "epoch": 0.048005908419497784, "grad_norm": 0.35142982006073, "learning_rate": 0.00019692080305456336, "loss": 0.5237, "step": 130 }, { "epoch": 0.0483751846381093, "grad_norm": 0.352500855922699, "learning_rate": 0.0001968961694789999, "loss": 0.4278, "step": 131 }, { "epoch": 0.04874446085672083, "grad_norm": 0.3081618547439575, "learning_rate": 0.0001968715359034364, "loss": 0.4002, "step": 132 }, { "epoch": 0.04911373707533235, "grad_norm": 0.3187482953071594, "learning_rate": 0.0001968469023278729, "loss": 0.4478, "step": 133 }, { "epoch": 0.04948301329394387, "grad_norm": 0.3648607134819031, "learning_rate": 0.0001968222687523094, "loss": 0.3772, "step": 134 }, { "epoch": 0.04985228951255539, "grad_norm": 0.3242417871952057, "learning_rate": 0.0001967976351767459, "loss": 0.399, "step": 135 }, { "epoch": 0.050221565731166914, "grad_norm": 0.3742475211620331, "learning_rate": 0.00019677300160118242, "loss": 0.5924, "step": 136 }, { "epoch": 0.050590841949778434, "grad_norm": 0.3294001817703247, "learning_rate": 0.00019674836802561894, "loss": 0.4061, "step": 137 }, { "epoch": 0.05096011816838995, "grad_norm": 0.2923147976398468, "learning_rate": 0.00019672373445005543, "loss": 0.3732, "step": 138 }, { "epoch": 0.05132939438700148, "grad_norm": 0.28740236163139343, "learning_rate": 0.00019669910087449194, "loss": 0.3689, "step": 139 }, { "epoch": 0.051698670605613, "grad_norm": 0.3307899534702301, "learning_rate": 0.00019667446729892846, "loss": 0.4061, "step": 140 }, { "epoch": 0.05206794682422452, "grad_norm": 0.3527816832065582, "learning_rate": 0.00019664983372336497, "loss": 0.5354, "step": 141 }, { "epoch": 0.05243722304283604, "grad_norm": 0.32473066449165344, "learning_rate": 0.00019662520014780146, "loss": 0.393, "step": 142 }, { "epoch": 0.052806499261447565, "grad_norm": 0.3600977659225464, "learning_rate": 0.00019660056657223797, "loss": 0.4797, "step": 143 }, { "epoch": 0.053175775480059084, "grad_norm": 0.3709307610988617, "learning_rate": 0.00019657593299667446, "loss": 0.4527, "step": 144 }, { "epoch": 0.053545051698670604, "grad_norm": 0.31951725482940674, "learning_rate": 0.000196551299421111, "loss": 0.4234, "step": 145 }, { "epoch": 0.05391432791728213, "grad_norm": 0.34864845871925354, "learning_rate": 0.0001965266658455475, "loss": 0.4104, "step": 146 }, { "epoch": 0.05428360413589365, "grad_norm": 0.3169608414173126, "learning_rate": 0.000196502032269984, "loss": 0.4348, "step": 147 }, { "epoch": 0.05465288035450517, "grad_norm": 0.32839155197143555, "learning_rate": 0.0001964773986944205, "loss": 0.4174, "step": 148 }, { "epoch": 0.05502215657311669, "grad_norm": 0.31056901812553406, "learning_rate": 0.000196452765118857, "loss": 0.3643, "step": 149 }, { "epoch": 0.055391432791728215, "grad_norm": 0.3091226816177368, "learning_rate": 0.00019642813154329352, "loss": 0.353, "step": 150 }, { "epoch": 0.055391432791728215, "eval_loss": 0.4037952721118927, "eval_runtime": 5.8759, "eval_samples_per_second": 8.509, "eval_steps_per_second": 1.191, "step": 150 }, { "epoch": 0.055760709010339735, "grad_norm": 0.32927340269088745, "learning_rate": 0.00019640349796773004, "loss": 0.4333, "step": 151 }, { "epoch": 0.056129985228951254, "grad_norm": 0.37403640151023865, "learning_rate": 0.00019637886439216652, "loss": 0.5012, "step": 152 }, { "epoch": 0.056499261447562774, "grad_norm": 0.31805136799812317, "learning_rate": 0.00019635423081660304, "loss": 0.4309, "step": 153 }, { "epoch": 0.0568685376661743, "grad_norm": 0.2776757776737213, "learning_rate": 0.00019632959724103955, "loss": 0.3624, "step": 154 }, { "epoch": 0.05723781388478582, "grad_norm": 0.39969655871391296, "learning_rate": 0.00019630496366547607, "loss": 0.4735, "step": 155 }, { "epoch": 0.05760709010339734, "grad_norm": 0.3075847327709198, "learning_rate": 0.00019628033008991255, "loss": 0.4294, "step": 156 }, { "epoch": 0.057976366322008865, "grad_norm": 0.2900707423686981, "learning_rate": 0.00019625569651434907, "loss": 0.3997, "step": 157 }, { "epoch": 0.058345642540620385, "grad_norm": 0.3196985125541687, "learning_rate": 0.00019623106293878556, "loss": 0.4385, "step": 158 }, { "epoch": 0.058714918759231904, "grad_norm": 0.42510777711868286, "learning_rate": 0.0001962064293632221, "loss": 0.3615, "step": 159 }, { "epoch": 0.059084194977843424, "grad_norm": 0.31159719824790955, "learning_rate": 0.00019618179578765859, "loss": 0.4185, "step": 160 }, { "epoch": 0.05945347119645495, "grad_norm": 0.2744397222995758, "learning_rate": 0.0001961571622120951, "loss": 0.3571, "step": 161 }, { "epoch": 0.05982274741506647, "grad_norm": 0.3381613492965698, "learning_rate": 0.0001961325286365316, "loss": 0.4154, "step": 162 }, { "epoch": 0.06019202363367799, "grad_norm": 0.27710267901420593, "learning_rate": 0.00019610789506096813, "loss": 0.3451, "step": 163 }, { "epoch": 0.060561299852289516, "grad_norm": 0.3598025143146515, "learning_rate": 0.00019608326148540462, "loss": 0.4224, "step": 164 }, { "epoch": 0.060930576070901035, "grad_norm": 0.2834571897983551, "learning_rate": 0.00019605862790984113, "loss": 0.4527, "step": 165 }, { "epoch": 0.061299852289512555, "grad_norm": 0.31147488951683044, "learning_rate": 0.00019603399433427762, "loss": 0.4013, "step": 166 }, { "epoch": 0.061669128508124074, "grad_norm": 0.3192875385284424, "learning_rate": 0.00019600936075871413, "loss": 0.404, "step": 167 }, { "epoch": 0.0620384047267356, "grad_norm": 0.31411129236221313, "learning_rate": 0.00019598472718315065, "loss": 0.3808, "step": 168 }, { "epoch": 0.06240768094534712, "grad_norm": 0.32308077812194824, "learning_rate": 0.00019596009360758716, "loss": 0.4712, "step": 169 }, { "epoch": 0.06277695716395865, "grad_norm": 0.28216615319252014, "learning_rate": 0.00019593546003202365, "loss": 0.3801, "step": 170 }, { "epoch": 0.06314623338257017, "grad_norm": 0.31070226430892944, "learning_rate": 0.00019591082645646017, "loss": 0.4255, "step": 171 }, { "epoch": 0.06351550960118169, "grad_norm": 0.36081427335739136, "learning_rate": 0.00019588619288089668, "loss": 0.3502, "step": 172 }, { "epoch": 0.0638847858197932, "grad_norm": 0.3797888159751892, "learning_rate": 0.0001958615593053332, "loss": 0.4398, "step": 173 }, { "epoch": 0.06425406203840472, "grad_norm": 0.33760133385658264, "learning_rate": 0.00019583692572976968, "loss": 0.5073, "step": 174 }, { "epoch": 0.06462333825701624, "grad_norm": 0.3150223195552826, "learning_rate": 0.0001958122921542062, "loss": 0.3642, "step": 175 }, { "epoch": 0.06499261447562776, "grad_norm": 0.47819983959198, "learning_rate": 0.00019578765857864268, "loss": 0.4292, "step": 176 }, { "epoch": 0.0653618906942393, "grad_norm": 0.28102725744247437, "learning_rate": 0.00019576302500307923, "loss": 0.3867, "step": 177 }, { "epoch": 0.06573116691285082, "grad_norm": 0.357327401638031, "learning_rate": 0.00019573839142751571, "loss": 0.4362, "step": 178 }, { "epoch": 0.06610044313146234, "grad_norm": 0.2614378035068512, "learning_rate": 0.00019571375785195223, "loss": 0.3398, "step": 179 }, { "epoch": 0.06646971935007386, "grad_norm": 0.2837601900100708, "learning_rate": 0.00019568912427638872, "loss": 0.3158, "step": 180 }, { "epoch": 0.06683899556868537, "grad_norm": 0.2811811566352844, "learning_rate": 0.00019566449070082523, "loss": 0.3756, "step": 181 }, { "epoch": 0.0672082717872969, "grad_norm": 0.3223420977592468, "learning_rate": 0.00019563985712526175, "loss": 0.4041, "step": 182 }, { "epoch": 0.06757754800590841, "grad_norm": 0.3521338105201721, "learning_rate": 0.00019561522354969826, "loss": 0.5336, "step": 183 }, { "epoch": 0.06794682422451995, "grad_norm": 0.36033207178115845, "learning_rate": 0.00019559058997413475, "loss": 0.5104, "step": 184 }, { "epoch": 0.06831610044313147, "grad_norm": 0.3409123122692108, "learning_rate": 0.00019556595639857126, "loss": 0.4852, "step": 185 }, { "epoch": 0.06868537666174299, "grad_norm": 0.29176652431488037, "learning_rate": 0.00019554132282300778, "loss": 0.3441, "step": 186 }, { "epoch": 0.0690546528803545, "grad_norm": 0.2858871519565582, "learning_rate": 0.0001955166892474443, "loss": 0.3342, "step": 187 }, { "epoch": 0.06942392909896603, "grad_norm": 0.2597043514251709, "learning_rate": 0.00019549205567188078, "loss": 0.3095, "step": 188 }, { "epoch": 0.06979320531757754, "grad_norm": 0.30069851875305176, "learning_rate": 0.0001954674220963173, "loss": 0.3474, "step": 189 }, { "epoch": 0.07016248153618906, "grad_norm": 0.33916133642196655, "learning_rate": 0.00019544278852075378, "loss": 0.5525, "step": 190 }, { "epoch": 0.0705317577548006, "grad_norm": 0.3795078992843628, "learning_rate": 0.00019541815494519032, "loss": 0.5012, "step": 191 }, { "epoch": 0.07090103397341212, "grad_norm": 0.4375127851963043, "learning_rate": 0.0001953935213696268, "loss": 0.4266, "step": 192 }, { "epoch": 0.07127031019202364, "grad_norm": 0.3799235224723816, "learning_rate": 0.00019536888779406333, "loss": 0.5288, "step": 193 }, { "epoch": 0.07163958641063516, "grad_norm": 0.2929205894470215, "learning_rate": 0.0001953442542184998, "loss": 0.3563, "step": 194 }, { "epoch": 0.07200886262924668, "grad_norm": 0.3236566483974457, "learning_rate": 0.00019531962064293633, "loss": 0.3988, "step": 195 }, { "epoch": 0.0723781388478582, "grad_norm": 0.38911980390548706, "learning_rate": 0.00019529498706737284, "loss": 0.5104, "step": 196 }, { "epoch": 0.07274741506646971, "grad_norm": 0.3128013014793396, "learning_rate": 0.00019527035349180936, "loss": 0.4066, "step": 197 }, { "epoch": 0.07311669128508123, "grad_norm": 0.3556188642978668, "learning_rate": 0.00019524571991624584, "loss": 0.5096, "step": 198 }, { "epoch": 0.07348596750369277, "grad_norm": 0.2989361584186554, "learning_rate": 0.00019522108634068236, "loss": 0.33, "step": 199 }, { "epoch": 0.07385524372230429, "grad_norm": 0.5028315782546997, "learning_rate": 0.00019519645276511887, "loss": 0.5224, "step": 200 }, { "epoch": 0.07385524372230429, "eval_loss": 0.39015138149261475, "eval_runtime": 5.896, "eval_samples_per_second": 8.48, "eval_steps_per_second": 1.187, "step": 200 }, { "epoch": 0.0742245199409158, "grad_norm": 0.3487118184566498, "learning_rate": 0.0001951718191895554, "loss": 0.5037, "step": 201 }, { "epoch": 0.07459379615952733, "grad_norm": 0.34047648310661316, "learning_rate": 0.00019514718561399188, "loss": 0.439, "step": 202 }, { "epoch": 0.07496307237813885, "grad_norm": 0.32839900255203247, "learning_rate": 0.0001951225520384284, "loss": 0.4776, "step": 203 }, { "epoch": 0.07533234859675036, "grad_norm": 0.349039763212204, "learning_rate": 0.0001950979184628649, "loss": 0.4567, "step": 204 }, { "epoch": 0.07570162481536188, "grad_norm": 0.37458980083465576, "learning_rate": 0.00019507328488730142, "loss": 0.4025, "step": 205 }, { "epoch": 0.07607090103397342, "grad_norm": 0.32469749450683594, "learning_rate": 0.0001950486513117379, "loss": 0.4164, "step": 206 }, { "epoch": 0.07644017725258494, "grad_norm": 0.259811133146286, "learning_rate": 0.00019502401773617442, "loss": 0.3229, "step": 207 }, { "epoch": 0.07680945347119646, "grad_norm": 0.3223322629928589, "learning_rate": 0.0001949993841606109, "loss": 0.4247, "step": 208 }, { "epoch": 0.07717872968980798, "grad_norm": 0.29984578490257263, "learning_rate": 0.00019497475058504745, "loss": 0.3751, "step": 209 }, { "epoch": 0.0775480059084195, "grad_norm": 0.2897316515445709, "learning_rate": 0.00019495011700948394, "loss": 0.3907, "step": 210 }, { "epoch": 0.07791728212703102, "grad_norm": 0.31401652097702026, "learning_rate": 0.00019492548343392045, "loss": 0.42, "step": 211 }, { "epoch": 0.07828655834564253, "grad_norm": 0.29450908303260803, "learning_rate": 0.00019490084985835694, "loss": 0.4077, "step": 212 }, { "epoch": 0.07865583456425407, "grad_norm": 0.2941333055496216, "learning_rate": 0.00019487621628279346, "loss": 0.3752, "step": 213 }, { "epoch": 0.07902511078286559, "grad_norm": 0.3410256505012512, "learning_rate": 0.00019485158270722997, "loss": 0.4759, "step": 214 }, { "epoch": 0.0793943870014771, "grad_norm": 0.35839465260505676, "learning_rate": 0.00019482694913166648, "loss": 0.4277, "step": 215 }, { "epoch": 0.07976366322008863, "grad_norm": 0.3427143394947052, "learning_rate": 0.00019480231555610297, "loss": 0.4251, "step": 216 }, { "epoch": 0.08013293943870015, "grad_norm": 0.27888450026512146, "learning_rate": 0.0001947776819805395, "loss": 0.3859, "step": 217 }, { "epoch": 0.08050221565731167, "grad_norm": 0.36770373582839966, "learning_rate": 0.000194753048404976, "loss": 0.459, "step": 218 }, { "epoch": 0.08087149187592318, "grad_norm": 0.29189980030059814, "learning_rate": 0.00019472841482941252, "loss": 0.3102, "step": 219 }, { "epoch": 0.08124076809453472, "grad_norm": 0.3150429427623749, "learning_rate": 0.000194703781253849, "loss": 0.401, "step": 220 }, { "epoch": 0.08161004431314624, "grad_norm": 0.3211479187011719, "learning_rate": 0.00019467914767828552, "loss": 0.421, "step": 221 }, { "epoch": 0.08197932053175776, "grad_norm": 0.26583531498908997, "learning_rate": 0.000194654514102722, "loss": 0.388, "step": 222 }, { "epoch": 0.08234859675036928, "grad_norm": 0.321421355009079, "learning_rate": 0.00019462988052715855, "loss": 0.4625, "step": 223 }, { "epoch": 0.0827178729689808, "grad_norm": 0.31825941801071167, "learning_rate": 0.00019460524695159504, "loss": 0.4025, "step": 224 }, { "epoch": 0.08308714918759232, "grad_norm": 0.31772172451019287, "learning_rate": 0.00019458061337603155, "loss": 0.436, "step": 225 }, { "epoch": 0.08345642540620384, "grad_norm": 0.2731233835220337, "learning_rate": 0.00019455597980046804, "loss": 0.3841, "step": 226 }, { "epoch": 0.08382570162481537, "grad_norm": 0.28971999883651733, "learning_rate": 0.00019453134622490455, "loss": 0.3601, "step": 227 }, { "epoch": 0.08419497784342689, "grad_norm": 0.25201430916786194, "learning_rate": 0.00019450671264934107, "loss": 0.2993, "step": 228 }, { "epoch": 0.08456425406203841, "grad_norm": 0.31219369173049927, "learning_rate": 0.00019448207907377758, "loss": 0.3776, "step": 229 }, { "epoch": 0.08493353028064993, "grad_norm": 0.379317969083786, "learning_rate": 0.00019445744549821407, "loss": 0.5589, "step": 230 }, { "epoch": 0.08530280649926145, "grad_norm": 0.311305433511734, "learning_rate": 0.00019443281192265058, "loss": 0.4364, "step": 231 }, { "epoch": 0.08567208271787297, "grad_norm": 0.32585617899894714, "learning_rate": 0.0001944081783470871, "loss": 0.4224, "step": 232 }, { "epoch": 0.08604135893648449, "grad_norm": 0.26801955699920654, "learning_rate": 0.0001943835447715236, "loss": 0.3569, "step": 233 }, { "epoch": 0.086410635155096, "grad_norm": 0.3283174932003021, "learning_rate": 0.0001943589111959601, "loss": 0.4421, "step": 234 }, { "epoch": 0.08677991137370754, "grad_norm": 0.3212074935436249, "learning_rate": 0.00019433427762039661, "loss": 0.4274, "step": 235 }, { "epoch": 0.08714918759231906, "grad_norm": 0.3012539744377136, "learning_rate": 0.00019430964404483313, "loss": 0.3998, "step": 236 }, { "epoch": 0.08751846381093058, "grad_norm": 0.31821128726005554, "learning_rate": 0.00019428501046926962, "loss": 0.3862, "step": 237 }, { "epoch": 0.0878877400295421, "grad_norm": 0.3257669508457184, "learning_rate": 0.00019426037689370613, "loss": 0.5178, "step": 238 }, { "epoch": 0.08825701624815362, "grad_norm": 0.3112789988517761, "learning_rate": 0.00019423574331814262, "loss": 0.4182, "step": 239 }, { "epoch": 0.08862629246676514, "grad_norm": 0.25881633162498474, "learning_rate": 0.00019421110974257913, "loss": 0.3079, "step": 240 }, { "epoch": 0.08899556868537666, "grad_norm": 0.31627917289733887, "learning_rate": 0.00019418647616701565, "loss": 0.461, "step": 241 }, { "epoch": 0.08936484490398819, "grad_norm": 0.3195187449455261, "learning_rate": 0.00019416184259145216, "loss": 0.4439, "step": 242 }, { "epoch": 0.08973412112259971, "grad_norm": 0.3452574908733368, "learning_rate": 0.00019413720901588865, "loss": 0.4799, "step": 243 }, { "epoch": 0.09010339734121123, "grad_norm": 0.336542546749115, "learning_rate": 0.00019411257544032517, "loss": 0.4815, "step": 244 }, { "epoch": 0.09047267355982275, "grad_norm": 0.31545954942703247, "learning_rate": 0.00019408794186476168, "loss": 0.4452, "step": 245 }, { "epoch": 0.09084194977843427, "grad_norm": 0.3060772716999054, "learning_rate": 0.0001940633082891982, "loss": 0.4713, "step": 246 }, { "epoch": 0.09121122599704579, "grad_norm": 0.3096682131290436, "learning_rate": 0.00019403867471363468, "loss": 0.4104, "step": 247 }, { "epoch": 0.0915805022156573, "grad_norm": 0.2977633476257324, "learning_rate": 0.0001940140411380712, "loss": 0.4928, "step": 248 }, { "epoch": 0.09194977843426884, "grad_norm": 0.2890436053276062, "learning_rate": 0.00019398940756250768, "loss": 0.4116, "step": 249 }, { "epoch": 0.09231905465288036, "grad_norm": 0.2783840596675873, "learning_rate": 0.00019396477398694423, "loss": 0.3643, "step": 250 }, { "epoch": 0.09231905465288036, "eval_loss": 0.3816169500350952, "eval_runtime": 5.8658, "eval_samples_per_second": 8.524, "eval_steps_per_second": 1.193, "step": 250 }, { "epoch": 0.09268833087149188, "grad_norm": 0.3246957063674927, "learning_rate": 0.00019394014041138071, "loss": 0.477, "step": 251 }, { "epoch": 0.0930576070901034, "grad_norm": 0.3633597493171692, "learning_rate": 0.00019391550683581723, "loss": 0.4847, "step": 252 }, { "epoch": 0.09342688330871492, "grad_norm": 0.3402022421360016, "learning_rate": 0.00019389087326025372, "loss": 0.4732, "step": 253 }, { "epoch": 0.09379615952732644, "grad_norm": 0.40175941586494446, "learning_rate": 0.00019386623968469023, "loss": 0.3792, "step": 254 }, { "epoch": 0.09416543574593796, "grad_norm": 0.3117552101612091, "learning_rate": 0.00019384160610912675, "loss": 0.381, "step": 255 }, { "epoch": 0.09453471196454949, "grad_norm": 0.26914849877357483, "learning_rate": 0.00019381697253356326, "loss": 0.3325, "step": 256 }, { "epoch": 0.09490398818316101, "grad_norm": 0.3286375403404236, "learning_rate": 0.00019379233895799975, "loss": 0.4223, "step": 257 }, { "epoch": 0.09527326440177253, "grad_norm": 0.31570112705230713, "learning_rate": 0.00019376770538243626, "loss": 0.4549, "step": 258 }, { "epoch": 0.09564254062038405, "grad_norm": 0.3004911541938782, "learning_rate": 0.00019374307180687278, "loss": 0.422, "step": 259 }, { "epoch": 0.09601181683899557, "grad_norm": 0.316474974155426, "learning_rate": 0.0001937184382313093, "loss": 0.451, "step": 260 }, { "epoch": 0.09638109305760709, "grad_norm": 0.3089964687824249, "learning_rate": 0.00019369380465574578, "loss": 0.426, "step": 261 }, { "epoch": 0.0967503692762186, "grad_norm": 0.28488385677337646, "learning_rate": 0.0001936691710801823, "loss": 0.3753, "step": 262 }, { "epoch": 0.09711964549483014, "grad_norm": 0.27882590889930725, "learning_rate": 0.0001936445375046188, "loss": 0.3279, "step": 263 }, { "epoch": 0.09748892171344166, "grad_norm": 0.3286533057689667, "learning_rate": 0.00019361990392905532, "loss": 0.4219, "step": 264 }, { "epoch": 0.09785819793205318, "grad_norm": 0.3470388948917389, "learning_rate": 0.0001935952703534918, "loss": 0.4469, "step": 265 }, { "epoch": 0.0982274741506647, "grad_norm": 0.2946823537349701, "learning_rate": 0.00019357063677792832, "loss": 0.4202, "step": 266 }, { "epoch": 0.09859675036927622, "grad_norm": 0.35018935799598694, "learning_rate": 0.0001935460032023648, "loss": 0.4219, "step": 267 }, { "epoch": 0.09896602658788774, "grad_norm": 0.3789230287075043, "learning_rate": 0.00019352136962680135, "loss": 0.4787, "step": 268 }, { "epoch": 0.09933530280649926, "grad_norm": 0.26236382126808167, "learning_rate": 0.00019349673605123784, "loss": 0.3258, "step": 269 }, { "epoch": 0.09970457902511078, "grad_norm": 0.3044803738594055, "learning_rate": 0.00019347210247567436, "loss": 0.4202, "step": 270 }, { "epoch": 0.10007385524372231, "grad_norm": 0.34376615285873413, "learning_rate": 0.00019344746890011084, "loss": 0.3709, "step": 271 }, { "epoch": 0.10044313146233383, "grad_norm": 0.2787488102912903, "learning_rate": 0.00019342283532454736, "loss": 0.3019, "step": 272 }, { "epoch": 0.10081240768094535, "grad_norm": 0.2931845784187317, "learning_rate": 0.00019339820174898387, "loss": 0.3439, "step": 273 }, { "epoch": 0.10118168389955687, "grad_norm": 0.3124999701976776, "learning_rate": 0.0001933735681734204, "loss": 0.3801, "step": 274 }, { "epoch": 0.10155096011816839, "grad_norm": 0.34231653809547424, "learning_rate": 0.00019334893459785688, "loss": 0.4276, "step": 275 }, { "epoch": 0.1019202363367799, "grad_norm": 0.26137593388557434, "learning_rate": 0.0001933243010222934, "loss": 0.3719, "step": 276 }, { "epoch": 0.10228951255539143, "grad_norm": 0.30112504959106445, "learning_rate": 0.0001932996674467299, "loss": 0.4047, "step": 277 }, { "epoch": 0.10265878877400296, "grad_norm": 0.2984439730644226, "learning_rate": 0.00019327503387116642, "loss": 0.3506, "step": 278 }, { "epoch": 0.10302806499261448, "grad_norm": 0.3731374144554138, "learning_rate": 0.0001932504002956029, "loss": 0.439, "step": 279 }, { "epoch": 0.103397341211226, "grad_norm": 0.29310372471809387, "learning_rate": 0.00019322576672003942, "loss": 0.3963, "step": 280 }, { "epoch": 0.10376661742983752, "grad_norm": 0.3897750973701477, "learning_rate": 0.0001932011331444759, "loss": 0.4959, "step": 281 }, { "epoch": 0.10413589364844904, "grad_norm": 0.29394257068634033, "learning_rate": 0.00019317649956891245, "loss": 0.3666, "step": 282 }, { "epoch": 0.10450516986706056, "grad_norm": 0.3201434314250946, "learning_rate": 0.00019315186599334894, "loss": 0.3513, "step": 283 }, { "epoch": 0.10487444608567208, "grad_norm": 0.2828335165977478, "learning_rate": 0.00019312723241778545, "loss": 0.3175, "step": 284 }, { "epoch": 0.10524372230428361, "grad_norm": 0.2940762937068939, "learning_rate": 0.00019310259884222194, "loss": 0.3655, "step": 285 }, { "epoch": 0.10561299852289513, "grad_norm": 0.3026212751865387, "learning_rate": 0.00019307796526665845, "loss": 0.3552, "step": 286 }, { "epoch": 0.10598227474150665, "grad_norm": 0.3048368990421295, "learning_rate": 0.00019305333169109497, "loss": 0.4266, "step": 287 }, { "epoch": 0.10635155096011817, "grad_norm": 0.28596359491348267, "learning_rate": 0.00019302869811553148, "loss": 0.3745, "step": 288 }, { "epoch": 0.10672082717872969, "grad_norm": 0.46713870763778687, "learning_rate": 0.00019300406453996797, "loss": 0.5193, "step": 289 }, { "epoch": 0.10709010339734121, "grad_norm": 0.3674304485321045, "learning_rate": 0.00019297943096440449, "loss": 0.4062, "step": 290 }, { "epoch": 0.10745937961595273, "grad_norm": 0.3007575571537018, "learning_rate": 0.000192954797388841, "loss": 0.3723, "step": 291 }, { "epoch": 0.10782865583456426, "grad_norm": 0.3332579433917999, "learning_rate": 0.00019293016381327752, "loss": 0.3609, "step": 292 }, { "epoch": 0.10819793205317578, "grad_norm": 0.31988173723220825, "learning_rate": 0.000192905530237714, "loss": 0.4013, "step": 293 }, { "epoch": 0.1085672082717873, "grad_norm": 0.3104618787765503, "learning_rate": 0.00019288089666215052, "loss": 0.3623, "step": 294 }, { "epoch": 0.10893648449039882, "grad_norm": 0.35320624709129333, "learning_rate": 0.00019285626308658703, "loss": 0.2864, "step": 295 }, { "epoch": 0.10930576070901034, "grad_norm": 0.24095280468463898, "learning_rate": 0.00019283162951102355, "loss": 0.288, "step": 296 }, { "epoch": 0.10967503692762186, "grad_norm": 0.34680652618408203, "learning_rate": 0.00019280699593546003, "loss": 0.4466, "step": 297 }, { "epoch": 0.11004431314623338, "grad_norm": 0.31190916895866394, "learning_rate": 0.00019278236235989655, "loss": 0.3376, "step": 298 }, { "epoch": 0.11041358936484491, "grad_norm": 0.316723108291626, "learning_rate": 0.00019275772878433304, "loss": 0.4236, "step": 299 }, { "epoch": 0.11078286558345643, "grad_norm": 0.2838907837867737, "learning_rate": 0.00019273309520876958, "loss": 0.3508, "step": 300 }, { "epoch": 0.11078286558345643, "eval_loss": 0.3670826852321625, "eval_runtime": 5.8544, "eval_samples_per_second": 8.541, "eval_steps_per_second": 1.196, "step": 300 }, { "epoch": 0.11115214180206795, "grad_norm": 0.3103469908237457, "learning_rate": 0.00019270846163320607, "loss": 0.4048, "step": 301 }, { "epoch": 0.11152141802067947, "grad_norm": 0.3298850655555725, "learning_rate": 0.00019268382805764258, "loss": 0.4088, "step": 302 }, { "epoch": 0.11189069423929099, "grad_norm": 0.2599942684173584, "learning_rate": 0.00019265919448207907, "loss": 0.3307, "step": 303 }, { "epoch": 0.11225997045790251, "grad_norm": 0.36178743839263916, "learning_rate": 0.00019263456090651558, "loss": 0.3637, "step": 304 }, { "epoch": 0.11262924667651403, "grad_norm": 0.38140785694122314, "learning_rate": 0.0001926099273309521, "loss": 0.5136, "step": 305 }, { "epoch": 0.11299852289512555, "grad_norm": 0.3953641355037689, "learning_rate": 0.0001925852937553886, "loss": 0.4494, "step": 306 }, { "epoch": 0.11336779911373708, "grad_norm": 0.34380874037742615, "learning_rate": 0.0001925606601798251, "loss": 0.3824, "step": 307 }, { "epoch": 0.1137370753323486, "grad_norm": 0.29729288816452026, "learning_rate": 0.00019253602660426161, "loss": 0.3685, "step": 308 }, { "epoch": 0.11410635155096012, "grad_norm": 0.7438095211982727, "learning_rate": 0.00019251139302869813, "loss": 0.3982, "step": 309 }, { "epoch": 0.11447562776957164, "grad_norm": 0.3309866487979889, "learning_rate": 0.00019248675945313464, "loss": 0.4468, "step": 310 }, { "epoch": 0.11484490398818316, "grad_norm": 0.26817625761032104, "learning_rate": 0.00019246212587757113, "loss": 0.3913, "step": 311 }, { "epoch": 0.11521418020679468, "grad_norm": 0.31022658944129944, "learning_rate": 0.00019243749230200765, "loss": 0.4418, "step": 312 }, { "epoch": 0.1155834564254062, "grad_norm": 0.32103851437568665, "learning_rate": 0.00019241285872644413, "loss": 0.4338, "step": 313 }, { "epoch": 0.11595273264401773, "grad_norm": 0.25964781641960144, "learning_rate": 0.00019238822515088068, "loss": 0.2941, "step": 314 }, { "epoch": 0.11632200886262925, "grad_norm": 0.3442859351634979, "learning_rate": 0.00019236359157531716, "loss": 0.4596, "step": 315 }, { "epoch": 0.11669128508124077, "grad_norm": 0.3221491873264313, "learning_rate": 0.00019233895799975368, "loss": 0.4285, "step": 316 }, { "epoch": 0.11706056129985229, "grad_norm": 0.32146963477134705, "learning_rate": 0.00019231432442419016, "loss": 0.3989, "step": 317 }, { "epoch": 0.11742983751846381, "grad_norm": 0.3061736524105072, "learning_rate": 0.00019228969084862668, "loss": 0.3835, "step": 318 }, { "epoch": 0.11779911373707533, "grad_norm": 0.3913908302783966, "learning_rate": 0.0001922650572730632, "loss": 0.4835, "step": 319 }, { "epoch": 0.11816838995568685, "grad_norm": 0.31400611996650696, "learning_rate": 0.0001922404236974997, "loss": 0.4233, "step": 320 }, { "epoch": 0.11853766617429838, "grad_norm": 0.36072105169296265, "learning_rate": 0.0001922157901219362, "loss": 0.4879, "step": 321 }, { "epoch": 0.1189069423929099, "grad_norm": 0.3598588705062866, "learning_rate": 0.0001921911565463727, "loss": 0.4645, "step": 322 }, { "epoch": 0.11927621861152142, "grad_norm": 0.3355506956577301, "learning_rate": 0.00019216652297080923, "loss": 0.3992, "step": 323 }, { "epoch": 0.11964549483013294, "grad_norm": 0.2957211136817932, "learning_rate": 0.00019214188939524574, "loss": 0.3671, "step": 324 }, { "epoch": 0.12001477104874446, "grad_norm": 0.29227033257484436, "learning_rate": 0.00019211725581968223, "loss": 0.3793, "step": 325 }, { "epoch": 0.12038404726735598, "grad_norm": 0.2803254723548889, "learning_rate": 0.00019209262224411874, "loss": 0.3603, "step": 326 }, { "epoch": 0.1207533234859675, "grad_norm": 0.5801281332969666, "learning_rate": 0.00019206798866855523, "loss": 0.5687, "step": 327 }, { "epoch": 0.12112259970457903, "grad_norm": 0.31717267632484436, "learning_rate": 0.00019204335509299177, "loss": 0.3732, "step": 328 }, { "epoch": 0.12149187592319055, "grad_norm": 0.33161666989326477, "learning_rate": 0.00019201872151742826, "loss": 0.3671, "step": 329 }, { "epoch": 0.12186115214180207, "grad_norm": 0.32501599192619324, "learning_rate": 0.00019199408794186477, "loss": 0.4167, "step": 330 }, { "epoch": 0.12223042836041359, "grad_norm": 0.309461385011673, "learning_rate": 0.00019196945436630126, "loss": 0.3919, "step": 331 }, { "epoch": 0.12259970457902511, "grad_norm": 0.29599183797836304, "learning_rate": 0.00019194482079073778, "loss": 0.4568, "step": 332 }, { "epoch": 0.12296898079763663, "grad_norm": 0.38673701882362366, "learning_rate": 0.0001919201872151743, "loss": 0.4347, "step": 333 }, { "epoch": 0.12333825701624815, "grad_norm": 0.3297078013420105, "learning_rate": 0.0001918955536396108, "loss": 0.4622, "step": 334 }, { "epoch": 0.12370753323485968, "grad_norm": 0.3321128487586975, "learning_rate": 0.0001918709200640473, "loss": 0.4433, "step": 335 }, { "epoch": 0.1240768094534712, "grad_norm": 0.2586759030818939, "learning_rate": 0.0001918462864884838, "loss": 0.3136, "step": 336 }, { "epoch": 0.12444608567208272, "grad_norm": 0.24902012944221497, "learning_rate": 0.00019182165291292032, "loss": 0.3435, "step": 337 }, { "epoch": 0.12481536189069424, "grad_norm": 0.4221639931201935, "learning_rate": 0.00019179701933735684, "loss": 0.3849, "step": 338 }, { "epoch": 0.12518463810930577, "grad_norm": 0.33962059020996094, "learning_rate": 0.00019177238576179332, "loss": 0.3935, "step": 339 }, { "epoch": 0.1255539143279173, "grad_norm": 0.35686731338500977, "learning_rate": 0.00019174775218622984, "loss": 0.4114, "step": 340 }, { "epoch": 0.1259231905465288, "grad_norm": 0.3300132155418396, "learning_rate": 0.00019172311861066635, "loss": 0.4289, "step": 341 }, { "epoch": 0.12629246676514033, "grad_norm": 0.30660581588745117, "learning_rate": 0.00019169848503510287, "loss": 0.3981, "step": 342 }, { "epoch": 0.12666174298375185, "grad_norm": 0.2764551639556885, "learning_rate": 0.00019167385145953936, "loss": 0.3313, "step": 343 }, { "epoch": 0.12703101920236337, "grad_norm": 0.33090174198150635, "learning_rate": 0.00019164921788397587, "loss": 0.3681, "step": 344 }, { "epoch": 0.1274002954209749, "grad_norm": 0.3257863521575928, "learning_rate": 0.00019162458430841236, "loss": 0.4006, "step": 345 }, { "epoch": 0.1277695716395864, "grad_norm": 0.2760215699672699, "learning_rate": 0.0001915999507328489, "loss": 0.3531, "step": 346 }, { "epoch": 0.12813884785819793, "grad_norm": 0.27463459968566895, "learning_rate": 0.0001915753171572854, "loss": 0.3362, "step": 347 }, { "epoch": 0.12850812407680945, "grad_norm": 0.2651127576828003, "learning_rate": 0.0001915506835817219, "loss": 0.3311, "step": 348 }, { "epoch": 0.12887740029542097, "grad_norm": 0.26319748163223267, "learning_rate": 0.0001915260500061584, "loss": 0.3812, "step": 349 }, { "epoch": 0.1292466765140325, "grad_norm": 0.3255946934223175, "learning_rate": 0.0001915014164305949, "loss": 0.3957, "step": 350 }, { "epoch": 0.1292466765140325, "eval_loss": 0.3631550669670105, "eval_runtime": 5.8601, "eval_samples_per_second": 8.532, "eval_steps_per_second": 1.195, "step": 350 }, { "epoch": 0.129615952732644, "grad_norm": 0.8458616733551025, "learning_rate": 0.00019147678285503142, "loss": 0.5132, "step": 351 }, { "epoch": 0.12998522895125553, "grad_norm": 0.2686603367328644, "learning_rate": 0.00019145214927946793, "loss": 0.2914, "step": 352 }, { "epoch": 0.13035450516986707, "grad_norm": 0.3185257911682129, "learning_rate": 0.00019142751570390442, "loss": 0.3779, "step": 353 }, { "epoch": 0.1307237813884786, "grad_norm": 0.40341976284980774, "learning_rate": 0.00019140288212834094, "loss": 0.4262, "step": 354 }, { "epoch": 0.1310930576070901, "grad_norm": 0.287338525056839, "learning_rate": 0.00019137824855277745, "loss": 0.3584, "step": 355 }, { "epoch": 0.13146233382570163, "grad_norm": 0.3099704384803772, "learning_rate": 0.00019135361497721396, "loss": 0.391, "step": 356 }, { "epoch": 0.13183161004431315, "grad_norm": 0.29361316561698914, "learning_rate": 0.00019132898140165045, "loss": 0.3941, "step": 357 }, { "epoch": 0.13220088626292467, "grad_norm": 0.3063497543334961, "learning_rate": 0.00019130434782608697, "loss": 0.3711, "step": 358 }, { "epoch": 0.1325701624815362, "grad_norm": 0.33277326822280884, "learning_rate": 0.00019127971425052345, "loss": 0.4204, "step": 359 }, { "epoch": 0.1329394387001477, "grad_norm": 0.2999245822429657, "learning_rate": 0.00019125508067496, "loss": 0.4316, "step": 360 }, { "epoch": 0.13330871491875923, "grad_norm": 0.3451574444770813, "learning_rate": 0.00019123044709939648, "loss": 0.4635, "step": 361 }, { "epoch": 0.13367799113737075, "grad_norm": 0.31505754590034485, "learning_rate": 0.000191205813523833, "loss": 0.424, "step": 362 }, { "epoch": 0.13404726735598227, "grad_norm": 0.3461337089538574, "learning_rate": 0.00019118117994826949, "loss": 0.413, "step": 363 }, { "epoch": 0.1344165435745938, "grad_norm": 0.28650403022766113, "learning_rate": 0.000191156546372706, "loss": 0.3617, "step": 364 }, { "epoch": 0.1347858197932053, "grad_norm": 0.3007316291332245, "learning_rate": 0.00019113191279714252, "loss": 0.3575, "step": 365 }, { "epoch": 0.13515509601181683, "grad_norm": 0.2797721028327942, "learning_rate": 0.00019110727922157903, "loss": 0.3361, "step": 366 }, { "epoch": 0.13552437223042835, "grad_norm": 0.30158933997154236, "learning_rate": 0.00019108264564601552, "loss": 0.3612, "step": 367 }, { "epoch": 0.1358936484490399, "grad_norm": 0.3466665744781494, "learning_rate": 0.00019105801207045203, "loss": 0.3723, "step": 368 }, { "epoch": 0.1362629246676514, "grad_norm": 0.293473482131958, "learning_rate": 0.00019103337849488855, "loss": 0.3841, "step": 369 }, { "epoch": 0.13663220088626293, "grad_norm": 0.39320608973503113, "learning_rate": 0.00019100874491932506, "loss": 0.4726, "step": 370 }, { "epoch": 0.13700147710487445, "grad_norm": 0.30709806084632874, "learning_rate": 0.00019098411134376155, "loss": 0.4062, "step": 371 }, { "epoch": 0.13737075332348597, "grad_norm": 0.3337753713130951, "learning_rate": 0.00019095947776819806, "loss": 0.3259, "step": 372 }, { "epoch": 0.1377400295420975, "grad_norm": 0.263394296169281, "learning_rate": 0.00019093484419263458, "loss": 0.3291, "step": 373 }, { "epoch": 0.138109305760709, "grad_norm": 0.2679811716079712, "learning_rate": 0.0001909102106170711, "loss": 0.3157, "step": 374 }, { "epoch": 0.13847858197932053, "grad_norm": 0.3496569097042084, "learning_rate": 0.00019088557704150758, "loss": 0.4116, "step": 375 }, { "epoch": 0.13884785819793205, "grad_norm": 0.36639106273651123, "learning_rate": 0.0001908609434659441, "loss": 0.4704, "step": 376 }, { "epoch": 0.13921713441654357, "grad_norm": 0.33925941586494446, "learning_rate": 0.00019083630989038058, "loss": 0.4368, "step": 377 }, { "epoch": 0.1395864106351551, "grad_norm": 0.3684757947921753, "learning_rate": 0.00019081167631481712, "loss": 0.3754, "step": 378 }, { "epoch": 0.1399556868537666, "grad_norm": 0.28193071484565735, "learning_rate": 0.0001907870427392536, "loss": 0.3299, "step": 379 }, { "epoch": 0.14032496307237813, "grad_norm": 0.2718585133552551, "learning_rate": 0.00019076240916369013, "loss": 0.3369, "step": 380 }, { "epoch": 0.14069423929098965, "grad_norm": 0.2778518795967102, "learning_rate": 0.00019073777558812661, "loss": 0.3625, "step": 381 }, { "epoch": 0.1410635155096012, "grad_norm": 0.29346945881843567, "learning_rate": 0.00019071314201256313, "loss": 0.3746, "step": 382 }, { "epoch": 0.14143279172821271, "grad_norm": 0.33248963952064514, "learning_rate": 0.00019068850843699964, "loss": 0.3983, "step": 383 }, { "epoch": 0.14180206794682423, "grad_norm": 0.32002905011177063, "learning_rate": 0.00019066387486143616, "loss": 0.4637, "step": 384 }, { "epoch": 0.14217134416543575, "grad_norm": 0.2967303693294525, "learning_rate": 0.00019063924128587265, "loss": 0.3241, "step": 385 }, { "epoch": 0.14254062038404727, "grad_norm": 0.3041168749332428, "learning_rate": 0.00019061460771030916, "loss": 0.389, "step": 386 }, { "epoch": 0.1429098966026588, "grad_norm": 0.2852483093738556, "learning_rate": 0.00019058997413474567, "loss": 0.3429, "step": 387 }, { "epoch": 0.1432791728212703, "grad_norm": 0.34776344895362854, "learning_rate": 0.0001905653405591822, "loss": 0.4762, "step": 388 }, { "epoch": 0.14364844903988183, "grad_norm": 0.3596150279045105, "learning_rate": 0.00019054070698361868, "loss": 0.4183, "step": 389 }, { "epoch": 0.14401772525849335, "grad_norm": 0.2584787905216217, "learning_rate": 0.0001905160734080552, "loss": 0.3065, "step": 390 }, { "epoch": 0.14438700147710487, "grad_norm": 0.3181762397289276, "learning_rate": 0.00019049143983249168, "loss": 0.4199, "step": 391 }, { "epoch": 0.1447562776957164, "grad_norm": 0.2671319246292114, "learning_rate": 0.00019046680625692822, "loss": 0.3182, "step": 392 }, { "epoch": 0.1451255539143279, "grad_norm": 0.27982842922210693, "learning_rate": 0.0001904421726813647, "loss": 0.3592, "step": 393 }, { "epoch": 0.14549483013293943, "grad_norm": 0.26896971464157104, "learning_rate": 0.00019041753910580122, "loss": 0.4185, "step": 394 }, { "epoch": 0.14586410635155095, "grad_norm": 0.2974869906902313, "learning_rate": 0.0001903929055302377, "loss": 0.4006, "step": 395 }, { "epoch": 0.14623338257016247, "grad_norm": 0.3384716510772705, "learning_rate": 0.00019036827195467423, "loss": 0.3985, "step": 396 }, { "epoch": 0.14660265878877402, "grad_norm": 0.2817436456680298, "learning_rate": 0.00019034363837911074, "loss": 0.3608, "step": 397 }, { "epoch": 0.14697193500738553, "grad_norm": 0.271081805229187, "learning_rate": 0.00019031900480354725, "loss": 0.3625, "step": 398 }, { "epoch": 0.14734121122599705, "grad_norm": 0.27475908398628235, "learning_rate": 0.00019029437122798374, "loss": 0.3769, "step": 399 }, { "epoch": 0.14771048744460857, "grad_norm": 0.36818766593933105, "learning_rate": 0.00019026973765242026, "loss": 0.4717, "step": 400 }, { "epoch": 0.14771048744460857, "eval_loss": 0.36711856722831726, "eval_runtime": 5.861, "eval_samples_per_second": 8.531, "eval_steps_per_second": 1.194, "step": 400 }, { "epoch": 0.1480797636632201, "grad_norm": 0.33024027943611145, "learning_rate": 0.00019024510407685677, "loss": 0.4197, "step": 401 }, { "epoch": 0.1484490398818316, "grad_norm": 0.29586273431777954, "learning_rate": 0.00019022047050129329, "loss": 0.3627, "step": 402 }, { "epoch": 0.14881831610044313, "grad_norm": 0.2561482787132263, "learning_rate": 0.00019019583692572977, "loss": 0.3229, "step": 403 }, { "epoch": 0.14918759231905465, "grad_norm": 0.40970271825790405, "learning_rate": 0.0001901712033501663, "loss": 0.3999, "step": 404 }, { "epoch": 0.14955686853766617, "grad_norm": 0.2951110601425171, "learning_rate": 0.0001901465697746028, "loss": 0.3577, "step": 405 }, { "epoch": 0.1499261447562777, "grad_norm": 0.2874334156513214, "learning_rate": 0.00019012193619903932, "loss": 0.4147, "step": 406 }, { "epoch": 0.1502954209748892, "grad_norm": 0.3102007806301117, "learning_rate": 0.0001900973026234758, "loss": 0.4029, "step": 407 }, { "epoch": 0.15066469719350073, "grad_norm": 0.36349523067474365, "learning_rate": 0.00019007266904791232, "loss": 0.4179, "step": 408 }, { "epoch": 0.15103397341211225, "grad_norm": 0.38406822085380554, "learning_rate": 0.0001900480354723488, "loss": 0.4687, "step": 409 }, { "epoch": 0.15140324963072377, "grad_norm": 0.29971274733543396, "learning_rate": 0.00019002340189678535, "loss": 0.3451, "step": 410 }, { "epoch": 0.15177252584933532, "grad_norm": 0.29210084676742554, "learning_rate": 0.00018999876832122184, "loss": 0.3963, "step": 411 }, { "epoch": 0.15214180206794684, "grad_norm": 0.2798343300819397, "learning_rate": 0.00018997413474565835, "loss": 0.3325, "step": 412 }, { "epoch": 0.15251107828655835, "grad_norm": 0.3042639493942261, "learning_rate": 0.00018994950117009484, "loss": 0.4247, "step": 413 }, { "epoch": 0.15288035450516987, "grad_norm": 0.3588464558124542, "learning_rate": 0.00018992486759453135, "loss": 0.3864, "step": 414 }, { "epoch": 0.1532496307237814, "grad_norm": 0.2987573742866516, "learning_rate": 0.00018990023401896787, "loss": 0.3517, "step": 415 }, { "epoch": 0.1536189069423929, "grad_norm": 0.31477200984954834, "learning_rate": 0.00018987560044340438, "loss": 0.3267, "step": 416 }, { "epoch": 0.15398818316100443, "grad_norm": 0.2674895226955414, "learning_rate": 0.00018985096686784087, "loss": 0.3123, "step": 417 }, { "epoch": 0.15435745937961595, "grad_norm": 0.3107979893684387, "learning_rate": 0.00018982633329227738, "loss": 0.3532, "step": 418 }, { "epoch": 0.15472673559822747, "grad_norm": 0.35842299461364746, "learning_rate": 0.0001898016997167139, "loss": 0.3993, "step": 419 }, { "epoch": 0.155096011816839, "grad_norm": 0.31787213683128357, "learning_rate": 0.00018977706614115041, "loss": 0.4211, "step": 420 }, { "epoch": 0.1554652880354505, "grad_norm": 0.4663584232330322, "learning_rate": 0.0001897524325655869, "loss": 0.481, "step": 421 }, { "epoch": 0.15583456425406203, "grad_norm": 0.2762092351913452, "learning_rate": 0.00018972779899002342, "loss": 0.3008, "step": 422 }, { "epoch": 0.15620384047267355, "grad_norm": 0.35082077980041504, "learning_rate": 0.0001897031654144599, "loss": 0.4183, "step": 423 }, { "epoch": 0.15657311669128507, "grad_norm": 0.29008132219314575, "learning_rate": 0.00018967853183889645, "loss": 0.3859, "step": 424 }, { "epoch": 0.15694239290989662, "grad_norm": 0.32294392585754395, "learning_rate": 0.00018965389826333293, "loss": 0.3669, "step": 425 }, { "epoch": 0.15731166912850814, "grad_norm": 0.2959410846233368, "learning_rate": 0.00018962926468776945, "loss": 0.3342, "step": 426 }, { "epoch": 0.15768094534711966, "grad_norm": 0.3213634788990021, "learning_rate": 0.00018960463111220593, "loss": 0.3667, "step": 427 }, { "epoch": 0.15805022156573117, "grad_norm": 0.30615440011024475, "learning_rate": 0.00018957999753664245, "loss": 0.4233, "step": 428 }, { "epoch": 0.1584194977843427, "grad_norm": 0.33406612277030945, "learning_rate": 0.00018955536396107896, "loss": 0.5, "step": 429 }, { "epoch": 0.1587887740029542, "grad_norm": 0.347396582365036, "learning_rate": 0.00018953073038551548, "loss": 0.3849, "step": 430 }, { "epoch": 0.15915805022156573, "grad_norm": 0.3068416118621826, "learning_rate": 0.00018950609680995197, "loss": 0.4376, "step": 431 }, { "epoch": 0.15952732644017725, "grad_norm": 0.2605426609516144, "learning_rate": 0.00018948146323438848, "loss": 0.3734, "step": 432 }, { "epoch": 0.15989660265878877, "grad_norm": 0.3910651206970215, "learning_rate": 0.000189456829658825, "loss": 0.3859, "step": 433 }, { "epoch": 0.1602658788774003, "grad_norm": 0.3249022960662842, "learning_rate": 0.0001894321960832615, "loss": 0.4983, "step": 434 }, { "epoch": 0.1606351550960118, "grad_norm": 0.3365795910358429, "learning_rate": 0.000189407562507698, "loss": 0.3826, "step": 435 }, { "epoch": 0.16100443131462333, "grad_norm": 0.304880291223526, "learning_rate": 0.0001893829289321345, "loss": 0.4046, "step": 436 }, { "epoch": 0.16137370753323485, "grad_norm": 0.2914137840270996, "learning_rate": 0.00018935829535657103, "loss": 0.3765, "step": 437 }, { "epoch": 0.16174298375184637, "grad_norm": 0.26551946997642517, "learning_rate": 0.00018933366178100754, "loss": 0.3176, "step": 438 }, { "epoch": 0.1621122599704579, "grad_norm": 0.29021403193473816, "learning_rate": 0.00018930902820544403, "loss": 0.4308, "step": 439 }, { "epoch": 0.16248153618906944, "grad_norm": 0.32742637395858765, "learning_rate": 0.00018928439462988054, "loss": 0.3765, "step": 440 }, { "epoch": 0.16285081240768096, "grad_norm": 0.3414210081100464, "learning_rate": 0.00018925976105431703, "loss": 0.3878, "step": 441 }, { "epoch": 0.16322008862629248, "grad_norm": 0.3173421621322632, "learning_rate": 0.00018923512747875357, "loss": 0.4032, "step": 442 }, { "epoch": 0.163589364844904, "grad_norm": 0.3012334406375885, "learning_rate": 0.00018921049390319006, "loss": 0.4113, "step": 443 }, { "epoch": 0.16395864106351551, "grad_norm": 0.291298508644104, "learning_rate": 0.00018918586032762658, "loss": 0.3144, "step": 444 }, { "epoch": 0.16432791728212703, "grad_norm": 0.2866033911705017, "learning_rate": 0.00018916122675206306, "loss": 0.3517, "step": 445 }, { "epoch": 0.16469719350073855, "grad_norm": 0.28361326456069946, "learning_rate": 0.00018913659317649958, "loss": 0.3092, "step": 446 }, { "epoch": 0.16506646971935007, "grad_norm": 0.28165173530578613, "learning_rate": 0.0001891119596009361, "loss": 0.332, "step": 447 }, { "epoch": 0.1654357459379616, "grad_norm": 0.27297672629356384, "learning_rate": 0.0001890873260253726, "loss": 0.3231, "step": 448 }, { "epoch": 0.1658050221565731, "grad_norm": 0.3012530505657196, "learning_rate": 0.0001890626924498091, "loss": 0.4061, "step": 449 }, { "epoch": 0.16617429837518463, "grad_norm": 0.9882247447967529, "learning_rate": 0.0001890380588742456, "loss": 0.4416, "step": 450 }, { "epoch": 0.16617429837518463, "eval_loss": 0.3619551956653595, "eval_runtime": 5.8578, "eval_samples_per_second": 8.536, "eval_steps_per_second": 1.195, "step": 450 }, { "epoch": 0.16654357459379615, "grad_norm": 0.36340954899787903, "learning_rate": 0.00018901342529868212, "loss": 0.3902, "step": 451 }, { "epoch": 0.16691285081240767, "grad_norm": 0.32566314935684204, "learning_rate": 0.00018898879172311864, "loss": 0.3252, "step": 452 }, { "epoch": 0.1672821270310192, "grad_norm": 0.2897259593009949, "learning_rate": 0.00018896415814755513, "loss": 0.4222, "step": 453 }, { "epoch": 0.16765140324963074, "grad_norm": 0.40911665558815, "learning_rate": 0.00018893952457199164, "loss": 0.4465, "step": 454 }, { "epoch": 0.16802067946824226, "grad_norm": 0.3212021589279175, "learning_rate": 0.00018891489099642813, "loss": 0.3713, "step": 455 }, { "epoch": 0.16838995568685378, "grad_norm": 0.2914236783981323, "learning_rate": 0.00018889025742086467, "loss": 0.3846, "step": 456 }, { "epoch": 0.1687592319054653, "grad_norm": 0.42819809913635254, "learning_rate": 0.00018886562384530116, "loss": 0.4756, "step": 457 }, { "epoch": 0.16912850812407682, "grad_norm": 0.3311080038547516, "learning_rate": 0.00018884099026973767, "loss": 0.4781, "step": 458 }, { "epoch": 0.16949778434268833, "grad_norm": 0.3339468836784363, "learning_rate": 0.00018881635669417416, "loss": 0.4054, "step": 459 }, { "epoch": 0.16986706056129985, "grad_norm": 0.282219797372818, "learning_rate": 0.00018879172311861067, "loss": 0.3678, "step": 460 }, { "epoch": 0.17023633677991137, "grad_norm": 0.34083542227745056, "learning_rate": 0.0001887670895430472, "loss": 0.3657, "step": 461 }, { "epoch": 0.1706056129985229, "grad_norm": 0.28936949372291565, "learning_rate": 0.0001887424559674837, "loss": 0.3375, "step": 462 }, { "epoch": 0.1709748892171344, "grad_norm": 0.2606366276741028, "learning_rate": 0.0001887178223919202, "loss": 0.3536, "step": 463 }, { "epoch": 0.17134416543574593, "grad_norm": 0.3246243894100189, "learning_rate": 0.0001886931888163567, "loss": 0.4642, "step": 464 }, { "epoch": 0.17171344165435745, "grad_norm": 0.3147363066673279, "learning_rate": 0.00018866855524079322, "loss": 0.4156, "step": 465 }, { "epoch": 0.17208271787296897, "grad_norm": 0.3270387351512909, "learning_rate": 0.00018864392166522973, "loss": 0.4417, "step": 466 }, { "epoch": 0.1724519940915805, "grad_norm": 0.3732544183731079, "learning_rate": 0.00018861928808966622, "loss": 0.4895, "step": 467 }, { "epoch": 0.172821270310192, "grad_norm": 0.3213241398334503, "learning_rate": 0.00018859465451410274, "loss": 0.3832, "step": 468 }, { "epoch": 0.17319054652880356, "grad_norm": 0.28574463725090027, "learning_rate": 0.00018857002093853922, "loss": 0.3981, "step": 469 }, { "epoch": 0.17355982274741508, "grad_norm": 0.29374152421951294, "learning_rate": 0.00018854538736297574, "loss": 0.3864, "step": 470 }, { "epoch": 0.1739290989660266, "grad_norm": 0.23526206612586975, "learning_rate": 0.00018852075378741225, "loss": 0.2948, "step": 471 }, { "epoch": 0.17429837518463812, "grad_norm": 0.30339983105659485, "learning_rate": 0.00018849612021184874, "loss": 0.4726, "step": 472 }, { "epoch": 0.17466765140324964, "grad_norm": 0.3456306457519531, "learning_rate": 0.00018847148663628526, "loss": 0.3829, "step": 473 }, { "epoch": 0.17503692762186115, "grad_norm": 0.3155786097049713, "learning_rate": 0.00018844685306072177, "loss": 0.4021, "step": 474 }, { "epoch": 0.17540620384047267, "grad_norm": 0.25776541233062744, "learning_rate": 0.00018842221948515829, "loss": 0.3003, "step": 475 }, { "epoch": 0.1757754800590842, "grad_norm": 0.40696579217910767, "learning_rate": 0.00018839758590959477, "loss": 0.4147, "step": 476 }, { "epoch": 0.1761447562776957, "grad_norm": 0.3052532374858856, "learning_rate": 0.0001883729523340313, "loss": 0.3591, "step": 477 }, { "epoch": 0.17651403249630723, "grad_norm": 0.2917540371417999, "learning_rate": 0.0001883483187584678, "loss": 0.399, "step": 478 }, { "epoch": 0.17688330871491875, "grad_norm": 0.3572671711444855, "learning_rate": 0.00018832368518290432, "loss": 0.3823, "step": 479 }, { "epoch": 0.17725258493353027, "grad_norm": 0.27778226137161255, "learning_rate": 0.0001882990516073408, "loss": 0.3781, "step": 480 }, { "epoch": 0.1776218611521418, "grad_norm": 0.2719777524471283, "learning_rate": 0.00018827441803177732, "loss": 0.3575, "step": 481 }, { "epoch": 0.1779911373707533, "grad_norm": 0.3379530906677246, "learning_rate": 0.0001882497844562138, "loss": 0.45, "step": 482 }, { "epoch": 0.17836041358936486, "grad_norm": 0.36061891913414, "learning_rate": 0.00018822515088065035, "loss": 0.4076, "step": 483 }, { "epoch": 0.17872968980797638, "grad_norm": 0.3114835023880005, "learning_rate": 0.00018820051730508684, "loss": 0.3469, "step": 484 }, { "epoch": 0.1790989660265879, "grad_norm": 0.382068008184433, "learning_rate": 0.00018817588372952335, "loss": 0.4085, "step": 485 }, { "epoch": 0.17946824224519942, "grad_norm": 0.2963516116142273, "learning_rate": 0.00018815125015395984, "loss": 0.3647, "step": 486 }, { "epoch": 0.17983751846381094, "grad_norm": 0.2859930396080017, "learning_rate": 0.00018812661657839635, "loss": 0.3862, "step": 487 }, { "epoch": 0.18020679468242246, "grad_norm": 0.2853689193725586, "learning_rate": 0.00018810198300283287, "loss": 0.3282, "step": 488 }, { "epoch": 0.18057607090103397, "grad_norm": 0.2989000082015991, "learning_rate": 0.00018807734942726938, "loss": 0.3584, "step": 489 }, { "epoch": 0.1809453471196455, "grad_norm": 0.3115358054637909, "learning_rate": 0.00018805271585170587, "loss": 0.3382, "step": 490 }, { "epoch": 0.181314623338257, "grad_norm": 0.2946613132953644, "learning_rate": 0.00018802808227614238, "loss": 0.3533, "step": 491 }, { "epoch": 0.18168389955686853, "grad_norm": 0.3262624144554138, "learning_rate": 0.0001880034487005789, "loss": 0.3515, "step": 492 }, { "epoch": 0.18205317577548005, "grad_norm": 0.2636186480522156, "learning_rate": 0.0001879788151250154, "loss": 0.3616, "step": 493 }, { "epoch": 0.18242245199409157, "grad_norm": 0.4874628484249115, "learning_rate": 0.0001879541815494519, "loss": 0.3429, "step": 494 }, { "epoch": 0.1827917282127031, "grad_norm": 0.3091892600059509, "learning_rate": 0.00018792954797388842, "loss": 0.4433, "step": 495 }, { "epoch": 0.1831610044313146, "grad_norm": 0.31530311703681946, "learning_rate": 0.0001879049143983249, "loss": 0.3118, "step": 496 }, { "epoch": 0.18353028064992616, "grad_norm": 0.3015526533126831, "learning_rate": 0.00018788028082276144, "loss": 0.4021, "step": 497 }, { "epoch": 0.18389955686853768, "grad_norm": 0.2697685658931732, "learning_rate": 0.00018785564724719793, "loss": 0.3786, "step": 498 }, { "epoch": 0.1842688330871492, "grad_norm": 0.2839634120464325, "learning_rate": 0.00018783101367163445, "loss": 0.374, "step": 499 }, { "epoch": 0.18463810930576072, "grad_norm": 0.3035372793674469, "learning_rate": 0.00018780638009607093, "loss": 0.3843, "step": 500 }, { "epoch": 0.18463810930576072, "eval_loss": 0.35243338346481323, "eval_runtime": 5.8487, "eval_samples_per_second": 8.549, "eval_steps_per_second": 1.197, "step": 500 }, { "epoch": 0.18500738552437224, "grad_norm": 0.2590104937553406, "learning_rate": 0.00018778174652050745, "loss": 0.292, "step": 501 }, { "epoch": 0.18537666174298376, "grad_norm": 0.25871342420578003, "learning_rate": 0.00018775711294494396, "loss": 0.3161, "step": 502 }, { "epoch": 0.18574593796159528, "grad_norm": 0.326386958360672, "learning_rate": 0.00018773247936938048, "loss": 0.3638, "step": 503 }, { "epoch": 0.1861152141802068, "grad_norm": 0.33548593521118164, "learning_rate": 0.00018770784579381697, "loss": 0.3966, "step": 504 }, { "epoch": 0.18648449039881831, "grad_norm": 0.2861921489238739, "learning_rate": 0.00018768321221825348, "loss": 0.3616, "step": 505 }, { "epoch": 0.18685376661742983, "grad_norm": 0.29699164628982544, "learning_rate": 0.00018765857864269, "loss": 0.3878, "step": 506 }, { "epoch": 0.18722304283604135, "grad_norm": 0.31700825691223145, "learning_rate": 0.0001876339450671265, "loss": 0.3539, "step": 507 }, { "epoch": 0.18759231905465287, "grad_norm": 0.2864786982536316, "learning_rate": 0.000187609311491563, "loss": 0.3535, "step": 508 }, { "epoch": 0.1879615952732644, "grad_norm": 0.293557345867157, "learning_rate": 0.0001875846779159995, "loss": 0.3172, "step": 509 }, { "epoch": 0.1883308714918759, "grad_norm": 0.2888350784778595, "learning_rate": 0.00018756004434043603, "loss": 0.3934, "step": 510 }, { "epoch": 0.18870014771048743, "grad_norm": 0.29072830080986023, "learning_rate": 0.00018753541076487254, "loss": 0.3309, "step": 511 }, { "epoch": 0.18906942392909898, "grad_norm": 0.3262498378753662, "learning_rate": 0.00018751077718930903, "loss": 0.4339, "step": 512 }, { "epoch": 0.1894387001477105, "grad_norm": 0.3033256530761719, "learning_rate": 0.00018748614361374554, "loss": 0.4156, "step": 513 }, { "epoch": 0.18980797636632202, "grad_norm": 0.3099603056907654, "learning_rate": 0.00018746151003818203, "loss": 0.3781, "step": 514 }, { "epoch": 0.19017725258493354, "grad_norm": 0.31491467356681824, "learning_rate": 0.00018743687646261857, "loss": 0.4179, "step": 515 }, { "epoch": 0.19054652880354506, "grad_norm": 0.32624998688697815, "learning_rate": 0.00018741224288705506, "loss": 0.4148, "step": 516 }, { "epoch": 0.19091580502215658, "grad_norm": 0.318103551864624, "learning_rate": 0.00018738760931149157, "loss": 0.3575, "step": 517 }, { "epoch": 0.1912850812407681, "grad_norm": 0.24202318489551544, "learning_rate": 0.00018736297573592806, "loss": 0.2451, "step": 518 }, { "epoch": 0.19165435745937962, "grad_norm": 0.3605307340621948, "learning_rate": 0.00018733834216036458, "loss": 0.3476, "step": 519 }, { "epoch": 0.19202363367799113, "grad_norm": 0.32067054510116577, "learning_rate": 0.0001873137085848011, "loss": 0.3924, "step": 520 }, { "epoch": 0.19239290989660265, "grad_norm": 0.2573811709880829, "learning_rate": 0.0001872890750092376, "loss": 0.3305, "step": 521 }, { "epoch": 0.19276218611521417, "grad_norm": 0.278073251247406, "learning_rate": 0.0001872644414336741, "loss": 0.3298, "step": 522 }, { "epoch": 0.1931314623338257, "grad_norm": 0.3071305453777313, "learning_rate": 0.0001872398078581106, "loss": 0.4292, "step": 523 }, { "epoch": 0.1935007385524372, "grad_norm": 0.2722351849079132, "learning_rate": 0.00018721517428254712, "loss": 0.3652, "step": 524 }, { "epoch": 0.19387001477104873, "grad_norm": 0.318668931722641, "learning_rate": 0.00018719054070698364, "loss": 0.3599, "step": 525 }, { "epoch": 0.19423929098966028, "grad_norm": 0.2947141230106354, "learning_rate": 0.00018716590713142013, "loss": 0.3261, "step": 526 }, { "epoch": 0.1946085672082718, "grad_norm": 0.31848984956741333, "learning_rate": 0.00018714127355585664, "loss": 0.3581, "step": 527 }, { "epoch": 0.19497784342688332, "grad_norm": 0.32939398288726807, "learning_rate": 0.00018711663998029313, "loss": 0.3677, "step": 528 }, { "epoch": 0.19534711964549484, "grad_norm": 0.2825155258178711, "learning_rate": 0.00018709200640472967, "loss": 0.3372, "step": 529 }, { "epoch": 0.19571639586410636, "grad_norm": 0.291613906621933, "learning_rate": 0.00018706737282916616, "loss": 0.3619, "step": 530 }, { "epoch": 0.19608567208271788, "grad_norm": 0.26579317450523376, "learning_rate": 0.00018704273925360267, "loss": 0.2948, "step": 531 }, { "epoch": 0.1964549483013294, "grad_norm": 0.2836756408214569, "learning_rate": 0.00018701810567803916, "loss": 0.3493, "step": 532 }, { "epoch": 0.19682422451994092, "grad_norm": 0.33159008622169495, "learning_rate": 0.00018699347210247567, "loss": 0.4069, "step": 533 }, { "epoch": 0.19719350073855244, "grad_norm": 0.26926809549331665, "learning_rate": 0.0001869688385269122, "loss": 0.3317, "step": 534 }, { "epoch": 0.19756277695716395, "grad_norm": 0.3338308334350586, "learning_rate": 0.0001869442049513487, "loss": 0.4025, "step": 535 }, { "epoch": 0.19793205317577547, "grad_norm": 0.3891758322715759, "learning_rate": 0.0001869195713757852, "loss": 0.4243, "step": 536 }, { "epoch": 0.198301329394387, "grad_norm": 0.2698891758918762, "learning_rate": 0.0001868949378002217, "loss": 0.2992, "step": 537 }, { "epoch": 0.1986706056129985, "grad_norm": 0.23717094957828522, "learning_rate": 0.00018687030422465822, "loss": 0.2854, "step": 538 }, { "epoch": 0.19903988183161003, "grad_norm": 0.3537690341472626, "learning_rate": 0.00018684567064909473, "loss": 0.435, "step": 539 }, { "epoch": 0.19940915805022155, "grad_norm": 0.32419681549072266, "learning_rate": 0.00018682103707353122, "loss": 0.323, "step": 540 }, { "epoch": 0.1997784342688331, "grad_norm": 0.31805306673049927, "learning_rate": 0.00018679640349796774, "loss": 0.4009, "step": 541 }, { "epoch": 0.20014771048744462, "grad_norm": 0.2685754895210266, "learning_rate": 0.00018677176992240425, "loss": 0.3502, "step": 542 }, { "epoch": 0.20051698670605614, "grad_norm": 0.2632409930229187, "learning_rate": 0.00018674713634684077, "loss": 0.4185, "step": 543 }, { "epoch": 0.20088626292466766, "grad_norm": 0.3476162552833557, "learning_rate": 0.00018672250277127725, "loss": 0.4793, "step": 544 }, { "epoch": 0.20125553914327918, "grad_norm": 0.3115142583847046, "learning_rate": 0.00018669786919571377, "loss": 0.4598, "step": 545 }, { "epoch": 0.2016248153618907, "grad_norm": 0.30133846402168274, "learning_rate": 0.00018667323562015026, "loss": 0.4154, "step": 546 }, { "epoch": 0.20199409158050222, "grad_norm": 0.2999810576438904, "learning_rate": 0.0001866486020445868, "loss": 0.338, "step": 547 }, { "epoch": 0.20236336779911374, "grad_norm": 0.3162882328033447, "learning_rate": 0.00018662396846902328, "loss": 0.3049, "step": 548 }, { "epoch": 0.20273264401772526, "grad_norm": 0.2815430760383606, "learning_rate": 0.0001865993348934598, "loss": 0.372, "step": 549 }, { "epoch": 0.20310192023633677, "grad_norm": 0.2795547544956207, "learning_rate": 0.0001865747013178963, "loss": 0.3706, "step": 550 }, { "epoch": 0.20310192023633677, "eval_loss": 0.350197970867157, "eval_runtime": 5.8621, "eval_samples_per_second": 8.529, "eval_steps_per_second": 1.194, "step": 550 }, { "epoch": 0.2034711964549483, "grad_norm": 0.39499425888061523, "learning_rate": 0.0001865500677423328, "loss": 0.442, "step": 551 }, { "epoch": 0.2038404726735598, "grad_norm": 0.32192522287368774, "learning_rate": 0.00018652543416676932, "loss": 0.4273, "step": 552 }, { "epoch": 0.20420974889217133, "grad_norm": 0.3077501654624939, "learning_rate": 0.00018650080059120583, "loss": 0.3648, "step": 553 }, { "epoch": 0.20457902511078285, "grad_norm": 0.24269762635231018, "learning_rate": 0.00018647616701564232, "loss": 0.3095, "step": 554 }, { "epoch": 0.2049483013293944, "grad_norm": 0.26936954259872437, "learning_rate": 0.00018645153344007883, "loss": 0.3352, "step": 555 }, { "epoch": 0.20531757754800592, "grad_norm": 0.2520939111709595, "learning_rate": 0.00018642689986451535, "loss": 0.3219, "step": 556 }, { "epoch": 0.20568685376661744, "grad_norm": 0.35456258058547974, "learning_rate": 0.00018640226628895186, "loss": 0.337, "step": 557 }, { "epoch": 0.20605612998522896, "grad_norm": 0.27598705887794495, "learning_rate": 0.00018637763271338835, "loss": 0.3331, "step": 558 }, { "epoch": 0.20642540620384048, "grad_norm": 0.3117420971393585, "learning_rate": 0.00018635299913782486, "loss": 0.4273, "step": 559 }, { "epoch": 0.206794682422452, "grad_norm": 0.3151145875453949, "learning_rate": 0.00018632836556226135, "loss": 0.3964, "step": 560 }, { "epoch": 0.20716395864106352, "grad_norm": 0.3065900206565857, "learning_rate": 0.0001863037319866979, "loss": 0.3189, "step": 561 }, { "epoch": 0.20753323485967504, "grad_norm": 0.23873600363731384, "learning_rate": 0.00018627909841113438, "loss": 0.3091, "step": 562 }, { "epoch": 0.20790251107828656, "grad_norm": 0.30931881070137024, "learning_rate": 0.0001862544648355709, "loss": 0.4215, "step": 563 }, { "epoch": 0.20827178729689808, "grad_norm": 0.328284353017807, "learning_rate": 0.00018622983126000738, "loss": 0.3981, "step": 564 }, { "epoch": 0.2086410635155096, "grad_norm": 0.3178199231624603, "learning_rate": 0.0001862051976844439, "loss": 0.4749, "step": 565 }, { "epoch": 0.20901033973412111, "grad_norm": 0.3410341441631317, "learning_rate": 0.0001861805641088804, "loss": 0.3619, "step": 566 }, { "epoch": 0.20937961595273263, "grad_norm": 0.28396570682525635, "learning_rate": 0.00018615593053331693, "loss": 0.3469, "step": 567 }, { "epoch": 0.20974889217134415, "grad_norm": 0.24647068977355957, "learning_rate": 0.00018613129695775341, "loss": 0.2867, "step": 568 }, { "epoch": 0.2101181683899557, "grad_norm": 0.3200088441371918, "learning_rate": 0.00018610666338218993, "loss": 0.3137, "step": 569 }, { "epoch": 0.21048744460856722, "grad_norm": 0.31181222200393677, "learning_rate": 0.00018608202980662644, "loss": 0.4194, "step": 570 }, { "epoch": 0.21085672082717874, "grad_norm": 0.2677057385444641, "learning_rate": 0.00018605739623106296, "loss": 0.3286, "step": 571 }, { "epoch": 0.21122599704579026, "grad_norm": 0.3450012803077698, "learning_rate": 0.00018603276265549945, "loss": 0.3933, "step": 572 }, { "epoch": 0.21159527326440178, "grad_norm": 0.2596141993999481, "learning_rate": 0.00018600812907993596, "loss": 0.3692, "step": 573 }, { "epoch": 0.2119645494830133, "grad_norm": 0.2887343466281891, "learning_rate": 0.00018598349550437248, "loss": 0.3645, "step": 574 }, { "epoch": 0.21233382570162482, "grad_norm": 0.2597286105155945, "learning_rate": 0.000185958861928809, "loss": 0.3141, "step": 575 }, { "epoch": 0.21270310192023634, "grad_norm": 0.27257657051086426, "learning_rate": 0.00018593422835324548, "loss": 0.298, "step": 576 }, { "epoch": 0.21307237813884786, "grad_norm": 0.26083192229270935, "learning_rate": 0.000185909594777682, "loss": 0.3124, "step": 577 }, { "epoch": 0.21344165435745938, "grad_norm": 0.2763814926147461, "learning_rate": 0.00018588496120211848, "loss": 0.3169, "step": 578 }, { "epoch": 0.2138109305760709, "grad_norm": 0.37325096130371094, "learning_rate": 0.00018586032762655502, "loss": 0.4646, "step": 579 }, { "epoch": 0.21418020679468242, "grad_norm": 0.26927992701530457, "learning_rate": 0.0001858356940509915, "loss": 0.3276, "step": 580 }, { "epoch": 0.21454948301329393, "grad_norm": 0.29396751523017883, "learning_rate": 0.00018581106047542802, "loss": 0.2939, "step": 581 }, { "epoch": 0.21491875923190545, "grad_norm": 0.2815605700016022, "learning_rate": 0.0001857864268998645, "loss": 0.4065, "step": 582 }, { "epoch": 0.21528803545051697, "grad_norm": 0.6534555554389954, "learning_rate": 0.00018576179332430103, "loss": 0.4855, "step": 583 }, { "epoch": 0.21565731166912852, "grad_norm": 0.27794015407562256, "learning_rate": 0.00018573715974873754, "loss": 0.3871, "step": 584 }, { "epoch": 0.21602658788774004, "grad_norm": 0.3053019940853119, "learning_rate": 0.00018571252617317406, "loss": 0.3112, "step": 585 }, { "epoch": 0.21639586410635156, "grad_norm": 0.28791019320487976, "learning_rate": 0.00018568789259761054, "loss": 0.3504, "step": 586 }, { "epoch": 0.21676514032496308, "grad_norm": 0.32081103324890137, "learning_rate": 0.00018566325902204706, "loss": 0.3937, "step": 587 }, { "epoch": 0.2171344165435746, "grad_norm": 0.3388102352619171, "learning_rate": 0.00018563862544648357, "loss": 0.4336, "step": 588 }, { "epoch": 0.21750369276218612, "grad_norm": 0.24963918328285217, "learning_rate": 0.0001856139918709201, "loss": 0.3251, "step": 589 }, { "epoch": 0.21787296898079764, "grad_norm": 0.2805040180683136, "learning_rate": 0.00018558935829535657, "loss": 0.2674, "step": 590 }, { "epoch": 0.21824224519940916, "grad_norm": 0.26187995076179504, "learning_rate": 0.0001855647247197931, "loss": 0.3648, "step": 591 }, { "epoch": 0.21861152141802068, "grad_norm": 0.25983595848083496, "learning_rate": 0.00018554009114422958, "loss": 0.3169, "step": 592 }, { "epoch": 0.2189807976366322, "grad_norm": 0.3068999946117401, "learning_rate": 0.00018551545756866612, "loss": 0.3912, "step": 593 }, { "epoch": 0.21935007385524372, "grad_norm": 0.2944723963737488, "learning_rate": 0.0001854908239931026, "loss": 0.3702, "step": 594 }, { "epoch": 0.21971935007385524, "grad_norm": 0.29646021127700806, "learning_rate": 0.00018546619041753912, "loss": 0.4046, "step": 595 }, { "epoch": 0.22008862629246675, "grad_norm": 0.3013732135295868, "learning_rate": 0.0001854415568419756, "loss": 0.389, "step": 596 }, { "epoch": 0.22045790251107827, "grad_norm": 0.33398228883743286, "learning_rate": 0.00018541692326641212, "loss": 0.382, "step": 597 }, { "epoch": 0.22082717872968982, "grad_norm": 0.27201974391937256, "learning_rate": 0.00018539228969084864, "loss": 0.3684, "step": 598 }, { "epoch": 0.22119645494830134, "grad_norm": 0.29355520009994507, "learning_rate": 0.00018536765611528515, "loss": 0.3608, "step": 599 }, { "epoch": 0.22156573116691286, "grad_norm": 0.30281344056129456, "learning_rate": 0.00018534302253972164, "loss": 0.3562, "step": 600 }, { "epoch": 0.22156573116691286, "eval_loss": 0.34722042083740234, "eval_runtime": 5.8648, "eval_samples_per_second": 8.525, "eval_steps_per_second": 1.194, "step": 600 }, { "epoch": 0.22193500738552438, "grad_norm": 0.27795374393463135, "learning_rate": 0.00018531838896415815, "loss": 0.3903, "step": 601 }, { "epoch": 0.2223042836041359, "grad_norm": 0.47363731265068054, "learning_rate": 0.00018529375538859467, "loss": 0.4548, "step": 602 }, { "epoch": 0.22267355982274742, "grad_norm": 0.3682205080986023, "learning_rate": 0.00018526912181303118, "loss": 0.451, "step": 603 }, { "epoch": 0.22304283604135894, "grad_norm": 0.35432472825050354, "learning_rate": 0.00018524448823746767, "loss": 0.3183, "step": 604 }, { "epoch": 0.22341211225997046, "grad_norm": 0.3637113571166992, "learning_rate": 0.00018521985466190419, "loss": 0.4657, "step": 605 }, { "epoch": 0.22378138847858198, "grad_norm": 0.32639309763908386, "learning_rate": 0.0001851952210863407, "loss": 0.4005, "step": 606 }, { "epoch": 0.2241506646971935, "grad_norm": 0.3073771595954895, "learning_rate": 0.00018517058751077721, "loss": 0.3629, "step": 607 }, { "epoch": 0.22451994091580502, "grad_norm": 0.26197266578674316, "learning_rate": 0.0001851459539352137, "loss": 0.3401, "step": 608 }, { "epoch": 0.22488921713441654, "grad_norm": 0.2626654803752899, "learning_rate": 0.00018512132035965022, "loss": 0.3404, "step": 609 }, { "epoch": 0.22525849335302806, "grad_norm": 0.3299645781517029, "learning_rate": 0.0001850966867840867, "loss": 0.3654, "step": 610 }, { "epoch": 0.22562776957163957, "grad_norm": 0.3153376877307892, "learning_rate": 0.00018507205320852325, "loss": 0.3739, "step": 611 }, { "epoch": 0.2259970457902511, "grad_norm": 0.25192391872406006, "learning_rate": 0.00018504741963295973, "loss": 0.3939, "step": 612 }, { "epoch": 0.22636632200886264, "grad_norm": 0.3275706470012665, "learning_rate": 0.00018502278605739625, "loss": 0.4047, "step": 613 }, { "epoch": 0.22673559822747416, "grad_norm": 0.2731536030769348, "learning_rate": 0.00018499815248183274, "loss": 0.3351, "step": 614 }, { "epoch": 0.22710487444608568, "grad_norm": 0.2870076298713684, "learning_rate": 0.00018497351890626925, "loss": 0.3622, "step": 615 }, { "epoch": 0.2274741506646972, "grad_norm": 0.3062557280063629, "learning_rate": 0.00018494888533070577, "loss": 0.3861, "step": 616 }, { "epoch": 0.22784342688330872, "grad_norm": 0.3006839454174042, "learning_rate": 0.00018492425175514228, "loss": 0.4001, "step": 617 }, { "epoch": 0.22821270310192024, "grad_norm": 0.3576716482639313, "learning_rate": 0.00018489961817957877, "loss": 0.3705, "step": 618 }, { "epoch": 0.22858197932053176, "grad_norm": 0.29118189215660095, "learning_rate": 0.00018487498460401528, "loss": 0.3285, "step": 619 }, { "epoch": 0.22895125553914328, "grad_norm": 0.3169896900653839, "learning_rate": 0.0001848503510284518, "loss": 0.3153, "step": 620 }, { "epoch": 0.2293205317577548, "grad_norm": 0.3830879032611847, "learning_rate": 0.0001848257174528883, "loss": 0.4512, "step": 621 }, { "epoch": 0.22968980797636632, "grad_norm": 0.3761618435382843, "learning_rate": 0.0001848010838773248, "loss": 0.397, "step": 622 }, { "epoch": 0.23005908419497784, "grad_norm": 0.3072148859500885, "learning_rate": 0.0001847764503017613, "loss": 0.3602, "step": 623 }, { "epoch": 0.23042836041358936, "grad_norm": 0.2576681971549988, "learning_rate": 0.0001847518167261978, "loss": 0.3387, "step": 624 }, { "epoch": 0.23079763663220088, "grad_norm": 0.2789335250854492, "learning_rate": 0.00018472718315063434, "loss": 0.3678, "step": 625 }, { "epoch": 0.2311669128508124, "grad_norm": 0.26227229833602905, "learning_rate": 0.00018470254957507083, "loss": 0.281, "step": 626 }, { "epoch": 0.23153618906942394, "grad_norm": 0.2812472879886627, "learning_rate": 0.00018467791599950734, "loss": 0.3269, "step": 627 }, { "epoch": 0.23190546528803546, "grad_norm": 0.24520502984523773, "learning_rate": 0.00018465328242394383, "loss": 0.2865, "step": 628 }, { "epoch": 0.23227474150664698, "grad_norm": 0.38156548142433167, "learning_rate": 0.00018462864884838035, "loss": 0.4165, "step": 629 }, { "epoch": 0.2326440177252585, "grad_norm": 0.3365303575992584, "learning_rate": 0.00018460401527281686, "loss": 0.4914, "step": 630 }, { "epoch": 0.23301329394387002, "grad_norm": 0.3444092869758606, "learning_rate": 0.00018457938169725338, "loss": 0.4132, "step": 631 }, { "epoch": 0.23338257016248154, "grad_norm": 0.2728528678417206, "learning_rate": 0.00018455474812168986, "loss": 0.3059, "step": 632 }, { "epoch": 0.23375184638109306, "grad_norm": 0.2688409686088562, "learning_rate": 0.00018453011454612638, "loss": 0.3817, "step": 633 }, { "epoch": 0.23412112259970458, "grad_norm": 0.32839080691337585, "learning_rate": 0.0001845054809705629, "loss": 0.4481, "step": 634 }, { "epoch": 0.2344903988183161, "grad_norm": 0.29927435517311096, "learning_rate": 0.0001844808473949994, "loss": 0.364, "step": 635 }, { "epoch": 0.23485967503692762, "grad_norm": 0.3030961751937866, "learning_rate": 0.0001844562138194359, "loss": 0.3521, "step": 636 }, { "epoch": 0.23522895125553914, "grad_norm": 0.30552420020103455, "learning_rate": 0.0001844315802438724, "loss": 0.3481, "step": 637 }, { "epoch": 0.23559822747415066, "grad_norm": 0.2642765939235687, "learning_rate": 0.0001844069466683089, "loss": 0.3218, "step": 638 }, { "epoch": 0.23596750369276218, "grad_norm": 0.3970206677913666, "learning_rate": 0.00018438231309274544, "loss": 0.4712, "step": 639 }, { "epoch": 0.2363367799113737, "grad_norm": 0.28402864933013916, "learning_rate": 0.00018435767951718193, "loss": 0.3304, "step": 640 }, { "epoch": 0.23670605612998524, "grad_norm": 0.3056892454624176, "learning_rate": 0.00018433304594161844, "loss": 0.3698, "step": 641 }, { "epoch": 0.23707533234859676, "grad_norm": 0.29281875491142273, "learning_rate": 0.00018430841236605493, "loss": 0.3682, "step": 642 }, { "epoch": 0.23744460856720828, "grad_norm": 0.3088505268096924, "learning_rate": 0.00018428377879049144, "loss": 0.3412, "step": 643 }, { "epoch": 0.2378138847858198, "grad_norm": 0.3349262475967407, "learning_rate": 0.00018425914521492796, "loss": 0.3773, "step": 644 }, { "epoch": 0.23818316100443132, "grad_norm": 0.2669546902179718, "learning_rate": 0.00018423451163936447, "loss": 0.3013, "step": 645 }, { "epoch": 0.23855243722304284, "grad_norm": 0.27441346645355225, "learning_rate": 0.00018420987806380096, "loss": 0.3547, "step": 646 }, { "epoch": 0.23892171344165436, "grad_norm": 0.37875619530677795, "learning_rate": 0.00018418524448823748, "loss": 0.4332, "step": 647 }, { "epoch": 0.23929098966026588, "grad_norm": 0.35620611906051636, "learning_rate": 0.000184160610912674, "loss": 0.3944, "step": 648 }, { "epoch": 0.2396602658788774, "grad_norm": 0.45027297735214233, "learning_rate": 0.0001841359773371105, "loss": 0.317, "step": 649 }, { "epoch": 0.24002954209748892, "grad_norm": 0.23810823261737823, "learning_rate": 0.000184111343761547, "loss": 0.2787, "step": 650 }, { "epoch": 0.24002954209748892, "eval_loss": 0.35080447793006897, "eval_runtime": 5.8683, "eval_samples_per_second": 8.52, "eval_steps_per_second": 1.193, "step": 650 }, { "epoch": 0.24039881831610044, "grad_norm": 0.31773853302001953, "learning_rate": 0.0001840867101859835, "loss": 0.2863, "step": 651 }, { "epoch": 0.24076809453471196, "grad_norm": 0.273213654756546, "learning_rate": 0.00018406207661042002, "loss": 0.3398, "step": 652 }, { "epoch": 0.24113737075332348, "grad_norm": 0.31761434674263, "learning_rate": 0.00018403744303485654, "loss": 0.3654, "step": 653 }, { "epoch": 0.241506646971935, "grad_norm": 0.309353232383728, "learning_rate": 0.00018401280945929302, "loss": 0.4567, "step": 654 }, { "epoch": 0.24187592319054652, "grad_norm": 0.2656030058860779, "learning_rate": 0.00018398817588372954, "loss": 0.3315, "step": 655 }, { "epoch": 0.24224519940915806, "grad_norm": 0.27817684412002563, "learning_rate": 0.00018396354230816603, "loss": 0.3224, "step": 656 }, { "epoch": 0.24261447562776958, "grad_norm": 0.26796382665634155, "learning_rate": 0.00018393890873260257, "loss": 0.3259, "step": 657 }, { "epoch": 0.2429837518463811, "grad_norm": 0.3298114538192749, "learning_rate": 0.00018391427515703905, "loss": 0.4933, "step": 658 }, { "epoch": 0.24335302806499262, "grad_norm": 0.29278478026390076, "learning_rate": 0.00018388964158147557, "loss": 0.379, "step": 659 }, { "epoch": 0.24372230428360414, "grad_norm": 0.3018966317176819, "learning_rate": 0.00018386500800591206, "loss": 0.3204, "step": 660 }, { "epoch": 0.24409158050221566, "grad_norm": 0.26778993010520935, "learning_rate": 0.00018384037443034857, "loss": 0.3322, "step": 661 }, { "epoch": 0.24446085672082718, "grad_norm": 0.4353688061237335, "learning_rate": 0.00018381574085478509, "loss": 0.4762, "step": 662 }, { "epoch": 0.2448301329394387, "grad_norm": 0.3074433505535126, "learning_rate": 0.0001837911072792216, "loss": 0.4012, "step": 663 }, { "epoch": 0.24519940915805022, "grad_norm": 0.29345113039016724, "learning_rate": 0.0001837664737036581, "loss": 0.3222, "step": 664 }, { "epoch": 0.24556868537666174, "grad_norm": 0.3226329982280731, "learning_rate": 0.0001837418401280946, "loss": 0.4165, "step": 665 }, { "epoch": 0.24593796159527326, "grad_norm": 0.28262099623680115, "learning_rate": 0.00018371720655253112, "loss": 0.3265, "step": 666 }, { "epoch": 0.24630723781388478, "grad_norm": 0.3262132704257965, "learning_rate": 0.00018369257297696763, "loss": 0.3889, "step": 667 }, { "epoch": 0.2466765140324963, "grad_norm": 0.26038438081741333, "learning_rate": 0.00018366793940140412, "loss": 0.3204, "step": 668 }, { "epoch": 0.24704579025110782, "grad_norm": 0.4496694803237915, "learning_rate": 0.00018364330582584063, "loss": 0.2907, "step": 669 }, { "epoch": 0.24741506646971936, "grad_norm": 0.2622362971305847, "learning_rate": 0.00018361867225027712, "loss": 0.3416, "step": 670 }, { "epoch": 0.24778434268833088, "grad_norm": 0.27988767623901367, "learning_rate": 0.00018359403867471366, "loss": 0.3254, "step": 671 }, { "epoch": 0.2481536189069424, "grad_norm": 0.23468215763568878, "learning_rate": 0.00018356940509915015, "loss": 0.3083, "step": 672 }, { "epoch": 0.24852289512555392, "grad_norm": 0.2847580909729004, "learning_rate": 0.00018354477152358667, "loss": 0.3204, "step": 673 }, { "epoch": 0.24889217134416544, "grad_norm": 0.2709912657737732, "learning_rate": 0.00018352013794802315, "loss": 0.3105, "step": 674 }, { "epoch": 0.24926144756277696, "grad_norm": 0.2994575798511505, "learning_rate": 0.00018349550437245967, "loss": 0.3747, "step": 675 }, { "epoch": 0.24963072378138848, "grad_norm": 0.2810092270374298, "learning_rate": 0.00018347087079689618, "loss": 0.3068, "step": 676 }, { "epoch": 0.25, "grad_norm": 0.362888902425766, "learning_rate": 0.0001834462372213327, "loss": 0.3894, "step": 677 }, { "epoch": 0.25036927621861155, "grad_norm": 0.26018789410591125, "learning_rate": 0.00018342160364576918, "loss": 0.3359, "step": 678 }, { "epoch": 0.25073855243722304, "grad_norm": 0.28938496112823486, "learning_rate": 0.0001833969700702057, "loss": 0.3727, "step": 679 }, { "epoch": 0.2511078286558346, "grad_norm": 0.4264602065086365, "learning_rate": 0.00018337233649464221, "loss": 0.4565, "step": 680 }, { "epoch": 0.2514771048744461, "grad_norm": 0.3633279800415039, "learning_rate": 0.00018334770291907873, "loss": 0.3408, "step": 681 }, { "epoch": 0.2518463810930576, "grad_norm": 0.3045649528503418, "learning_rate": 0.00018332306934351522, "loss": 0.3905, "step": 682 }, { "epoch": 0.2522156573116691, "grad_norm": 0.3043745756149292, "learning_rate": 0.00018329843576795173, "loss": 0.4355, "step": 683 }, { "epoch": 0.25258493353028066, "grad_norm": 0.2773546874523163, "learning_rate": 0.00018327380219238825, "loss": 0.3287, "step": 684 }, { "epoch": 0.25295420974889216, "grad_norm": 0.4251217842102051, "learning_rate": 0.00018324916861682476, "loss": 0.4987, "step": 685 }, { "epoch": 0.2533234859675037, "grad_norm": 0.25384971499443054, "learning_rate": 0.00018322453504126125, "loss": 0.3379, "step": 686 }, { "epoch": 0.2536927621861152, "grad_norm": 0.3329382538795471, "learning_rate": 0.00018319990146569776, "loss": 0.3969, "step": 687 }, { "epoch": 0.25406203840472674, "grad_norm": 0.30880457162857056, "learning_rate": 0.00018317526789013425, "loss": 0.3837, "step": 688 }, { "epoch": 0.25443131462333823, "grad_norm": 0.2606697380542755, "learning_rate": 0.0001831506343145708, "loss": 0.3755, "step": 689 }, { "epoch": 0.2548005908419498, "grad_norm": 0.2648289203643799, "learning_rate": 0.00018312600073900728, "loss": 0.3259, "step": 690 }, { "epoch": 0.2551698670605613, "grad_norm": 0.2406310737133026, "learning_rate": 0.0001831013671634438, "loss": 0.2888, "step": 691 }, { "epoch": 0.2555391432791728, "grad_norm": 0.3420051336288452, "learning_rate": 0.00018307673358788028, "loss": 0.3411, "step": 692 }, { "epoch": 0.25590841949778437, "grad_norm": 0.2932198643684387, "learning_rate": 0.0001830521000123168, "loss": 0.3348, "step": 693 }, { "epoch": 0.25627769571639586, "grad_norm": 0.29080647230148315, "learning_rate": 0.0001830274664367533, "loss": 0.3892, "step": 694 }, { "epoch": 0.2566469719350074, "grad_norm": 0.32302191853523254, "learning_rate": 0.00018300283286118983, "loss": 0.3488, "step": 695 }, { "epoch": 0.2570162481536189, "grad_norm": 0.29372337460517883, "learning_rate": 0.0001829781992856263, "loss": 0.3834, "step": 696 }, { "epoch": 0.25738552437223045, "grad_norm": 0.2657610774040222, "learning_rate": 0.00018295356571006283, "loss": 0.2718, "step": 697 }, { "epoch": 0.25775480059084194, "grad_norm": 0.2769089639186859, "learning_rate": 0.00018292893213449934, "loss": 0.3309, "step": 698 }, { "epoch": 0.2581240768094535, "grad_norm": 0.3249312937259674, "learning_rate": 0.00018290429855893586, "loss": 0.4475, "step": 699 }, { "epoch": 0.258493353028065, "grad_norm": 0.28939583897590637, "learning_rate": 0.00018287966498337234, "loss": 0.3775, "step": 700 }, { "epoch": 0.258493353028065, "eval_loss": 0.34346112608909607, "eval_runtime": 5.8707, "eval_samples_per_second": 8.517, "eval_steps_per_second": 1.192, "step": 700 }, { "epoch": 0.2588626292466765, "grad_norm": 0.3047376871109009, "learning_rate": 0.00018285503140780883, "loss": 0.3566, "step": 701 }, { "epoch": 0.259231905465288, "grad_norm": 0.28955501317977905, "learning_rate": 0.00018283039783224535, "loss": 0.3964, "step": 702 }, { "epoch": 0.25960118168389956, "grad_norm": 0.28000709414482117, "learning_rate": 0.00018280576425668186, "loss": 0.3614, "step": 703 }, { "epoch": 0.25997045790251105, "grad_norm": 0.37217116355895996, "learning_rate": 0.00018278113068111838, "loss": 0.3892, "step": 704 }, { "epoch": 0.2603397341211226, "grad_norm": 0.2819831967353821, "learning_rate": 0.00018275649710555486, "loss": 0.3415, "step": 705 }, { "epoch": 0.26070901033973415, "grad_norm": 0.28753209114074707, "learning_rate": 0.00018273186352999138, "loss": 0.2532, "step": 706 }, { "epoch": 0.26107828655834564, "grad_norm": 0.33074823021888733, "learning_rate": 0.0001827072299544279, "loss": 0.3379, "step": 707 }, { "epoch": 0.2614475627769572, "grad_norm": 0.26290759444236755, "learning_rate": 0.0001826825963788644, "loss": 0.3852, "step": 708 }, { "epoch": 0.2618168389955687, "grad_norm": 0.281848281621933, "learning_rate": 0.0001826579628033009, "loss": 0.3215, "step": 709 }, { "epoch": 0.2621861152141802, "grad_norm": 0.27442625164985657, "learning_rate": 0.0001826333292277374, "loss": 0.3497, "step": 710 }, { "epoch": 0.2625553914327917, "grad_norm": 0.32699814438819885, "learning_rate": 0.00018260869565217392, "loss": 0.2898, "step": 711 }, { "epoch": 0.26292466765140327, "grad_norm": 0.2687341570854187, "learning_rate": 0.00018258406207661044, "loss": 0.2902, "step": 712 }, { "epoch": 0.26329394387001476, "grad_norm": 0.29069650173187256, "learning_rate": 0.00018255942850104693, "loss": 0.3481, "step": 713 }, { "epoch": 0.2636632200886263, "grad_norm": 0.28157898783683777, "learning_rate": 0.00018253479492548344, "loss": 0.3434, "step": 714 }, { "epoch": 0.2640324963072378, "grad_norm": 0.3608490824699402, "learning_rate": 0.00018251016134991993, "loss": 0.3482, "step": 715 }, { "epoch": 0.26440177252584934, "grad_norm": 0.25900065898895264, "learning_rate": 0.00018248552777435647, "loss": 0.3285, "step": 716 }, { "epoch": 0.26477104874446084, "grad_norm": 0.32958030700683594, "learning_rate": 0.00018246089419879296, "loss": 0.4435, "step": 717 }, { "epoch": 0.2651403249630724, "grad_norm": 0.26965761184692383, "learning_rate": 0.00018243626062322947, "loss": 0.3037, "step": 718 }, { "epoch": 0.2655096011816839, "grad_norm": 0.30884167551994324, "learning_rate": 0.00018241162704766596, "loss": 0.3645, "step": 719 }, { "epoch": 0.2658788774002954, "grad_norm": 0.3773479163646698, "learning_rate": 0.00018238699347210247, "loss": 0.3808, "step": 720 }, { "epoch": 0.26624815361890697, "grad_norm": 0.3207281231880188, "learning_rate": 0.000182362359896539, "loss": 0.3381, "step": 721 }, { "epoch": 0.26661742983751846, "grad_norm": 0.28987714648246765, "learning_rate": 0.0001823377263209755, "loss": 0.335, "step": 722 }, { "epoch": 0.26698670605613, "grad_norm": 0.2974066436290741, "learning_rate": 0.000182313092745412, "loss": 0.3933, "step": 723 }, { "epoch": 0.2673559822747415, "grad_norm": 0.3069154620170593, "learning_rate": 0.0001822884591698485, "loss": 0.4665, "step": 724 }, { "epoch": 0.26772525849335305, "grad_norm": 0.32729649543762207, "learning_rate": 0.00018226382559428502, "loss": 0.3923, "step": 725 }, { "epoch": 0.26809453471196454, "grad_norm": 0.3093666136264801, "learning_rate": 0.00018223919201872154, "loss": 0.4225, "step": 726 }, { "epoch": 0.2684638109305761, "grad_norm": 0.29534193873405457, "learning_rate": 0.00018221455844315802, "loss": 0.3724, "step": 727 }, { "epoch": 0.2688330871491876, "grad_norm": 0.27556416392326355, "learning_rate": 0.00018218992486759454, "loss": 0.3039, "step": 728 }, { "epoch": 0.2692023633677991, "grad_norm": 0.291860967874527, "learning_rate": 0.00018216529129203103, "loss": 0.3996, "step": 729 }, { "epoch": 0.2695716395864106, "grad_norm": 0.25631803274154663, "learning_rate": 0.00018214065771646757, "loss": 0.326, "step": 730 }, { "epoch": 0.26994091580502216, "grad_norm": 0.2968412935733795, "learning_rate": 0.00018211602414090405, "loss": 0.3103, "step": 731 }, { "epoch": 0.27031019202363366, "grad_norm": 0.2535908818244934, "learning_rate": 0.00018209139056534057, "loss": 0.293, "step": 732 }, { "epoch": 0.2706794682422452, "grad_norm": 0.2964654266834259, "learning_rate": 0.00018206675698977706, "loss": 0.3582, "step": 733 }, { "epoch": 0.2710487444608567, "grad_norm": 0.2881055772304535, "learning_rate": 0.00018204212341421357, "loss": 0.3766, "step": 734 }, { "epoch": 0.27141802067946824, "grad_norm": 0.3032318949699402, "learning_rate": 0.00018201748983865009, "loss": 0.3798, "step": 735 }, { "epoch": 0.2717872968980798, "grad_norm": 0.28385844826698303, "learning_rate": 0.0001819928562630866, "loss": 0.3321, "step": 736 }, { "epoch": 0.2721565731166913, "grad_norm": 0.3154700994491577, "learning_rate": 0.0001819682226875231, "loss": 0.3764, "step": 737 }, { "epoch": 0.2725258493353028, "grad_norm": 0.2877965271472931, "learning_rate": 0.0001819435891119596, "loss": 0.3417, "step": 738 }, { "epoch": 0.2728951255539143, "grad_norm": 0.33885276317596436, "learning_rate": 0.00018191895553639612, "loss": 0.4016, "step": 739 }, { "epoch": 0.27326440177252587, "grad_norm": 0.33902284502983093, "learning_rate": 0.00018189432196083263, "loss": 0.3702, "step": 740 }, { "epoch": 0.27363367799113736, "grad_norm": 0.3601471483707428, "learning_rate": 0.00018186968838526912, "loss": 0.3286, "step": 741 }, { "epoch": 0.2740029542097489, "grad_norm": 0.298261821269989, "learning_rate": 0.00018184505480970563, "loss": 0.3901, "step": 742 }, { "epoch": 0.2743722304283604, "grad_norm": 0.35861852765083313, "learning_rate": 0.00018182042123414215, "loss": 0.4522, "step": 743 }, { "epoch": 0.27474150664697194, "grad_norm": 0.31579142808914185, "learning_rate": 0.00018179578765857866, "loss": 0.4179, "step": 744 }, { "epoch": 0.27511078286558344, "grad_norm": 0.32154661417007446, "learning_rate": 0.00018177115408301515, "loss": 0.4405, "step": 745 }, { "epoch": 0.275480059084195, "grad_norm": 0.28148001432418823, "learning_rate": 0.00018174652050745167, "loss": 0.3703, "step": 746 }, { "epoch": 0.2758493353028065, "grad_norm": 0.3106623888015747, "learning_rate": 0.00018172188693188815, "loss": 0.3184, "step": 747 }, { "epoch": 0.276218611521418, "grad_norm": 0.32666391134262085, "learning_rate": 0.0001816972533563247, "loss": 0.3548, "step": 748 }, { "epoch": 0.2765878877400295, "grad_norm": 0.3134174942970276, "learning_rate": 0.00018167261978076118, "loss": 0.3829, "step": 749 }, { "epoch": 0.27695716395864106, "grad_norm": 0.33211180567741394, "learning_rate": 0.0001816479862051977, "loss": 0.3714, "step": 750 }, { "epoch": 0.27695716395864106, "eval_loss": 0.3437245190143585, "eval_runtime": 5.8628, "eval_samples_per_second": 8.528, "eval_steps_per_second": 1.194, "step": 750 }, { "epoch": 0.2773264401772526, "grad_norm": 0.2977505028247833, "learning_rate": 0.00018162335262963418, "loss": 0.3801, "step": 751 }, { "epoch": 0.2776957163958641, "grad_norm": 0.30672118067741394, "learning_rate": 0.0001815987190540707, "loss": 0.3891, "step": 752 }, { "epoch": 0.27806499261447565, "grad_norm": 0.22556626796722412, "learning_rate": 0.00018157408547850721, "loss": 0.2849, "step": 753 }, { "epoch": 0.27843426883308714, "grad_norm": 0.30615079402923584, "learning_rate": 0.00018154945190294373, "loss": 0.4246, "step": 754 }, { "epoch": 0.2788035450516987, "grad_norm": 0.2854730486869812, "learning_rate": 0.00018152481832738022, "loss": 0.3344, "step": 755 }, { "epoch": 0.2791728212703102, "grad_norm": 0.2818675935268402, "learning_rate": 0.00018150018475181673, "loss": 0.3566, "step": 756 }, { "epoch": 0.2795420974889217, "grad_norm": 0.27592697739601135, "learning_rate": 0.00018147555117625325, "loss": 0.3282, "step": 757 }, { "epoch": 0.2799113737075332, "grad_norm": 0.29141223430633545, "learning_rate": 0.00018145091760068976, "loss": 0.4406, "step": 758 }, { "epoch": 0.28028064992614476, "grad_norm": 0.31673353910446167, "learning_rate": 0.00018142628402512625, "loss": 0.4559, "step": 759 }, { "epoch": 0.28064992614475626, "grad_norm": 0.2683919370174408, "learning_rate": 0.00018140165044956276, "loss": 0.3207, "step": 760 }, { "epoch": 0.2810192023633678, "grad_norm": 0.3255873918533325, "learning_rate": 0.00018137701687399925, "loss": 0.3369, "step": 761 }, { "epoch": 0.2813884785819793, "grad_norm": 0.273697167634964, "learning_rate": 0.0001813523832984358, "loss": 0.3523, "step": 762 }, { "epoch": 0.28175775480059084, "grad_norm": 0.32450446486473083, "learning_rate": 0.00018132774972287228, "loss": 0.3772, "step": 763 }, { "epoch": 0.2821270310192024, "grad_norm": 0.27704933285713196, "learning_rate": 0.0001813031161473088, "loss": 0.3428, "step": 764 }, { "epoch": 0.2824963072378139, "grad_norm": 0.27535080909729004, "learning_rate": 0.00018127848257174528, "loss": 0.3595, "step": 765 }, { "epoch": 0.28286558345642543, "grad_norm": 0.2901923656463623, "learning_rate": 0.0001812538489961818, "loss": 0.3636, "step": 766 }, { "epoch": 0.2832348596750369, "grad_norm": 0.2620357573032379, "learning_rate": 0.0001812292154206183, "loss": 0.3531, "step": 767 }, { "epoch": 0.28360413589364847, "grad_norm": 0.534293532371521, "learning_rate": 0.00018120458184505482, "loss": 0.3886, "step": 768 }, { "epoch": 0.28397341211225996, "grad_norm": 0.24337033927440643, "learning_rate": 0.0001811799482694913, "loss": 0.2845, "step": 769 }, { "epoch": 0.2843426883308715, "grad_norm": 0.3237406611442566, "learning_rate": 0.00018115531469392783, "loss": 0.372, "step": 770 }, { "epoch": 0.284711964549483, "grad_norm": 0.29243141412734985, "learning_rate": 0.00018113068111836434, "loss": 0.3658, "step": 771 }, { "epoch": 0.28508124076809455, "grad_norm": 0.2752094268798828, "learning_rate": 0.00018110604754280086, "loss": 0.4016, "step": 772 }, { "epoch": 0.28545051698670604, "grad_norm": 0.29873543977737427, "learning_rate": 0.00018108141396723734, "loss": 0.4039, "step": 773 }, { "epoch": 0.2858197932053176, "grad_norm": 0.3566305637359619, "learning_rate": 0.00018105678039167386, "loss": 0.3922, "step": 774 }, { "epoch": 0.2861890694239291, "grad_norm": 0.24314096570014954, "learning_rate": 0.00018103214681611035, "loss": 0.2998, "step": 775 }, { "epoch": 0.2865583456425406, "grad_norm": 0.49944359064102173, "learning_rate": 0.0001810075132405469, "loss": 0.3827, "step": 776 }, { "epoch": 0.2869276218611521, "grad_norm": 0.3388037085533142, "learning_rate": 0.00018098287966498338, "loss": 0.4065, "step": 777 }, { "epoch": 0.28729689807976366, "grad_norm": 0.31323766708374023, "learning_rate": 0.0001809582460894199, "loss": 0.3669, "step": 778 }, { "epoch": 0.2876661742983752, "grad_norm": 0.3184468746185303, "learning_rate": 0.00018093361251385638, "loss": 0.3813, "step": 779 }, { "epoch": 0.2880354505169867, "grad_norm": 0.35029998421669006, "learning_rate": 0.0001809089789382929, "loss": 0.3933, "step": 780 }, { "epoch": 0.28840472673559825, "grad_norm": 0.28155091404914856, "learning_rate": 0.0001808843453627294, "loss": 0.3198, "step": 781 }, { "epoch": 0.28877400295420974, "grad_norm": 0.28609567880630493, "learning_rate": 0.00018085971178716592, "loss": 0.3188, "step": 782 }, { "epoch": 0.2891432791728213, "grad_norm": 0.3034074008464813, "learning_rate": 0.0001808350782116024, "loss": 0.4208, "step": 783 }, { "epoch": 0.2895125553914328, "grad_norm": 0.27176764607429504, "learning_rate": 0.00018081044463603892, "loss": 0.3128, "step": 784 }, { "epoch": 0.2898818316100443, "grad_norm": 0.42909035086631775, "learning_rate": 0.00018078581106047544, "loss": 0.4614, "step": 785 }, { "epoch": 0.2902511078286558, "grad_norm": 0.26615554094314575, "learning_rate": 0.00018076117748491195, "loss": 0.3133, "step": 786 }, { "epoch": 0.29062038404726737, "grad_norm": 0.28979459404945374, "learning_rate": 0.00018073654390934844, "loss": 0.3536, "step": 787 }, { "epoch": 0.29098966026587886, "grad_norm": 0.266000896692276, "learning_rate": 0.00018071191033378496, "loss": 0.3465, "step": 788 }, { "epoch": 0.2913589364844904, "grad_norm": 0.2925066351890564, "learning_rate": 0.00018068727675822147, "loss": 0.439, "step": 789 }, { "epoch": 0.2917282127031019, "grad_norm": 0.284578412771225, "learning_rate": 0.00018066264318265798, "loss": 0.3795, "step": 790 }, { "epoch": 0.29209748892171344, "grad_norm": 0.2803080379962921, "learning_rate": 0.00018063800960709447, "loss": 0.3622, "step": 791 }, { "epoch": 0.29246676514032494, "grad_norm": 0.2882033586502075, "learning_rate": 0.000180613376031531, "loss": 0.3518, "step": 792 }, { "epoch": 0.2928360413589365, "grad_norm": 0.2589658200740814, "learning_rate": 0.00018058874245596747, "loss": 0.3091, "step": 793 }, { "epoch": 0.29320531757754803, "grad_norm": 0.2796440124511719, "learning_rate": 0.00018056410888040402, "loss": 0.3395, "step": 794 }, { "epoch": 0.2935745937961595, "grad_norm": 0.2894507050514221, "learning_rate": 0.0001805394753048405, "loss": 0.3363, "step": 795 }, { "epoch": 0.29394387001477107, "grad_norm": 0.29944106936454773, "learning_rate": 0.00018051484172927702, "loss": 0.3681, "step": 796 }, { "epoch": 0.29431314623338256, "grad_norm": 0.26254385709762573, "learning_rate": 0.0001804902081537135, "loss": 0.299, "step": 797 }, { "epoch": 0.2946824224519941, "grad_norm": 0.28542569279670715, "learning_rate": 0.00018046557457815002, "loss": 0.3372, "step": 798 }, { "epoch": 0.2950516986706056, "grad_norm": 0.3604605495929718, "learning_rate": 0.00018044094100258653, "loss": 0.405, "step": 799 }, { "epoch": 0.29542097488921715, "grad_norm": 0.25007471442222595, "learning_rate": 0.00018041630742702305, "loss": 0.3783, "step": 800 }, { "epoch": 0.29542097488921715, "eval_loss": 0.3364347219467163, "eval_runtime": 5.8652, "eval_samples_per_second": 8.525, "eval_steps_per_second": 1.193, "step": 800 }, { "epoch": 0.29579025110782864, "grad_norm": 0.28190720081329346, "learning_rate": 0.00018039167385145954, "loss": 0.3105, "step": 801 }, { "epoch": 0.2961595273264402, "grad_norm": 0.2979622483253479, "learning_rate": 0.00018036704027589605, "loss": 0.3427, "step": 802 }, { "epoch": 0.2965288035450517, "grad_norm": 0.30916163325309753, "learning_rate": 0.00018034240670033257, "loss": 0.3733, "step": 803 }, { "epoch": 0.2968980797636632, "grad_norm": 0.31245294213294983, "learning_rate": 0.00018031777312476908, "loss": 0.3744, "step": 804 }, { "epoch": 0.2972673559822747, "grad_norm": 0.2838025987148285, "learning_rate": 0.00018029313954920557, "loss": 0.3682, "step": 805 }, { "epoch": 0.29763663220088626, "grad_norm": 0.3152095079421997, "learning_rate": 0.00018026850597364208, "loss": 0.4009, "step": 806 }, { "epoch": 0.2980059084194978, "grad_norm": 0.2876656651496887, "learning_rate": 0.00018024387239807857, "loss": 0.3572, "step": 807 }, { "epoch": 0.2983751846381093, "grad_norm": 0.31334713101387024, "learning_rate": 0.0001802192388225151, "loss": 0.4065, "step": 808 }, { "epoch": 0.29874446085672085, "grad_norm": 0.2907058298587799, "learning_rate": 0.0001801946052469516, "loss": 0.406, "step": 809 }, { "epoch": 0.29911373707533234, "grad_norm": 0.25115716457366943, "learning_rate": 0.00018016997167138811, "loss": 0.3331, "step": 810 }, { "epoch": 0.2994830132939439, "grad_norm": 0.2785030007362366, "learning_rate": 0.0001801453380958246, "loss": 0.3398, "step": 811 }, { "epoch": 0.2998522895125554, "grad_norm": 0.2514455020427704, "learning_rate": 0.00018012070452026112, "loss": 0.3256, "step": 812 }, { "epoch": 0.30022156573116693, "grad_norm": 0.2781217396259308, "learning_rate": 0.00018009607094469763, "loss": 0.3162, "step": 813 }, { "epoch": 0.3005908419497784, "grad_norm": 0.2895994484424591, "learning_rate": 0.00018007143736913415, "loss": 0.3805, "step": 814 }, { "epoch": 0.30096011816838997, "grad_norm": 0.2785027027130127, "learning_rate": 0.00018004680379357063, "loss": 0.2684, "step": 815 }, { "epoch": 0.30132939438700146, "grad_norm": 0.2652154862880707, "learning_rate": 0.00018002217021800715, "loss": 0.2917, "step": 816 }, { "epoch": 0.301698670605613, "grad_norm": 0.28336066007614136, "learning_rate": 0.00017999753664244366, "loss": 0.3594, "step": 817 }, { "epoch": 0.3020679468242245, "grad_norm": 0.34773701429367065, "learning_rate": 0.00017997290306688018, "loss": 0.4795, "step": 818 }, { "epoch": 0.30243722304283605, "grad_norm": 0.3035285174846649, "learning_rate": 0.00017994826949131666, "loss": 0.3498, "step": 819 }, { "epoch": 0.30280649926144754, "grad_norm": 0.31368377804756165, "learning_rate": 0.00017992363591575318, "loss": 0.3344, "step": 820 }, { "epoch": 0.3031757754800591, "grad_norm": 0.2675492763519287, "learning_rate": 0.0001798990023401897, "loss": 0.2946, "step": 821 }, { "epoch": 0.30354505169867063, "grad_norm": 0.242357075214386, "learning_rate": 0.0001798743687646262, "loss": 0.3179, "step": 822 }, { "epoch": 0.3039143279172821, "grad_norm": 0.32820671796798706, "learning_rate": 0.0001798497351890627, "loss": 0.3931, "step": 823 }, { "epoch": 0.30428360413589367, "grad_norm": 0.2815065085887909, "learning_rate": 0.0001798251016134992, "loss": 0.361, "step": 824 }, { "epoch": 0.30465288035450516, "grad_norm": 0.313423752784729, "learning_rate": 0.0001798004680379357, "loss": 0.38, "step": 825 }, { "epoch": 0.3050221565731167, "grad_norm": 0.3085949420928955, "learning_rate": 0.00017977583446237224, "loss": 0.4053, "step": 826 }, { "epoch": 0.3053914327917282, "grad_norm": 0.3582143783569336, "learning_rate": 0.00017975120088680873, "loss": 0.3648, "step": 827 }, { "epoch": 0.30576070901033975, "grad_norm": 0.2894408106803894, "learning_rate": 0.00017972656731124524, "loss": 0.3525, "step": 828 }, { "epoch": 0.30612998522895124, "grad_norm": 0.2721264362335205, "learning_rate": 0.00017970193373568173, "loss": 0.3277, "step": 829 }, { "epoch": 0.3064992614475628, "grad_norm": 0.2568438947200775, "learning_rate": 0.00017967730016011824, "loss": 0.3054, "step": 830 }, { "epoch": 0.3068685376661743, "grad_norm": 0.29634156823158264, "learning_rate": 0.00017965266658455476, "loss": 0.3478, "step": 831 }, { "epoch": 0.3072378138847858, "grad_norm": 0.3144989311695099, "learning_rate": 0.00017962803300899127, "loss": 0.3488, "step": 832 }, { "epoch": 0.3076070901033973, "grad_norm": 0.316948801279068, "learning_rate": 0.00017960339943342776, "loss": 0.3406, "step": 833 }, { "epoch": 0.30797636632200887, "grad_norm": 0.3339119851589203, "learning_rate": 0.00017957876585786428, "loss": 0.3856, "step": 834 }, { "epoch": 0.30834564254062036, "grad_norm": 0.3087283670902252, "learning_rate": 0.0001795541322823008, "loss": 0.3235, "step": 835 }, { "epoch": 0.3087149187592319, "grad_norm": 0.25156664848327637, "learning_rate": 0.0001795294987067373, "loss": 0.3036, "step": 836 }, { "epoch": 0.30908419497784345, "grad_norm": 0.25502482056617737, "learning_rate": 0.0001795048651311738, "loss": 0.3215, "step": 837 }, { "epoch": 0.30945347119645494, "grad_norm": 0.2689816355705261, "learning_rate": 0.0001794802315556103, "loss": 0.348, "step": 838 }, { "epoch": 0.3098227474150665, "grad_norm": 0.4214155972003937, "learning_rate": 0.0001794555979800468, "loss": 0.3808, "step": 839 }, { "epoch": 0.310192023633678, "grad_norm": 0.2970890998840332, "learning_rate": 0.00017943096440448334, "loss": 0.3542, "step": 840 }, { "epoch": 0.31056129985228953, "grad_norm": 0.23918575048446655, "learning_rate": 0.00017940633082891982, "loss": 0.2426, "step": 841 }, { "epoch": 0.310930576070901, "grad_norm": 0.28873178362846375, "learning_rate": 0.00017938169725335634, "loss": 0.4068, "step": 842 }, { "epoch": 0.31129985228951257, "grad_norm": 0.3294038474559784, "learning_rate": 0.00017935706367779283, "loss": 0.4282, "step": 843 }, { "epoch": 0.31166912850812406, "grad_norm": 0.28786158561706543, "learning_rate": 0.00017933243010222934, "loss": 0.3551, "step": 844 }, { "epoch": 0.3120384047267356, "grad_norm": 0.3132387697696686, "learning_rate": 0.00017930779652666586, "loss": 0.3127, "step": 845 }, { "epoch": 0.3124076809453471, "grad_norm": 0.25213518738746643, "learning_rate": 0.00017928316295110237, "loss": 0.2984, "step": 846 }, { "epoch": 0.31277695716395865, "grad_norm": 0.2759394347667694, "learning_rate": 0.00017925852937553886, "loss": 0.326, "step": 847 }, { "epoch": 0.31314623338257014, "grad_norm": 0.27484017610549927, "learning_rate": 0.00017923389579997537, "loss": 0.2922, "step": 848 }, { "epoch": 0.3135155096011817, "grad_norm": 0.2975594103336334, "learning_rate": 0.0001792092622244119, "loss": 0.3467, "step": 849 }, { "epoch": 0.31388478581979323, "grad_norm": 0.3532097339630127, "learning_rate": 0.0001791846286488484, "loss": 0.386, "step": 850 }, { "epoch": 0.31388478581979323, "eval_loss": 0.33392348885536194, "eval_runtime": 5.8583, "eval_samples_per_second": 8.535, "eval_steps_per_second": 1.195, "step": 850 }, { "epoch": 0.3142540620384047, "grad_norm": 0.24326710402965546, "learning_rate": 0.0001791599950732849, "loss": 0.2547, "step": 851 }, { "epoch": 0.31462333825701627, "grad_norm": 0.2544013261795044, "learning_rate": 0.0001791353614977214, "loss": 0.3396, "step": 852 }, { "epoch": 0.31499261447562776, "grad_norm": 0.3768573999404907, "learning_rate": 0.00017911072792215792, "loss": 0.3424, "step": 853 }, { "epoch": 0.3153618906942393, "grad_norm": 0.3502283990383148, "learning_rate": 0.00017908609434659443, "loss": 0.4177, "step": 854 }, { "epoch": 0.3157311669128508, "grad_norm": 0.29516372084617615, "learning_rate": 0.00017906146077103092, "loss": 0.4256, "step": 855 }, { "epoch": 0.31610044313146235, "grad_norm": 0.27422836422920227, "learning_rate": 0.00017903682719546744, "loss": 0.3216, "step": 856 }, { "epoch": 0.31646971935007384, "grad_norm": 0.2891975939273834, "learning_rate": 0.00017901219361990392, "loss": 0.2996, "step": 857 }, { "epoch": 0.3168389955686854, "grad_norm": 0.34916412830352783, "learning_rate": 0.00017898756004434046, "loss": 0.3324, "step": 858 }, { "epoch": 0.3172082717872969, "grad_norm": 0.35271620750427246, "learning_rate": 0.00017896292646877695, "loss": 0.405, "step": 859 }, { "epoch": 0.3175775480059084, "grad_norm": 0.251388818025589, "learning_rate": 0.00017893829289321347, "loss": 0.3203, "step": 860 }, { "epoch": 0.3179468242245199, "grad_norm": 0.2781042754650116, "learning_rate": 0.00017891365931764995, "loss": 0.3063, "step": 861 }, { "epoch": 0.31831610044313147, "grad_norm": 0.2542610168457031, "learning_rate": 0.00017888902574208647, "loss": 0.3106, "step": 862 }, { "epoch": 0.31868537666174296, "grad_norm": 0.2617054283618927, "learning_rate": 0.00017886439216652298, "loss": 0.2967, "step": 863 }, { "epoch": 0.3190546528803545, "grad_norm": 0.27979138493537903, "learning_rate": 0.0001788397585909595, "loss": 0.2823, "step": 864 }, { "epoch": 0.31942392909896605, "grad_norm": 0.23414112627506256, "learning_rate": 0.00017881512501539599, "loss": 0.2796, "step": 865 }, { "epoch": 0.31979320531757754, "grad_norm": 0.30112823843955994, "learning_rate": 0.0001787904914398325, "loss": 0.3338, "step": 866 }, { "epoch": 0.3201624815361891, "grad_norm": 0.3144896924495697, "learning_rate": 0.00017876585786426902, "loss": 0.3887, "step": 867 }, { "epoch": 0.3205317577548006, "grad_norm": 0.34166356921195984, "learning_rate": 0.00017874122428870553, "loss": 0.4369, "step": 868 }, { "epoch": 0.32090103397341213, "grad_norm": 0.2831858694553375, "learning_rate": 0.00017871659071314202, "loss": 0.2869, "step": 869 }, { "epoch": 0.3212703101920236, "grad_norm": 0.3323805332183838, "learning_rate": 0.00017869195713757853, "loss": 0.3173, "step": 870 }, { "epoch": 0.32163958641063517, "grad_norm": 0.27442899346351624, "learning_rate": 0.00017866732356201502, "loss": 0.2456, "step": 871 }, { "epoch": 0.32200886262924666, "grad_norm": 0.25058212876319885, "learning_rate": 0.00017864268998645156, "loss": 0.3411, "step": 872 }, { "epoch": 0.3223781388478582, "grad_norm": 0.24768081307411194, "learning_rate": 0.00017861805641088805, "loss": 0.3238, "step": 873 }, { "epoch": 0.3227474150664697, "grad_norm": 0.3337574005126953, "learning_rate": 0.00017859342283532456, "loss": 0.4814, "step": 874 }, { "epoch": 0.32311669128508125, "grad_norm": 0.2864474058151245, "learning_rate": 0.00017856878925976105, "loss": 0.3523, "step": 875 }, { "epoch": 0.32348596750369274, "grad_norm": 0.27543604373931885, "learning_rate": 0.00017854415568419757, "loss": 0.3394, "step": 876 }, { "epoch": 0.3238552437223043, "grad_norm": 0.2650769352912903, "learning_rate": 0.00017851952210863408, "loss": 0.3473, "step": 877 }, { "epoch": 0.3242245199409158, "grad_norm": 0.29300224781036377, "learning_rate": 0.0001784948885330706, "loss": 0.3884, "step": 878 }, { "epoch": 0.3245937961595273, "grad_norm": 0.3521723747253418, "learning_rate": 0.00017847025495750708, "loss": 0.419, "step": 879 }, { "epoch": 0.3249630723781389, "grad_norm": 0.28816041350364685, "learning_rate": 0.0001784456213819436, "loss": 0.3261, "step": 880 }, { "epoch": 0.32533234859675036, "grad_norm": 0.27399736642837524, "learning_rate": 0.0001784209878063801, "loss": 0.3483, "step": 881 }, { "epoch": 0.3257016248153619, "grad_norm": 0.31869786977767944, "learning_rate": 0.00017839635423081663, "loss": 0.4149, "step": 882 }, { "epoch": 0.3260709010339734, "grad_norm": 0.26290929317474365, "learning_rate": 0.00017837172065525311, "loss": 0.3341, "step": 883 }, { "epoch": 0.32644017725258495, "grad_norm": 0.278089702129364, "learning_rate": 0.00017834708707968963, "loss": 0.4107, "step": 884 }, { "epoch": 0.32680945347119644, "grad_norm": 0.31756216287612915, "learning_rate": 0.00017832245350412614, "loss": 0.4187, "step": 885 }, { "epoch": 0.327178729689808, "grad_norm": 0.3544643223285675, "learning_rate": 0.00017829781992856266, "loss": 0.4412, "step": 886 }, { "epoch": 0.3275480059084195, "grad_norm": 0.31643715500831604, "learning_rate": 0.00017827318635299915, "loss": 0.3393, "step": 887 }, { "epoch": 0.32791728212703103, "grad_norm": 0.30142152309417725, "learning_rate": 0.00017824855277743566, "loss": 0.4242, "step": 888 }, { "epoch": 0.3282865583456425, "grad_norm": 0.2803404629230499, "learning_rate": 0.00017822391920187215, "loss": 0.3514, "step": 889 }, { "epoch": 0.32865583456425407, "grad_norm": 0.3276580572128296, "learning_rate": 0.0001781992856263087, "loss": 0.4045, "step": 890 }, { "epoch": 0.32902511078286556, "grad_norm": 0.24273940920829773, "learning_rate": 0.00017817465205074518, "loss": 0.3047, "step": 891 }, { "epoch": 0.3293943870014771, "grad_norm": 0.2541992664337158, "learning_rate": 0.0001781500184751817, "loss": 0.3145, "step": 892 }, { "epoch": 0.3297636632200886, "grad_norm": 0.2850353419780731, "learning_rate": 0.00017812538489961818, "loss": 0.3145, "step": 893 }, { "epoch": 0.33013293943870015, "grad_norm": 0.2915472686290741, "learning_rate": 0.0001781007513240547, "loss": 0.3703, "step": 894 }, { "epoch": 0.3305022156573117, "grad_norm": 0.2595556080341339, "learning_rate": 0.0001780761177484912, "loss": 0.3154, "step": 895 }, { "epoch": 0.3308714918759232, "grad_norm": 0.3130100965499878, "learning_rate": 0.00017805148417292772, "loss": 0.3693, "step": 896 }, { "epoch": 0.33124076809453473, "grad_norm": 0.26682716608047485, "learning_rate": 0.0001780268505973642, "loss": 0.2852, "step": 897 }, { "epoch": 0.3316100443131462, "grad_norm": 0.33813703060150146, "learning_rate": 0.00017800221702180073, "loss": 0.4482, "step": 898 }, { "epoch": 0.33197932053175777, "grad_norm": 0.3224930465221405, "learning_rate": 0.00017797758344623724, "loss": 0.3355, "step": 899 }, { "epoch": 0.33234859675036926, "grad_norm": 0.3120392858982086, "learning_rate": 0.00017795294987067375, "loss": 0.3994, "step": 900 }, { "epoch": 0.33234859675036926, "eval_loss": 0.33280622959136963, "eval_runtime": 5.8567, "eval_samples_per_second": 8.537, "eval_steps_per_second": 1.195, "step": 900 }, { "epoch": 0.3327178729689808, "grad_norm": 0.2731114327907562, "learning_rate": 0.00017792831629511024, "loss": 0.28, "step": 901 }, { "epoch": 0.3330871491875923, "grad_norm": 0.3192294239997864, "learning_rate": 0.00017790368271954676, "loss": 0.4177, "step": 902 }, { "epoch": 0.33345642540620385, "grad_norm": 0.28603675961494446, "learning_rate": 0.00017787904914398324, "loss": 0.3673, "step": 903 }, { "epoch": 0.33382570162481534, "grad_norm": 0.2774650454521179, "learning_rate": 0.00017785441556841979, "loss": 0.2758, "step": 904 }, { "epoch": 0.3341949778434269, "grad_norm": 0.31270119547843933, "learning_rate": 0.00017782978199285627, "loss": 0.3327, "step": 905 }, { "epoch": 0.3345642540620384, "grad_norm": 0.3085302710533142, "learning_rate": 0.0001778051484172928, "loss": 0.3492, "step": 906 }, { "epoch": 0.3349335302806499, "grad_norm": 0.2787379026412964, "learning_rate": 0.00017778051484172928, "loss": 0.2969, "step": 907 }, { "epoch": 0.3353028064992615, "grad_norm": 0.28858089447021484, "learning_rate": 0.0001777558812661658, "loss": 0.3691, "step": 908 }, { "epoch": 0.33567208271787297, "grad_norm": 0.3041202127933502, "learning_rate": 0.0001777312476906023, "loss": 0.3389, "step": 909 }, { "epoch": 0.3360413589364845, "grad_norm": 0.28655505180358887, "learning_rate": 0.00017770661411503882, "loss": 0.3701, "step": 910 }, { "epoch": 0.336410635155096, "grad_norm": 0.2661409378051758, "learning_rate": 0.0001776819805394753, "loss": 0.3454, "step": 911 }, { "epoch": 0.33677991137370755, "grad_norm": 0.2793481647968292, "learning_rate": 0.00017765734696391182, "loss": 0.3738, "step": 912 }, { "epoch": 0.33714918759231904, "grad_norm": 0.2695358097553253, "learning_rate": 0.00017763271338834834, "loss": 0.352, "step": 913 }, { "epoch": 0.3375184638109306, "grad_norm": 0.3453199863433838, "learning_rate": 0.00017760807981278485, "loss": 0.3278, "step": 914 }, { "epoch": 0.3378877400295421, "grad_norm": 0.2923884093761444, "learning_rate": 0.00017758344623722134, "loss": 0.368, "step": 915 }, { "epoch": 0.33825701624815363, "grad_norm": 0.3048684895038605, "learning_rate": 0.00017755881266165785, "loss": 0.3263, "step": 916 }, { "epoch": 0.3386262924667651, "grad_norm": 0.27479395270347595, "learning_rate": 0.00017753417908609437, "loss": 0.3459, "step": 917 }, { "epoch": 0.33899556868537667, "grad_norm": 0.34671294689178467, "learning_rate": 0.00017750954551053088, "loss": 0.3735, "step": 918 }, { "epoch": 0.33936484490398816, "grad_norm": 0.31332647800445557, "learning_rate": 0.00017748491193496737, "loss": 0.3626, "step": 919 }, { "epoch": 0.3397341211225997, "grad_norm": 0.23130548000335693, "learning_rate": 0.00017746027835940388, "loss": 0.3068, "step": 920 }, { "epoch": 0.3401033973412112, "grad_norm": 0.3047637939453125, "learning_rate": 0.00017743564478384037, "loss": 0.3439, "step": 921 }, { "epoch": 0.34047267355982275, "grad_norm": 0.33907750248908997, "learning_rate": 0.00017741101120827691, "loss": 0.3617, "step": 922 }, { "epoch": 0.3408419497784343, "grad_norm": 0.32694822549819946, "learning_rate": 0.0001773863776327134, "loss": 0.3503, "step": 923 }, { "epoch": 0.3412112259970458, "grad_norm": 0.27101773023605347, "learning_rate": 0.00017736174405714992, "loss": 0.3474, "step": 924 }, { "epoch": 0.34158050221565733, "grad_norm": 0.2639794945716858, "learning_rate": 0.0001773371104815864, "loss": 0.3078, "step": 925 }, { "epoch": 0.3419497784342688, "grad_norm": 0.2643485367298126, "learning_rate": 0.00017731247690602292, "loss": 0.2704, "step": 926 }, { "epoch": 0.3423190546528804, "grad_norm": 0.2682543992996216, "learning_rate": 0.00017728784333045943, "loss": 0.365, "step": 927 }, { "epoch": 0.34268833087149186, "grad_norm": 0.2998700439929962, "learning_rate": 0.00017726320975489595, "loss": 0.3521, "step": 928 }, { "epoch": 0.3430576070901034, "grad_norm": 0.3435867726802826, "learning_rate": 0.00017723857617933244, "loss": 0.3718, "step": 929 }, { "epoch": 0.3434268833087149, "grad_norm": 0.30668380856513977, "learning_rate": 0.00017721394260376895, "loss": 0.389, "step": 930 }, { "epoch": 0.34379615952732645, "grad_norm": 0.33005931973457336, "learning_rate": 0.00017718930902820546, "loss": 0.4633, "step": 931 }, { "epoch": 0.34416543574593794, "grad_norm": 0.3313938081264496, "learning_rate": 0.00017716467545264198, "loss": 0.4582, "step": 932 }, { "epoch": 0.3445347119645495, "grad_norm": 0.27533599734306335, "learning_rate": 0.00017714004187707847, "loss": 0.3448, "step": 933 }, { "epoch": 0.344903988183161, "grad_norm": 0.31112176179885864, "learning_rate": 0.00017711540830151495, "loss": 0.3978, "step": 934 }, { "epoch": 0.34527326440177253, "grad_norm": 0.30256086587905884, "learning_rate": 0.00017709077472595147, "loss": 0.3801, "step": 935 }, { "epoch": 0.345642540620384, "grad_norm": 0.2602297365665436, "learning_rate": 0.00017706614115038798, "loss": 0.3414, "step": 936 }, { "epoch": 0.34601181683899557, "grad_norm": 0.23241616785526276, "learning_rate": 0.0001770415075748245, "loss": 0.3051, "step": 937 }, { "epoch": 0.3463810930576071, "grad_norm": 0.2802417576313019, "learning_rate": 0.00017701687399926099, "loss": 0.2978, "step": 938 }, { "epoch": 0.3467503692762186, "grad_norm": 0.2938059866428375, "learning_rate": 0.0001769922404236975, "loss": 0.3361, "step": 939 }, { "epoch": 0.34711964549483015, "grad_norm": 0.2498949021100998, "learning_rate": 0.00017696760684813401, "loss": 0.3115, "step": 940 }, { "epoch": 0.34748892171344165, "grad_norm": 0.2669752240180969, "learning_rate": 0.00017694297327257053, "loss": 0.288, "step": 941 }, { "epoch": 0.3478581979320532, "grad_norm": 0.29485201835632324, "learning_rate": 0.00017691833969700702, "loss": 0.2898, "step": 942 }, { "epoch": 0.3482274741506647, "grad_norm": 0.28879061341285706, "learning_rate": 0.00017689370612144353, "loss": 0.311, "step": 943 }, { "epoch": 0.34859675036927623, "grad_norm": 0.25666582584381104, "learning_rate": 0.00017686907254588002, "loss": 0.3279, "step": 944 }, { "epoch": 0.3489660265878877, "grad_norm": 0.2344098687171936, "learning_rate": 0.00017684443897031656, "loss": 0.307, "step": 945 }, { "epoch": 0.34933530280649927, "grad_norm": 0.26992878317832947, "learning_rate": 0.00017681980539475305, "loss": 0.3829, "step": 946 }, { "epoch": 0.34970457902511076, "grad_norm": 0.28339770436286926, "learning_rate": 0.00017679517181918956, "loss": 0.3002, "step": 947 }, { "epoch": 0.3500738552437223, "grad_norm": 0.2965022921562195, "learning_rate": 0.00017677053824362605, "loss": 0.3751, "step": 948 }, { "epoch": 0.3504431314623338, "grad_norm": 0.2937091588973999, "learning_rate": 0.00017674590466806257, "loss": 0.3755, "step": 949 }, { "epoch": 0.35081240768094535, "grad_norm": 0.23199358582496643, "learning_rate": 0.00017672127109249908, "loss": 0.2383, "step": 950 }, { "epoch": 0.35081240768094535, "eval_loss": 0.3347838222980499, "eval_runtime": 5.8517, "eval_samples_per_second": 8.545, "eval_steps_per_second": 1.196, "step": 950 }, { "epoch": 0.3511816838995569, "grad_norm": 0.23240543901920319, "learning_rate": 0.0001766966375169356, "loss": 0.2812, "step": 951 }, { "epoch": 0.3515509601181684, "grad_norm": 0.27188020944595337, "learning_rate": 0.00017667200394137208, "loss": 0.3252, "step": 952 }, { "epoch": 0.35192023633677993, "grad_norm": 0.30740195512771606, "learning_rate": 0.0001766473703658086, "loss": 0.3731, "step": 953 }, { "epoch": 0.3522895125553914, "grad_norm": 0.27258527278900146, "learning_rate": 0.0001766227367902451, "loss": 0.2974, "step": 954 }, { "epoch": 0.352658788774003, "grad_norm": 0.2678452134132385, "learning_rate": 0.00017659810321468163, "loss": 0.2672, "step": 955 }, { "epoch": 0.35302806499261447, "grad_norm": 0.2737193703651428, "learning_rate": 0.0001765734696391181, "loss": 0.3152, "step": 956 }, { "epoch": 0.353397341211226, "grad_norm": 0.37009337544441223, "learning_rate": 0.00017654883606355463, "loss": 0.3848, "step": 957 }, { "epoch": 0.3537666174298375, "grad_norm": 0.272429496049881, "learning_rate": 0.00017652420248799114, "loss": 0.3315, "step": 958 }, { "epoch": 0.35413589364844905, "grad_norm": 0.2654068171977997, "learning_rate": 0.00017649956891242766, "loss": 0.3273, "step": 959 }, { "epoch": 0.35450516986706054, "grad_norm": 0.2581124007701874, "learning_rate": 0.00017647493533686414, "loss": 0.3332, "step": 960 }, { "epoch": 0.3548744460856721, "grad_norm": 0.38511034846305847, "learning_rate": 0.00017645030176130066, "loss": 0.3301, "step": 961 }, { "epoch": 0.3552437223042836, "grad_norm": 0.2135087251663208, "learning_rate": 0.00017642566818573715, "loss": 0.2557, "step": 962 }, { "epoch": 0.35561299852289513, "grad_norm": 0.23889155685901642, "learning_rate": 0.0001764010346101737, "loss": 0.2628, "step": 963 }, { "epoch": 0.3559822747415066, "grad_norm": 0.3801189064979553, "learning_rate": 0.00017637640103461018, "loss": 0.4222, "step": 964 }, { "epoch": 0.35635155096011817, "grad_norm": 0.2972833514213562, "learning_rate": 0.0001763517674590467, "loss": 0.3273, "step": 965 }, { "epoch": 0.3567208271787297, "grad_norm": 0.2821199893951416, "learning_rate": 0.00017632713388348318, "loss": 0.3254, "step": 966 }, { "epoch": 0.3570901033973412, "grad_norm": 0.24031786620616913, "learning_rate": 0.0001763025003079197, "loss": 0.2832, "step": 967 }, { "epoch": 0.35745937961595275, "grad_norm": 0.3090129494667053, "learning_rate": 0.0001762778667323562, "loss": 0.3797, "step": 968 }, { "epoch": 0.35782865583456425, "grad_norm": 0.2988395392894745, "learning_rate": 0.00017625323315679272, "loss": 0.2869, "step": 969 }, { "epoch": 0.3581979320531758, "grad_norm": 0.2711489498615265, "learning_rate": 0.0001762285995812292, "loss": 0.3346, "step": 970 }, { "epoch": 0.3585672082717873, "grad_norm": 0.2808724343776703, "learning_rate": 0.00017620396600566572, "loss": 0.3403, "step": 971 }, { "epoch": 0.35893648449039883, "grad_norm": 0.2992192804813385, "learning_rate": 0.00017617933243010224, "loss": 0.3174, "step": 972 }, { "epoch": 0.3593057607090103, "grad_norm": 0.3064062297344208, "learning_rate": 0.00017615469885453875, "loss": 0.3739, "step": 973 }, { "epoch": 0.35967503692762187, "grad_norm": 0.26694345474243164, "learning_rate": 0.00017613006527897524, "loss": 0.306, "step": 974 }, { "epoch": 0.36004431314623336, "grad_norm": 0.32117223739624023, "learning_rate": 0.00017610543170341176, "loss": 0.4091, "step": 975 }, { "epoch": 0.3604135893648449, "grad_norm": 0.2756253480911255, "learning_rate": 0.00017608079812784824, "loss": 0.3541, "step": 976 }, { "epoch": 0.3607828655834564, "grad_norm": 0.333671897649765, "learning_rate": 0.00017605616455228479, "loss": 0.3691, "step": 977 }, { "epoch": 0.36115214180206795, "grad_norm": 0.34428882598876953, "learning_rate": 0.00017603153097672127, "loss": 0.394, "step": 978 }, { "epoch": 0.36152141802067944, "grad_norm": 0.2882046401500702, "learning_rate": 0.0001760068974011578, "loss": 0.3056, "step": 979 }, { "epoch": 0.361890694239291, "grad_norm": 0.2911432683467865, "learning_rate": 0.00017598226382559428, "loss": 0.3761, "step": 980 }, { "epoch": 0.36225997045790254, "grad_norm": 0.29542505741119385, "learning_rate": 0.0001759576302500308, "loss": 0.3443, "step": 981 }, { "epoch": 0.362629246676514, "grad_norm": 0.2800205647945404, "learning_rate": 0.0001759329966744673, "loss": 0.3417, "step": 982 }, { "epoch": 0.3629985228951256, "grad_norm": 0.2971092462539673, "learning_rate": 0.00017590836309890382, "loss": 0.3583, "step": 983 }, { "epoch": 0.36336779911373707, "grad_norm": 0.24899259209632874, "learning_rate": 0.0001758837295233403, "loss": 0.3214, "step": 984 }, { "epoch": 0.3637370753323486, "grad_norm": 0.3360452651977539, "learning_rate": 0.00017585909594777682, "loss": 0.4322, "step": 985 }, { "epoch": 0.3641063515509601, "grad_norm": 0.2182115614414215, "learning_rate": 0.00017583446237221334, "loss": 0.2556, "step": 986 }, { "epoch": 0.36447562776957165, "grad_norm": 0.2815421223640442, "learning_rate": 0.00017580982879664985, "loss": 0.3169, "step": 987 }, { "epoch": 0.36484490398818314, "grad_norm": 0.25160837173461914, "learning_rate": 0.00017578519522108634, "loss": 0.3387, "step": 988 }, { "epoch": 0.3652141802067947, "grad_norm": 0.2743687331676483, "learning_rate": 0.00017576056164552285, "loss": 0.3412, "step": 989 }, { "epoch": 0.3655834564254062, "grad_norm": 0.27276766300201416, "learning_rate": 0.00017573592806995937, "loss": 0.3259, "step": 990 }, { "epoch": 0.36595273264401773, "grad_norm": 0.25966206192970276, "learning_rate": 0.00017571129449439588, "loss": 0.3491, "step": 991 }, { "epoch": 0.3663220088626292, "grad_norm": 0.2842404544353485, "learning_rate": 0.00017568666091883237, "loss": 0.3525, "step": 992 }, { "epoch": 0.36669128508124077, "grad_norm": 0.26605677604675293, "learning_rate": 0.00017566202734326888, "loss": 0.3481, "step": 993 }, { "epoch": 0.3670605612998523, "grad_norm": 0.30087584257125854, "learning_rate": 0.00017563739376770537, "loss": 0.3134, "step": 994 }, { "epoch": 0.3674298375184638, "grad_norm": 0.38111740350723267, "learning_rate": 0.0001756127601921419, "loss": 0.3858, "step": 995 }, { "epoch": 0.36779911373707536, "grad_norm": 0.3130910098552704, "learning_rate": 0.0001755881266165784, "loss": 0.3217, "step": 996 }, { "epoch": 0.36816838995568685, "grad_norm": 0.24693329632282257, "learning_rate": 0.00017556349304101492, "loss": 0.3074, "step": 997 }, { "epoch": 0.3685376661742984, "grad_norm": 0.2819662094116211, "learning_rate": 0.0001755388594654514, "loss": 0.3732, "step": 998 }, { "epoch": 0.3689069423929099, "grad_norm": 0.28971341252326965, "learning_rate": 0.00017551422588988792, "loss": 0.3584, "step": 999 }, { "epoch": 0.36927621861152143, "grad_norm": 0.3545287847518921, "learning_rate": 0.00017548959231432443, "loss": 0.4349, "step": 1000 }, { "epoch": 0.36927621861152143, "eval_loss": 0.33093270659446716, "eval_runtime": 5.854, "eval_samples_per_second": 8.541, "eval_steps_per_second": 1.196, "step": 1000 }, { "epoch": 0.3696454948301329, "grad_norm": 0.9900842905044556, "learning_rate": 0.00017546495873876095, "loss": 0.3643, "step": 1001 }, { "epoch": 0.3700147710487445, "grad_norm": 0.34718453884124756, "learning_rate": 0.00017544032516319743, "loss": 0.36, "step": 1002 }, { "epoch": 0.37038404726735596, "grad_norm": 0.27414506673812866, "learning_rate": 0.00017541569158763395, "loss": 0.3202, "step": 1003 }, { "epoch": 0.3707533234859675, "grad_norm": 0.2744106650352478, "learning_rate": 0.00017539105801207046, "loss": 0.3782, "step": 1004 }, { "epoch": 0.371122599704579, "grad_norm": 0.23309417068958282, "learning_rate": 0.00017536642443650698, "loss": 0.2805, "step": 1005 }, { "epoch": 0.37149187592319055, "grad_norm": 0.2333519160747528, "learning_rate": 0.00017534179086094347, "loss": 0.288, "step": 1006 }, { "epoch": 0.37186115214180204, "grad_norm": 0.2939945459365845, "learning_rate": 0.00017531715728537998, "loss": 0.4267, "step": 1007 }, { "epoch": 0.3722304283604136, "grad_norm": 0.2831266224384308, "learning_rate": 0.00017529252370981647, "loss": 0.3193, "step": 1008 }, { "epoch": 0.37259970457902514, "grad_norm": 0.3656313419342041, "learning_rate": 0.000175267890134253, "loss": 0.3855, "step": 1009 }, { "epoch": 0.37296898079763663, "grad_norm": 0.25398287177085876, "learning_rate": 0.0001752432565586895, "loss": 0.2697, "step": 1010 }, { "epoch": 0.3733382570162482, "grad_norm": 0.2820013165473938, "learning_rate": 0.000175218622983126, "loss": 0.3384, "step": 1011 }, { "epoch": 0.37370753323485967, "grad_norm": 0.27855661511421204, "learning_rate": 0.0001751939894075625, "loss": 0.3441, "step": 1012 }, { "epoch": 0.3740768094534712, "grad_norm": 0.33842533826828003, "learning_rate": 0.00017516935583199901, "loss": 0.3638, "step": 1013 }, { "epoch": 0.3744460856720827, "grad_norm": 0.418399453163147, "learning_rate": 0.00017514472225643553, "loss": 0.38, "step": 1014 }, { "epoch": 0.37481536189069425, "grad_norm": 0.29497411847114563, "learning_rate": 0.00017512008868087204, "loss": 0.3665, "step": 1015 }, { "epoch": 0.37518463810930575, "grad_norm": 0.3119269907474518, "learning_rate": 0.00017509545510530853, "loss": 0.3562, "step": 1016 }, { "epoch": 0.3755539143279173, "grad_norm": 0.3685709238052368, "learning_rate": 0.00017507082152974505, "loss": 0.3094, "step": 1017 }, { "epoch": 0.3759231905465288, "grad_norm": 0.31541427969932556, "learning_rate": 0.00017504618795418156, "loss": 0.4056, "step": 1018 }, { "epoch": 0.37629246676514033, "grad_norm": 0.28675368428230286, "learning_rate": 0.00017502155437861808, "loss": 0.305, "step": 1019 }, { "epoch": 0.3766617429837518, "grad_norm": 0.2524102032184601, "learning_rate": 0.00017499692080305456, "loss": 0.268, "step": 1020 }, { "epoch": 0.37703101920236337, "grad_norm": 0.3210057318210602, "learning_rate": 0.00017497228722749108, "loss": 0.4261, "step": 1021 }, { "epoch": 0.37740029542097486, "grad_norm": 0.24090994894504547, "learning_rate": 0.0001749476536519276, "loss": 0.2977, "step": 1022 }, { "epoch": 0.3777695716395864, "grad_norm": 0.384414404630661, "learning_rate": 0.0001749230200763641, "loss": 0.3994, "step": 1023 }, { "epoch": 0.37813884785819796, "grad_norm": 0.31013649702072144, "learning_rate": 0.0001748983865008006, "loss": 0.3856, "step": 1024 }, { "epoch": 0.37850812407680945, "grad_norm": 0.24665279686450958, "learning_rate": 0.0001748737529252371, "loss": 0.2265, "step": 1025 }, { "epoch": 0.378877400295421, "grad_norm": 0.4522167146205902, "learning_rate": 0.0001748491193496736, "loss": 0.3952, "step": 1026 }, { "epoch": 0.3792466765140325, "grad_norm": 0.270622581243515, "learning_rate": 0.00017482448577411014, "loss": 0.3162, "step": 1027 }, { "epoch": 0.37961595273264404, "grad_norm": 0.32009977102279663, "learning_rate": 0.00017479985219854663, "loss": 0.2769, "step": 1028 }, { "epoch": 0.3799852289512555, "grad_norm": 0.2599954903125763, "learning_rate": 0.00017477521862298314, "loss": 0.3331, "step": 1029 }, { "epoch": 0.3803545051698671, "grad_norm": 0.29672732949256897, "learning_rate": 0.00017475058504741963, "loss": 0.3365, "step": 1030 }, { "epoch": 0.38072378138847857, "grad_norm": 0.37659427523612976, "learning_rate": 0.00017472595147185614, "loss": 0.2988, "step": 1031 }, { "epoch": 0.3810930576070901, "grad_norm": 0.25498005747795105, "learning_rate": 0.00017470131789629266, "loss": 0.2885, "step": 1032 }, { "epoch": 0.3814623338257016, "grad_norm": 0.28253865242004395, "learning_rate": 0.00017467668432072917, "loss": 0.3096, "step": 1033 }, { "epoch": 0.38183161004431315, "grad_norm": 0.24322527647018433, "learning_rate": 0.00017465205074516566, "loss": 0.2841, "step": 1034 }, { "epoch": 0.38220088626292464, "grad_norm": 0.2777494192123413, "learning_rate": 0.00017462741716960217, "loss": 0.3519, "step": 1035 }, { "epoch": 0.3825701624815362, "grad_norm": 0.27474284172058105, "learning_rate": 0.0001746027835940387, "loss": 0.2772, "step": 1036 }, { "epoch": 0.3829394387001477, "grad_norm": 0.29572758078575134, "learning_rate": 0.0001745781500184752, "loss": 0.2974, "step": 1037 }, { "epoch": 0.38330871491875923, "grad_norm": 0.3762396275997162, "learning_rate": 0.0001745535164429117, "loss": 0.3297, "step": 1038 }, { "epoch": 0.3836779911373708, "grad_norm": 0.28851041197776794, "learning_rate": 0.0001745288828673482, "loss": 0.3641, "step": 1039 }, { "epoch": 0.38404726735598227, "grad_norm": 0.281868577003479, "learning_rate": 0.0001745042492917847, "loss": 0.342, "step": 1040 }, { "epoch": 0.3844165435745938, "grad_norm": 0.2661024332046509, "learning_rate": 0.00017447961571622123, "loss": 0.3789, "step": 1041 }, { "epoch": 0.3847858197932053, "grad_norm": 0.2997417449951172, "learning_rate": 0.00017445498214065772, "loss": 0.368, "step": 1042 }, { "epoch": 0.38515509601181686, "grad_norm": 0.3276411294937134, "learning_rate": 0.00017443034856509424, "loss": 0.3105, "step": 1043 }, { "epoch": 0.38552437223042835, "grad_norm": 0.3410065472126007, "learning_rate": 0.00017440571498953072, "loss": 0.3864, "step": 1044 }, { "epoch": 0.3858936484490399, "grad_norm": 0.30084607005119324, "learning_rate": 0.00017438108141396724, "loss": 0.309, "step": 1045 }, { "epoch": 0.3862629246676514, "grad_norm": 0.3507026433944702, "learning_rate": 0.00017435644783840375, "loss": 0.3668, "step": 1046 }, { "epoch": 0.38663220088626293, "grad_norm": 0.3287352919578552, "learning_rate": 0.00017433181426284027, "loss": 0.4588, "step": 1047 }, { "epoch": 0.3870014771048744, "grad_norm": 0.31868648529052734, "learning_rate": 0.00017430718068727676, "loss": 0.3658, "step": 1048 }, { "epoch": 0.387370753323486, "grad_norm": 0.29755568504333496, "learning_rate": 0.00017428254711171327, "loss": 0.3829, "step": 1049 }, { "epoch": 0.38774002954209746, "grad_norm": 0.2674624025821686, "learning_rate": 0.00017425791353614978, "loss": 0.3566, "step": 1050 }, { "epoch": 0.38774002954209746, "eval_loss": 0.3311212658882141, "eval_runtime": 5.8514, "eval_samples_per_second": 8.545, "eval_steps_per_second": 1.196, "step": 1050 }, { "epoch": 0.388109305760709, "grad_norm": 0.29503870010375977, "learning_rate": 0.0001742332799605863, "loss": 0.3396, "step": 1051 }, { "epoch": 0.38847858197932056, "grad_norm": 0.30640003085136414, "learning_rate": 0.0001742086463850228, "loss": 0.3289, "step": 1052 }, { "epoch": 0.38884785819793205, "grad_norm": 0.3086230158805847, "learning_rate": 0.0001741840128094593, "loss": 0.3619, "step": 1053 }, { "epoch": 0.3892171344165436, "grad_norm": 0.3033970892429352, "learning_rate": 0.00017415937923389582, "loss": 0.3432, "step": 1054 }, { "epoch": 0.3895864106351551, "grad_norm": 0.2585624158382416, "learning_rate": 0.00017413474565833233, "loss": 0.2708, "step": 1055 }, { "epoch": 0.38995568685376664, "grad_norm": 0.24220868945121765, "learning_rate": 0.00017411011208276882, "loss": 0.2979, "step": 1056 }, { "epoch": 0.39032496307237813, "grad_norm": 0.2443268597126007, "learning_rate": 0.00017408547850720533, "loss": 0.3052, "step": 1057 }, { "epoch": 0.3906942392909897, "grad_norm": 0.2863035202026367, "learning_rate": 0.00017406084493164182, "loss": 0.3404, "step": 1058 }, { "epoch": 0.39106351550960117, "grad_norm": 0.25512877106666565, "learning_rate": 0.00017403621135607836, "loss": 0.3231, "step": 1059 }, { "epoch": 0.3914327917282127, "grad_norm": 0.32268014550209045, "learning_rate": 0.00017401157778051485, "loss": 0.3773, "step": 1060 }, { "epoch": 0.3918020679468242, "grad_norm": 0.4049038290977478, "learning_rate": 0.00017398694420495136, "loss": 0.3639, "step": 1061 }, { "epoch": 0.39217134416543575, "grad_norm": 0.3472083508968353, "learning_rate": 0.00017396231062938785, "loss": 0.3084, "step": 1062 }, { "epoch": 0.39254062038404725, "grad_norm": 0.320273756980896, "learning_rate": 0.00017393767705382437, "loss": 0.3575, "step": 1063 }, { "epoch": 0.3929098966026588, "grad_norm": 0.2545672357082367, "learning_rate": 0.00017391304347826088, "loss": 0.3381, "step": 1064 }, { "epoch": 0.3932791728212703, "grad_norm": 0.31159624457359314, "learning_rate": 0.0001738884099026974, "loss": 0.3584, "step": 1065 }, { "epoch": 0.39364844903988183, "grad_norm": 0.2954980731010437, "learning_rate": 0.00017386377632713388, "loss": 0.3149, "step": 1066 }, { "epoch": 0.3940177252584934, "grad_norm": 0.3100495934486389, "learning_rate": 0.0001738391427515704, "loss": 0.3778, "step": 1067 }, { "epoch": 0.39438700147710487, "grad_norm": 0.2485426664352417, "learning_rate": 0.0001738145091760069, "loss": 0.2661, "step": 1068 }, { "epoch": 0.3947562776957164, "grad_norm": 0.297589510679245, "learning_rate": 0.00017378987560044343, "loss": 0.3443, "step": 1069 }, { "epoch": 0.3951255539143279, "grad_norm": 0.27318239212036133, "learning_rate": 0.00017376524202487992, "loss": 0.3468, "step": 1070 }, { "epoch": 0.39549483013293946, "grad_norm": 0.31159114837646484, "learning_rate": 0.00017374060844931643, "loss": 0.3596, "step": 1071 }, { "epoch": 0.39586410635155095, "grad_norm": 0.32290807366371155, "learning_rate": 0.00017371597487375292, "loss": 0.4303, "step": 1072 }, { "epoch": 0.3962333825701625, "grad_norm": 0.3056161403656006, "learning_rate": 0.00017369134129818946, "loss": 0.3181, "step": 1073 }, { "epoch": 0.396602658788774, "grad_norm": 0.3475019931793213, "learning_rate": 0.00017366670772262595, "loss": 0.3123, "step": 1074 }, { "epoch": 0.39697193500738553, "grad_norm": 0.2533319592475891, "learning_rate": 0.00017364207414706246, "loss": 0.3335, "step": 1075 }, { "epoch": 0.397341211225997, "grad_norm": 0.2805590331554413, "learning_rate": 0.00017361744057149895, "loss": 0.334, "step": 1076 }, { "epoch": 0.3977104874446086, "grad_norm": 0.2707286477088928, "learning_rate": 0.00017359280699593546, "loss": 0.3292, "step": 1077 }, { "epoch": 0.39807976366322007, "grad_norm": 0.26679933071136475, "learning_rate": 0.00017356817342037198, "loss": 0.334, "step": 1078 }, { "epoch": 0.3984490398818316, "grad_norm": 0.2567000389099121, "learning_rate": 0.0001735435398448085, "loss": 0.3601, "step": 1079 }, { "epoch": 0.3988183161004431, "grad_norm": 0.32759955525398254, "learning_rate": 0.00017351890626924498, "loss": 0.4266, "step": 1080 }, { "epoch": 0.39918759231905465, "grad_norm": 0.28385090827941895, "learning_rate": 0.0001734942726936815, "loss": 0.3725, "step": 1081 }, { "epoch": 0.3995568685376662, "grad_norm": 0.23193253576755524, "learning_rate": 0.000173469639118118, "loss": 0.358, "step": 1082 }, { "epoch": 0.3999261447562777, "grad_norm": 0.2786146104335785, "learning_rate": 0.00017344500554255452, "loss": 0.2927, "step": 1083 }, { "epoch": 0.40029542097488924, "grad_norm": 0.2281658947467804, "learning_rate": 0.000173420371966991, "loss": 0.3019, "step": 1084 }, { "epoch": 0.40066469719350073, "grad_norm": 0.2926419973373413, "learning_rate": 0.00017339573839142753, "loss": 0.2824, "step": 1085 }, { "epoch": 0.4010339734121123, "grad_norm": 0.24533440172672272, "learning_rate": 0.00017337110481586401, "loss": 0.307, "step": 1086 }, { "epoch": 0.40140324963072377, "grad_norm": 0.23103263974189758, "learning_rate": 0.00017334647124030056, "loss": 0.3354, "step": 1087 }, { "epoch": 0.4017725258493353, "grad_norm": 0.5301767587661743, "learning_rate": 0.00017332183766473704, "loss": 0.3756, "step": 1088 }, { "epoch": 0.4021418020679468, "grad_norm": 0.4169785976409912, "learning_rate": 0.00017329720408917356, "loss": 0.2962, "step": 1089 }, { "epoch": 0.40251107828655835, "grad_norm": 0.40505361557006836, "learning_rate": 0.00017327257051361005, "loss": 0.3986, "step": 1090 }, { "epoch": 0.40288035450516985, "grad_norm": 0.2466781735420227, "learning_rate": 0.00017324793693804656, "loss": 0.276, "step": 1091 }, { "epoch": 0.4032496307237814, "grad_norm": 0.3305412530899048, "learning_rate": 0.00017322330336248307, "loss": 0.2625, "step": 1092 }, { "epoch": 0.4036189069423929, "grad_norm": 0.28403934836387634, "learning_rate": 0.0001731986697869196, "loss": 0.3722, "step": 1093 }, { "epoch": 0.40398818316100443, "grad_norm": 0.2592989504337311, "learning_rate": 0.00017317403621135608, "loss": 0.3611, "step": 1094 }, { "epoch": 0.404357459379616, "grad_norm": 0.2557947337627411, "learning_rate": 0.0001731494026357926, "loss": 0.301, "step": 1095 }, { "epoch": 0.40472673559822747, "grad_norm": 0.25687193870544434, "learning_rate": 0.0001731247690602291, "loss": 0.3506, "step": 1096 }, { "epoch": 0.405096011816839, "grad_norm": 0.2546410858631134, "learning_rate": 0.00017310013548466562, "loss": 0.2816, "step": 1097 }, { "epoch": 0.4054652880354505, "grad_norm": 0.2537722587585449, "learning_rate": 0.0001730755019091021, "loss": 0.3187, "step": 1098 }, { "epoch": 0.40583456425406206, "grad_norm": 0.27400586009025574, "learning_rate": 0.00017305086833353862, "loss": 0.3443, "step": 1099 }, { "epoch": 0.40620384047267355, "grad_norm": 0.295478880405426, "learning_rate": 0.00017302623475797514, "loss": 0.3983, "step": 1100 }, { "epoch": 0.40620384047267355, "eval_loss": 0.3331240713596344, "eval_runtime": 5.8701, "eval_samples_per_second": 8.518, "eval_steps_per_second": 1.192, "step": 1100 }, { "epoch": 0.4065731166912851, "grad_norm": 0.29423660039901733, "learning_rate": 0.00017300160118241165, "loss": 0.3883, "step": 1101 }, { "epoch": 0.4069423929098966, "grad_norm": 0.27080485224723816, "learning_rate": 0.00017297696760684814, "loss": 0.3107, "step": 1102 }, { "epoch": 0.40731166912850814, "grad_norm": 0.35229095816612244, "learning_rate": 0.00017295233403128465, "loss": 0.4703, "step": 1103 }, { "epoch": 0.4076809453471196, "grad_norm": 0.2508137822151184, "learning_rate": 0.00017292770045572114, "loss": 0.3069, "step": 1104 }, { "epoch": 0.4080502215657312, "grad_norm": 0.2992240786552429, "learning_rate": 0.00017290306688015768, "loss": 0.3626, "step": 1105 }, { "epoch": 0.40841949778434267, "grad_norm": 0.2968301475048065, "learning_rate": 0.00017287843330459417, "loss": 0.342, "step": 1106 }, { "epoch": 0.4087887740029542, "grad_norm": 0.2686443030834198, "learning_rate": 0.00017285379972903069, "loss": 0.3164, "step": 1107 }, { "epoch": 0.4091580502215657, "grad_norm": 0.28745171427726746, "learning_rate": 0.00017282916615346717, "loss": 0.4006, "step": 1108 }, { "epoch": 0.40952732644017725, "grad_norm": 0.26906076073646545, "learning_rate": 0.0001728045325779037, "loss": 0.356, "step": 1109 }, { "epoch": 0.4098966026587888, "grad_norm": 0.3046380579471588, "learning_rate": 0.0001727798990023402, "loss": 0.3334, "step": 1110 }, { "epoch": 0.4102658788774003, "grad_norm": 0.3426929712295532, "learning_rate": 0.00017275526542677672, "loss": 0.4221, "step": 1111 }, { "epoch": 0.41063515509601184, "grad_norm": 0.34931817650794983, "learning_rate": 0.0001727306318512132, "loss": 0.3971, "step": 1112 }, { "epoch": 0.41100443131462333, "grad_norm": 0.22070133686065674, "learning_rate": 0.00017270599827564972, "loss": 0.2544, "step": 1113 }, { "epoch": 0.4113737075332349, "grad_norm": 0.29520881175994873, "learning_rate": 0.00017268136470008623, "loss": 0.3196, "step": 1114 }, { "epoch": 0.41174298375184637, "grad_norm": 0.2665400505065918, "learning_rate": 0.00017265673112452275, "loss": 0.3249, "step": 1115 }, { "epoch": 0.4121122599704579, "grad_norm": 0.25689318776130676, "learning_rate": 0.00017263209754895924, "loss": 0.3203, "step": 1116 }, { "epoch": 0.4124815361890694, "grad_norm": 0.31612950563430786, "learning_rate": 0.00017260746397339575, "loss": 0.3567, "step": 1117 }, { "epoch": 0.41285081240768096, "grad_norm": 0.26914507150650024, "learning_rate": 0.00017258283039783224, "loss": 0.2982, "step": 1118 }, { "epoch": 0.41322008862629245, "grad_norm": 0.2826154828071594, "learning_rate": 0.00017255819682226878, "loss": 0.3348, "step": 1119 }, { "epoch": 0.413589364844904, "grad_norm": 0.31187665462493896, "learning_rate": 0.00017253356324670527, "loss": 0.4929, "step": 1120 }, { "epoch": 0.4139586410635155, "grad_norm": 0.3398851454257965, "learning_rate": 0.00017250892967114178, "loss": 0.3903, "step": 1121 }, { "epoch": 0.41432791728212703, "grad_norm": 0.21482737362384796, "learning_rate": 0.00017248429609557827, "loss": 0.2502, "step": 1122 }, { "epoch": 0.4146971935007385, "grad_norm": 0.3095366358757019, "learning_rate": 0.00017245966252001478, "loss": 0.3447, "step": 1123 }, { "epoch": 0.4150664697193501, "grad_norm": 0.41809213161468506, "learning_rate": 0.0001724350289444513, "loss": 0.3193, "step": 1124 }, { "epoch": 0.4154357459379616, "grad_norm": 0.2830677330493927, "learning_rate": 0.00017241039536888781, "loss": 0.3829, "step": 1125 }, { "epoch": 0.4158050221565731, "grad_norm": 0.3771759271621704, "learning_rate": 0.0001723857617933243, "loss": 0.3566, "step": 1126 }, { "epoch": 0.41617429837518466, "grad_norm": 0.2990865707397461, "learning_rate": 0.00017236112821776082, "loss": 0.3912, "step": 1127 }, { "epoch": 0.41654357459379615, "grad_norm": 0.29477930068969727, "learning_rate": 0.00017233649464219733, "loss": 0.3346, "step": 1128 }, { "epoch": 0.4169128508124077, "grad_norm": 0.2764134407043457, "learning_rate": 0.00017231186106663385, "loss": 0.369, "step": 1129 }, { "epoch": 0.4172821270310192, "grad_norm": 0.24675750732421875, "learning_rate": 0.00017228722749107033, "loss": 0.28, "step": 1130 }, { "epoch": 0.41765140324963074, "grad_norm": 0.23767217993736267, "learning_rate": 0.00017226259391550685, "loss": 0.313, "step": 1131 }, { "epoch": 0.41802067946824223, "grad_norm": 0.2794254422187805, "learning_rate": 0.00017223796033994336, "loss": 0.3902, "step": 1132 }, { "epoch": 0.4183899556868538, "grad_norm": 0.27715378999710083, "learning_rate": 0.00017221332676437988, "loss": 0.3109, "step": 1133 }, { "epoch": 0.41875923190546527, "grad_norm": 0.25810864567756653, "learning_rate": 0.00017218869318881636, "loss": 0.3086, "step": 1134 }, { "epoch": 0.4191285081240768, "grad_norm": 0.22305242717266083, "learning_rate": 0.00017216405961325288, "loss": 0.313, "step": 1135 }, { "epoch": 0.4194977843426883, "grad_norm": 0.24376413226127625, "learning_rate": 0.00017213942603768937, "loss": 0.3192, "step": 1136 }, { "epoch": 0.41986706056129985, "grad_norm": 0.2753487825393677, "learning_rate": 0.0001721147924621259, "loss": 0.297, "step": 1137 }, { "epoch": 0.4202363367799114, "grad_norm": 0.24492758512496948, "learning_rate": 0.0001720901588865624, "loss": 0.2755, "step": 1138 }, { "epoch": 0.4206056129985229, "grad_norm": 0.2600553333759308, "learning_rate": 0.0001720655253109989, "loss": 0.2966, "step": 1139 }, { "epoch": 0.42097488921713444, "grad_norm": 0.25787171721458435, "learning_rate": 0.0001720408917354354, "loss": 0.3345, "step": 1140 }, { "epoch": 0.42134416543574593, "grad_norm": 0.26054611802101135, "learning_rate": 0.0001720162581598719, "loss": 0.3417, "step": 1141 }, { "epoch": 0.4217134416543575, "grad_norm": 0.23203696310520172, "learning_rate": 0.00017199162458430843, "loss": 0.2979, "step": 1142 }, { "epoch": 0.42208271787296897, "grad_norm": 0.3051937520503998, "learning_rate": 0.00017196699100874494, "loss": 0.3893, "step": 1143 }, { "epoch": 0.4224519940915805, "grad_norm": 0.32818612456321716, "learning_rate": 0.00017194235743318143, "loss": 0.4387, "step": 1144 }, { "epoch": 0.422821270310192, "grad_norm": 0.33093100786209106, "learning_rate": 0.00017191772385761794, "loss": 0.4634, "step": 1145 }, { "epoch": 0.42319054652880356, "grad_norm": 0.27430132031440735, "learning_rate": 0.00017189309028205446, "loss": 0.3448, "step": 1146 }, { "epoch": 0.42355982274741505, "grad_norm": 0.28466880321502686, "learning_rate": 0.00017186845670649097, "loss": 0.365, "step": 1147 }, { "epoch": 0.4239290989660266, "grad_norm": 0.3052275478839874, "learning_rate": 0.00017184382313092746, "loss": 0.3404, "step": 1148 }, { "epoch": 0.4242983751846381, "grad_norm": 0.2922669053077698, "learning_rate": 0.00017181918955536398, "loss": 0.307, "step": 1149 }, { "epoch": 0.42466765140324964, "grad_norm": 0.3192574083805084, "learning_rate": 0.00017179455597980046, "loss": 0.3636, "step": 1150 }, { "epoch": 0.42466765140324964, "eval_loss": 0.32198551297187805, "eval_runtime": 5.8536, "eval_samples_per_second": 8.542, "eval_steps_per_second": 1.196, "step": 1150 }, { "epoch": 0.4250369276218611, "grad_norm": 0.29327887296676636, "learning_rate": 0.000171769922404237, "loss": 0.3375, "step": 1151 }, { "epoch": 0.4254062038404727, "grad_norm": 0.3076665699481964, "learning_rate": 0.0001717452888286735, "loss": 0.4464, "step": 1152 }, { "epoch": 0.4257754800590842, "grad_norm": 0.27381011843681335, "learning_rate": 0.00017172065525311, "loss": 0.3446, "step": 1153 }, { "epoch": 0.4261447562776957, "grad_norm": 0.31370654702186584, "learning_rate": 0.0001716960216775465, "loss": 0.3587, "step": 1154 }, { "epoch": 0.42651403249630726, "grad_norm": 0.31399837136268616, "learning_rate": 0.000171671388101983, "loss": 0.3302, "step": 1155 }, { "epoch": 0.42688330871491875, "grad_norm": 0.26231488585472107, "learning_rate": 0.00017164675452641952, "loss": 0.3264, "step": 1156 }, { "epoch": 0.4272525849335303, "grad_norm": 0.2963448166847229, "learning_rate": 0.00017162212095085604, "loss": 0.3481, "step": 1157 }, { "epoch": 0.4276218611521418, "grad_norm": 0.28689444065093994, "learning_rate": 0.00017159748737529253, "loss": 0.3688, "step": 1158 }, { "epoch": 0.42799113737075334, "grad_norm": 0.3124240040779114, "learning_rate": 0.00017157285379972904, "loss": 0.3414, "step": 1159 }, { "epoch": 0.42836041358936483, "grad_norm": 0.27646341919898987, "learning_rate": 0.00017154822022416556, "loss": 0.3179, "step": 1160 }, { "epoch": 0.4287296898079764, "grad_norm": 0.36102649569511414, "learning_rate": 0.00017152358664860207, "loss": 0.4195, "step": 1161 }, { "epoch": 0.42909896602658787, "grad_norm": 0.28331008553504944, "learning_rate": 0.00017149895307303856, "loss": 0.3618, "step": 1162 }, { "epoch": 0.4294682422451994, "grad_norm": 0.3218463063240051, "learning_rate": 0.00017147431949747507, "loss": 0.2825, "step": 1163 }, { "epoch": 0.4298375184638109, "grad_norm": 0.2545153498649597, "learning_rate": 0.0001714496859219116, "loss": 0.304, "step": 1164 }, { "epoch": 0.43020679468242246, "grad_norm": 0.3132915496826172, "learning_rate": 0.00017142505234634807, "loss": 0.4584, "step": 1165 }, { "epoch": 0.43057607090103395, "grad_norm": 0.27413210272789, "learning_rate": 0.0001714004187707846, "loss": 0.3159, "step": 1166 }, { "epoch": 0.4309453471196455, "grad_norm": 0.27805855870246887, "learning_rate": 0.00017137578519522108, "loss": 0.44, "step": 1167 }, { "epoch": 0.43131462333825704, "grad_norm": 0.34869155287742615, "learning_rate": 0.0001713511516196576, "loss": 0.3168, "step": 1168 }, { "epoch": 0.43168389955686853, "grad_norm": 0.3351599872112274, "learning_rate": 0.0001713265180440941, "loss": 0.3541, "step": 1169 }, { "epoch": 0.4320531757754801, "grad_norm": 0.27900075912475586, "learning_rate": 0.00017130188446853062, "loss": 0.3222, "step": 1170 }, { "epoch": 0.4324224519940916, "grad_norm": 0.2621496021747589, "learning_rate": 0.0001712772508929671, "loss": 0.3318, "step": 1171 }, { "epoch": 0.4327917282127031, "grad_norm": 0.25951269268989563, "learning_rate": 0.00017125261731740362, "loss": 0.348, "step": 1172 }, { "epoch": 0.4331610044313146, "grad_norm": 0.2521866261959076, "learning_rate": 0.00017122798374184014, "loss": 0.3081, "step": 1173 }, { "epoch": 0.43353028064992616, "grad_norm": 0.2677977383136749, "learning_rate": 0.00017120335016627665, "loss": 0.3118, "step": 1174 }, { "epoch": 0.43389955686853765, "grad_norm": 0.2904331684112549, "learning_rate": 0.00017117871659071314, "loss": 0.2918, "step": 1175 }, { "epoch": 0.4342688330871492, "grad_norm": 0.327194482088089, "learning_rate": 0.00017115408301514965, "loss": 0.3482, "step": 1176 }, { "epoch": 0.4346381093057607, "grad_norm": 0.3180336356163025, "learning_rate": 0.00017112944943958614, "loss": 0.3451, "step": 1177 }, { "epoch": 0.43500738552437224, "grad_norm": 0.3503369390964508, "learning_rate": 0.00017110481586402268, "loss": 0.4447, "step": 1178 }, { "epoch": 0.43537666174298373, "grad_norm": 0.24465790390968323, "learning_rate": 0.00017108018228845917, "loss": 0.3023, "step": 1179 }, { "epoch": 0.4357459379615953, "grad_norm": 0.2671799659729004, "learning_rate": 0.00017105554871289569, "loss": 0.3607, "step": 1180 }, { "epoch": 0.43611521418020677, "grad_norm": 0.26522642374038696, "learning_rate": 0.00017103091513733217, "loss": 0.3386, "step": 1181 }, { "epoch": 0.4364844903988183, "grad_norm": 0.2625667452812195, "learning_rate": 0.0001710062815617687, "loss": 0.3492, "step": 1182 }, { "epoch": 0.43685376661742986, "grad_norm": 0.250750869512558, "learning_rate": 0.0001709816479862052, "loss": 0.2949, "step": 1183 }, { "epoch": 0.43722304283604135, "grad_norm": 0.295329213142395, "learning_rate": 0.00017095701441064172, "loss": 0.3851, "step": 1184 }, { "epoch": 0.4375923190546529, "grad_norm": 0.267910361289978, "learning_rate": 0.0001709323808350782, "loss": 0.3315, "step": 1185 }, { "epoch": 0.4379615952732644, "grad_norm": 0.31502625346183777, "learning_rate": 0.00017090774725951472, "loss": 0.2795, "step": 1186 }, { "epoch": 0.43833087149187594, "grad_norm": 0.33922597765922546, "learning_rate": 0.00017088311368395123, "loss": 0.3036, "step": 1187 }, { "epoch": 0.43870014771048743, "grad_norm": 0.27864083647727966, "learning_rate": 0.00017085848010838775, "loss": 0.3189, "step": 1188 }, { "epoch": 0.439069423929099, "grad_norm": 0.3772332966327667, "learning_rate": 0.00017083384653282424, "loss": 0.3755, "step": 1189 }, { "epoch": 0.43943870014771047, "grad_norm": 0.2781737446784973, "learning_rate": 0.00017080921295726075, "loss": 0.3292, "step": 1190 }, { "epoch": 0.439807976366322, "grad_norm": 0.28551939129829407, "learning_rate": 0.00017078457938169726, "loss": 0.3977, "step": 1191 }, { "epoch": 0.4401772525849335, "grad_norm": 0.2906353771686554, "learning_rate": 0.00017075994580613378, "loss": 0.3369, "step": 1192 }, { "epoch": 0.44054652880354506, "grad_norm": 0.29700130224227905, "learning_rate": 0.00017073531223057027, "loss": 0.3162, "step": 1193 }, { "epoch": 0.44091580502215655, "grad_norm": 0.30474820733070374, "learning_rate": 0.00017071067865500678, "loss": 0.3618, "step": 1194 }, { "epoch": 0.4412850812407681, "grad_norm": 0.35836559534072876, "learning_rate": 0.00017068604507944327, "loss": 0.4259, "step": 1195 }, { "epoch": 0.44165435745937964, "grad_norm": 0.2899966239929199, "learning_rate": 0.0001706614115038798, "loss": 0.3495, "step": 1196 }, { "epoch": 0.44202363367799113, "grad_norm": 0.24901117384433746, "learning_rate": 0.0001706367779283163, "loss": 0.2851, "step": 1197 }, { "epoch": 0.4423929098966027, "grad_norm": 0.28132179379463196, "learning_rate": 0.0001706121443527528, "loss": 0.3194, "step": 1198 }, { "epoch": 0.4427621861152142, "grad_norm": 0.28281378746032715, "learning_rate": 0.0001705875107771893, "loss": 0.336, "step": 1199 }, { "epoch": 0.4431314623338257, "grad_norm": 0.26517656445503235, "learning_rate": 0.00017056287720162582, "loss": 0.3346, "step": 1200 }, { "epoch": 0.4431314623338257, "eval_loss": 0.3183690905570984, "eval_runtime": 5.856, "eval_samples_per_second": 8.538, "eval_steps_per_second": 1.195, "step": 1200 }, { "epoch": 0.4435007385524372, "grad_norm": 0.30609777569770813, "learning_rate": 0.00017053824362606233, "loss": 0.375, "step": 1201 }, { "epoch": 0.44387001477104876, "grad_norm": 0.2742772698402405, "learning_rate": 0.00017051361005049884, "loss": 0.2902, "step": 1202 }, { "epoch": 0.44423929098966025, "grad_norm": 0.28248992562294006, "learning_rate": 0.00017048897647493533, "loss": 0.3687, "step": 1203 }, { "epoch": 0.4446085672082718, "grad_norm": 0.23091770708560944, "learning_rate": 0.00017046434289937185, "loss": 0.306, "step": 1204 }, { "epoch": 0.4449778434268833, "grad_norm": 0.29506057500839233, "learning_rate": 0.00017043970932380836, "loss": 0.3557, "step": 1205 }, { "epoch": 0.44534711964549484, "grad_norm": 0.36342182755470276, "learning_rate": 0.00017041507574824488, "loss": 0.4046, "step": 1206 }, { "epoch": 0.44571639586410633, "grad_norm": 0.27799978852272034, "learning_rate": 0.00017039044217268136, "loss": 0.2897, "step": 1207 }, { "epoch": 0.4460856720827179, "grad_norm": 0.24548093974590302, "learning_rate": 0.00017036580859711788, "loss": 0.3065, "step": 1208 }, { "epoch": 0.44645494830132937, "grad_norm": 0.2660404443740845, "learning_rate": 0.00017034117502155437, "loss": 0.3195, "step": 1209 }, { "epoch": 0.4468242245199409, "grad_norm": 0.268492192029953, "learning_rate": 0.0001703165414459909, "loss": 0.3035, "step": 1210 }, { "epoch": 0.44719350073855246, "grad_norm": 0.28435540199279785, "learning_rate": 0.0001702919078704274, "loss": 0.3665, "step": 1211 }, { "epoch": 0.44756277695716395, "grad_norm": 0.27771124243736267, "learning_rate": 0.0001702672742948639, "loss": 0.3397, "step": 1212 }, { "epoch": 0.4479320531757755, "grad_norm": 0.28504887223243713, "learning_rate": 0.0001702426407193004, "loss": 0.2854, "step": 1213 }, { "epoch": 0.448301329394387, "grad_norm": 0.24859274923801422, "learning_rate": 0.0001702180071437369, "loss": 0.2592, "step": 1214 }, { "epoch": 0.44867060561299854, "grad_norm": 0.21828439831733704, "learning_rate": 0.00017019337356817343, "loss": 0.2612, "step": 1215 }, { "epoch": 0.44903988183161003, "grad_norm": 0.29936301708221436, "learning_rate": 0.00017016873999260994, "loss": 0.3574, "step": 1216 }, { "epoch": 0.4494091580502216, "grad_norm": 0.30142107605934143, "learning_rate": 0.00017014410641704643, "loss": 0.3598, "step": 1217 }, { "epoch": 0.44977843426883307, "grad_norm": 0.2668401598930359, "learning_rate": 0.00017011947284148294, "loss": 0.2643, "step": 1218 }, { "epoch": 0.4501477104874446, "grad_norm": 0.33149152994155884, "learning_rate": 0.00017009483926591946, "loss": 0.3488, "step": 1219 }, { "epoch": 0.4505169867060561, "grad_norm": 0.26323407888412476, "learning_rate": 0.00017007020569035597, "loss": 0.2844, "step": 1220 }, { "epoch": 0.45088626292466766, "grad_norm": 0.26985248923301697, "learning_rate": 0.00017004557211479246, "loss": 0.3211, "step": 1221 }, { "epoch": 0.45125553914327915, "grad_norm": 0.22127485275268555, "learning_rate": 0.00017002093853922897, "loss": 0.2934, "step": 1222 }, { "epoch": 0.4516248153618907, "grad_norm": 0.2990404963493347, "learning_rate": 0.0001699963049636655, "loss": 0.3384, "step": 1223 }, { "epoch": 0.4519940915805022, "grad_norm": 0.3148226737976074, "learning_rate": 0.000169971671388102, "loss": 0.3381, "step": 1224 }, { "epoch": 0.45236336779911374, "grad_norm": 0.2582751512527466, "learning_rate": 0.0001699470378125385, "loss": 0.3513, "step": 1225 }, { "epoch": 0.4527326440177253, "grad_norm": 0.30039259791374207, "learning_rate": 0.000169922404236975, "loss": 0.3377, "step": 1226 }, { "epoch": 0.4531019202363368, "grad_norm": 0.26333391666412354, "learning_rate": 0.0001698977706614115, "loss": 0.3394, "step": 1227 }, { "epoch": 0.4534711964549483, "grad_norm": 0.29885348677635193, "learning_rate": 0.00016987313708584804, "loss": 0.4602, "step": 1228 }, { "epoch": 0.4538404726735598, "grad_norm": 0.2640257477760315, "learning_rate": 0.00016984850351028452, "loss": 0.3024, "step": 1229 }, { "epoch": 0.45420974889217136, "grad_norm": 0.2874453365802765, "learning_rate": 0.00016982386993472104, "loss": 0.3861, "step": 1230 }, { "epoch": 0.45457902511078285, "grad_norm": 0.3098587691783905, "learning_rate": 0.00016979923635915753, "loss": 0.3435, "step": 1231 }, { "epoch": 0.4549483013293944, "grad_norm": 0.28761473298072815, "learning_rate": 0.00016977460278359404, "loss": 0.3297, "step": 1232 }, { "epoch": 0.4553175775480059, "grad_norm": 0.29999786615371704, "learning_rate": 0.00016974996920803055, "loss": 0.3161, "step": 1233 }, { "epoch": 0.45568685376661744, "grad_norm": 0.28180021047592163, "learning_rate": 0.00016972533563246707, "loss": 0.3148, "step": 1234 }, { "epoch": 0.45605612998522893, "grad_norm": 0.2969822585582733, "learning_rate": 0.00016970070205690356, "loss": 0.3434, "step": 1235 }, { "epoch": 0.4564254062038405, "grad_norm": 0.32349124550819397, "learning_rate": 0.00016967606848134007, "loss": 0.3623, "step": 1236 }, { "epoch": 0.45679468242245197, "grad_norm": 0.24902084469795227, "learning_rate": 0.00016965143490577659, "loss": 0.2848, "step": 1237 }, { "epoch": 0.4571639586410635, "grad_norm": 0.262531042098999, "learning_rate": 0.0001696268013302131, "loss": 0.2806, "step": 1238 }, { "epoch": 0.45753323485967506, "grad_norm": 0.27351832389831543, "learning_rate": 0.0001696021677546496, "loss": 0.3061, "step": 1239 }, { "epoch": 0.45790251107828656, "grad_norm": 0.27112317085266113, "learning_rate": 0.0001695775341790861, "loss": 0.3001, "step": 1240 }, { "epoch": 0.4582717872968981, "grad_norm": 0.4085458219051361, "learning_rate": 0.0001695529006035226, "loss": 0.3857, "step": 1241 }, { "epoch": 0.4586410635155096, "grad_norm": 0.25504735112190247, "learning_rate": 0.00016952826702795913, "loss": 0.2865, "step": 1242 }, { "epoch": 0.45901033973412114, "grad_norm": 0.2736992835998535, "learning_rate": 0.00016950363345239562, "loss": 0.3633, "step": 1243 }, { "epoch": 0.45937961595273263, "grad_norm": 0.2200980931520462, "learning_rate": 0.00016947899987683213, "loss": 0.2461, "step": 1244 }, { "epoch": 0.4597488921713442, "grad_norm": 0.27418121695518494, "learning_rate": 0.00016945436630126862, "loss": 0.3235, "step": 1245 }, { "epoch": 0.4601181683899557, "grad_norm": 0.25174984335899353, "learning_rate": 0.00016942973272570514, "loss": 0.2813, "step": 1246 }, { "epoch": 0.4604874446085672, "grad_norm": 0.23484735190868378, "learning_rate": 0.00016940509915014165, "loss": 0.2863, "step": 1247 }, { "epoch": 0.4608567208271787, "grad_norm": 0.2589341104030609, "learning_rate": 0.00016938046557457817, "loss": 0.2789, "step": 1248 }, { "epoch": 0.46122599704579026, "grad_norm": 0.26269158720970154, "learning_rate": 0.00016935583199901465, "loss": 0.3234, "step": 1249 }, { "epoch": 0.46159527326440175, "grad_norm": 0.2763214707374573, "learning_rate": 0.00016933119842345117, "loss": 0.3252, "step": 1250 }, { "epoch": 0.46159527326440175, "eval_loss": 0.31730917096138, "eval_runtime": 5.8523, "eval_samples_per_second": 8.544, "eval_steps_per_second": 1.196, "step": 1250 }, { "epoch": 0.4619645494830133, "grad_norm": 0.35904350876808167, "learning_rate": 0.00016930656484788768, "loss": 0.3457, "step": 1251 }, { "epoch": 0.4623338257016248, "grad_norm": 0.2199958711862564, "learning_rate": 0.0001692819312723242, "loss": 0.2742, "step": 1252 }, { "epoch": 0.46270310192023634, "grad_norm": 0.3560868501663208, "learning_rate": 0.00016925729769676068, "loss": 0.4143, "step": 1253 }, { "epoch": 0.4630723781388479, "grad_norm": 0.2781903147697449, "learning_rate": 0.0001692326641211972, "loss": 0.316, "step": 1254 }, { "epoch": 0.4634416543574594, "grad_norm": 0.2549719512462616, "learning_rate": 0.0001692080305456337, "loss": 0.2591, "step": 1255 }, { "epoch": 0.4638109305760709, "grad_norm": 0.2696338891983032, "learning_rate": 0.00016918339697007023, "loss": 0.3245, "step": 1256 }, { "epoch": 0.4641802067946824, "grad_norm": 0.2692526876926422, "learning_rate": 0.00016915876339450672, "loss": 0.3394, "step": 1257 }, { "epoch": 0.46454948301329396, "grad_norm": 0.2564251720905304, "learning_rate": 0.00016913412981894323, "loss": 0.2974, "step": 1258 }, { "epoch": 0.46491875923190545, "grad_norm": 0.29035502672195435, "learning_rate": 0.00016910949624337972, "loss": 0.3674, "step": 1259 }, { "epoch": 0.465288035450517, "grad_norm": 0.34713611006736755, "learning_rate": 0.00016908486266781623, "loss": 0.4094, "step": 1260 }, { "epoch": 0.4656573116691285, "grad_norm": 0.302212119102478, "learning_rate": 0.00016906022909225275, "loss": 0.2891, "step": 1261 }, { "epoch": 0.46602658788774004, "grad_norm": 0.27764269709587097, "learning_rate": 0.00016903559551668926, "loss": 0.3353, "step": 1262 }, { "epoch": 0.46639586410635153, "grad_norm": 0.32426977157592773, "learning_rate": 0.00016901096194112575, "loss": 0.4366, "step": 1263 }, { "epoch": 0.4667651403249631, "grad_norm": 0.2631196081638336, "learning_rate": 0.00016898632836556226, "loss": 0.4091, "step": 1264 }, { "epoch": 0.46713441654357457, "grad_norm": 0.29156410694122314, "learning_rate": 0.00016896169478999878, "loss": 0.3326, "step": 1265 }, { "epoch": 0.4675036927621861, "grad_norm": 0.2738753855228424, "learning_rate": 0.0001689370612144353, "loss": 0.2922, "step": 1266 }, { "epoch": 0.4678729689807976, "grad_norm": 0.24467527866363525, "learning_rate": 0.00016891242763887178, "loss": 0.2921, "step": 1267 }, { "epoch": 0.46824224519940916, "grad_norm": 0.28518614172935486, "learning_rate": 0.0001688877940633083, "loss": 0.3344, "step": 1268 }, { "epoch": 0.4686115214180207, "grad_norm": 0.2887619435787201, "learning_rate": 0.0001688631604877448, "loss": 0.3639, "step": 1269 }, { "epoch": 0.4689807976366322, "grad_norm": 0.2452942281961441, "learning_rate": 0.00016883852691218133, "loss": 0.2884, "step": 1270 }, { "epoch": 0.46935007385524374, "grad_norm": 0.305033415555954, "learning_rate": 0.0001688138933366178, "loss": 0.3835, "step": 1271 }, { "epoch": 0.46971935007385524, "grad_norm": 0.26679572463035583, "learning_rate": 0.00016878925976105433, "loss": 0.3163, "step": 1272 }, { "epoch": 0.4700886262924668, "grad_norm": 0.2737630307674408, "learning_rate": 0.00016876462618549081, "loss": 0.3086, "step": 1273 }, { "epoch": 0.4704579025110783, "grad_norm": 0.24957697093486786, "learning_rate": 0.00016873999260992736, "loss": 0.3688, "step": 1274 }, { "epoch": 0.4708271787296898, "grad_norm": 0.30872097611427307, "learning_rate": 0.00016871535903436384, "loss": 0.3559, "step": 1275 }, { "epoch": 0.4711964549483013, "grad_norm": 0.31919223070144653, "learning_rate": 0.00016869072545880036, "loss": 0.4033, "step": 1276 }, { "epoch": 0.47156573116691286, "grad_norm": 0.2940533459186554, "learning_rate": 0.00016866609188323685, "loss": 0.3477, "step": 1277 }, { "epoch": 0.47193500738552435, "grad_norm": 0.27033281326293945, "learning_rate": 0.00016864145830767336, "loss": 0.3523, "step": 1278 }, { "epoch": 0.4723042836041359, "grad_norm": 0.2709430158138275, "learning_rate": 0.00016861682473210988, "loss": 0.299, "step": 1279 }, { "epoch": 0.4726735598227474, "grad_norm": 0.2631966173648834, "learning_rate": 0.0001685921911565464, "loss": 0.3367, "step": 1280 }, { "epoch": 0.47304283604135894, "grad_norm": 0.3195972740650177, "learning_rate": 0.00016856755758098288, "loss": 0.3934, "step": 1281 }, { "epoch": 0.4734121122599705, "grad_norm": 0.26415082812309265, "learning_rate": 0.0001685429240054194, "loss": 0.3251, "step": 1282 }, { "epoch": 0.473781388478582, "grad_norm": 0.27375754714012146, "learning_rate": 0.0001685182904298559, "loss": 0.3805, "step": 1283 }, { "epoch": 0.4741506646971935, "grad_norm": 0.2656939625740051, "learning_rate": 0.00016849365685429242, "loss": 0.3021, "step": 1284 }, { "epoch": 0.474519940915805, "grad_norm": 0.2884897291660309, "learning_rate": 0.0001684690232787289, "loss": 0.343, "step": 1285 }, { "epoch": 0.47488921713441656, "grad_norm": 0.26789557933807373, "learning_rate": 0.00016844438970316542, "loss": 0.3184, "step": 1286 }, { "epoch": 0.47525849335302806, "grad_norm": 0.3235875964164734, "learning_rate": 0.0001684197561276019, "loss": 0.3812, "step": 1287 }, { "epoch": 0.4756277695716396, "grad_norm": 0.31186652183532715, "learning_rate": 0.00016839512255203845, "loss": 0.3684, "step": 1288 }, { "epoch": 0.4759970457902511, "grad_norm": 0.29642534255981445, "learning_rate": 0.00016837048897647494, "loss": 0.3309, "step": 1289 }, { "epoch": 0.47636632200886264, "grad_norm": 0.27301859855651855, "learning_rate": 0.00016834585540091146, "loss": 0.3301, "step": 1290 }, { "epoch": 0.47673559822747413, "grad_norm": 0.36492156982421875, "learning_rate": 0.00016832122182534794, "loss": 0.3854, "step": 1291 }, { "epoch": 0.4771048744460857, "grad_norm": 0.36236897110939026, "learning_rate": 0.00016829658824978446, "loss": 0.4151, "step": 1292 }, { "epoch": 0.4774741506646972, "grad_norm": 0.3140570819377899, "learning_rate": 0.00016827195467422097, "loss": 0.3772, "step": 1293 }, { "epoch": 0.4778434268833087, "grad_norm": 0.2550193667411804, "learning_rate": 0.0001682473210986575, "loss": 0.2802, "step": 1294 }, { "epoch": 0.4782127031019202, "grad_norm": 0.233677476644516, "learning_rate": 0.00016822268752309397, "loss": 0.2433, "step": 1295 }, { "epoch": 0.47858197932053176, "grad_norm": 0.2635647654533386, "learning_rate": 0.0001681980539475305, "loss": 0.2876, "step": 1296 }, { "epoch": 0.4789512555391433, "grad_norm": 0.2737426459789276, "learning_rate": 0.000168173420371967, "loss": 0.3753, "step": 1297 }, { "epoch": 0.4793205317577548, "grad_norm": 0.2624737322330475, "learning_rate": 0.00016814878679640352, "loss": 0.3745, "step": 1298 }, { "epoch": 0.47968980797636634, "grad_norm": 0.26850634813308716, "learning_rate": 0.00016812415322084, "loss": 0.3085, "step": 1299 }, { "epoch": 0.48005908419497784, "grad_norm": 0.3231109082698822, "learning_rate": 0.00016809951964527652, "loss": 0.3975, "step": 1300 }, { "epoch": 0.48005908419497784, "eval_loss": 0.31863901019096375, "eval_runtime": 5.8691, "eval_samples_per_second": 8.519, "eval_steps_per_second": 1.193, "step": 1300 }, { "epoch": 0.4804283604135894, "grad_norm": 0.27670300006866455, "learning_rate": 0.00016807488606971303, "loss": 0.3309, "step": 1301 }, { "epoch": 0.4807976366322009, "grad_norm": 0.32811930775642395, "learning_rate": 0.00016805025249414955, "loss": 0.3523, "step": 1302 }, { "epoch": 0.4811669128508124, "grad_norm": 0.2851490378379822, "learning_rate": 0.00016802561891858604, "loss": 0.3168, "step": 1303 }, { "epoch": 0.4815361890694239, "grad_norm": 0.3192996680736542, "learning_rate": 0.00016800098534302255, "loss": 0.2723, "step": 1304 }, { "epoch": 0.48190546528803546, "grad_norm": 0.24803690612316132, "learning_rate": 0.00016797635176745904, "loss": 0.3068, "step": 1305 }, { "epoch": 0.48227474150664695, "grad_norm": 0.2650405466556549, "learning_rate": 0.00016795171819189558, "loss": 0.242, "step": 1306 }, { "epoch": 0.4826440177252585, "grad_norm": 0.2497330904006958, "learning_rate": 0.00016792708461633207, "loss": 0.2766, "step": 1307 }, { "epoch": 0.48301329394387, "grad_norm": 0.2995474934577942, "learning_rate": 0.00016790245104076858, "loss": 0.3323, "step": 1308 }, { "epoch": 0.48338257016248154, "grad_norm": 0.25276845693588257, "learning_rate": 0.00016787781746520507, "loss": 0.2761, "step": 1309 }, { "epoch": 0.48375184638109303, "grad_norm": 0.3646930158138275, "learning_rate": 0.00016785318388964159, "loss": 0.3951, "step": 1310 }, { "epoch": 0.4841211225997046, "grad_norm": 0.27568763494491577, "learning_rate": 0.0001678285503140781, "loss": 0.3092, "step": 1311 }, { "epoch": 0.4844903988183161, "grad_norm": 0.327421635389328, "learning_rate": 0.00016780391673851461, "loss": 0.3603, "step": 1312 }, { "epoch": 0.4848596750369276, "grad_norm": 0.3478201925754547, "learning_rate": 0.0001677792831629511, "loss": 0.3946, "step": 1313 }, { "epoch": 0.48522895125553916, "grad_norm": 0.35054340958595276, "learning_rate": 0.00016775464958738762, "loss": 0.3769, "step": 1314 }, { "epoch": 0.48559822747415066, "grad_norm": 0.2698875665664673, "learning_rate": 0.00016773001601182413, "loss": 0.3266, "step": 1315 }, { "epoch": 0.4859675036927622, "grad_norm": 0.27946022152900696, "learning_rate": 0.00016770538243626065, "loss": 0.2999, "step": 1316 }, { "epoch": 0.4863367799113737, "grad_norm": 0.2864866256713867, "learning_rate": 0.00016768074886069713, "loss": 0.3329, "step": 1317 }, { "epoch": 0.48670605612998524, "grad_norm": 0.269000768661499, "learning_rate": 0.00016765611528513365, "loss": 0.2967, "step": 1318 }, { "epoch": 0.48707533234859673, "grad_norm": 0.3208789527416229, "learning_rate": 0.00016763148170957014, "loss": 0.3269, "step": 1319 }, { "epoch": 0.4874446085672083, "grad_norm": 0.2848138213157654, "learning_rate": 0.00016760684813400668, "loss": 0.3035, "step": 1320 }, { "epoch": 0.4878138847858198, "grad_norm": 0.268852561712265, "learning_rate": 0.00016758221455844317, "loss": 0.3338, "step": 1321 }, { "epoch": 0.4881831610044313, "grad_norm": 0.2637290954589844, "learning_rate": 0.00016755758098287968, "loss": 0.2747, "step": 1322 }, { "epoch": 0.4885524372230428, "grad_norm": 0.2860165238380432, "learning_rate": 0.00016753294740731617, "loss": 0.2727, "step": 1323 }, { "epoch": 0.48892171344165436, "grad_norm": 0.3032236695289612, "learning_rate": 0.00016750831383175268, "loss": 0.3854, "step": 1324 }, { "epoch": 0.48929098966026585, "grad_norm": 0.27861616015434265, "learning_rate": 0.0001674836802561892, "loss": 0.33, "step": 1325 }, { "epoch": 0.4896602658788774, "grad_norm": 0.339982271194458, "learning_rate": 0.0001674590466806257, "loss": 0.341, "step": 1326 }, { "epoch": 0.49002954209748895, "grad_norm": 0.21589699387550354, "learning_rate": 0.0001674344131050622, "loss": 0.287, "step": 1327 }, { "epoch": 0.49039881831610044, "grad_norm": 0.2754301428794861, "learning_rate": 0.0001674097795294987, "loss": 0.3341, "step": 1328 }, { "epoch": 0.490768094534712, "grad_norm": 0.2732886075973511, "learning_rate": 0.00016738514595393523, "loss": 0.3055, "step": 1329 }, { "epoch": 0.4911373707533235, "grad_norm": 0.27740296721458435, "learning_rate": 0.00016736051237837174, "loss": 0.3335, "step": 1330 }, { "epoch": 0.491506646971935, "grad_norm": 0.29700103402137756, "learning_rate": 0.00016733587880280823, "loss": 0.3403, "step": 1331 }, { "epoch": 0.4918759231905465, "grad_norm": 0.23043584823608398, "learning_rate": 0.00016731124522724474, "loss": 0.2416, "step": 1332 }, { "epoch": 0.49224519940915806, "grad_norm": 0.25385019183158875, "learning_rate": 0.00016728661165168126, "loss": 0.2833, "step": 1333 }, { "epoch": 0.49261447562776955, "grad_norm": 0.23877032101154327, "learning_rate": 0.00016726197807611777, "loss": 0.3121, "step": 1334 }, { "epoch": 0.4929837518463811, "grad_norm": 0.34636810421943665, "learning_rate": 0.00016723734450055426, "loss": 0.3682, "step": 1335 }, { "epoch": 0.4933530280649926, "grad_norm": 0.27772367000579834, "learning_rate": 0.00016721271092499078, "loss": 0.3362, "step": 1336 }, { "epoch": 0.49372230428360414, "grad_norm": 0.26868563890457153, "learning_rate": 0.00016718807734942726, "loss": 0.3536, "step": 1337 }, { "epoch": 0.49409158050221563, "grad_norm": 0.23275655508041382, "learning_rate": 0.0001671634437738638, "loss": 0.2636, "step": 1338 }, { "epoch": 0.4944608567208272, "grad_norm": 0.2450389862060547, "learning_rate": 0.0001671388101983003, "loss": 0.3104, "step": 1339 }, { "epoch": 0.4948301329394387, "grad_norm": 0.2956058979034424, "learning_rate": 0.0001671141766227368, "loss": 0.3747, "step": 1340 }, { "epoch": 0.4951994091580502, "grad_norm": 0.3192928433418274, "learning_rate": 0.0001670895430471733, "loss": 0.3266, "step": 1341 }, { "epoch": 0.49556868537666177, "grad_norm": 0.2584354281425476, "learning_rate": 0.0001670649094716098, "loss": 0.3121, "step": 1342 }, { "epoch": 0.49593796159527326, "grad_norm": 0.3440920114517212, "learning_rate": 0.00016704027589604632, "loss": 0.3531, "step": 1343 }, { "epoch": 0.4963072378138848, "grad_norm": 0.283672958612442, "learning_rate": 0.00016701564232048284, "loss": 0.3271, "step": 1344 }, { "epoch": 0.4966765140324963, "grad_norm": 0.27528640627861023, "learning_rate": 0.00016699100874491933, "loss": 0.371, "step": 1345 }, { "epoch": 0.49704579025110784, "grad_norm": 0.3449751138687134, "learning_rate": 0.00016696637516935584, "loss": 0.3208, "step": 1346 }, { "epoch": 0.49741506646971934, "grad_norm": 0.2928100526332855, "learning_rate": 0.00016694174159379236, "loss": 0.3253, "step": 1347 }, { "epoch": 0.4977843426883309, "grad_norm": 0.27168670296669006, "learning_rate": 0.00016691710801822887, "loss": 0.3631, "step": 1348 }, { "epoch": 0.4981536189069424, "grad_norm": 0.2860596179962158, "learning_rate": 0.00016689247444266536, "loss": 0.3608, "step": 1349 }, { "epoch": 0.4985228951255539, "grad_norm": 0.2555064857006073, "learning_rate": 0.00016686784086710187, "loss": 0.2899, "step": 1350 }, { "epoch": 0.4985228951255539, "eval_loss": 0.32421794533729553, "eval_runtime": 6.0917, "eval_samples_per_second": 8.208, "eval_steps_per_second": 1.149, "step": 1350 }, { "epoch": 0.4988921713441654, "grad_norm": 0.26284271478652954, "learning_rate": 0.00016684320729153836, "loss": 0.2863, "step": 1351 }, { "epoch": 0.49926144756277696, "grad_norm": 0.2949911653995514, "learning_rate": 0.0001668185737159749, "loss": 0.3462, "step": 1352 }, { "epoch": 0.49963072378138845, "grad_norm": 0.24794632196426392, "learning_rate": 0.0001667939401404114, "loss": 0.2472, "step": 1353 }, { "epoch": 0.5, "grad_norm": 0.23740360140800476, "learning_rate": 0.0001667693065648479, "loss": 0.2951, "step": 1354 }, { "epoch": 0.5003692762186115, "grad_norm": 0.3350389897823334, "learning_rate": 0.0001667446729892844, "loss": 0.391, "step": 1355 }, { "epoch": 0.5007385524372231, "grad_norm": 0.24600738286972046, "learning_rate": 0.0001667200394137209, "loss": 0.2722, "step": 1356 }, { "epoch": 0.5011078286558346, "grad_norm": 0.3570854961872101, "learning_rate": 0.00016669540583815742, "loss": 0.4061, "step": 1357 }, { "epoch": 0.5014771048744461, "grad_norm": 0.2860664427280426, "learning_rate": 0.00016667077226259394, "loss": 0.3423, "step": 1358 }, { "epoch": 0.5018463810930576, "grad_norm": 0.27321067452430725, "learning_rate": 0.00016664613868703042, "loss": 0.3348, "step": 1359 }, { "epoch": 0.5022156573116692, "grad_norm": 0.31462761759757996, "learning_rate": 0.00016662150511146694, "loss": 0.3204, "step": 1360 }, { "epoch": 0.5025849335302807, "grad_norm": 0.3128102421760559, "learning_rate": 0.00016659687153590345, "loss": 0.3525, "step": 1361 }, { "epoch": 0.5029542097488922, "grad_norm": 0.3167819380760193, "learning_rate": 0.00016657223796033997, "loss": 0.3246, "step": 1362 }, { "epoch": 0.5033234859675036, "grad_norm": 0.25416892766952515, "learning_rate": 0.00016654760438477645, "loss": 0.2934, "step": 1363 }, { "epoch": 0.5036927621861153, "grad_norm": 0.32133153080940247, "learning_rate": 0.00016652297080921297, "loss": 0.3408, "step": 1364 }, { "epoch": 0.5040620384047267, "grad_norm": 0.29662302136421204, "learning_rate": 0.00016649833723364948, "loss": 0.3164, "step": 1365 }, { "epoch": 0.5044313146233382, "grad_norm": 0.2954084277153015, "learning_rate": 0.000166473703658086, "loss": 0.364, "step": 1366 }, { "epoch": 0.5048005908419497, "grad_norm": 0.3069119453430176, "learning_rate": 0.00016644907008252249, "loss": 0.3354, "step": 1367 }, { "epoch": 0.5051698670605613, "grad_norm": 0.28633591532707214, "learning_rate": 0.000166424436506959, "loss": 0.3643, "step": 1368 }, { "epoch": 0.5055391432791728, "grad_norm": 0.30108842253685, "learning_rate": 0.0001663998029313955, "loss": 0.3745, "step": 1369 }, { "epoch": 0.5059084194977843, "grad_norm": 0.2561410963535309, "learning_rate": 0.00016637516935583203, "loss": 0.2836, "step": 1370 }, { "epoch": 0.5062776957163959, "grad_norm": 0.3111201226711273, "learning_rate": 0.00016635053578026852, "loss": 0.3228, "step": 1371 }, { "epoch": 0.5066469719350074, "grad_norm": 0.41428470611572266, "learning_rate": 0.00016632590220470503, "loss": 0.3925, "step": 1372 }, { "epoch": 0.5070162481536189, "grad_norm": 0.28226950764656067, "learning_rate": 0.00016630126862914152, "loss": 0.3106, "step": 1373 }, { "epoch": 0.5073855243722304, "grad_norm": 0.25619128346443176, "learning_rate": 0.00016627663505357803, "loss": 0.2821, "step": 1374 }, { "epoch": 0.507754800590842, "grad_norm": 0.27361437678337097, "learning_rate": 0.00016625200147801455, "loss": 0.3405, "step": 1375 }, { "epoch": 0.5081240768094535, "grad_norm": 0.2697855830192566, "learning_rate": 0.00016622736790245106, "loss": 0.3282, "step": 1376 }, { "epoch": 0.508493353028065, "grad_norm": 0.3352266550064087, "learning_rate": 0.00016620273432688755, "loss": 0.3868, "step": 1377 }, { "epoch": 0.5088626292466765, "grad_norm": 0.27453646063804626, "learning_rate": 0.00016617810075132407, "loss": 0.2805, "step": 1378 }, { "epoch": 0.5092319054652881, "grad_norm": 0.2632004916667938, "learning_rate": 0.00016615346717576058, "loss": 0.3112, "step": 1379 }, { "epoch": 0.5096011816838996, "grad_norm": 0.2820645272731781, "learning_rate": 0.0001661288336001971, "loss": 0.3243, "step": 1380 }, { "epoch": 0.509970457902511, "grad_norm": 0.2841394543647766, "learning_rate": 0.00016610420002463358, "loss": 0.356, "step": 1381 }, { "epoch": 0.5103397341211225, "grad_norm": 0.24863271415233612, "learning_rate": 0.0001660795664490701, "loss": 0.2861, "step": 1382 }, { "epoch": 0.5107090103397341, "grad_norm": 0.33343276381492615, "learning_rate": 0.00016605493287350658, "loss": 0.3359, "step": 1383 }, { "epoch": 0.5110782865583456, "grad_norm": 0.3108684718608856, "learning_rate": 0.00016603029929794313, "loss": 0.383, "step": 1384 }, { "epoch": 0.5114475627769571, "grad_norm": 0.2670239210128784, "learning_rate": 0.00016600566572237961, "loss": 0.3371, "step": 1385 }, { "epoch": 0.5118168389955687, "grad_norm": 0.19630670547485352, "learning_rate": 0.00016598103214681613, "loss": 0.2472, "step": 1386 }, { "epoch": 0.5121861152141802, "grad_norm": 0.28781604766845703, "learning_rate": 0.00016595639857125262, "loss": 0.3235, "step": 1387 }, { "epoch": 0.5125553914327917, "grad_norm": 0.2403174489736557, "learning_rate": 0.00016593176499568913, "loss": 0.2812, "step": 1388 }, { "epoch": 0.5129246676514032, "grad_norm": 0.39566493034362793, "learning_rate": 0.00016590713142012565, "loss": 0.3435, "step": 1389 }, { "epoch": 0.5132939438700148, "grad_norm": 0.26088348031044006, "learning_rate": 0.00016588249784456216, "loss": 0.3114, "step": 1390 }, { "epoch": 0.5136632200886263, "grad_norm": 0.21565835177898407, "learning_rate": 0.00016585786426899865, "loss": 0.2756, "step": 1391 }, { "epoch": 0.5140324963072378, "grad_norm": 0.2134483903646469, "learning_rate": 0.00016583323069343516, "loss": 0.2684, "step": 1392 }, { "epoch": 0.5144017725258493, "grad_norm": 0.30606624484062195, "learning_rate": 0.00016580859711787168, "loss": 0.4073, "step": 1393 }, { "epoch": 0.5147710487444609, "grad_norm": 0.3195679485797882, "learning_rate": 0.0001657839635423082, "loss": 0.2985, "step": 1394 }, { "epoch": 0.5151403249630724, "grad_norm": 0.28844207525253296, "learning_rate": 0.00016575932996674468, "loss": 0.3097, "step": 1395 }, { "epoch": 0.5155096011816839, "grad_norm": 0.2352343499660492, "learning_rate": 0.0001657346963911812, "loss": 0.242, "step": 1396 }, { "epoch": 0.5158788774002954, "grad_norm": 0.2848149240016937, "learning_rate": 0.00016571006281561768, "loss": 0.3088, "step": 1397 }, { "epoch": 0.516248153618907, "grad_norm": 0.2923349440097809, "learning_rate": 0.0001656854292400542, "loss": 0.3575, "step": 1398 }, { "epoch": 0.5166174298375185, "grad_norm": 0.27221575379371643, "learning_rate": 0.0001656607956644907, "loss": 0.3295, "step": 1399 }, { "epoch": 0.51698670605613, "grad_norm": 0.32597747445106506, "learning_rate": 0.0001656361620889272, "loss": 0.2864, "step": 1400 }, { "epoch": 0.51698670605613, "eval_loss": 0.3158749043941498, "eval_runtime": 5.8634, "eval_samples_per_second": 8.527, "eval_steps_per_second": 1.194, "step": 1400 }, { "epoch": 0.5173559822747416, "grad_norm": 0.26604223251342773, "learning_rate": 0.0001656115285133637, "loss": 0.232, "step": 1401 }, { "epoch": 0.517725258493353, "grad_norm": 0.2920580506324768, "learning_rate": 0.00016558689493780023, "loss": 0.3138, "step": 1402 }, { "epoch": 0.5180945347119645, "grad_norm": 0.3129686117172241, "learning_rate": 0.00016556226136223674, "loss": 0.3843, "step": 1403 }, { "epoch": 0.518463810930576, "grad_norm": 0.2926866412162781, "learning_rate": 0.00016553762778667323, "loss": 0.3102, "step": 1404 }, { "epoch": 0.5188330871491876, "grad_norm": 0.3401108384132385, "learning_rate": 0.00016551299421110974, "loss": 0.3358, "step": 1405 }, { "epoch": 0.5192023633677991, "grad_norm": 0.3641965985298157, "learning_rate": 0.00016548836063554626, "loss": 0.3424, "step": 1406 }, { "epoch": 0.5195716395864106, "grad_norm": 0.24067524075508118, "learning_rate": 0.00016546372705998277, "loss": 0.2566, "step": 1407 }, { "epoch": 0.5199409158050221, "grad_norm": 0.28999650478363037, "learning_rate": 0.00016543909348441926, "loss": 0.3272, "step": 1408 }, { "epoch": 0.5203101920236337, "grad_norm": 0.25917020440101624, "learning_rate": 0.00016541445990885578, "loss": 0.2696, "step": 1409 }, { "epoch": 0.5206794682422452, "grad_norm": 0.3292158842086792, "learning_rate": 0.00016538982633329226, "loss": 0.3769, "step": 1410 }, { "epoch": 0.5210487444608567, "grad_norm": 0.24697770178318024, "learning_rate": 0.0001653651927577288, "loss": 0.2903, "step": 1411 }, { "epoch": 0.5214180206794683, "grad_norm": 0.35604941844940186, "learning_rate": 0.0001653405591821653, "loss": 0.3977, "step": 1412 }, { "epoch": 0.5217872968980798, "grad_norm": 0.2998948097229004, "learning_rate": 0.0001653159256066018, "loss": 0.3352, "step": 1413 }, { "epoch": 0.5221565731166913, "grad_norm": 0.3183724582195282, "learning_rate": 0.0001652912920310383, "loss": 0.3569, "step": 1414 }, { "epoch": 0.5225258493353028, "grad_norm": 0.36239326000213623, "learning_rate": 0.0001652666584554748, "loss": 0.412, "step": 1415 }, { "epoch": 0.5228951255539144, "grad_norm": 0.28282883763313293, "learning_rate": 0.00016524202487991132, "loss": 0.3554, "step": 1416 }, { "epoch": 0.5232644017725259, "grad_norm": 0.25308147072792053, "learning_rate": 0.00016521739130434784, "loss": 0.2832, "step": 1417 }, { "epoch": 0.5236336779911374, "grad_norm": 0.26561856269836426, "learning_rate": 0.00016519275772878433, "loss": 0.3491, "step": 1418 }, { "epoch": 0.5240029542097489, "grad_norm": 0.3264663517475128, "learning_rate": 0.00016516812415322084, "loss": 0.3381, "step": 1419 }, { "epoch": 0.5243722304283605, "grad_norm": 0.26392993330955505, "learning_rate": 0.00016514349057765736, "loss": 0.2983, "step": 1420 }, { "epoch": 0.524741506646972, "grad_norm": 0.22230780124664307, "learning_rate": 0.00016511885700209387, "loss": 0.2936, "step": 1421 }, { "epoch": 0.5251107828655834, "grad_norm": 0.2518426477909088, "learning_rate": 0.00016509422342653036, "loss": 0.3409, "step": 1422 }, { "epoch": 0.5254800590841949, "grad_norm": 0.28394436836242676, "learning_rate": 0.00016506958985096687, "loss": 0.2992, "step": 1423 }, { "epoch": 0.5258493353028065, "grad_norm": 0.2799946367740631, "learning_rate": 0.00016504495627540336, "loss": 0.339, "step": 1424 }, { "epoch": 0.526218611521418, "grad_norm": 0.24300873279571533, "learning_rate": 0.0001650203226998399, "loss": 0.2897, "step": 1425 }, { "epoch": 0.5265878877400295, "grad_norm": 0.2443646341562271, "learning_rate": 0.0001649956891242764, "loss": 0.2778, "step": 1426 }, { "epoch": 0.5269571639586411, "grad_norm": 0.24227364361286163, "learning_rate": 0.0001649710555487129, "loss": 0.3106, "step": 1427 }, { "epoch": 0.5273264401772526, "grad_norm": 0.24078361690044403, "learning_rate": 0.0001649464219731494, "loss": 0.3071, "step": 1428 }, { "epoch": 0.5276957163958641, "grad_norm": 0.26339876651763916, "learning_rate": 0.0001649217883975859, "loss": 0.3343, "step": 1429 }, { "epoch": 0.5280649926144756, "grad_norm": 0.28139811754226685, "learning_rate": 0.00016489715482202242, "loss": 0.307, "step": 1430 }, { "epoch": 0.5284342688330872, "grad_norm": 0.29128336906433105, "learning_rate": 0.00016487252124645894, "loss": 0.3207, "step": 1431 }, { "epoch": 0.5288035450516987, "grad_norm": 0.3636402189731598, "learning_rate": 0.00016484788767089542, "loss": 0.3457, "step": 1432 }, { "epoch": 0.5291728212703102, "grad_norm": 0.28574737906455994, "learning_rate": 0.00016482325409533194, "loss": 0.3648, "step": 1433 }, { "epoch": 0.5295420974889217, "grad_norm": 0.29462724924087524, "learning_rate": 0.00016479862051976845, "loss": 0.3001, "step": 1434 }, { "epoch": 0.5299113737075333, "grad_norm": 0.3110988438129425, "learning_rate": 0.00016477398694420497, "loss": 0.426, "step": 1435 }, { "epoch": 0.5302806499261448, "grad_norm": 0.2409798949956894, "learning_rate": 0.00016474935336864145, "loss": 0.2339, "step": 1436 }, { "epoch": 0.5306499261447563, "grad_norm": 0.387317031621933, "learning_rate": 0.00016472471979307797, "loss": 0.3548, "step": 1437 }, { "epoch": 0.5310192023633677, "grad_norm": 0.27128246426582336, "learning_rate": 0.00016470008621751448, "loss": 0.3148, "step": 1438 }, { "epoch": 0.5313884785819794, "grad_norm": 0.28824299573898315, "learning_rate": 0.000164675452641951, "loss": 0.3721, "step": 1439 }, { "epoch": 0.5317577548005908, "grad_norm": 0.27748748660087585, "learning_rate": 0.00016465081906638749, "loss": 0.3408, "step": 1440 }, { "epoch": 0.5321270310192023, "grad_norm": 0.2905904948711395, "learning_rate": 0.000164626185490824, "loss": 0.3243, "step": 1441 }, { "epoch": 0.5324963072378139, "grad_norm": 0.24771912395954132, "learning_rate": 0.0001646015519152605, "loss": 0.3371, "step": 1442 }, { "epoch": 0.5328655834564254, "grad_norm": 0.24672196805477142, "learning_rate": 0.00016457691833969703, "loss": 0.319, "step": 1443 }, { "epoch": 0.5332348596750369, "grad_norm": 0.3501986265182495, "learning_rate": 0.00016455228476413352, "loss": 0.3868, "step": 1444 }, { "epoch": 0.5336041358936484, "grad_norm": 0.2565975785255432, "learning_rate": 0.00016452765118857003, "loss": 0.3418, "step": 1445 }, { "epoch": 0.53397341211226, "grad_norm": 0.2441720813512802, "learning_rate": 0.00016450301761300652, "loss": 0.3622, "step": 1446 }, { "epoch": 0.5343426883308715, "grad_norm": 0.23341308534145355, "learning_rate": 0.00016447838403744303, "loss": 0.2762, "step": 1447 }, { "epoch": 0.534711964549483, "grad_norm": 0.2702963054180145, "learning_rate": 0.00016445375046187955, "loss": 0.3285, "step": 1448 }, { "epoch": 0.5350812407680945, "grad_norm": 0.3120301067829132, "learning_rate": 0.00016442911688631606, "loss": 0.3018, "step": 1449 }, { "epoch": 0.5354505169867061, "grad_norm": 0.2426011562347412, "learning_rate": 0.00016440448331075255, "loss": 0.3151, "step": 1450 }, { "epoch": 0.5354505169867061, "eval_loss": 0.3156881034374237, "eval_runtime": 5.8514, "eval_samples_per_second": 8.545, "eval_steps_per_second": 1.196, "step": 1450 }, { "epoch": 0.5358197932053176, "grad_norm": 0.24446044862270355, "learning_rate": 0.00016437984973518907, "loss": 0.3148, "step": 1451 }, { "epoch": 0.5361890694239291, "grad_norm": 0.3189534544944763, "learning_rate": 0.00016435521615962558, "loss": 0.3989, "step": 1452 }, { "epoch": 0.5365583456425406, "grad_norm": 0.24779005348682404, "learning_rate": 0.0001643305825840621, "loss": 0.279, "step": 1453 }, { "epoch": 0.5369276218611522, "grad_norm": 0.27577096223831177, "learning_rate": 0.00016430594900849858, "loss": 0.2801, "step": 1454 }, { "epoch": 0.5372968980797637, "grad_norm": 0.23628848791122437, "learning_rate": 0.0001642813154329351, "loss": 0.2881, "step": 1455 }, { "epoch": 0.5376661742983752, "grad_norm": 0.2869516909122467, "learning_rate": 0.00016425668185737158, "loss": 0.3083, "step": 1456 }, { "epoch": 0.5380354505169868, "grad_norm": 0.2787828743457794, "learning_rate": 0.00016423204828180813, "loss": 0.3451, "step": 1457 }, { "epoch": 0.5384047267355982, "grad_norm": 0.2797350585460663, "learning_rate": 0.00016420741470624461, "loss": 0.3591, "step": 1458 }, { "epoch": 0.5387740029542097, "grad_norm": 0.30753234028816223, "learning_rate": 0.00016418278113068113, "loss": 0.3568, "step": 1459 }, { "epoch": 0.5391432791728212, "grad_norm": 0.2536819875240326, "learning_rate": 0.00016415814755511762, "loss": 0.2977, "step": 1460 }, { "epoch": 0.5395125553914328, "grad_norm": 0.2714076638221741, "learning_rate": 0.00016413351397955413, "loss": 0.3208, "step": 1461 }, { "epoch": 0.5398818316100443, "grad_norm": 0.20879343152046204, "learning_rate": 0.00016410888040399065, "loss": 0.2451, "step": 1462 }, { "epoch": 0.5402511078286558, "grad_norm": 0.34940165281295776, "learning_rate": 0.00016408424682842716, "loss": 0.3927, "step": 1463 }, { "epoch": 0.5406203840472673, "grad_norm": 0.25528988242149353, "learning_rate": 0.00016405961325286365, "loss": 0.2849, "step": 1464 }, { "epoch": 0.5409896602658789, "grad_norm": 0.3349994719028473, "learning_rate": 0.00016403497967730016, "loss": 0.3715, "step": 1465 }, { "epoch": 0.5413589364844904, "grad_norm": 0.2674342393875122, "learning_rate": 0.00016401034610173668, "loss": 0.3148, "step": 1466 }, { "epoch": 0.5417282127031019, "grad_norm": 0.2494310438632965, "learning_rate": 0.0001639857125261732, "loss": 0.2822, "step": 1467 }, { "epoch": 0.5420974889217134, "grad_norm": 0.2767691910266876, "learning_rate": 0.00016396107895060968, "loss": 0.3036, "step": 1468 }, { "epoch": 0.542466765140325, "grad_norm": 0.24478130042552948, "learning_rate": 0.0001639364453750462, "loss": 0.2783, "step": 1469 }, { "epoch": 0.5428360413589365, "grad_norm": 0.25796937942504883, "learning_rate": 0.0001639118117994827, "loss": 0.3331, "step": 1470 }, { "epoch": 0.543205317577548, "grad_norm": 0.27148813009262085, "learning_rate": 0.00016388717822391922, "loss": 0.2906, "step": 1471 }, { "epoch": 0.5435745937961596, "grad_norm": 0.27060097455978394, "learning_rate": 0.0001638625446483557, "loss": 0.2871, "step": 1472 }, { "epoch": 0.5439438700147711, "grad_norm": 0.3541237413883209, "learning_rate": 0.00016383791107279222, "loss": 0.39, "step": 1473 }, { "epoch": 0.5443131462333826, "grad_norm": 0.2907795011997223, "learning_rate": 0.0001638132774972287, "loss": 0.3264, "step": 1474 }, { "epoch": 0.544682422451994, "grad_norm": 0.2731079161167145, "learning_rate": 0.00016378864392166525, "loss": 0.3401, "step": 1475 }, { "epoch": 0.5450516986706057, "grad_norm": 0.33572500944137573, "learning_rate": 0.00016376401034610174, "loss": 0.4631, "step": 1476 }, { "epoch": 0.5454209748892171, "grad_norm": 0.23366647958755493, "learning_rate": 0.00016373937677053826, "loss": 0.2643, "step": 1477 }, { "epoch": 0.5457902511078286, "grad_norm": 0.2857065200805664, "learning_rate": 0.00016371474319497474, "loss": 0.3489, "step": 1478 }, { "epoch": 0.5461595273264401, "grad_norm": 0.3102196455001831, "learning_rate": 0.00016369010961941126, "loss": 0.409, "step": 1479 }, { "epoch": 0.5465288035450517, "grad_norm": 0.3188258409500122, "learning_rate": 0.00016366547604384777, "loss": 0.3686, "step": 1480 }, { "epoch": 0.5468980797636632, "grad_norm": 0.3555956482887268, "learning_rate": 0.0001636408424682843, "loss": 0.3396, "step": 1481 }, { "epoch": 0.5472673559822747, "grad_norm": 0.2818538248538971, "learning_rate": 0.00016361620889272078, "loss": 0.3132, "step": 1482 }, { "epoch": 0.5476366322008862, "grad_norm": 0.23820848762989044, "learning_rate": 0.0001635915753171573, "loss": 0.3029, "step": 1483 }, { "epoch": 0.5480059084194978, "grad_norm": 0.3810647428035736, "learning_rate": 0.0001635669417415938, "loss": 0.3869, "step": 1484 }, { "epoch": 0.5483751846381093, "grad_norm": 0.27515873312950134, "learning_rate": 0.00016354230816603032, "loss": 0.3279, "step": 1485 }, { "epoch": 0.5487444608567208, "grad_norm": 0.23117667436599731, "learning_rate": 0.0001635176745904668, "loss": 0.337, "step": 1486 }, { "epoch": 0.5491137370753324, "grad_norm": 0.2633749842643738, "learning_rate": 0.00016349304101490332, "loss": 0.3625, "step": 1487 }, { "epoch": 0.5494830132939439, "grad_norm": 0.30863508582115173, "learning_rate": 0.0001634684074393398, "loss": 0.3967, "step": 1488 }, { "epoch": 0.5498522895125554, "grad_norm": 0.32042405009269714, "learning_rate": 0.00016344377386377635, "loss": 0.3298, "step": 1489 }, { "epoch": 0.5502215657311669, "grad_norm": 0.2894206941127777, "learning_rate": 0.00016341914028821284, "loss": 0.3117, "step": 1490 }, { "epoch": 0.5505908419497785, "grad_norm": 0.32331088185310364, "learning_rate": 0.00016339450671264935, "loss": 0.378, "step": 1491 }, { "epoch": 0.55096011816839, "grad_norm": 0.2450391948223114, "learning_rate": 0.00016336987313708584, "loss": 0.2865, "step": 1492 }, { "epoch": 0.5513293943870015, "grad_norm": 0.2539740800857544, "learning_rate": 0.00016334523956152236, "loss": 0.2886, "step": 1493 }, { "epoch": 0.551698670605613, "grad_norm": 0.29503604769706726, "learning_rate": 0.00016332060598595887, "loss": 0.335, "step": 1494 }, { "epoch": 0.5520679468242246, "grad_norm": 0.359852135181427, "learning_rate": 0.00016329597241039538, "loss": 0.3423, "step": 1495 }, { "epoch": 0.552437223042836, "grad_norm": 0.28774669766426086, "learning_rate": 0.00016327133883483187, "loss": 0.3651, "step": 1496 }, { "epoch": 0.5528064992614475, "grad_norm": 0.35923609137535095, "learning_rate": 0.0001632467052592684, "loss": 0.3273, "step": 1497 }, { "epoch": 0.553175775480059, "grad_norm": 0.23598815500736237, "learning_rate": 0.0001632220716837049, "loss": 0.2984, "step": 1498 }, { "epoch": 0.5535450516986706, "grad_norm": 0.3182399868965149, "learning_rate": 0.00016319743810814142, "loss": 0.3696, "step": 1499 }, { "epoch": 0.5539143279172821, "grad_norm": 0.23834967613220215, "learning_rate": 0.0001631728045325779, "loss": 0.2899, "step": 1500 }, { "epoch": 0.5539143279172821, "eval_loss": 0.3087127208709717, "eval_runtime": 5.8621, "eval_samples_per_second": 8.529, "eval_steps_per_second": 1.194, "step": 1500 }, { "epoch": 0.5542836041358936, "grad_norm": 0.2657660245895386, "learning_rate": 0.00016314817095701442, "loss": 0.3114, "step": 1501 }, { "epoch": 0.5546528803545052, "grad_norm": 0.29077285528182983, "learning_rate": 0.00016312353738145093, "loss": 0.3777, "step": 1502 }, { "epoch": 0.5550221565731167, "grad_norm": 0.2574448883533478, "learning_rate": 0.00016309890380588745, "loss": 0.3219, "step": 1503 }, { "epoch": 0.5553914327917282, "grad_norm": 0.26231813430786133, "learning_rate": 0.00016307427023032393, "loss": 0.3296, "step": 1504 }, { "epoch": 0.5557607090103397, "grad_norm": 0.2316693812608719, "learning_rate": 0.00016304963665476045, "loss": 0.2807, "step": 1505 }, { "epoch": 0.5561299852289513, "grad_norm": 0.3146055042743683, "learning_rate": 0.00016302500307919694, "loss": 0.3211, "step": 1506 }, { "epoch": 0.5564992614475628, "grad_norm": 0.2974916994571686, "learning_rate": 0.00016300036950363348, "loss": 0.3232, "step": 1507 }, { "epoch": 0.5568685376661743, "grad_norm": 0.25729867815971375, "learning_rate": 0.00016297573592806997, "loss": 0.3356, "step": 1508 }, { "epoch": 0.5572378138847858, "grad_norm": 0.23712551593780518, "learning_rate": 0.00016295110235250648, "loss": 0.2992, "step": 1509 }, { "epoch": 0.5576070901033974, "grad_norm": 0.3085513114929199, "learning_rate": 0.00016292646877694297, "loss": 0.4058, "step": 1510 }, { "epoch": 0.5579763663220089, "grad_norm": 0.3611229360103607, "learning_rate": 0.00016290183520137948, "loss": 0.4202, "step": 1511 }, { "epoch": 0.5583456425406204, "grad_norm": 0.2877354919910431, "learning_rate": 0.000162877201625816, "loss": 0.3227, "step": 1512 }, { "epoch": 0.558714918759232, "grad_norm": 0.32323339581489563, "learning_rate": 0.0001628525680502525, "loss": 0.3422, "step": 1513 }, { "epoch": 0.5590841949778435, "grad_norm": 0.32171431183815, "learning_rate": 0.000162827934474689, "loss": 0.3269, "step": 1514 }, { "epoch": 0.5594534711964549, "grad_norm": 0.28730499744415283, "learning_rate": 0.00016280330089912551, "loss": 0.3358, "step": 1515 }, { "epoch": 0.5598227474150664, "grad_norm": 0.3502312898635864, "learning_rate": 0.00016277866732356203, "loss": 0.3287, "step": 1516 }, { "epoch": 0.560192023633678, "grad_norm": 0.26959285140037537, "learning_rate": 0.00016275403374799854, "loss": 0.3193, "step": 1517 }, { "epoch": 0.5605612998522895, "grad_norm": 0.29513633251190186, "learning_rate": 0.00016272940017243503, "loss": 0.3663, "step": 1518 }, { "epoch": 0.560930576070901, "grad_norm": 0.3040134012699127, "learning_rate": 0.00016270476659687155, "loss": 0.3396, "step": 1519 }, { "epoch": 0.5612998522895125, "grad_norm": 0.3299552798271179, "learning_rate": 0.00016268013302130803, "loss": 0.3937, "step": 1520 }, { "epoch": 0.5616691285081241, "grad_norm": 0.32670196890830994, "learning_rate": 0.00016265549944574458, "loss": 0.293, "step": 1521 }, { "epoch": 0.5620384047267356, "grad_norm": 0.25746822357177734, "learning_rate": 0.00016263086587018106, "loss": 0.2973, "step": 1522 }, { "epoch": 0.5624076809453471, "grad_norm": 0.2539050281047821, "learning_rate": 0.00016260623229461758, "loss": 0.3251, "step": 1523 }, { "epoch": 0.5627769571639586, "grad_norm": 0.3070831894874573, "learning_rate": 0.00016258159871905406, "loss": 0.3441, "step": 1524 }, { "epoch": 0.5631462333825702, "grad_norm": 0.2641865015029907, "learning_rate": 0.00016255696514349058, "loss": 0.3218, "step": 1525 }, { "epoch": 0.5635155096011817, "grad_norm": 0.31374669075012207, "learning_rate": 0.0001625323315679271, "loss": 0.4017, "step": 1526 }, { "epoch": 0.5638847858197932, "grad_norm": 0.29651129245758057, "learning_rate": 0.0001625076979923636, "loss": 0.326, "step": 1527 }, { "epoch": 0.5642540620384048, "grad_norm": 0.30200543999671936, "learning_rate": 0.0001624830644168001, "loss": 0.332, "step": 1528 }, { "epoch": 0.5646233382570163, "grad_norm": 0.2974286675453186, "learning_rate": 0.0001624584308412366, "loss": 0.3546, "step": 1529 }, { "epoch": 0.5649926144756278, "grad_norm": 0.28115278482437134, "learning_rate": 0.00016243379726567313, "loss": 0.3282, "step": 1530 }, { "epoch": 0.5653618906942393, "grad_norm": 0.30444568395614624, "learning_rate": 0.00016240916369010964, "loss": 0.3091, "step": 1531 }, { "epoch": 0.5657311669128509, "grad_norm": 0.2837795317173004, "learning_rate": 0.00016238453011454613, "loss": 0.3632, "step": 1532 }, { "epoch": 0.5661004431314623, "grad_norm": 0.2722748816013336, "learning_rate": 0.00016235989653898264, "loss": 0.3298, "step": 1533 }, { "epoch": 0.5664697193500738, "grad_norm": 0.26394224166870117, "learning_rate": 0.00016233526296341913, "loss": 0.3369, "step": 1534 }, { "epoch": 0.5668389955686853, "grad_norm": 0.33864474296569824, "learning_rate": 0.00016231062938785567, "loss": 0.3344, "step": 1535 }, { "epoch": 0.5672082717872969, "grad_norm": 0.31261345744132996, "learning_rate": 0.00016228599581229216, "loss": 0.3695, "step": 1536 }, { "epoch": 0.5675775480059084, "grad_norm": 0.27573469281196594, "learning_rate": 0.00016226136223672867, "loss": 0.2538, "step": 1537 }, { "epoch": 0.5679468242245199, "grad_norm": 0.29221418499946594, "learning_rate": 0.00016223672866116516, "loss": 0.3761, "step": 1538 }, { "epoch": 0.5683161004431314, "grad_norm": 0.34038153290748596, "learning_rate": 0.0001622120950856017, "loss": 0.3817, "step": 1539 }, { "epoch": 0.568685376661743, "grad_norm": 0.3297428786754608, "learning_rate": 0.0001621874615100382, "loss": 0.3304, "step": 1540 }, { "epoch": 0.5690546528803545, "grad_norm": 0.3473551273345947, "learning_rate": 0.0001621628279344747, "loss": 0.3048, "step": 1541 }, { "epoch": 0.569423929098966, "grad_norm": 0.32104527950286865, "learning_rate": 0.0001621381943589112, "loss": 0.3712, "step": 1542 }, { "epoch": 0.5697932053175776, "grad_norm": 0.29048553109169006, "learning_rate": 0.0001621135607833477, "loss": 0.3003, "step": 1543 }, { "epoch": 0.5701624815361891, "grad_norm": 0.27641400694847107, "learning_rate": 0.00016208892720778422, "loss": 0.3013, "step": 1544 }, { "epoch": 0.5705317577548006, "grad_norm": 0.2748175263404846, "learning_rate": 0.00016206429363222074, "loss": 0.3312, "step": 1545 }, { "epoch": 0.5709010339734121, "grad_norm": 0.3276108503341675, "learning_rate": 0.00016203966005665722, "loss": 0.313, "step": 1546 }, { "epoch": 0.5712703101920237, "grad_norm": 0.2572093605995178, "learning_rate": 0.00016201502648109374, "loss": 0.3132, "step": 1547 }, { "epoch": 0.5716395864106352, "grad_norm": 0.29883354902267456, "learning_rate": 0.00016199039290553025, "loss": 0.2743, "step": 1548 }, { "epoch": 0.5720088626292467, "grad_norm": 0.24311719834804535, "learning_rate": 0.00016196575932996677, "loss": 0.2732, "step": 1549 }, { "epoch": 0.5723781388478582, "grad_norm": 0.25401267409324646, "learning_rate": 0.00016194112575440326, "loss": 0.2704, "step": 1550 }, { "epoch": 0.5723781388478582, "eval_loss": 0.3062502145767212, "eval_runtime": 5.8576, "eval_samples_per_second": 8.536, "eval_steps_per_second": 1.195, "step": 1550 }, { "epoch": 0.5727474150664698, "grad_norm": 0.339450865983963, "learning_rate": 0.00016191649217883977, "loss": 0.407, "step": 1551 }, { "epoch": 0.5731166912850812, "grad_norm": 0.3461816906929016, "learning_rate": 0.00016189185860327626, "loss": 0.3307, "step": 1552 }, { "epoch": 0.5734859675036927, "grad_norm": 0.3485092222690582, "learning_rate": 0.0001618672250277128, "loss": 0.3595, "step": 1553 }, { "epoch": 0.5738552437223042, "grad_norm": 0.29093149304389954, "learning_rate": 0.0001618425914521493, "loss": 0.3634, "step": 1554 }, { "epoch": 0.5742245199409158, "grad_norm": 0.2776423692703247, "learning_rate": 0.0001618179578765858, "loss": 0.2651, "step": 1555 }, { "epoch": 0.5745937961595273, "grad_norm": 0.2952934801578522, "learning_rate": 0.0001617933243010223, "loss": 0.3159, "step": 1556 }, { "epoch": 0.5749630723781388, "grad_norm": 0.27229222655296326, "learning_rate": 0.0001617686907254588, "loss": 0.2521, "step": 1557 }, { "epoch": 0.5753323485967504, "grad_norm": 0.2553476095199585, "learning_rate": 0.00016174405714989532, "loss": 0.2818, "step": 1558 }, { "epoch": 0.5757016248153619, "grad_norm": 0.32476967573165894, "learning_rate": 0.00016171942357433183, "loss": 0.4236, "step": 1559 }, { "epoch": 0.5760709010339734, "grad_norm": 0.2893090546131134, "learning_rate": 0.00016169478999876832, "loss": 0.3393, "step": 1560 }, { "epoch": 0.5764401772525849, "grad_norm": 0.4110506474971771, "learning_rate": 0.00016167015642320484, "loss": 0.3571, "step": 1561 }, { "epoch": 0.5768094534711965, "grad_norm": 0.33851632475852966, "learning_rate": 0.00016164552284764135, "loss": 0.3922, "step": 1562 }, { "epoch": 0.577178729689808, "grad_norm": 0.2789517343044281, "learning_rate": 0.00016162088927207786, "loss": 0.3559, "step": 1563 }, { "epoch": 0.5775480059084195, "grad_norm": 0.29222574830055237, "learning_rate": 0.00016159625569651435, "loss": 0.3155, "step": 1564 }, { "epoch": 0.577917282127031, "grad_norm": 0.2853766083717346, "learning_rate": 0.00016157162212095087, "loss": 0.3646, "step": 1565 }, { "epoch": 0.5782865583456426, "grad_norm": 0.317956805229187, "learning_rate": 0.00016154698854538735, "loss": 0.3804, "step": 1566 }, { "epoch": 0.5786558345642541, "grad_norm": 0.26587975025177, "learning_rate": 0.0001615223549698239, "loss": 0.357, "step": 1567 }, { "epoch": 0.5790251107828656, "grad_norm": 0.3031890392303467, "learning_rate": 0.00016149772139426038, "loss": 0.3592, "step": 1568 }, { "epoch": 0.579394387001477, "grad_norm": 0.2600473463535309, "learning_rate": 0.0001614730878186969, "loss": 0.2968, "step": 1569 }, { "epoch": 0.5797636632200887, "grad_norm": 0.2568517327308655, "learning_rate": 0.00016144845424313339, "loss": 0.3467, "step": 1570 }, { "epoch": 0.5801329394387001, "grad_norm": 0.2554601728916168, "learning_rate": 0.0001614238206675699, "loss": 0.3469, "step": 1571 }, { "epoch": 0.5805022156573116, "grad_norm": 0.31351980566978455, "learning_rate": 0.00016139918709200642, "loss": 0.3786, "step": 1572 }, { "epoch": 0.5808714918759232, "grad_norm": 0.2992032468318939, "learning_rate": 0.00016137455351644293, "loss": 0.3611, "step": 1573 }, { "epoch": 0.5812407680945347, "grad_norm": 0.3725816011428833, "learning_rate": 0.00016134991994087942, "loss": 0.3144, "step": 1574 }, { "epoch": 0.5816100443131462, "grad_norm": 0.31933802366256714, "learning_rate": 0.00016132528636531593, "loss": 0.4571, "step": 1575 }, { "epoch": 0.5819793205317577, "grad_norm": 0.33846473693847656, "learning_rate": 0.00016130065278975245, "loss": 0.3462, "step": 1576 }, { "epoch": 0.5823485967503693, "grad_norm": 0.28070947527885437, "learning_rate": 0.00016127601921418896, "loss": 0.3154, "step": 1577 }, { "epoch": 0.5827178729689808, "grad_norm": 0.2888851463794708, "learning_rate": 0.00016125138563862545, "loss": 0.4309, "step": 1578 }, { "epoch": 0.5830871491875923, "grad_norm": 0.30861085653305054, "learning_rate": 0.00016122675206306196, "loss": 0.3413, "step": 1579 }, { "epoch": 0.5834564254062038, "grad_norm": 0.27672407031059265, "learning_rate": 0.00016120211848749848, "loss": 0.3191, "step": 1580 }, { "epoch": 0.5838257016248154, "grad_norm": 0.2880988121032715, "learning_rate": 0.000161177484911935, "loss": 0.3164, "step": 1581 }, { "epoch": 0.5841949778434269, "grad_norm": 0.263375461101532, "learning_rate": 0.00016115285133637148, "loss": 0.3351, "step": 1582 }, { "epoch": 0.5845642540620384, "grad_norm": 0.2584865987300873, "learning_rate": 0.000161128217760808, "loss": 0.291, "step": 1583 }, { "epoch": 0.5849335302806499, "grad_norm": 0.30513739585876465, "learning_rate": 0.00016110358418524448, "loss": 0.3543, "step": 1584 }, { "epoch": 0.5853028064992615, "grad_norm": 0.2361738681793213, "learning_rate": 0.00016107895060968102, "loss": 0.2925, "step": 1585 }, { "epoch": 0.585672082717873, "grad_norm": 0.2888126075267792, "learning_rate": 0.0001610543170341175, "loss": 0.3024, "step": 1586 }, { "epoch": 0.5860413589364845, "grad_norm": 0.2634701430797577, "learning_rate": 0.00016102968345855403, "loss": 0.3267, "step": 1587 }, { "epoch": 0.5864106351550961, "grad_norm": 0.3056796193122864, "learning_rate": 0.00016100504988299051, "loss": 0.3574, "step": 1588 }, { "epoch": 0.5867799113737076, "grad_norm": 0.29132723808288574, "learning_rate": 0.00016098041630742703, "loss": 0.2785, "step": 1589 }, { "epoch": 0.587149187592319, "grad_norm": 0.24154838919639587, "learning_rate": 0.00016095578273186354, "loss": 0.3014, "step": 1590 }, { "epoch": 0.5875184638109305, "grad_norm": 0.295614093542099, "learning_rate": 0.00016093114915630006, "loss": 0.3135, "step": 1591 }, { "epoch": 0.5878877400295421, "grad_norm": 0.2389475256204605, "learning_rate": 0.00016090651558073655, "loss": 0.2642, "step": 1592 }, { "epoch": 0.5882570162481536, "grad_norm": 0.26387137174606323, "learning_rate": 0.00016088188200517306, "loss": 0.3154, "step": 1593 }, { "epoch": 0.5886262924667651, "grad_norm": 0.3425314128398895, "learning_rate": 0.00016085724842960957, "loss": 0.3827, "step": 1594 }, { "epoch": 0.5889955686853766, "grad_norm": 0.37990602850914, "learning_rate": 0.0001608326148540461, "loss": 0.4037, "step": 1595 }, { "epoch": 0.5893648449039882, "grad_norm": 0.30081379413604736, "learning_rate": 0.00016080798127848258, "loss": 0.3083, "step": 1596 }, { "epoch": 0.5897341211225997, "grad_norm": 0.2976696193218231, "learning_rate": 0.0001607833477029191, "loss": 0.3093, "step": 1597 }, { "epoch": 0.5901033973412112, "grad_norm": 0.25933101773262024, "learning_rate": 0.00016075871412735558, "loss": 0.2725, "step": 1598 }, { "epoch": 0.5904726735598228, "grad_norm": 0.27247390151023865, "learning_rate": 0.00016073408055179212, "loss": 0.2799, "step": 1599 }, { "epoch": 0.5908419497784343, "grad_norm": 0.30138272047042847, "learning_rate": 0.0001607094469762286, "loss": 0.3352, "step": 1600 }, { "epoch": 0.5908419497784343, "eval_loss": 0.30299845337867737, "eval_runtime": 5.8515, "eval_samples_per_second": 8.545, "eval_steps_per_second": 1.196, "step": 1600 }, { "epoch": 0.5912112259970458, "grad_norm": 0.2863950729370117, "learning_rate": 0.00016068481340066512, "loss": 0.3244, "step": 1601 }, { "epoch": 0.5915805022156573, "grad_norm": 0.28526681661605835, "learning_rate": 0.0001606601798251016, "loss": 0.3739, "step": 1602 }, { "epoch": 0.5919497784342689, "grad_norm": 0.32612308859825134, "learning_rate": 0.00016063554624953813, "loss": 0.3797, "step": 1603 }, { "epoch": 0.5923190546528804, "grad_norm": 0.2852473556995392, "learning_rate": 0.00016061091267397464, "loss": 0.3203, "step": 1604 }, { "epoch": 0.5926883308714919, "grad_norm": 0.23356491327285767, "learning_rate": 0.00016058627909841115, "loss": 0.2677, "step": 1605 }, { "epoch": 0.5930576070901034, "grad_norm": 0.27636459469795227, "learning_rate": 0.00016056164552284764, "loss": 0.2944, "step": 1606 }, { "epoch": 0.593426883308715, "grad_norm": 0.23538675904273987, "learning_rate": 0.00016053701194728416, "loss": 0.2815, "step": 1607 }, { "epoch": 0.5937961595273265, "grad_norm": 0.32497119903564453, "learning_rate": 0.00016051237837172067, "loss": 0.4065, "step": 1608 }, { "epoch": 0.5941654357459379, "grad_norm": 0.3353843092918396, "learning_rate": 0.00016048774479615719, "loss": 0.4053, "step": 1609 }, { "epoch": 0.5945347119645494, "grad_norm": 0.24563480913639069, "learning_rate": 0.00016046311122059367, "loss": 0.2794, "step": 1610 }, { "epoch": 0.594903988183161, "grad_norm": 0.31058967113494873, "learning_rate": 0.0001604384776450302, "loss": 0.3113, "step": 1611 }, { "epoch": 0.5952732644017725, "grad_norm": 0.2887929081916809, "learning_rate": 0.0001604138440694667, "loss": 0.3434, "step": 1612 }, { "epoch": 0.595642540620384, "grad_norm": 0.2597660422325134, "learning_rate": 0.00016038921049390322, "loss": 0.231, "step": 1613 }, { "epoch": 0.5960118168389956, "grad_norm": 0.3016369938850403, "learning_rate": 0.0001603645769183397, "loss": 0.3576, "step": 1614 }, { "epoch": 0.5963810930576071, "grad_norm": 0.37979385256767273, "learning_rate": 0.00016033994334277622, "loss": 0.3593, "step": 1615 }, { "epoch": 0.5967503692762186, "grad_norm": 0.2681538760662079, "learning_rate": 0.0001603153097672127, "loss": 0.2734, "step": 1616 }, { "epoch": 0.5971196454948301, "grad_norm": 0.2787245810031891, "learning_rate": 0.00016029067619164925, "loss": 0.3407, "step": 1617 }, { "epoch": 0.5974889217134417, "grad_norm": 0.39077043533325195, "learning_rate": 0.00016026604261608574, "loss": 0.4054, "step": 1618 }, { "epoch": 0.5978581979320532, "grad_norm": 0.29116880893707275, "learning_rate": 0.00016024140904052225, "loss": 0.3491, "step": 1619 }, { "epoch": 0.5982274741506647, "grad_norm": 0.32160866260528564, "learning_rate": 0.00016021677546495874, "loss": 0.3142, "step": 1620 }, { "epoch": 0.5985967503692762, "grad_norm": 0.29578158259391785, "learning_rate": 0.00016019214188939525, "loss": 0.3588, "step": 1621 }, { "epoch": 0.5989660265878878, "grad_norm": 0.2547626197338104, "learning_rate": 0.00016016750831383177, "loss": 0.3059, "step": 1622 }, { "epoch": 0.5993353028064993, "grad_norm": 0.3139692544937134, "learning_rate": 0.00016014287473826828, "loss": 0.3273, "step": 1623 }, { "epoch": 0.5997045790251108, "grad_norm": 0.2903187870979309, "learning_rate": 0.00016011824116270477, "loss": 0.3736, "step": 1624 }, { "epoch": 0.6000738552437223, "grad_norm": 0.2500844895839691, "learning_rate": 0.00016009360758714128, "loss": 0.3023, "step": 1625 }, { "epoch": 0.6004431314623339, "grad_norm": 0.29968592524528503, "learning_rate": 0.0001600689740115778, "loss": 0.324, "step": 1626 }, { "epoch": 0.6008124076809453, "grad_norm": 0.2452509105205536, "learning_rate": 0.00016004434043601431, "loss": 0.3263, "step": 1627 }, { "epoch": 0.6011816838995568, "grad_norm": 0.26588737964630127, "learning_rate": 0.0001600197068604508, "loss": 0.2953, "step": 1628 }, { "epoch": 0.6015509601181684, "grad_norm": 0.2977176010608673, "learning_rate": 0.0001599950732848873, "loss": 0.3184, "step": 1629 }, { "epoch": 0.6019202363367799, "grad_norm": 0.30278778076171875, "learning_rate": 0.0001599704397093238, "loss": 0.4025, "step": 1630 }, { "epoch": 0.6022895125553914, "grad_norm": 0.2819361090660095, "learning_rate": 0.00015994580613376032, "loss": 0.3347, "step": 1631 }, { "epoch": 0.6026587887740029, "grad_norm": 0.24064016342163086, "learning_rate": 0.00015992117255819683, "loss": 0.3209, "step": 1632 }, { "epoch": 0.6030280649926145, "grad_norm": 0.31922975182533264, "learning_rate": 0.00015989653898263332, "loss": 0.3373, "step": 1633 }, { "epoch": 0.603397341211226, "grad_norm": 0.35239550471305847, "learning_rate": 0.00015987190540706984, "loss": 0.3235, "step": 1634 }, { "epoch": 0.6037666174298375, "grad_norm": 0.35154789686203003, "learning_rate": 0.00015984727183150635, "loss": 0.371, "step": 1635 }, { "epoch": 0.604135893648449, "grad_norm": 0.2805372178554535, "learning_rate": 0.00015982263825594286, "loss": 0.316, "step": 1636 }, { "epoch": 0.6045051698670606, "grad_norm": 0.2914516031742096, "learning_rate": 0.00015979800468037935, "loss": 0.3622, "step": 1637 }, { "epoch": 0.6048744460856721, "grad_norm": 0.297642320394516, "learning_rate": 0.00015977337110481587, "loss": 0.3404, "step": 1638 }, { "epoch": 0.6052437223042836, "grad_norm": 0.32754257321357727, "learning_rate": 0.00015974873752925238, "loss": 0.3707, "step": 1639 }, { "epoch": 0.6056129985228951, "grad_norm": 0.2718541920185089, "learning_rate": 0.0001597241039536889, "loss": 0.3246, "step": 1640 }, { "epoch": 0.6059822747415067, "grad_norm": 0.28009459376335144, "learning_rate": 0.00015969947037812538, "loss": 0.2848, "step": 1641 }, { "epoch": 0.6063515509601182, "grad_norm": 0.27391892671585083, "learning_rate": 0.0001596748368025619, "loss": 0.2587, "step": 1642 }, { "epoch": 0.6067208271787297, "grad_norm": 0.25154897570610046, "learning_rate": 0.00015965020322699839, "loss": 0.2921, "step": 1643 }, { "epoch": 0.6070901033973413, "grad_norm": 0.30357787013053894, "learning_rate": 0.00015962556965143493, "loss": 0.3288, "step": 1644 }, { "epoch": 0.6074593796159528, "grad_norm": 0.3554220497608185, "learning_rate": 0.00015960093607587141, "loss": 0.4087, "step": 1645 }, { "epoch": 0.6078286558345642, "grad_norm": 0.29400530457496643, "learning_rate": 0.00015957630250030793, "loss": 0.3002, "step": 1646 }, { "epoch": 0.6081979320531757, "grad_norm": 0.25072628259658813, "learning_rate": 0.00015955166892474442, "loss": 0.2878, "step": 1647 }, { "epoch": 0.6085672082717873, "grad_norm": 0.24574296176433563, "learning_rate": 0.00015952703534918093, "loss": 0.306, "step": 1648 }, { "epoch": 0.6089364844903988, "grad_norm": 0.2616758346557617, "learning_rate": 0.00015950240177361745, "loss": 0.3238, "step": 1649 }, { "epoch": 0.6093057607090103, "grad_norm": 0.5619114637374878, "learning_rate": 0.00015947776819805396, "loss": 0.332, "step": 1650 }, { "epoch": 0.6093057607090103, "eval_loss": 0.3030269742012024, "eval_runtime": 5.8506, "eval_samples_per_second": 8.546, "eval_steps_per_second": 1.196, "step": 1650 }, { "epoch": 0.6096750369276218, "grad_norm": 0.2748061716556549, "learning_rate": 0.00015945313462249045, "loss": 0.326, "step": 1651 }, { "epoch": 0.6100443131462334, "grad_norm": 0.26775607466697693, "learning_rate": 0.00015942850104692696, "loss": 0.3089, "step": 1652 }, { "epoch": 0.6104135893648449, "grad_norm": 0.29094046354293823, "learning_rate": 0.00015940386747136348, "loss": 0.3442, "step": 1653 }, { "epoch": 0.6107828655834564, "grad_norm": 0.3499103784561157, "learning_rate": 0.0001593792338958, "loss": 0.3428, "step": 1654 }, { "epoch": 0.6111521418020679, "grad_norm": 0.29585933685302734, "learning_rate": 0.00015935460032023648, "loss": 0.3881, "step": 1655 }, { "epoch": 0.6115214180206795, "grad_norm": 0.28750079870224, "learning_rate": 0.000159329966744673, "loss": 0.3441, "step": 1656 }, { "epoch": 0.611890694239291, "grad_norm": 0.3632444441318512, "learning_rate": 0.00015930533316910948, "loss": 0.4613, "step": 1657 }, { "epoch": 0.6122599704579025, "grad_norm": 0.3101236820220947, "learning_rate": 0.00015928069959354602, "loss": 0.3227, "step": 1658 }, { "epoch": 0.6126292466765141, "grad_norm": 0.30292463302612305, "learning_rate": 0.0001592560660179825, "loss": 0.3794, "step": 1659 }, { "epoch": 0.6129985228951256, "grad_norm": 0.2677481472492218, "learning_rate": 0.00015923143244241903, "loss": 0.3185, "step": 1660 }, { "epoch": 0.6133677991137371, "grad_norm": 0.25809037685394287, "learning_rate": 0.0001592067988668555, "loss": 0.3089, "step": 1661 }, { "epoch": 0.6137370753323486, "grad_norm": 0.23848919570446014, "learning_rate": 0.00015918216529129203, "loss": 0.2741, "step": 1662 }, { "epoch": 0.6141063515509602, "grad_norm": 0.30430811643600464, "learning_rate": 0.00015915753171572854, "loss": 0.3688, "step": 1663 }, { "epoch": 0.6144756277695717, "grad_norm": 0.246963769197464, "learning_rate": 0.00015913289814016506, "loss": 0.2821, "step": 1664 }, { "epoch": 0.6148449039881831, "grad_norm": 0.24022459983825684, "learning_rate": 0.00015910826456460154, "loss": 0.3068, "step": 1665 }, { "epoch": 0.6152141802067946, "grad_norm": 0.2601327896118164, "learning_rate": 0.00015908363098903806, "loss": 0.2922, "step": 1666 }, { "epoch": 0.6155834564254062, "grad_norm": 0.27064478397369385, "learning_rate": 0.00015905899741347457, "loss": 0.3405, "step": 1667 }, { "epoch": 0.6159527326440177, "grad_norm": 0.3012557625770569, "learning_rate": 0.0001590343638379111, "loss": 0.3401, "step": 1668 }, { "epoch": 0.6163220088626292, "grad_norm": 0.237897127866745, "learning_rate": 0.00015900973026234758, "loss": 0.2843, "step": 1669 }, { "epoch": 0.6166912850812407, "grad_norm": 0.24375304579734802, "learning_rate": 0.0001589850966867841, "loss": 0.3108, "step": 1670 }, { "epoch": 0.6170605612998523, "grad_norm": 0.291055291891098, "learning_rate": 0.0001589604631112206, "loss": 0.3754, "step": 1671 }, { "epoch": 0.6174298375184638, "grad_norm": 0.23340968787670135, "learning_rate": 0.00015893582953565712, "loss": 0.3151, "step": 1672 }, { "epoch": 0.6177991137370753, "grad_norm": 0.25678062438964844, "learning_rate": 0.0001589111959600936, "loss": 0.3065, "step": 1673 }, { "epoch": 0.6181683899556869, "grad_norm": 0.27369391918182373, "learning_rate": 0.00015888656238453012, "loss": 0.2954, "step": 1674 }, { "epoch": 0.6185376661742984, "grad_norm": 0.2728053629398346, "learning_rate": 0.0001588619288089666, "loss": 0.331, "step": 1675 }, { "epoch": 0.6189069423929099, "grad_norm": 0.26507559418678284, "learning_rate": 0.00015883729523340315, "loss": 0.2763, "step": 1676 }, { "epoch": 0.6192762186115214, "grad_norm": 0.3048473000526428, "learning_rate": 0.00015881266165783964, "loss": 0.3407, "step": 1677 }, { "epoch": 0.619645494830133, "grad_norm": 0.2589951157569885, "learning_rate": 0.00015878802808227615, "loss": 0.3002, "step": 1678 }, { "epoch": 0.6200147710487445, "grad_norm": 0.2744160592556, "learning_rate": 0.00015876339450671264, "loss": 0.335, "step": 1679 }, { "epoch": 0.620384047267356, "grad_norm": 0.26012927293777466, "learning_rate": 0.00015873876093114916, "loss": 0.3154, "step": 1680 }, { "epoch": 0.6207533234859675, "grad_norm": 0.2780097424983978, "learning_rate": 0.00015871412735558567, "loss": 0.3, "step": 1681 }, { "epoch": 0.6211225997045791, "grad_norm": 0.20938009023666382, "learning_rate": 0.00015868949378002219, "loss": 0.2146, "step": 1682 }, { "epoch": 0.6214918759231906, "grad_norm": 0.2516339421272278, "learning_rate": 0.00015866486020445867, "loss": 0.2809, "step": 1683 }, { "epoch": 0.621861152141802, "grad_norm": 0.2812969386577606, "learning_rate": 0.0001586402266288952, "loss": 0.3003, "step": 1684 }, { "epoch": 0.6222304283604135, "grad_norm": 0.20179781317710876, "learning_rate": 0.0001586155930533317, "loss": 0.2153, "step": 1685 }, { "epoch": 0.6225997045790251, "grad_norm": 0.33972302079200745, "learning_rate": 0.00015859095947776822, "loss": 0.3614, "step": 1686 }, { "epoch": 0.6229689807976366, "grad_norm": 0.2498815804719925, "learning_rate": 0.0001585663259022047, "loss": 0.332, "step": 1687 }, { "epoch": 0.6233382570162481, "grad_norm": 0.30376482009887695, "learning_rate": 0.00015854169232664122, "loss": 0.3114, "step": 1688 }, { "epoch": 0.6237075332348597, "grad_norm": 0.2777147591114044, "learning_rate": 0.0001585170587510777, "loss": 0.3257, "step": 1689 }, { "epoch": 0.6240768094534712, "grad_norm": 0.28457579016685486, "learning_rate": 0.00015849242517551425, "loss": 0.3483, "step": 1690 }, { "epoch": 0.6244460856720827, "grad_norm": 0.3040483593940735, "learning_rate": 0.00015846779159995074, "loss": 0.3239, "step": 1691 }, { "epoch": 0.6248153618906942, "grad_norm": 0.2769504189491272, "learning_rate": 0.00015844315802438725, "loss": 0.3388, "step": 1692 }, { "epoch": 0.6251846381093058, "grad_norm": 0.2751236855983734, "learning_rate": 0.00015841852444882374, "loss": 0.3227, "step": 1693 }, { "epoch": 0.6255539143279173, "grad_norm": 0.2764040231704712, "learning_rate": 0.00015839389087326025, "loss": 0.3437, "step": 1694 }, { "epoch": 0.6259231905465288, "grad_norm": 0.25166937708854675, "learning_rate": 0.00015836925729769677, "loss": 0.2892, "step": 1695 }, { "epoch": 0.6262924667651403, "grad_norm": 0.307192325592041, "learning_rate": 0.00015834462372213328, "loss": 0.3501, "step": 1696 }, { "epoch": 0.6266617429837519, "grad_norm": 0.33956125378608704, "learning_rate": 0.00015831999014656977, "loss": 0.4298, "step": 1697 }, { "epoch": 0.6270310192023634, "grad_norm": 0.21702831983566284, "learning_rate": 0.00015829535657100628, "loss": 0.2942, "step": 1698 }, { "epoch": 0.6274002954209749, "grad_norm": 0.27251285314559937, "learning_rate": 0.0001582707229954428, "loss": 0.3085, "step": 1699 }, { "epoch": 0.6277695716395865, "grad_norm": 0.2970748245716095, "learning_rate": 0.0001582460894198793, "loss": 0.3112, "step": 1700 }, { "epoch": 0.6277695716395865, "eval_loss": 0.2985492944717407, "eval_runtime": 5.8512, "eval_samples_per_second": 8.545, "eval_steps_per_second": 1.196, "step": 1700 }, { "epoch": 0.628138847858198, "grad_norm": 0.32603204250335693, "learning_rate": 0.0001582214558443158, "loss": 0.3757, "step": 1701 }, { "epoch": 0.6285081240768094, "grad_norm": 0.3002977967262268, "learning_rate": 0.00015819682226875232, "loss": 0.3623, "step": 1702 }, { "epoch": 0.6288774002954209, "grad_norm": 0.2899661958217621, "learning_rate": 0.0001581721886931888, "loss": 0.321, "step": 1703 }, { "epoch": 0.6292466765140325, "grad_norm": 0.29438790678977966, "learning_rate": 0.00015814755511762534, "loss": 0.3842, "step": 1704 }, { "epoch": 0.629615952732644, "grad_norm": 0.28124135732650757, "learning_rate": 0.00015812292154206183, "loss": 0.3379, "step": 1705 }, { "epoch": 0.6299852289512555, "grad_norm": 0.29183897376060486, "learning_rate": 0.00015809828796649835, "loss": 0.3267, "step": 1706 }, { "epoch": 0.630354505169867, "grad_norm": 0.30754607915878296, "learning_rate": 0.00015807365439093483, "loss": 0.382, "step": 1707 }, { "epoch": 0.6307237813884786, "grad_norm": 0.2588324546813965, "learning_rate": 0.00015804902081537135, "loss": 0.3673, "step": 1708 }, { "epoch": 0.6310930576070901, "grad_norm": 0.3831642270088196, "learning_rate": 0.00015802438723980786, "loss": 0.3531, "step": 1709 }, { "epoch": 0.6314623338257016, "grad_norm": 0.24796739220619202, "learning_rate": 0.00015799975366424438, "loss": 0.2398, "step": 1710 }, { "epoch": 0.6318316100443131, "grad_norm": 0.27409496903419495, "learning_rate": 0.00015797512008868087, "loss": 0.3508, "step": 1711 }, { "epoch": 0.6322008862629247, "grad_norm": 0.3816826641559601, "learning_rate": 0.00015795048651311738, "loss": 0.3976, "step": 1712 }, { "epoch": 0.6325701624815362, "grad_norm": 0.26184147596359253, "learning_rate": 0.0001579258529375539, "loss": 0.3316, "step": 1713 }, { "epoch": 0.6329394387001477, "grad_norm": 0.24922801554203033, "learning_rate": 0.0001579012193619904, "loss": 0.3003, "step": 1714 }, { "epoch": 0.6333087149187593, "grad_norm": 0.3036719560623169, "learning_rate": 0.0001578765857864269, "loss": 0.3311, "step": 1715 }, { "epoch": 0.6336779911373708, "grad_norm": 0.2686334550380707, "learning_rate": 0.0001578519522108634, "loss": 0.3117, "step": 1716 }, { "epoch": 0.6340472673559823, "grad_norm": 0.3197580575942993, "learning_rate": 0.00015782731863529993, "loss": 0.3464, "step": 1717 }, { "epoch": 0.6344165435745938, "grad_norm": 0.2406512051820755, "learning_rate": 0.00015780268505973644, "loss": 0.2651, "step": 1718 }, { "epoch": 0.6347858197932054, "grad_norm": 0.29690489172935486, "learning_rate": 0.00015777805148417293, "loss": 0.4247, "step": 1719 }, { "epoch": 0.6351550960118169, "grad_norm": 0.28856369853019714, "learning_rate": 0.00015775341790860944, "loss": 0.3043, "step": 1720 }, { "epoch": 0.6355243722304283, "grad_norm": 0.27125653624534607, "learning_rate": 0.00015772878433304593, "loss": 0.2969, "step": 1721 }, { "epoch": 0.6358936484490398, "grad_norm": 0.23672378063201904, "learning_rate": 0.00015770415075748247, "loss": 0.2979, "step": 1722 }, { "epoch": 0.6362629246676514, "grad_norm": 0.2650850713253021, "learning_rate": 0.00015767951718191896, "loss": 0.2636, "step": 1723 }, { "epoch": 0.6366322008862629, "grad_norm": 0.22631783783435822, "learning_rate": 0.00015765488360635547, "loss": 0.2606, "step": 1724 }, { "epoch": 0.6370014771048744, "grad_norm": 0.28124675154685974, "learning_rate": 0.00015763025003079196, "loss": 0.2768, "step": 1725 }, { "epoch": 0.6373707533234859, "grad_norm": 0.26289159059524536, "learning_rate": 0.00015760561645522848, "loss": 0.3184, "step": 1726 }, { "epoch": 0.6377400295420975, "grad_norm": 0.26568078994750977, "learning_rate": 0.000157580982879665, "loss": 0.2835, "step": 1727 }, { "epoch": 0.638109305760709, "grad_norm": 0.26100102066993713, "learning_rate": 0.0001575563493041015, "loss": 0.3013, "step": 1728 }, { "epoch": 0.6384785819793205, "grad_norm": 0.2617495656013489, "learning_rate": 0.000157531715728538, "loss": 0.3099, "step": 1729 }, { "epoch": 0.6388478581979321, "grad_norm": 0.24113066494464874, "learning_rate": 0.0001575070821529745, "loss": 0.2601, "step": 1730 }, { "epoch": 0.6392171344165436, "grad_norm": 0.2747572958469391, "learning_rate": 0.00015748244857741102, "loss": 0.3056, "step": 1731 }, { "epoch": 0.6395864106351551, "grad_norm": 0.23434069752693176, "learning_rate": 0.00015745781500184754, "loss": 0.3247, "step": 1732 }, { "epoch": 0.6399556868537666, "grad_norm": 0.2345058023929596, "learning_rate": 0.00015743318142628403, "loss": 0.3043, "step": 1733 }, { "epoch": 0.6403249630723782, "grad_norm": 0.29427266120910645, "learning_rate": 0.00015740854785072054, "loss": 0.3494, "step": 1734 }, { "epoch": 0.6406942392909897, "grad_norm": 0.2852862775325775, "learning_rate": 0.00015738391427515703, "loss": 0.3587, "step": 1735 }, { "epoch": 0.6410635155096012, "grad_norm": 0.2725900113582611, "learning_rate": 0.00015735928069959357, "loss": 0.2966, "step": 1736 }, { "epoch": 0.6414327917282127, "grad_norm": 0.30448734760284424, "learning_rate": 0.00015733464712403006, "loss": 0.3303, "step": 1737 }, { "epoch": 0.6418020679468243, "grad_norm": 0.25536197423934937, "learning_rate": 0.00015731001354846657, "loss": 0.3302, "step": 1738 }, { "epoch": 0.6421713441654358, "grad_norm": 0.31083613634109497, "learning_rate": 0.00015728537997290306, "loss": 0.3966, "step": 1739 }, { "epoch": 0.6425406203840472, "grad_norm": 0.2993643879890442, "learning_rate": 0.00015726074639733957, "loss": 0.3034, "step": 1740 }, { "epoch": 0.6429098966026587, "grad_norm": 0.31519269943237305, "learning_rate": 0.0001572361128217761, "loss": 0.2813, "step": 1741 }, { "epoch": 0.6432791728212703, "grad_norm": 0.25670289993286133, "learning_rate": 0.0001572114792462126, "loss": 0.2646, "step": 1742 }, { "epoch": 0.6436484490398818, "grad_norm": 0.25279560685157776, "learning_rate": 0.0001571868456706491, "loss": 0.2919, "step": 1743 }, { "epoch": 0.6440177252584933, "grad_norm": 0.2225867360830307, "learning_rate": 0.0001571622120950856, "loss": 0.2817, "step": 1744 }, { "epoch": 0.6443870014771049, "grad_norm": 0.2555961608886719, "learning_rate": 0.00015713757851952212, "loss": 0.3269, "step": 1745 }, { "epoch": 0.6447562776957164, "grad_norm": 0.27634233236312866, "learning_rate": 0.00015711294494395863, "loss": 0.3494, "step": 1746 }, { "epoch": 0.6451255539143279, "grad_norm": 0.2750725746154785, "learning_rate": 0.00015708831136839512, "loss": 0.3578, "step": 1747 }, { "epoch": 0.6454948301329394, "grad_norm": 0.30054372549057007, "learning_rate": 0.00015706367779283164, "loss": 0.3006, "step": 1748 }, { "epoch": 0.645864106351551, "grad_norm": 0.25382721424102783, "learning_rate": 0.00015703904421726815, "loss": 0.2913, "step": 1749 }, { "epoch": 0.6462333825701625, "grad_norm": 0.2811352014541626, "learning_rate": 0.00015701441064170467, "loss": 0.3342, "step": 1750 }, { "epoch": 0.6462333825701625, "eval_loss": 0.29770615696907043, "eval_runtime": 5.8464, "eval_samples_per_second": 8.552, "eval_steps_per_second": 1.197, "step": 1750 }, { "epoch": 0.646602658788774, "grad_norm": 0.27353596687316895, "learning_rate": 0.00015698977706614115, "loss": 0.2868, "step": 1751 }, { "epoch": 0.6469719350073855, "grad_norm": 0.3044775128364563, "learning_rate": 0.00015696514349057767, "loss": 0.3465, "step": 1752 }, { "epoch": 0.6473412112259971, "grad_norm": 0.26236966252326965, "learning_rate": 0.00015694050991501416, "loss": 0.2984, "step": 1753 }, { "epoch": 0.6477104874446086, "grad_norm": 0.26648154854774475, "learning_rate": 0.0001569158763394507, "loss": 0.25, "step": 1754 }, { "epoch": 0.6480797636632201, "grad_norm": 0.22502334415912628, "learning_rate": 0.00015689124276388718, "loss": 0.3028, "step": 1755 }, { "epoch": 0.6484490398818316, "grad_norm": 0.3449307382106781, "learning_rate": 0.0001568666091883237, "loss": 0.3149, "step": 1756 }, { "epoch": 0.6488183161004432, "grad_norm": 0.2570543885231018, "learning_rate": 0.0001568419756127602, "loss": 0.2916, "step": 1757 }, { "epoch": 0.6491875923190547, "grad_norm": 0.30169492959976196, "learning_rate": 0.0001568173420371967, "loss": 0.3588, "step": 1758 }, { "epoch": 0.6495568685376661, "grad_norm": 0.2638286054134369, "learning_rate": 0.00015679270846163322, "loss": 0.3143, "step": 1759 }, { "epoch": 0.6499261447562777, "grad_norm": 0.2750193476676941, "learning_rate": 0.00015676807488606973, "loss": 0.3078, "step": 1760 }, { "epoch": 0.6502954209748892, "grad_norm": 0.25509074330329895, "learning_rate": 0.00015674344131050622, "loss": 0.2535, "step": 1761 }, { "epoch": 0.6506646971935007, "grad_norm": 0.23687632381916046, "learning_rate": 0.00015671880773494273, "loss": 0.2834, "step": 1762 }, { "epoch": 0.6510339734121122, "grad_norm": 0.25632792711257935, "learning_rate": 0.00015669417415937925, "loss": 0.3015, "step": 1763 }, { "epoch": 0.6514032496307238, "grad_norm": 0.2542288601398468, "learning_rate": 0.00015666954058381576, "loss": 0.2827, "step": 1764 }, { "epoch": 0.6517725258493353, "grad_norm": 0.3169395923614502, "learning_rate": 0.00015664490700825225, "loss": 0.3251, "step": 1765 }, { "epoch": 0.6521418020679468, "grad_norm": 0.28818562626838684, "learning_rate": 0.00015662027343268876, "loss": 0.3101, "step": 1766 }, { "epoch": 0.6525110782865583, "grad_norm": 0.2993204891681671, "learning_rate": 0.00015659563985712525, "loss": 0.336, "step": 1767 }, { "epoch": 0.6528803545051699, "grad_norm": 0.2634614109992981, "learning_rate": 0.0001565710062815618, "loss": 0.3328, "step": 1768 }, { "epoch": 0.6532496307237814, "grad_norm": 0.29314500093460083, "learning_rate": 0.00015654637270599828, "loss": 0.38, "step": 1769 }, { "epoch": 0.6536189069423929, "grad_norm": 0.24318912625312805, "learning_rate": 0.0001565217391304348, "loss": 0.2596, "step": 1770 }, { "epoch": 0.6539881831610044, "grad_norm": 0.28800836205482483, "learning_rate": 0.00015649710555487128, "loss": 0.3233, "step": 1771 }, { "epoch": 0.654357459379616, "grad_norm": 0.28744104504585266, "learning_rate": 0.0001564724719793078, "loss": 0.3106, "step": 1772 }, { "epoch": 0.6547267355982275, "grad_norm": 0.2733234465122223, "learning_rate": 0.0001564478384037443, "loss": 0.3522, "step": 1773 }, { "epoch": 0.655096011816839, "grad_norm": 0.3021034300327301, "learning_rate": 0.00015642320482818083, "loss": 0.3618, "step": 1774 }, { "epoch": 0.6554652880354506, "grad_norm": 0.28585657477378845, "learning_rate": 0.00015639857125261731, "loss": 0.325, "step": 1775 }, { "epoch": 0.6558345642540621, "grad_norm": 0.3492552936077118, "learning_rate": 0.00015637393767705383, "loss": 0.3431, "step": 1776 }, { "epoch": 0.6562038404726735, "grad_norm": 0.26289793848991394, "learning_rate": 0.00015634930410149034, "loss": 0.2999, "step": 1777 }, { "epoch": 0.656573116691285, "grad_norm": 0.2919107675552368, "learning_rate": 0.00015632467052592686, "loss": 0.2475, "step": 1778 }, { "epoch": 0.6569423929098966, "grad_norm": 0.2998773455619812, "learning_rate": 0.00015630003695036335, "loss": 0.3243, "step": 1779 }, { "epoch": 0.6573116691285081, "grad_norm": 0.30049073696136475, "learning_rate": 0.00015627540337479986, "loss": 0.3504, "step": 1780 }, { "epoch": 0.6576809453471196, "grad_norm": 0.2768280506134033, "learning_rate": 0.00015625076979923638, "loss": 0.3045, "step": 1781 }, { "epoch": 0.6580502215657311, "grad_norm": 0.2078462392091751, "learning_rate": 0.0001562261362236729, "loss": 0.2303, "step": 1782 }, { "epoch": 0.6584194977843427, "grad_norm": 0.3140887916088104, "learning_rate": 0.00015620150264810938, "loss": 0.3731, "step": 1783 }, { "epoch": 0.6587887740029542, "grad_norm": 0.23740491271018982, "learning_rate": 0.0001561768690725459, "loss": 0.2617, "step": 1784 }, { "epoch": 0.6591580502215657, "grad_norm": 0.38925233483314514, "learning_rate": 0.00015615223549698238, "loss": 0.416, "step": 1785 }, { "epoch": 0.6595273264401772, "grad_norm": 0.2521016001701355, "learning_rate": 0.00015612760192141892, "loss": 0.2772, "step": 1786 }, { "epoch": 0.6598966026587888, "grad_norm": 0.2719894349575043, "learning_rate": 0.0001561029683458554, "loss": 0.309, "step": 1787 }, { "epoch": 0.6602658788774003, "grad_norm": 0.28100425004959106, "learning_rate": 0.00015607833477029192, "loss": 0.3362, "step": 1788 }, { "epoch": 0.6606351550960118, "grad_norm": 0.2446250170469284, "learning_rate": 0.0001560537011947284, "loss": 0.2647, "step": 1789 }, { "epoch": 0.6610044313146234, "grad_norm": 0.28099167346954346, "learning_rate": 0.00015602906761916493, "loss": 0.3005, "step": 1790 }, { "epoch": 0.6613737075332349, "grad_norm": 0.23360125720500946, "learning_rate": 0.00015600443404360144, "loss": 0.2445, "step": 1791 }, { "epoch": 0.6617429837518464, "grad_norm": 0.28586718440055847, "learning_rate": 0.00015597980046803796, "loss": 0.3461, "step": 1792 }, { "epoch": 0.6621122599704579, "grad_norm": 0.2865604758262634, "learning_rate": 0.00015595516689247444, "loss": 0.3345, "step": 1793 }, { "epoch": 0.6624815361890695, "grad_norm": 0.26880353689193726, "learning_rate": 0.00015593053331691096, "loss": 0.2974, "step": 1794 }, { "epoch": 0.662850812407681, "grad_norm": 0.24530526995658875, "learning_rate": 0.00015590589974134747, "loss": 0.3113, "step": 1795 }, { "epoch": 0.6632200886262924, "grad_norm": 0.29964113235473633, "learning_rate": 0.000155881266165784, "loss": 0.3239, "step": 1796 }, { "epoch": 0.6635893648449039, "grad_norm": 0.24047644436359406, "learning_rate": 0.00015585663259022047, "loss": 0.2807, "step": 1797 }, { "epoch": 0.6639586410635155, "grad_norm": 0.2512739896774292, "learning_rate": 0.000155831999014657, "loss": 0.3065, "step": 1798 }, { "epoch": 0.664327917282127, "grad_norm": 0.2920621633529663, "learning_rate": 0.00015580736543909348, "loss": 0.3676, "step": 1799 }, { "epoch": 0.6646971935007385, "grad_norm": 0.2532987892627716, "learning_rate": 0.00015578273186353002, "loss": 0.268, "step": 1800 }, { "epoch": 0.6646971935007385, "eval_loss": 0.2948981523513794, "eval_runtime": 5.8636, "eval_samples_per_second": 8.527, "eval_steps_per_second": 1.194, "step": 1800 }, { "epoch": 0.6650664697193501, "grad_norm": 0.3084225356578827, "learning_rate": 0.0001557580982879665, "loss": 0.3474, "step": 1801 }, { "epoch": 0.6654357459379616, "grad_norm": 0.2657549977302551, "learning_rate": 0.00015573346471240302, "loss": 0.3271, "step": 1802 }, { "epoch": 0.6658050221565731, "grad_norm": 0.29108577966690063, "learning_rate": 0.0001557088311368395, "loss": 0.3392, "step": 1803 }, { "epoch": 0.6661742983751846, "grad_norm": 0.26664069294929504, "learning_rate": 0.00015568419756127602, "loss": 0.3126, "step": 1804 }, { "epoch": 0.6665435745937962, "grad_norm": 0.2917342782020569, "learning_rate": 0.00015565956398571254, "loss": 0.3592, "step": 1805 }, { "epoch": 0.6669128508124077, "grad_norm": 0.2624189853668213, "learning_rate": 0.00015563493041014905, "loss": 0.297, "step": 1806 }, { "epoch": 0.6672821270310192, "grad_norm": 0.29515746235847473, "learning_rate": 0.00015561029683458554, "loss": 0.317, "step": 1807 }, { "epoch": 0.6676514032496307, "grad_norm": 0.3128635287284851, "learning_rate": 0.00015558566325902205, "loss": 0.3551, "step": 1808 }, { "epoch": 0.6680206794682423, "grad_norm": 0.2330590784549713, "learning_rate": 0.00015556102968345857, "loss": 0.2613, "step": 1809 }, { "epoch": 0.6683899556868538, "grad_norm": 0.2911098003387451, "learning_rate": 0.00015553639610789508, "loss": 0.3516, "step": 1810 }, { "epoch": 0.6687592319054653, "grad_norm": 0.3263610601425171, "learning_rate": 0.00015551176253233157, "loss": 0.3869, "step": 1811 }, { "epoch": 0.6691285081240768, "grad_norm": 0.32457396388053894, "learning_rate": 0.00015548712895676809, "loss": 0.407, "step": 1812 }, { "epoch": 0.6694977843426884, "grad_norm": 0.2424752414226532, "learning_rate": 0.0001554624953812046, "loss": 0.3011, "step": 1813 }, { "epoch": 0.6698670605612999, "grad_norm": 0.29370442032814026, "learning_rate": 0.00015543786180564111, "loss": 0.3433, "step": 1814 }, { "epoch": 0.6702363367799113, "grad_norm": 0.30061599612236023, "learning_rate": 0.0001554132282300776, "loss": 0.3303, "step": 1815 }, { "epoch": 0.670605612998523, "grad_norm": 0.2254580557346344, "learning_rate": 0.00015538859465451412, "loss": 0.2378, "step": 1816 }, { "epoch": 0.6709748892171344, "grad_norm": 0.25725165009498596, "learning_rate": 0.0001553639610789506, "loss": 0.2848, "step": 1817 }, { "epoch": 0.6713441654357459, "grad_norm": 0.27629512548446655, "learning_rate": 0.00015533932750338715, "loss": 0.3425, "step": 1818 }, { "epoch": 0.6717134416543574, "grad_norm": 0.27350887656211853, "learning_rate": 0.00015531469392782363, "loss": 0.3021, "step": 1819 }, { "epoch": 0.672082717872969, "grad_norm": 0.28401410579681396, "learning_rate": 0.00015529006035226015, "loss": 0.2829, "step": 1820 }, { "epoch": 0.6724519940915805, "grad_norm": 0.21475425362586975, "learning_rate": 0.00015526542677669664, "loss": 0.2587, "step": 1821 }, { "epoch": 0.672821270310192, "grad_norm": 0.28110092878341675, "learning_rate": 0.00015524079320113315, "loss": 0.342, "step": 1822 }, { "epoch": 0.6731905465288035, "grad_norm": 0.2703840136528015, "learning_rate": 0.00015521615962556967, "loss": 0.2876, "step": 1823 }, { "epoch": 0.6735598227474151, "grad_norm": 0.29484739899635315, "learning_rate": 0.00015519152605000618, "loss": 0.32, "step": 1824 }, { "epoch": 0.6739290989660266, "grad_norm": 0.2943943440914154, "learning_rate": 0.00015516689247444267, "loss": 0.3942, "step": 1825 }, { "epoch": 0.6742983751846381, "grad_norm": 0.24084927141666412, "learning_rate": 0.00015514225889887918, "loss": 0.2797, "step": 1826 }, { "epoch": 0.6746676514032496, "grad_norm": 0.2672373354434967, "learning_rate": 0.0001551176253233157, "loss": 0.345, "step": 1827 }, { "epoch": 0.6750369276218612, "grad_norm": 0.32839083671569824, "learning_rate": 0.0001550929917477522, "loss": 0.3829, "step": 1828 }, { "epoch": 0.6754062038404727, "grad_norm": 0.3239312767982483, "learning_rate": 0.0001550683581721887, "loss": 0.3548, "step": 1829 }, { "epoch": 0.6757754800590842, "grad_norm": 0.24574360251426697, "learning_rate": 0.00015504372459662521, "loss": 0.2696, "step": 1830 }, { "epoch": 0.6761447562776958, "grad_norm": 0.3067609965801239, "learning_rate": 0.0001550190910210617, "loss": 0.357, "step": 1831 }, { "epoch": 0.6765140324963073, "grad_norm": 0.2867920994758606, "learning_rate": 0.00015499445744549824, "loss": 0.3265, "step": 1832 }, { "epoch": 0.6768833087149188, "grad_norm": 0.23799139261245728, "learning_rate": 0.00015496982386993473, "loss": 0.3095, "step": 1833 }, { "epoch": 0.6772525849335302, "grad_norm": 0.3008800745010376, "learning_rate": 0.00015494519029437125, "loss": 0.3585, "step": 1834 }, { "epoch": 0.6776218611521418, "grad_norm": 0.2254391759634018, "learning_rate": 0.00015492055671880773, "loss": 0.2715, "step": 1835 }, { "epoch": 0.6779911373707533, "grad_norm": 0.2766897976398468, "learning_rate": 0.00015489592314324425, "loss": 0.2888, "step": 1836 }, { "epoch": 0.6783604135893648, "grad_norm": 0.25781330466270447, "learning_rate": 0.00015487128956768076, "loss": 0.2988, "step": 1837 }, { "epoch": 0.6787296898079763, "grad_norm": 0.2826957106590271, "learning_rate": 0.00015484665599211728, "loss": 0.3256, "step": 1838 }, { "epoch": 0.6790989660265879, "grad_norm": 0.32387325167655945, "learning_rate": 0.00015482202241655376, "loss": 0.3751, "step": 1839 }, { "epoch": 0.6794682422451994, "grad_norm": 0.23880648612976074, "learning_rate": 0.00015479738884099028, "loss": 0.256, "step": 1840 }, { "epoch": 0.6798375184638109, "grad_norm": 0.2831343710422516, "learning_rate": 0.0001547727552654268, "loss": 0.2981, "step": 1841 }, { "epoch": 0.6802067946824224, "grad_norm": 0.2651851177215576, "learning_rate": 0.0001547481216898633, "loss": 0.3318, "step": 1842 }, { "epoch": 0.680576070901034, "grad_norm": 0.28197112679481506, "learning_rate": 0.0001547234881142998, "loss": 0.3174, "step": 1843 }, { "epoch": 0.6809453471196455, "grad_norm": 0.30436182022094727, "learning_rate": 0.0001546988545387363, "loss": 0.3748, "step": 1844 }, { "epoch": 0.681314623338257, "grad_norm": 0.300368994474411, "learning_rate": 0.0001546742209631728, "loss": 0.3496, "step": 1845 }, { "epoch": 0.6816838995568686, "grad_norm": 0.2905513048171997, "learning_rate": 0.00015464958738760934, "loss": 0.3193, "step": 1846 }, { "epoch": 0.6820531757754801, "grad_norm": 0.3128775358200073, "learning_rate": 0.00015462495381204583, "loss": 0.3175, "step": 1847 }, { "epoch": 0.6824224519940916, "grad_norm": 0.31336894631385803, "learning_rate": 0.00015460032023648234, "loss": 0.359, "step": 1848 }, { "epoch": 0.6827917282127031, "grad_norm": 0.33619949221611023, "learning_rate": 0.00015457568666091883, "loss": 0.3467, "step": 1849 }, { "epoch": 0.6831610044313147, "grad_norm": 0.3280799090862274, "learning_rate": 0.00015455105308535537, "loss": 0.3398, "step": 1850 }, { "epoch": 0.6831610044313147, "eval_loss": 0.2983386218547821, "eval_runtime": 5.8536, "eval_samples_per_second": 8.542, "eval_steps_per_second": 1.196, "step": 1850 }, { "epoch": 0.6835302806499262, "grad_norm": 0.33054792881011963, "learning_rate": 0.00015452641950979186, "loss": 0.353, "step": 1851 }, { "epoch": 0.6838995568685377, "grad_norm": 0.2995906472206116, "learning_rate": 0.00015450178593422837, "loss": 0.3251, "step": 1852 }, { "epoch": 0.6842688330871491, "grad_norm": 0.2561552822589874, "learning_rate": 0.00015447715235866486, "loss": 0.2844, "step": 1853 }, { "epoch": 0.6846381093057607, "grad_norm": 0.22382104396820068, "learning_rate": 0.00015445251878310138, "loss": 0.2641, "step": 1854 }, { "epoch": 0.6850073855243722, "grad_norm": 0.2484665811061859, "learning_rate": 0.0001544278852075379, "loss": 0.2528, "step": 1855 }, { "epoch": 0.6853766617429837, "grad_norm": 0.29215025901794434, "learning_rate": 0.0001544032516319744, "loss": 0.3128, "step": 1856 }, { "epoch": 0.6857459379615952, "grad_norm": 0.29227593541145325, "learning_rate": 0.0001543786180564109, "loss": 0.3142, "step": 1857 }, { "epoch": 0.6861152141802068, "grad_norm": 0.32659927010536194, "learning_rate": 0.0001543539844808474, "loss": 0.3625, "step": 1858 }, { "epoch": 0.6864844903988183, "grad_norm": 0.37861353158950806, "learning_rate": 0.00015432935090528392, "loss": 0.3605, "step": 1859 }, { "epoch": 0.6868537666174298, "grad_norm": 0.2864838242530823, "learning_rate": 0.00015430471732972044, "loss": 0.2972, "step": 1860 }, { "epoch": 0.6872230428360414, "grad_norm": 0.37199172377586365, "learning_rate": 0.00015428008375415692, "loss": 0.4036, "step": 1861 }, { "epoch": 0.6875923190546529, "grad_norm": 0.3137976825237274, "learning_rate": 0.0001542554501785934, "loss": 0.3722, "step": 1862 }, { "epoch": 0.6879615952732644, "grad_norm": 0.2716263234615326, "learning_rate": 0.00015423081660302993, "loss": 0.3142, "step": 1863 }, { "epoch": 0.6883308714918759, "grad_norm": 0.25997471809387207, "learning_rate": 0.00015420618302746644, "loss": 0.2782, "step": 1864 }, { "epoch": 0.6887001477104875, "grad_norm": 0.2467002421617508, "learning_rate": 0.00015418154945190295, "loss": 0.2902, "step": 1865 }, { "epoch": 0.689069423929099, "grad_norm": 0.3040105700492859, "learning_rate": 0.00015415691587633944, "loss": 0.3599, "step": 1866 }, { "epoch": 0.6894387001477105, "grad_norm": 0.27652984857559204, "learning_rate": 0.00015413228230077596, "loss": 0.3071, "step": 1867 }, { "epoch": 0.689807976366322, "grad_norm": 0.31874772906303406, "learning_rate": 0.00015410764872521247, "loss": 0.3063, "step": 1868 }, { "epoch": 0.6901772525849336, "grad_norm": 0.3031354248523712, "learning_rate": 0.00015408301514964899, "loss": 0.3321, "step": 1869 }, { "epoch": 0.6905465288035451, "grad_norm": 0.27512142062187195, "learning_rate": 0.00015405838157408547, "loss": 0.316, "step": 1870 }, { "epoch": 0.6909158050221565, "grad_norm": 0.23150752484798431, "learning_rate": 0.000154033747998522, "loss": 0.2924, "step": 1871 }, { "epoch": 0.691285081240768, "grad_norm": 0.2737838327884674, "learning_rate": 0.00015400911442295848, "loss": 0.2814, "step": 1872 }, { "epoch": 0.6916543574593796, "grad_norm": 0.2643684446811676, "learning_rate": 0.00015398448084739502, "loss": 0.303, "step": 1873 }, { "epoch": 0.6920236336779911, "grad_norm": 0.42378824949264526, "learning_rate": 0.0001539598472718315, "loss": 0.3167, "step": 1874 }, { "epoch": 0.6923929098966026, "grad_norm": 0.2837834656238556, "learning_rate": 0.00015393521369626802, "loss": 0.2948, "step": 1875 }, { "epoch": 0.6927621861152142, "grad_norm": 0.24619752168655396, "learning_rate": 0.0001539105801207045, "loss": 0.3024, "step": 1876 }, { "epoch": 0.6931314623338257, "grad_norm": 0.24698737263679504, "learning_rate": 0.00015388594654514102, "loss": 0.2985, "step": 1877 }, { "epoch": 0.6935007385524372, "grad_norm": 0.3062925636768341, "learning_rate": 0.00015386131296957754, "loss": 0.3311, "step": 1878 }, { "epoch": 0.6938700147710487, "grad_norm": 0.2970742881298065, "learning_rate": 0.00015383667939401405, "loss": 0.3188, "step": 1879 }, { "epoch": 0.6942392909896603, "grad_norm": 0.2796284854412079, "learning_rate": 0.00015381204581845054, "loss": 0.3468, "step": 1880 }, { "epoch": 0.6946085672082718, "grad_norm": 0.278063029050827, "learning_rate": 0.00015378741224288705, "loss": 0.3119, "step": 1881 }, { "epoch": 0.6949778434268833, "grad_norm": 0.32742705941200256, "learning_rate": 0.00015376277866732357, "loss": 0.2612, "step": 1882 }, { "epoch": 0.6953471196454948, "grad_norm": 0.29229697585105896, "learning_rate": 0.00015373814509176008, "loss": 0.2931, "step": 1883 }, { "epoch": 0.6957163958641064, "grad_norm": 0.27059805393218994, "learning_rate": 0.00015371351151619657, "loss": 0.3153, "step": 1884 }, { "epoch": 0.6960856720827179, "grad_norm": 0.3111379146575928, "learning_rate": 0.00015368887794063309, "loss": 0.3611, "step": 1885 }, { "epoch": 0.6964549483013294, "grad_norm": 0.23783890902996063, "learning_rate": 0.0001536642443650696, "loss": 0.2944, "step": 1886 }, { "epoch": 0.696824224519941, "grad_norm": 0.24644704163074493, "learning_rate": 0.00015363961078950611, "loss": 0.2545, "step": 1887 }, { "epoch": 0.6971935007385525, "grad_norm": 0.28930196166038513, "learning_rate": 0.0001536149772139426, "loss": 0.324, "step": 1888 }, { "epoch": 0.697562776957164, "grad_norm": 0.4145079553127289, "learning_rate": 0.00015359034363837912, "loss": 0.4474, "step": 1889 }, { "epoch": 0.6979320531757754, "grad_norm": 0.3037989139556885, "learning_rate": 0.0001535657100628156, "loss": 0.3236, "step": 1890 }, { "epoch": 0.698301329394387, "grad_norm": 0.26441720128059387, "learning_rate": 0.00015354107648725215, "loss": 0.26, "step": 1891 }, { "epoch": 0.6986706056129985, "grad_norm": 0.23267091810703278, "learning_rate": 0.00015351644291168863, "loss": 0.2776, "step": 1892 }, { "epoch": 0.69903988183161, "grad_norm": 0.489742636680603, "learning_rate": 0.00015349180933612515, "loss": 0.3788, "step": 1893 }, { "epoch": 0.6994091580502215, "grad_norm": 0.32214441895484924, "learning_rate": 0.00015346717576056164, "loss": 0.3675, "step": 1894 }, { "epoch": 0.6997784342688331, "grad_norm": 0.29301875829696655, "learning_rate": 0.00015344254218499815, "loss": 0.3549, "step": 1895 }, { "epoch": 0.7001477104874446, "grad_norm": 0.2997570037841797, "learning_rate": 0.00015341790860943466, "loss": 0.2991, "step": 1896 }, { "epoch": 0.7005169867060561, "grad_norm": 0.30343782901763916, "learning_rate": 0.00015339327503387118, "loss": 0.3299, "step": 1897 }, { "epoch": 0.7008862629246676, "grad_norm": 0.3107844889163971, "learning_rate": 0.00015336864145830767, "loss": 0.3547, "step": 1898 }, { "epoch": 0.7012555391432792, "grad_norm": 0.26904937624931335, "learning_rate": 0.00015334400788274418, "loss": 0.3074, "step": 1899 }, { "epoch": 0.7016248153618907, "grad_norm": 0.3177255690097809, "learning_rate": 0.0001533193743071807, "loss": 0.3576, "step": 1900 }, { "epoch": 0.7016248153618907, "eval_loss": 0.29446089267730713, "eval_runtime": 5.8552, "eval_samples_per_second": 8.539, "eval_steps_per_second": 1.196, "step": 1900 }, { "epoch": 0.7019940915805022, "grad_norm": 0.20602965354919434, "learning_rate": 0.0001532947407316172, "loss": 0.2409, "step": 1901 }, { "epoch": 0.7023633677991138, "grad_norm": 0.290698379278183, "learning_rate": 0.0001532701071560537, "loss": 0.3272, "step": 1902 }, { "epoch": 0.7027326440177253, "grad_norm": 0.2832525670528412, "learning_rate": 0.0001532454735804902, "loss": 0.2954, "step": 1903 }, { "epoch": 0.7031019202363368, "grad_norm": 0.2994844913482666, "learning_rate": 0.0001532208400049267, "loss": 0.3127, "step": 1904 }, { "epoch": 0.7034711964549483, "grad_norm": 0.27429550886154175, "learning_rate": 0.00015319620642936324, "loss": 0.3643, "step": 1905 }, { "epoch": 0.7038404726735599, "grad_norm": 0.2514464855194092, "learning_rate": 0.00015317157285379973, "loss": 0.2824, "step": 1906 }, { "epoch": 0.7042097488921714, "grad_norm": 0.2964741289615631, "learning_rate": 0.00015314693927823624, "loss": 0.3267, "step": 1907 }, { "epoch": 0.7045790251107829, "grad_norm": 0.2951356768608093, "learning_rate": 0.00015312230570267273, "loss": 0.3232, "step": 1908 }, { "epoch": 0.7049483013293943, "grad_norm": 0.2574373781681061, "learning_rate": 0.00015309767212710925, "loss": 0.3239, "step": 1909 }, { "epoch": 0.705317577548006, "grad_norm": 0.24858419597148895, "learning_rate": 0.00015307303855154576, "loss": 0.2769, "step": 1910 }, { "epoch": 0.7056868537666174, "grad_norm": 0.31815093755722046, "learning_rate": 0.00015304840497598228, "loss": 0.3433, "step": 1911 }, { "epoch": 0.7060561299852289, "grad_norm": 0.32545092701911926, "learning_rate": 0.00015302377140041876, "loss": 0.2926, "step": 1912 }, { "epoch": 0.7064254062038404, "grad_norm": 0.25615841150283813, "learning_rate": 0.00015299913782485528, "loss": 0.3072, "step": 1913 }, { "epoch": 0.706794682422452, "grad_norm": 0.2847903072834015, "learning_rate": 0.0001529745042492918, "loss": 0.3794, "step": 1914 }, { "epoch": 0.7071639586410635, "grad_norm": 0.2633518576622009, "learning_rate": 0.0001529498706737283, "loss": 0.2807, "step": 1915 }, { "epoch": 0.707533234859675, "grad_norm": 0.28141912817955017, "learning_rate": 0.0001529252370981648, "loss": 0.3122, "step": 1916 }, { "epoch": 0.7079025110782866, "grad_norm": 0.20738820731639862, "learning_rate": 0.0001529006035226013, "loss": 0.1951, "step": 1917 }, { "epoch": 0.7082717872968981, "grad_norm": 0.29588785767555237, "learning_rate": 0.00015287596994703782, "loss": 0.3424, "step": 1918 }, { "epoch": 0.7086410635155096, "grad_norm": 0.27013230323791504, "learning_rate": 0.00015285133637147434, "loss": 0.3418, "step": 1919 }, { "epoch": 0.7090103397341211, "grad_norm": 0.2913917005062103, "learning_rate": 0.00015282670279591083, "loss": 0.3648, "step": 1920 }, { "epoch": 0.7093796159527327, "grad_norm": 0.28088995814323425, "learning_rate": 0.00015280206922034734, "loss": 0.2865, "step": 1921 }, { "epoch": 0.7097488921713442, "grad_norm": 0.3325832486152649, "learning_rate": 0.00015277743564478383, "loss": 0.403, "step": 1922 }, { "epoch": 0.7101181683899557, "grad_norm": 0.2513841986656189, "learning_rate": 0.00015275280206922037, "loss": 0.3073, "step": 1923 }, { "epoch": 0.7104874446085672, "grad_norm": 0.2998408079147339, "learning_rate": 0.00015272816849365686, "loss": 0.3883, "step": 1924 }, { "epoch": 0.7108567208271788, "grad_norm": 0.2626917362213135, "learning_rate": 0.00015270353491809337, "loss": 0.3203, "step": 1925 }, { "epoch": 0.7112259970457903, "grad_norm": 0.222730815410614, "learning_rate": 0.00015267890134252986, "loss": 0.239, "step": 1926 }, { "epoch": 0.7115952732644018, "grad_norm": 0.31508567929267883, "learning_rate": 0.00015265426776696637, "loss": 0.3203, "step": 1927 }, { "epoch": 0.7119645494830132, "grad_norm": 0.23665258288383484, "learning_rate": 0.0001526296341914029, "loss": 0.3118, "step": 1928 }, { "epoch": 0.7123338257016248, "grad_norm": 0.2527763545513153, "learning_rate": 0.0001526050006158394, "loss": 0.2807, "step": 1929 }, { "epoch": 0.7127031019202363, "grad_norm": 0.234427347779274, "learning_rate": 0.0001525803670402759, "loss": 0.2415, "step": 1930 }, { "epoch": 0.7130723781388478, "grad_norm": 0.23413021862506866, "learning_rate": 0.0001525557334647124, "loss": 0.2821, "step": 1931 }, { "epoch": 0.7134416543574594, "grad_norm": 0.2665095329284668, "learning_rate": 0.00015253109988914892, "loss": 0.3385, "step": 1932 }, { "epoch": 0.7138109305760709, "grad_norm": 0.2771255373954773, "learning_rate": 0.00015250646631358544, "loss": 0.3215, "step": 1933 }, { "epoch": 0.7141802067946824, "grad_norm": 0.26301082968711853, "learning_rate": 0.00015248183273802192, "loss": 0.2928, "step": 1934 }, { "epoch": 0.7145494830132939, "grad_norm": 0.3336317539215088, "learning_rate": 0.00015245719916245844, "loss": 0.3703, "step": 1935 }, { "epoch": 0.7149187592319055, "grad_norm": 0.3198574483394623, "learning_rate": 0.00015243256558689493, "loss": 0.2901, "step": 1936 }, { "epoch": 0.715288035450517, "grad_norm": 0.2950039803981781, "learning_rate": 0.00015240793201133147, "loss": 0.3221, "step": 1937 }, { "epoch": 0.7156573116691285, "grad_norm": 0.25150543451309204, "learning_rate": 0.00015238329843576795, "loss": 0.3088, "step": 1938 }, { "epoch": 0.71602658788774, "grad_norm": 0.25831344723701477, "learning_rate": 0.00015235866486020447, "loss": 0.3233, "step": 1939 }, { "epoch": 0.7163958641063516, "grad_norm": 0.2541324496269226, "learning_rate": 0.00015233403128464096, "loss": 0.2441, "step": 1940 }, { "epoch": 0.7167651403249631, "grad_norm": 0.4397067725658417, "learning_rate": 0.00015230939770907747, "loss": 0.315, "step": 1941 }, { "epoch": 0.7171344165435746, "grad_norm": 0.3101305067539215, "learning_rate": 0.00015228476413351399, "loss": 0.3353, "step": 1942 }, { "epoch": 0.7175036927621861, "grad_norm": 0.23744426667690277, "learning_rate": 0.0001522601305579505, "loss": 0.2855, "step": 1943 }, { "epoch": 0.7178729689807977, "grad_norm": 0.267407089471817, "learning_rate": 0.000152235496982387, "loss": 0.2775, "step": 1944 }, { "epoch": 0.7182422451994092, "grad_norm": 0.2828403115272522, "learning_rate": 0.0001522108634068235, "loss": 0.2896, "step": 1945 }, { "epoch": 0.7186115214180206, "grad_norm": 0.25437045097351074, "learning_rate": 0.00015218622983126002, "loss": 0.2654, "step": 1946 }, { "epoch": 0.7189807976366323, "grad_norm": 0.22928428649902344, "learning_rate": 0.00015216159625569653, "loss": 0.2482, "step": 1947 }, { "epoch": 0.7193500738552437, "grad_norm": 0.24921920895576477, "learning_rate": 0.00015213696268013302, "loss": 0.2982, "step": 1948 }, { "epoch": 0.7197193500738552, "grad_norm": 0.27868038415908813, "learning_rate": 0.00015211232910456953, "loss": 0.258, "step": 1949 }, { "epoch": 0.7200886262924667, "grad_norm": 0.372651606798172, "learning_rate": 0.00015208769552900605, "loss": 0.3602, "step": 1950 }, { "epoch": 0.7200886262924667, "eval_loss": 0.29376015067100525, "eval_runtime": 5.8538, "eval_samples_per_second": 8.541, "eval_steps_per_second": 1.196, "step": 1950 }, { "epoch": 0.7204579025110783, "grad_norm": 0.27268344163894653, "learning_rate": 0.00015206306195344256, "loss": 0.3007, "step": 1951 }, { "epoch": 0.7208271787296898, "grad_norm": 0.33668243885040283, "learning_rate": 0.00015203842837787905, "loss": 0.3828, "step": 1952 }, { "epoch": 0.7211964549483013, "grad_norm": 0.2558956444263458, "learning_rate": 0.00015201379480231557, "loss": 0.2787, "step": 1953 }, { "epoch": 0.7215657311669128, "grad_norm": 0.25845855474472046, "learning_rate": 0.00015198916122675205, "loss": 0.3092, "step": 1954 }, { "epoch": 0.7219350073855244, "grad_norm": 0.2578001320362091, "learning_rate": 0.0001519645276511886, "loss": 0.302, "step": 1955 }, { "epoch": 0.7223042836041359, "grad_norm": 0.30225417017936707, "learning_rate": 0.00015193989407562508, "loss": 0.3521, "step": 1956 }, { "epoch": 0.7226735598227474, "grad_norm": 0.23431427776813507, "learning_rate": 0.0001519152605000616, "loss": 0.2989, "step": 1957 }, { "epoch": 0.7230428360413589, "grad_norm": 0.269161581993103, "learning_rate": 0.00015189062692449808, "loss": 0.2951, "step": 1958 }, { "epoch": 0.7234121122599705, "grad_norm": 0.26195868849754333, "learning_rate": 0.0001518659933489346, "loss": 0.2539, "step": 1959 }, { "epoch": 0.723781388478582, "grad_norm": 0.2628759443759918, "learning_rate": 0.00015184135977337111, "loss": 0.3122, "step": 1960 }, { "epoch": 0.7241506646971935, "grad_norm": 0.22648142278194427, "learning_rate": 0.00015181672619780763, "loss": 0.2497, "step": 1961 }, { "epoch": 0.7245199409158051, "grad_norm": 0.29775509238243103, "learning_rate": 0.00015179209262224412, "loss": 0.3033, "step": 1962 }, { "epoch": 0.7248892171344166, "grad_norm": 0.24567222595214844, "learning_rate": 0.00015176745904668063, "loss": 0.3178, "step": 1963 }, { "epoch": 0.725258493353028, "grad_norm": 0.2651590406894684, "learning_rate": 0.00015174282547111715, "loss": 0.3034, "step": 1964 }, { "epoch": 0.7256277695716395, "grad_norm": 0.24350565671920776, "learning_rate": 0.00015171819189555366, "loss": 0.2778, "step": 1965 }, { "epoch": 0.7259970457902511, "grad_norm": 0.2624291777610779, "learning_rate": 0.00015169355831999015, "loss": 0.2585, "step": 1966 }, { "epoch": 0.7263663220088626, "grad_norm": 0.28388434648513794, "learning_rate": 0.00015166892474442666, "loss": 0.2926, "step": 1967 }, { "epoch": 0.7267355982274741, "grad_norm": 0.29764947295188904, "learning_rate": 0.00015164429116886315, "loss": 0.3645, "step": 1968 }, { "epoch": 0.7271048744460856, "grad_norm": 0.25831013917922974, "learning_rate": 0.0001516196575932997, "loss": 0.3071, "step": 1969 }, { "epoch": 0.7274741506646972, "grad_norm": 0.3143414258956909, "learning_rate": 0.00015159502401773618, "loss": 0.3491, "step": 1970 }, { "epoch": 0.7278434268833087, "grad_norm": 0.2558245062828064, "learning_rate": 0.0001515703904421727, "loss": 0.2666, "step": 1971 }, { "epoch": 0.7282127031019202, "grad_norm": 0.2600441873073578, "learning_rate": 0.00015154575686660918, "loss": 0.289, "step": 1972 }, { "epoch": 0.7285819793205317, "grad_norm": 0.32279205322265625, "learning_rate": 0.0001515211232910457, "loss": 0.3442, "step": 1973 }, { "epoch": 0.7289512555391433, "grad_norm": 0.2380223423242569, "learning_rate": 0.0001514964897154822, "loss": 0.2647, "step": 1974 }, { "epoch": 0.7293205317577548, "grad_norm": 0.2663707435131073, "learning_rate": 0.00015147185613991873, "loss": 0.3083, "step": 1975 }, { "epoch": 0.7296898079763663, "grad_norm": 0.248251274228096, "learning_rate": 0.0001514472225643552, "loss": 0.2856, "step": 1976 }, { "epoch": 0.7300590841949779, "grad_norm": 0.28036782145500183, "learning_rate": 0.00015142258898879173, "loss": 0.3426, "step": 1977 }, { "epoch": 0.7304283604135894, "grad_norm": 0.30917128920555115, "learning_rate": 0.00015139795541322824, "loss": 0.3438, "step": 1978 }, { "epoch": 0.7307976366322009, "grad_norm": 0.2681553363800049, "learning_rate": 0.00015137332183766476, "loss": 0.3099, "step": 1979 }, { "epoch": 0.7311669128508124, "grad_norm": 0.2959456443786621, "learning_rate": 0.00015134868826210124, "loss": 0.2986, "step": 1980 }, { "epoch": 0.731536189069424, "grad_norm": 0.2918236255645752, "learning_rate": 0.00015132405468653776, "loss": 0.3043, "step": 1981 }, { "epoch": 0.7319054652880355, "grad_norm": 0.2856104075908661, "learning_rate": 0.00015129942111097427, "loss": 0.3195, "step": 1982 }, { "epoch": 0.732274741506647, "grad_norm": 0.33086833357810974, "learning_rate": 0.0001512747875354108, "loss": 0.4175, "step": 1983 }, { "epoch": 0.7326440177252584, "grad_norm": 0.2300816923379898, "learning_rate": 0.00015125015395984728, "loss": 0.2974, "step": 1984 }, { "epoch": 0.73301329394387, "grad_norm": 0.3335312604904175, "learning_rate": 0.0001512255203842838, "loss": 0.2704, "step": 1985 }, { "epoch": 0.7333825701624815, "grad_norm": 0.26959389448165894, "learning_rate": 0.00015120088680872028, "loss": 0.3085, "step": 1986 }, { "epoch": 0.733751846381093, "grad_norm": 0.29761821031570435, "learning_rate": 0.00015117625323315682, "loss": 0.3592, "step": 1987 }, { "epoch": 0.7341211225997046, "grad_norm": 0.2845323085784912, "learning_rate": 0.0001511516196575933, "loss": 0.3228, "step": 1988 }, { "epoch": 0.7344903988183161, "grad_norm": 0.29670780897140503, "learning_rate": 0.00015112698608202982, "loss": 0.3098, "step": 1989 }, { "epoch": 0.7348596750369276, "grad_norm": 0.2633662819862366, "learning_rate": 0.0001511023525064663, "loss": 0.2927, "step": 1990 }, { "epoch": 0.7352289512555391, "grad_norm": 0.2808889150619507, "learning_rate": 0.00015107771893090282, "loss": 0.2659, "step": 1991 }, { "epoch": 0.7355982274741507, "grad_norm": 0.2930167019367218, "learning_rate": 0.00015105308535533934, "loss": 0.3096, "step": 1992 }, { "epoch": 0.7359675036927622, "grad_norm": 0.2737327218055725, "learning_rate": 0.00015102845177977585, "loss": 0.3401, "step": 1993 }, { "epoch": 0.7363367799113737, "grad_norm": 0.30987557768821716, "learning_rate": 0.00015100381820421234, "loss": 0.3673, "step": 1994 }, { "epoch": 0.7367060561299852, "grad_norm": 0.2720504105091095, "learning_rate": 0.00015097918462864886, "loss": 0.2611, "step": 1995 }, { "epoch": 0.7370753323485968, "grad_norm": 0.26491352915763855, "learning_rate": 0.00015095455105308537, "loss": 0.2601, "step": 1996 }, { "epoch": 0.7374446085672083, "grad_norm": 0.2890115976333618, "learning_rate": 0.00015092991747752188, "loss": 0.2759, "step": 1997 }, { "epoch": 0.7378138847858198, "grad_norm": 0.26483097672462463, "learning_rate": 0.00015090528390195837, "loss": 0.2952, "step": 1998 }, { "epoch": 0.7381831610044313, "grad_norm": 0.4067525565624237, "learning_rate": 0.0001508806503263949, "loss": 0.3835, "step": 1999 }, { "epoch": 0.7385524372230429, "grad_norm": 0.25175902247428894, "learning_rate": 0.00015085601675083137, "loss": 0.3223, "step": 2000 }, { "epoch": 0.7385524372230429, "eval_loss": 0.29646578431129456, "eval_runtime": 5.8534, "eval_samples_per_second": 8.542, "eval_steps_per_second": 1.196, "step": 2000 }, { "epoch": 0.7389217134416544, "grad_norm": 0.23213210701942444, "learning_rate": 0.00015083138317526792, "loss": 0.2536, "step": 2001 }, { "epoch": 0.7392909896602659, "grad_norm": 0.3151390552520752, "learning_rate": 0.0001508067495997044, "loss": 0.3286, "step": 2002 }, { "epoch": 0.7396602658788775, "grad_norm": 0.30138006806373596, "learning_rate": 0.00015078211602414092, "loss": 0.345, "step": 2003 }, { "epoch": 0.740029542097489, "grad_norm": 0.3060076832771301, "learning_rate": 0.0001507574824485774, "loss": 0.3118, "step": 2004 }, { "epoch": 0.7403988183161004, "grad_norm": 0.26108142733573914, "learning_rate": 0.00015073284887301392, "loss": 0.281, "step": 2005 }, { "epoch": 0.7407680945347119, "grad_norm": 0.23906740546226501, "learning_rate": 0.00015070821529745043, "loss": 0.3418, "step": 2006 }, { "epoch": 0.7411373707533235, "grad_norm": 0.26794371008872986, "learning_rate": 0.00015068358172188695, "loss": 0.2954, "step": 2007 }, { "epoch": 0.741506646971935, "grad_norm": 0.2571132481098175, "learning_rate": 0.00015065894814632344, "loss": 0.2705, "step": 2008 }, { "epoch": 0.7418759231905465, "grad_norm": 0.38293299078941345, "learning_rate": 0.00015063431457075995, "loss": 0.3252, "step": 2009 }, { "epoch": 0.742245199409158, "grad_norm": 0.22673162817955017, "learning_rate": 0.00015060968099519647, "loss": 0.2243, "step": 2010 }, { "epoch": 0.7426144756277696, "grad_norm": 0.286089152097702, "learning_rate": 0.00015058504741963298, "loss": 0.2674, "step": 2011 }, { "epoch": 0.7429837518463811, "grad_norm": 0.25819486379623413, "learning_rate": 0.00015056041384406947, "loss": 0.3055, "step": 2012 }, { "epoch": 0.7433530280649926, "grad_norm": 0.24408473074436188, "learning_rate": 0.00015053578026850598, "loss": 0.2664, "step": 2013 }, { "epoch": 0.7437223042836041, "grad_norm": 0.2625264823436737, "learning_rate": 0.00015051114669294247, "loss": 0.3595, "step": 2014 }, { "epoch": 0.7440915805022157, "grad_norm": 0.2610447108745575, "learning_rate": 0.000150486513117379, "loss": 0.3264, "step": 2015 }, { "epoch": 0.7444608567208272, "grad_norm": 0.2926216721534729, "learning_rate": 0.0001504618795418155, "loss": 0.3684, "step": 2016 }, { "epoch": 0.7448301329394387, "grad_norm": 0.2876848876476288, "learning_rate": 0.00015043724596625201, "loss": 0.2529, "step": 2017 }, { "epoch": 0.7451994091580503, "grad_norm": 0.31420740485191345, "learning_rate": 0.0001504126123906885, "loss": 0.3609, "step": 2018 }, { "epoch": 0.7455686853766618, "grad_norm": 0.25228697061538696, "learning_rate": 0.00015038797881512502, "loss": 0.2823, "step": 2019 }, { "epoch": 0.7459379615952733, "grad_norm": 0.3094099462032318, "learning_rate": 0.00015036334523956153, "loss": 0.3351, "step": 2020 }, { "epoch": 0.7463072378138847, "grad_norm": 0.287903755903244, "learning_rate": 0.00015033871166399805, "loss": 0.3111, "step": 2021 }, { "epoch": 0.7466765140324964, "grad_norm": 0.3031036853790283, "learning_rate": 0.00015031407808843453, "loss": 0.3174, "step": 2022 }, { "epoch": 0.7470457902511078, "grad_norm": 0.21436475217342377, "learning_rate": 0.00015028944451287105, "loss": 0.2926, "step": 2023 }, { "epoch": 0.7474150664697193, "grad_norm": 0.3065086305141449, "learning_rate": 0.00015026481093730756, "loss": 0.3138, "step": 2024 }, { "epoch": 0.7477843426883308, "grad_norm": 0.25306811928749084, "learning_rate": 0.00015024017736174408, "loss": 0.2447, "step": 2025 }, { "epoch": 0.7481536189069424, "grad_norm": 0.2480946034193039, "learning_rate": 0.00015021554378618057, "loss": 0.2867, "step": 2026 }, { "epoch": 0.7485228951255539, "grad_norm": 0.3200647830963135, "learning_rate": 0.00015019091021061708, "loss": 0.3645, "step": 2027 }, { "epoch": 0.7488921713441654, "grad_norm": 0.25502264499664307, "learning_rate": 0.0001501662766350536, "loss": 0.2771, "step": 2028 }, { "epoch": 0.7492614475627769, "grad_norm": 0.2702923119068146, "learning_rate": 0.0001501416430594901, "loss": 0.287, "step": 2029 }, { "epoch": 0.7496307237813885, "grad_norm": 0.30667737126350403, "learning_rate": 0.0001501170094839266, "loss": 0.3356, "step": 2030 }, { "epoch": 0.75, "grad_norm": 0.26852285861968994, "learning_rate": 0.0001500923759083631, "loss": 0.3264, "step": 2031 }, { "epoch": 0.7503692762186115, "grad_norm": 0.2851296663284302, "learning_rate": 0.0001500677423327996, "loss": 0.359, "step": 2032 }, { "epoch": 0.7507385524372231, "grad_norm": 0.2569027841091156, "learning_rate": 0.00015004310875723614, "loss": 0.2933, "step": 2033 }, { "epoch": 0.7511078286558346, "grad_norm": 0.2416686713695526, "learning_rate": 0.00015001847518167263, "loss": 0.2826, "step": 2034 }, { "epoch": 0.7514771048744461, "grad_norm": 0.255740225315094, "learning_rate": 0.00014999384160610914, "loss": 0.3414, "step": 2035 }, { "epoch": 0.7518463810930576, "grad_norm": 0.28471001982688904, "learning_rate": 0.00014996920803054563, "loss": 0.3215, "step": 2036 }, { "epoch": 0.7522156573116692, "grad_norm": 0.21029578149318695, "learning_rate": 0.00014994457445498214, "loss": 0.1987, "step": 2037 }, { "epoch": 0.7525849335302807, "grad_norm": 0.23480457067489624, "learning_rate": 0.00014991994087941866, "loss": 0.2928, "step": 2038 }, { "epoch": 0.7529542097488922, "grad_norm": 0.3157004117965698, "learning_rate": 0.00014989530730385517, "loss": 0.3403, "step": 2039 }, { "epoch": 0.7533234859675036, "grad_norm": 0.23648284375667572, "learning_rate": 0.00014987067372829166, "loss": 0.288, "step": 2040 }, { "epoch": 0.7536927621861153, "grad_norm": 0.3547457754611969, "learning_rate": 0.00014984604015272818, "loss": 0.4128, "step": 2041 }, { "epoch": 0.7540620384047267, "grad_norm": 0.32118773460388184, "learning_rate": 0.0001498214065771647, "loss": 0.3829, "step": 2042 }, { "epoch": 0.7544313146233382, "grad_norm": 0.2910565137863159, "learning_rate": 0.0001497967730016012, "loss": 0.3197, "step": 2043 }, { "epoch": 0.7548005908419497, "grad_norm": 0.24634191393852234, "learning_rate": 0.0001497721394260377, "loss": 0.2618, "step": 2044 }, { "epoch": 0.7551698670605613, "grad_norm": 0.3879796862602234, "learning_rate": 0.0001497475058504742, "loss": 0.3678, "step": 2045 }, { "epoch": 0.7555391432791728, "grad_norm": 0.34548044204711914, "learning_rate": 0.0001497228722749107, "loss": 0.4285, "step": 2046 }, { "epoch": 0.7559084194977843, "grad_norm": 0.298462450504303, "learning_rate": 0.00014969823869934724, "loss": 0.36, "step": 2047 }, { "epoch": 0.7562776957163959, "grad_norm": 0.2912174165248871, "learning_rate": 0.00014967360512378372, "loss": 0.3545, "step": 2048 }, { "epoch": 0.7566469719350074, "grad_norm": 0.3759162724018097, "learning_rate": 0.00014964897154822024, "loss": 0.3304, "step": 2049 }, { "epoch": 0.7570162481536189, "grad_norm": 0.28800928592681885, "learning_rate": 0.00014962433797265673, "loss": 0.345, "step": 2050 }, { "epoch": 0.7570162481536189, "eval_loss": 0.2972644567489624, "eval_runtime": 5.8587, "eval_samples_per_second": 8.534, "eval_steps_per_second": 1.195, "step": 2050 }, { "epoch": 0.7573855243722304, "grad_norm": 0.23943784832954407, "learning_rate": 0.00014959970439709324, "loss": 0.2881, "step": 2051 }, { "epoch": 0.757754800590842, "grad_norm": 0.30913013219833374, "learning_rate": 0.00014957507082152976, "loss": 0.3886, "step": 2052 }, { "epoch": 0.7581240768094535, "grad_norm": 0.3367602527141571, "learning_rate": 0.00014955043724596627, "loss": 0.4031, "step": 2053 }, { "epoch": 0.758493353028065, "grad_norm": 0.2651554048061371, "learning_rate": 0.00014952580367040276, "loss": 0.263, "step": 2054 }, { "epoch": 0.7588626292466765, "grad_norm": 0.3087141215801239, "learning_rate": 0.00014950117009483927, "loss": 0.3262, "step": 2055 }, { "epoch": 0.7592319054652881, "grad_norm": 0.30410876870155334, "learning_rate": 0.0001494765365192758, "loss": 0.324, "step": 2056 }, { "epoch": 0.7596011816838996, "grad_norm": 0.31383657455444336, "learning_rate": 0.0001494519029437123, "loss": 0.3655, "step": 2057 }, { "epoch": 0.759970457902511, "grad_norm": 0.23047925531864166, "learning_rate": 0.0001494272693681488, "loss": 0.2653, "step": 2058 }, { "epoch": 0.7603397341211225, "grad_norm": 0.29686346650123596, "learning_rate": 0.0001494026357925853, "loss": 0.3427, "step": 2059 }, { "epoch": 0.7607090103397341, "grad_norm": 0.23575901985168457, "learning_rate": 0.00014937800221702182, "loss": 0.2934, "step": 2060 }, { "epoch": 0.7610782865583456, "grad_norm": 0.29045569896698, "learning_rate": 0.00014935336864145833, "loss": 0.3784, "step": 2061 }, { "epoch": 0.7614475627769571, "grad_norm": 0.22907930612564087, "learning_rate": 0.00014932873506589482, "loss": 0.2935, "step": 2062 }, { "epoch": 0.7618168389955687, "grad_norm": 0.28066712617874146, "learning_rate": 0.00014930410149033134, "loss": 0.3231, "step": 2063 }, { "epoch": 0.7621861152141802, "grad_norm": 0.30530381202697754, "learning_rate": 0.00014927946791476782, "loss": 0.2737, "step": 2064 }, { "epoch": 0.7625553914327917, "grad_norm": 0.286344051361084, "learning_rate": 0.00014925483433920436, "loss": 0.3369, "step": 2065 }, { "epoch": 0.7629246676514032, "grad_norm": 0.20019960403442383, "learning_rate": 0.00014923020076364085, "loss": 0.2157, "step": 2066 }, { "epoch": 0.7632939438700148, "grad_norm": 0.30169007182121277, "learning_rate": 0.00014920556718807737, "loss": 0.3248, "step": 2067 }, { "epoch": 0.7636632200886263, "grad_norm": 0.24721257388591766, "learning_rate": 0.00014918093361251385, "loss": 0.301, "step": 2068 }, { "epoch": 0.7640324963072378, "grad_norm": 0.25720271468162537, "learning_rate": 0.00014915630003695037, "loss": 0.2868, "step": 2069 }, { "epoch": 0.7644017725258493, "grad_norm": 0.26444628834724426, "learning_rate": 0.00014913166646138688, "loss": 0.3082, "step": 2070 }, { "epoch": 0.7647710487444609, "grad_norm": 0.2681595981121063, "learning_rate": 0.0001491070328858234, "loss": 0.2994, "step": 2071 }, { "epoch": 0.7651403249630724, "grad_norm": 0.27749788761138916, "learning_rate": 0.00014908239931025989, "loss": 0.3161, "step": 2072 }, { "epoch": 0.7655096011816839, "grad_norm": 0.28634199500083923, "learning_rate": 0.0001490577657346964, "loss": 0.3453, "step": 2073 }, { "epoch": 0.7658788774002954, "grad_norm": 0.24624323844909668, "learning_rate": 0.00014903313215913292, "loss": 0.2868, "step": 2074 }, { "epoch": 0.766248153618907, "grad_norm": 0.27104800939559937, "learning_rate": 0.00014900849858356943, "loss": 0.2837, "step": 2075 }, { "epoch": 0.7666174298375185, "grad_norm": 0.2961234152317047, "learning_rate": 0.00014898386500800592, "loss": 0.308, "step": 2076 }, { "epoch": 0.76698670605613, "grad_norm": 0.2492390125989914, "learning_rate": 0.00014895923143244243, "loss": 0.2736, "step": 2077 }, { "epoch": 0.7673559822747416, "grad_norm": 0.3363376557826996, "learning_rate": 0.00014893459785687892, "loss": 0.3132, "step": 2078 }, { "epoch": 0.767725258493353, "grad_norm": 0.41437986493110657, "learning_rate": 0.00014890996428131546, "loss": 0.3341, "step": 2079 }, { "epoch": 0.7680945347119645, "grad_norm": 0.3183686435222626, "learning_rate": 0.00014888533070575195, "loss": 0.376, "step": 2080 }, { "epoch": 0.768463810930576, "grad_norm": 0.2770771384239197, "learning_rate": 0.00014886069713018846, "loss": 0.3033, "step": 2081 }, { "epoch": 0.7688330871491876, "grad_norm": 0.3003086447715759, "learning_rate": 0.00014883606355462495, "loss": 0.3725, "step": 2082 }, { "epoch": 0.7692023633677991, "grad_norm": 0.2913537323474884, "learning_rate": 0.00014881142997906147, "loss": 0.3242, "step": 2083 }, { "epoch": 0.7695716395864106, "grad_norm": 0.31694701313972473, "learning_rate": 0.00014878679640349798, "loss": 0.3658, "step": 2084 }, { "epoch": 0.7699409158050221, "grad_norm": 0.2636033594608307, "learning_rate": 0.0001487621628279345, "loss": 0.2862, "step": 2085 }, { "epoch": 0.7703101920236337, "grad_norm": 0.3004938066005707, "learning_rate": 0.00014873752925237098, "loss": 0.2993, "step": 2086 }, { "epoch": 0.7706794682422452, "grad_norm": 0.31371644139289856, "learning_rate": 0.0001487128956768075, "loss": 0.3697, "step": 2087 }, { "epoch": 0.7710487444608567, "grad_norm": 0.30200207233428955, "learning_rate": 0.000148688262101244, "loss": 0.3745, "step": 2088 }, { "epoch": 0.7714180206794683, "grad_norm": 0.29897138476371765, "learning_rate": 0.00014866362852568053, "loss": 0.3078, "step": 2089 }, { "epoch": 0.7717872968980798, "grad_norm": 0.2919757664203644, "learning_rate": 0.00014863899495011701, "loss": 0.3105, "step": 2090 }, { "epoch": 0.7721565731166913, "grad_norm": 0.31833651661872864, "learning_rate": 0.00014861436137455353, "loss": 0.451, "step": 2091 }, { "epoch": 0.7725258493353028, "grad_norm": 0.2824268043041229, "learning_rate": 0.00014858972779899004, "loss": 0.3029, "step": 2092 }, { "epoch": 0.7728951255539144, "grad_norm": 0.37862062454223633, "learning_rate": 0.00014856509422342653, "loss": 0.3462, "step": 2093 }, { "epoch": 0.7732644017725259, "grad_norm": 0.2715948522090912, "learning_rate": 0.00014854046064786305, "loss": 0.2842, "step": 2094 }, { "epoch": 0.7736336779911374, "grad_norm": 0.2682742476463318, "learning_rate": 0.00014851582707229953, "loss": 0.3047, "step": 2095 }, { "epoch": 0.7740029542097489, "grad_norm": 0.28688040375709534, "learning_rate": 0.00014849119349673605, "loss": 0.3445, "step": 2096 }, { "epoch": 0.7743722304283605, "grad_norm": 0.29540103673934937, "learning_rate": 0.00014846655992117256, "loss": 0.3515, "step": 2097 }, { "epoch": 0.774741506646972, "grad_norm": 0.2762974798679352, "learning_rate": 0.00014844192634560908, "loss": 0.3373, "step": 2098 }, { "epoch": 0.7751107828655834, "grad_norm": 0.2746526896953583, "learning_rate": 0.00014841729277004556, "loss": 0.2879, "step": 2099 }, { "epoch": 0.7754800590841949, "grad_norm": 0.2964654862880707, "learning_rate": 0.00014839265919448208, "loss": 0.3493, "step": 2100 }, { "epoch": 0.7754800590841949, "eval_loss": 0.2935633361339569, "eval_runtime": 5.8653, "eval_samples_per_second": 8.525, "eval_steps_per_second": 1.193, "step": 2100 }, { "epoch": 0.7758493353028065, "grad_norm": 0.34667137265205383, "learning_rate": 0.0001483680256189186, "loss": 0.2905, "step": 2101 }, { "epoch": 0.776218611521418, "grad_norm": 0.30348148941993713, "learning_rate": 0.0001483433920433551, "loss": 0.3285, "step": 2102 }, { "epoch": 0.7765878877400295, "grad_norm": 0.2987040877342224, "learning_rate": 0.0001483187584677916, "loss": 0.25, "step": 2103 }, { "epoch": 0.7769571639586411, "grad_norm": 0.2796286642551422, "learning_rate": 0.0001482941248922281, "loss": 0.3412, "step": 2104 }, { "epoch": 0.7773264401772526, "grad_norm": 0.3094947040081024, "learning_rate": 0.0001482694913166646, "loss": 0.2876, "step": 2105 }, { "epoch": 0.7776957163958641, "grad_norm": 0.3286437690258026, "learning_rate": 0.00014824485774110114, "loss": 0.2762, "step": 2106 }, { "epoch": 0.7780649926144756, "grad_norm": 0.22929511964321136, "learning_rate": 0.00014822022416553763, "loss": 0.2928, "step": 2107 }, { "epoch": 0.7784342688330872, "grad_norm": 0.2838447093963623, "learning_rate": 0.00014819559058997414, "loss": 0.3409, "step": 2108 }, { "epoch": 0.7788035450516987, "grad_norm": 0.2506687343120575, "learning_rate": 0.00014817095701441063, "loss": 0.2909, "step": 2109 }, { "epoch": 0.7791728212703102, "grad_norm": 0.3038922846317291, "learning_rate": 0.00014814632343884714, "loss": 0.3414, "step": 2110 }, { "epoch": 0.7795420974889217, "grad_norm": 0.3076067566871643, "learning_rate": 0.00014812168986328366, "loss": 0.2651, "step": 2111 }, { "epoch": 0.7799113737075333, "grad_norm": 0.2969833016395569, "learning_rate": 0.00014809705628772017, "loss": 0.278, "step": 2112 }, { "epoch": 0.7802806499261448, "grad_norm": 0.44124898314476013, "learning_rate": 0.00014807242271215666, "loss": 0.3854, "step": 2113 }, { "epoch": 0.7806499261447563, "grad_norm": 0.24413622915744781, "learning_rate": 0.00014804778913659318, "loss": 0.2898, "step": 2114 }, { "epoch": 0.7810192023633677, "grad_norm": 0.27109289169311523, "learning_rate": 0.0001480231555610297, "loss": 0.2891, "step": 2115 }, { "epoch": 0.7813884785819794, "grad_norm": 0.31798434257507324, "learning_rate": 0.0001479985219854662, "loss": 0.3494, "step": 2116 }, { "epoch": 0.7817577548005908, "grad_norm": 0.298524409532547, "learning_rate": 0.0001479738884099027, "loss": 0.3293, "step": 2117 }, { "epoch": 0.7821270310192023, "grad_norm": 0.3131650388240814, "learning_rate": 0.0001479492548343392, "loss": 0.3088, "step": 2118 }, { "epoch": 0.7824963072378139, "grad_norm": 0.2757169008255005, "learning_rate": 0.00014792462125877572, "loss": 0.2962, "step": 2119 }, { "epoch": 0.7828655834564254, "grad_norm": 0.3562310039997101, "learning_rate": 0.00014789998768321224, "loss": 0.2901, "step": 2120 }, { "epoch": 0.7832348596750369, "grad_norm": 0.30650612711906433, "learning_rate": 0.00014787535410764872, "loss": 0.3469, "step": 2121 }, { "epoch": 0.7836041358936484, "grad_norm": 0.3404449224472046, "learning_rate": 0.00014785072053208524, "loss": 0.2914, "step": 2122 }, { "epoch": 0.78397341211226, "grad_norm": 0.3603094220161438, "learning_rate": 0.00014782608695652173, "loss": 0.429, "step": 2123 }, { "epoch": 0.7843426883308715, "grad_norm": 0.2617523670196533, "learning_rate": 0.00014780145338095827, "loss": 0.2303, "step": 2124 }, { "epoch": 0.784711964549483, "grad_norm": 0.3045143783092499, "learning_rate": 0.00014777681980539476, "loss": 0.3371, "step": 2125 }, { "epoch": 0.7850812407680945, "grad_norm": 0.25733378529548645, "learning_rate": 0.00014775218622983127, "loss": 0.2849, "step": 2126 }, { "epoch": 0.7854505169867061, "grad_norm": 0.26336920261383057, "learning_rate": 0.00014772755265426776, "loss": 0.3082, "step": 2127 }, { "epoch": 0.7858197932053176, "grad_norm": 0.23966217041015625, "learning_rate": 0.00014770291907870427, "loss": 0.2412, "step": 2128 }, { "epoch": 0.7861890694239291, "grad_norm": 0.34771105647087097, "learning_rate": 0.0001476782855031408, "loss": 0.3221, "step": 2129 }, { "epoch": 0.7865583456425406, "grad_norm": 0.26496586203575134, "learning_rate": 0.0001476536519275773, "loss": 0.2855, "step": 2130 }, { "epoch": 0.7869276218611522, "grad_norm": 0.2909802794456482, "learning_rate": 0.0001476290183520138, "loss": 0.324, "step": 2131 }, { "epoch": 0.7872968980797637, "grad_norm": 0.24908676743507385, "learning_rate": 0.0001476043847764503, "loss": 0.2918, "step": 2132 }, { "epoch": 0.7876661742983752, "grad_norm": 0.3057284653186798, "learning_rate": 0.00014757975120088682, "loss": 0.3545, "step": 2133 }, { "epoch": 0.7880354505169868, "grad_norm": 0.32036465406417847, "learning_rate": 0.00014755511762532333, "loss": 0.3512, "step": 2134 }, { "epoch": 0.7884047267355982, "grad_norm": 0.32289084792137146, "learning_rate": 0.00014753048404975982, "loss": 0.3814, "step": 2135 }, { "epoch": 0.7887740029542097, "grad_norm": 0.27264589071273804, "learning_rate": 0.00014750585047419634, "loss": 0.2964, "step": 2136 }, { "epoch": 0.7891432791728212, "grad_norm": 0.3594219386577606, "learning_rate": 0.00014748121689863282, "loss": 0.3189, "step": 2137 }, { "epoch": 0.7895125553914328, "grad_norm": 0.23406580090522766, "learning_rate": 0.00014745658332306936, "loss": 0.2497, "step": 2138 }, { "epoch": 0.7898818316100443, "grad_norm": 0.24143576622009277, "learning_rate": 0.00014743194974750585, "loss": 0.2709, "step": 2139 }, { "epoch": 0.7902511078286558, "grad_norm": 0.23684187233448029, "learning_rate": 0.00014740731617194237, "loss": 0.2494, "step": 2140 }, { "epoch": 0.7906203840472673, "grad_norm": 0.3377109467983246, "learning_rate": 0.00014738268259637885, "loss": 0.3863, "step": 2141 }, { "epoch": 0.7909896602658789, "grad_norm": 0.4215676188468933, "learning_rate": 0.00014735804902081537, "loss": 0.3644, "step": 2142 }, { "epoch": 0.7913589364844904, "grad_norm": 0.3114874064922333, "learning_rate": 0.00014733341544525188, "loss": 0.3626, "step": 2143 }, { "epoch": 0.7917282127031019, "grad_norm": 0.3052099347114563, "learning_rate": 0.0001473087818696884, "loss": 0.3348, "step": 2144 }, { "epoch": 0.7920974889217134, "grad_norm": 0.24991333484649658, "learning_rate": 0.00014728414829412489, "loss": 0.2905, "step": 2145 }, { "epoch": 0.792466765140325, "grad_norm": 0.2939784824848175, "learning_rate": 0.0001472595147185614, "loss": 0.2834, "step": 2146 }, { "epoch": 0.7928360413589365, "grad_norm": 0.24958360195159912, "learning_rate": 0.00014723488114299791, "loss": 0.2484, "step": 2147 }, { "epoch": 0.793205317577548, "grad_norm": 0.2922380566596985, "learning_rate": 0.00014721024756743443, "loss": 0.2775, "step": 2148 }, { "epoch": 0.7935745937961596, "grad_norm": 0.3258068561553955, "learning_rate": 0.00014718561399187092, "loss": 0.2699, "step": 2149 }, { "epoch": 0.7939438700147711, "grad_norm": 0.3072319030761719, "learning_rate": 0.00014716098041630743, "loss": 0.3221, "step": 2150 }, { "epoch": 0.7939438700147711, "eval_loss": 0.29739075899124146, "eval_runtime": 5.8631, "eval_samples_per_second": 8.528, "eval_steps_per_second": 1.194, "step": 2150 }, { "epoch": 0.7943131462333826, "grad_norm": 0.28948506712913513, "learning_rate": 0.00014713634684074392, "loss": 0.2879, "step": 2151 }, { "epoch": 0.794682422451994, "grad_norm": 0.338868647813797, "learning_rate": 0.00014711171326518046, "loss": 0.3321, "step": 2152 }, { "epoch": 0.7950516986706057, "grad_norm": 0.32467564940452576, "learning_rate": 0.00014708707968961695, "loss": 0.3321, "step": 2153 }, { "epoch": 0.7954209748892171, "grad_norm": 0.3621566891670227, "learning_rate": 0.00014706244611405346, "loss": 0.4035, "step": 2154 }, { "epoch": 0.7957902511078286, "grad_norm": 0.3736126720905304, "learning_rate": 0.00014703781253848995, "loss": 0.3209, "step": 2155 }, { "epoch": 0.7961595273264401, "grad_norm": 0.2838221788406372, "learning_rate": 0.00014701317896292647, "loss": 0.2987, "step": 2156 }, { "epoch": 0.7965288035450517, "grad_norm": 0.3212393522262573, "learning_rate": 0.00014698854538736298, "loss": 0.3889, "step": 2157 }, { "epoch": 0.7968980797636632, "grad_norm": 0.31640344858169556, "learning_rate": 0.0001469639118117995, "loss": 0.3563, "step": 2158 }, { "epoch": 0.7972673559822747, "grad_norm": 0.29704806208610535, "learning_rate": 0.00014693927823623598, "loss": 0.3044, "step": 2159 }, { "epoch": 0.7976366322008862, "grad_norm": 0.34912556409835815, "learning_rate": 0.0001469146446606725, "loss": 0.3201, "step": 2160 }, { "epoch": 0.7980059084194978, "grad_norm": 0.2666184604167938, "learning_rate": 0.000146890011085109, "loss": 0.346, "step": 2161 }, { "epoch": 0.7983751846381093, "grad_norm": 0.29834869503974915, "learning_rate": 0.00014686537750954553, "loss": 0.3404, "step": 2162 }, { "epoch": 0.7987444608567208, "grad_norm": 0.2953677475452423, "learning_rate": 0.00014684074393398201, "loss": 0.2415, "step": 2163 }, { "epoch": 0.7991137370753324, "grad_norm": 0.31550806760787964, "learning_rate": 0.00014681611035841853, "loss": 0.2797, "step": 2164 }, { "epoch": 0.7994830132939439, "grad_norm": 0.2836500108242035, "learning_rate": 0.00014679147678285504, "loss": 0.3396, "step": 2165 }, { "epoch": 0.7998522895125554, "grad_norm": 0.2543346881866455, "learning_rate": 0.00014676684320729156, "loss": 0.3069, "step": 2166 }, { "epoch": 0.8002215657311669, "grad_norm": 0.30015766620635986, "learning_rate": 0.00014674220963172805, "loss": 0.3124, "step": 2167 }, { "epoch": 0.8005908419497785, "grad_norm": 0.2661927342414856, "learning_rate": 0.00014671757605616456, "loss": 0.3075, "step": 2168 }, { "epoch": 0.80096011816839, "grad_norm": 0.3345184326171875, "learning_rate": 0.00014669294248060105, "loss": 0.3104, "step": 2169 }, { "epoch": 0.8013293943870015, "grad_norm": 0.2684672474861145, "learning_rate": 0.0001466683089050376, "loss": 0.3331, "step": 2170 }, { "epoch": 0.801698670605613, "grad_norm": 0.2914975583553314, "learning_rate": 0.00014664367532947408, "loss": 0.3058, "step": 2171 }, { "epoch": 0.8020679468242246, "grad_norm": 0.23483692109584808, "learning_rate": 0.0001466190417539106, "loss": 0.2726, "step": 2172 }, { "epoch": 0.802437223042836, "grad_norm": 0.2587801516056061, "learning_rate": 0.00014659440817834708, "loss": 0.3331, "step": 2173 }, { "epoch": 0.8028064992614475, "grad_norm": 0.3047831058502197, "learning_rate": 0.0001465697746027836, "loss": 0.3626, "step": 2174 }, { "epoch": 0.803175775480059, "grad_norm": 0.4038315415382385, "learning_rate": 0.0001465451410272201, "loss": 0.3259, "step": 2175 }, { "epoch": 0.8035450516986706, "grad_norm": 0.22119131684303284, "learning_rate": 0.00014652050745165662, "loss": 0.2991, "step": 2176 }, { "epoch": 0.8039143279172821, "grad_norm": 0.26064956188201904, "learning_rate": 0.0001464958738760931, "loss": 0.2791, "step": 2177 }, { "epoch": 0.8042836041358936, "grad_norm": 0.2967536449432373, "learning_rate": 0.00014647124030052962, "loss": 0.3199, "step": 2178 }, { "epoch": 0.8046528803545052, "grad_norm": 0.2943962812423706, "learning_rate": 0.00014644660672496614, "loss": 0.2945, "step": 2179 }, { "epoch": 0.8050221565731167, "grad_norm": 0.2509021759033203, "learning_rate": 0.00014642197314940265, "loss": 0.3017, "step": 2180 }, { "epoch": 0.8053914327917282, "grad_norm": 0.3554409146308899, "learning_rate": 0.00014639733957383914, "loss": 0.3672, "step": 2181 }, { "epoch": 0.8057607090103397, "grad_norm": 0.2364262491464615, "learning_rate": 0.00014637270599827566, "loss": 0.2554, "step": 2182 }, { "epoch": 0.8061299852289513, "grad_norm": 0.3607995808124542, "learning_rate": 0.00014634807242271214, "loss": 0.3322, "step": 2183 }, { "epoch": 0.8064992614475628, "grad_norm": 0.2653542160987854, "learning_rate": 0.00014632343884714869, "loss": 0.2872, "step": 2184 }, { "epoch": 0.8068685376661743, "grad_norm": 0.257291316986084, "learning_rate": 0.00014629880527158517, "loss": 0.251, "step": 2185 }, { "epoch": 0.8072378138847858, "grad_norm": 0.31101787090301514, "learning_rate": 0.0001462741716960217, "loss": 0.2955, "step": 2186 }, { "epoch": 0.8076070901033974, "grad_norm": 0.23836660385131836, "learning_rate": 0.00014624953812045818, "loss": 0.2923, "step": 2187 }, { "epoch": 0.8079763663220089, "grad_norm": 0.384665310382843, "learning_rate": 0.0001462249045448947, "loss": 0.3695, "step": 2188 }, { "epoch": 0.8083456425406204, "grad_norm": 0.3111051321029663, "learning_rate": 0.0001462002709693312, "loss": 0.3134, "step": 2189 }, { "epoch": 0.808714918759232, "grad_norm": 0.270060658454895, "learning_rate": 0.00014617563739376772, "loss": 0.258, "step": 2190 }, { "epoch": 0.8090841949778435, "grad_norm": 0.2257768213748932, "learning_rate": 0.0001461510038182042, "loss": 0.2212, "step": 2191 }, { "epoch": 0.8094534711964549, "grad_norm": 0.2495872676372528, "learning_rate": 0.00014612637024264072, "loss": 0.2808, "step": 2192 }, { "epoch": 0.8098227474150664, "grad_norm": 0.29808223247528076, "learning_rate": 0.00014610173666707724, "loss": 0.259, "step": 2193 }, { "epoch": 0.810192023633678, "grad_norm": 0.267164409160614, "learning_rate": 0.00014607710309151375, "loss": 0.3281, "step": 2194 }, { "epoch": 0.8105612998522895, "grad_norm": 0.2977560758590698, "learning_rate": 0.00014605246951595024, "loss": 0.3097, "step": 2195 }, { "epoch": 0.810930576070901, "grad_norm": 0.28259027004241943, "learning_rate": 0.00014602783594038675, "loss": 0.3511, "step": 2196 }, { "epoch": 0.8112998522895125, "grad_norm": 0.3065144419670105, "learning_rate": 0.00014600320236482327, "loss": 0.3589, "step": 2197 }, { "epoch": 0.8116691285081241, "grad_norm": 0.24390994012355804, "learning_rate": 0.00014597856878925978, "loss": 0.2485, "step": 2198 }, { "epoch": 0.8120384047267356, "grad_norm": 0.25467053055763245, "learning_rate": 0.00014595393521369627, "loss": 0.2408, "step": 2199 }, { "epoch": 0.8124076809453471, "grad_norm": 0.3040908873081207, "learning_rate": 0.00014592930163813278, "loss": 0.3687, "step": 2200 }, { "epoch": 0.8124076809453471, "eval_loss": 0.2931758761405945, "eval_runtime": 5.8676, "eval_samples_per_second": 8.521, "eval_steps_per_second": 1.193, "step": 2200 }, { "epoch": 0.8127769571639586, "grad_norm": 0.2869749665260315, "learning_rate": 0.00014590466806256927, "loss": 0.2708, "step": 2201 }, { "epoch": 0.8131462333825702, "grad_norm": 0.22452616691589355, "learning_rate": 0.0001458800344870058, "loss": 0.2482, "step": 2202 }, { "epoch": 0.8135155096011817, "grad_norm": 0.22195591032505035, "learning_rate": 0.0001458554009114423, "loss": 0.2664, "step": 2203 }, { "epoch": 0.8138847858197932, "grad_norm": 0.32908597588539124, "learning_rate": 0.00014583076733587882, "loss": 0.24, "step": 2204 }, { "epoch": 0.8142540620384048, "grad_norm": 0.29719024896621704, "learning_rate": 0.0001458061337603153, "loss": 0.3623, "step": 2205 }, { "epoch": 0.8146233382570163, "grad_norm": 0.28195294737815857, "learning_rate": 0.00014578150018475182, "loss": 0.3519, "step": 2206 }, { "epoch": 0.8149926144756278, "grad_norm": 0.2806910276412964, "learning_rate": 0.00014575686660918833, "loss": 0.2922, "step": 2207 }, { "epoch": 0.8153618906942393, "grad_norm": 0.31184086203575134, "learning_rate": 0.00014573223303362485, "loss": 0.3434, "step": 2208 }, { "epoch": 0.8157311669128509, "grad_norm": 0.2777872383594513, "learning_rate": 0.00014570759945806133, "loss": 0.3073, "step": 2209 }, { "epoch": 0.8161004431314623, "grad_norm": 0.24332855641841888, "learning_rate": 0.00014568296588249785, "loss": 0.2917, "step": 2210 }, { "epoch": 0.8164697193500738, "grad_norm": 0.23340560495853424, "learning_rate": 0.00014565833230693436, "loss": 0.2574, "step": 2211 }, { "epoch": 0.8168389955686853, "grad_norm": 0.2942739725112915, "learning_rate": 0.00014563369873137088, "loss": 0.3229, "step": 2212 }, { "epoch": 0.8172082717872969, "grad_norm": 0.283641517162323, "learning_rate": 0.00014560906515580737, "loss": 0.3321, "step": 2213 }, { "epoch": 0.8175775480059084, "grad_norm": 0.28562578558921814, "learning_rate": 0.00014558443158024388, "loss": 0.3235, "step": 2214 }, { "epoch": 0.8179468242245199, "grad_norm": 0.3028421700000763, "learning_rate": 0.00014555979800468037, "loss": 0.3386, "step": 2215 }, { "epoch": 0.8183161004431314, "grad_norm": 0.26572704315185547, "learning_rate": 0.0001455351644291169, "loss": 0.2618, "step": 2216 }, { "epoch": 0.818685376661743, "grad_norm": 0.3396584987640381, "learning_rate": 0.0001455105308535534, "loss": 0.278, "step": 2217 }, { "epoch": 0.8190546528803545, "grad_norm": 0.24953824281692505, "learning_rate": 0.0001454858972779899, "loss": 0.2935, "step": 2218 }, { "epoch": 0.819423929098966, "grad_norm": 0.27380311489105225, "learning_rate": 0.0001454612637024264, "loss": 0.3147, "step": 2219 }, { "epoch": 0.8197932053175776, "grad_norm": 0.33168891072273254, "learning_rate": 0.00014543663012686291, "loss": 0.3503, "step": 2220 }, { "epoch": 0.8201624815361891, "grad_norm": 0.24705637991428375, "learning_rate": 0.00014541199655129943, "loss": 0.2921, "step": 2221 }, { "epoch": 0.8205317577548006, "grad_norm": 0.25001150369644165, "learning_rate": 0.00014538736297573594, "loss": 0.2642, "step": 2222 }, { "epoch": 0.8209010339734121, "grad_norm": 0.27988019585609436, "learning_rate": 0.00014536272940017243, "loss": 0.3064, "step": 2223 }, { "epoch": 0.8212703101920237, "grad_norm": 0.267130047082901, "learning_rate": 0.00014533809582460895, "loss": 0.2935, "step": 2224 }, { "epoch": 0.8216395864106352, "grad_norm": 0.2690856456756592, "learning_rate": 0.00014531346224904546, "loss": 0.2765, "step": 2225 }, { "epoch": 0.8220088626292467, "grad_norm": 0.2621283233165741, "learning_rate": 0.00014528882867348198, "loss": 0.3099, "step": 2226 }, { "epoch": 0.8223781388478582, "grad_norm": 0.30594637989997864, "learning_rate": 0.00014526419509791846, "loss": 0.3001, "step": 2227 }, { "epoch": 0.8227474150664698, "grad_norm": 0.254181444644928, "learning_rate": 0.00014523956152235498, "loss": 0.2717, "step": 2228 }, { "epoch": 0.8231166912850812, "grad_norm": 0.26248815655708313, "learning_rate": 0.0001452149279467915, "loss": 0.2524, "step": 2229 }, { "epoch": 0.8234859675036927, "grad_norm": 0.2587265968322754, "learning_rate": 0.000145190294371228, "loss": 0.314, "step": 2230 }, { "epoch": 0.8238552437223042, "grad_norm": 0.2941606044769287, "learning_rate": 0.0001451656607956645, "loss": 0.339, "step": 2231 }, { "epoch": 0.8242245199409158, "grad_norm": 0.2971184253692627, "learning_rate": 0.000145141027220101, "loss": 0.2879, "step": 2232 }, { "epoch": 0.8245937961595273, "grad_norm": 0.21398895978927612, "learning_rate": 0.0001451163936445375, "loss": 0.2217, "step": 2233 }, { "epoch": 0.8249630723781388, "grad_norm": 0.26762524247169495, "learning_rate": 0.00014509176006897404, "loss": 0.2703, "step": 2234 }, { "epoch": 0.8253323485967504, "grad_norm": 0.2811284363269806, "learning_rate": 0.00014506712649341053, "loss": 0.3259, "step": 2235 }, { "epoch": 0.8257016248153619, "grad_norm": 0.33781254291534424, "learning_rate": 0.00014504249291784704, "loss": 0.348, "step": 2236 }, { "epoch": 0.8260709010339734, "grad_norm": 0.32253655791282654, "learning_rate": 0.00014501785934228353, "loss": 0.3812, "step": 2237 }, { "epoch": 0.8264401772525849, "grad_norm": 0.30919092893600464, "learning_rate": 0.00014499322576672004, "loss": 0.3099, "step": 2238 }, { "epoch": 0.8268094534711965, "grad_norm": 0.3416600823402405, "learning_rate": 0.00014496859219115656, "loss": 0.346, "step": 2239 }, { "epoch": 0.827178729689808, "grad_norm": 0.24034634232521057, "learning_rate": 0.00014494395861559307, "loss": 0.3034, "step": 2240 }, { "epoch": 0.8275480059084195, "grad_norm": 0.2517634332180023, "learning_rate": 0.00014491932504002956, "loss": 0.2773, "step": 2241 }, { "epoch": 0.827917282127031, "grad_norm": 0.2968411445617676, "learning_rate": 0.00014489469146446607, "loss": 0.3548, "step": 2242 }, { "epoch": 0.8282865583456426, "grad_norm": 0.33049216866493225, "learning_rate": 0.0001448700578889026, "loss": 0.3236, "step": 2243 }, { "epoch": 0.8286558345642541, "grad_norm": 0.25116103887557983, "learning_rate": 0.0001448454243133391, "loss": 0.3276, "step": 2244 }, { "epoch": 0.8290251107828656, "grad_norm": 0.29226842522621155, "learning_rate": 0.0001448207907377756, "loss": 0.3263, "step": 2245 }, { "epoch": 0.829394387001477, "grad_norm": 0.22805069386959076, "learning_rate": 0.0001447961571622121, "loss": 0.2605, "step": 2246 }, { "epoch": 0.8297636632200887, "grad_norm": 0.2949386239051819, "learning_rate": 0.0001447715235866486, "loss": 0.3113, "step": 2247 }, { "epoch": 0.8301329394387001, "grad_norm": 0.2299480140209198, "learning_rate": 0.00014474689001108513, "loss": 0.2869, "step": 2248 }, { "epoch": 0.8305022156573116, "grad_norm": 0.2542758285999298, "learning_rate": 0.00014472225643552162, "loss": 0.3202, "step": 2249 }, { "epoch": 0.8308714918759232, "grad_norm": 0.2857692241668701, "learning_rate": 0.00014469762285995814, "loss": 0.2913, "step": 2250 }, { "epoch": 0.8308714918759232, "eval_loss": 0.2872462570667267, "eval_runtime": 5.8643, "eval_samples_per_second": 8.526, "eval_steps_per_second": 1.194, "step": 2250 }, { "epoch": 0.8312407680945347, "grad_norm": 0.29522812366485596, "learning_rate": 0.00014467298928439462, "loss": 0.3988, "step": 2251 }, { "epoch": 0.8316100443131462, "grad_norm": 0.28120508790016174, "learning_rate": 0.00014464835570883114, "loss": 0.3674, "step": 2252 }, { "epoch": 0.8319793205317577, "grad_norm": 0.27494558691978455, "learning_rate": 0.00014462372213326765, "loss": 0.3127, "step": 2253 }, { "epoch": 0.8323485967503693, "grad_norm": 0.24010087549686432, "learning_rate": 0.00014459908855770417, "loss": 0.259, "step": 2254 }, { "epoch": 0.8327178729689808, "grad_norm": 0.2521667182445526, "learning_rate": 0.00014457445498214066, "loss": 0.263, "step": 2255 }, { "epoch": 0.8330871491875923, "grad_norm": 0.24542008340358734, "learning_rate": 0.00014454982140657717, "loss": 0.2655, "step": 2256 }, { "epoch": 0.8334564254062038, "grad_norm": 0.26360607147216797, "learning_rate": 0.00014452518783101369, "loss": 0.2994, "step": 2257 }, { "epoch": 0.8338257016248154, "grad_norm": 0.33817002177238464, "learning_rate": 0.0001445005542554502, "loss": 0.3448, "step": 2258 }, { "epoch": 0.8341949778434269, "grad_norm": 0.35080987215042114, "learning_rate": 0.0001444759206798867, "loss": 0.3449, "step": 2259 }, { "epoch": 0.8345642540620384, "grad_norm": 0.2686460316181183, "learning_rate": 0.0001444512871043232, "loss": 0.284, "step": 2260 }, { "epoch": 0.8349335302806499, "grad_norm": 0.3396205008029938, "learning_rate": 0.00014442665352875972, "loss": 0.3735, "step": 2261 }, { "epoch": 0.8353028064992615, "grad_norm": 0.25770655274391174, "learning_rate": 0.00014440201995319623, "loss": 0.2561, "step": 2262 }, { "epoch": 0.835672082717873, "grad_norm": 0.23628605902194977, "learning_rate": 0.00014437738637763272, "loss": 0.2772, "step": 2263 }, { "epoch": 0.8360413589364845, "grad_norm": 0.24436742067337036, "learning_rate": 0.00014435275280206923, "loss": 0.2348, "step": 2264 }, { "epoch": 0.8364106351550961, "grad_norm": 0.2738080620765686, "learning_rate": 0.00014432811922650572, "loss": 0.2965, "step": 2265 }, { "epoch": 0.8367799113737076, "grad_norm": 0.26231417059898376, "learning_rate": 0.00014430348565094226, "loss": 0.2665, "step": 2266 }, { "epoch": 0.837149187592319, "grad_norm": 0.30316510796546936, "learning_rate": 0.00014427885207537875, "loss": 0.3225, "step": 2267 }, { "epoch": 0.8375184638109305, "grad_norm": 0.2635595500469208, "learning_rate": 0.00014425421849981526, "loss": 0.2904, "step": 2268 }, { "epoch": 0.8378877400295421, "grad_norm": 0.24698545038700104, "learning_rate": 0.00014422958492425175, "loss": 0.2781, "step": 2269 }, { "epoch": 0.8382570162481536, "grad_norm": 0.2639318108558655, "learning_rate": 0.00014420495134868827, "loss": 0.3278, "step": 2270 }, { "epoch": 0.8386262924667651, "grad_norm": 0.3104601502418518, "learning_rate": 0.00014418031777312478, "loss": 0.2803, "step": 2271 }, { "epoch": 0.8389955686853766, "grad_norm": 0.2913442850112915, "learning_rate": 0.0001441556841975613, "loss": 0.2673, "step": 2272 }, { "epoch": 0.8393648449039882, "grad_norm": 0.3268960416316986, "learning_rate": 0.00014413105062199778, "loss": 0.311, "step": 2273 }, { "epoch": 0.8397341211225997, "grad_norm": 0.2585023045539856, "learning_rate": 0.0001441064170464343, "loss": 0.2834, "step": 2274 }, { "epoch": 0.8401033973412112, "grad_norm": 0.2671329975128174, "learning_rate": 0.0001440817834708708, "loss": 0.2926, "step": 2275 }, { "epoch": 0.8404726735598228, "grad_norm": 0.3078802824020386, "learning_rate": 0.00014405714989530733, "loss": 0.2788, "step": 2276 }, { "epoch": 0.8408419497784343, "grad_norm": 0.25752460956573486, "learning_rate": 0.00014403251631974382, "loss": 0.2948, "step": 2277 }, { "epoch": 0.8412112259970458, "grad_norm": 0.24336199462413788, "learning_rate": 0.00014400788274418033, "loss": 0.2689, "step": 2278 }, { "epoch": 0.8415805022156573, "grad_norm": 0.3261638283729553, "learning_rate": 0.00014398324916861682, "loss": 0.3442, "step": 2279 }, { "epoch": 0.8419497784342689, "grad_norm": 0.3471073508262634, "learning_rate": 0.00014395861559305336, "loss": 0.368, "step": 2280 }, { "epoch": 0.8423190546528804, "grad_norm": 0.2980787456035614, "learning_rate": 0.00014393398201748985, "loss": 0.3559, "step": 2281 }, { "epoch": 0.8426883308714919, "grad_norm": 0.29910537600517273, "learning_rate": 0.00014390934844192636, "loss": 0.2764, "step": 2282 }, { "epoch": 0.8430576070901034, "grad_norm": 0.27420300245285034, "learning_rate": 0.00014388471486636285, "loss": 0.3385, "step": 2283 }, { "epoch": 0.843426883308715, "grad_norm": 0.26107993721961975, "learning_rate": 0.00014386008129079936, "loss": 0.3044, "step": 2284 }, { "epoch": 0.8437961595273265, "grad_norm": 0.23961122334003448, "learning_rate": 0.00014383544771523588, "loss": 0.2556, "step": 2285 }, { "epoch": 0.8441654357459379, "grad_norm": 0.2519873380661011, "learning_rate": 0.0001438108141396724, "loss": 0.2721, "step": 2286 }, { "epoch": 0.8445347119645494, "grad_norm": 0.2999603748321533, "learning_rate": 0.00014378618056410888, "loss": 0.3113, "step": 2287 }, { "epoch": 0.844903988183161, "grad_norm": 0.24635626375675201, "learning_rate": 0.0001437615469885454, "loss": 0.2987, "step": 2288 }, { "epoch": 0.8452732644017725, "grad_norm": 0.26801541447639465, "learning_rate": 0.0001437369134129819, "loss": 0.3288, "step": 2289 }, { "epoch": 0.845642540620384, "grad_norm": 0.2706718146800995, "learning_rate": 0.00014371227983741842, "loss": 0.2828, "step": 2290 }, { "epoch": 0.8460118168389956, "grad_norm": 0.31260946393013, "learning_rate": 0.0001436876462618549, "loss": 0.3028, "step": 2291 }, { "epoch": 0.8463810930576071, "grad_norm": 0.26500388979911804, "learning_rate": 0.00014366301268629143, "loss": 0.2812, "step": 2292 }, { "epoch": 0.8467503692762186, "grad_norm": 0.28190380334854126, "learning_rate": 0.00014363837911072794, "loss": 0.3013, "step": 2293 }, { "epoch": 0.8471196454948301, "grad_norm": 0.30988219380378723, "learning_rate": 0.00014361374553516446, "loss": 0.3557, "step": 2294 }, { "epoch": 0.8474889217134417, "grad_norm": 0.3184913992881775, "learning_rate": 0.00014358911195960094, "loss": 0.3377, "step": 2295 }, { "epoch": 0.8478581979320532, "grad_norm": 0.2852730453014374, "learning_rate": 0.00014356447838403746, "loss": 0.3358, "step": 2296 }, { "epoch": 0.8482274741506647, "grad_norm": 0.3184800148010254, "learning_rate": 0.00014353984480847395, "loss": 0.2749, "step": 2297 }, { "epoch": 0.8485967503692762, "grad_norm": 0.31259408593177795, "learning_rate": 0.0001435152112329105, "loss": 0.3372, "step": 2298 }, { "epoch": 0.8489660265878878, "grad_norm": 0.2789030075073242, "learning_rate": 0.00014349057765734697, "loss": 0.3345, "step": 2299 }, { "epoch": 0.8493353028064993, "grad_norm": 0.290223091840744, "learning_rate": 0.0001434659440817835, "loss": 0.3217, "step": 2300 }, { "epoch": 0.8493353028064993, "eval_loss": 0.2873377799987793, "eval_runtime": 5.8564, "eval_samples_per_second": 8.538, "eval_steps_per_second": 1.195, "step": 2300 }, { "epoch": 0.8497045790251108, "grad_norm": 0.3126521706581116, "learning_rate": 0.00014344131050621998, "loss": 0.3818, "step": 2301 }, { "epoch": 0.8500738552437223, "grad_norm": 0.33014044165611267, "learning_rate": 0.0001434166769306565, "loss": 0.3932, "step": 2302 }, { "epoch": 0.8504431314623339, "grad_norm": 0.3226761221885681, "learning_rate": 0.000143392043355093, "loss": 0.321, "step": 2303 }, { "epoch": 0.8508124076809453, "grad_norm": 0.2537625730037689, "learning_rate": 0.00014336740977952952, "loss": 0.2429, "step": 2304 }, { "epoch": 0.8511816838995568, "grad_norm": 0.274652898311615, "learning_rate": 0.000143342776203966, "loss": 0.2672, "step": 2305 }, { "epoch": 0.8515509601181684, "grad_norm": 0.3447267711162567, "learning_rate": 0.00014331814262840252, "loss": 0.3365, "step": 2306 }, { "epoch": 0.8519202363367799, "grad_norm": 0.23127812147140503, "learning_rate": 0.00014329350905283904, "loss": 0.2598, "step": 2307 }, { "epoch": 0.8522895125553914, "grad_norm": 0.2893903851509094, "learning_rate": 0.00014326887547727555, "loss": 0.3396, "step": 2308 }, { "epoch": 0.8526587887740029, "grad_norm": 0.25716060400009155, "learning_rate": 0.00014324424190171204, "loss": 0.2831, "step": 2309 }, { "epoch": 0.8530280649926145, "grad_norm": 0.27485719323158264, "learning_rate": 0.00014321960832614855, "loss": 0.2711, "step": 2310 }, { "epoch": 0.853397341211226, "grad_norm": 0.2990472912788391, "learning_rate": 0.00014319497475058504, "loss": 0.3362, "step": 2311 }, { "epoch": 0.8537666174298375, "grad_norm": 0.27512043714523315, "learning_rate": 0.00014317034117502158, "loss": 0.3013, "step": 2312 }, { "epoch": 0.854135893648449, "grad_norm": 0.22718726098537445, "learning_rate": 0.00014314570759945807, "loss": 0.2655, "step": 2313 }, { "epoch": 0.8545051698670606, "grad_norm": 0.3475792407989502, "learning_rate": 0.00014312107402389459, "loss": 0.3464, "step": 2314 }, { "epoch": 0.8548744460856721, "grad_norm": 0.2669030427932739, "learning_rate": 0.00014309644044833107, "loss": 0.3412, "step": 2315 }, { "epoch": 0.8552437223042836, "grad_norm": 0.2855254113674164, "learning_rate": 0.0001430718068727676, "loss": 0.2962, "step": 2316 }, { "epoch": 0.8556129985228951, "grad_norm": 0.22337274253368378, "learning_rate": 0.0001430471732972041, "loss": 0.2728, "step": 2317 }, { "epoch": 0.8559822747415067, "grad_norm": 0.3366328477859497, "learning_rate": 0.00014302253972164062, "loss": 0.2873, "step": 2318 }, { "epoch": 0.8563515509601182, "grad_norm": 0.2495029717683792, "learning_rate": 0.0001429979061460771, "loss": 0.3059, "step": 2319 }, { "epoch": 0.8567208271787297, "grad_norm": 0.2128600776195526, "learning_rate": 0.00014297327257051362, "loss": 0.2419, "step": 2320 }, { "epoch": 0.8570901033973413, "grad_norm": 0.31186309456825256, "learning_rate": 0.00014294863899495013, "loss": 0.3131, "step": 2321 }, { "epoch": 0.8574593796159528, "grad_norm": 0.24287007749080658, "learning_rate": 0.00014292400541938665, "loss": 0.2501, "step": 2322 }, { "epoch": 0.8578286558345642, "grad_norm": 0.26953041553497314, "learning_rate": 0.00014289937184382314, "loss": 0.2907, "step": 2323 }, { "epoch": 0.8581979320531757, "grad_norm": 0.2574095129966736, "learning_rate": 0.00014287473826825965, "loss": 0.2775, "step": 2324 }, { "epoch": 0.8585672082717873, "grad_norm": 0.33651429414749146, "learning_rate": 0.00014285010469269614, "loss": 0.2649, "step": 2325 }, { "epoch": 0.8589364844903988, "grad_norm": 0.2808817923069, "learning_rate": 0.00014282547111713265, "loss": 0.3235, "step": 2326 }, { "epoch": 0.8593057607090103, "grad_norm": 0.2517601549625397, "learning_rate": 0.00014280083754156917, "loss": 0.2747, "step": 2327 }, { "epoch": 0.8596750369276218, "grad_norm": 0.4052937626838684, "learning_rate": 0.00014277620396600566, "loss": 0.361, "step": 2328 }, { "epoch": 0.8600443131462334, "grad_norm": 0.294210284948349, "learning_rate": 0.00014275157039044217, "loss": 0.2779, "step": 2329 }, { "epoch": 0.8604135893648449, "grad_norm": 0.30568668246269226, "learning_rate": 0.00014272693681487868, "loss": 0.2896, "step": 2330 }, { "epoch": 0.8607828655834564, "grad_norm": 0.3126921057701111, "learning_rate": 0.0001427023032393152, "loss": 0.3514, "step": 2331 }, { "epoch": 0.8611521418020679, "grad_norm": 0.29571497440338135, "learning_rate": 0.0001426776696637517, "loss": 0.3569, "step": 2332 }, { "epoch": 0.8615214180206795, "grad_norm": 0.22604425251483917, "learning_rate": 0.0001426530360881882, "loss": 0.2204, "step": 2333 }, { "epoch": 0.861890694239291, "grad_norm": 0.31958821415901184, "learning_rate": 0.00014262840251262472, "loss": 0.3159, "step": 2334 }, { "epoch": 0.8622599704579025, "grad_norm": 0.24583856761455536, "learning_rate": 0.00014260376893706123, "loss": 0.2815, "step": 2335 }, { "epoch": 0.8626292466765141, "grad_norm": 0.263694167137146, "learning_rate": 0.00014257913536149772, "loss": 0.303, "step": 2336 }, { "epoch": 0.8629985228951256, "grad_norm": 0.37396731972694397, "learning_rate": 0.00014255450178593423, "loss": 0.3883, "step": 2337 }, { "epoch": 0.8633677991137371, "grad_norm": 0.2750912010669708, "learning_rate": 0.00014252986821037072, "loss": 0.2583, "step": 2338 }, { "epoch": 0.8637370753323486, "grad_norm": 0.25109225511550903, "learning_rate": 0.00014250523463480726, "loss": 0.2766, "step": 2339 }, { "epoch": 0.8641063515509602, "grad_norm": 0.2866942584514618, "learning_rate": 0.00014248060105924375, "loss": 0.293, "step": 2340 }, { "epoch": 0.8644756277695717, "grad_norm": 0.24580956995487213, "learning_rate": 0.00014245596748368026, "loss": 0.3142, "step": 2341 }, { "epoch": 0.8648449039881831, "grad_norm": 0.2784372568130493, "learning_rate": 0.00014243133390811675, "loss": 0.3346, "step": 2342 }, { "epoch": 0.8652141802067946, "grad_norm": 0.34338557720184326, "learning_rate": 0.00014240670033255327, "loss": 0.3088, "step": 2343 }, { "epoch": 0.8655834564254062, "grad_norm": 0.25998613238334656, "learning_rate": 0.00014238206675698978, "loss": 0.2599, "step": 2344 }, { "epoch": 0.8659527326440177, "grad_norm": 0.22598931193351746, "learning_rate": 0.0001423574331814263, "loss": 0.2598, "step": 2345 }, { "epoch": 0.8663220088626292, "grad_norm": 0.2820592224597931, "learning_rate": 0.00014233279960586278, "loss": 0.3211, "step": 2346 }, { "epoch": 0.8666912850812407, "grad_norm": 0.37511417269706726, "learning_rate": 0.0001423081660302993, "loss": 0.2871, "step": 2347 }, { "epoch": 0.8670605612998523, "grad_norm": 0.24090541899204254, "learning_rate": 0.0001422835324547358, "loss": 0.3016, "step": 2348 }, { "epoch": 0.8674298375184638, "grad_norm": 0.2671396732330322, "learning_rate": 0.00014225889887917233, "loss": 0.3033, "step": 2349 }, { "epoch": 0.8677991137370753, "grad_norm": 0.2758866846561432, "learning_rate": 0.00014223426530360881, "loss": 0.2655, "step": 2350 }, { "epoch": 0.8677991137370753, "eval_loss": 0.28671786189079285, "eval_runtime": 5.8659, "eval_samples_per_second": 8.524, "eval_steps_per_second": 1.193, "step": 2350 }, { "epoch": 0.8681683899556869, "grad_norm": 0.22430068254470825, "learning_rate": 0.00014220963172804533, "loss": 0.2786, "step": 2351 }, { "epoch": 0.8685376661742984, "grad_norm": 0.2884713411331177, "learning_rate": 0.00014218499815248182, "loss": 0.2749, "step": 2352 }, { "epoch": 0.8689069423929099, "grad_norm": 0.27482378482818604, "learning_rate": 0.00014216036457691836, "loss": 0.3344, "step": 2353 }, { "epoch": 0.8692762186115214, "grad_norm": 0.3267439603805542, "learning_rate": 0.00014213573100135485, "loss": 0.3404, "step": 2354 }, { "epoch": 0.869645494830133, "grad_norm": 0.261015385389328, "learning_rate": 0.00014211109742579136, "loss": 0.2585, "step": 2355 }, { "epoch": 0.8700147710487445, "grad_norm": 0.28181374073028564, "learning_rate": 0.00014208646385022785, "loss": 0.2542, "step": 2356 }, { "epoch": 0.870384047267356, "grad_norm": 0.2474304735660553, "learning_rate": 0.00014206183027466436, "loss": 0.2562, "step": 2357 }, { "epoch": 0.8707533234859675, "grad_norm": 0.39997008442878723, "learning_rate": 0.00014203719669910088, "loss": 0.2984, "step": 2358 }, { "epoch": 0.8711225997045791, "grad_norm": 0.23000465333461761, "learning_rate": 0.0001420125631235374, "loss": 0.2343, "step": 2359 }, { "epoch": 0.8714918759231906, "grad_norm": 0.2604723274707794, "learning_rate": 0.00014198792954797388, "loss": 0.2993, "step": 2360 }, { "epoch": 0.871861152141802, "grad_norm": 0.21523945033550262, "learning_rate": 0.0001419632959724104, "loss": 0.2407, "step": 2361 }, { "epoch": 0.8722304283604135, "grad_norm": 0.2766049802303314, "learning_rate": 0.0001419386623968469, "loss": 0.284, "step": 2362 }, { "epoch": 0.8725997045790251, "grad_norm": 0.2615562081336975, "learning_rate": 0.00014191402882128342, "loss": 0.2522, "step": 2363 }, { "epoch": 0.8729689807976366, "grad_norm": 0.258593887090683, "learning_rate": 0.0001418893952457199, "loss": 0.2703, "step": 2364 }, { "epoch": 0.8733382570162481, "grad_norm": 0.2718462646007538, "learning_rate": 0.00014186476167015643, "loss": 0.2821, "step": 2365 }, { "epoch": 0.8737075332348597, "grad_norm": 0.2711513638496399, "learning_rate": 0.00014184012809459294, "loss": 0.2737, "step": 2366 }, { "epoch": 0.8740768094534712, "grad_norm": 0.25952938199043274, "learning_rate": 0.00014181549451902946, "loss": 0.2519, "step": 2367 }, { "epoch": 0.8744460856720827, "grad_norm": 0.24934151768684387, "learning_rate": 0.00014179086094346594, "loss": 0.2908, "step": 2368 }, { "epoch": 0.8748153618906942, "grad_norm": 0.2977253794670105, "learning_rate": 0.00014176622736790246, "loss": 0.3269, "step": 2369 }, { "epoch": 0.8751846381093058, "grad_norm": 0.24035468697547913, "learning_rate": 0.00014174159379233894, "loss": 0.2403, "step": 2370 }, { "epoch": 0.8755539143279173, "grad_norm": 0.3673158884048462, "learning_rate": 0.0001417169602167755, "loss": 0.3038, "step": 2371 }, { "epoch": 0.8759231905465288, "grad_norm": 0.2450585961341858, "learning_rate": 0.00014169232664121197, "loss": 0.3046, "step": 2372 }, { "epoch": 0.8762924667651403, "grad_norm": 0.2410673350095749, "learning_rate": 0.0001416676930656485, "loss": 0.2426, "step": 2373 }, { "epoch": 0.8766617429837519, "grad_norm": 0.3130303621292114, "learning_rate": 0.00014164305949008498, "loss": 0.3167, "step": 2374 }, { "epoch": 0.8770310192023634, "grad_norm": 0.3219814598560333, "learning_rate": 0.0001416184259145215, "loss": 0.341, "step": 2375 }, { "epoch": 0.8774002954209749, "grad_norm": 0.28468939661979675, "learning_rate": 0.000141593792338958, "loss": 0.334, "step": 2376 }, { "epoch": 0.8777695716395865, "grad_norm": 0.42089036107063293, "learning_rate": 0.00014156915876339452, "loss": 0.3025, "step": 2377 }, { "epoch": 0.878138847858198, "grad_norm": 0.26263752579689026, "learning_rate": 0.000141544525187831, "loss": 0.2553, "step": 2378 }, { "epoch": 0.8785081240768094, "grad_norm": 0.29241836071014404, "learning_rate": 0.00014151989161226752, "loss": 0.3417, "step": 2379 }, { "epoch": 0.8788774002954209, "grad_norm": 0.28693094849586487, "learning_rate": 0.00014149525803670404, "loss": 0.2685, "step": 2380 }, { "epoch": 0.8792466765140325, "grad_norm": 0.2222106158733368, "learning_rate": 0.00014147062446114055, "loss": 0.2658, "step": 2381 }, { "epoch": 0.879615952732644, "grad_norm": 0.32580694556236267, "learning_rate": 0.00014144599088557704, "loss": 0.3423, "step": 2382 }, { "epoch": 0.8799852289512555, "grad_norm": 0.32633674144744873, "learning_rate": 0.00014142135731001355, "loss": 0.2965, "step": 2383 }, { "epoch": 0.880354505169867, "grad_norm": 0.25918981432914734, "learning_rate": 0.00014139672373445004, "loss": 0.3084, "step": 2384 }, { "epoch": 0.8807237813884786, "grad_norm": 0.30258896946907043, "learning_rate": 0.00014137209015888658, "loss": 0.276, "step": 2385 }, { "epoch": 0.8810930576070901, "grad_norm": 0.2822190523147583, "learning_rate": 0.00014134745658332307, "loss": 0.2632, "step": 2386 }, { "epoch": 0.8814623338257016, "grad_norm": 0.27233344316482544, "learning_rate": 0.00014132282300775959, "loss": 0.3198, "step": 2387 }, { "epoch": 0.8818316100443131, "grad_norm": 0.31817078590393066, "learning_rate": 0.00014129818943219607, "loss": 0.2661, "step": 2388 }, { "epoch": 0.8822008862629247, "grad_norm": 0.22350935637950897, "learning_rate": 0.0001412735558566326, "loss": 0.2517, "step": 2389 }, { "epoch": 0.8825701624815362, "grad_norm": 0.32207536697387695, "learning_rate": 0.0001412489222810691, "loss": 0.3102, "step": 2390 }, { "epoch": 0.8829394387001477, "grad_norm": 0.327226459980011, "learning_rate": 0.00014122428870550562, "loss": 0.3549, "step": 2391 }, { "epoch": 0.8833087149187593, "grad_norm": 0.30438438057899475, "learning_rate": 0.0001411996551299421, "loss": 0.2674, "step": 2392 }, { "epoch": 0.8836779911373708, "grad_norm": 0.27010953426361084, "learning_rate": 0.00014117502155437862, "loss": 0.2689, "step": 2393 }, { "epoch": 0.8840472673559823, "grad_norm": 0.29276734590530396, "learning_rate": 0.00014115038797881513, "loss": 0.2905, "step": 2394 }, { "epoch": 0.8844165435745938, "grad_norm": 0.23772290349006653, "learning_rate": 0.00014112575440325165, "loss": 0.2877, "step": 2395 }, { "epoch": 0.8847858197932054, "grad_norm": 0.29832708835601807, "learning_rate": 0.00014110112082768814, "loss": 0.3108, "step": 2396 }, { "epoch": 0.8851550960118169, "grad_norm": 0.2792957127094269, "learning_rate": 0.00014107648725212465, "loss": 0.2976, "step": 2397 }, { "epoch": 0.8855243722304283, "grad_norm": 0.3596116006374359, "learning_rate": 0.00014105185367656116, "loss": 0.3909, "step": 2398 }, { "epoch": 0.8858936484490398, "grad_norm": 0.30374330282211304, "learning_rate": 0.00014102722010099768, "loss": 0.3588, "step": 2399 }, { "epoch": 0.8862629246676514, "grad_norm": 0.25111934542655945, "learning_rate": 0.00014100258652543417, "loss": 0.2565, "step": 2400 }, { "epoch": 0.8862629246676514, "eval_loss": 0.28929802775382996, "eval_runtime": 5.8529, "eval_samples_per_second": 8.543, "eval_steps_per_second": 1.196, "step": 2400 }, { "epoch": 0.8866322008862629, "grad_norm": 0.29126715660095215, "learning_rate": 0.00014097795294987068, "loss": 0.3392, "step": 2401 }, { "epoch": 0.8870014771048744, "grad_norm": 0.2631511688232422, "learning_rate": 0.00014095331937430717, "loss": 0.2636, "step": 2402 }, { "epoch": 0.8873707533234859, "grad_norm": 0.27585646510124207, "learning_rate": 0.0001409286857987437, "loss": 0.2639, "step": 2403 }, { "epoch": 0.8877400295420975, "grad_norm": 0.32870951294898987, "learning_rate": 0.0001409040522231802, "loss": 0.3372, "step": 2404 }, { "epoch": 0.888109305760709, "grad_norm": 0.3076072037220001, "learning_rate": 0.0001408794186476167, "loss": 0.3156, "step": 2405 }, { "epoch": 0.8884785819793205, "grad_norm": 0.25015804171562195, "learning_rate": 0.0001408547850720532, "loss": 0.2378, "step": 2406 }, { "epoch": 0.8888478581979321, "grad_norm": 0.2584746479988098, "learning_rate": 0.00014083015149648972, "loss": 0.2819, "step": 2407 }, { "epoch": 0.8892171344165436, "grad_norm": 0.23811596632003784, "learning_rate": 0.00014080551792092623, "loss": 0.2912, "step": 2408 }, { "epoch": 0.8895864106351551, "grad_norm": 0.28469327092170715, "learning_rate": 0.00014078088434536274, "loss": 0.2826, "step": 2409 }, { "epoch": 0.8899556868537666, "grad_norm": 0.30144456028938293, "learning_rate": 0.00014075625076979923, "loss": 0.2995, "step": 2410 }, { "epoch": 0.8903249630723782, "grad_norm": 0.3043076992034912, "learning_rate": 0.00014073161719423575, "loss": 0.2927, "step": 2411 }, { "epoch": 0.8906942392909897, "grad_norm": 0.2810080051422119, "learning_rate": 0.00014070698361867226, "loss": 0.2659, "step": 2412 }, { "epoch": 0.8910635155096012, "grad_norm": 0.23271699249744415, "learning_rate": 0.00014068235004310878, "loss": 0.2476, "step": 2413 }, { "epoch": 0.8914327917282127, "grad_norm": 0.36389675736427307, "learning_rate": 0.00014065771646754526, "loss": 0.3012, "step": 2414 }, { "epoch": 0.8918020679468243, "grad_norm": 0.2137039452791214, "learning_rate": 0.00014063308289198178, "loss": 0.2447, "step": 2415 }, { "epoch": 0.8921713441654358, "grad_norm": 0.24990826845169067, "learning_rate": 0.00014060844931641827, "loss": 0.2819, "step": 2416 }, { "epoch": 0.8925406203840472, "grad_norm": 0.2792392075061798, "learning_rate": 0.0001405838157408548, "loss": 0.3149, "step": 2417 }, { "epoch": 0.8929098966026587, "grad_norm": 0.2583014965057373, "learning_rate": 0.0001405591821652913, "loss": 0.2634, "step": 2418 }, { "epoch": 0.8932791728212703, "grad_norm": 0.25006797909736633, "learning_rate": 0.0001405345485897278, "loss": 0.2905, "step": 2419 }, { "epoch": 0.8936484490398818, "grad_norm": 0.31017082929611206, "learning_rate": 0.0001405099150141643, "loss": 0.3216, "step": 2420 }, { "epoch": 0.8940177252584933, "grad_norm": 0.28957879543304443, "learning_rate": 0.0001404852814386008, "loss": 0.2999, "step": 2421 }, { "epoch": 0.8943870014771049, "grad_norm": 0.25751760601997375, "learning_rate": 0.00014046064786303733, "loss": 0.2544, "step": 2422 }, { "epoch": 0.8947562776957164, "grad_norm": 0.2919774353504181, "learning_rate": 0.00014043601428747384, "loss": 0.2958, "step": 2423 }, { "epoch": 0.8951255539143279, "grad_norm": 0.24311396479606628, "learning_rate": 0.00014041138071191033, "loss": 0.2683, "step": 2424 }, { "epoch": 0.8954948301329394, "grad_norm": 0.2541423439979553, "learning_rate": 0.00014038674713634684, "loss": 0.2942, "step": 2425 }, { "epoch": 0.895864106351551, "grad_norm": 0.21922168135643005, "learning_rate": 0.00014036211356078336, "loss": 0.2335, "step": 2426 }, { "epoch": 0.8962333825701625, "grad_norm": 0.24970941245555878, "learning_rate": 0.00014033747998521987, "loss": 0.2468, "step": 2427 }, { "epoch": 0.896602658788774, "grad_norm": 0.27785637974739075, "learning_rate": 0.00014031284640965636, "loss": 0.3133, "step": 2428 }, { "epoch": 0.8969719350073855, "grad_norm": 0.4019975960254669, "learning_rate": 0.00014028821283409287, "loss": 0.3573, "step": 2429 }, { "epoch": 0.8973412112259971, "grad_norm": 0.23545053601264954, "learning_rate": 0.0001402635792585294, "loss": 0.272, "step": 2430 }, { "epoch": 0.8977104874446086, "grad_norm": 0.29188770055770874, "learning_rate": 0.0001402389456829659, "loss": 0.2965, "step": 2431 }, { "epoch": 0.8980797636632201, "grad_norm": 0.29059723019599915, "learning_rate": 0.0001402143121074024, "loss": 0.2701, "step": 2432 }, { "epoch": 0.8984490398818316, "grad_norm": 0.2653946578502655, "learning_rate": 0.0001401896785318389, "loss": 0.299, "step": 2433 }, { "epoch": 0.8988183161004432, "grad_norm": 0.22633157670497894, "learning_rate": 0.0001401650449562754, "loss": 0.2622, "step": 2434 }, { "epoch": 0.8991875923190547, "grad_norm": 0.2481098473072052, "learning_rate": 0.00014014041138071194, "loss": 0.2414, "step": 2435 }, { "epoch": 0.8995568685376661, "grad_norm": 0.26378968358039856, "learning_rate": 0.00014011577780514842, "loss": 0.2943, "step": 2436 }, { "epoch": 0.8999261447562777, "grad_norm": 0.26425355672836304, "learning_rate": 0.00014009114422958494, "loss": 0.2669, "step": 2437 }, { "epoch": 0.9002954209748892, "grad_norm": 0.31967341899871826, "learning_rate": 0.00014006651065402143, "loss": 0.3514, "step": 2438 }, { "epoch": 0.9006646971935007, "grad_norm": 0.22327451407909393, "learning_rate": 0.00014004187707845794, "loss": 0.2261, "step": 2439 }, { "epoch": 0.9010339734121122, "grad_norm": 0.2849748432636261, "learning_rate": 0.00014001724350289445, "loss": 0.2701, "step": 2440 }, { "epoch": 0.9014032496307238, "grad_norm": 0.37810251116752625, "learning_rate": 0.00013999260992733097, "loss": 0.3405, "step": 2441 }, { "epoch": 0.9017725258493353, "grad_norm": 0.23618659377098083, "learning_rate": 0.00013996797635176746, "loss": 0.2422, "step": 2442 }, { "epoch": 0.9021418020679468, "grad_norm": 0.3084317445755005, "learning_rate": 0.00013994334277620397, "loss": 0.3264, "step": 2443 }, { "epoch": 0.9025110782865583, "grad_norm": 0.31723859906196594, "learning_rate": 0.00013991870920064049, "loss": 0.3861, "step": 2444 }, { "epoch": 0.9028803545051699, "grad_norm": 0.281729519367218, "learning_rate": 0.000139894075625077, "loss": 0.3022, "step": 2445 }, { "epoch": 0.9032496307237814, "grad_norm": 0.28756844997406006, "learning_rate": 0.0001398694420495135, "loss": 0.2643, "step": 2446 }, { "epoch": 0.9036189069423929, "grad_norm": 0.3584077060222626, "learning_rate": 0.00013984480847395, "loss": 0.3533, "step": 2447 }, { "epoch": 0.9039881831610044, "grad_norm": 0.258378267288208, "learning_rate": 0.0001398201748983865, "loss": 0.3143, "step": 2448 }, { "epoch": 0.904357459379616, "grad_norm": 0.25656944513320923, "learning_rate": 0.00013979554132282303, "loss": 0.2575, "step": 2449 }, { "epoch": 0.9047267355982275, "grad_norm": 0.25176844000816345, "learning_rate": 0.00013977090774725952, "loss": 0.3463, "step": 2450 }, { "epoch": 0.9047267355982275, "eval_loss": 0.2817683219909668, "eval_runtime": 5.8463, "eval_samples_per_second": 8.552, "eval_steps_per_second": 1.197, "step": 2450 }, { "epoch": 0.905096011816839, "grad_norm": 0.3466639816761017, "learning_rate": 0.00013974627417169603, "loss": 0.3071, "step": 2451 }, { "epoch": 0.9054652880354506, "grad_norm": 0.2617734670639038, "learning_rate": 0.00013972164059613252, "loss": 0.33, "step": 2452 }, { "epoch": 0.9058345642540621, "grad_norm": 0.24130357801914215, "learning_rate": 0.00013969700702056904, "loss": 0.2765, "step": 2453 }, { "epoch": 0.9062038404726735, "grad_norm": 0.37545138597488403, "learning_rate": 0.00013967237344500555, "loss": 0.3411, "step": 2454 }, { "epoch": 0.906573116691285, "grad_norm": 0.23208720982074738, "learning_rate": 0.00013964773986944207, "loss": 0.234, "step": 2455 }, { "epoch": 0.9069423929098966, "grad_norm": 0.2672305405139923, "learning_rate": 0.00013962310629387855, "loss": 0.3081, "step": 2456 }, { "epoch": 0.9073116691285081, "grad_norm": 0.23274806141853333, "learning_rate": 0.00013959847271831507, "loss": 0.2658, "step": 2457 }, { "epoch": 0.9076809453471196, "grad_norm": 0.3449879288673401, "learning_rate": 0.00013957383914275158, "loss": 0.3397, "step": 2458 }, { "epoch": 0.9080502215657311, "grad_norm": 0.30551835894584656, "learning_rate": 0.0001395492055671881, "loss": 0.3049, "step": 2459 }, { "epoch": 0.9084194977843427, "grad_norm": 0.2604008913040161, "learning_rate": 0.00013952457199162458, "loss": 0.273, "step": 2460 }, { "epoch": 0.9087887740029542, "grad_norm": 0.31938475370407104, "learning_rate": 0.0001394999384160611, "loss": 0.3564, "step": 2461 }, { "epoch": 0.9091580502215657, "grad_norm": 0.24833115935325623, "learning_rate": 0.0001394753048404976, "loss": 0.2765, "step": 2462 }, { "epoch": 0.9095273264401772, "grad_norm": 0.26162466406822205, "learning_rate": 0.00013945067126493413, "loss": 0.3201, "step": 2463 }, { "epoch": 0.9098966026587888, "grad_norm": 0.3023029565811157, "learning_rate": 0.00013942603768937062, "loss": 0.2832, "step": 2464 }, { "epoch": 0.9102658788774003, "grad_norm": 0.3238774836063385, "learning_rate": 0.00013940140411380713, "loss": 0.2566, "step": 2465 }, { "epoch": 0.9106351550960118, "grad_norm": 0.3158404529094696, "learning_rate": 0.00013937677053824362, "loss": 0.4155, "step": 2466 }, { "epoch": 0.9110044313146234, "grad_norm": 0.3102486729621887, "learning_rate": 0.00013935213696268013, "loss": 0.3817, "step": 2467 }, { "epoch": 0.9113737075332349, "grad_norm": 0.2413311004638672, "learning_rate": 0.00013932750338711665, "loss": 0.246, "step": 2468 }, { "epoch": 0.9117429837518464, "grad_norm": 0.28252607583999634, "learning_rate": 0.00013930286981155316, "loss": 0.2975, "step": 2469 }, { "epoch": 0.9121122599704579, "grad_norm": 0.264702707529068, "learning_rate": 0.00013927823623598965, "loss": 0.31, "step": 2470 }, { "epoch": 0.9124815361890695, "grad_norm": 0.31399446725845337, "learning_rate": 0.00013925360266042616, "loss": 0.2865, "step": 2471 }, { "epoch": 0.912850812407681, "grad_norm": 0.337719589471817, "learning_rate": 0.00013922896908486268, "loss": 0.3609, "step": 2472 }, { "epoch": 0.9132200886262924, "grad_norm": 0.3153814673423767, "learning_rate": 0.0001392043355092992, "loss": 0.3028, "step": 2473 }, { "epoch": 0.9135893648449039, "grad_norm": 0.30539634823799133, "learning_rate": 0.00013917970193373568, "loss": 0.3257, "step": 2474 }, { "epoch": 0.9139586410635155, "grad_norm": 0.25480708479881287, "learning_rate": 0.0001391550683581722, "loss": 0.3038, "step": 2475 }, { "epoch": 0.914327917282127, "grad_norm": 0.2393716275691986, "learning_rate": 0.0001391304347826087, "loss": 0.2296, "step": 2476 }, { "epoch": 0.9146971935007385, "grad_norm": 0.245378315448761, "learning_rate": 0.00013910580120704523, "loss": 0.2824, "step": 2477 }, { "epoch": 0.9150664697193501, "grad_norm": 0.3395783603191376, "learning_rate": 0.0001390811676314817, "loss": 0.4217, "step": 2478 }, { "epoch": 0.9154357459379616, "grad_norm": 0.25693923234939575, "learning_rate": 0.00013905653405591823, "loss": 0.3075, "step": 2479 }, { "epoch": 0.9158050221565731, "grad_norm": 0.2574548125267029, "learning_rate": 0.00013903190048035471, "loss": 0.2576, "step": 2480 }, { "epoch": 0.9161742983751846, "grad_norm": 0.22771596908569336, "learning_rate": 0.00013900726690479126, "loss": 0.2517, "step": 2481 }, { "epoch": 0.9165435745937962, "grad_norm": 0.27070626616477966, "learning_rate": 0.00013898263332922774, "loss": 0.3002, "step": 2482 }, { "epoch": 0.9169128508124077, "grad_norm": 0.2886691093444824, "learning_rate": 0.00013895799975366426, "loss": 0.2792, "step": 2483 }, { "epoch": 0.9172821270310192, "grad_norm": 0.2978097200393677, "learning_rate": 0.00013893336617810075, "loss": 0.3101, "step": 2484 }, { "epoch": 0.9176514032496307, "grad_norm": 0.33635851740837097, "learning_rate": 0.00013890873260253726, "loss": 0.3293, "step": 2485 }, { "epoch": 0.9180206794682423, "grad_norm": 0.31264153122901917, "learning_rate": 0.00013888409902697378, "loss": 0.3546, "step": 2486 }, { "epoch": 0.9183899556868538, "grad_norm": 0.25478553771972656, "learning_rate": 0.0001388594654514103, "loss": 0.2992, "step": 2487 }, { "epoch": 0.9187592319054653, "grad_norm": 0.23154157400131226, "learning_rate": 0.00013883483187584678, "loss": 0.2276, "step": 2488 }, { "epoch": 0.9191285081240768, "grad_norm": 0.2702298164367676, "learning_rate": 0.0001388101983002833, "loss": 0.3144, "step": 2489 }, { "epoch": 0.9194977843426884, "grad_norm": 0.2805127203464508, "learning_rate": 0.0001387855647247198, "loss": 0.3173, "step": 2490 }, { "epoch": 0.9198670605612999, "grad_norm": 0.3024933934211731, "learning_rate": 0.00013876093114915632, "loss": 0.2916, "step": 2491 }, { "epoch": 0.9202363367799113, "grad_norm": 0.24952004849910736, "learning_rate": 0.0001387362975735928, "loss": 0.2645, "step": 2492 }, { "epoch": 0.920605612998523, "grad_norm": 0.34727632999420166, "learning_rate": 0.00013871166399802932, "loss": 0.3491, "step": 2493 }, { "epoch": 0.9209748892171344, "grad_norm": 0.3407987058162689, "learning_rate": 0.0001386870304224658, "loss": 0.3503, "step": 2494 }, { "epoch": 0.9213441654357459, "grad_norm": 0.2808375954627991, "learning_rate": 0.00013866239684690235, "loss": 0.3076, "step": 2495 }, { "epoch": 0.9217134416543574, "grad_norm": 0.28790482878685, "learning_rate": 0.00013863776327133884, "loss": 0.313, "step": 2496 }, { "epoch": 0.922082717872969, "grad_norm": 0.24627193808555603, "learning_rate": 0.00013861312969577536, "loss": 0.2657, "step": 2497 }, { "epoch": 0.9224519940915805, "grad_norm": 0.3345617651939392, "learning_rate": 0.00013858849612021184, "loss": 0.323, "step": 2498 }, { "epoch": 0.922821270310192, "grad_norm": 0.28228136897087097, "learning_rate": 0.00013856386254464836, "loss": 0.3108, "step": 2499 }, { "epoch": 0.9231905465288035, "grad_norm": 0.23251014947891235, "learning_rate": 0.00013853922896908487, "loss": 0.2689, "step": 2500 }, { "epoch": 0.9231905465288035, "eval_loss": 0.2845548391342163, "eval_runtime": 5.8619, "eval_samples_per_second": 8.53, "eval_steps_per_second": 1.194, "step": 2500 }, { "epoch": 0.9235598227474151, "grad_norm": 0.33126166462898254, "learning_rate": 0.0001385145953935214, "loss": 0.3824, "step": 2501 }, { "epoch": 0.9239290989660266, "grad_norm": 0.2767648994922638, "learning_rate": 0.00013848996181795787, "loss": 0.3169, "step": 2502 }, { "epoch": 0.9242983751846381, "grad_norm": 0.2855761647224426, "learning_rate": 0.0001384653282423944, "loss": 0.3259, "step": 2503 }, { "epoch": 0.9246676514032496, "grad_norm": 0.22694745659828186, "learning_rate": 0.0001384406946668309, "loss": 0.258, "step": 2504 }, { "epoch": 0.9250369276218612, "grad_norm": 0.27971693873405457, "learning_rate": 0.00013841606109126742, "loss": 0.3251, "step": 2505 }, { "epoch": 0.9254062038404727, "grad_norm": 0.21657156944274902, "learning_rate": 0.0001383914275157039, "loss": 0.272, "step": 2506 }, { "epoch": 0.9257754800590842, "grad_norm": 0.2831554114818573, "learning_rate": 0.00013836679394014042, "loss": 0.2946, "step": 2507 }, { "epoch": 0.9261447562776958, "grad_norm": 0.258457213640213, "learning_rate": 0.00013834216036457694, "loss": 0.2844, "step": 2508 }, { "epoch": 0.9265140324963073, "grad_norm": 0.25451990962028503, "learning_rate": 0.00013831752678901345, "loss": 0.2208, "step": 2509 }, { "epoch": 0.9268833087149188, "grad_norm": 0.29237785935401917, "learning_rate": 0.00013829289321344994, "loss": 0.3336, "step": 2510 }, { "epoch": 0.9272525849335302, "grad_norm": 0.3132020831108093, "learning_rate": 0.00013826825963788645, "loss": 0.2769, "step": 2511 }, { "epoch": 0.9276218611521418, "grad_norm": 0.3423916697502136, "learning_rate": 0.00013824362606232294, "loss": 0.3043, "step": 2512 }, { "epoch": 0.9279911373707533, "grad_norm": 0.2983170449733734, "learning_rate": 0.00013821899248675948, "loss": 0.3227, "step": 2513 }, { "epoch": 0.9283604135893648, "grad_norm": 0.23354722559452057, "learning_rate": 0.00013819435891119597, "loss": 0.2706, "step": 2514 }, { "epoch": 0.9287296898079763, "grad_norm": 0.30000245571136475, "learning_rate": 0.00013816972533563248, "loss": 0.3278, "step": 2515 }, { "epoch": 0.9290989660265879, "grad_norm": 0.3025997579097748, "learning_rate": 0.00013814509176006897, "loss": 0.284, "step": 2516 }, { "epoch": 0.9294682422451994, "grad_norm": 0.24907605350017548, "learning_rate": 0.00013812045818450549, "loss": 0.2629, "step": 2517 }, { "epoch": 0.9298375184638109, "grad_norm": 0.29721975326538086, "learning_rate": 0.000138095824608942, "loss": 0.2942, "step": 2518 }, { "epoch": 0.9302067946824224, "grad_norm": 0.31587401032447815, "learning_rate": 0.00013807119103337851, "loss": 0.251, "step": 2519 }, { "epoch": 0.930576070901034, "grad_norm": 0.2655812203884125, "learning_rate": 0.000138046557457815, "loss": 0.2574, "step": 2520 }, { "epoch": 0.9309453471196455, "grad_norm": 0.2984577715396881, "learning_rate": 0.00013802192388225152, "loss": 0.3091, "step": 2521 }, { "epoch": 0.931314623338257, "grad_norm": 0.280851274728775, "learning_rate": 0.00013799729030668803, "loss": 0.2661, "step": 2522 }, { "epoch": 0.9316838995568686, "grad_norm": 0.3139093220233917, "learning_rate": 0.00013797265673112455, "loss": 0.3495, "step": 2523 }, { "epoch": 0.9320531757754801, "grad_norm": 0.25780218839645386, "learning_rate": 0.00013794802315556103, "loss": 0.2793, "step": 2524 }, { "epoch": 0.9324224519940916, "grad_norm": 0.37408342957496643, "learning_rate": 0.00013792338957999755, "loss": 0.2843, "step": 2525 }, { "epoch": 0.9327917282127031, "grad_norm": 0.23762372136116028, "learning_rate": 0.00013789875600443404, "loss": 0.231, "step": 2526 }, { "epoch": 0.9331610044313147, "grad_norm": 0.2505086362361908, "learning_rate": 0.00013787412242887058, "loss": 0.2322, "step": 2527 }, { "epoch": 0.9335302806499262, "grad_norm": 0.2948438227176666, "learning_rate": 0.00013784948885330707, "loss": 0.3642, "step": 2528 }, { "epoch": 0.9338995568685377, "grad_norm": 0.3976789712905884, "learning_rate": 0.00013782485527774358, "loss": 0.3192, "step": 2529 }, { "epoch": 0.9342688330871491, "grad_norm": 0.2831787168979645, "learning_rate": 0.00013780022170218007, "loss": 0.2827, "step": 2530 }, { "epoch": 0.9346381093057607, "grad_norm": 0.26242488622665405, "learning_rate": 0.00013777558812661658, "loss": 0.272, "step": 2531 }, { "epoch": 0.9350073855243722, "grad_norm": 0.26361218094825745, "learning_rate": 0.0001377509545510531, "loss": 0.2967, "step": 2532 }, { "epoch": 0.9353766617429837, "grad_norm": 0.23245660960674286, "learning_rate": 0.0001377263209754896, "loss": 0.2221, "step": 2533 }, { "epoch": 0.9357459379615952, "grad_norm": 0.2824196517467499, "learning_rate": 0.0001377016873999261, "loss": 0.2967, "step": 2534 }, { "epoch": 0.9361152141802068, "grad_norm": 0.2880588471889496, "learning_rate": 0.0001376770538243626, "loss": 0.3025, "step": 2535 }, { "epoch": 0.9364844903988183, "grad_norm": 0.20747391879558563, "learning_rate": 0.00013765242024879913, "loss": 0.2493, "step": 2536 }, { "epoch": 0.9368537666174298, "grad_norm": 0.3134663701057434, "learning_rate": 0.00013762778667323564, "loss": 0.3315, "step": 2537 }, { "epoch": 0.9372230428360414, "grad_norm": 0.27924367785453796, "learning_rate": 0.00013760315309767213, "loss": 0.2743, "step": 2538 }, { "epoch": 0.9375923190546529, "grad_norm": 0.26552537083625793, "learning_rate": 0.00013757851952210864, "loss": 0.2625, "step": 2539 }, { "epoch": 0.9379615952732644, "grad_norm": 0.3043096959590912, "learning_rate": 0.00013755388594654516, "loss": 0.377, "step": 2540 }, { "epoch": 0.9383308714918759, "grad_norm": 0.26502060890197754, "learning_rate": 0.00013752925237098167, "loss": 0.3197, "step": 2541 }, { "epoch": 0.9387001477104875, "grad_norm": 0.2906153202056885, "learning_rate": 0.00013750461879541816, "loss": 0.2946, "step": 2542 }, { "epoch": 0.939069423929099, "grad_norm": 0.25271281599998474, "learning_rate": 0.00013747998521985468, "loss": 0.2851, "step": 2543 }, { "epoch": 0.9394387001477105, "grad_norm": 0.328204482793808, "learning_rate": 0.00013745535164429116, "loss": 0.3346, "step": 2544 }, { "epoch": 0.939807976366322, "grad_norm": 0.2681902050971985, "learning_rate": 0.0001374307180687277, "loss": 0.3299, "step": 2545 }, { "epoch": 0.9401772525849336, "grad_norm": 0.2553021311759949, "learning_rate": 0.0001374060844931642, "loss": 0.3179, "step": 2546 }, { "epoch": 0.9405465288035451, "grad_norm": 0.28603696823120117, "learning_rate": 0.0001373814509176007, "loss": 0.3275, "step": 2547 }, { "epoch": 0.9409158050221565, "grad_norm": 0.29382210969924927, "learning_rate": 0.0001373568173420372, "loss": 0.3132, "step": 2548 }, { "epoch": 0.941285081240768, "grad_norm": 0.29052960872650146, "learning_rate": 0.0001373321837664737, "loss": 0.324, "step": 2549 }, { "epoch": 0.9416543574593796, "grad_norm": 0.284537136554718, "learning_rate": 0.00013730755019091022, "loss": 0.3597, "step": 2550 }, { "epoch": 0.9416543574593796, "eval_loss": 0.2878875434398651, "eval_runtime": 5.8513, "eval_samples_per_second": 8.545, "eval_steps_per_second": 1.196, "step": 2550 }, { "epoch": 0.9420236336779911, "grad_norm": 0.2447899580001831, "learning_rate": 0.00013728291661534674, "loss": 0.244, "step": 2551 }, { "epoch": 0.9423929098966026, "grad_norm": 0.2734544277191162, "learning_rate": 0.00013725828303978323, "loss": 0.2923, "step": 2552 }, { "epoch": 0.9427621861152142, "grad_norm": 0.26823151111602783, "learning_rate": 0.00013723364946421974, "loss": 0.3097, "step": 2553 }, { "epoch": 0.9431314623338257, "grad_norm": 0.21306352317333221, "learning_rate": 0.00013720901588865626, "loss": 0.2454, "step": 2554 }, { "epoch": 0.9435007385524372, "grad_norm": 0.23448294401168823, "learning_rate": 0.00013718438231309277, "loss": 0.3058, "step": 2555 }, { "epoch": 0.9438700147710487, "grad_norm": 0.2050745040178299, "learning_rate": 0.00013715974873752926, "loss": 0.231, "step": 2556 }, { "epoch": 0.9442392909896603, "grad_norm": 0.28681740164756775, "learning_rate": 0.00013713511516196575, "loss": 0.32, "step": 2557 }, { "epoch": 0.9446085672082718, "grad_norm": 0.3256187438964844, "learning_rate": 0.00013711048158640226, "loss": 0.3114, "step": 2558 }, { "epoch": 0.9449778434268833, "grad_norm": 0.4193253517150879, "learning_rate": 0.00013708584801083878, "loss": 0.4179, "step": 2559 }, { "epoch": 0.9453471196454948, "grad_norm": 0.20358948409557343, "learning_rate": 0.0001370612144352753, "loss": 0.2017, "step": 2560 }, { "epoch": 0.9457163958641064, "grad_norm": 0.25791874527931213, "learning_rate": 0.00013703658085971178, "loss": 0.2399, "step": 2561 }, { "epoch": 0.9460856720827179, "grad_norm": 0.21454410254955292, "learning_rate": 0.0001370119472841483, "loss": 0.2495, "step": 2562 }, { "epoch": 0.9464549483013294, "grad_norm": 0.28488028049468994, "learning_rate": 0.0001369873137085848, "loss": 0.351, "step": 2563 }, { "epoch": 0.946824224519941, "grad_norm": 0.33706241846084595, "learning_rate": 0.00013696268013302132, "loss": 0.3406, "step": 2564 }, { "epoch": 0.9471935007385525, "grad_norm": 0.27496975660324097, "learning_rate": 0.0001369380465574578, "loss": 0.2703, "step": 2565 }, { "epoch": 0.947562776957164, "grad_norm": 0.28031080961227417, "learning_rate": 0.00013691341298189432, "loss": 0.2589, "step": 2566 }, { "epoch": 0.9479320531757754, "grad_norm": 0.3087734580039978, "learning_rate": 0.00013688877940633084, "loss": 0.3214, "step": 2567 }, { "epoch": 0.948301329394387, "grad_norm": 0.20123820006847382, "learning_rate": 0.00013686414583076735, "loss": 0.2379, "step": 2568 }, { "epoch": 0.9486706056129985, "grad_norm": 0.26751509308815, "learning_rate": 0.00013683951225520384, "loss": 0.3167, "step": 2569 }, { "epoch": 0.94903988183161, "grad_norm": 0.35837164521217346, "learning_rate": 0.00013681487867964035, "loss": 0.2686, "step": 2570 }, { "epoch": 0.9494091580502215, "grad_norm": 0.36316734552383423, "learning_rate": 0.00013679024510407684, "loss": 0.3615, "step": 2571 }, { "epoch": 0.9497784342688331, "grad_norm": 0.22303825616836548, "learning_rate": 0.00013676561152851338, "loss": 0.245, "step": 2572 }, { "epoch": 0.9501477104874446, "grad_norm": 0.3745732605457306, "learning_rate": 0.00013674097795294987, "loss": 0.3863, "step": 2573 }, { "epoch": 0.9505169867060561, "grad_norm": 0.2717512547969818, "learning_rate": 0.00013671634437738639, "loss": 0.2673, "step": 2574 }, { "epoch": 0.9508862629246676, "grad_norm": 0.27752625942230225, "learning_rate": 0.00013669171080182287, "loss": 0.277, "step": 2575 }, { "epoch": 0.9512555391432792, "grad_norm": 0.2683650255203247, "learning_rate": 0.0001366670772262594, "loss": 0.3122, "step": 2576 }, { "epoch": 0.9516248153618907, "grad_norm": 0.22293895483016968, "learning_rate": 0.0001366424436506959, "loss": 0.2263, "step": 2577 }, { "epoch": 0.9519940915805022, "grad_norm": 0.3275323808193207, "learning_rate": 0.00013661781007513242, "loss": 0.3681, "step": 2578 }, { "epoch": 0.9523633677991138, "grad_norm": 0.29128924012184143, "learning_rate": 0.0001365931764995689, "loss": 0.3474, "step": 2579 }, { "epoch": 0.9527326440177253, "grad_norm": 0.2997452914714813, "learning_rate": 0.00013656854292400542, "loss": 0.2639, "step": 2580 }, { "epoch": 0.9531019202363368, "grad_norm": 0.30281564593315125, "learning_rate": 0.00013654390934844193, "loss": 0.297, "step": 2581 }, { "epoch": 0.9534711964549483, "grad_norm": 0.23481415212154388, "learning_rate": 0.00013651927577287845, "loss": 0.2492, "step": 2582 }, { "epoch": 0.9538404726735599, "grad_norm": 0.28554078936576843, "learning_rate": 0.00013649464219731494, "loss": 0.2916, "step": 2583 }, { "epoch": 0.9542097488921714, "grad_norm": 0.27036115527153015, "learning_rate": 0.00013647000862175145, "loss": 0.2819, "step": 2584 }, { "epoch": 0.9545790251107829, "grad_norm": 0.26153528690338135, "learning_rate": 0.00013644537504618794, "loss": 0.2842, "step": 2585 }, { "epoch": 0.9549483013293943, "grad_norm": 0.436210960149765, "learning_rate": 0.00013642074147062448, "loss": 0.2785, "step": 2586 }, { "epoch": 0.955317577548006, "grad_norm": 0.25508248805999756, "learning_rate": 0.00013639610789506097, "loss": 0.2806, "step": 2587 }, { "epoch": 0.9556868537666174, "grad_norm": 0.2849658131599426, "learning_rate": 0.00013637147431949748, "loss": 0.2585, "step": 2588 }, { "epoch": 0.9560561299852289, "grad_norm": 0.336910605430603, "learning_rate": 0.00013634684074393397, "loss": 0.3214, "step": 2589 }, { "epoch": 0.9564254062038404, "grad_norm": 0.29144784808158875, "learning_rate": 0.00013632220716837049, "loss": 0.2821, "step": 2590 }, { "epoch": 0.956794682422452, "grad_norm": 0.2194124311208725, "learning_rate": 0.000136297573592807, "loss": 0.2094, "step": 2591 }, { "epoch": 0.9571639586410635, "grad_norm": 0.296017050743103, "learning_rate": 0.00013627294001724351, "loss": 0.2934, "step": 2592 }, { "epoch": 0.957533234859675, "grad_norm": 0.28335466980934143, "learning_rate": 0.00013624830644168, "loss": 0.3347, "step": 2593 }, { "epoch": 0.9579025110782866, "grad_norm": 0.27571043372154236, "learning_rate": 0.00013622367286611652, "loss": 0.2478, "step": 2594 }, { "epoch": 0.9582717872968981, "grad_norm": 0.24105633795261383, "learning_rate": 0.00013619903929055303, "loss": 0.2344, "step": 2595 }, { "epoch": 0.9586410635155096, "grad_norm": 0.30373385548591614, "learning_rate": 0.00013617440571498955, "loss": 0.3435, "step": 2596 }, { "epoch": 0.9590103397341211, "grad_norm": 0.2644283175468445, "learning_rate": 0.00013614977213942603, "loss": 0.2687, "step": 2597 }, { "epoch": 0.9593796159527327, "grad_norm": 0.33212679624557495, "learning_rate": 0.00013612513856386255, "loss": 0.3084, "step": 2598 }, { "epoch": 0.9597488921713442, "grad_norm": 0.29163432121276855, "learning_rate": 0.00013610050498829906, "loss": 0.3383, "step": 2599 }, { "epoch": 0.9601181683899557, "grad_norm": 0.31650787591934204, "learning_rate": 0.00013607587141273558, "loss": 0.3769, "step": 2600 }, { "epoch": 0.9601181683899557, "eval_loss": 0.279488205909729, "eval_runtime": 5.8583, "eval_samples_per_second": 8.535, "eval_steps_per_second": 1.195, "step": 2600 }, { "epoch": 0.9604874446085672, "grad_norm": 0.33627936244010925, "learning_rate": 0.00013605123783717206, "loss": 0.3784, "step": 2601 }, { "epoch": 0.9608567208271788, "grad_norm": 0.3369787037372589, "learning_rate": 0.00013602660426160858, "loss": 0.3478, "step": 2602 }, { "epoch": 0.9612259970457903, "grad_norm": 0.266859233379364, "learning_rate": 0.00013600197068604507, "loss": 0.2878, "step": 2603 }, { "epoch": 0.9615952732644018, "grad_norm": 0.28313395380973816, "learning_rate": 0.0001359773371104816, "loss": 0.2815, "step": 2604 }, { "epoch": 0.9619645494830132, "grad_norm": 0.2932313084602356, "learning_rate": 0.0001359527035349181, "loss": 0.281, "step": 2605 }, { "epoch": 0.9623338257016248, "grad_norm": 0.2748316526412964, "learning_rate": 0.0001359280699593546, "loss": 0.3368, "step": 2606 }, { "epoch": 0.9627031019202363, "grad_norm": 0.24314384162425995, "learning_rate": 0.0001359034363837911, "loss": 0.2786, "step": 2607 }, { "epoch": 0.9630723781388478, "grad_norm": 0.2865593433380127, "learning_rate": 0.0001358788028082276, "loss": 0.3516, "step": 2608 }, { "epoch": 0.9634416543574594, "grad_norm": 0.275774210691452, "learning_rate": 0.00013585416923266413, "loss": 0.2796, "step": 2609 }, { "epoch": 0.9638109305760709, "grad_norm": 0.27282050251960754, "learning_rate": 0.00013582953565710064, "loss": 0.2952, "step": 2610 }, { "epoch": 0.9641802067946824, "grad_norm": 0.2969392240047455, "learning_rate": 0.00013580490208153713, "loss": 0.316, "step": 2611 }, { "epoch": 0.9645494830132939, "grad_norm": 0.2910168170928955, "learning_rate": 0.00013578026850597364, "loss": 0.2658, "step": 2612 }, { "epoch": 0.9649187592319055, "grad_norm": 0.2322939783334732, "learning_rate": 0.00013575563493041016, "loss": 0.2802, "step": 2613 }, { "epoch": 0.965288035450517, "grad_norm": 0.3182610869407654, "learning_rate": 0.00013573100135484667, "loss": 0.3148, "step": 2614 }, { "epoch": 0.9656573116691285, "grad_norm": 0.2963951528072357, "learning_rate": 0.00013570636777928316, "loss": 0.3067, "step": 2615 }, { "epoch": 0.96602658788774, "grad_norm": 0.27038127183914185, "learning_rate": 0.00013568173420371968, "loss": 0.3095, "step": 2616 }, { "epoch": 0.9663958641063516, "grad_norm": 0.23234814405441284, "learning_rate": 0.00013565710062815616, "loss": 0.3002, "step": 2617 }, { "epoch": 0.9667651403249631, "grad_norm": 0.3561446964740753, "learning_rate": 0.0001356324670525927, "loss": 0.372, "step": 2618 }, { "epoch": 0.9671344165435746, "grad_norm": 0.39320874214172363, "learning_rate": 0.0001356078334770292, "loss": 0.3569, "step": 2619 }, { "epoch": 0.9675036927621861, "grad_norm": 0.2911607325077057, "learning_rate": 0.0001355831999014657, "loss": 0.2821, "step": 2620 }, { "epoch": 0.9678729689807977, "grad_norm": 0.26914578676223755, "learning_rate": 0.0001355585663259022, "loss": 0.2815, "step": 2621 }, { "epoch": 0.9682422451994092, "grad_norm": 0.28479838371276855, "learning_rate": 0.0001355339327503387, "loss": 0.289, "step": 2622 }, { "epoch": 0.9686115214180206, "grad_norm": 0.25384166836738586, "learning_rate": 0.00013550929917477522, "loss": 0.2992, "step": 2623 }, { "epoch": 0.9689807976366323, "grad_norm": 0.2510720193386078, "learning_rate": 0.00013548466559921174, "loss": 0.3051, "step": 2624 }, { "epoch": 0.9693500738552437, "grad_norm": 0.26602286100387573, "learning_rate": 0.00013546003202364823, "loss": 0.2962, "step": 2625 }, { "epoch": 0.9697193500738552, "grad_norm": 0.2997296452522278, "learning_rate": 0.00013543539844808474, "loss": 0.3212, "step": 2626 }, { "epoch": 0.9700886262924667, "grad_norm": 0.236396923661232, "learning_rate": 0.00013541076487252126, "loss": 0.258, "step": 2627 }, { "epoch": 0.9704579025110783, "grad_norm": 0.2335156798362732, "learning_rate": 0.00013538613129695777, "loss": 0.2814, "step": 2628 }, { "epoch": 0.9708271787296898, "grad_norm": 0.27555787563323975, "learning_rate": 0.00013536149772139426, "loss": 0.2626, "step": 2629 }, { "epoch": 0.9711964549483013, "grad_norm": 0.2781428098678589, "learning_rate": 0.00013533686414583077, "loss": 0.2911, "step": 2630 }, { "epoch": 0.9715657311669128, "grad_norm": 0.3219570219516754, "learning_rate": 0.00013531223057026726, "loss": 0.3326, "step": 2631 }, { "epoch": 0.9719350073855244, "grad_norm": 0.2987314760684967, "learning_rate": 0.0001352875969947038, "loss": 0.3199, "step": 2632 }, { "epoch": 0.9723042836041359, "grad_norm": 0.34429308772087097, "learning_rate": 0.0001352629634191403, "loss": 0.3914, "step": 2633 }, { "epoch": 0.9726735598227474, "grad_norm": 0.31119295954704285, "learning_rate": 0.0001352383298435768, "loss": 0.3947, "step": 2634 }, { "epoch": 0.9730428360413589, "grad_norm": 0.3772371709346771, "learning_rate": 0.0001352136962680133, "loss": 0.351, "step": 2635 }, { "epoch": 0.9734121122599705, "grad_norm": 0.40727177262306213, "learning_rate": 0.0001351890626924498, "loss": 0.3305, "step": 2636 }, { "epoch": 0.973781388478582, "grad_norm": 0.29313406348228455, "learning_rate": 0.00013516442911688632, "loss": 0.2813, "step": 2637 }, { "epoch": 0.9741506646971935, "grad_norm": 0.25514623522758484, "learning_rate": 0.00013513979554132284, "loss": 0.27, "step": 2638 }, { "epoch": 0.9745199409158051, "grad_norm": 0.28271159529685974, "learning_rate": 0.00013511516196575932, "loss": 0.3027, "step": 2639 }, { "epoch": 0.9748892171344166, "grad_norm": 0.2886260151863098, "learning_rate": 0.00013509052839019584, "loss": 0.3226, "step": 2640 }, { "epoch": 0.975258493353028, "grad_norm": 0.25844526290893555, "learning_rate": 0.00013506589481463235, "loss": 0.3192, "step": 2641 }, { "epoch": 0.9756277695716395, "grad_norm": 0.278309166431427, "learning_rate": 0.00013504126123906887, "loss": 0.3334, "step": 2642 }, { "epoch": 0.9759970457902511, "grad_norm": 0.3024517893791199, "learning_rate": 0.00013501662766350535, "loss": 0.3601, "step": 2643 }, { "epoch": 0.9763663220088626, "grad_norm": 0.29122552275657654, "learning_rate": 0.00013499199408794187, "loss": 0.3112, "step": 2644 }, { "epoch": 0.9767355982274741, "grad_norm": 0.3468911945819855, "learning_rate": 0.00013496736051237838, "loss": 0.2789, "step": 2645 }, { "epoch": 0.9771048744460856, "grad_norm": 0.30402520298957825, "learning_rate": 0.0001349427269368149, "loss": 0.3215, "step": 2646 }, { "epoch": 0.9774741506646972, "grad_norm": 0.2312483787536621, "learning_rate": 0.00013491809336125139, "loss": 0.2599, "step": 2647 }, { "epoch": 0.9778434268833087, "grad_norm": 0.23150765895843506, "learning_rate": 0.0001348934597856879, "loss": 0.2457, "step": 2648 }, { "epoch": 0.9782127031019202, "grad_norm": 0.25944799184799194, "learning_rate": 0.0001348688262101244, "loss": 0.2969, "step": 2649 }, { "epoch": 0.9785819793205317, "grad_norm": 0.3412152826786041, "learning_rate": 0.00013484419263456093, "loss": 0.367, "step": 2650 }, { "epoch": 0.9785819793205317, "eval_loss": 0.28251588344573975, "eval_runtime": 5.8648, "eval_samples_per_second": 8.525, "eval_steps_per_second": 1.194, "step": 2650 }, { "epoch": 0.9789512555391433, "grad_norm": 0.29037943482398987, "learning_rate": 0.00013481955905899742, "loss": 0.2868, "step": 2651 }, { "epoch": 0.9793205317577548, "grad_norm": 0.33751970529556274, "learning_rate": 0.00013479492548343393, "loss": 0.3976, "step": 2652 }, { "epoch": 0.9796898079763663, "grad_norm": 0.2732155919075012, "learning_rate": 0.00013477029190787042, "loss": 0.299, "step": 2653 }, { "epoch": 0.9800590841949779, "grad_norm": 0.2407206892967224, "learning_rate": 0.00013474565833230693, "loss": 0.2598, "step": 2654 }, { "epoch": 0.9804283604135894, "grad_norm": 0.30155372619628906, "learning_rate": 0.00013472102475674345, "loss": 0.285, "step": 2655 }, { "epoch": 0.9807976366322009, "grad_norm": 0.24309593439102173, "learning_rate": 0.00013469639118117996, "loss": 0.2762, "step": 2656 }, { "epoch": 0.9811669128508124, "grad_norm": 0.25054389238357544, "learning_rate": 0.00013467175760561645, "loss": 0.2697, "step": 2657 }, { "epoch": 0.981536189069424, "grad_norm": 0.3400585353374481, "learning_rate": 0.00013464712403005297, "loss": 0.3121, "step": 2658 }, { "epoch": 0.9819054652880355, "grad_norm": 0.39103445410728455, "learning_rate": 0.00013462249045448948, "loss": 0.3605, "step": 2659 }, { "epoch": 0.982274741506647, "grad_norm": 0.30087360739707947, "learning_rate": 0.000134597856878926, "loss": 0.2957, "step": 2660 }, { "epoch": 0.9826440177252584, "grad_norm": 0.29284095764160156, "learning_rate": 0.00013457322330336248, "loss": 0.2918, "step": 2661 }, { "epoch": 0.98301329394387, "grad_norm": 0.2765386402606964, "learning_rate": 0.000134548589727799, "loss": 0.3137, "step": 2662 }, { "epoch": 0.9833825701624815, "grad_norm": 0.24506525695323944, "learning_rate": 0.00013452395615223548, "loss": 0.2809, "step": 2663 }, { "epoch": 0.983751846381093, "grad_norm": 0.25712794065475464, "learning_rate": 0.00013449932257667203, "loss": 0.2752, "step": 2664 }, { "epoch": 0.9841211225997046, "grad_norm": 0.3054530620574951, "learning_rate": 0.00013447468900110851, "loss": 0.2917, "step": 2665 }, { "epoch": 0.9844903988183161, "grad_norm": 0.2664269506931305, "learning_rate": 0.00013445005542554503, "loss": 0.2626, "step": 2666 }, { "epoch": 0.9848596750369276, "grad_norm": 0.29204151034355164, "learning_rate": 0.00013442542184998152, "loss": 0.3547, "step": 2667 }, { "epoch": 0.9852289512555391, "grad_norm": 0.26726964116096497, "learning_rate": 0.00013440078827441803, "loss": 0.2614, "step": 2668 }, { "epoch": 0.9855982274741507, "grad_norm": 0.3160478472709656, "learning_rate": 0.00013437615469885455, "loss": 0.2919, "step": 2669 }, { "epoch": 0.9859675036927622, "grad_norm": 0.2872214913368225, "learning_rate": 0.00013435152112329106, "loss": 0.321, "step": 2670 }, { "epoch": 0.9863367799113737, "grad_norm": 0.2202260047197342, "learning_rate": 0.00013432688754772755, "loss": 0.2732, "step": 2671 }, { "epoch": 0.9867060561299852, "grad_norm": 0.30726248025894165, "learning_rate": 0.00013430225397216406, "loss": 0.3071, "step": 2672 }, { "epoch": 0.9870753323485968, "grad_norm": 0.2842807471752167, "learning_rate": 0.00013427762039660058, "loss": 0.3329, "step": 2673 }, { "epoch": 0.9874446085672083, "grad_norm": 0.29567378759384155, "learning_rate": 0.0001342529868210371, "loss": 0.2524, "step": 2674 }, { "epoch": 0.9878138847858198, "grad_norm": 0.29411858320236206, "learning_rate": 0.00013422835324547358, "loss": 0.2778, "step": 2675 }, { "epoch": 0.9881831610044313, "grad_norm": 0.24854087829589844, "learning_rate": 0.0001342037196699101, "loss": 0.2744, "step": 2676 }, { "epoch": 0.9885524372230429, "grad_norm": 0.2956259846687317, "learning_rate": 0.0001341790860943466, "loss": 0.272, "step": 2677 }, { "epoch": 0.9889217134416544, "grad_norm": 0.2957102060317993, "learning_rate": 0.00013415445251878312, "loss": 0.3295, "step": 2678 }, { "epoch": 0.9892909896602659, "grad_norm": 0.31470787525177, "learning_rate": 0.0001341298189432196, "loss": 0.3134, "step": 2679 }, { "epoch": 0.9896602658788775, "grad_norm": 0.31523028016090393, "learning_rate": 0.00013410518536765612, "loss": 0.3438, "step": 2680 }, { "epoch": 0.990029542097489, "grad_norm": 0.2612340748310089, "learning_rate": 0.0001340805517920926, "loss": 0.2339, "step": 2681 }, { "epoch": 0.9903988183161004, "grad_norm": 0.26308852434158325, "learning_rate": 0.00013405591821652915, "loss": 0.2653, "step": 2682 }, { "epoch": 0.9907680945347119, "grad_norm": 0.2718425691127777, "learning_rate": 0.00013403128464096564, "loss": 0.3107, "step": 2683 }, { "epoch": 0.9911373707533235, "grad_norm": 0.33294862508773804, "learning_rate": 0.00013400665106540216, "loss": 0.324, "step": 2684 }, { "epoch": 0.991506646971935, "grad_norm": 0.3325161635875702, "learning_rate": 0.00013398201748983864, "loss": 0.3826, "step": 2685 }, { "epoch": 0.9918759231905465, "grad_norm": 0.2503698766231537, "learning_rate": 0.00013395738391427516, "loss": 0.3366, "step": 2686 }, { "epoch": 0.992245199409158, "grad_norm": 0.27409908175468445, "learning_rate": 0.00013393275033871167, "loss": 0.2913, "step": 2687 }, { "epoch": 0.9926144756277696, "grad_norm": 0.3058303892612457, "learning_rate": 0.0001339081167631482, "loss": 0.3015, "step": 2688 }, { "epoch": 0.9929837518463811, "grad_norm": 0.2649807333946228, "learning_rate": 0.00013388348318758468, "loss": 0.3243, "step": 2689 }, { "epoch": 0.9933530280649926, "grad_norm": 0.249671071767807, "learning_rate": 0.0001338588496120212, "loss": 0.291, "step": 2690 }, { "epoch": 0.9937223042836041, "grad_norm": 0.2473740130662918, "learning_rate": 0.0001338342160364577, "loss": 0.3107, "step": 2691 }, { "epoch": 0.9940915805022157, "grad_norm": 0.2776089310646057, "learning_rate": 0.00013380958246089422, "loss": 0.3164, "step": 2692 }, { "epoch": 0.9944608567208272, "grad_norm": 0.2375720590353012, "learning_rate": 0.0001337849488853307, "loss": 0.241, "step": 2693 }, { "epoch": 0.9948301329394387, "grad_norm": 0.24300681054592133, "learning_rate": 0.00013376031530976722, "loss": 0.2543, "step": 2694 }, { "epoch": 0.9951994091580503, "grad_norm": 0.2924548387527466, "learning_rate": 0.0001337356817342037, "loss": 0.3668, "step": 2695 }, { "epoch": 0.9955686853766618, "grad_norm": 0.24749398231506348, "learning_rate": 0.00013371104815864025, "loss": 0.2591, "step": 2696 }, { "epoch": 0.9959379615952733, "grad_norm": 0.24914845824241638, "learning_rate": 0.00013368641458307674, "loss": 0.2659, "step": 2697 }, { "epoch": 0.9963072378138847, "grad_norm": 0.27471911907196045, "learning_rate": 0.00013366178100751325, "loss": 0.2832, "step": 2698 }, { "epoch": 0.9966765140324964, "grad_norm": 0.27133798599243164, "learning_rate": 0.00013363714743194974, "loss": 0.2787, "step": 2699 }, { "epoch": 0.9970457902511078, "grad_norm": 0.24822653830051422, "learning_rate": 0.00013361251385638626, "loss": 0.2716, "step": 2700 }, { "epoch": 0.9970457902511078, "eval_loss": 0.274232417345047, "eval_runtime": 5.8623, "eval_samples_per_second": 8.529, "eval_steps_per_second": 1.194, "step": 2700 }, { "epoch": 0.9974150664697193, "grad_norm": 0.261340856552124, "learning_rate": 0.00013358788028082277, "loss": 0.2797, "step": 2701 }, { "epoch": 0.9977843426883308, "grad_norm": 0.2826089859008789, "learning_rate": 0.00013356324670525928, "loss": 0.3014, "step": 2702 }, { "epoch": 0.9981536189069424, "grad_norm": 0.3125585913658142, "learning_rate": 0.00013353861312969577, "loss": 0.3643, "step": 2703 }, { "epoch": 0.9985228951255539, "grad_norm": 0.3222525417804718, "learning_rate": 0.0001335139795541323, "loss": 0.3355, "step": 2704 }, { "epoch": 0.9988921713441654, "grad_norm": 0.2628626525402069, "learning_rate": 0.0001334893459785688, "loss": 0.2584, "step": 2705 }, { "epoch": 0.9992614475627769, "grad_norm": 0.28646916151046753, "learning_rate": 0.00013346471240300532, "loss": 0.288, "step": 2706 }, { "epoch": 0.9996307237813885, "grad_norm": 0.27381405234336853, "learning_rate": 0.0001334400788274418, "loss": 0.2406, "step": 2707 }, { "epoch": 1.0, "grad_norm": 0.41870248317718506, "learning_rate": 0.00013341544525187832, "loss": 0.3323, "step": 2708 }, { "epoch": 1.0003692762186116, "grad_norm": 0.2796136438846588, "learning_rate": 0.00013339081167631483, "loss": 0.2912, "step": 2709 }, { "epoch": 1.000738552437223, "grad_norm": 0.23714330792427063, "learning_rate": 0.00013336617810075135, "loss": 0.2289, "step": 2710 }, { "epoch": 1.0011078286558346, "grad_norm": 0.24866294860839844, "learning_rate": 0.00013334154452518783, "loss": 0.2356, "step": 2711 }, { "epoch": 1.0014771048744462, "grad_norm": 0.22556094825267792, "learning_rate": 0.00013331691094962435, "loss": 0.2262, "step": 2712 }, { "epoch": 1.0018463810930576, "grad_norm": 0.25270405411720276, "learning_rate": 0.00013329227737406084, "loss": 0.2228, "step": 2713 }, { "epoch": 1.0022156573116692, "grad_norm": 0.2538648247718811, "learning_rate": 0.00013326764379849738, "loss": 0.2222, "step": 2714 }, { "epoch": 1.0025849335302806, "grad_norm": 0.252612441778183, "learning_rate": 0.00013324301022293387, "loss": 0.2648, "step": 2715 }, { "epoch": 1.0029542097488922, "grad_norm": 0.1996837705373764, "learning_rate": 0.00013321837664737038, "loss": 0.2038, "step": 2716 }, { "epoch": 1.0033234859675038, "grad_norm": 0.2726893424987793, "learning_rate": 0.00013319374307180687, "loss": 0.2449, "step": 2717 }, { "epoch": 1.0036927621861151, "grad_norm": 0.2307356745004654, "learning_rate": 0.00013316910949624338, "loss": 0.2301, "step": 2718 }, { "epoch": 1.0040620384047267, "grad_norm": 0.3182328939437866, "learning_rate": 0.0001331444759206799, "loss": 0.252, "step": 2719 }, { "epoch": 1.0044313146233383, "grad_norm": 0.2076626867055893, "learning_rate": 0.0001331198423451164, "loss": 0.2127, "step": 2720 }, { "epoch": 1.0048005908419497, "grad_norm": 0.289315402507782, "learning_rate": 0.0001330952087695529, "loss": 0.2599, "step": 2721 }, { "epoch": 1.0051698670605613, "grad_norm": 0.24626076221466064, "learning_rate": 0.00013307057519398941, "loss": 0.2289, "step": 2722 }, { "epoch": 1.005539143279173, "grad_norm": 0.25126105546951294, "learning_rate": 0.00013304594161842593, "loss": 0.1995, "step": 2723 }, { "epoch": 1.0059084194977843, "grad_norm": 0.3186296224594116, "learning_rate": 0.00013302130804286244, "loss": 0.2562, "step": 2724 }, { "epoch": 1.006277695716396, "grad_norm": 0.26250606775283813, "learning_rate": 0.00013299667446729893, "loss": 0.2666, "step": 2725 }, { "epoch": 1.0066469719350073, "grad_norm": 0.4268067181110382, "learning_rate": 0.00013297204089173545, "loss": 0.1914, "step": 2726 }, { "epoch": 1.007016248153619, "grad_norm": 0.22976088523864746, "learning_rate": 0.00013294740731617193, "loss": 0.1905, "step": 2727 }, { "epoch": 1.0073855243722305, "grad_norm": 0.26273614168167114, "learning_rate": 0.00013292277374060848, "loss": 0.254, "step": 2728 }, { "epoch": 1.0077548005908419, "grad_norm": 0.34758633375167847, "learning_rate": 0.00013289814016504496, "loss": 0.2775, "step": 2729 }, { "epoch": 1.0081240768094535, "grad_norm": 0.2611452639102936, "learning_rate": 0.00013287350658948148, "loss": 0.256, "step": 2730 }, { "epoch": 1.008493353028065, "grad_norm": 0.3135962188243866, "learning_rate": 0.00013284887301391797, "loss": 0.3241, "step": 2731 }, { "epoch": 1.0088626292466765, "grad_norm": 0.26270318031311035, "learning_rate": 0.00013282423943835448, "loss": 0.2741, "step": 2732 }, { "epoch": 1.009231905465288, "grad_norm": 0.22408580780029297, "learning_rate": 0.000132799605862791, "loss": 0.24, "step": 2733 }, { "epoch": 1.0096011816838995, "grad_norm": 0.30195099115371704, "learning_rate": 0.0001327749722872275, "loss": 0.2597, "step": 2734 }, { "epoch": 1.009970457902511, "grad_norm": 0.2831840515136719, "learning_rate": 0.000132750338711664, "loss": 0.269, "step": 2735 }, { "epoch": 1.0103397341211227, "grad_norm": 0.2893206477165222, "learning_rate": 0.0001327257051361005, "loss": 0.2366, "step": 2736 }, { "epoch": 1.010709010339734, "grad_norm": 0.2479332834482193, "learning_rate": 0.00013270107156053703, "loss": 0.2529, "step": 2737 }, { "epoch": 1.0110782865583456, "grad_norm": 0.26697227358818054, "learning_rate": 0.00013267643798497354, "loss": 0.265, "step": 2738 }, { "epoch": 1.0114475627769572, "grad_norm": 0.2914988696575165, "learning_rate": 0.00013265180440941003, "loss": 0.2525, "step": 2739 }, { "epoch": 1.0118168389955686, "grad_norm": 0.2896738350391388, "learning_rate": 0.00013262717083384654, "loss": 0.2927, "step": 2740 }, { "epoch": 1.0121861152141802, "grad_norm": 0.2432498037815094, "learning_rate": 0.00013260253725828306, "loss": 0.2197, "step": 2741 }, { "epoch": 1.0125553914327918, "grad_norm": 0.2340407818555832, "learning_rate": 0.00013257790368271957, "loss": 0.2821, "step": 2742 }, { "epoch": 1.0129246676514032, "grad_norm": 0.3191532492637634, "learning_rate": 0.00013255327010715606, "loss": 0.2787, "step": 2743 }, { "epoch": 1.0132939438700148, "grad_norm": 0.2743033170700073, "learning_rate": 0.00013252863653159257, "loss": 0.2802, "step": 2744 }, { "epoch": 1.0136632200886262, "grad_norm": 0.30244073271751404, "learning_rate": 0.00013250400295602906, "loss": 0.2712, "step": 2745 }, { "epoch": 1.0140324963072378, "grad_norm": 0.2862381935119629, "learning_rate": 0.0001324793693804656, "loss": 0.2567, "step": 2746 }, { "epoch": 1.0144017725258494, "grad_norm": 0.21522857248783112, "learning_rate": 0.0001324547358049021, "loss": 0.2134, "step": 2747 }, { "epoch": 1.0147710487444608, "grad_norm": 0.23117266595363617, "learning_rate": 0.0001324301022293386, "loss": 0.2232, "step": 2748 }, { "epoch": 1.0151403249630724, "grad_norm": 0.2996717095375061, "learning_rate": 0.0001324054686537751, "loss": 0.2629, "step": 2749 }, { "epoch": 1.015509601181684, "grad_norm": 0.2513776421546936, "learning_rate": 0.0001323808350782116, "loss": 0.2296, "step": 2750 }, { "epoch": 1.015509601181684, "eval_loss": 0.2765809893608093, "eval_runtime": 5.8669, "eval_samples_per_second": 8.522, "eval_steps_per_second": 1.193, "step": 2750 }, { "epoch": 1.0158788774002954, "grad_norm": 0.22069954872131348, "learning_rate": 0.00013235620150264812, "loss": 0.1827, "step": 2751 }, { "epoch": 1.016248153618907, "grad_norm": 0.23865504562854767, "learning_rate": 0.00013233156792708464, "loss": 0.2429, "step": 2752 }, { "epoch": 1.0166174298375186, "grad_norm": 0.3293781578540802, "learning_rate": 0.00013230693435152112, "loss": 0.297, "step": 2753 }, { "epoch": 1.01698670605613, "grad_norm": 0.26011794805526733, "learning_rate": 0.00013228230077595764, "loss": 0.2156, "step": 2754 }, { "epoch": 1.0173559822747416, "grad_norm": 0.28752657771110535, "learning_rate": 0.00013225766720039415, "loss": 0.2522, "step": 2755 }, { "epoch": 1.017725258493353, "grad_norm": 0.2574158310890198, "learning_rate": 0.00013223303362483067, "loss": 0.2289, "step": 2756 }, { "epoch": 1.0180945347119645, "grad_norm": 0.2493678331375122, "learning_rate": 0.00013220840004926716, "loss": 0.22, "step": 2757 }, { "epoch": 1.0184638109305761, "grad_norm": 0.24139755964279175, "learning_rate": 0.00013218376647370367, "loss": 0.2285, "step": 2758 }, { "epoch": 1.0188330871491875, "grad_norm": 0.30356085300445557, "learning_rate": 0.00013215913289814016, "loss": 0.2515, "step": 2759 }, { "epoch": 1.0192023633677991, "grad_norm": 0.31113043427467346, "learning_rate": 0.0001321344993225767, "loss": 0.2994, "step": 2760 }, { "epoch": 1.0195716395864107, "grad_norm": 0.26762205362319946, "learning_rate": 0.0001321098657470132, "loss": 0.2276, "step": 2761 }, { "epoch": 1.019940915805022, "grad_norm": 0.24827049672603607, "learning_rate": 0.0001320852321714497, "loss": 0.2461, "step": 2762 }, { "epoch": 1.0203101920236337, "grad_norm": 0.23613737523555756, "learning_rate": 0.0001320605985958862, "loss": 0.2419, "step": 2763 }, { "epoch": 1.020679468242245, "grad_norm": 0.3247947692871094, "learning_rate": 0.0001320359650203227, "loss": 0.3048, "step": 2764 }, { "epoch": 1.0210487444608567, "grad_norm": 0.2768281102180481, "learning_rate": 0.00013201133144475922, "loss": 0.2571, "step": 2765 }, { "epoch": 1.0214180206794683, "grad_norm": 0.24561360478401184, "learning_rate": 0.00013198669786919573, "loss": 0.2267, "step": 2766 }, { "epoch": 1.0217872968980797, "grad_norm": 0.29054397344589233, "learning_rate": 0.00013196206429363222, "loss": 0.268, "step": 2767 }, { "epoch": 1.0221565731166913, "grad_norm": 0.28214576840400696, "learning_rate": 0.00013193743071806874, "loss": 0.2309, "step": 2768 }, { "epoch": 1.0225258493353029, "grad_norm": 0.28700947761535645, "learning_rate": 0.00013191279714250525, "loss": 0.2764, "step": 2769 }, { "epoch": 1.0228951255539143, "grad_norm": 0.3498358726501465, "learning_rate": 0.00013188816356694176, "loss": 0.3117, "step": 2770 }, { "epoch": 1.0232644017725259, "grad_norm": 0.2784065306186676, "learning_rate": 0.00013186352999137825, "loss": 0.2392, "step": 2771 }, { "epoch": 1.0236336779911375, "grad_norm": 0.2639756202697754, "learning_rate": 0.00013183889641581477, "loss": 0.2034, "step": 2772 }, { "epoch": 1.0240029542097489, "grad_norm": 0.25432780385017395, "learning_rate": 0.00013181426284025125, "loss": 0.257, "step": 2773 }, { "epoch": 1.0243722304283605, "grad_norm": 0.24631430208683014, "learning_rate": 0.0001317896292646878, "loss": 0.26, "step": 2774 }, { "epoch": 1.0247415066469718, "grad_norm": 0.23217861354351044, "learning_rate": 0.00013176499568912428, "loss": 0.209, "step": 2775 }, { "epoch": 1.0251107828655834, "grad_norm": 0.24988959729671478, "learning_rate": 0.0001317403621135608, "loss": 0.2473, "step": 2776 }, { "epoch": 1.025480059084195, "grad_norm": 0.2612614035606384, "learning_rate": 0.00013171572853799729, "loss": 0.231, "step": 2777 }, { "epoch": 1.0258493353028064, "grad_norm": 0.23968425393104553, "learning_rate": 0.0001316910949624338, "loss": 0.2407, "step": 2778 }, { "epoch": 1.026218611521418, "grad_norm": 0.22235450148582458, "learning_rate": 0.00013166646138687032, "loss": 0.1932, "step": 2779 }, { "epoch": 1.0265878877400296, "grad_norm": 0.24761974811553955, "learning_rate": 0.00013164182781130683, "loss": 0.2374, "step": 2780 }, { "epoch": 1.026957163958641, "grad_norm": 0.3479146659374237, "learning_rate": 0.00013161719423574332, "loss": 0.3226, "step": 2781 }, { "epoch": 1.0273264401772526, "grad_norm": 0.31968605518341064, "learning_rate": 0.00013159256066017983, "loss": 0.2611, "step": 2782 }, { "epoch": 1.0276957163958642, "grad_norm": 0.3102746605873108, "learning_rate": 0.00013156792708461635, "loss": 0.2464, "step": 2783 }, { "epoch": 1.0280649926144756, "grad_norm": 0.22494328022003174, "learning_rate": 0.00013154329350905286, "loss": 0.2197, "step": 2784 }, { "epoch": 1.0284342688330872, "grad_norm": 0.2669663429260254, "learning_rate": 0.00013151865993348935, "loss": 0.2484, "step": 2785 }, { "epoch": 1.0288035450516986, "grad_norm": 0.22760596871376038, "learning_rate": 0.00013149402635792586, "loss": 0.2172, "step": 2786 }, { "epoch": 1.0291728212703102, "grad_norm": 0.2485930472612381, "learning_rate": 0.00013146939278236238, "loss": 0.2698, "step": 2787 }, { "epoch": 1.0295420974889218, "grad_norm": 0.2849338948726654, "learning_rate": 0.0001314447592067989, "loss": 0.2545, "step": 2788 }, { "epoch": 1.0299113737075332, "grad_norm": 0.26047658920288086, "learning_rate": 0.00013142012563123538, "loss": 0.2296, "step": 2789 }, { "epoch": 1.0302806499261448, "grad_norm": 0.2526266276836395, "learning_rate": 0.00013139549205567187, "loss": 0.2553, "step": 2790 }, { "epoch": 1.0306499261447564, "grad_norm": 0.2689168155193329, "learning_rate": 0.00013137085848010838, "loss": 0.2323, "step": 2791 }, { "epoch": 1.0310192023633677, "grad_norm": 0.28571009635925293, "learning_rate": 0.0001313462249045449, "loss": 0.2889, "step": 2792 }, { "epoch": 1.0313884785819794, "grad_norm": 0.2267007678747177, "learning_rate": 0.0001313215913289814, "loss": 0.2128, "step": 2793 }, { "epoch": 1.0317577548005907, "grad_norm": 0.23780003190040588, "learning_rate": 0.0001312969577534179, "loss": 0.1987, "step": 2794 }, { "epoch": 1.0321270310192023, "grad_norm": 0.2646009027957916, "learning_rate": 0.00013127232417785441, "loss": 0.2778, "step": 2795 }, { "epoch": 1.032496307237814, "grad_norm": 0.2047690451145172, "learning_rate": 0.00013124769060229093, "loss": 0.2001, "step": 2796 }, { "epoch": 1.0328655834564253, "grad_norm": 0.24524430930614471, "learning_rate": 0.00013122305702672744, "loss": 0.2674, "step": 2797 }, { "epoch": 1.033234859675037, "grad_norm": 0.2460377961397171, "learning_rate": 0.00013119842345116393, "loss": 0.2551, "step": 2798 }, { "epoch": 1.0336041358936485, "grad_norm": 0.2284260094165802, "learning_rate": 0.00013117378987560045, "loss": 0.1842, "step": 2799 }, { "epoch": 1.03397341211226, "grad_norm": 0.30626076459884644, "learning_rate": 0.00013114915630003693, "loss": 0.2457, "step": 2800 }, { "epoch": 1.03397341211226, "eval_loss": 0.27627602219581604, "eval_runtime": 5.8633, "eval_samples_per_second": 8.528, "eval_steps_per_second": 1.194, "step": 2800 }, { "epoch": 1.0343426883308715, "grad_norm": 0.27121275663375854, "learning_rate": 0.00013112452272447347, "loss": 0.2557, "step": 2801 }, { "epoch": 1.034711964549483, "grad_norm": 0.23903623223304749, "learning_rate": 0.00013109988914890996, "loss": 0.2436, "step": 2802 }, { "epoch": 1.0350812407680945, "grad_norm": 0.26895764470100403, "learning_rate": 0.00013107525557334648, "loss": 0.253, "step": 2803 }, { "epoch": 1.035450516986706, "grad_norm": 0.3080110251903534, "learning_rate": 0.00013105062199778296, "loss": 0.2786, "step": 2804 }, { "epoch": 1.0358197932053175, "grad_norm": 0.23358094692230225, "learning_rate": 0.00013102598842221948, "loss": 0.2456, "step": 2805 }, { "epoch": 1.036189069423929, "grad_norm": 0.22798776626586914, "learning_rate": 0.000131001354846656, "loss": 0.2256, "step": 2806 }, { "epoch": 1.0365583456425407, "grad_norm": 0.24336901307106018, "learning_rate": 0.0001309767212710925, "loss": 0.2367, "step": 2807 }, { "epoch": 1.036927621861152, "grad_norm": 0.2236175686120987, "learning_rate": 0.000130952087695529, "loss": 0.2216, "step": 2808 }, { "epoch": 1.0372968980797637, "grad_norm": 0.25561004877090454, "learning_rate": 0.0001309274541199655, "loss": 0.263, "step": 2809 }, { "epoch": 1.0376661742983753, "grad_norm": 0.3103543519973755, "learning_rate": 0.00013090282054440203, "loss": 0.2854, "step": 2810 }, { "epoch": 1.0380354505169866, "grad_norm": 0.266419917345047, "learning_rate": 0.00013087818696883854, "loss": 0.2509, "step": 2811 }, { "epoch": 1.0384047267355982, "grad_norm": 0.29172733426094055, "learning_rate": 0.00013085355339327503, "loss": 0.2717, "step": 2812 }, { "epoch": 1.0387740029542099, "grad_norm": 0.2763681411743164, "learning_rate": 0.00013082891981771154, "loss": 0.3048, "step": 2813 }, { "epoch": 1.0391432791728212, "grad_norm": 0.2837805449962616, "learning_rate": 0.00013080428624214806, "loss": 0.2704, "step": 2814 }, { "epoch": 1.0395125553914328, "grad_norm": 0.2764654755592346, "learning_rate": 0.00013077965266658457, "loss": 0.2343, "step": 2815 }, { "epoch": 1.0398818316100442, "grad_norm": 0.3275948166847229, "learning_rate": 0.00013075501909102106, "loss": 0.283, "step": 2816 }, { "epoch": 1.0402511078286558, "grad_norm": 0.27809953689575195, "learning_rate": 0.00013073038551545757, "loss": 0.2474, "step": 2817 }, { "epoch": 1.0406203840472674, "grad_norm": 0.2790237367153168, "learning_rate": 0.00013070575193989406, "loss": 0.2589, "step": 2818 }, { "epoch": 1.0409896602658788, "grad_norm": 0.2769007980823517, "learning_rate": 0.0001306811183643306, "loss": 0.2463, "step": 2819 }, { "epoch": 1.0413589364844904, "grad_norm": 0.38644951581954956, "learning_rate": 0.0001306564847887671, "loss": 0.2702, "step": 2820 }, { "epoch": 1.041728212703102, "grad_norm": 0.2395869642496109, "learning_rate": 0.0001306318512132036, "loss": 0.2409, "step": 2821 }, { "epoch": 1.0420974889217134, "grad_norm": 0.35608041286468506, "learning_rate": 0.0001306072176376401, "loss": 0.2944, "step": 2822 }, { "epoch": 1.042466765140325, "grad_norm": 0.3523600399494171, "learning_rate": 0.0001305825840620766, "loss": 0.2428, "step": 2823 }, { "epoch": 1.0428360413589366, "grad_norm": 0.26375600695610046, "learning_rate": 0.00013055795048651312, "loss": 0.2745, "step": 2824 }, { "epoch": 1.043205317577548, "grad_norm": 0.30907225608825684, "learning_rate": 0.00013053331691094964, "loss": 0.2518, "step": 2825 }, { "epoch": 1.0435745937961596, "grad_norm": 0.2785424292087555, "learning_rate": 0.00013050868333538612, "loss": 0.237, "step": 2826 }, { "epoch": 1.043943870014771, "grad_norm": 0.26143819093704224, "learning_rate": 0.00013048404975982264, "loss": 0.2381, "step": 2827 }, { "epoch": 1.0443131462333826, "grad_norm": 0.2775009274482727, "learning_rate": 0.00013045941618425915, "loss": 0.2384, "step": 2828 }, { "epoch": 1.0446824224519942, "grad_norm": 0.3305076062679291, "learning_rate": 0.00013043478260869567, "loss": 0.3022, "step": 2829 }, { "epoch": 1.0450516986706055, "grad_norm": 0.34179162979125977, "learning_rate": 0.00013041014903313216, "loss": 0.2568, "step": 2830 }, { "epoch": 1.0454209748892171, "grad_norm": 0.2659331262111664, "learning_rate": 0.00013038551545756867, "loss": 0.2243, "step": 2831 }, { "epoch": 1.0457902511078287, "grad_norm": 0.33292558789253235, "learning_rate": 0.00013036088188200516, "loss": 0.232, "step": 2832 }, { "epoch": 1.0461595273264401, "grad_norm": 0.28080078959465027, "learning_rate": 0.0001303362483064417, "loss": 0.2678, "step": 2833 }, { "epoch": 1.0465288035450517, "grad_norm": 0.27119290828704834, "learning_rate": 0.0001303116147308782, "loss": 0.2687, "step": 2834 }, { "epoch": 1.0468980797636631, "grad_norm": 0.2821057140827179, "learning_rate": 0.0001302869811553147, "loss": 0.2614, "step": 2835 }, { "epoch": 1.0472673559822747, "grad_norm": 0.30902543663978577, "learning_rate": 0.0001302623475797512, "loss": 0.227, "step": 2836 }, { "epoch": 1.0476366322008863, "grad_norm": 0.23212537169456482, "learning_rate": 0.0001302377140041877, "loss": 0.1726, "step": 2837 }, { "epoch": 1.0480059084194977, "grad_norm": 0.25574445724487305, "learning_rate": 0.00013021308042862422, "loss": 0.2019, "step": 2838 }, { "epoch": 1.0483751846381093, "grad_norm": 0.32025423645973206, "learning_rate": 0.00013018844685306073, "loss": 0.2587, "step": 2839 }, { "epoch": 1.048744460856721, "grad_norm": 0.23577898740768433, "learning_rate": 0.00013016381327749722, "loss": 0.2392, "step": 2840 }, { "epoch": 1.0491137370753323, "grad_norm": 0.23110058903694153, "learning_rate": 0.00013013917970193374, "loss": 0.2344, "step": 2841 }, { "epoch": 1.049483013293944, "grad_norm": 0.3195451498031616, "learning_rate": 0.00013011454612637025, "loss": 0.2291, "step": 2842 }, { "epoch": 1.0498522895125555, "grad_norm": 0.23478031158447266, "learning_rate": 0.00013008991255080676, "loss": 0.256, "step": 2843 }, { "epoch": 1.0502215657311669, "grad_norm": 0.2877449691295624, "learning_rate": 0.00013006527897524325, "loss": 0.2616, "step": 2844 }, { "epoch": 1.0505908419497785, "grad_norm": 0.28867214918136597, "learning_rate": 0.00013004064539967977, "loss": 0.2836, "step": 2845 }, { "epoch": 1.0509601181683899, "grad_norm": 0.299875408411026, "learning_rate": 0.00013001601182411628, "loss": 0.2304, "step": 2846 }, { "epoch": 1.0513293943870015, "grad_norm": 0.24449953436851501, "learning_rate": 0.0001299913782485528, "loss": 0.1973, "step": 2847 }, { "epoch": 1.051698670605613, "grad_norm": 0.2585119605064392, "learning_rate": 0.00012996674467298928, "loss": 0.187, "step": 2848 }, { "epoch": 1.0520679468242244, "grad_norm": 0.24200786650180817, "learning_rate": 0.0001299421110974258, "loss": 0.2077, "step": 2849 }, { "epoch": 1.052437223042836, "grad_norm": 0.31775715947151184, "learning_rate": 0.00012991747752186229, "loss": 0.2987, "step": 2850 }, { "epoch": 1.052437223042836, "eval_loss": 0.27618589997291565, "eval_runtime": 5.8566, "eval_samples_per_second": 8.537, "eval_steps_per_second": 1.195, "step": 2850 }, { "epoch": 1.0528064992614476, "grad_norm": 0.3076942265033722, "learning_rate": 0.00012989284394629883, "loss": 0.2668, "step": 2851 }, { "epoch": 1.053175775480059, "grad_norm": 0.2545338273048401, "learning_rate": 0.00012986821037073531, "loss": 0.2562, "step": 2852 }, { "epoch": 1.0535450516986706, "grad_norm": 0.23804230988025665, "learning_rate": 0.00012984357679517183, "loss": 0.1993, "step": 2853 }, { "epoch": 1.0539143279172822, "grad_norm": 0.26753851771354675, "learning_rate": 0.00012981894321960832, "loss": 0.2218, "step": 2854 }, { "epoch": 1.0542836041358936, "grad_norm": 0.2615346610546112, "learning_rate": 0.00012979430964404483, "loss": 0.2618, "step": 2855 }, { "epoch": 1.0546528803545052, "grad_norm": 0.23570382595062256, "learning_rate": 0.00012976967606848135, "loss": 0.1975, "step": 2856 }, { "epoch": 1.0550221565731166, "grad_norm": 0.2610030770301819, "learning_rate": 0.00012974504249291786, "loss": 0.2375, "step": 2857 }, { "epoch": 1.0553914327917282, "grad_norm": 0.3134918808937073, "learning_rate": 0.00012972040891735435, "loss": 0.2299, "step": 2858 }, { "epoch": 1.0557607090103398, "grad_norm": 0.27121251821517944, "learning_rate": 0.00012969577534179086, "loss": 0.268, "step": 2859 }, { "epoch": 1.0561299852289512, "grad_norm": 0.26513317227363586, "learning_rate": 0.00012967114176622738, "loss": 0.219, "step": 2860 }, { "epoch": 1.0564992614475628, "grad_norm": 0.25374358892440796, "learning_rate": 0.0001296465081906639, "loss": 0.2188, "step": 2861 }, { "epoch": 1.0568685376661744, "grad_norm": 0.35821032524108887, "learning_rate": 0.00012962187461510038, "loss": 0.2754, "step": 2862 }, { "epoch": 1.0572378138847858, "grad_norm": 0.2845410108566284, "learning_rate": 0.0001295972410395369, "loss": 0.2178, "step": 2863 }, { "epoch": 1.0576070901033974, "grad_norm": 0.29702913761138916, "learning_rate": 0.00012957260746397338, "loss": 0.2715, "step": 2864 }, { "epoch": 1.0579763663220088, "grad_norm": 0.2890467047691345, "learning_rate": 0.00012954797388840992, "loss": 0.2827, "step": 2865 }, { "epoch": 1.0583456425406204, "grad_norm": 0.2700360417366028, "learning_rate": 0.0001295233403128464, "loss": 0.2362, "step": 2866 }, { "epoch": 1.058714918759232, "grad_norm": 0.25590065121650696, "learning_rate": 0.00012949870673728293, "loss": 0.2079, "step": 2867 }, { "epoch": 1.0590841949778433, "grad_norm": 0.3024281859397888, "learning_rate": 0.0001294740731617194, "loss": 0.2651, "step": 2868 }, { "epoch": 1.059453471196455, "grad_norm": 0.23602551221847534, "learning_rate": 0.00012944943958615593, "loss": 0.195, "step": 2869 }, { "epoch": 1.0598227474150665, "grad_norm": 0.2701203227043152, "learning_rate": 0.00012942480601059244, "loss": 0.253, "step": 2870 }, { "epoch": 1.060192023633678, "grad_norm": 0.3156161606311798, "learning_rate": 0.00012940017243502896, "loss": 0.2887, "step": 2871 }, { "epoch": 1.0605612998522895, "grad_norm": 0.21855607628822327, "learning_rate": 0.00012937553885946544, "loss": 0.1972, "step": 2872 }, { "epoch": 1.0609305760709011, "grad_norm": 0.3342536687850952, "learning_rate": 0.00012935090528390196, "loss": 0.2403, "step": 2873 }, { "epoch": 1.0612998522895125, "grad_norm": 0.24294506013393402, "learning_rate": 0.00012932627170833847, "loss": 0.234, "step": 2874 }, { "epoch": 1.0616691285081241, "grad_norm": 0.2701275050640106, "learning_rate": 0.000129301638132775, "loss": 0.2289, "step": 2875 }, { "epoch": 1.0620384047267355, "grad_norm": 0.2676856219768524, "learning_rate": 0.00012927700455721148, "loss": 0.2238, "step": 2876 }, { "epoch": 1.062407680945347, "grad_norm": 0.2500893473625183, "learning_rate": 0.000129252370981648, "loss": 0.2226, "step": 2877 }, { "epoch": 1.0627769571639587, "grad_norm": 0.2777494490146637, "learning_rate": 0.0001292277374060845, "loss": 0.2412, "step": 2878 }, { "epoch": 1.06314623338257, "grad_norm": 0.3415136933326721, "learning_rate": 0.00012920310383052102, "loss": 0.2569, "step": 2879 }, { "epoch": 1.0635155096011817, "grad_norm": 0.269741028547287, "learning_rate": 0.0001291784702549575, "loss": 0.2576, "step": 2880 }, { "epoch": 1.0638847858197933, "grad_norm": 0.2691381275653839, "learning_rate": 0.00012915383667939402, "loss": 0.2098, "step": 2881 }, { "epoch": 1.0642540620384047, "grad_norm": 0.257697194814682, "learning_rate": 0.0001291292031038305, "loss": 0.2526, "step": 2882 }, { "epoch": 1.0646233382570163, "grad_norm": 0.2562095820903778, "learning_rate": 0.00012910456952826705, "loss": 0.2164, "step": 2883 }, { "epoch": 1.0649926144756279, "grad_norm": 0.2850838899612427, "learning_rate": 0.00012907993595270354, "loss": 0.2251, "step": 2884 }, { "epoch": 1.0653618906942393, "grad_norm": 0.23420360684394836, "learning_rate": 0.00012905530237714005, "loss": 0.2164, "step": 2885 }, { "epoch": 1.0657311669128509, "grad_norm": 0.29589250683784485, "learning_rate": 0.00012903066880157654, "loss": 0.3105, "step": 2886 }, { "epoch": 1.0661004431314622, "grad_norm": 0.23718823492527008, "learning_rate": 0.00012900603522601306, "loss": 0.2262, "step": 2887 }, { "epoch": 1.0664697193500738, "grad_norm": 0.26658695936203003, "learning_rate": 0.00012898140165044957, "loss": 0.2385, "step": 2888 }, { "epoch": 1.0668389955686854, "grad_norm": 0.3809283673763275, "learning_rate": 0.00012895676807488609, "loss": 0.3091, "step": 2889 }, { "epoch": 1.0672082717872968, "grad_norm": 0.2975718080997467, "learning_rate": 0.00012893213449932257, "loss": 0.2426, "step": 2890 }, { "epoch": 1.0675775480059084, "grad_norm": 0.27787643671035767, "learning_rate": 0.0001289075009237591, "loss": 0.2769, "step": 2891 }, { "epoch": 1.06794682422452, "grad_norm": 0.27044934034347534, "learning_rate": 0.0001288828673481956, "loss": 0.2623, "step": 2892 }, { "epoch": 1.0683161004431314, "grad_norm": 0.24681483209133148, "learning_rate": 0.00012885823377263212, "loss": 0.2196, "step": 2893 }, { "epoch": 1.068685376661743, "grad_norm": 0.26946964859962463, "learning_rate": 0.0001288336001970686, "loss": 0.2263, "step": 2894 }, { "epoch": 1.0690546528803546, "grad_norm": 0.23332957923412323, "learning_rate": 0.00012880896662150512, "loss": 0.2418, "step": 2895 }, { "epoch": 1.069423929098966, "grad_norm": 0.28630056977272034, "learning_rate": 0.0001287843330459416, "loss": 0.2658, "step": 2896 }, { "epoch": 1.0697932053175776, "grad_norm": 0.3108094036579132, "learning_rate": 0.00012875969947037815, "loss": 0.2755, "step": 2897 }, { "epoch": 1.070162481536189, "grad_norm": 0.39538905024528503, "learning_rate": 0.00012873506589481464, "loss": 0.2336, "step": 2898 }, { "epoch": 1.0705317577548006, "grad_norm": 0.2604738771915436, "learning_rate": 0.00012871043231925115, "loss": 0.2361, "step": 2899 }, { "epoch": 1.0709010339734122, "grad_norm": 0.33082088828086853, "learning_rate": 0.00012868579874368764, "loss": 0.2508, "step": 2900 }, { "epoch": 1.0709010339734122, "eval_loss": 0.2767813801765442, "eval_runtime": 5.8576, "eval_samples_per_second": 8.536, "eval_steps_per_second": 1.195, "step": 2900 }, { "epoch": 1.0712703101920236, "grad_norm": 0.2974438965320587, "learning_rate": 0.00012866116516812415, "loss": 0.2323, "step": 2901 }, { "epoch": 1.0716395864106352, "grad_norm": 0.24254891276359558, "learning_rate": 0.00012863653159256067, "loss": 0.2219, "step": 2902 }, { "epoch": 1.0720088626292468, "grad_norm": 0.35016727447509766, "learning_rate": 0.00012861189801699718, "loss": 0.2304, "step": 2903 }, { "epoch": 1.0723781388478582, "grad_norm": 0.31191927194595337, "learning_rate": 0.00012858726444143367, "loss": 0.252, "step": 2904 }, { "epoch": 1.0727474150664698, "grad_norm": 0.25844889879226685, "learning_rate": 0.00012856263086587018, "loss": 0.26, "step": 2905 }, { "epoch": 1.0731166912850811, "grad_norm": 0.22783496975898743, "learning_rate": 0.0001285379972903067, "loss": 0.2091, "step": 2906 }, { "epoch": 1.0734859675036927, "grad_norm": 0.34771570563316345, "learning_rate": 0.0001285133637147432, "loss": 0.2972, "step": 2907 }, { "epoch": 1.0738552437223043, "grad_norm": 0.2750411033630371, "learning_rate": 0.0001284887301391797, "loss": 0.2472, "step": 2908 }, { "epoch": 1.0742245199409157, "grad_norm": 0.2285848706960678, "learning_rate": 0.00012846409656361622, "loss": 0.2443, "step": 2909 }, { "epoch": 1.0745937961595273, "grad_norm": 0.21732871234416962, "learning_rate": 0.00012843946298805273, "loss": 0.2002, "step": 2910 }, { "epoch": 1.074963072378139, "grad_norm": 0.2981227934360504, "learning_rate": 0.00012841482941248924, "loss": 0.2692, "step": 2911 }, { "epoch": 1.0753323485967503, "grad_norm": 0.24542368948459625, "learning_rate": 0.00012839019583692573, "loss": 0.2218, "step": 2912 }, { "epoch": 1.075701624815362, "grad_norm": 0.26330628991127014, "learning_rate": 0.00012836556226136225, "loss": 0.2446, "step": 2913 }, { "epoch": 1.0760709010339735, "grad_norm": 0.3054066300392151, "learning_rate": 0.00012834092868579873, "loss": 0.2813, "step": 2914 }, { "epoch": 1.076440177252585, "grad_norm": 0.2733544111251831, "learning_rate": 0.00012831629511023528, "loss": 0.2107, "step": 2915 }, { "epoch": 1.0768094534711965, "grad_norm": 0.28098398447036743, "learning_rate": 0.00012829166153467176, "loss": 0.2273, "step": 2916 }, { "epoch": 1.0771787296898079, "grad_norm": 0.3208807706832886, "learning_rate": 0.00012826702795910828, "loss": 0.2772, "step": 2917 }, { "epoch": 1.0775480059084195, "grad_norm": 0.26194900274276733, "learning_rate": 0.00012824239438354477, "loss": 0.2187, "step": 2918 }, { "epoch": 1.077917282127031, "grad_norm": 0.275849848985672, "learning_rate": 0.00012821776080798128, "loss": 0.2297, "step": 2919 }, { "epoch": 1.0782865583456425, "grad_norm": 0.24260565638542175, "learning_rate": 0.0001281931272324178, "loss": 0.2383, "step": 2920 }, { "epoch": 1.078655834564254, "grad_norm": 0.2160511314868927, "learning_rate": 0.0001281684936568543, "loss": 0.1778, "step": 2921 }, { "epoch": 1.0790251107828657, "grad_norm": 0.2878185212612152, "learning_rate": 0.0001281438600812908, "loss": 0.2639, "step": 2922 }, { "epoch": 1.079394387001477, "grad_norm": 0.2513464391231537, "learning_rate": 0.0001281192265057273, "loss": 0.2457, "step": 2923 }, { "epoch": 1.0797636632200887, "grad_norm": 0.28811386227607727, "learning_rate": 0.00012809459293016383, "loss": 0.2617, "step": 2924 }, { "epoch": 1.0801329394387, "grad_norm": 0.29851648211479187, "learning_rate": 0.00012806995935460034, "loss": 0.236, "step": 2925 }, { "epoch": 1.0805022156573116, "grad_norm": 0.27574729919433594, "learning_rate": 0.00012804532577903683, "loss": 0.2802, "step": 2926 }, { "epoch": 1.0808714918759232, "grad_norm": 0.23372849822044373, "learning_rate": 0.00012802069220347334, "loss": 0.203, "step": 2927 }, { "epoch": 1.0812407680945346, "grad_norm": 0.25483450293540955, "learning_rate": 0.00012799605862790983, "loss": 0.248, "step": 2928 }, { "epoch": 1.0816100443131462, "grad_norm": 0.25262442231178284, "learning_rate": 0.00012797142505234637, "loss": 0.2007, "step": 2929 }, { "epoch": 1.0819793205317578, "grad_norm": 0.2934408187866211, "learning_rate": 0.00012794679147678286, "loss": 0.2396, "step": 2930 }, { "epoch": 1.0823485967503692, "grad_norm": 0.2514856159687042, "learning_rate": 0.00012792215790121938, "loss": 0.2186, "step": 2931 }, { "epoch": 1.0827178729689808, "grad_norm": 0.27486154437065125, "learning_rate": 0.00012789752432565586, "loss": 0.2593, "step": 2932 }, { "epoch": 1.0830871491875924, "grad_norm": 0.2366592139005661, "learning_rate": 0.00012787289075009238, "loss": 0.2194, "step": 2933 }, { "epoch": 1.0834564254062038, "grad_norm": 0.25081318616867065, "learning_rate": 0.0001278482571745289, "loss": 0.2246, "step": 2934 }, { "epoch": 1.0838257016248154, "grad_norm": 0.2999029755592346, "learning_rate": 0.0001278236235989654, "loss": 0.2836, "step": 2935 }, { "epoch": 1.0841949778434268, "grad_norm": 0.23383331298828125, "learning_rate": 0.0001277989900234019, "loss": 0.2103, "step": 2936 }, { "epoch": 1.0845642540620384, "grad_norm": 0.23219743371009827, "learning_rate": 0.0001277743564478384, "loss": 0.2301, "step": 2937 }, { "epoch": 1.08493353028065, "grad_norm": 0.33842626214027405, "learning_rate": 0.00012774972287227492, "loss": 0.2801, "step": 2938 }, { "epoch": 1.0853028064992614, "grad_norm": 0.34319981932640076, "learning_rate": 0.00012772508929671144, "loss": 0.268, "step": 2939 }, { "epoch": 1.085672082717873, "grad_norm": 0.23391127586364746, "learning_rate": 0.00012770045572114793, "loss": 0.2201, "step": 2940 }, { "epoch": 1.0860413589364846, "grad_norm": 0.25276488065719604, "learning_rate": 0.00012767582214558444, "loss": 0.236, "step": 2941 }, { "epoch": 1.086410635155096, "grad_norm": 0.30007094144821167, "learning_rate": 0.00012765118857002093, "loss": 0.2987, "step": 2942 }, { "epoch": 1.0867799113737076, "grad_norm": 0.3073122799396515, "learning_rate": 0.00012762655499445747, "loss": 0.2702, "step": 2943 }, { "epoch": 1.0871491875923192, "grad_norm": 0.27124929428100586, "learning_rate": 0.00012760192141889396, "loss": 0.2549, "step": 2944 }, { "epoch": 1.0875184638109305, "grad_norm": 0.22957691550254822, "learning_rate": 0.00012757728784333047, "loss": 0.209, "step": 2945 }, { "epoch": 1.0878877400295421, "grad_norm": 0.26437053084373474, "learning_rate": 0.00012755265426776696, "loss": 0.2192, "step": 2946 }, { "epoch": 1.0882570162481535, "grad_norm": 0.21729683876037598, "learning_rate": 0.00012752802069220347, "loss": 0.219, "step": 2947 }, { "epoch": 1.0886262924667651, "grad_norm": 0.22185112535953522, "learning_rate": 0.00012750338711664, "loss": 0.187, "step": 2948 }, { "epoch": 1.0889955686853767, "grad_norm": 0.23401515185832977, "learning_rate": 0.0001274787535410765, "loss": 0.2367, "step": 2949 }, { "epoch": 1.089364844903988, "grad_norm": 0.2597702443599701, "learning_rate": 0.000127454119965513, "loss": 0.2198, "step": 2950 }, { "epoch": 1.089364844903988, "eval_loss": 0.2758151888847351, "eval_runtime": 5.8574, "eval_samples_per_second": 8.536, "eval_steps_per_second": 1.195, "step": 2950 }, { "epoch": 1.0897341211225997, "grad_norm": 0.23240283131599426, "learning_rate": 0.0001274294863899495, "loss": 0.1825, "step": 2951 }, { "epoch": 1.0901033973412113, "grad_norm": 0.24644871056079865, "learning_rate": 0.00012740485281438602, "loss": 0.235, "step": 2952 }, { "epoch": 1.0904726735598227, "grad_norm": 0.24943925440311432, "learning_rate": 0.00012738021923882253, "loss": 0.2302, "step": 2953 }, { "epoch": 1.0908419497784343, "grad_norm": 0.264442503452301, "learning_rate": 0.00012735558566325902, "loss": 0.2442, "step": 2954 }, { "epoch": 1.091211225997046, "grad_norm": 0.2990545928478241, "learning_rate": 0.00012733095208769554, "loss": 0.2634, "step": 2955 }, { "epoch": 1.0915805022156573, "grad_norm": 0.28452932834625244, "learning_rate": 0.00012730631851213205, "loss": 0.2528, "step": 2956 }, { "epoch": 1.0919497784342689, "grad_norm": 0.2825779914855957, "learning_rate": 0.00012728168493656857, "loss": 0.2432, "step": 2957 }, { "epoch": 1.0923190546528803, "grad_norm": 0.27615490555763245, "learning_rate": 0.00012725705136100505, "loss": 0.2471, "step": 2958 }, { "epoch": 1.0926883308714919, "grad_norm": 0.32125625014305115, "learning_rate": 0.00012723241778544157, "loss": 0.2899, "step": 2959 }, { "epoch": 1.0930576070901035, "grad_norm": 0.2847040593624115, "learning_rate": 0.00012720778420987806, "loss": 0.2388, "step": 2960 }, { "epoch": 1.0934268833087148, "grad_norm": 0.30398204922676086, "learning_rate": 0.0001271831506343146, "loss": 0.2625, "step": 2961 }, { "epoch": 1.0937961595273265, "grad_norm": 0.2955167591571808, "learning_rate": 0.00012715851705875108, "loss": 0.2384, "step": 2962 }, { "epoch": 1.094165435745938, "grad_norm": 0.33222588896751404, "learning_rate": 0.0001271338834831876, "loss": 0.2391, "step": 2963 }, { "epoch": 1.0945347119645494, "grad_norm": 0.27684515714645386, "learning_rate": 0.0001271092499076241, "loss": 0.2507, "step": 2964 }, { "epoch": 1.094903988183161, "grad_norm": 0.29779449105262756, "learning_rate": 0.0001270846163320606, "loss": 0.2067, "step": 2965 }, { "epoch": 1.0952732644017726, "grad_norm": 0.29084447026252747, "learning_rate": 0.00012705998275649712, "loss": 0.2458, "step": 2966 }, { "epoch": 1.095642540620384, "grad_norm": 0.23796658217906952, "learning_rate": 0.00012703534918093363, "loss": 0.2051, "step": 2967 }, { "epoch": 1.0960118168389956, "grad_norm": 0.3309081792831421, "learning_rate": 0.00012701071560537012, "loss": 0.2744, "step": 2968 }, { "epoch": 1.096381093057607, "grad_norm": 0.3095639944076538, "learning_rate": 0.00012698608202980663, "loss": 0.2843, "step": 2969 }, { "epoch": 1.0967503692762186, "grad_norm": 0.3181617558002472, "learning_rate": 0.00012696144845424315, "loss": 0.239, "step": 2970 }, { "epoch": 1.0971196454948302, "grad_norm": 0.23281103372573853, "learning_rate": 0.00012693681487867966, "loss": 0.216, "step": 2971 }, { "epoch": 1.0974889217134416, "grad_norm": 0.38004666566848755, "learning_rate": 0.00012691218130311615, "loss": 0.3023, "step": 2972 }, { "epoch": 1.0978581979320532, "grad_norm": 0.24479223787784576, "learning_rate": 0.00012688754772755266, "loss": 0.2248, "step": 2973 }, { "epoch": 1.0982274741506648, "grad_norm": 0.2372145652770996, "learning_rate": 0.00012686291415198915, "loss": 0.2228, "step": 2974 }, { "epoch": 1.0985967503692762, "grad_norm": 0.2958376407623291, "learning_rate": 0.0001268382805764257, "loss": 0.2777, "step": 2975 }, { "epoch": 1.0989660265878878, "grad_norm": 0.3016323447227478, "learning_rate": 0.00012681364700086218, "loss": 0.2612, "step": 2976 }, { "epoch": 1.0993353028064992, "grad_norm": 0.2763654887676239, "learning_rate": 0.0001267890134252987, "loss": 0.2851, "step": 2977 }, { "epoch": 1.0997045790251108, "grad_norm": 0.25004446506500244, "learning_rate": 0.00012676437984973518, "loss": 0.3329, "step": 2978 }, { "epoch": 1.1000738552437224, "grad_norm": 0.25913912057876587, "learning_rate": 0.0001267397462741717, "loss": 0.2259, "step": 2979 }, { "epoch": 1.1004431314623337, "grad_norm": 0.23761507868766785, "learning_rate": 0.0001267151126986082, "loss": 0.2219, "step": 2980 }, { "epoch": 1.1008124076809453, "grad_norm": 0.2535478174686432, "learning_rate": 0.00012669047912304473, "loss": 0.2359, "step": 2981 }, { "epoch": 1.101181683899557, "grad_norm": 0.3021339476108551, "learning_rate": 0.00012666584554748122, "loss": 0.2442, "step": 2982 }, { "epoch": 1.1015509601181683, "grad_norm": 0.27801018953323364, "learning_rate": 0.00012664121197191773, "loss": 0.2425, "step": 2983 }, { "epoch": 1.10192023633678, "grad_norm": 0.2920657992362976, "learning_rate": 0.00012661657839635424, "loss": 0.2554, "step": 2984 }, { "epoch": 1.1022895125553913, "grad_norm": 0.28303468227386475, "learning_rate": 0.00012659194482079076, "loss": 0.2404, "step": 2985 }, { "epoch": 1.102658788774003, "grad_norm": 0.2457839995622635, "learning_rate": 0.00012656731124522725, "loss": 0.2542, "step": 2986 }, { "epoch": 1.1030280649926145, "grad_norm": 0.25335025787353516, "learning_rate": 0.00012654267766966376, "loss": 0.2328, "step": 2987 }, { "epoch": 1.103397341211226, "grad_norm": 0.23906651139259338, "learning_rate": 0.00012651804409410028, "loss": 0.2178, "step": 2988 }, { "epoch": 1.1037666174298375, "grad_norm": 0.2866811752319336, "learning_rate": 0.0001264934105185368, "loss": 0.2498, "step": 2989 }, { "epoch": 1.104135893648449, "grad_norm": 0.2759389579296112, "learning_rate": 0.00012646877694297328, "loss": 0.2734, "step": 2990 }, { "epoch": 1.1045051698670605, "grad_norm": 0.3065519630908966, "learning_rate": 0.0001264441433674098, "loss": 0.2538, "step": 2991 }, { "epoch": 1.104874446085672, "grad_norm": 0.2824118435382843, "learning_rate": 0.00012641950979184628, "loss": 0.238, "step": 2992 }, { "epoch": 1.1052437223042837, "grad_norm": 0.32216399908065796, "learning_rate": 0.00012639487621628282, "loss": 0.2665, "step": 2993 }, { "epoch": 1.105612998522895, "grad_norm": 0.23793873190879822, "learning_rate": 0.0001263702426407193, "loss": 0.2157, "step": 2994 }, { "epoch": 1.1059822747415067, "grad_norm": 0.27395644783973694, "learning_rate": 0.00012634560906515582, "loss": 0.2214, "step": 2995 }, { "epoch": 1.106351550960118, "grad_norm": 0.23240530490875244, "learning_rate": 0.0001263209754895923, "loss": 0.2248, "step": 2996 }, { "epoch": 1.1067208271787297, "grad_norm": 0.2937382459640503, "learning_rate": 0.00012629634191402883, "loss": 0.2368, "step": 2997 }, { "epoch": 1.1070901033973413, "grad_norm": 0.34913092851638794, "learning_rate": 0.00012627170833846534, "loss": 0.2829, "step": 2998 }, { "epoch": 1.1074593796159526, "grad_norm": 0.2377249151468277, "learning_rate": 0.00012624707476290186, "loss": 0.1998, "step": 2999 }, { "epoch": 1.1078286558345642, "grad_norm": 0.321977823972702, "learning_rate": 0.00012622244118733834, "loss": 0.243, "step": 3000 }, { "epoch": 1.1078286558345642, "eval_loss": 0.2727716565132141, "eval_runtime": 5.8782, "eval_samples_per_second": 8.506, "eval_steps_per_second": 1.191, "step": 3000 }, { "epoch": 1.1081979320531758, "grad_norm": 0.2637842297554016, "learning_rate": 0.00012619780761177486, "loss": 0.2128, "step": 3001 }, { "epoch": 1.1085672082717872, "grad_norm": 0.24146443605422974, "learning_rate": 0.00012617317403621137, "loss": 0.2346, "step": 3002 }, { "epoch": 1.1089364844903988, "grad_norm": 0.3135768473148346, "learning_rate": 0.0001261485404606479, "loss": 0.2604, "step": 3003 }, { "epoch": 1.1093057607090104, "grad_norm": 0.24217776954174042, "learning_rate": 0.00012612390688508437, "loss": 0.1956, "step": 3004 }, { "epoch": 1.1096750369276218, "grad_norm": 0.280843049287796, "learning_rate": 0.0001260992733095209, "loss": 0.2541, "step": 3005 }, { "epoch": 1.1100443131462334, "grad_norm": 0.2333623617887497, "learning_rate": 0.00012607463973395738, "loss": 0.2471, "step": 3006 }, { "epoch": 1.1104135893648448, "grad_norm": 0.2542230784893036, "learning_rate": 0.00012605000615839392, "loss": 0.2015, "step": 3007 }, { "epoch": 1.1107828655834564, "grad_norm": 0.3016259968280792, "learning_rate": 0.0001260253725828304, "loss": 0.2298, "step": 3008 }, { "epoch": 1.111152141802068, "grad_norm": 0.28542524576187134, "learning_rate": 0.00012600073900726692, "loss": 0.2378, "step": 3009 }, { "epoch": 1.1115214180206794, "grad_norm": 0.29506421089172363, "learning_rate": 0.0001259761054317034, "loss": 0.2443, "step": 3010 }, { "epoch": 1.111890694239291, "grad_norm": 0.2600952088832855, "learning_rate": 0.00012595147185613992, "loss": 0.2239, "step": 3011 }, { "epoch": 1.1122599704579026, "grad_norm": 0.2315218448638916, "learning_rate": 0.00012592683828057644, "loss": 0.2431, "step": 3012 }, { "epoch": 1.112629246676514, "grad_norm": 0.26650896668434143, "learning_rate": 0.00012590220470501295, "loss": 0.1982, "step": 3013 }, { "epoch": 1.1129985228951256, "grad_norm": 0.29857608675956726, "learning_rate": 0.00012587757112944944, "loss": 0.2117, "step": 3014 }, { "epoch": 1.1133677991137372, "grad_norm": 0.35646194219589233, "learning_rate": 0.00012585293755388595, "loss": 0.3068, "step": 3015 }, { "epoch": 1.1137370753323486, "grad_norm": 0.28620585799217224, "learning_rate": 0.00012582830397832247, "loss": 0.2126, "step": 3016 }, { "epoch": 1.1141063515509602, "grad_norm": 0.28611883521080017, "learning_rate": 0.00012580367040275898, "loss": 0.2815, "step": 3017 }, { "epoch": 1.1144756277695715, "grad_norm": 0.2662048041820526, "learning_rate": 0.00012577903682719547, "loss": 0.2568, "step": 3018 }, { "epoch": 1.1148449039881831, "grad_norm": 0.2725636661052704, "learning_rate": 0.00012575440325163199, "loss": 0.2119, "step": 3019 }, { "epoch": 1.1152141802067947, "grad_norm": 0.24701742827892303, "learning_rate": 0.0001257297696760685, "loss": 0.2022, "step": 3020 }, { "epoch": 1.1155834564254061, "grad_norm": 0.25611329078674316, "learning_rate": 0.000125705136100505, "loss": 0.2262, "step": 3021 }, { "epoch": 1.1159527326440177, "grad_norm": 0.2516343593597412, "learning_rate": 0.0001256805025249415, "loss": 0.2067, "step": 3022 }, { "epoch": 1.1163220088626293, "grad_norm": 0.30316340923309326, "learning_rate": 0.000125655868949378, "loss": 0.272, "step": 3023 }, { "epoch": 1.1166912850812407, "grad_norm": 0.23739416897296906, "learning_rate": 0.0001256312353738145, "loss": 0.2276, "step": 3024 }, { "epoch": 1.1170605612998523, "grad_norm": 0.22895297408103943, "learning_rate": 0.00012560660179825102, "loss": 0.2076, "step": 3025 }, { "epoch": 1.117429837518464, "grad_norm": 0.23244740068912506, "learning_rate": 0.00012558196822268753, "loss": 0.2225, "step": 3026 }, { "epoch": 1.1177991137370753, "grad_norm": 0.2905164659023285, "learning_rate": 0.00012555733464712402, "loss": 0.2649, "step": 3027 }, { "epoch": 1.118168389955687, "grad_norm": 0.287203848361969, "learning_rate": 0.00012553270107156054, "loss": 0.2411, "step": 3028 }, { "epoch": 1.1185376661742983, "grad_norm": 0.21558254957199097, "learning_rate": 0.00012550806749599705, "loss": 0.1992, "step": 3029 }, { "epoch": 1.1189069423929099, "grad_norm": 0.23100394010543823, "learning_rate": 0.00012548343392043357, "loss": 0.2251, "step": 3030 }, { "epoch": 1.1192762186115215, "grad_norm": 0.2505006492137909, "learning_rate": 0.00012545880034487005, "loss": 0.2372, "step": 3031 }, { "epoch": 1.1196454948301329, "grad_norm": 0.2551506459712982, "learning_rate": 0.00012543416676930657, "loss": 0.212, "step": 3032 }, { "epoch": 1.1200147710487445, "grad_norm": 0.30516448616981506, "learning_rate": 0.00012540953319374306, "loss": 0.299, "step": 3033 }, { "epoch": 1.120384047267356, "grad_norm": 0.28968724608421326, "learning_rate": 0.0001253848996181796, "loss": 0.2338, "step": 3034 }, { "epoch": 1.1207533234859675, "grad_norm": 0.2573033273220062, "learning_rate": 0.00012536026604261608, "loss": 0.2352, "step": 3035 }, { "epoch": 1.121122599704579, "grad_norm": 0.26480695605278015, "learning_rate": 0.0001253356324670526, "loss": 0.2167, "step": 3036 }, { "epoch": 1.1214918759231907, "grad_norm": 0.2624709904193878, "learning_rate": 0.0001253109988914891, "loss": 0.2233, "step": 3037 }, { "epoch": 1.121861152141802, "grad_norm": 0.32714954018592834, "learning_rate": 0.0001252863653159256, "loss": 0.2943, "step": 3038 }, { "epoch": 1.1222304283604136, "grad_norm": 0.333815336227417, "learning_rate": 0.00012526173174036212, "loss": 0.2691, "step": 3039 }, { "epoch": 1.122599704579025, "grad_norm": 0.3042587637901306, "learning_rate": 0.00012523709816479863, "loss": 0.2645, "step": 3040 }, { "epoch": 1.1229689807976366, "grad_norm": 0.23870015144348145, "learning_rate": 0.00012521246458923512, "loss": 0.2264, "step": 3041 }, { "epoch": 1.1233382570162482, "grad_norm": 0.25744694471359253, "learning_rate": 0.00012518783101367163, "loss": 0.2359, "step": 3042 }, { "epoch": 1.1237075332348596, "grad_norm": 0.3124404549598694, "learning_rate": 0.00012516319743810815, "loss": 0.2354, "step": 3043 }, { "epoch": 1.1240768094534712, "grad_norm": 0.24731005728244781, "learning_rate": 0.00012513856386254466, "loss": 0.2177, "step": 3044 }, { "epoch": 1.1244460856720828, "grad_norm": 0.3020634353160858, "learning_rate": 0.00012511393028698115, "loss": 0.2779, "step": 3045 }, { "epoch": 1.1248153618906942, "grad_norm": 0.2442103922367096, "learning_rate": 0.00012508929671141766, "loss": 0.2165, "step": 3046 }, { "epoch": 1.1251846381093058, "grad_norm": 0.3287731409072876, "learning_rate": 0.00012506466313585418, "loss": 0.2607, "step": 3047 }, { "epoch": 1.1255539143279174, "grad_norm": 0.2367086112499237, "learning_rate": 0.0001250400295602907, "loss": 0.2101, "step": 3048 }, { "epoch": 1.1259231905465288, "grad_norm": 0.25181809067726135, "learning_rate": 0.00012501539598472718, "loss": 0.2225, "step": 3049 }, { "epoch": 1.1262924667651404, "grad_norm": 0.2873280644416809, "learning_rate": 0.0001249907624091637, "loss": 0.2248, "step": 3050 }, { "epoch": 1.1262924667651404, "eval_loss": 0.2710103690624237, "eval_runtime": 5.8492, "eval_samples_per_second": 8.548, "eval_steps_per_second": 1.197, "step": 3050 }, { "epoch": 1.1266617429837518, "grad_norm": 0.254673033952713, "learning_rate": 0.00012496612883360018, "loss": 0.24, "step": 3051 }, { "epoch": 1.1270310192023634, "grad_norm": 0.3037097454071045, "learning_rate": 0.00012494149525803672, "loss": 0.2916, "step": 3052 }, { "epoch": 1.127400295420975, "grad_norm": 0.23909220099449158, "learning_rate": 0.0001249168616824732, "loss": 0.2091, "step": 3053 }, { "epoch": 1.1277695716395864, "grad_norm": 0.3136500418186188, "learning_rate": 0.00012489222810690973, "loss": 0.272, "step": 3054 }, { "epoch": 1.128138847858198, "grad_norm": 0.2772284150123596, "learning_rate": 0.00012486759453134621, "loss": 0.2307, "step": 3055 }, { "epoch": 1.1285081240768093, "grad_norm": 0.25221678614616394, "learning_rate": 0.00012484296095578273, "loss": 0.2367, "step": 3056 }, { "epoch": 1.128877400295421, "grad_norm": 0.25936242938041687, "learning_rate": 0.00012481832738021924, "loss": 0.2589, "step": 3057 }, { "epoch": 1.1292466765140325, "grad_norm": 0.28311699628829956, "learning_rate": 0.00012479369380465576, "loss": 0.2371, "step": 3058 }, { "epoch": 1.129615952732644, "grad_norm": 0.3582422435283661, "learning_rate": 0.00012476906022909225, "loss": 0.2705, "step": 3059 }, { "epoch": 1.1299852289512555, "grad_norm": 0.2714608907699585, "learning_rate": 0.00012474442665352876, "loss": 0.2387, "step": 3060 }, { "epoch": 1.1303545051698671, "grad_norm": 0.3010917603969574, "learning_rate": 0.00012471979307796528, "loss": 0.2948, "step": 3061 }, { "epoch": 1.1307237813884785, "grad_norm": 0.308224618434906, "learning_rate": 0.0001246951595024018, "loss": 0.2862, "step": 3062 }, { "epoch": 1.1310930576070901, "grad_norm": 0.2517126798629761, "learning_rate": 0.00012467052592683828, "loss": 0.2661, "step": 3063 }, { "epoch": 1.1314623338257017, "grad_norm": 0.27293944358825684, "learning_rate": 0.0001246458923512748, "loss": 0.2047, "step": 3064 }, { "epoch": 1.131831610044313, "grad_norm": 0.2842094898223877, "learning_rate": 0.00012462125877571128, "loss": 0.2204, "step": 3065 }, { "epoch": 1.1322008862629247, "grad_norm": 0.25981253385543823, "learning_rate": 0.00012459662520014782, "loss": 0.2146, "step": 3066 }, { "epoch": 1.132570162481536, "grad_norm": 0.2736140191555023, "learning_rate": 0.0001245719916245843, "loss": 0.266, "step": 3067 }, { "epoch": 1.1329394387001477, "grad_norm": 0.35096457600593567, "learning_rate": 0.00012454735804902082, "loss": 0.2942, "step": 3068 }, { "epoch": 1.1333087149187593, "grad_norm": 0.26719680428504944, "learning_rate": 0.0001245227244734573, "loss": 0.2735, "step": 3069 }, { "epoch": 1.1336779911373707, "grad_norm": 0.2518094778060913, "learning_rate": 0.00012449809089789383, "loss": 0.2328, "step": 3070 }, { "epoch": 1.1340472673559823, "grad_norm": 0.30160000920295715, "learning_rate": 0.00012447345732233034, "loss": 0.2994, "step": 3071 }, { "epoch": 1.1344165435745939, "grad_norm": 0.24626970291137695, "learning_rate": 0.00012444882374676686, "loss": 0.2315, "step": 3072 }, { "epoch": 1.1347858197932053, "grad_norm": 0.25943630933761597, "learning_rate": 0.00012442419017120334, "loss": 0.2437, "step": 3073 }, { "epoch": 1.1351550960118169, "grad_norm": 0.21921169757843018, "learning_rate": 0.00012439955659563986, "loss": 0.2081, "step": 3074 }, { "epoch": 1.1355243722304285, "grad_norm": 0.26043611764907837, "learning_rate": 0.00012437492302007637, "loss": 0.2137, "step": 3075 }, { "epoch": 1.1358936484490398, "grad_norm": 0.26178666949272156, "learning_rate": 0.0001243502894445129, "loss": 0.2058, "step": 3076 }, { "epoch": 1.1362629246676514, "grad_norm": 0.3557722866535187, "learning_rate": 0.00012432565586894937, "loss": 0.2798, "step": 3077 }, { "epoch": 1.1366322008862628, "grad_norm": 0.3032893240451813, "learning_rate": 0.0001243010222933859, "loss": 0.2662, "step": 3078 }, { "epoch": 1.1370014771048744, "grad_norm": 0.25780877470970154, "learning_rate": 0.00012427638871782238, "loss": 0.2378, "step": 3079 }, { "epoch": 1.137370753323486, "grad_norm": 0.2566320598125458, "learning_rate": 0.00012425175514225892, "loss": 0.235, "step": 3080 }, { "epoch": 1.1377400295420974, "grad_norm": 0.3095914125442505, "learning_rate": 0.0001242271215666954, "loss": 0.2498, "step": 3081 }, { "epoch": 1.138109305760709, "grad_norm": 0.3422093689441681, "learning_rate": 0.00012420248799113192, "loss": 0.2715, "step": 3082 }, { "epoch": 1.1384785819793206, "grad_norm": 0.24209055304527283, "learning_rate": 0.0001241778544155684, "loss": 0.219, "step": 3083 }, { "epoch": 1.138847858197932, "grad_norm": 0.4152233898639679, "learning_rate": 0.00012415322084000492, "loss": 0.2235, "step": 3084 }, { "epoch": 1.1392171344165436, "grad_norm": 0.26387819647789, "learning_rate": 0.00012412858726444144, "loss": 0.2195, "step": 3085 }, { "epoch": 1.1395864106351552, "grad_norm": 0.251911997795105, "learning_rate": 0.00012410395368887795, "loss": 0.2147, "step": 3086 }, { "epoch": 1.1399556868537666, "grad_norm": 0.2334921807050705, "learning_rate": 0.00012407932011331444, "loss": 0.2033, "step": 3087 }, { "epoch": 1.1403249630723782, "grad_norm": 0.28513142466545105, "learning_rate": 0.00012405468653775095, "loss": 0.2355, "step": 3088 }, { "epoch": 1.1406942392909896, "grad_norm": 0.22765091061592102, "learning_rate": 0.00012403005296218747, "loss": 0.2028, "step": 3089 }, { "epoch": 1.1410635155096012, "grad_norm": 0.224373921751976, "learning_rate": 0.00012400541938662398, "loss": 0.1976, "step": 3090 }, { "epoch": 1.1414327917282128, "grad_norm": 0.3062220811843872, "learning_rate": 0.00012398078581106047, "loss": 0.2535, "step": 3091 }, { "epoch": 1.1418020679468242, "grad_norm": 0.259894996881485, "learning_rate": 0.00012395615223549699, "loss": 0.2289, "step": 3092 }, { "epoch": 1.1421713441654358, "grad_norm": 0.24251849949359894, "learning_rate": 0.0001239315186599335, "loss": 0.2256, "step": 3093 }, { "epoch": 1.1425406203840474, "grad_norm": 0.27573704719543457, "learning_rate": 0.00012390688508437001, "loss": 0.2466, "step": 3094 }, { "epoch": 1.1429098966026587, "grad_norm": 0.29267674684524536, "learning_rate": 0.0001238822515088065, "loss": 0.2606, "step": 3095 }, { "epoch": 1.1432791728212703, "grad_norm": 0.253482848405838, "learning_rate": 0.00012385761793324302, "loss": 0.2301, "step": 3096 }, { "epoch": 1.143648449039882, "grad_norm": 0.2747405469417572, "learning_rate": 0.0001238329843576795, "loss": 0.2733, "step": 3097 }, { "epoch": 1.1440177252584933, "grad_norm": 0.26296141743659973, "learning_rate": 0.00012380835078211605, "loss": 0.2231, "step": 3098 }, { "epoch": 1.144387001477105, "grad_norm": 0.3350454568862915, "learning_rate": 0.00012378371720655253, "loss": 0.2461, "step": 3099 }, { "epoch": 1.1447562776957163, "grad_norm": 0.2649227976799011, "learning_rate": 0.00012375908363098905, "loss": 0.2802, "step": 3100 }, { "epoch": 1.1447562776957163, "eval_loss": 0.2746792435646057, "eval_runtime": 5.8569, "eval_samples_per_second": 8.537, "eval_steps_per_second": 1.195, "step": 3100 }, { "epoch": 1.145125553914328, "grad_norm": 0.34254997968673706, "learning_rate": 0.00012373445005542554, "loss": 0.2205, "step": 3101 }, { "epoch": 1.1454948301329395, "grad_norm": 0.29163575172424316, "learning_rate": 0.00012370981647986205, "loss": 0.2629, "step": 3102 }, { "epoch": 1.145864106351551, "grad_norm": 0.2712344527244568, "learning_rate": 0.00012368518290429856, "loss": 0.2303, "step": 3103 }, { "epoch": 1.1462333825701625, "grad_norm": 0.27596956491470337, "learning_rate": 0.00012366054932873508, "loss": 0.2243, "step": 3104 }, { "epoch": 1.146602658788774, "grad_norm": 0.253072589635849, "learning_rate": 0.00012363591575317157, "loss": 0.2265, "step": 3105 }, { "epoch": 1.1469719350073855, "grad_norm": 0.2831348776817322, "learning_rate": 0.00012361128217760808, "loss": 0.2431, "step": 3106 }, { "epoch": 1.147341211225997, "grad_norm": 0.25114157795906067, "learning_rate": 0.0001235866486020446, "loss": 0.2225, "step": 3107 }, { "epoch": 1.1477104874446087, "grad_norm": 0.25578370690345764, "learning_rate": 0.0001235620150264811, "loss": 0.2463, "step": 3108 }, { "epoch": 1.14807976366322, "grad_norm": 0.2710251212120056, "learning_rate": 0.0001235373814509176, "loss": 0.1833, "step": 3109 }, { "epoch": 1.1484490398818317, "grad_norm": 0.26096826791763306, "learning_rate": 0.0001235127478753541, "loss": 0.2136, "step": 3110 }, { "epoch": 1.148818316100443, "grad_norm": 0.30772802233695984, "learning_rate": 0.0001234881142997906, "loss": 0.2743, "step": 3111 }, { "epoch": 1.1491875923190547, "grad_norm": 0.33691519498825073, "learning_rate": 0.00012346348072422714, "loss": 0.3105, "step": 3112 }, { "epoch": 1.1495568685376663, "grad_norm": 0.2449200302362442, "learning_rate": 0.00012343884714866363, "loss": 0.2234, "step": 3113 }, { "epoch": 1.1499261447562776, "grad_norm": 0.29358598589897156, "learning_rate": 0.00012341421357310014, "loss": 0.2595, "step": 3114 }, { "epoch": 1.1502954209748892, "grad_norm": 0.3605344295501709, "learning_rate": 0.00012338957999753663, "loss": 0.3003, "step": 3115 }, { "epoch": 1.1506646971935006, "grad_norm": 0.2726616859436035, "learning_rate": 0.00012336494642197315, "loss": 0.2966, "step": 3116 }, { "epoch": 1.1510339734121122, "grad_norm": 0.28332284092903137, "learning_rate": 0.00012334031284640966, "loss": 0.2501, "step": 3117 }, { "epoch": 1.1514032496307238, "grad_norm": 0.2538849711418152, "learning_rate": 0.00012331567927084618, "loss": 0.2295, "step": 3118 }, { "epoch": 1.1517725258493354, "grad_norm": 0.24772146344184875, "learning_rate": 0.00012329104569528266, "loss": 0.2019, "step": 3119 }, { "epoch": 1.1521418020679468, "grad_norm": 0.24297967553138733, "learning_rate": 0.00012326641211971918, "loss": 0.2096, "step": 3120 }, { "epoch": 1.1525110782865584, "grad_norm": 0.2873733341693878, "learning_rate": 0.0001232417785441557, "loss": 0.2034, "step": 3121 }, { "epoch": 1.1528803545051698, "grad_norm": 0.2223925143480301, "learning_rate": 0.0001232171449685922, "loss": 0.2291, "step": 3122 }, { "epoch": 1.1532496307237814, "grad_norm": 0.2619134187698364, "learning_rate": 0.0001231925113930287, "loss": 0.2475, "step": 3123 }, { "epoch": 1.153618906942393, "grad_norm": 0.29636234045028687, "learning_rate": 0.0001231678778174652, "loss": 0.2267, "step": 3124 }, { "epoch": 1.1539881831610044, "grad_norm": 0.233274906873703, "learning_rate": 0.00012314324424190172, "loss": 0.2162, "step": 3125 }, { "epoch": 1.154357459379616, "grad_norm": 0.2567541301250458, "learning_rate": 0.00012311861066633824, "loss": 0.2105, "step": 3126 }, { "epoch": 1.1547267355982274, "grad_norm": 0.28177598118782043, "learning_rate": 0.00012309397709077473, "loss": 0.2777, "step": 3127 }, { "epoch": 1.155096011816839, "grad_norm": 0.295654296875, "learning_rate": 0.00012306934351521124, "loss": 0.2765, "step": 3128 }, { "epoch": 1.1554652880354506, "grad_norm": 0.2781972587108612, "learning_rate": 0.00012304470993964773, "loss": 0.2207, "step": 3129 }, { "epoch": 1.155834564254062, "grad_norm": 0.3001963794231415, "learning_rate": 0.00012302007636408427, "loss": 0.3032, "step": 3130 }, { "epoch": 1.1562038404726735, "grad_norm": 0.2637580633163452, "learning_rate": 0.00012299544278852076, "loss": 0.2143, "step": 3131 }, { "epoch": 1.1565731166912852, "grad_norm": 0.2356804609298706, "learning_rate": 0.00012297080921295727, "loss": 0.2345, "step": 3132 }, { "epoch": 1.1569423929098965, "grad_norm": 0.250374972820282, "learning_rate": 0.00012294617563739376, "loss": 0.2176, "step": 3133 }, { "epoch": 1.1573116691285081, "grad_norm": 0.3240339159965515, "learning_rate": 0.00012292154206183027, "loss": 0.251, "step": 3134 }, { "epoch": 1.1576809453471197, "grad_norm": 0.3229665756225586, "learning_rate": 0.0001228969084862668, "loss": 0.2673, "step": 3135 }, { "epoch": 1.1580502215657311, "grad_norm": 0.3183058202266693, "learning_rate": 0.0001228722749107033, "loss": 0.27, "step": 3136 }, { "epoch": 1.1584194977843427, "grad_norm": 0.26968470215797424, "learning_rate": 0.0001228476413351398, "loss": 0.2282, "step": 3137 }, { "epoch": 1.158788774002954, "grad_norm": 0.3959614634513855, "learning_rate": 0.0001228230077595763, "loss": 0.3621, "step": 3138 }, { "epoch": 1.1591580502215657, "grad_norm": 0.28337931632995605, "learning_rate": 0.00012279837418401282, "loss": 0.2816, "step": 3139 }, { "epoch": 1.1595273264401773, "grad_norm": 0.3202999234199524, "learning_rate": 0.00012277374060844934, "loss": 0.2623, "step": 3140 }, { "epoch": 1.1598966026587887, "grad_norm": 0.30495762825012207, "learning_rate": 0.00012274910703288582, "loss": 0.2255, "step": 3141 }, { "epoch": 1.1602658788774003, "grad_norm": 0.3435508906841278, "learning_rate": 0.00012272447345732234, "loss": 0.268, "step": 3142 }, { "epoch": 1.160635155096012, "grad_norm": 0.2825329303741455, "learning_rate": 0.00012269983988175883, "loss": 0.2358, "step": 3143 }, { "epoch": 1.1610044313146233, "grad_norm": 0.2530229091644287, "learning_rate": 0.00012267520630619537, "loss": 0.2061, "step": 3144 }, { "epoch": 1.1613737075332349, "grad_norm": 0.27512991428375244, "learning_rate": 0.00012265057273063185, "loss": 0.2342, "step": 3145 }, { "epoch": 1.1617429837518465, "grad_norm": 0.26814547181129456, "learning_rate": 0.00012262593915506837, "loss": 0.256, "step": 3146 }, { "epoch": 1.1621122599704579, "grad_norm": 0.2830052971839905, "learning_rate": 0.00012260130557950486, "loss": 0.2467, "step": 3147 }, { "epoch": 1.1624815361890695, "grad_norm": 0.29037684202194214, "learning_rate": 0.00012257667200394137, "loss": 0.2384, "step": 3148 }, { "epoch": 1.1628508124076808, "grad_norm": 0.2737725079059601, "learning_rate": 0.00012255203842837789, "loss": 0.2288, "step": 3149 }, { "epoch": 1.1632200886262924, "grad_norm": 0.28996843099594116, "learning_rate": 0.0001225274048528144, "loss": 0.2843, "step": 3150 }, { "epoch": 1.1632200886262924, "eval_loss": 0.2770858108997345, "eval_runtime": 5.8638, "eval_samples_per_second": 8.527, "eval_steps_per_second": 1.194, "step": 3150 }, { "epoch": 1.163589364844904, "grad_norm": 0.261055052280426, "learning_rate": 0.0001225027712772509, "loss": 0.2761, "step": 3151 }, { "epoch": 1.1639586410635154, "grad_norm": 0.2340794801712036, "learning_rate": 0.0001224781377016874, "loss": 0.2253, "step": 3152 }, { "epoch": 1.164327917282127, "grad_norm": 0.31548061966896057, "learning_rate": 0.00012245350412612392, "loss": 0.2828, "step": 3153 }, { "epoch": 1.1646971935007386, "grad_norm": 0.32488757371902466, "learning_rate": 0.00012242887055056043, "loss": 0.2747, "step": 3154 }, { "epoch": 1.16506646971935, "grad_norm": 0.2568996250629425, "learning_rate": 0.00012240423697499692, "loss": 0.2313, "step": 3155 }, { "epoch": 1.1654357459379616, "grad_norm": 0.23020611703395844, "learning_rate": 0.00012237960339943343, "loss": 0.2156, "step": 3156 }, { "epoch": 1.1658050221565732, "grad_norm": 0.23148971796035767, "learning_rate": 0.00012235496982386995, "loss": 0.2153, "step": 3157 }, { "epoch": 1.1661742983751846, "grad_norm": 0.3115021288394928, "learning_rate": 0.00012233033624830646, "loss": 0.2635, "step": 3158 }, { "epoch": 1.1665435745937962, "grad_norm": 0.2591547966003418, "learning_rate": 0.00012230570267274295, "loss": 0.2355, "step": 3159 }, { "epoch": 1.1669128508124076, "grad_norm": 0.2312883585691452, "learning_rate": 0.00012228106909717947, "loss": 0.2208, "step": 3160 }, { "epoch": 1.1672821270310192, "grad_norm": 0.26675599813461304, "learning_rate": 0.00012225643552161595, "loss": 0.2463, "step": 3161 }, { "epoch": 1.1676514032496308, "grad_norm": 0.26281824707984924, "learning_rate": 0.0001222318019460525, "loss": 0.2415, "step": 3162 }, { "epoch": 1.1680206794682422, "grad_norm": 0.24059298634529114, "learning_rate": 0.00012220716837048898, "loss": 0.2339, "step": 3163 }, { "epoch": 1.1683899556868538, "grad_norm": 0.2939622700214386, "learning_rate": 0.0001221825347949255, "loss": 0.2549, "step": 3164 }, { "epoch": 1.1687592319054654, "grad_norm": 0.2592087686061859, "learning_rate": 0.00012215790121936198, "loss": 0.239, "step": 3165 }, { "epoch": 1.1691285081240768, "grad_norm": 0.26341572403907776, "learning_rate": 0.0001221332676437985, "loss": 0.2254, "step": 3166 }, { "epoch": 1.1694977843426884, "grad_norm": 0.26745542883872986, "learning_rate": 0.00012210863406823501, "loss": 0.2166, "step": 3167 }, { "epoch": 1.1698670605613, "grad_norm": 0.26928338408470154, "learning_rate": 0.00012208400049267153, "loss": 0.2149, "step": 3168 }, { "epoch": 1.1702363367799113, "grad_norm": 0.2942584156990051, "learning_rate": 0.00012205936691710802, "loss": 0.2064, "step": 3169 }, { "epoch": 1.170605612998523, "grad_norm": 0.3437819480895996, "learning_rate": 0.00012203473334154454, "loss": 0.2328, "step": 3170 }, { "epoch": 1.1709748892171343, "grad_norm": 0.2597808837890625, "learning_rate": 0.00012201009976598103, "loss": 0.2175, "step": 3171 }, { "epoch": 1.171344165435746, "grad_norm": 0.27388855814933777, "learning_rate": 0.00012198546619041755, "loss": 0.263, "step": 3172 }, { "epoch": 1.1717134416543575, "grad_norm": 0.23952798545360565, "learning_rate": 0.00012196083261485405, "loss": 0.2097, "step": 3173 }, { "epoch": 1.172082717872969, "grad_norm": 0.2443709820508957, "learning_rate": 0.00012193619903929056, "loss": 0.1985, "step": 3174 }, { "epoch": 1.1724519940915805, "grad_norm": 0.2822941541671753, "learning_rate": 0.00012191156546372706, "loss": 0.244, "step": 3175 }, { "epoch": 1.172821270310192, "grad_norm": 0.24943234026432037, "learning_rate": 0.00012188693188816358, "loss": 0.2032, "step": 3176 }, { "epoch": 1.1731905465288035, "grad_norm": 0.2702721953392029, "learning_rate": 0.00012186229831260008, "loss": 0.2123, "step": 3177 }, { "epoch": 1.173559822747415, "grad_norm": 0.2794484794139862, "learning_rate": 0.0001218376647370366, "loss": 0.2593, "step": 3178 }, { "epoch": 1.1739290989660267, "grad_norm": 0.2570948600769043, "learning_rate": 0.0001218130311614731, "loss": 0.1915, "step": 3179 }, { "epoch": 1.174298375184638, "grad_norm": 0.24439947307109833, "learning_rate": 0.00012178839758590961, "loss": 0.2451, "step": 3180 }, { "epoch": 1.1746676514032497, "grad_norm": 0.2643444836139679, "learning_rate": 0.00012176376401034611, "loss": 0.2129, "step": 3181 }, { "epoch": 1.175036927621861, "grad_norm": 0.30255910754203796, "learning_rate": 0.00012173913043478263, "loss": 0.2637, "step": 3182 }, { "epoch": 1.1754062038404727, "grad_norm": 0.3298236131668091, "learning_rate": 0.00012171449685921911, "loss": 0.2776, "step": 3183 }, { "epoch": 1.1757754800590843, "grad_norm": 0.25277990102767944, "learning_rate": 0.00012168986328365564, "loss": 0.2088, "step": 3184 }, { "epoch": 1.1761447562776957, "grad_norm": 0.2743963599205017, "learning_rate": 0.00012166522970809213, "loss": 0.2579, "step": 3185 }, { "epoch": 1.1765140324963073, "grad_norm": 0.26288869976997375, "learning_rate": 0.00012164059613252866, "loss": 0.2181, "step": 3186 }, { "epoch": 1.1768833087149186, "grad_norm": 0.26495224237442017, "learning_rate": 0.00012161596255696514, "loss": 0.2185, "step": 3187 }, { "epoch": 1.1772525849335302, "grad_norm": 0.29768168926239014, "learning_rate": 0.00012159132898140166, "loss": 0.2642, "step": 3188 }, { "epoch": 1.1776218611521418, "grad_norm": 0.2437969595193863, "learning_rate": 0.00012156669540583816, "loss": 0.2102, "step": 3189 }, { "epoch": 1.1779911373707532, "grad_norm": 0.26859331130981445, "learning_rate": 0.00012154206183027467, "loss": 0.2545, "step": 3190 }, { "epoch": 1.1783604135893648, "grad_norm": 0.2850019335746765, "learning_rate": 0.00012151742825471118, "loss": 0.2372, "step": 3191 }, { "epoch": 1.1787296898079764, "grad_norm": 0.24644847214221954, "learning_rate": 0.00012149279467914769, "loss": 0.2193, "step": 3192 }, { "epoch": 1.1790989660265878, "grad_norm": 0.2813577950000763, "learning_rate": 0.00012146816110358419, "loss": 0.2636, "step": 3193 }, { "epoch": 1.1794682422451994, "grad_norm": 0.34138408303260803, "learning_rate": 0.0001214435275280207, "loss": 0.3041, "step": 3194 }, { "epoch": 1.179837518463811, "grad_norm": 0.29984021186828613, "learning_rate": 0.00012141889395245721, "loss": 0.2123, "step": 3195 }, { "epoch": 1.1802067946824224, "grad_norm": 0.23603621125221252, "learning_rate": 0.00012139426037689372, "loss": 0.2095, "step": 3196 }, { "epoch": 1.180576070901034, "grad_norm": 0.26498091220855713, "learning_rate": 0.00012136962680133022, "loss": 0.2647, "step": 3197 }, { "epoch": 1.1809453471196454, "grad_norm": 0.282332181930542, "learning_rate": 0.00012134499322576674, "loss": 0.2369, "step": 3198 }, { "epoch": 1.181314623338257, "grad_norm": 0.2596491575241089, "learning_rate": 0.00012132035965020322, "loss": 0.2263, "step": 3199 }, { "epoch": 1.1816838995568686, "grad_norm": 0.2347802370786667, "learning_rate": 0.00012129572607463975, "loss": 0.2103, "step": 3200 }, { "epoch": 1.1816838995568686, "eval_loss": 0.27301788330078125, "eval_runtime": 5.8695, "eval_samples_per_second": 8.519, "eval_steps_per_second": 1.193, "step": 3200 }, { "epoch": 1.18205317577548, "grad_norm": 0.25488463044166565, "learning_rate": 0.00012127109249907624, "loss": 0.2309, "step": 3201 }, { "epoch": 1.1824224519940916, "grad_norm": 0.2890549600124359, "learning_rate": 0.00012124645892351277, "loss": 0.2486, "step": 3202 }, { "epoch": 1.1827917282127032, "grad_norm": 0.30259156227111816, "learning_rate": 0.00012122182534794926, "loss": 0.2189, "step": 3203 }, { "epoch": 1.1831610044313146, "grad_norm": 0.23023875057697296, "learning_rate": 0.00012119719177238577, "loss": 0.2202, "step": 3204 }, { "epoch": 1.1835302806499262, "grad_norm": 0.2631145119667053, "learning_rate": 0.00012117255819682227, "loss": 0.2351, "step": 3205 }, { "epoch": 1.1838995568685378, "grad_norm": 0.2502356469631195, "learning_rate": 0.00012114792462125879, "loss": 0.2403, "step": 3206 }, { "epoch": 1.1842688330871491, "grad_norm": 0.25082942843437195, "learning_rate": 0.00012112329104569529, "loss": 0.2001, "step": 3207 }, { "epoch": 1.1846381093057607, "grad_norm": 0.3217408359050751, "learning_rate": 0.0001210986574701318, "loss": 0.2396, "step": 3208 }, { "epoch": 1.1850073855243721, "grad_norm": 0.2907535135746002, "learning_rate": 0.0001210740238945683, "loss": 0.2348, "step": 3209 }, { "epoch": 1.1853766617429837, "grad_norm": 0.2987769842147827, "learning_rate": 0.00012104939031900482, "loss": 0.2543, "step": 3210 }, { "epoch": 1.1857459379615953, "grad_norm": 0.2710782289505005, "learning_rate": 0.00012102475674344132, "loss": 0.2465, "step": 3211 }, { "epoch": 1.1861152141802067, "grad_norm": 0.2633363902568817, "learning_rate": 0.00012100012316787783, "loss": 0.2367, "step": 3212 }, { "epoch": 1.1864844903988183, "grad_norm": 0.23605535924434662, "learning_rate": 0.00012097548959231434, "loss": 0.2265, "step": 3213 }, { "epoch": 1.18685376661743, "grad_norm": 0.3763718008995056, "learning_rate": 0.00012095085601675085, "loss": 0.2352, "step": 3214 }, { "epoch": 1.1872230428360413, "grad_norm": 0.2694814205169678, "learning_rate": 0.00012092622244118734, "loss": 0.2123, "step": 3215 }, { "epoch": 1.187592319054653, "grad_norm": 0.23994052410125732, "learning_rate": 0.00012090158886562387, "loss": 0.2079, "step": 3216 }, { "epoch": 1.1879615952732645, "grad_norm": 0.3044906258583069, "learning_rate": 0.00012087695529006035, "loss": 0.2245, "step": 3217 }, { "epoch": 1.1883308714918759, "grad_norm": 0.2753120958805084, "learning_rate": 0.00012085232171449688, "loss": 0.2345, "step": 3218 }, { "epoch": 1.1887001477104875, "grad_norm": 0.28482258319854736, "learning_rate": 0.00012082768813893337, "loss": 0.2458, "step": 3219 }, { "epoch": 1.1890694239290989, "grad_norm": 0.2305278182029724, "learning_rate": 0.00012080305456336988, "loss": 0.2336, "step": 3220 }, { "epoch": 1.1894387001477105, "grad_norm": 0.2535829246044159, "learning_rate": 0.00012077842098780638, "loss": 0.2363, "step": 3221 }, { "epoch": 1.189807976366322, "grad_norm": 0.2498999387025833, "learning_rate": 0.0001207537874122429, "loss": 0.2378, "step": 3222 }, { "epoch": 1.1901772525849335, "grad_norm": 0.305387943983078, "learning_rate": 0.0001207291538366794, "loss": 0.248, "step": 3223 }, { "epoch": 1.190546528803545, "grad_norm": 0.27258002758026123, "learning_rate": 0.00012070452026111591, "loss": 0.2297, "step": 3224 }, { "epoch": 1.1909158050221567, "grad_norm": 0.25035011768341064, "learning_rate": 0.00012067988668555242, "loss": 0.2447, "step": 3225 }, { "epoch": 1.191285081240768, "grad_norm": 0.26438868045806885, "learning_rate": 0.00012065525310998893, "loss": 0.1997, "step": 3226 }, { "epoch": 1.1916543574593796, "grad_norm": 0.3002438247203827, "learning_rate": 0.00012063061953442543, "loss": 0.3065, "step": 3227 }, { "epoch": 1.1920236336779912, "grad_norm": 0.3621065020561218, "learning_rate": 0.00012060598595886195, "loss": 0.2116, "step": 3228 }, { "epoch": 1.1923929098966026, "grad_norm": 0.25311121344566345, "learning_rate": 0.00012058135238329843, "loss": 0.2062, "step": 3229 }, { "epoch": 1.1927621861152142, "grad_norm": 0.21173089742660522, "learning_rate": 0.00012055671880773496, "loss": 0.1927, "step": 3230 }, { "epoch": 1.1931314623338256, "grad_norm": 0.2535032331943512, "learning_rate": 0.00012053208523217145, "loss": 0.2252, "step": 3231 }, { "epoch": 1.1935007385524372, "grad_norm": 0.3750533163547516, "learning_rate": 0.00012050745165660798, "loss": 0.2191, "step": 3232 }, { "epoch": 1.1938700147710488, "grad_norm": 0.2945975363254547, "learning_rate": 0.00012048281808104447, "loss": 0.2761, "step": 3233 }, { "epoch": 1.1942392909896602, "grad_norm": 0.25776052474975586, "learning_rate": 0.00012045818450548098, "loss": 0.2611, "step": 3234 }, { "epoch": 1.1946085672082718, "grad_norm": 0.21198132634162903, "learning_rate": 0.00012043355092991748, "loss": 0.2043, "step": 3235 }, { "epoch": 1.1949778434268834, "grad_norm": 0.24576450884342194, "learning_rate": 0.000120408917354354, "loss": 0.2492, "step": 3236 }, { "epoch": 1.1953471196454948, "grad_norm": 0.27762892842292786, "learning_rate": 0.0001203842837787905, "loss": 0.2776, "step": 3237 }, { "epoch": 1.1957163958641064, "grad_norm": 0.24789641797542572, "learning_rate": 0.00012035965020322701, "loss": 0.2199, "step": 3238 }, { "epoch": 1.196085672082718, "grad_norm": 0.24140557646751404, "learning_rate": 0.00012033501662766351, "loss": 0.2301, "step": 3239 }, { "epoch": 1.1964549483013294, "grad_norm": 0.24686704576015472, "learning_rate": 0.00012031038305210003, "loss": 0.2119, "step": 3240 }, { "epoch": 1.196824224519941, "grad_norm": 0.23982328176498413, "learning_rate": 0.00012028574947653653, "loss": 0.2204, "step": 3241 }, { "epoch": 1.1971935007385524, "grad_norm": 0.24495044350624084, "learning_rate": 0.00012026111590097304, "loss": 0.2129, "step": 3242 }, { "epoch": 1.197562776957164, "grad_norm": 0.23792871832847595, "learning_rate": 0.00012023648232540954, "loss": 0.2325, "step": 3243 }, { "epoch": 1.1979320531757756, "grad_norm": 0.2640572190284729, "learning_rate": 0.00012021184874984606, "loss": 0.211, "step": 3244 }, { "epoch": 1.198301329394387, "grad_norm": 0.28885048627853394, "learning_rate": 0.00012018721517428255, "loss": 0.2044, "step": 3245 }, { "epoch": 1.1986706056129985, "grad_norm": 0.23012898862361908, "learning_rate": 0.00012016258159871907, "loss": 0.2272, "step": 3246 }, { "epoch": 1.19903988183161, "grad_norm": 0.26595887541770935, "learning_rate": 0.00012013794802315556, "loss": 0.2221, "step": 3247 }, { "epoch": 1.1994091580502215, "grad_norm": 0.3253090977668762, "learning_rate": 0.00012011331444759209, "loss": 0.2502, "step": 3248 }, { "epoch": 1.1997784342688331, "grad_norm": 0.26922520995140076, "learning_rate": 0.00012008868087202858, "loss": 0.2376, "step": 3249 }, { "epoch": 1.2001477104874447, "grad_norm": 0.2791667878627777, "learning_rate": 0.00012006404729646509, "loss": 0.2025, "step": 3250 }, { "epoch": 1.2001477104874447, "eval_loss": 0.27150794863700867, "eval_runtime": 5.8659, "eval_samples_per_second": 8.524, "eval_steps_per_second": 1.193, "step": 3250 }, { "epoch": 1.200516986706056, "grad_norm": 0.2856389880180359, "learning_rate": 0.00012003941372090159, "loss": 0.2313, "step": 3251 }, { "epoch": 1.2008862629246677, "grad_norm": 0.2949007749557495, "learning_rate": 0.00012001478014533811, "loss": 0.2517, "step": 3252 }, { "epoch": 1.201255539143279, "grad_norm": 0.214605912566185, "learning_rate": 0.00011999014656977461, "loss": 0.2241, "step": 3253 }, { "epoch": 1.2016248153618907, "grad_norm": 0.22103802859783173, "learning_rate": 0.00011996551299421111, "loss": 0.2072, "step": 3254 }, { "epoch": 1.2019940915805023, "grad_norm": 0.25153616070747375, "learning_rate": 0.00011994087941864762, "loss": 0.215, "step": 3255 }, { "epoch": 1.2023633677991137, "grad_norm": 0.38636547327041626, "learning_rate": 0.00011991624584308411, "loss": 0.2629, "step": 3256 }, { "epoch": 1.2027326440177253, "grad_norm": 0.2714768648147583, "learning_rate": 0.00011989161226752064, "loss": 0.2468, "step": 3257 }, { "epoch": 1.2031019202363367, "grad_norm": 0.29080840945243835, "learning_rate": 0.00011986697869195713, "loss": 0.2382, "step": 3258 }, { "epoch": 1.2034711964549483, "grad_norm": 0.31261980533599854, "learning_rate": 0.00011984234511639366, "loss": 0.2995, "step": 3259 }, { "epoch": 1.2038404726735599, "grad_norm": 0.2760385572910309, "learning_rate": 0.00011981771154083014, "loss": 0.2316, "step": 3260 }, { "epoch": 1.2042097488921713, "grad_norm": 0.3133525848388672, "learning_rate": 0.00011979307796526666, "loss": 0.2405, "step": 3261 }, { "epoch": 1.2045790251107829, "grad_norm": 0.2454364150762558, "learning_rate": 0.00011976844438970316, "loss": 0.2457, "step": 3262 }, { "epoch": 1.2049483013293945, "grad_norm": 0.22785277664661407, "learning_rate": 0.00011974381081413967, "loss": 0.1819, "step": 3263 }, { "epoch": 1.2053175775480058, "grad_norm": 0.25301745533943176, "learning_rate": 0.00011971917723857618, "loss": 0.2416, "step": 3264 }, { "epoch": 1.2056868537666174, "grad_norm": 0.25902584195137024, "learning_rate": 0.00011969454366301269, "loss": 0.2255, "step": 3265 }, { "epoch": 1.206056129985229, "grad_norm": 0.2496347427368164, "learning_rate": 0.00011966991008744919, "loss": 0.2418, "step": 3266 }, { "epoch": 1.2064254062038404, "grad_norm": 0.3066750466823578, "learning_rate": 0.0001196452765118857, "loss": 0.284, "step": 3267 }, { "epoch": 1.206794682422452, "grad_norm": 0.28269433975219727, "learning_rate": 0.0001196206429363222, "loss": 0.2428, "step": 3268 }, { "epoch": 1.2071639586410634, "grad_norm": 0.2589324712753296, "learning_rate": 0.00011959600936075872, "loss": 0.2433, "step": 3269 }, { "epoch": 1.207533234859675, "grad_norm": 0.2615012228488922, "learning_rate": 0.00011957137578519522, "loss": 0.2566, "step": 3270 }, { "epoch": 1.2079025110782866, "grad_norm": 0.26669344305992126, "learning_rate": 0.00011954674220963174, "loss": 0.2573, "step": 3271 }, { "epoch": 1.208271787296898, "grad_norm": 0.24091607332229614, "learning_rate": 0.00011952210863406822, "loss": 0.2455, "step": 3272 }, { "epoch": 1.2086410635155096, "grad_norm": 0.2977018654346466, "learning_rate": 0.00011949747505850475, "loss": 0.2662, "step": 3273 }, { "epoch": 1.2090103397341212, "grad_norm": 0.3145335912704468, "learning_rate": 0.00011947284148294124, "loss": 0.2715, "step": 3274 }, { "epoch": 1.2093796159527326, "grad_norm": 0.2826317846775055, "learning_rate": 0.00011944820790737777, "loss": 0.3068, "step": 3275 }, { "epoch": 1.2097488921713442, "grad_norm": 0.3221205174922943, "learning_rate": 0.00011942357433181426, "loss": 0.2602, "step": 3276 }, { "epoch": 1.2101181683899558, "grad_norm": 0.33315688371658325, "learning_rate": 0.00011939894075625077, "loss": 0.3002, "step": 3277 }, { "epoch": 1.2104874446085672, "grad_norm": 0.2790661156177521, "learning_rate": 0.00011937430718068727, "loss": 0.2621, "step": 3278 }, { "epoch": 1.2108567208271788, "grad_norm": 0.27199870347976685, "learning_rate": 0.00011934967360512379, "loss": 0.244, "step": 3279 }, { "epoch": 1.2112259970457901, "grad_norm": 0.2701851427555084, "learning_rate": 0.00011932504002956029, "loss": 0.2147, "step": 3280 }, { "epoch": 1.2115952732644018, "grad_norm": 0.35070064663887024, "learning_rate": 0.0001193004064539968, "loss": 0.2465, "step": 3281 }, { "epoch": 1.2119645494830134, "grad_norm": 0.5654131770133972, "learning_rate": 0.0001192757728784333, "loss": 0.2967, "step": 3282 }, { "epoch": 1.2123338257016247, "grad_norm": 0.267313688993454, "learning_rate": 0.00011925113930286982, "loss": 0.2194, "step": 3283 }, { "epoch": 1.2127031019202363, "grad_norm": 0.2560022175312042, "learning_rate": 0.00011922650572730632, "loss": 0.2502, "step": 3284 }, { "epoch": 1.213072378138848, "grad_norm": 0.21603523194789886, "learning_rate": 0.00011920187215174283, "loss": 0.1819, "step": 3285 }, { "epoch": 1.2134416543574593, "grad_norm": 0.32343846559524536, "learning_rate": 0.00011917723857617933, "loss": 0.2671, "step": 3286 }, { "epoch": 1.213810930576071, "grad_norm": 0.3197554647922516, "learning_rate": 0.00011915260500061585, "loss": 0.2421, "step": 3287 }, { "epoch": 1.2141802067946825, "grad_norm": 0.2887294292449951, "learning_rate": 0.00011912797142505234, "loss": 0.2321, "step": 3288 }, { "epoch": 1.214549483013294, "grad_norm": 0.2647448182106018, "learning_rate": 0.00011910333784948886, "loss": 0.2153, "step": 3289 }, { "epoch": 1.2149187592319055, "grad_norm": 0.31124141812324524, "learning_rate": 0.00011907870427392535, "loss": 0.2645, "step": 3290 }, { "epoch": 1.215288035450517, "grad_norm": 0.28668105602264404, "learning_rate": 0.00011905407069836188, "loss": 0.2719, "step": 3291 }, { "epoch": 1.2156573116691285, "grad_norm": 0.2660035789012909, "learning_rate": 0.00011902943712279837, "loss": 0.2273, "step": 3292 }, { "epoch": 1.21602658788774, "grad_norm": 0.2613721787929535, "learning_rate": 0.00011900480354723488, "loss": 0.2473, "step": 3293 }, { "epoch": 1.2163958641063515, "grad_norm": 0.29387205839157104, "learning_rate": 0.00011898016997167138, "loss": 0.2348, "step": 3294 }, { "epoch": 1.216765140324963, "grad_norm": 0.3106662929058075, "learning_rate": 0.0001189555363961079, "loss": 0.2437, "step": 3295 }, { "epoch": 1.2171344165435747, "grad_norm": 0.26709648966789246, "learning_rate": 0.0001189309028205444, "loss": 0.2829, "step": 3296 }, { "epoch": 1.217503692762186, "grad_norm": 0.338077187538147, "learning_rate": 0.00011890626924498091, "loss": 0.2754, "step": 3297 }, { "epoch": 1.2178729689807977, "grad_norm": 0.23778629302978516, "learning_rate": 0.00011888163566941742, "loss": 0.2257, "step": 3298 }, { "epoch": 1.2182422451994093, "grad_norm": 0.23242737352848053, "learning_rate": 0.00011885700209385393, "loss": 0.236, "step": 3299 }, { "epoch": 1.2186115214180206, "grad_norm": 0.2840418517589569, "learning_rate": 0.00011883236851829043, "loss": 0.254, "step": 3300 }, { "epoch": 1.2186115214180206, "eval_loss": 0.271670937538147, "eval_runtime": 5.8605, "eval_samples_per_second": 8.532, "eval_steps_per_second": 1.194, "step": 3300 }, { "epoch": 1.2189807976366323, "grad_norm": 0.26939278841018677, "learning_rate": 0.00011880773494272695, "loss": 0.2356, "step": 3301 }, { "epoch": 1.2193500738552436, "grad_norm": 0.2695137560367584, "learning_rate": 0.00011878310136716345, "loss": 0.2563, "step": 3302 }, { "epoch": 1.2197193500738552, "grad_norm": 0.30331453680992126, "learning_rate": 0.00011875846779159996, "loss": 0.2666, "step": 3303 }, { "epoch": 1.2200886262924668, "grad_norm": 0.2266516387462616, "learning_rate": 0.00011873383421603645, "loss": 0.2413, "step": 3304 }, { "epoch": 1.2204579025110782, "grad_norm": 0.314443439245224, "learning_rate": 0.00011870920064047298, "loss": 0.2487, "step": 3305 }, { "epoch": 1.2208271787296898, "grad_norm": 0.24367497861385345, "learning_rate": 0.00011868456706490946, "loss": 0.242, "step": 3306 }, { "epoch": 1.2211964549483014, "grad_norm": 0.2384032905101776, "learning_rate": 0.00011865993348934599, "loss": 0.2482, "step": 3307 }, { "epoch": 1.2215657311669128, "grad_norm": 0.28105780482292175, "learning_rate": 0.00011863529991378248, "loss": 0.2306, "step": 3308 }, { "epoch": 1.2219350073855244, "grad_norm": 0.2741890549659729, "learning_rate": 0.000118610666338219, "loss": 0.2582, "step": 3309 }, { "epoch": 1.222304283604136, "grad_norm": 0.28679463267326355, "learning_rate": 0.0001185860327626555, "loss": 0.2671, "step": 3310 }, { "epoch": 1.2226735598227474, "grad_norm": 0.24229590594768524, "learning_rate": 0.00011856139918709201, "loss": 0.207, "step": 3311 }, { "epoch": 1.223042836041359, "grad_norm": 0.27817606925964355, "learning_rate": 0.00011853676561152851, "loss": 0.2764, "step": 3312 }, { "epoch": 1.2234121122599704, "grad_norm": 0.2285379320383072, "learning_rate": 0.00011851213203596503, "loss": 0.2185, "step": 3313 }, { "epoch": 1.223781388478582, "grad_norm": 0.2450941950082779, "learning_rate": 0.00011848749846040153, "loss": 0.2631, "step": 3314 }, { "epoch": 1.2241506646971936, "grad_norm": 0.24236904084682465, "learning_rate": 0.00011846286488483804, "loss": 0.1996, "step": 3315 }, { "epoch": 1.224519940915805, "grad_norm": 0.26536843180656433, "learning_rate": 0.00011843823130927454, "loss": 0.2282, "step": 3316 }, { "epoch": 1.2248892171344166, "grad_norm": 0.31366321444511414, "learning_rate": 0.00011841359773371106, "loss": 0.2292, "step": 3317 }, { "epoch": 1.225258493353028, "grad_norm": 0.21941806375980377, "learning_rate": 0.00011838896415814756, "loss": 0.1989, "step": 3318 }, { "epoch": 1.2256277695716395, "grad_norm": 0.2817480266094208, "learning_rate": 0.00011836433058258407, "loss": 0.2342, "step": 3319 }, { "epoch": 1.2259970457902511, "grad_norm": 0.3172677755355835, "learning_rate": 0.00011833969700702056, "loss": 0.2157, "step": 3320 }, { "epoch": 1.2263663220088628, "grad_norm": 0.2760121822357178, "learning_rate": 0.00011831506343145709, "loss": 0.223, "step": 3321 }, { "epoch": 1.2267355982274741, "grad_norm": 0.29548752307891846, "learning_rate": 0.00011829042985589358, "loss": 0.2042, "step": 3322 }, { "epoch": 1.2271048744460857, "grad_norm": 0.3663739860057831, "learning_rate": 0.0001182657962803301, "loss": 0.2965, "step": 3323 }, { "epoch": 1.2274741506646971, "grad_norm": 0.3228389024734497, "learning_rate": 0.00011824116270476659, "loss": 0.2863, "step": 3324 }, { "epoch": 1.2278434268833087, "grad_norm": 0.41276058554649353, "learning_rate": 0.00011821652912920311, "loss": 0.257, "step": 3325 }, { "epoch": 1.2282127031019203, "grad_norm": 0.2709147334098816, "learning_rate": 0.00011819189555363961, "loss": 0.2395, "step": 3326 }, { "epoch": 1.2285819793205317, "grad_norm": 0.29278793931007385, "learning_rate": 0.00011816726197807612, "loss": 0.249, "step": 3327 }, { "epoch": 1.2289512555391433, "grad_norm": 0.23940005898475647, "learning_rate": 0.00011814262840251262, "loss": 0.2156, "step": 3328 }, { "epoch": 1.2293205317577547, "grad_norm": 0.2927228808403015, "learning_rate": 0.00011811799482694914, "loss": 0.2468, "step": 3329 }, { "epoch": 1.2296898079763663, "grad_norm": 0.28324705362319946, "learning_rate": 0.00011809336125138564, "loss": 0.2062, "step": 3330 }, { "epoch": 1.230059084194978, "grad_norm": 0.2211814522743225, "learning_rate": 0.00011806872767582215, "loss": 0.2028, "step": 3331 }, { "epoch": 1.2304283604135893, "grad_norm": 0.31836825609207153, "learning_rate": 0.00011804409410025866, "loss": 0.285, "step": 3332 }, { "epoch": 1.2307976366322009, "grad_norm": 0.29719287157058716, "learning_rate": 0.00011801946052469517, "loss": 0.2157, "step": 3333 }, { "epoch": 1.2311669128508125, "grad_norm": 0.2482568621635437, "learning_rate": 0.00011799482694913167, "loss": 0.2333, "step": 3334 }, { "epoch": 1.2315361890694239, "grad_norm": 0.27771538496017456, "learning_rate": 0.00011797019337356819, "loss": 0.2354, "step": 3335 }, { "epoch": 1.2319054652880355, "grad_norm": 0.2799612879753113, "learning_rate": 0.00011794555979800467, "loss": 0.2475, "step": 3336 }, { "epoch": 1.232274741506647, "grad_norm": 0.24728535115718842, "learning_rate": 0.0001179209262224412, "loss": 0.231, "step": 3337 }, { "epoch": 1.2326440177252584, "grad_norm": 0.24407118558883667, "learning_rate": 0.00011789629264687769, "loss": 0.2125, "step": 3338 }, { "epoch": 1.23301329394387, "grad_norm": 0.34859201312065125, "learning_rate": 0.00011787165907131422, "loss": 0.2651, "step": 3339 }, { "epoch": 1.2333825701624814, "grad_norm": 0.2828972041606903, "learning_rate": 0.0001178470254957507, "loss": 0.2557, "step": 3340 }, { "epoch": 1.233751846381093, "grad_norm": 0.3028653860092163, "learning_rate": 0.00011782239192018722, "loss": 0.2545, "step": 3341 }, { "epoch": 1.2341211225997046, "grad_norm": 0.2722667157649994, "learning_rate": 0.00011779775834462372, "loss": 0.2406, "step": 3342 }, { "epoch": 1.234490398818316, "grad_norm": 0.28357529640197754, "learning_rate": 0.00011777312476906024, "loss": 0.2486, "step": 3343 }, { "epoch": 1.2348596750369276, "grad_norm": 0.24250741302967072, "learning_rate": 0.00011774849119349674, "loss": 0.2506, "step": 3344 }, { "epoch": 1.2352289512555392, "grad_norm": 0.24236734211444855, "learning_rate": 0.00011772385761793325, "loss": 0.2218, "step": 3345 }, { "epoch": 1.2355982274741506, "grad_norm": 0.5239112377166748, "learning_rate": 0.00011769922404236975, "loss": 0.3046, "step": 3346 }, { "epoch": 1.2359675036927622, "grad_norm": 0.253016859292984, "learning_rate": 0.00011767459046680627, "loss": 0.2395, "step": 3347 }, { "epoch": 1.2363367799113738, "grad_norm": 0.29107823967933655, "learning_rate": 0.00011764995689124277, "loss": 0.2733, "step": 3348 }, { "epoch": 1.2367060561299852, "grad_norm": 0.28030651807785034, "learning_rate": 0.00011762532331567928, "loss": 0.2579, "step": 3349 }, { "epoch": 1.2370753323485968, "grad_norm": 0.2902437448501587, "learning_rate": 0.00011760068974011578, "loss": 0.2443, "step": 3350 }, { "epoch": 1.2370753323485968, "eval_loss": 0.2730793058872223, "eval_runtime": 5.8659, "eval_samples_per_second": 8.524, "eval_steps_per_second": 1.193, "step": 3350 }, { "epoch": 1.2374446085672082, "grad_norm": 0.29294252395629883, "learning_rate": 0.0001175760561645523, "loss": 0.2696, "step": 3351 }, { "epoch": 1.2378138847858198, "grad_norm": 0.25074857473373413, "learning_rate": 0.00011755142258898879, "loss": 0.216, "step": 3352 }, { "epoch": 1.2381831610044314, "grad_norm": 0.2616208791732788, "learning_rate": 0.00011752678901342531, "loss": 0.2676, "step": 3353 }, { "epoch": 1.2385524372230428, "grad_norm": 0.28293970227241516, "learning_rate": 0.0001175021554378618, "loss": 0.2803, "step": 3354 }, { "epoch": 1.2389217134416544, "grad_norm": 0.24231275916099548, "learning_rate": 0.00011747752186229833, "loss": 0.2366, "step": 3355 }, { "epoch": 1.239290989660266, "grad_norm": 0.217429518699646, "learning_rate": 0.00011745288828673482, "loss": 0.209, "step": 3356 }, { "epoch": 1.2396602658788773, "grad_norm": 0.2831771969795227, "learning_rate": 0.00011742825471117133, "loss": 0.2674, "step": 3357 }, { "epoch": 1.240029542097489, "grad_norm": 0.24386093020439148, "learning_rate": 0.00011740362113560783, "loss": 0.2052, "step": 3358 }, { "epoch": 1.2403988183161005, "grad_norm": 0.2491457760334015, "learning_rate": 0.00011737898756004435, "loss": 0.2344, "step": 3359 }, { "epoch": 1.240768094534712, "grad_norm": 0.22436252236366272, "learning_rate": 0.00011735435398448085, "loss": 0.1829, "step": 3360 }, { "epoch": 1.2411373707533235, "grad_norm": 0.2288627028465271, "learning_rate": 0.00011732972040891736, "loss": 0.2106, "step": 3361 }, { "epoch": 1.241506646971935, "grad_norm": 0.35576456785202026, "learning_rate": 0.00011730508683335386, "loss": 0.2411, "step": 3362 }, { "epoch": 1.2418759231905465, "grad_norm": 0.271321177482605, "learning_rate": 0.00011728045325779038, "loss": 0.2619, "step": 3363 }, { "epoch": 1.2422451994091581, "grad_norm": 0.2414872944355011, "learning_rate": 0.00011725581968222688, "loss": 0.2155, "step": 3364 }, { "epoch": 1.2426144756277695, "grad_norm": 0.25442835688591003, "learning_rate": 0.0001172311861066634, "loss": 0.2248, "step": 3365 }, { "epoch": 1.242983751846381, "grad_norm": 0.8595656156539917, "learning_rate": 0.0001172065525310999, "loss": 0.2787, "step": 3366 }, { "epoch": 1.2433530280649927, "grad_norm": 0.3119123578071594, "learning_rate": 0.00011718191895553641, "loss": 0.2665, "step": 3367 }, { "epoch": 1.243722304283604, "grad_norm": 0.29014065861701965, "learning_rate": 0.0001171572853799729, "loss": 0.2235, "step": 3368 }, { "epoch": 1.2440915805022157, "grad_norm": 0.2537941336631775, "learning_rate": 0.00011713265180440943, "loss": 0.235, "step": 3369 }, { "epoch": 1.2444608567208273, "grad_norm": 0.29093918204307556, "learning_rate": 0.00011710801822884591, "loss": 0.2536, "step": 3370 }, { "epoch": 1.2448301329394387, "grad_norm": 0.2843869626522064, "learning_rate": 0.00011708338465328244, "loss": 0.2347, "step": 3371 }, { "epoch": 1.2451994091580503, "grad_norm": 0.4438280463218689, "learning_rate": 0.00011705875107771893, "loss": 0.2377, "step": 3372 }, { "epoch": 1.2455686853766617, "grad_norm": 0.2818486988544464, "learning_rate": 0.00011703411750215544, "loss": 0.2344, "step": 3373 }, { "epoch": 1.2459379615952733, "grad_norm": 0.29656335711479187, "learning_rate": 0.00011700948392659195, "loss": 0.2495, "step": 3374 }, { "epoch": 1.2463072378138849, "grad_norm": 0.2644748389720917, "learning_rate": 0.00011698485035102846, "loss": 0.2356, "step": 3375 }, { "epoch": 1.2466765140324962, "grad_norm": 0.32339850068092346, "learning_rate": 0.00011696021677546496, "loss": 0.2401, "step": 3376 }, { "epoch": 1.2470457902511078, "grad_norm": 0.27086853981018066, "learning_rate": 0.00011693558319990148, "loss": 0.2429, "step": 3377 }, { "epoch": 1.2474150664697194, "grad_norm": 0.2287064641714096, "learning_rate": 0.00011691094962433798, "loss": 0.2272, "step": 3378 }, { "epoch": 1.2477843426883308, "grad_norm": 0.2801342010498047, "learning_rate": 0.00011688631604877449, "loss": 0.2895, "step": 3379 }, { "epoch": 1.2481536189069424, "grad_norm": 0.34044507145881653, "learning_rate": 0.00011686168247321099, "loss": 0.3109, "step": 3380 }, { "epoch": 1.248522895125554, "grad_norm": 0.2989860773086548, "learning_rate": 0.00011683704889764751, "loss": 0.2594, "step": 3381 }, { "epoch": 1.2488921713441654, "grad_norm": 0.2658219635486603, "learning_rate": 0.000116812415322084, "loss": 0.2233, "step": 3382 }, { "epoch": 1.249261447562777, "grad_norm": 0.2827605903148651, "learning_rate": 0.00011678778174652052, "loss": 0.218, "step": 3383 }, { "epoch": 1.2496307237813884, "grad_norm": 0.3575083315372467, "learning_rate": 0.00011676314817095701, "loss": 0.3148, "step": 3384 }, { "epoch": 1.25, "grad_norm": 0.32070037722587585, "learning_rate": 0.00011673851459539354, "loss": 0.2505, "step": 3385 }, { "epoch": 1.2503692762186116, "grad_norm": 0.23181554675102234, "learning_rate": 0.00011671388101983003, "loss": 0.2229, "step": 3386 }, { "epoch": 1.250738552437223, "grad_norm": 0.20842178165912628, "learning_rate": 0.00011668924744426654, "loss": 0.201, "step": 3387 }, { "epoch": 1.2511078286558346, "grad_norm": 0.21866343915462494, "learning_rate": 0.00011666461386870304, "loss": 0.2098, "step": 3388 }, { "epoch": 1.251477104874446, "grad_norm": 0.3296528160572052, "learning_rate": 0.00011663998029313956, "loss": 0.2112, "step": 3389 }, { "epoch": 1.2518463810930576, "grad_norm": 0.26271867752075195, "learning_rate": 0.00011661534671757606, "loss": 0.261, "step": 3390 }, { "epoch": 1.2522156573116692, "grad_norm": 0.2829124629497528, "learning_rate": 0.00011659071314201257, "loss": 0.2296, "step": 3391 }, { "epoch": 1.2525849335302808, "grad_norm": 0.26187101006507874, "learning_rate": 0.00011656607956644907, "loss": 0.2213, "step": 3392 }, { "epoch": 1.2529542097488922, "grad_norm": 0.2645774781703949, "learning_rate": 0.00011654144599088559, "loss": 0.2434, "step": 3393 }, { "epoch": 1.2533234859675038, "grad_norm": 0.27006688714027405, "learning_rate": 0.00011651681241532209, "loss": 0.2364, "step": 3394 }, { "epoch": 1.2536927621861151, "grad_norm": 0.2682693898677826, "learning_rate": 0.0001164921788397586, "loss": 0.2313, "step": 3395 }, { "epoch": 1.2540620384047267, "grad_norm": 0.29124483466148376, "learning_rate": 0.0001164675452641951, "loss": 0.2309, "step": 3396 }, { "epoch": 1.2544313146233383, "grad_norm": 0.2844143509864807, "learning_rate": 0.00011644291168863162, "loss": 0.2489, "step": 3397 }, { "epoch": 1.2548005908419497, "grad_norm": 0.24740177392959595, "learning_rate": 0.00011641827811306811, "loss": 0.212, "step": 3398 }, { "epoch": 1.2551698670605613, "grad_norm": 0.2769998013973236, "learning_rate": 0.00011639364453750463, "loss": 0.2421, "step": 3399 }, { "epoch": 1.2555391432791727, "grad_norm": 0.27294492721557617, "learning_rate": 0.00011636901096194112, "loss": 0.2532, "step": 3400 }, { "epoch": 1.2555391432791727, "eval_loss": 0.2674335241317749, "eval_runtime": 5.8599, "eval_samples_per_second": 8.533, "eval_steps_per_second": 1.195, "step": 3400 }, { "epoch": 1.2559084194977843, "grad_norm": 0.3451029658317566, "learning_rate": 0.00011634437738637765, "loss": 0.2621, "step": 3401 }, { "epoch": 1.256277695716396, "grad_norm": 0.28013747930526733, "learning_rate": 0.00011631974381081414, "loss": 0.2448, "step": 3402 }, { "epoch": 1.2566469719350075, "grad_norm": 0.26190176606178284, "learning_rate": 0.00011629511023525065, "loss": 0.2237, "step": 3403 }, { "epoch": 1.257016248153619, "grad_norm": 0.30306148529052734, "learning_rate": 0.00011627047665968715, "loss": 0.252, "step": 3404 }, { "epoch": 1.2573855243722305, "grad_norm": 0.2617952823638916, "learning_rate": 0.00011624584308412367, "loss": 0.2137, "step": 3405 }, { "epoch": 1.2577548005908419, "grad_norm": 0.35550904273986816, "learning_rate": 0.00011622120950856017, "loss": 0.266, "step": 3406 }, { "epoch": 1.2581240768094535, "grad_norm": 0.3020575940608978, "learning_rate": 0.00011619657593299668, "loss": 0.2807, "step": 3407 }, { "epoch": 1.258493353028065, "grad_norm": 0.30012327432632446, "learning_rate": 0.00011617194235743319, "loss": 0.2647, "step": 3408 }, { "epoch": 1.2588626292466765, "grad_norm": 0.29071366786956787, "learning_rate": 0.0001161473087818697, "loss": 0.2536, "step": 3409 }, { "epoch": 1.259231905465288, "grad_norm": 0.28802093863487244, "learning_rate": 0.0001161226752063062, "loss": 0.2628, "step": 3410 }, { "epoch": 1.2596011816838995, "grad_norm": 0.253380686044693, "learning_rate": 0.00011609804163074272, "loss": 0.2493, "step": 3411 }, { "epoch": 1.259970457902511, "grad_norm": 0.27335187792778015, "learning_rate": 0.00011607340805517922, "loss": 0.2475, "step": 3412 }, { "epoch": 1.2603397341211227, "grad_norm": 0.24106822907924652, "learning_rate": 0.00011604877447961573, "loss": 0.1973, "step": 3413 }, { "epoch": 1.2607090103397343, "grad_norm": 0.2284409999847412, "learning_rate": 0.00011602414090405222, "loss": 0.203, "step": 3414 }, { "epoch": 1.2610782865583456, "grad_norm": 0.2706008851528168, "learning_rate": 0.00011599950732848875, "loss": 0.2147, "step": 3415 }, { "epoch": 1.2614475627769572, "grad_norm": 0.27175143361091614, "learning_rate": 0.00011597487375292523, "loss": 0.2403, "step": 3416 }, { "epoch": 1.2618168389955686, "grad_norm": 0.2630060613155365, "learning_rate": 0.00011595024017736176, "loss": 0.2426, "step": 3417 }, { "epoch": 1.2621861152141802, "grad_norm": 0.266940176486969, "learning_rate": 0.00011592560660179825, "loss": 0.242, "step": 3418 }, { "epoch": 1.2625553914327918, "grad_norm": 0.2598400115966797, "learning_rate": 0.00011590097302623477, "loss": 0.2826, "step": 3419 }, { "epoch": 1.2629246676514032, "grad_norm": 0.2574462294578552, "learning_rate": 0.00011587633945067127, "loss": 0.2622, "step": 3420 }, { "epoch": 1.2632939438700148, "grad_norm": 0.27017250657081604, "learning_rate": 0.00011585170587510778, "loss": 0.2358, "step": 3421 }, { "epoch": 1.2636632200886262, "grad_norm": 0.30165326595306396, "learning_rate": 0.00011582707229954428, "loss": 0.2862, "step": 3422 }, { "epoch": 1.2640324963072378, "grad_norm": 0.2234553098678589, "learning_rate": 0.0001158024387239808, "loss": 0.1956, "step": 3423 }, { "epoch": 1.2644017725258494, "grad_norm": 0.299065500497818, "learning_rate": 0.0001157778051484173, "loss": 0.2207, "step": 3424 }, { "epoch": 1.2647710487444608, "grad_norm": 0.23009859025478363, "learning_rate": 0.00011575317157285381, "loss": 0.2, "step": 3425 }, { "epoch": 1.2651403249630724, "grad_norm": 0.31464433670043945, "learning_rate": 0.00011572853799729031, "loss": 0.2199, "step": 3426 }, { "epoch": 1.2655096011816838, "grad_norm": 0.2601170837879181, "learning_rate": 0.00011570390442172683, "loss": 0.2452, "step": 3427 }, { "epoch": 1.2658788774002954, "grad_norm": 0.21769161522388458, "learning_rate": 0.00011567927084616333, "loss": 0.1744, "step": 3428 }, { "epoch": 1.266248153618907, "grad_norm": 0.30806398391723633, "learning_rate": 0.00011565463727059984, "loss": 0.2467, "step": 3429 }, { "epoch": 1.2666174298375186, "grad_norm": 0.21978482604026794, "learning_rate": 0.00011563000369503633, "loss": 0.2407, "step": 3430 }, { "epoch": 1.26698670605613, "grad_norm": 0.2523432970046997, "learning_rate": 0.00011560537011947286, "loss": 0.2333, "step": 3431 }, { "epoch": 1.2673559822747416, "grad_norm": 0.31474968791007996, "learning_rate": 0.00011558073654390935, "loss": 0.2391, "step": 3432 }, { "epoch": 1.267725258493353, "grad_norm": 0.2926175892353058, "learning_rate": 0.00011555610296834588, "loss": 0.2344, "step": 3433 }, { "epoch": 1.2680945347119645, "grad_norm": 0.25597435235977173, "learning_rate": 0.00011553146939278236, "loss": 0.1791, "step": 3434 }, { "epoch": 1.2684638109305761, "grad_norm": 0.2518375813961029, "learning_rate": 0.00011550683581721888, "loss": 0.2316, "step": 3435 }, { "epoch": 1.2688330871491875, "grad_norm": 0.303515762090683, "learning_rate": 0.00011548220224165538, "loss": 0.2911, "step": 3436 }, { "epoch": 1.2692023633677991, "grad_norm": 0.33104947209358215, "learning_rate": 0.00011545756866609189, "loss": 0.2776, "step": 3437 }, { "epoch": 1.2695716395864105, "grad_norm": 0.2640734612941742, "learning_rate": 0.0001154329350905284, "loss": 0.2471, "step": 3438 }, { "epoch": 1.269940915805022, "grad_norm": 0.23754489421844482, "learning_rate": 0.00011540830151496491, "loss": 0.2209, "step": 3439 }, { "epoch": 1.2703101920236337, "grad_norm": 0.25826865434646606, "learning_rate": 0.00011538366793940141, "loss": 0.2148, "step": 3440 }, { "epoch": 1.2706794682422453, "grad_norm": 0.2712554335594177, "learning_rate": 0.00011535903436383792, "loss": 0.2439, "step": 3441 }, { "epoch": 1.2710487444608567, "grad_norm": 0.25869423151016235, "learning_rate": 0.00011533440078827443, "loss": 0.2273, "step": 3442 }, { "epoch": 1.2714180206794683, "grad_norm": 0.29017847776412964, "learning_rate": 0.00011530976721271094, "loss": 0.2136, "step": 3443 }, { "epoch": 1.2717872968980797, "grad_norm": 0.26316726207733154, "learning_rate": 0.00011528513363714744, "loss": 0.232, "step": 3444 }, { "epoch": 1.2721565731166913, "grad_norm": 0.27002277970314026, "learning_rate": 0.00011526050006158396, "loss": 0.226, "step": 3445 }, { "epoch": 1.2725258493353029, "grad_norm": 0.3249281048774719, "learning_rate": 0.00011523586648602044, "loss": 0.2423, "step": 3446 }, { "epoch": 1.2728951255539143, "grad_norm": 0.33024588227272034, "learning_rate": 0.00011521123291045697, "loss": 0.2401, "step": 3447 }, { "epoch": 1.2732644017725259, "grad_norm": 0.2906222939491272, "learning_rate": 0.00011518659933489346, "loss": 0.2713, "step": 3448 }, { "epoch": 1.2736336779911372, "grad_norm": 0.4145973324775696, "learning_rate": 0.00011516196575932999, "loss": 0.2249, "step": 3449 }, { "epoch": 1.2740029542097489, "grad_norm": 0.24750135838985443, "learning_rate": 0.00011513733218376648, "loss": 0.2217, "step": 3450 }, { "epoch": 1.2740029542097489, "eval_loss": 0.27090415358543396, "eval_runtime": 5.8672, "eval_samples_per_second": 8.522, "eval_steps_per_second": 1.193, "step": 3450 }, { "epoch": 1.2743722304283605, "grad_norm": 0.31642940640449524, "learning_rate": 0.00011511269860820299, "loss": 0.2239, "step": 3451 }, { "epoch": 1.274741506646972, "grad_norm": 0.30429863929748535, "learning_rate": 0.00011508806503263949, "loss": 0.2396, "step": 3452 }, { "epoch": 1.2751107828655834, "grad_norm": 0.3115158677101135, "learning_rate": 0.000115063431457076, "loss": 0.3013, "step": 3453 }, { "epoch": 1.275480059084195, "grad_norm": 0.32751041650772095, "learning_rate": 0.0001150387978815125, "loss": 0.2545, "step": 3454 }, { "epoch": 1.2758493353028064, "grad_norm": 0.35201385617256165, "learning_rate": 0.00011501416430594902, "loss": 0.2371, "step": 3455 }, { "epoch": 1.276218611521418, "grad_norm": 0.3706410229206085, "learning_rate": 0.00011498953073038552, "loss": 0.2305, "step": 3456 }, { "epoch": 1.2765878877400296, "grad_norm": 0.28357771039009094, "learning_rate": 0.00011496489715482204, "loss": 0.2265, "step": 3457 }, { "epoch": 1.276957163958641, "grad_norm": 0.2744147479534149, "learning_rate": 0.00011494026357925854, "loss": 0.2528, "step": 3458 }, { "epoch": 1.2773264401772526, "grad_norm": 0.3463950455188751, "learning_rate": 0.00011491563000369505, "loss": 0.3019, "step": 3459 }, { "epoch": 1.277695716395864, "grad_norm": 0.30260658264160156, "learning_rate": 0.00011489099642813155, "loss": 0.2202, "step": 3460 }, { "epoch": 1.2780649926144756, "grad_norm": 0.27453920245170593, "learning_rate": 0.00011486636285256807, "loss": 0.2178, "step": 3461 }, { "epoch": 1.2784342688330872, "grad_norm": 0.27138856053352356, "learning_rate": 0.00011484172927700456, "loss": 0.2267, "step": 3462 }, { "epoch": 1.2788035450516988, "grad_norm": 0.30846890807151794, "learning_rate": 0.00011481709570144108, "loss": 0.2572, "step": 3463 }, { "epoch": 1.2791728212703102, "grad_norm": 0.30298444628715515, "learning_rate": 0.00011479246212587757, "loss": 0.26, "step": 3464 }, { "epoch": 1.2795420974889218, "grad_norm": 0.29792797565460205, "learning_rate": 0.0001147678285503141, "loss": 0.2344, "step": 3465 }, { "epoch": 1.2799113737075332, "grad_norm": 0.3351114094257355, "learning_rate": 0.00011474319497475059, "loss": 0.2508, "step": 3466 }, { "epoch": 1.2802806499261448, "grad_norm": 0.4213079512119293, "learning_rate": 0.0001147185613991871, "loss": 0.2774, "step": 3467 }, { "epoch": 1.2806499261447564, "grad_norm": 0.4092477858066559, "learning_rate": 0.0001146939278236236, "loss": 0.3138, "step": 3468 }, { "epoch": 1.2810192023633677, "grad_norm": 0.28857889771461487, "learning_rate": 0.00011466929424806012, "loss": 0.2354, "step": 3469 }, { "epoch": 1.2813884785819794, "grad_norm": 0.29781675338745117, "learning_rate": 0.00011464466067249662, "loss": 0.2168, "step": 3470 }, { "epoch": 1.2817577548005907, "grad_norm": 0.2629866302013397, "learning_rate": 0.00011462002709693313, "loss": 0.201, "step": 3471 }, { "epoch": 1.2821270310192023, "grad_norm": 0.2955879271030426, "learning_rate": 0.00011459539352136963, "loss": 0.2853, "step": 3472 }, { "epoch": 1.282496307237814, "grad_norm": 0.3258877694606781, "learning_rate": 0.00011457075994580615, "loss": 0.2475, "step": 3473 }, { "epoch": 1.2828655834564255, "grad_norm": 0.28191569447517395, "learning_rate": 0.00011454612637024265, "loss": 0.2815, "step": 3474 }, { "epoch": 1.283234859675037, "grad_norm": 0.27228131890296936, "learning_rate": 0.00011452149279467916, "loss": 0.2305, "step": 3475 }, { "epoch": 1.2836041358936485, "grad_norm": 0.3138941526412964, "learning_rate": 0.00011449685921911567, "loss": 0.2513, "step": 3476 }, { "epoch": 1.28397341211226, "grad_norm": 0.2670232653617859, "learning_rate": 0.00011447222564355218, "loss": 0.2123, "step": 3477 }, { "epoch": 1.2843426883308715, "grad_norm": 0.2697356343269348, "learning_rate": 0.00011444759206798867, "loss": 0.212, "step": 3478 }, { "epoch": 1.284711964549483, "grad_norm": 0.26701635122299194, "learning_rate": 0.0001144229584924252, "loss": 0.2375, "step": 3479 }, { "epoch": 1.2850812407680945, "grad_norm": 0.25487884879112244, "learning_rate": 0.00011439832491686168, "loss": 0.2339, "step": 3480 }, { "epoch": 1.285450516986706, "grad_norm": 0.22980691492557526, "learning_rate": 0.00011437369134129821, "loss": 0.2139, "step": 3481 }, { "epoch": 1.2858197932053175, "grad_norm": 0.26740097999572754, "learning_rate": 0.0001143490577657347, "loss": 0.2707, "step": 3482 }, { "epoch": 1.286189069423929, "grad_norm": 0.2384112924337387, "learning_rate": 0.00011432442419017121, "loss": 0.222, "step": 3483 }, { "epoch": 1.2865583456425407, "grad_norm": 0.272722452878952, "learning_rate": 0.00011429979061460772, "loss": 0.2641, "step": 3484 }, { "epoch": 1.286927621861152, "grad_norm": 0.24369671940803528, "learning_rate": 0.00011427515703904422, "loss": 0.2633, "step": 3485 }, { "epoch": 1.2872968980797637, "grad_norm": 0.26729193329811096, "learning_rate": 0.00011425052346348073, "loss": 0.2526, "step": 3486 }, { "epoch": 1.2876661742983753, "grad_norm": 0.2396964579820633, "learning_rate": 0.00011422588988791723, "loss": 0.2016, "step": 3487 }, { "epoch": 1.2880354505169866, "grad_norm": 0.26808351278305054, "learning_rate": 0.00011420125631235375, "loss": 0.2454, "step": 3488 }, { "epoch": 1.2884047267355982, "grad_norm": 0.3159623146057129, "learning_rate": 0.00011417662273679023, "loss": 0.256, "step": 3489 }, { "epoch": 1.2887740029542099, "grad_norm": 0.24514681100845337, "learning_rate": 0.00011415198916122676, "loss": 0.2292, "step": 3490 }, { "epoch": 1.2891432791728212, "grad_norm": 0.301023006439209, "learning_rate": 0.00011412735558566325, "loss": 0.2833, "step": 3491 }, { "epoch": 1.2895125553914328, "grad_norm": 0.24914702773094177, "learning_rate": 0.00011410272201009978, "loss": 0.2005, "step": 3492 }, { "epoch": 1.2898818316100442, "grad_norm": 0.2552284896373749, "learning_rate": 0.00011407808843453627, "loss": 0.2318, "step": 3493 }, { "epoch": 1.2902511078286558, "grad_norm": 0.3317311108112335, "learning_rate": 0.00011405345485897278, "loss": 0.2868, "step": 3494 }, { "epoch": 1.2906203840472674, "grad_norm": 0.2974274158477783, "learning_rate": 0.00011402882128340928, "loss": 0.2212, "step": 3495 }, { "epoch": 1.2909896602658788, "grad_norm": 0.23321019113063812, "learning_rate": 0.0001140041877078458, "loss": 0.2282, "step": 3496 }, { "epoch": 1.2913589364844904, "grad_norm": 0.26129546761512756, "learning_rate": 0.0001139795541322823, "loss": 0.21, "step": 3497 }, { "epoch": 1.2917282127031018, "grad_norm": 0.32607483863830566, "learning_rate": 0.00011395492055671881, "loss": 0.2675, "step": 3498 }, { "epoch": 1.2920974889217134, "grad_norm": 0.24857379496097565, "learning_rate": 0.00011393028698115531, "loss": 0.1942, "step": 3499 }, { "epoch": 1.292466765140325, "grad_norm": 0.23598088324069977, "learning_rate": 0.00011390565340559183, "loss": 0.2058, "step": 3500 }, { "epoch": 1.292466765140325, "eval_loss": 0.27111950516700745, "eval_runtime": 5.8744, "eval_samples_per_second": 8.511, "eval_steps_per_second": 1.192, "step": 3500 }, { "epoch": 1.2928360413589366, "grad_norm": 0.26858481764793396, "learning_rate": 0.00011388101983002833, "loss": 0.2404, "step": 3501 }, { "epoch": 1.293205317577548, "grad_norm": 0.29856938123703003, "learning_rate": 0.00011385638625446484, "loss": 0.2611, "step": 3502 }, { "epoch": 1.2935745937961596, "grad_norm": 0.23774632811546326, "learning_rate": 0.00011383175267890134, "loss": 0.2067, "step": 3503 }, { "epoch": 1.293943870014771, "grad_norm": 0.31560781598091125, "learning_rate": 0.00011380711910333786, "loss": 0.2584, "step": 3504 }, { "epoch": 1.2943131462333826, "grad_norm": 0.29972249269485474, "learning_rate": 0.00011378248552777435, "loss": 0.2509, "step": 3505 }, { "epoch": 1.2946824224519942, "grad_norm": 0.5040701031684875, "learning_rate": 0.00011375785195221087, "loss": 0.2352, "step": 3506 }, { "epoch": 1.2950516986706055, "grad_norm": 0.2971581518650055, "learning_rate": 0.00011373321837664736, "loss": 0.2743, "step": 3507 }, { "epoch": 1.2954209748892171, "grad_norm": 0.3349955081939697, "learning_rate": 0.00011370858480108389, "loss": 0.2851, "step": 3508 }, { "epoch": 1.2957902511078285, "grad_norm": 0.2929718494415283, "learning_rate": 0.00011368395122552038, "loss": 0.2723, "step": 3509 }, { "epoch": 1.2961595273264401, "grad_norm": 0.31535041332244873, "learning_rate": 0.00011365931764995689, "loss": 0.2429, "step": 3510 }, { "epoch": 1.2965288035450517, "grad_norm": 0.328657865524292, "learning_rate": 0.0001136346840743934, "loss": 0.2653, "step": 3511 }, { "epoch": 1.2968980797636633, "grad_norm": 0.29910996556282043, "learning_rate": 0.00011361005049882991, "loss": 0.2304, "step": 3512 }, { "epoch": 1.2972673559822747, "grad_norm": 0.28474050760269165, "learning_rate": 0.00011358541692326641, "loss": 0.2538, "step": 3513 }, { "epoch": 1.2976366322008863, "grad_norm": 0.3016922175884247, "learning_rate": 0.00011356078334770292, "loss": 0.2498, "step": 3514 }, { "epoch": 1.2980059084194977, "grad_norm": 0.2724801301956177, "learning_rate": 0.00011353614977213943, "loss": 0.2384, "step": 3515 }, { "epoch": 1.2983751846381093, "grad_norm": 0.23370754718780518, "learning_rate": 0.00011351151619657594, "loss": 0.2254, "step": 3516 }, { "epoch": 1.298744460856721, "grad_norm": 0.2495436668395996, "learning_rate": 0.00011348688262101244, "loss": 0.2422, "step": 3517 }, { "epoch": 1.2991137370753323, "grad_norm": 0.2952888309955597, "learning_rate": 0.00011346224904544896, "loss": 0.2396, "step": 3518 }, { "epoch": 1.299483013293944, "grad_norm": 0.2261483073234558, "learning_rate": 0.00011343761546988546, "loss": 0.2039, "step": 3519 }, { "epoch": 1.2998522895125553, "grad_norm": 0.2913593351840973, "learning_rate": 0.00011341298189432197, "loss": 0.2595, "step": 3520 }, { "epoch": 1.3002215657311669, "grad_norm": 0.32711273431777954, "learning_rate": 0.00011338834831875846, "loss": 0.2758, "step": 3521 }, { "epoch": 1.3005908419497785, "grad_norm": 0.40282630920410156, "learning_rate": 0.00011336371474319499, "loss": 0.3096, "step": 3522 }, { "epoch": 1.30096011816839, "grad_norm": 0.26645639538764954, "learning_rate": 0.00011333908116763147, "loss": 0.2112, "step": 3523 }, { "epoch": 1.3013293943870015, "grad_norm": 0.34445178508758545, "learning_rate": 0.000113314447592068, "loss": 0.2779, "step": 3524 }, { "epoch": 1.301698670605613, "grad_norm": 0.2424442619085312, "learning_rate": 0.00011328981401650449, "loss": 0.2142, "step": 3525 }, { "epoch": 1.3020679468242244, "grad_norm": 0.27171117067337036, "learning_rate": 0.000113265180440941, "loss": 0.2487, "step": 3526 }, { "epoch": 1.302437223042836, "grad_norm": 0.2760293185710907, "learning_rate": 0.0001132405468653775, "loss": 0.2428, "step": 3527 }, { "epoch": 1.3028064992614476, "grad_norm": 0.2675725221633911, "learning_rate": 0.00011321591328981402, "loss": 0.2144, "step": 3528 }, { "epoch": 1.303175775480059, "grad_norm": 0.25027212500572205, "learning_rate": 0.00011319127971425052, "loss": 0.2479, "step": 3529 }, { "epoch": 1.3035450516986706, "grad_norm": 0.3069452941417694, "learning_rate": 0.00011316664613868704, "loss": 0.2759, "step": 3530 }, { "epoch": 1.303914327917282, "grad_norm": 0.2560293972492218, "learning_rate": 0.00011314201256312354, "loss": 0.2396, "step": 3531 }, { "epoch": 1.3042836041358936, "grad_norm": 0.24829941987991333, "learning_rate": 0.00011311737898756005, "loss": 0.2301, "step": 3532 }, { "epoch": 1.3046528803545052, "grad_norm": 0.21412433683872223, "learning_rate": 0.00011309274541199655, "loss": 0.192, "step": 3533 }, { "epoch": 1.3050221565731168, "grad_norm": 0.24372343719005585, "learning_rate": 0.00011306811183643307, "loss": 0.195, "step": 3534 }, { "epoch": 1.3053914327917282, "grad_norm": 0.32921963930130005, "learning_rate": 0.00011304347826086956, "loss": 0.2878, "step": 3535 }, { "epoch": 1.3057607090103398, "grad_norm": 0.3481973111629486, "learning_rate": 0.00011301884468530608, "loss": 0.2343, "step": 3536 }, { "epoch": 1.3061299852289512, "grad_norm": 0.31481435894966125, "learning_rate": 0.00011299421110974257, "loss": 0.3065, "step": 3537 }, { "epoch": 1.3064992614475628, "grad_norm": 0.24863918125629425, "learning_rate": 0.0001129695775341791, "loss": 0.2344, "step": 3538 }, { "epoch": 1.3068685376661744, "grad_norm": 0.30417582392692566, "learning_rate": 0.00011294494395861559, "loss": 0.2305, "step": 3539 }, { "epoch": 1.3072378138847858, "grad_norm": 0.2873446047306061, "learning_rate": 0.0001129203103830521, "loss": 0.2626, "step": 3540 }, { "epoch": 1.3076070901033974, "grad_norm": 0.2590044140815735, "learning_rate": 0.0001128956768074886, "loss": 0.2163, "step": 3541 }, { "epoch": 1.3079763663220088, "grad_norm": 0.23338545858860016, "learning_rate": 0.00011287104323192512, "loss": 0.2346, "step": 3542 }, { "epoch": 1.3083456425406204, "grad_norm": 0.2515639662742615, "learning_rate": 0.00011284640965636162, "loss": 0.2606, "step": 3543 }, { "epoch": 1.308714918759232, "grad_norm": 0.21571849286556244, "learning_rate": 0.00011282177608079813, "loss": 0.1929, "step": 3544 }, { "epoch": 1.3090841949778436, "grad_norm": 0.3184591829776764, "learning_rate": 0.00011279714250523463, "loss": 0.2396, "step": 3545 }, { "epoch": 1.309453471196455, "grad_norm": 0.22089743614196777, "learning_rate": 0.00011277250892967115, "loss": 0.1959, "step": 3546 }, { "epoch": 1.3098227474150665, "grad_norm": 0.23892144858837128, "learning_rate": 0.00011274787535410765, "loss": 0.2171, "step": 3547 }, { "epoch": 1.310192023633678, "grad_norm": 0.2656129002571106, "learning_rate": 0.00011272324177854416, "loss": 0.2251, "step": 3548 }, { "epoch": 1.3105612998522895, "grad_norm": 0.2840602993965149, "learning_rate": 0.00011269860820298067, "loss": 0.239, "step": 3549 }, { "epoch": 1.3109305760709011, "grad_norm": 0.25248202681541443, "learning_rate": 0.00011267397462741718, "loss": 0.2379, "step": 3550 }, { "epoch": 1.3109305760709011, "eval_loss": 0.27254676818847656, "eval_runtime": 5.8791, "eval_samples_per_second": 8.505, "eval_steps_per_second": 1.191, "step": 3550 }, { "epoch": 1.3112998522895125, "grad_norm": 0.23947429656982422, "learning_rate": 0.00011264934105185367, "loss": 0.2408, "step": 3551 }, { "epoch": 1.3116691285081241, "grad_norm": 0.2857097387313843, "learning_rate": 0.0001126247074762902, "loss": 0.2228, "step": 3552 }, { "epoch": 1.3120384047267355, "grad_norm": 0.25149619579315186, "learning_rate": 0.00011260007390072668, "loss": 0.2475, "step": 3553 }, { "epoch": 1.312407680945347, "grad_norm": 0.276044636964798, "learning_rate": 0.00011257544032516321, "loss": 0.2294, "step": 3554 }, { "epoch": 1.3127769571639587, "grad_norm": 0.2988446354866028, "learning_rate": 0.0001125508067495997, "loss": 0.2923, "step": 3555 }, { "epoch": 1.31314623338257, "grad_norm": 0.2972252666950226, "learning_rate": 0.00011252617317403621, "loss": 0.2797, "step": 3556 }, { "epoch": 1.3135155096011817, "grad_norm": 0.2943788170814514, "learning_rate": 0.00011250153959847271, "loss": 0.2874, "step": 3557 }, { "epoch": 1.3138847858197933, "grad_norm": 0.29646357893943787, "learning_rate": 0.00011247690602290923, "loss": 0.2625, "step": 3558 }, { "epoch": 1.3142540620384047, "grad_norm": 0.3290356397628784, "learning_rate": 0.00011245227244734573, "loss": 0.2276, "step": 3559 }, { "epoch": 1.3146233382570163, "grad_norm": 0.32188060879707336, "learning_rate": 0.00011242763887178225, "loss": 0.3005, "step": 3560 }, { "epoch": 1.3149926144756279, "grad_norm": 0.28455227613449097, "learning_rate": 0.00011240300529621875, "loss": 0.2535, "step": 3561 }, { "epoch": 1.3153618906942393, "grad_norm": 0.2558704614639282, "learning_rate": 0.00011237837172065526, "loss": 0.2569, "step": 3562 }, { "epoch": 1.3157311669128509, "grad_norm": 0.3473958671092987, "learning_rate": 0.00011235373814509176, "loss": 0.2938, "step": 3563 }, { "epoch": 1.3161004431314622, "grad_norm": 0.28356698155403137, "learning_rate": 0.00011232910456952828, "loss": 0.2519, "step": 3564 }, { "epoch": 1.3164697193500738, "grad_norm": 0.2912018895149231, "learning_rate": 0.00011230447099396478, "loss": 0.2517, "step": 3565 }, { "epoch": 1.3168389955686854, "grad_norm": 0.28475067019462585, "learning_rate": 0.00011227983741840129, "loss": 0.2061, "step": 3566 }, { "epoch": 1.3172082717872968, "grad_norm": 0.27545440196990967, "learning_rate": 0.00011225520384283778, "loss": 0.244, "step": 3567 }, { "epoch": 1.3175775480059084, "grad_norm": 0.250146746635437, "learning_rate": 0.00011223057026727431, "loss": 0.2372, "step": 3568 }, { "epoch": 1.3179468242245198, "grad_norm": 0.30133166909217834, "learning_rate": 0.0001122059366917108, "loss": 0.2763, "step": 3569 }, { "epoch": 1.3183161004431314, "grad_norm": 0.29099568724632263, "learning_rate": 0.00011218130311614732, "loss": 0.2607, "step": 3570 }, { "epoch": 1.318685376661743, "grad_norm": 0.25981763005256653, "learning_rate": 0.00011215666954058381, "loss": 0.2147, "step": 3571 }, { "epoch": 1.3190546528803546, "grad_norm": 0.2595869302749634, "learning_rate": 0.00011213203596502033, "loss": 0.2371, "step": 3572 }, { "epoch": 1.319423929098966, "grad_norm": 0.3137706518173218, "learning_rate": 0.00011210740238945683, "loss": 0.2162, "step": 3573 }, { "epoch": 1.3197932053175776, "grad_norm": 0.28788837790489197, "learning_rate": 0.00011208276881389334, "loss": 0.2508, "step": 3574 }, { "epoch": 1.320162481536189, "grad_norm": 0.29614338278770447, "learning_rate": 0.00011205813523832984, "loss": 0.2548, "step": 3575 }, { "epoch": 1.3205317577548006, "grad_norm": 0.3168517053127289, "learning_rate": 0.00011203350166276636, "loss": 0.2657, "step": 3576 }, { "epoch": 1.3209010339734122, "grad_norm": 0.250918447971344, "learning_rate": 0.00011200886808720286, "loss": 0.2235, "step": 3577 }, { "epoch": 1.3212703101920236, "grad_norm": 0.23113346099853516, "learning_rate": 0.00011198423451163937, "loss": 0.2034, "step": 3578 }, { "epoch": 1.3216395864106352, "grad_norm": 0.25568103790283203, "learning_rate": 0.00011195960093607587, "loss": 0.2168, "step": 3579 }, { "epoch": 1.3220088626292466, "grad_norm": 0.3186097741127014, "learning_rate": 0.00011193496736051239, "loss": 0.2325, "step": 3580 }, { "epoch": 1.3223781388478582, "grad_norm": 0.2718065679073334, "learning_rate": 0.00011191033378494889, "loss": 0.22, "step": 3581 }, { "epoch": 1.3227474150664698, "grad_norm": 0.23766477406024933, "learning_rate": 0.0001118857002093854, "loss": 0.2225, "step": 3582 }, { "epoch": 1.3231166912850814, "grad_norm": 0.256465345621109, "learning_rate": 0.00011186106663382189, "loss": 0.2044, "step": 3583 }, { "epoch": 1.3234859675036927, "grad_norm": 0.2958122491836548, "learning_rate": 0.00011183643305825842, "loss": 0.2402, "step": 3584 }, { "epoch": 1.3238552437223043, "grad_norm": 0.3272998034954071, "learning_rate": 0.00011181179948269491, "loss": 0.2863, "step": 3585 }, { "epoch": 1.3242245199409157, "grad_norm": 0.31955695152282715, "learning_rate": 0.00011178716590713144, "loss": 0.268, "step": 3586 }, { "epoch": 1.3245937961595273, "grad_norm": 0.26362356543540955, "learning_rate": 0.00011176253233156792, "loss": 0.201, "step": 3587 }, { "epoch": 1.324963072378139, "grad_norm": 0.23116347193717957, "learning_rate": 0.00011173789875600444, "loss": 0.2013, "step": 3588 }, { "epoch": 1.3253323485967503, "grad_norm": 0.3511759638786316, "learning_rate": 0.00011171326518044094, "loss": 0.3629, "step": 3589 }, { "epoch": 1.325701624815362, "grad_norm": 0.28257256746292114, "learning_rate": 0.00011168863160487745, "loss": 0.2534, "step": 3590 }, { "epoch": 1.3260709010339733, "grad_norm": 0.29674193263053894, "learning_rate": 0.00011166399802931396, "loss": 0.231, "step": 3591 }, { "epoch": 1.326440177252585, "grad_norm": 0.33192455768585205, "learning_rate": 0.00011163936445375047, "loss": 0.2413, "step": 3592 }, { "epoch": 1.3268094534711965, "grad_norm": 0.24622425436973572, "learning_rate": 0.00011161473087818697, "loss": 0.2075, "step": 3593 }, { "epoch": 1.327178729689808, "grad_norm": 0.2626505196094513, "learning_rate": 0.00011159009730262349, "loss": 0.2626, "step": 3594 }, { "epoch": 1.3275480059084195, "grad_norm": 0.25430116057395935, "learning_rate": 0.00011156546372705999, "loss": 0.1966, "step": 3595 }, { "epoch": 1.327917282127031, "grad_norm": 0.25840574502944946, "learning_rate": 0.0001115408301514965, "loss": 0.245, "step": 3596 }, { "epoch": 1.3282865583456425, "grad_norm": 0.2602691650390625, "learning_rate": 0.000111516196575933, "loss": 0.2162, "step": 3597 }, { "epoch": 1.328655834564254, "grad_norm": 0.3222030699253082, "learning_rate": 0.00011149156300036952, "loss": 0.2458, "step": 3598 }, { "epoch": 1.3290251107828657, "grad_norm": 0.3286999762058258, "learning_rate": 0.000111466929424806, "loss": 0.253, "step": 3599 }, { "epoch": 1.329394387001477, "grad_norm": 0.2511175274848938, "learning_rate": 0.00011144229584924253, "loss": 0.2356, "step": 3600 }, { "epoch": 1.329394387001477, "eval_loss": 0.26840391755104065, "eval_runtime": 5.8604, "eval_samples_per_second": 8.532, "eval_steps_per_second": 1.194, "step": 3600 }, { "epoch": 1.3297636632200887, "grad_norm": 0.2840924561023712, "learning_rate": 0.00011141766227367902, "loss": 0.2471, "step": 3601 }, { "epoch": 1.3301329394387, "grad_norm": 0.2691866159439087, "learning_rate": 0.00011139302869811555, "loss": 0.218, "step": 3602 }, { "epoch": 1.3305022156573116, "grad_norm": 0.25245973467826843, "learning_rate": 0.00011136839512255204, "loss": 0.2343, "step": 3603 }, { "epoch": 1.3308714918759232, "grad_norm": 0.3062613904476166, "learning_rate": 0.00011134376154698855, "loss": 0.2687, "step": 3604 }, { "epoch": 1.3312407680945348, "grad_norm": 0.2301330268383026, "learning_rate": 0.00011131912797142505, "loss": 0.2069, "step": 3605 }, { "epoch": 1.3316100443131462, "grad_norm": 0.22849799692630768, "learning_rate": 0.00011129449439586157, "loss": 0.2116, "step": 3606 }, { "epoch": 1.3319793205317578, "grad_norm": 0.32399725914001465, "learning_rate": 0.00011126986082029807, "loss": 0.3216, "step": 3607 }, { "epoch": 1.3323485967503692, "grad_norm": 0.3028990924358368, "learning_rate": 0.00011124522724473458, "loss": 0.2015, "step": 3608 }, { "epoch": 1.3327178729689808, "grad_norm": 0.25538018345832825, "learning_rate": 0.00011122059366917108, "loss": 0.234, "step": 3609 }, { "epoch": 1.3330871491875924, "grad_norm": 0.27684277296066284, "learning_rate": 0.0001111959600936076, "loss": 0.2358, "step": 3610 }, { "epoch": 1.3334564254062038, "grad_norm": 0.297973096370697, "learning_rate": 0.0001111713265180441, "loss": 0.2258, "step": 3611 }, { "epoch": 1.3338257016248154, "grad_norm": 0.25658944249153137, "learning_rate": 0.00011114669294248061, "loss": 0.2267, "step": 3612 }, { "epoch": 1.3341949778434268, "grad_norm": 0.27613380551338196, "learning_rate": 0.00011112205936691711, "loss": 0.2106, "step": 3613 }, { "epoch": 1.3345642540620384, "grad_norm": 0.3037564754486084, "learning_rate": 0.00011109742579135363, "loss": 0.2435, "step": 3614 }, { "epoch": 1.33493353028065, "grad_norm": 0.2635299265384674, "learning_rate": 0.00011107279221579012, "loss": 0.2252, "step": 3615 }, { "epoch": 1.3353028064992616, "grad_norm": 0.24047908186912537, "learning_rate": 0.00011104815864022664, "loss": 0.182, "step": 3616 }, { "epoch": 1.335672082717873, "grad_norm": 0.22222352027893066, "learning_rate": 0.00011102352506466313, "loss": 0.2407, "step": 3617 }, { "epoch": 1.3360413589364846, "grad_norm": 0.33356255292892456, "learning_rate": 0.00011099889148909966, "loss": 0.28, "step": 3618 }, { "epoch": 1.336410635155096, "grad_norm": 0.3413843810558319, "learning_rate": 0.00011097425791353615, "loss": 0.2963, "step": 3619 }, { "epoch": 1.3367799113737076, "grad_norm": 0.3379725217819214, "learning_rate": 0.00011094962433797266, "loss": 0.2675, "step": 3620 }, { "epoch": 1.3371491875923192, "grad_norm": 0.2826775908470154, "learning_rate": 0.00011092499076240916, "loss": 0.2481, "step": 3621 }, { "epoch": 1.3375184638109305, "grad_norm": 0.28579071164131165, "learning_rate": 0.00011090035718684568, "loss": 0.2113, "step": 3622 }, { "epoch": 1.3378877400295421, "grad_norm": 0.29486680030822754, "learning_rate": 0.00011087572361128218, "loss": 0.2591, "step": 3623 }, { "epoch": 1.3382570162481535, "grad_norm": 0.32082071900367737, "learning_rate": 0.0001108510900357187, "loss": 0.2274, "step": 3624 }, { "epoch": 1.3386262924667651, "grad_norm": 0.3154670000076294, "learning_rate": 0.0001108264564601552, "loss": 0.2413, "step": 3625 }, { "epoch": 1.3389955686853767, "grad_norm": 0.36913928389549255, "learning_rate": 0.00011080182288459171, "loss": 0.2319, "step": 3626 }, { "epoch": 1.339364844903988, "grad_norm": 0.3428744375705719, "learning_rate": 0.00011077718930902821, "loss": 0.2275, "step": 3627 }, { "epoch": 1.3397341211225997, "grad_norm": 0.3373352289199829, "learning_rate": 0.00011075255573346473, "loss": 0.2676, "step": 3628 }, { "epoch": 1.340103397341211, "grad_norm": 0.27837231755256653, "learning_rate": 0.00011072792215790123, "loss": 0.2508, "step": 3629 }, { "epoch": 1.3404726735598227, "grad_norm": 0.25249144434928894, "learning_rate": 0.00011070328858233774, "loss": 0.2428, "step": 3630 }, { "epoch": 1.3408419497784343, "grad_norm": 0.32394635677337646, "learning_rate": 0.00011067865500677423, "loss": 0.2871, "step": 3631 }, { "epoch": 1.341211225997046, "grad_norm": 0.27134162187576294, "learning_rate": 0.00011065402143121076, "loss": 0.2512, "step": 3632 }, { "epoch": 1.3415805022156573, "grad_norm": 0.2610749304294586, "learning_rate": 0.00011062938785564724, "loss": 0.1917, "step": 3633 }, { "epoch": 1.3419497784342689, "grad_norm": 0.30713263154029846, "learning_rate": 0.00011060475428008377, "loss": 0.258, "step": 3634 }, { "epoch": 1.3423190546528803, "grad_norm": 0.24225565791130066, "learning_rate": 0.00011058012070452026, "loss": 0.214, "step": 3635 }, { "epoch": 1.3426883308714919, "grad_norm": 0.261345237493515, "learning_rate": 0.00011055548712895677, "loss": 0.2562, "step": 3636 }, { "epoch": 1.3430576070901035, "grad_norm": 0.26719626784324646, "learning_rate": 0.00011053085355339328, "loss": 0.2272, "step": 3637 }, { "epoch": 1.3434268833087148, "grad_norm": 0.25453051924705505, "learning_rate": 0.00011050621997782979, "loss": 0.2172, "step": 3638 }, { "epoch": 1.3437961595273265, "grad_norm": 0.2616005539894104, "learning_rate": 0.00011048158640226629, "loss": 0.2432, "step": 3639 }, { "epoch": 1.3441654357459378, "grad_norm": 0.2986209988594055, "learning_rate": 0.0001104569528267028, "loss": 0.2212, "step": 3640 }, { "epoch": 1.3445347119645494, "grad_norm": 0.20458440482616425, "learning_rate": 0.00011043231925113931, "loss": 0.1946, "step": 3641 }, { "epoch": 1.344903988183161, "grad_norm": 0.2503405511379242, "learning_rate": 0.00011040768567557582, "loss": 0.2067, "step": 3642 }, { "epoch": 1.3452732644017726, "grad_norm": 0.2201780378818512, "learning_rate": 0.00011038305210001232, "loss": 0.1827, "step": 3643 }, { "epoch": 1.345642540620384, "grad_norm": 0.24748550355434418, "learning_rate": 0.00011035841852444884, "loss": 0.2154, "step": 3644 }, { "epoch": 1.3460118168389956, "grad_norm": 0.30875056982040405, "learning_rate": 0.00011033378494888534, "loss": 0.2138, "step": 3645 }, { "epoch": 1.346381093057607, "grad_norm": 0.3330707848072052, "learning_rate": 0.00011030915137332185, "loss": 0.267, "step": 3646 }, { "epoch": 1.3467503692762186, "grad_norm": 0.2122422307729721, "learning_rate": 0.00011028451779775834, "loss": 0.1964, "step": 3647 }, { "epoch": 1.3471196454948302, "grad_norm": 0.33327516913414, "learning_rate": 0.00011025988422219487, "loss": 0.2413, "step": 3648 }, { "epoch": 1.3474889217134416, "grad_norm": 0.2685951292514801, "learning_rate": 0.00011023525064663136, "loss": 0.2271, "step": 3649 }, { "epoch": 1.3478581979320532, "grad_norm": 0.2989233434200287, "learning_rate": 0.00011021061707106789, "loss": 0.2366, "step": 3650 }, { "epoch": 1.3478581979320532, "eval_loss": 0.27142253518104553, "eval_runtime": 5.8581, "eval_samples_per_second": 8.535, "eval_steps_per_second": 1.195, "step": 3650 }, { "epoch": 1.3482274741506646, "grad_norm": 0.31434720754623413, "learning_rate": 0.00011018598349550437, "loss": 0.2186, "step": 3651 }, { "epoch": 1.3485967503692762, "grad_norm": 0.2665393054485321, "learning_rate": 0.00011016134991994089, "loss": 0.2416, "step": 3652 }, { "epoch": 1.3489660265878878, "grad_norm": 0.251358300447464, "learning_rate": 0.00011013671634437739, "loss": 0.2153, "step": 3653 }, { "epoch": 1.3493353028064994, "grad_norm": 0.2691832184791565, "learning_rate": 0.0001101120827688139, "loss": 0.2225, "step": 3654 }, { "epoch": 1.3497045790251108, "grad_norm": 0.25989165902137756, "learning_rate": 0.0001100874491932504, "loss": 0.2588, "step": 3655 }, { "epoch": 1.3500738552437224, "grad_norm": 0.31075742840766907, "learning_rate": 0.00011006281561768692, "loss": 0.267, "step": 3656 }, { "epoch": 1.3504431314623337, "grad_norm": 0.28866028785705566, "learning_rate": 0.00011003818204212342, "loss": 0.2414, "step": 3657 }, { "epoch": 1.3508124076809453, "grad_norm": 0.24472151696681976, "learning_rate": 0.00011001354846655993, "loss": 0.2098, "step": 3658 }, { "epoch": 1.351181683899557, "grad_norm": 0.28179073333740234, "learning_rate": 0.00010998891489099644, "loss": 0.2555, "step": 3659 }, { "epoch": 1.3515509601181683, "grad_norm": 0.33718785643577576, "learning_rate": 0.00010996428131543295, "loss": 0.3021, "step": 3660 }, { "epoch": 1.35192023633678, "grad_norm": 0.2833876311779022, "learning_rate": 0.00010993964773986945, "loss": 0.2451, "step": 3661 }, { "epoch": 1.3522895125553913, "grad_norm": 0.30380919575691223, "learning_rate": 0.00010991501416430597, "loss": 0.2152, "step": 3662 }, { "epoch": 1.352658788774003, "grad_norm": 0.2670554220676422, "learning_rate": 0.00010989038058874245, "loss": 0.2037, "step": 3663 }, { "epoch": 1.3530280649926145, "grad_norm": 0.2859307825565338, "learning_rate": 0.00010986574701317898, "loss": 0.2597, "step": 3664 }, { "epoch": 1.3533973412112261, "grad_norm": 0.27518805861473083, "learning_rate": 0.00010984111343761547, "loss": 0.2299, "step": 3665 }, { "epoch": 1.3537666174298375, "grad_norm": 0.2528857886791229, "learning_rate": 0.000109816479862052, "loss": 0.2177, "step": 3666 }, { "epoch": 1.354135893648449, "grad_norm": 0.31784820556640625, "learning_rate": 0.00010979184628648848, "loss": 0.2466, "step": 3667 }, { "epoch": 1.3545051698670605, "grad_norm": 0.2805560827255249, "learning_rate": 0.000109767212710925, "loss": 0.2369, "step": 3668 }, { "epoch": 1.354874446085672, "grad_norm": 0.30466175079345703, "learning_rate": 0.0001097425791353615, "loss": 0.2562, "step": 3669 }, { "epoch": 1.3552437223042837, "grad_norm": 0.2664591670036316, "learning_rate": 0.00010971794555979802, "loss": 0.2238, "step": 3670 }, { "epoch": 1.355612998522895, "grad_norm": 0.26514458656311035, "learning_rate": 0.00010969331198423452, "loss": 0.227, "step": 3671 }, { "epoch": 1.3559822747415067, "grad_norm": 0.31222182512283325, "learning_rate": 0.00010966867840867103, "loss": 0.2218, "step": 3672 }, { "epoch": 1.356351550960118, "grad_norm": 0.2709215581417084, "learning_rate": 0.00010964404483310753, "loss": 0.2398, "step": 3673 }, { "epoch": 1.3567208271787297, "grad_norm": 0.3307381272315979, "learning_rate": 0.00010961941125754405, "loss": 0.2217, "step": 3674 }, { "epoch": 1.3570901033973413, "grad_norm": 0.27952146530151367, "learning_rate": 0.00010959477768198055, "loss": 0.2679, "step": 3675 }, { "epoch": 1.3574593796159529, "grad_norm": 0.31884104013442993, "learning_rate": 0.00010957014410641706, "loss": 0.2754, "step": 3676 }, { "epoch": 1.3578286558345642, "grad_norm": 0.29868417978286743, "learning_rate": 0.00010954551053085356, "loss": 0.226, "step": 3677 }, { "epoch": 1.3581979320531758, "grad_norm": 0.24867242574691772, "learning_rate": 0.00010952087695529008, "loss": 0.2512, "step": 3678 }, { "epoch": 1.3585672082717872, "grad_norm": 0.2631646692752838, "learning_rate": 0.00010949624337972657, "loss": 0.2219, "step": 3679 }, { "epoch": 1.3589364844903988, "grad_norm": 0.2733980417251587, "learning_rate": 0.0001094716098041631, "loss": 0.2473, "step": 3680 }, { "epoch": 1.3593057607090104, "grad_norm": 0.2543134391307831, "learning_rate": 0.00010944697622859958, "loss": 0.2016, "step": 3681 }, { "epoch": 1.3596750369276218, "grad_norm": 0.2649345397949219, "learning_rate": 0.00010942234265303611, "loss": 0.1919, "step": 3682 }, { "epoch": 1.3600443131462334, "grad_norm": 0.3075491487979889, "learning_rate": 0.0001093977090774726, "loss": 0.2867, "step": 3683 }, { "epoch": 1.3604135893648448, "grad_norm": 0.2779380679130554, "learning_rate": 0.00010937307550190911, "loss": 0.2578, "step": 3684 }, { "epoch": 1.3607828655834564, "grad_norm": 0.29548507928848267, "learning_rate": 0.00010934844192634561, "loss": 0.2865, "step": 3685 }, { "epoch": 1.361152141802068, "grad_norm": 0.26643845438957214, "learning_rate": 0.00010932380835078213, "loss": 0.2224, "step": 3686 }, { "epoch": 1.3615214180206794, "grad_norm": 0.2854791581630707, "learning_rate": 0.00010929917477521863, "loss": 0.2617, "step": 3687 }, { "epoch": 1.361890694239291, "grad_norm": 0.20975206792354584, "learning_rate": 0.00010927454119965514, "loss": 0.1908, "step": 3688 }, { "epoch": 1.3622599704579026, "grad_norm": 0.26472845673561096, "learning_rate": 0.00010924990762409164, "loss": 0.2211, "step": 3689 }, { "epoch": 1.362629246676514, "grad_norm": 0.3006521463394165, "learning_rate": 0.00010922527404852816, "loss": 0.2687, "step": 3690 }, { "epoch": 1.3629985228951256, "grad_norm": 0.25495338439941406, "learning_rate": 0.00010920064047296466, "loss": 0.2094, "step": 3691 }, { "epoch": 1.3633677991137372, "grad_norm": 0.3352770507335663, "learning_rate": 0.00010917600689740117, "loss": 0.2512, "step": 3692 }, { "epoch": 1.3637370753323486, "grad_norm": 0.26214849948883057, "learning_rate": 0.00010915137332183766, "loss": 0.2353, "step": 3693 }, { "epoch": 1.3641063515509602, "grad_norm": 0.24108755588531494, "learning_rate": 0.00010912673974627419, "loss": 0.1749, "step": 3694 }, { "epoch": 1.3644756277695715, "grad_norm": 0.2967040538787842, "learning_rate": 0.00010910210617071068, "loss": 0.2393, "step": 3695 }, { "epoch": 1.3648449039881831, "grad_norm": 0.282380610704422, "learning_rate": 0.0001090774725951472, "loss": 0.2463, "step": 3696 }, { "epoch": 1.3652141802067947, "grad_norm": 0.2374754101037979, "learning_rate": 0.0001090528390195837, "loss": 0.2207, "step": 3697 }, { "epoch": 1.3655834564254061, "grad_norm": 0.2998552620410919, "learning_rate": 0.00010902820544402021, "loss": 0.2802, "step": 3698 }, { "epoch": 1.3659527326440177, "grad_norm": 0.2999780774116516, "learning_rate": 0.00010900357186845671, "loss": 0.2614, "step": 3699 }, { "epoch": 1.3663220088626291, "grad_norm": 0.31329798698425293, "learning_rate": 0.00010897893829289322, "loss": 0.2861, "step": 3700 }, { "epoch": 1.3663220088626291, "eval_loss": 0.27130240201950073, "eval_runtime": 5.86, "eval_samples_per_second": 8.532, "eval_steps_per_second": 1.195, "step": 3700 }, { "epoch": 1.3666912850812407, "grad_norm": 0.2652784287929535, "learning_rate": 0.00010895430471732973, "loss": 0.2767, "step": 3701 }, { "epoch": 1.3670605612998523, "grad_norm": 0.32752612233161926, "learning_rate": 0.00010892967114176624, "loss": 0.2668, "step": 3702 }, { "epoch": 1.367429837518464, "grad_norm": 0.32744166254997253, "learning_rate": 0.00010890503756620274, "loss": 0.2559, "step": 3703 }, { "epoch": 1.3677991137370753, "grad_norm": 0.2567864656448364, "learning_rate": 0.00010888040399063926, "loss": 0.2455, "step": 3704 }, { "epoch": 1.368168389955687, "grad_norm": 0.3054686188697815, "learning_rate": 0.00010885577041507576, "loss": 0.2589, "step": 3705 }, { "epoch": 1.3685376661742983, "grad_norm": 0.27559489011764526, "learning_rate": 0.00010883113683951227, "loss": 0.2532, "step": 3706 }, { "epoch": 1.3689069423929099, "grad_norm": 0.26456356048583984, "learning_rate": 0.00010880650326394877, "loss": 0.2241, "step": 3707 }, { "epoch": 1.3692762186115215, "grad_norm": 0.2512317895889282, "learning_rate": 0.00010878186968838529, "loss": 0.2225, "step": 3708 }, { "epoch": 1.3696454948301329, "grad_norm": 0.2673681974411011, "learning_rate": 0.00010875723611282177, "loss": 0.2313, "step": 3709 }, { "epoch": 1.3700147710487445, "grad_norm": 0.30007287859916687, "learning_rate": 0.0001087326025372583, "loss": 0.2213, "step": 3710 }, { "epoch": 1.3703840472673559, "grad_norm": 0.3304269313812256, "learning_rate": 0.00010870796896169479, "loss": 0.2427, "step": 3711 }, { "epoch": 1.3707533234859675, "grad_norm": 0.2871852517127991, "learning_rate": 0.00010868333538613132, "loss": 0.2533, "step": 3712 }, { "epoch": 1.371122599704579, "grad_norm": 0.23127683997154236, "learning_rate": 0.0001086587018105678, "loss": 0.2301, "step": 3713 }, { "epoch": 1.3714918759231907, "grad_norm": 0.31266435980796814, "learning_rate": 0.00010863406823500432, "loss": 0.2789, "step": 3714 }, { "epoch": 1.371861152141802, "grad_norm": 0.28091543912887573, "learning_rate": 0.00010860943465944082, "loss": 0.2336, "step": 3715 }, { "epoch": 1.3722304283604136, "grad_norm": 0.3167265057563782, "learning_rate": 0.00010858480108387734, "loss": 0.2452, "step": 3716 }, { "epoch": 1.372599704579025, "grad_norm": 0.3556848466396332, "learning_rate": 0.00010856016750831384, "loss": 0.3206, "step": 3717 }, { "epoch": 1.3729689807976366, "grad_norm": 0.490351140499115, "learning_rate": 0.00010853553393275034, "loss": 0.2759, "step": 3718 }, { "epoch": 1.3733382570162482, "grad_norm": 0.29599419236183167, "learning_rate": 0.00010851090035718685, "loss": 0.2617, "step": 3719 }, { "epoch": 1.3737075332348596, "grad_norm": 0.2695614993572235, "learning_rate": 0.00010848626678162334, "loss": 0.2742, "step": 3720 }, { "epoch": 1.3740768094534712, "grad_norm": 0.27813664078712463, "learning_rate": 0.00010846163320605987, "loss": 0.2292, "step": 3721 }, { "epoch": 1.3744460856720826, "grad_norm": 0.21287007629871368, "learning_rate": 0.00010843699963049636, "loss": 0.1806, "step": 3722 }, { "epoch": 1.3748153618906942, "grad_norm": 0.26846447587013245, "learning_rate": 0.00010841236605493288, "loss": 0.2734, "step": 3723 }, { "epoch": 1.3751846381093058, "grad_norm": 0.2294163703918457, "learning_rate": 0.00010838773247936937, "loss": 0.2152, "step": 3724 }, { "epoch": 1.3755539143279174, "grad_norm": 0.3586331009864807, "learning_rate": 0.00010836309890380589, "loss": 0.2278, "step": 3725 }, { "epoch": 1.3759231905465288, "grad_norm": 0.254730224609375, "learning_rate": 0.00010833846532824239, "loss": 0.2161, "step": 3726 }, { "epoch": 1.3762924667651404, "grad_norm": 0.23388056457042694, "learning_rate": 0.0001083138317526789, "loss": 0.2165, "step": 3727 }, { "epoch": 1.3766617429837518, "grad_norm": 0.2615768015384674, "learning_rate": 0.0001082891981771154, "loss": 0.2066, "step": 3728 }, { "epoch": 1.3770310192023634, "grad_norm": 0.2843790650367737, "learning_rate": 0.00010826456460155192, "loss": 0.2206, "step": 3729 }, { "epoch": 1.377400295420975, "grad_norm": 0.31225937604904175, "learning_rate": 0.00010823993102598842, "loss": 0.2974, "step": 3730 }, { "epoch": 1.3777695716395864, "grad_norm": 0.25193703174591064, "learning_rate": 0.00010821529745042493, "loss": 0.2215, "step": 3731 }, { "epoch": 1.378138847858198, "grad_norm": 0.2730855941772461, "learning_rate": 0.00010819066387486143, "loss": 0.2546, "step": 3732 }, { "epoch": 1.3785081240768093, "grad_norm": 0.21629397571086884, "learning_rate": 0.00010816603029929795, "loss": 0.2282, "step": 3733 }, { "epoch": 1.378877400295421, "grad_norm": 0.30740419030189514, "learning_rate": 0.00010814139672373445, "loss": 0.26, "step": 3734 }, { "epoch": 1.3792466765140325, "grad_norm": 0.2641306221485138, "learning_rate": 0.00010811676314817097, "loss": 0.2159, "step": 3735 }, { "epoch": 1.3796159527326441, "grad_norm": 0.32722675800323486, "learning_rate": 0.00010809212957260745, "loss": 0.2639, "step": 3736 }, { "epoch": 1.3799852289512555, "grad_norm": 0.29486608505249023, "learning_rate": 0.00010806749599704398, "loss": 0.2597, "step": 3737 }, { "epoch": 1.3803545051698671, "grad_norm": 0.30441945791244507, "learning_rate": 0.00010804286242148047, "loss": 0.2393, "step": 3738 }, { "epoch": 1.3807237813884785, "grad_norm": 0.26642045378685, "learning_rate": 0.000108018228845917, "loss": 0.2471, "step": 3739 }, { "epoch": 1.3810930576070901, "grad_norm": 0.28097429871559143, "learning_rate": 0.00010799359527035348, "loss": 0.2535, "step": 3740 }, { "epoch": 1.3814623338257017, "grad_norm": 0.23791688680648804, "learning_rate": 0.00010796896169479, "loss": 0.197, "step": 3741 }, { "epoch": 1.381831610044313, "grad_norm": 0.24202603101730347, "learning_rate": 0.0001079443281192265, "loss": 0.1952, "step": 3742 }, { "epoch": 1.3822008862629247, "grad_norm": 0.24361658096313477, "learning_rate": 0.00010791969454366301, "loss": 0.2197, "step": 3743 }, { "epoch": 1.382570162481536, "grad_norm": 0.3844248652458191, "learning_rate": 0.00010789506096809952, "loss": 0.2484, "step": 3744 }, { "epoch": 1.3829394387001477, "grad_norm": 0.26775461435317993, "learning_rate": 0.00010787042739253603, "loss": 0.2645, "step": 3745 }, { "epoch": 1.3833087149187593, "grad_norm": 0.31835201382637024, "learning_rate": 0.00010784579381697253, "loss": 0.2766, "step": 3746 }, { "epoch": 1.3836779911373709, "grad_norm": 0.2494579553604126, "learning_rate": 0.00010782116024140905, "loss": 0.2101, "step": 3747 }, { "epoch": 1.3840472673559823, "grad_norm": 0.24214380979537964, "learning_rate": 0.00010779652666584555, "loss": 0.1978, "step": 3748 }, { "epoch": 1.3844165435745939, "grad_norm": 0.2529049813747406, "learning_rate": 0.00010777189309028206, "loss": 0.2452, "step": 3749 }, { "epoch": 1.3847858197932053, "grad_norm": 0.30903905630111694, "learning_rate": 0.00010774725951471856, "loss": 0.2745, "step": 3750 }, { "epoch": 1.3847858197932053, "eval_loss": 0.2686930000782013, "eval_runtime": 5.861, "eval_samples_per_second": 8.531, "eval_steps_per_second": 1.194, "step": 3750 }, { "epoch": 1.3851550960118169, "grad_norm": 0.3068532943725586, "learning_rate": 0.00010772262593915508, "loss": 0.253, "step": 3751 }, { "epoch": 1.3855243722304285, "grad_norm": 0.26122722029685974, "learning_rate": 0.00010769799236359157, "loss": 0.2271, "step": 3752 }, { "epoch": 1.3858936484490398, "grad_norm": 0.2446070909500122, "learning_rate": 0.0001076733587880281, "loss": 0.2386, "step": 3753 }, { "epoch": 1.3862629246676514, "grad_norm": 0.30802103877067566, "learning_rate": 0.00010764872521246458, "loss": 0.2234, "step": 3754 }, { "epoch": 1.3866322008862628, "grad_norm": 0.26052504777908325, "learning_rate": 0.00010762409163690111, "loss": 0.2106, "step": 3755 }, { "epoch": 1.3870014771048744, "grad_norm": 0.275816410779953, "learning_rate": 0.0001075994580613376, "loss": 0.2401, "step": 3756 }, { "epoch": 1.387370753323486, "grad_norm": 0.32631778717041016, "learning_rate": 0.00010757482448577411, "loss": 0.2508, "step": 3757 }, { "epoch": 1.3877400295420974, "grad_norm": 0.2649644613265991, "learning_rate": 0.00010755019091021061, "loss": 0.2446, "step": 3758 }, { "epoch": 1.388109305760709, "grad_norm": 0.29543444514274597, "learning_rate": 0.00010752555733464713, "loss": 0.2917, "step": 3759 }, { "epoch": 1.3884785819793206, "grad_norm": 0.26043254137039185, "learning_rate": 0.00010750092375908363, "loss": 0.2437, "step": 3760 }, { "epoch": 1.388847858197932, "grad_norm": 0.29932519793510437, "learning_rate": 0.00010747629018352014, "loss": 0.3138, "step": 3761 }, { "epoch": 1.3892171344165436, "grad_norm": 0.26750093698501587, "learning_rate": 0.00010745165660795664, "loss": 0.2127, "step": 3762 }, { "epoch": 1.3895864106351552, "grad_norm": 0.25077375769615173, "learning_rate": 0.00010742702303239316, "loss": 0.2429, "step": 3763 }, { "epoch": 1.3899556868537666, "grad_norm": 0.3207470774650574, "learning_rate": 0.00010740238945682966, "loss": 0.2614, "step": 3764 }, { "epoch": 1.3903249630723782, "grad_norm": 0.2767865061759949, "learning_rate": 0.00010737775588126617, "loss": 0.2872, "step": 3765 }, { "epoch": 1.3906942392909896, "grad_norm": 0.2964353859424591, "learning_rate": 0.00010735312230570268, "loss": 0.2995, "step": 3766 }, { "epoch": 1.3910635155096012, "grad_norm": 0.25089460611343384, "learning_rate": 0.00010732848873013919, "loss": 0.2297, "step": 3767 }, { "epoch": 1.3914327917282128, "grad_norm": 0.2735138237476349, "learning_rate": 0.00010730385515457568, "loss": 0.2655, "step": 3768 }, { "epoch": 1.3918020679468242, "grad_norm": 0.26005735993385315, "learning_rate": 0.0001072792215790122, "loss": 0.256, "step": 3769 }, { "epoch": 1.3921713441654358, "grad_norm": 0.4221028983592987, "learning_rate": 0.00010725458800344869, "loss": 0.2676, "step": 3770 }, { "epoch": 1.3925406203840471, "grad_norm": 0.23463353514671326, "learning_rate": 0.00010722995442788522, "loss": 0.2255, "step": 3771 }, { "epoch": 1.3929098966026587, "grad_norm": 0.2788041830062866, "learning_rate": 0.00010720532085232171, "loss": 0.2473, "step": 3772 }, { "epoch": 1.3932791728212703, "grad_norm": 0.2994675934314728, "learning_rate": 0.00010718068727675822, "loss": 0.2383, "step": 3773 }, { "epoch": 1.393648449039882, "grad_norm": 0.23976139724254608, "learning_rate": 0.00010715605370119472, "loss": 0.2157, "step": 3774 }, { "epoch": 1.3940177252584933, "grad_norm": 0.3295306861400604, "learning_rate": 0.00010713142012563124, "loss": 0.3445, "step": 3775 }, { "epoch": 1.394387001477105, "grad_norm": 0.2559310793876648, "learning_rate": 0.00010710678655006774, "loss": 0.2262, "step": 3776 }, { "epoch": 1.3947562776957163, "grad_norm": 0.2468082755804062, "learning_rate": 0.00010708215297450425, "loss": 0.2519, "step": 3777 }, { "epoch": 1.395125553914328, "grad_norm": 0.4029478430747986, "learning_rate": 0.00010705751939894076, "loss": 0.2463, "step": 3778 }, { "epoch": 1.3954948301329395, "grad_norm": 0.33493492007255554, "learning_rate": 0.00010703288582337727, "loss": 0.2887, "step": 3779 }, { "epoch": 1.395864106351551, "grad_norm": 0.30173709988594055, "learning_rate": 0.00010700825224781377, "loss": 0.2067, "step": 3780 }, { "epoch": 1.3962333825701625, "grad_norm": 0.2795202136039734, "learning_rate": 0.00010698361867225029, "loss": 0.2169, "step": 3781 }, { "epoch": 1.3966026587887739, "grad_norm": 0.2911227345466614, "learning_rate": 0.00010695898509668679, "loss": 0.2852, "step": 3782 }, { "epoch": 1.3969719350073855, "grad_norm": 0.25937220454216003, "learning_rate": 0.0001069343515211233, "loss": 0.2537, "step": 3783 }, { "epoch": 1.397341211225997, "grad_norm": 0.2779325842857361, "learning_rate": 0.00010690971794555979, "loss": 0.2231, "step": 3784 }, { "epoch": 1.3977104874446087, "grad_norm": 0.29453492164611816, "learning_rate": 0.00010688508436999632, "loss": 0.2488, "step": 3785 }, { "epoch": 1.39807976366322, "grad_norm": 0.2754989266395569, "learning_rate": 0.0001068604507944328, "loss": 0.2496, "step": 3786 }, { "epoch": 1.3984490398818317, "grad_norm": 0.31683996319770813, "learning_rate": 0.00010683581721886933, "loss": 0.2838, "step": 3787 }, { "epoch": 1.398818316100443, "grad_norm": 0.23349718749523163, "learning_rate": 0.00010681118364330582, "loss": 0.2114, "step": 3788 }, { "epoch": 1.3991875923190547, "grad_norm": 0.30497369170188904, "learning_rate": 0.00010678655006774234, "loss": 0.2579, "step": 3789 }, { "epoch": 1.3995568685376663, "grad_norm": 0.2672632932662964, "learning_rate": 0.00010676191649217884, "loss": 0.236, "step": 3790 }, { "epoch": 1.3999261447562776, "grad_norm": 0.25800979137420654, "learning_rate": 0.00010673728291661535, "loss": 0.2195, "step": 3791 }, { "epoch": 1.4002954209748892, "grad_norm": 0.29944947361946106, "learning_rate": 0.00010671264934105185, "loss": 0.269, "step": 3792 }, { "epoch": 1.4006646971935006, "grad_norm": 0.23728716373443604, "learning_rate": 0.00010668801576548837, "loss": 0.232, "step": 3793 }, { "epoch": 1.4010339734121122, "grad_norm": 0.27027857303619385, "learning_rate": 0.00010666338218992487, "loss": 0.2326, "step": 3794 }, { "epoch": 1.4014032496307238, "grad_norm": 0.3711899220943451, "learning_rate": 0.00010663874861436138, "loss": 0.2778, "step": 3795 }, { "epoch": 1.4017725258493354, "grad_norm": 0.3063708543777466, "learning_rate": 0.00010661411503879788, "loss": 0.2505, "step": 3796 }, { "epoch": 1.4021418020679468, "grad_norm": 0.3206406533718109, "learning_rate": 0.0001065894814632344, "loss": 0.2306, "step": 3797 }, { "epoch": 1.4025110782865584, "grad_norm": 0.28478190302848816, "learning_rate": 0.0001065648478876709, "loss": 0.225, "step": 3798 }, { "epoch": 1.4028803545051698, "grad_norm": 0.25216224789619446, "learning_rate": 0.00010654021431210741, "loss": 0.2308, "step": 3799 }, { "epoch": 1.4032496307237814, "grad_norm": 0.2996431589126587, "learning_rate": 0.0001065155807365439, "loss": 0.2577, "step": 3800 }, { "epoch": 1.4032496307237814, "eval_loss": 0.2693694233894348, "eval_runtime": 5.8472, "eval_samples_per_second": 8.551, "eval_steps_per_second": 1.197, "step": 3800 }, { "epoch": 1.403618906942393, "grad_norm": 0.35670316219329834, "learning_rate": 0.00010649094716098043, "loss": 0.2303, "step": 3801 }, { "epoch": 1.4039881831610044, "grad_norm": 0.23958542943000793, "learning_rate": 0.00010646631358541692, "loss": 0.2306, "step": 3802 }, { "epoch": 1.404357459379616, "grad_norm": 0.32521572709083557, "learning_rate": 0.00010644168000985345, "loss": 0.2521, "step": 3803 }, { "epoch": 1.4047267355982274, "grad_norm": 0.2848784327507019, "learning_rate": 0.00010641704643428993, "loss": 0.2835, "step": 3804 }, { "epoch": 1.405096011816839, "grad_norm": 0.2572305500507355, "learning_rate": 0.00010639241285872645, "loss": 0.2469, "step": 3805 }, { "epoch": 1.4054652880354506, "grad_norm": 0.3307071626186371, "learning_rate": 0.00010636777928316295, "loss": 0.2501, "step": 3806 }, { "epoch": 1.4058345642540622, "grad_norm": 0.2324734479188919, "learning_rate": 0.00010634314570759946, "loss": 0.2143, "step": 3807 }, { "epoch": 1.4062038404726735, "grad_norm": 0.2600086033344269, "learning_rate": 0.00010631851213203596, "loss": 0.2514, "step": 3808 }, { "epoch": 1.4065731166912852, "grad_norm": 0.25537365674972534, "learning_rate": 0.00010629387855647248, "loss": 0.222, "step": 3809 }, { "epoch": 1.4069423929098965, "grad_norm": 0.2847685217857361, "learning_rate": 0.00010626924498090898, "loss": 0.2719, "step": 3810 }, { "epoch": 1.4073116691285081, "grad_norm": 0.21500246226787567, "learning_rate": 0.0001062446114053455, "loss": 0.1824, "step": 3811 }, { "epoch": 1.4076809453471197, "grad_norm": 0.3191910982131958, "learning_rate": 0.000106219977829782, "loss": 0.2317, "step": 3812 }, { "epoch": 1.4080502215657311, "grad_norm": 0.22762149572372437, "learning_rate": 0.00010619534425421851, "loss": 0.217, "step": 3813 }, { "epoch": 1.4084194977843427, "grad_norm": 0.23422375321388245, "learning_rate": 0.00010617071067865501, "loss": 0.237, "step": 3814 }, { "epoch": 1.408788774002954, "grad_norm": 0.2757764458656311, "learning_rate": 0.00010614607710309153, "loss": 0.2148, "step": 3815 }, { "epoch": 1.4091580502215657, "grad_norm": 0.5969408750534058, "learning_rate": 0.00010612144352752801, "loss": 0.285, "step": 3816 }, { "epoch": 1.4095273264401773, "grad_norm": 0.3429873585700989, "learning_rate": 0.00010609680995196454, "loss": 0.2885, "step": 3817 }, { "epoch": 1.409896602658789, "grad_norm": 0.2952936887741089, "learning_rate": 0.00010607217637640103, "loss": 0.2207, "step": 3818 }, { "epoch": 1.4102658788774003, "grad_norm": 0.273403137922287, "learning_rate": 0.00010604754280083756, "loss": 0.2558, "step": 3819 }, { "epoch": 1.410635155096012, "grad_norm": 0.22843249142169952, "learning_rate": 0.00010602290922527405, "loss": 0.1915, "step": 3820 }, { "epoch": 1.4110044313146233, "grad_norm": 0.24980376660823822, "learning_rate": 0.00010599827564971056, "loss": 0.2318, "step": 3821 }, { "epoch": 1.4113737075332349, "grad_norm": 0.2425321340560913, "learning_rate": 0.00010597364207414706, "loss": 0.2232, "step": 3822 }, { "epoch": 1.4117429837518465, "grad_norm": 0.2785226106643677, "learning_rate": 0.00010594900849858358, "loss": 0.2312, "step": 3823 }, { "epoch": 1.4121122599704579, "grad_norm": 0.22242644429206848, "learning_rate": 0.00010592437492302008, "loss": 0.1902, "step": 3824 }, { "epoch": 1.4124815361890695, "grad_norm": 0.26450857520103455, "learning_rate": 0.00010589974134745659, "loss": 0.2219, "step": 3825 }, { "epoch": 1.4128508124076808, "grad_norm": 0.26558247208595276, "learning_rate": 0.00010587510777189309, "loss": 0.2296, "step": 3826 }, { "epoch": 1.4132200886262924, "grad_norm": 0.2665616571903229, "learning_rate": 0.00010585047419632961, "loss": 0.2357, "step": 3827 }, { "epoch": 1.413589364844904, "grad_norm": 0.24615925550460815, "learning_rate": 0.00010582584062076611, "loss": 0.233, "step": 3828 }, { "epoch": 1.4139586410635154, "grad_norm": 0.25677865743637085, "learning_rate": 0.00010580120704520262, "loss": 0.2011, "step": 3829 }, { "epoch": 1.414327917282127, "grad_norm": 0.25913333892822266, "learning_rate": 0.00010577657346963912, "loss": 0.2396, "step": 3830 }, { "epoch": 1.4146971935007384, "grad_norm": 0.3277101218700409, "learning_rate": 0.00010575193989407564, "loss": 0.2446, "step": 3831 }, { "epoch": 1.41506646971935, "grad_norm": 0.23574373126029968, "learning_rate": 0.00010572730631851213, "loss": 0.2325, "step": 3832 }, { "epoch": 1.4154357459379616, "grad_norm": 0.25479990243911743, "learning_rate": 0.00010570267274294865, "loss": 0.2242, "step": 3833 }, { "epoch": 1.4158050221565732, "grad_norm": 0.2911861538887024, "learning_rate": 0.00010567803916738514, "loss": 0.2126, "step": 3834 }, { "epoch": 1.4161742983751846, "grad_norm": 0.24522805213928223, "learning_rate": 0.00010565340559182167, "loss": 0.2366, "step": 3835 }, { "epoch": 1.4165435745937962, "grad_norm": 0.2456912100315094, "learning_rate": 0.00010562877201625816, "loss": 0.2008, "step": 3836 }, { "epoch": 1.4169128508124076, "grad_norm": 0.24651378393173218, "learning_rate": 0.00010560413844069467, "loss": 0.2032, "step": 3837 }, { "epoch": 1.4172821270310192, "grad_norm": 0.28535136580467224, "learning_rate": 0.00010557950486513117, "loss": 0.2572, "step": 3838 }, { "epoch": 1.4176514032496308, "grad_norm": 0.3038933277130127, "learning_rate": 0.00010555487128956769, "loss": 0.2531, "step": 3839 }, { "epoch": 1.4180206794682422, "grad_norm": 0.27011793851852417, "learning_rate": 0.00010553023771400419, "loss": 0.2549, "step": 3840 }, { "epoch": 1.4183899556868538, "grad_norm": 0.21485698223114014, "learning_rate": 0.0001055056041384407, "loss": 0.1899, "step": 3841 }, { "epoch": 1.4187592319054652, "grad_norm": 0.3180251717567444, "learning_rate": 0.0001054809705628772, "loss": 0.2393, "step": 3842 }, { "epoch": 1.4191285081240768, "grad_norm": 0.2945382297039032, "learning_rate": 0.00010545633698731372, "loss": 0.2455, "step": 3843 }, { "epoch": 1.4194977843426884, "grad_norm": 0.29279324412345886, "learning_rate": 0.00010543170341175022, "loss": 0.251, "step": 3844 }, { "epoch": 1.4198670605613, "grad_norm": 0.30234163999557495, "learning_rate": 0.00010540706983618674, "loss": 0.2545, "step": 3845 }, { "epoch": 1.4202363367799113, "grad_norm": 0.23178504407405853, "learning_rate": 0.00010538243626062322, "loss": 0.2203, "step": 3846 }, { "epoch": 1.420605612998523, "grad_norm": 0.24344073235988617, "learning_rate": 0.00010535780268505975, "loss": 0.2348, "step": 3847 }, { "epoch": 1.4209748892171343, "grad_norm": 0.2559746503829956, "learning_rate": 0.00010533316910949624, "loss": 0.2093, "step": 3848 }, { "epoch": 1.421344165435746, "grad_norm": 0.3378274738788605, "learning_rate": 0.00010530853553393277, "loss": 0.2978, "step": 3849 }, { "epoch": 1.4217134416543575, "grad_norm": 0.26383623480796814, "learning_rate": 0.00010528390195836925, "loss": 0.2372, "step": 3850 }, { "epoch": 1.4217134416543575, "eval_loss": 0.2668742835521698, "eval_runtime": 5.8577, "eval_samples_per_second": 8.536, "eval_steps_per_second": 1.195, "step": 3850 }, { "epoch": 1.422082717872969, "grad_norm": 0.27024218440055847, "learning_rate": 0.00010525926838280577, "loss": 0.2198, "step": 3851 }, { "epoch": 1.4224519940915805, "grad_norm": 0.2606055736541748, "learning_rate": 0.00010523463480724227, "loss": 0.2284, "step": 3852 }, { "epoch": 1.422821270310192, "grad_norm": 0.2458440065383911, "learning_rate": 0.00010521000123167878, "loss": 0.1999, "step": 3853 }, { "epoch": 1.4231905465288035, "grad_norm": 0.36551737785339355, "learning_rate": 0.00010518536765611529, "loss": 0.2538, "step": 3854 }, { "epoch": 1.423559822747415, "grad_norm": 0.30359822511672974, "learning_rate": 0.0001051607340805518, "loss": 0.2623, "step": 3855 }, { "epoch": 1.4239290989660267, "grad_norm": 0.19720108807086945, "learning_rate": 0.0001051361005049883, "loss": 0.1671, "step": 3856 }, { "epoch": 1.424298375184638, "grad_norm": 0.31897875666618347, "learning_rate": 0.00010511146692942482, "loss": 0.2078, "step": 3857 }, { "epoch": 1.4246676514032497, "grad_norm": 0.2386259138584137, "learning_rate": 0.00010508683335386132, "loss": 0.2147, "step": 3858 }, { "epoch": 1.425036927621861, "grad_norm": 0.2897254228591919, "learning_rate": 0.00010506219977829783, "loss": 0.2349, "step": 3859 }, { "epoch": 1.4254062038404727, "grad_norm": 0.3066774308681488, "learning_rate": 0.00010503756620273433, "loss": 0.241, "step": 3860 }, { "epoch": 1.4257754800590843, "grad_norm": 0.30762627720832825, "learning_rate": 0.00010501293262717085, "loss": 0.2634, "step": 3861 }, { "epoch": 1.4261447562776957, "grad_norm": 0.2821105122566223, "learning_rate": 0.00010498829905160734, "loss": 0.2391, "step": 3862 }, { "epoch": 1.4265140324963073, "grad_norm": 0.365055650472641, "learning_rate": 0.00010496366547604386, "loss": 0.2756, "step": 3863 }, { "epoch": 1.4268833087149186, "grad_norm": 0.31532981991767883, "learning_rate": 0.00010493903190048035, "loss": 0.2579, "step": 3864 }, { "epoch": 1.4272525849335302, "grad_norm": 0.27469801902770996, "learning_rate": 0.00010491439832491688, "loss": 0.2225, "step": 3865 }, { "epoch": 1.4276218611521418, "grad_norm": 0.30740001797676086, "learning_rate": 0.00010488976474935337, "loss": 0.3066, "step": 3866 }, { "epoch": 1.4279911373707534, "grad_norm": 0.23712503910064697, "learning_rate": 0.00010486513117378988, "loss": 0.1868, "step": 3867 }, { "epoch": 1.4283604135893648, "grad_norm": 0.28384414315223694, "learning_rate": 0.00010484049759822638, "loss": 0.2263, "step": 3868 }, { "epoch": 1.4287296898079764, "grad_norm": 0.22514435648918152, "learning_rate": 0.0001048158640226629, "loss": 0.2147, "step": 3869 }, { "epoch": 1.4290989660265878, "grad_norm": 0.31152236461639404, "learning_rate": 0.0001047912304470994, "loss": 0.2477, "step": 3870 }, { "epoch": 1.4294682422451994, "grad_norm": 0.35805216431617737, "learning_rate": 0.00010476659687153591, "loss": 0.2623, "step": 3871 }, { "epoch": 1.429837518463811, "grad_norm": 0.2790660858154297, "learning_rate": 0.00010474196329597241, "loss": 0.2352, "step": 3872 }, { "epoch": 1.4302067946824224, "grad_norm": 0.2426290512084961, "learning_rate": 0.00010471732972040893, "loss": 0.2381, "step": 3873 }, { "epoch": 1.430576070901034, "grad_norm": 0.24818822741508484, "learning_rate": 0.00010469269614484543, "loss": 0.2023, "step": 3874 }, { "epoch": 1.4309453471196454, "grad_norm": 0.325644850730896, "learning_rate": 0.00010466806256928194, "loss": 0.2917, "step": 3875 }, { "epoch": 1.431314623338257, "grad_norm": 0.2357746958732605, "learning_rate": 0.00010464342899371845, "loss": 0.2195, "step": 3876 }, { "epoch": 1.4316838995568686, "grad_norm": 0.292472779750824, "learning_rate": 0.00010461879541815496, "loss": 0.209, "step": 3877 }, { "epoch": 1.4320531757754802, "grad_norm": 0.2933250367641449, "learning_rate": 0.00010459416184259145, "loss": 0.2442, "step": 3878 }, { "epoch": 1.4324224519940916, "grad_norm": 0.2687840461730957, "learning_rate": 0.00010456952826702798, "loss": 0.21, "step": 3879 }, { "epoch": 1.4327917282127032, "grad_norm": 0.33804115653038025, "learning_rate": 0.00010454489469146446, "loss": 0.2396, "step": 3880 }, { "epoch": 1.4331610044313146, "grad_norm": 0.25264784693717957, "learning_rate": 0.00010452026111590099, "loss": 0.239, "step": 3881 }, { "epoch": 1.4335302806499262, "grad_norm": 0.3141342103481293, "learning_rate": 0.00010449562754033748, "loss": 0.2376, "step": 3882 }, { "epoch": 1.4338995568685378, "grad_norm": 0.23936063051223755, "learning_rate": 0.000104470993964774, "loss": 0.2067, "step": 3883 }, { "epoch": 1.4342688330871491, "grad_norm": 0.32110515236854553, "learning_rate": 0.0001044463603892105, "loss": 0.2483, "step": 3884 }, { "epoch": 1.4346381093057607, "grad_norm": 0.4062122702598572, "learning_rate": 0.00010442172681364701, "loss": 0.2665, "step": 3885 }, { "epoch": 1.4350073855243721, "grad_norm": 0.2796575725078583, "learning_rate": 0.00010439709323808351, "loss": 0.243, "step": 3886 }, { "epoch": 1.4353766617429837, "grad_norm": 0.23514513671398163, "learning_rate": 0.00010437245966252003, "loss": 0.2293, "step": 3887 }, { "epoch": 1.4357459379615953, "grad_norm": 0.2974199056625366, "learning_rate": 0.00010434782608695653, "loss": 0.2577, "step": 3888 }, { "epoch": 1.4361152141802067, "grad_norm": 0.31965237855911255, "learning_rate": 0.00010432319251139304, "loss": 0.2676, "step": 3889 }, { "epoch": 1.4364844903988183, "grad_norm": 0.31490933895111084, "learning_rate": 0.00010429855893582954, "loss": 0.2708, "step": 3890 }, { "epoch": 1.43685376661743, "grad_norm": 0.3214377164840698, "learning_rate": 0.00010427392536026606, "loss": 0.2886, "step": 3891 }, { "epoch": 1.4372230428360413, "grad_norm": 0.2550215721130371, "learning_rate": 0.00010424929178470256, "loss": 0.2547, "step": 3892 }, { "epoch": 1.437592319054653, "grad_norm": 0.2504318356513977, "learning_rate": 0.00010422465820913907, "loss": 0.2254, "step": 3893 }, { "epoch": 1.4379615952732645, "grad_norm": 0.2645464241504669, "learning_rate": 0.00010420002463357556, "loss": 0.286, "step": 3894 }, { "epoch": 1.4383308714918759, "grad_norm": 0.25396862626075745, "learning_rate": 0.00010417539105801209, "loss": 0.2016, "step": 3895 }, { "epoch": 1.4387001477104875, "grad_norm": 0.25158530473709106, "learning_rate": 0.00010415075748244858, "loss": 0.2286, "step": 3896 }, { "epoch": 1.4390694239290989, "grad_norm": 0.25421851873397827, "learning_rate": 0.0001041261239068851, "loss": 0.2338, "step": 3897 }, { "epoch": 1.4394387001477105, "grad_norm": 0.22105631232261658, "learning_rate": 0.00010410149033132159, "loss": 0.2012, "step": 3898 }, { "epoch": 1.439807976366322, "grad_norm": 0.3331243395805359, "learning_rate": 0.0001040768567557581, "loss": 0.2238, "step": 3899 }, { "epoch": 1.4401772525849335, "grad_norm": 0.3024211525917053, "learning_rate": 0.00010405222318019461, "loss": 0.2446, "step": 3900 }, { "epoch": 1.4401772525849335, "eval_loss": 0.2657105326652527, "eval_runtime": 5.8545, "eval_samples_per_second": 8.54, "eval_steps_per_second": 1.196, "step": 3900 }, { "epoch": 1.440546528803545, "grad_norm": 0.25392836332321167, "learning_rate": 0.00010402758960463112, "loss": 0.2286, "step": 3901 }, { "epoch": 1.4409158050221564, "grad_norm": 0.27044492959976196, "learning_rate": 0.00010400295602906762, "loss": 0.2382, "step": 3902 }, { "epoch": 1.441285081240768, "grad_norm": 0.2769128382205963, "learning_rate": 0.00010397832245350414, "loss": 0.2287, "step": 3903 }, { "epoch": 1.4416543574593796, "grad_norm": 0.2098458856344223, "learning_rate": 0.00010395368887794064, "loss": 0.1921, "step": 3904 }, { "epoch": 1.4420236336779912, "grad_norm": 0.2717629373073578, "learning_rate": 0.00010392905530237715, "loss": 0.2175, "step": 3905 }, { "epoch": 1.4423929098966026, "grad_norm": 0.26989179849624634, "learning_rate": 0.00010390442172681365, "loss": 0.2445, "step": 3906 }, { "epoch": 1.4427621861152142, "grad_norm": 0.26079505681991577, "learning_rate": 0.00010387978815125017, "loss": 0.2269, "step": 3907 }, { "epoch": 1.4431314623338256, "grad_norm": 0.32042261958122253, "learning_rate": 0.00010385515457568667, "loss": 0.2832, "step": 3908 }, { "epoch": 1.4435007385524372, "grad_norm": 0.24444133043289185, "learning_rate": 0.00010383052100012318, "loss": 0.1896, "step": 3909 }, { "epoch": 1.4438700147710488, "grad_norm": 0.27994006872177124, "learning_rate": 0.00010380588742455967, "loss": 0.2309, "step": 3910 }, { "epoch": 1.4442392909896602, "grad_norm": 0.2504062056541443, "learning_rate": 0.0001037812538489962, "loss": 0.2309, "step": 3911 }, { "epoch": 1.4446085672082718, "grad_norm": 0.2949374318122864, "learning_rate": 0.00010375662027343269, "loss": 0.2603, "step": 3912 }, { "epoch": 1.4449778434268832, "grad_norm": 0.3061377704143524, "learning_rate": 0.00010373198669786922, "loss": 0.2604, "step": 3913 }, { "epoch": 1.4453471196454948, "grad_norm": 0.25518345832824707, "learning_rate": 0.0001037073531223057, "loss": 0.1952, "step": 3914 }, { "epoch": 1.4457163958641064, "grad_norm": 0.2688634395599365, "learning_rate": 0.00010368271954674222, "loss": 0.2271, "step": 3915 }, { "epoch": 1.446085672082718, "grad_norm": 0.3324325680732727, "learning_rate": 0.00010365808597117872, "loss": 0.26, "step": 3916 }, { "epoch": 1.4464549483013294, "grad_norm": 0.3041841685771942, "learning_rate": 0.00010363345239561523, "loss": 0.2409, "step": 3917 }, { "epoch": 1.446824224519941, "grad_norm": 0.28644803166389465, "learning_rate": 0.00010360881882005173, "loss": 0.2554, "step": 3918 }, { "epoch": 1.4471935007385524, "grad_norm": 0.22980338335037231, "learning_rate": 0.00010358418524448825, "loss": 0.208, "step": 3919 }, { "epoch": 1.447562776957164, "grad_norm": 0.24705706536769867, "learning_rate": 0.00010355955166892475, "loss": 0.2072, "step": 3920 }, { "epoch": 1.4479320531757756, "grad_norm": 0.3009048402309418, "learning_rate": 0.00010353491809336127, "loss": 0.2503, "step": 3921 }, { "epoch": 1.448301329394387, "grad_norm": 0.28049546480178833, "learning_rate": 0.00010351028451779777, "loss": 0.2592, "step": 3922 }, { "epoch": 1.4486706056129985, "grad_norm": 0.26004844903945923, "learning_rate": 0.00010348565094223428, "loss": 0.2328, "step": 3923 }, { "epoch": 1.44903988183161, "grad_norm": 0.26922911405563354, "learning_rate": 0.00010346101736667078, "loss": 0.233, "step": 3924 }, { "epoch": 1.4494091580502215, "grad_norm": 0.27191996574401855, "learning_rate": 0.0001034363837911073, "loss": 0.2253, "step": 3925 }, { "epoch": 1.4497784342688331, "grad_norm": 0.2850154936313629, "learning_rate": 0.00010341175021554378, "loss": 0.2856, "step": 3926 }, { "epoch": 1.4501477104874447, "grad_norm": 0.4026567041873932, "learning_rate": 0.00010338711663998031, "loss": 0.2571, "step": 3927 }, { "epoch": 1.450516986706056, "grad_norm": 0.24033533036708832, "learning_rate": 0.0001033624830644168, "loss": 0.2057, "step": 3928 }, { "epoch": 1.4508862629246677, "grad_norm": 0.29191315174102783, "learning_rate": 0.00010333784948885333, "loss": 0.2538, "step": 3929 }, { "epoch": 1.451255539143279, "grad_norm": 0.31869855523109436, "learning_rate": 0.00010331321591328982, "loss": 0.2527, "step": 3930 }, { "epoch": 1.4516248153618907, "grad_norm": 0.2808881998062134, "learning_rate": 0.00010328858233772633, "loss": 0.2236, "step": 3931 }, { "epoch": 1.4519940915805023, "grad_norm": 0.2541932463645935, "learning_rate": 0.00010326394876216283, "loss": 0.2182, "step": 3932 }, { "epoch": 1.4523633677991137, "grad_norm": 0.2607090175151825, "learning_rate": 0.00010323931518659935, "loss": 0.2303, "step": 3933 }, { "epoch": 1.4527326440177253, "grad_norm": 0.2605319917201996, "learning_rate": 0.00010321468161103585, "loss": 0.2352, "step": 3934 }, { "epoch": 1.4531019202363367, "grad_norm": 0.27073273062705994, "learning_rate": 0.00010319004803547236, "loss": 0.221, "step": 3935 }, { "epoch": 1.4534711964549483, "grad_norm": 0.2717744708061218, "learning_rate": 0.00010316541445990886, "loss": 0.2141, "step": 3936 }, { "epoch": 1.4538404726735599, "grad_norm": 0.2980343699455261, "learning_rate": 0.00010314078088434538, "loss": 0.2532, "step": 3937 }, { "epoch": 1.4542097488921715, "grad_norm": 0.3076517581939697, "learning_rate": 0.00010311614730878188, "loss": 0.2429, "step": 3938 }, { "epoch": 1.4545790251107829, "grad_norm": 0.21933375298976898, "learning_rate": 0.0001030915137332184, "loss": 0.1826, "step": 3939 }, { "epoch": 1.4549483013293945, "grad_norm": 0.3170267939567566, "learning_rate": 0.0001030668801576549, "loss": 0.2596, "step": 3940 }, { "epoch": 1.4553175775480058, "grad_norm": 0.30827200412750244, "learning_rate": 0.00010304224658209141, "loss": 0.2941, "step": 3941 }, { "epoch": 1.4556868537666174, "grad_norm": 0.27948877215385437, "learning_rate": 0.0001030176130065279, "loss": 0.2578, "step": 3942 }, { "epoch": 1.456056129985229, "grad_norm": 0.3193192780017853, "learning_rate": 0.00010299297943096442, "loss": 0.3213, "step": 3943 }, { "epoch": 1.4564254062038404, "grad_norm": 0.2982330620288849, "learning_rate": 0.00010296834585540091, "loss": 0.213, "step": 3944 }, { "epoch": 1.456794682422452, "grad_norm": 0.25382521748542786, "learning_rate": 0.00010294371227983744, "loss": 0.191, "step": 3945 }, { "epoch": 1.4571639586410634, "grad_norm": 0.28603941202163696, "learning_rate": 0.00010291907870427393, "loss": 0.2206, "step": 3946 }, { "epoch": 1.457533234859675, "grad_norm": 0.2785376310348511, "learning_rate": 0.00010289444512871044, "loss": 0.2476, "step": 3947 }, { "epoch": 1.4579025110782866, "grad_norm": 0.3084624111652374, "learning_rate": 0.00010286981155314694, "loss": 0.2835, "step": 3948 }, { "epoch": 1.4582717872968982, "grad_norm": 0.2752220630645752, "learning_rate": 0.00010284517797758344, "loss": 0.2326, "step": 3949 }, { "epoch": 1.4586410635155096, "grad_norm": 0.22812512516975403, "learning_rate": 0.00010282054440201996, "loss": 0.2256, "step": 3950 }, { "epoch": 1.4586410635155096, "eval_loss": 0.26708507537841797, "eval_runtime": 5.8575, "eval_samples_per_second": 8.536, "eval_steps_per_second": 1.195, "step": 3950 }, { "epoch": 1.4590103397341212, "grad_norm": 0.2775406241416931, "learning_rate": 0.00010279591082645646, "loss": 0.2589, "step": 3951 }, { "epoch": 1.4593796159527326, "grad_norm": 0.23759698867797852, "learning_rate": 0.00010277127725089298, "loss": 0.1956, "step": 3952 }, { "epoch": 1.4597488921713442, "grad_norm": 0.243507981300354, "learning_rate": 0.00010274664367532946, "loss": 0.2154, "step": 3953 }, { "epoch": 1.4601181683899558, "grad_norm": 0.24589616060256958, "learning_rate": 0.00010272201009976599, "loss": 0.2494, "step": 3954 }, { "epoch": 1.4604874446085672, "grad_norm": 0.25779590010643005, "learning_rate": 0.00010269737652420248, "loss": 0.2359, "step": 3955 }, { "epoch": 1.4608567208271788, "grad_norm": 0.23699389398097992, "learning_rate": 0.000102672742948639, "loss": 0.2466, "step": 3956 }, { "epoch": 1.4612259970457901, "grad_norm": 0.22925809025764465, "learning_rate": 0.0001026481093730755, "loss": 0.2097, "step": 3957 }, { "epoch": 1.4615952732644018, "grad_norm": 0.2855239808559418, "learning_rate": 0.00010262347579751201, "loss": 0.2594, "step": 3958 }, { "epoch": 1.4619645494830134, "grad_norm": 0.2658534049987793, "learning_rate": 0.00010259884222194851, "loss": 0.2354, "step": 3959 }, { "epoch": 1.4623338257016247, "grad_norm": 0.31895971298217773, "learning_rate": 0.00010257420864638502, "loss": 0.2417, "step": 3960 }, { "epoch": 1.4627031019202363, "grad_norm": 0.24943141639232635, "learning_rate": 0.00010254957507082153, "loss": 0.2065, "step": 3961 }, { "epoch": 1.463072378138848, "grad_norm": 0.24429693818092346, "learning_rate": 0.00010252494149525804, "loss": 0.2326, "step": 3962 }, { "epoch": 1.4634416543574593, "grad_norm": 0.386321485042572, "learning_rate": 0.00010250030791969454, "loss": 0.2663, "step": 3963 }, { "epoch": 1.463810930576071, "grad_norm": 0.29140713810920715, "learning_rate": 0.00010247567434413106, "loss": 0.2227, "step": 3964 }, { "epoch": 1.4641802067946825, "grad_norm": 0.2905665934085846, "learning_rate": 0.00010245104076856756, "loss": 0.2463, "step": 3965 }, { "epoch": 1.464549483013294, "grad_norm": 0.3086444139480591, "learning_rate": 0.00010242640719300407, "loss": 0.2811, "step": 3966 }, { "epoch": 1.4649187592319055, "grad_norm": 0.3844276964664459, "learning_rate": 0.00010240177361744057, "loss": 0.2513, "step": 3967 }, { "epoch": 1.465288035450517, "grad_norm": 0.2641187310218811, "learning_rate": 0.00010237714004187709, "loss": 0.2173, "step": 3968 }, { "epoch": 1.4656573116691285, "grad_norm": 0.3104974329471588, "learning_rate": 0.00010235250646631357, "loss": 0.2709, "step": 3969 }, { "epoch": 1.46602658788774, "grad_norm": 0.35863199830055237, "learning_rate": 0.0001023278728907501, "loss": 0.2933, "step": 3970 }, { "epoch": 1.4663958641063515, "grad_norm": 0.26285624504089355, "learning_rate": 0.00010230323931518659, "loss": 0.2349, "step": 3971 }, { "epoch": 1.466765140324963, "grad_norm": 0.283856600522995, "learning_rate": 0.00010227860573962312, "loss": 0.2417, "step": 3972 }, { "epoch": 1.4671344165435745, "grad_norm": 0.24891315400600433, "learning_rate": 0.0001022539721640596, "loss": 0.2046, "step": 3973 }, { "epoch": 1.467503692762186, "grad_norm": 0.30018529295921326, "learning_rate": 0.00010222933858849612, "loss": 0.2569, "step": 3974 }, { "epoch": 1.4678729689807977, "grad_norm": 0.30528995394706726, "learning_rate": 0.00010220470501293262, "loss": 0.2863, "step": 3975 }, { "epoch": 1.4682422451994093, "grad_norm": 0.20827481150627136, "learning_rate": 0.00010218007143736914, "loss": 0.214, "step": 3976 }, { "epoch": 1.4686115214180206, "grad_norm": 0.2594269812107086, "learning_rate": 0.00010215543786180564, "loss": 0.224, "step": 3977 }, { "epoch": 1.4689807976366323, "grad_norm": 0.3854300379753113, "learning_rate": 0.00010213080428624215, "loss": 0.2729, "step": 3978 }, { "epoch": 1.4693500738552436, "grad_norm": 0.2656078636646271, "learning_rate": 0.00010210617071067865, "loss": 0.2111, "step": 3979 }, { "epoch": 1.4697193500738552, "grad_norm": 0.24714063107967377, "learning_rate": 0.00010208153713511517, "loss": 0.2162, "step": 3980 }, { "epoch": 1.4700886262924668, "grad_norm": 0.32045742869377136, "learning_rate": 0.00010205690355955167, "loss": 0.2485, "step": 3981 }, { "epoch": 1.4704579025110782, "grad_norm": 0.21791408956050873, "learning_rate": 0.00010203226998398818, "loss": 0.2354, "step": 3982 }, { "epoch": 1.4708271787296898, "grad_norm": 0.2862100601196289, "learning_rate": 0.00010200763640842467, "loss": 0.2308, "step": 3983 }, { "epoch": 1.4711964549483012, "grad_norm": 0.30597779154777527, "learning_rate": 0.0001019830028328612, "loss": 0.2212, "step": 3984 }, { "epoch": 1.4715657311669128, "grad_norm": 0.2577318847179413, "learning_rate": 0.00010195836925729769, "loss": 0.2214, "step": 3985 }, { "epoch": 1.4719350073855244, "grad_norm": 0.21729454398155212, "learning_rate": 0.00010193373568173422, "loss": 0.1774, "step": 3986 }, { "epoch": 1.472304283604136, "grad_norm": 0.39620912075042725, "learning_rate": 0.0001019091021061707, "loss": 0.2499, "step": 3987 }, { "epoch": 1.4726735598227474, "grad_norm": 0.30386099219322205, "learning_rate": 0.00010188446853060723, "loss": 0.2267, "step": 3988 }, { "epoch": 1.473042836041359, "grad_norm": 0.30357635021209717, "learning_rate": 0.00010185983495504372, "loss": 0.2692, "step": 3989 }, { "epoch": 1.4734121122599704, "grad_norm": 0.3040999472141266, "learning_rate": 0.00010183520137948023, "loss": 0.2825, "step": 3990 }, { "epoch": 1.473781388478582, "grad_norm": 0.27398502826690674, "learning_rate": 0.00010181056780391673, "loss": 0.2344, "step": 3991 }, { "epoch": 1.4741506646971936, "grad_norm": 0.297529935836792, "learning_rate": 0.00010178593422835325, "loss": 0.2824, "step": 3992 }, { "epoch": 1.474519940915805, "grad_norm": 0.26736244559288025, "learning_rate": 0.00010176130065278975, "loss": 0.2225, "step": 3993 }, { "epoch": 1.4748892171344166, "grad_norm": 0.25624769926071167, "learning_rate": 0.00010173666707722626, "loss": 0.2416, "step": 3994 }, { "epoch": 1.475258493353028, "grad_norm": 0.34669265151023865, "learning_rate": 0.00010171203350166277, "loss": 0.2522, "step": 3995 }, { "epoch": 1.4756277695716395, "grad_norm": 0.30900081992149353, "learning_rate": 0.00010168739992609928, "loss": 0.2328, "step": 3996 }, { "epoch": 1.4759970457902511, "grad_norm": 0.28143182396888733, "learning_rate": 0.00010166276635053578, "loss": 0.2544, "step": 3997 }, { "epoch": 1.4763663220088628, "grad_norm": 0.2547646462917328, "learning_rate": 0.0001016381327749723, "loss": 0.2429, "step": 3998 }, { "epoch": 1.4767355982274741, "grad_norm": 0.25624385476112366, "learning_rate": 0.00010161349919940878, "loss": 0.192, "step": 3999 }, { "epoch": 1.4771048744460857, "grad_norm": 0.35263240337371826, "learning_rate": 0.00010158886562384531, "loss": 0.2793, "step": 4000 }, { "epoch": 1.4771048744460857, "eval_loss": 0.2678302824497223, "eval_runtime": 5.8712, "eval_samples_per_second": 8.516, "eval_steps_per_second": 1.192, "step": 4000 }, { "epoch": 1.4774741506646971, "grad_norm": 0.26784220337867737, "learning_rate": 0.0001015642320482818, "loss": 0.2109, "step": 4001 }, { "epoch": 1.4778434268833087, "grad_norm": 0.305281400680542, "learning_rate": 0.00010153959847271833, "loss": 0.2599, "step": 4002 }, { "epoch": 1.4782127031019203, "grad_norm": 0.2713955342769623, "learning_rate": 0.00010151496489715482, "loss": 0.2653, "step": 4003 }, { "epoch": 1.4785819793205317, "grad_norm": 0.2568889856338501, "learning_rate": 0.00010149033132159133, "loss": 0.2374, "step": 4004 }, { "epoch": 1.4789512555391433, "grad_norm": 0.27562272548675537, "learning_rate": 0.00010146569774602783, "loss": 0.2313, "step": 4005 }, { "epoch": 1.4793205317577547, "grad_norm": 0.2352875918149948, "learning_rate": 0.00010144106417046435, "loss": 0.1985, "step": 4006 }, { "epoch": 1.4796898079763663, "grad_norm": 0.31400978565216064, "learning_rate": 0.00010141643059490085, "loss": 0.2322, "step": 4007 }, { "epoch": 1.480059084194978, "grad_norm": 0.2479861080646515, "learning_rate": 0.00010139179701933736, "loss": 0.2156, "step": 4008 }, { "epoch": 1.4804283604135895, "grad_norm": 0.255172461271286, "learning_rate": 0.00010136716344377386, "loss": 0.2241, "step": 4009 }, { "epoch": 1.4807976366322009, "grad_norm": 0.2996932864189148, "learning_rate": 0.00010134252986821038, "loss": 0.2656, "step": 4010 }, { "epoch": 1.4811669128508125, "grad_norm": 0.24185395240783691, "learning_rate": 0.00010131789629264688, "loss": 0.1916, "step": 4011 }, { "epoch": 1.4815361890694239, "grad_norm": 0.29753515124320984, "learning_rate": 0.00010129326271708339, "loss": 0.2564, "step": 4012 }, { "epoch": 1.4819054652880355, "grad_norm": 0.2661593556404114, "learning_rate": 0.0001012686291415199, "loss": 0.2245, "step": 4013 }, { "epoch": 1.482274741506647, "grad_norm": 0.27129074931144714, "learning_rate": 0.00010124399556595641, "loss": 0.2234, "step": 4014 }, { "epoch": 1.4826440177252584, "grad_norm": 0.2453327476978302, "learning_rate": 0.0001012193619903929, "loss": 0.2384, "step": 4015 }, { "epoch": 1.48301329394387, "grad_norm": 0.28773802518844604, "learning_rate": 0.00010119472841482942, "loss": 0.2505, "step": 4016 }, { "epoch": 1.4833825701624814, "grad_norm": 0.2755826413631439, "learning_rate": 0.00010117009483926591, "loss": 0.2131, "step": 4017 }, { "epoch": 1.483751846381093, "grad_norm": 0.2814352512359619, "learning_rate": 0.00010114546126370244, "loss": 0.217, "step": 4018 }, { "epoch": 1.4841211225997046, "grad_norm": 0.28828540444374084, "learning_rate": 0.00010112082768813893, "loss": 0.263, "step": 4019 }, { "epoch": 1.4844903988183162, "grad_norm": 0.2544025182723999, "learning_rate": 0.00010109619411257544, "loss": 0.2509, "step": 4020 }, { "epoch": 1.4848596750369276, "grad_norm": 0.2896730899810791, "learning_rate": 0.00010107156053701194, "loss": 0.2519, "step": 4021 }, { "epoch": 1.4852289512555392, "grad_norm": 0.2682430148124695, "learning_rate": 0.00010104692696144846, "loss": 0.2538, "step": 4022 }, { "epoch": 1.4855982274741506, "grad_norm": 0.24106591939926147, "learning_rate": 0.00010102229338588496, "loss": 0.2411, "step": 4023 }, { "epoch": 1.4859675036927622, "grad_norm": 0.28132450580596924, "learning_rate": 0.00010099765981032147, "loss": 0.2722, "step": 4024 }, { "epoch": 1.4863367799113738, "grad_norm": 0.22729288041591644, "learning_rate": 0.00010097302623475797, "loss": 0.1995, "step": 4025 }, { "epoch": 1.4867060561299852, "grad_norm": 0.28534844517707825, "learning_rate": 0.00010094839265919449, "loss": 0.2272, "step": 4026 }, { "epoch": 1.4870753323485968, "grad_norm": 0.2466653436422348, "learning_rate": 0.00010092375908363099, "loss": 0.1938, "step": 4027 }, { "epoch": 1.4874446085672082, "grad_norm": 0.3041146695613861, "learning_rate": 0.0001008991255080675, "loss": 0.2308, "step": 4028 }, { "epoch": 1.4878138847858198, "grad_norm": 0.27459827065467834, "learning_rate": 0.000100874491932504, "loss": 0.2349, "step": 4029 }, { "epoch": 1.4881831610044314, "grad_norm": 0.2825285494327545, "learning_rate": 0.00010084985835694052, "loss": 0.2419, "step": 4030 }, { "epoch": 1.4885524372230428, "grad_norm": 0.27188214659690857, "learning_rate": 0.00010082522478137701, "loss": 0.2105, "step": 4031 }, { "epoch": 1.4889217134416544, "grad_norm": 0.2776201665401459, "learning_rate": 0.00010080059120581354, "loss": 0.2144, "step": 4032 }, { "epoch": 1.4892909896602657, "grad_norm": 0.5040718913078308, "learning_rate": 0.00010077595763025002, "loss": 0.2785, "step": 4033 }, { "epoch": 1.4896602658788773, "grad_norm": 0.3240455687046051, "learning_rate": 0.00010075132405468655, "loss": 0.29, "step": 4034 }, { "epoch": 1.490029542097489, "grad_norm": 0.27039119601249695, "learning_rate": 0.00010072669047912304, "loss": 0.247, "step": 4035 }, { "epoch": 1.4903988183161005, "grad_norm": 0.2560168206691742, "learning_rate": 0.00010070205690355955, "loss": 0.2038, "step": 4036 }, { "epoch": 1.490768094534712, "grad_norm": 0.2725893259048462, "learning_rate": 0.00010067742332799606, "loss": 0.2249, "step": 4037 }, { "epoch": 1.4911373707533235, "grad_norm": 0.24265554547309875, "learning_rate": 0.00010065278975243257, "loss": 0.2332, "step": 4038 }, { "epoch": 1.491506646971935, "grad_norm": 0.2514985203742981, "learning_rate": 0.00010062815617686907, "loss": 0.2242, "step": 4039 }, { "epoch": 1.4918759231905465, "grad_norm": 0.359241783618927, "learning_rate": 0.00010060352260130559, "loss": 0.2744, "step": 4040 }, { "epoch": 1.4922451994091581, "grad_norm": 0.25917497277259827, "learning_rate": 0.00010057888902574209, "loss": 0.1946, "step": 4041 }, { "epoch": 1.4926144756277695, "grad_norm": 0.28097108006477356, "learning_rate": 0.0001005542554501786, "loss": 0.2319, "step": 4042 }, { "epoch": 1.492983751846381, "grad_norm": 0.29191944003105164, "learning_rate": 0.0001005296218746151, "loss": 0.2646, "step": 4043 }, { "epoch": 1.4933530280649925, "grad_norm": 0.41488784551620483, "learning_rate": 0.00010050498829905162, "loss": 0.2781, "step": 4044 }, { "epoch": 1.493722304283604, "grad_norm": 0.27251023054122925, "learning_rate": 0.00010048035472348812, "loss": 0.223, "step": 4045 }, { "epoch": 1.4940915805022157, "grad_norm": 0.22081872820854187, "learning_rate": 0.00010045572114792463, "loss": 0.2026, "step": 4046 }, { "epoch": 1.4944608567208273, "grad_norm": 0.2805134057998657, "learning_rate": 0.00010043108757236112, "loss": 0.255, "step": 4047 }, { "epoch": 1.4948301329394387, "grad_norm": 0.2585667371749878, "learning_rate": 0.00010040645399679765, "loss": 0.227, "step": 4048 }, { "epoch": 1.4951994091580503, "grad_norm": 0.32681065797805786, "learning_rate": 0.00010038182042123414, "loss": 0.2357, "step": 4049 }, { "epoch": 1.4955686853766617, "grad_norm": 0.2742816209793091, "learning_rate": 0.00010035718684567066, "loss": 0.2469, "step": 4050 }, { "epoch": 1.4955686853766617, "eval_loss": 0.26486024260520935, "eval_runtime": 5.8567, "eval_samples_per_second": 8.537, "eval_steps_per_second": 1.195, "step": 4050 }, { "epoch": 1.4959379615952733, "grad_norm": 0.25103434920310974, "learning_rate": 0.00010033255327010715, "loss": 0.2167, "step": 4051 }, { "epoch": 1.4963072378138849, "grad_norm": 0.25343605875968933, "learning_rate": 0.00010030791969454367, "loss": 0.2016, "step": 4052 }, { "epoch": 1.4966765140324962, "grad_norm": 0.3064647614955902, "learning_rate": 0.00010028328611898017, "loss": 0.2382, "step": 4053 }, { "epoch": 1.4970457902511078, "grad_norm": 0.3272014558315277, "learning_rate": 0.00010025865254341668, "loss": 0.2299, "step": 4054 }, { "epoch": 1.4974150664697192, "grad_norm": 0.2515038847923279, "learning_rate": 0.00010023401896785318, "loss": 0.2166, "step": 4055 }, { "epoch": 1.4977843426883308, "grad_norm": 0.25827375054359436, "learning_rate": 0.0001002093853922897, "loss": 0.2273, "step": 4056 }, { "epoch": 1.4981536189069424, "grad_norm": 0.2369767129421234, "learning_rate": 0.0001001847518167262, "loss": 0.1964, "step": 4057 }, { "epoch": 1.498522895125554, "grad_norm": 0.25905394554138184, "learning_rate": 0.00010016011824116271, "loss": 0.2243, "step": 4058 }, { "epoch": 1.4988921713441654, "grad_norm": 0.3252527415752411, "learning_rate": 0.00010013548466559921, "loss": 0.2476, "step": 4059 }, { "epoch": 1.499261447562777, "grad_norm": 0.2727453410625458, "learning_rate": 0.00010011085109003573, "loss": 0.2642, "step": 4060 }, { "epoch": 1.4996307237813884, "grad_norm": 0.24760130047798157, "learning_rate": 0.00010008621751447223, "loss": 0.1964, "step": 4061 }, { "epoch": 1.5, "grad_norm": 0.29868680238723755, "learning_rate": 0.00010006158393890875, "loss": 0.2332, "step": 4062 }, { "epoch": 1.5003692762186116, "grad_norm": 0.308330774307251, "learning_rate": 0.00010003695036334523, "loss": 0.2338, "step": 4063 }, { "epoch": 1.5007385524372232, "grad_norm": 0.2614595890045166, "learning_rate": 0.00010001231678778176, "loss": 0.2229, "step": 4064 }, { "epoch": 1.5011078286558346, "grad_norm": 0.23017829656600952, "learning_rate": 9.998768321221825e-05, "loss": 0.1753, "step": 4065 }, { "epoch": 1.501477104874446, "grad_norm": 0.2502877414226532, "learning_rate": 9.996304963665476e-05, "loss": 0.2181, "step": 4066 }, { "epoch": 1.5018463810930576, "grad_norm": 0.29967954754829407, "learning_rate": 9.993841606109126e-05, "loss": 0.2221, "step": 4067 }, { "epoch": 1.5022156573116692, "grad_norm": 0.2727149426937103, "learning_rate": 9.991378248552778e-05, "loss": 0.2207, "step": 4068 }, { "epoch": 1.5025849335302808, "grad_norm": 0.2796933352947235, "learning_rate": 9.988914890996428e-05, "loss": 0.2224, "step": 4069 }, { "epoch": 1.5029542097488922, "grad_norm": 0.24057498574256897, "learning_rate": 9.986451533440078e-05, "loss": 0.1949, "step": 4070 }, { "epoch": 1.5033234859675035, "grad_norm": 0.28947803378105164, "learning_rate": 9.98398817588373e-05, "loss": 0.2285, "step": 4071 }, { "epoch": 1.5036927621861151, "grad_norm": 0.27775415778160095, "learning_rate": 9.98152481832738e-05, "loss": 0.2277, "step": 4072 }, { "epoch": 1.5040620384047267, "grad_norm": 0.22638939321041107, "learning_rate": 9.979061460771031e-05, "loss": 0.1869, "step": 4073 }, { "epoch": 1.5044313146233383, "grad_norm": 0.28084293007850647, "learning_rate": 9.976598103214681e-05, "loss": 0.2281, "step": 4074 }, { "epoch": 1.5048005908419497, "grad_norm": 0.337291419506073, "learning_rate": 9.974134745658333e-05, "loss": 0.2373, "step": 4075 }, { "epoch": 1.5051698670605613, "grad_norm": 0.26597699522972107, "learning_rate": 9.971671388101983e-05, "loss": 0.2415, "step": 4076 }, { "epoch": 1.5055391432791727, "grad_norm": 0.3119012713432312, "learning_rate": 9.969208030545634e-05, "loss": 0.2505, "step": 4077 }, { "epoch": 1.5059084194977843, "grad_norm": 0.2967747449874878, "learning_rate": 9.966744672989284e-05, "loss": 0.2172, "step": 4078 }, { "epoch": 1.506277695716396, "grad_norm": 0.2876927852630615, "learning_rate": 9.964281315432935e-05, "loss": 0.225, "step": 4079 }, { "epoch": 1.5066469719350075, "grad_norm": 0.24519896507263184, "learning_rate": 9.961817957876586e-05, "loss": 0.2047, "step": 4080 }, { "epoch": 1.507016248153619, "grad_norm": 0.25056275725364685, "learning_rate": 9.959354600320236e-05, "loss": 0.2122, "step": 4081 }, { "epoch": 1.5073855243722303, "grad_norm": 0.26125919818878174, "learning_rate": 9.956891242763888e-05, "loss": 0.2156, "step": 4082 }, { "epoch": 1.5077548005908419, "grad_norm": 0.2800387740135193, "learning_rate": 9.954427885207538e-05, "loss": 0.221, "step": 4083 }, { "epoch": 1.5081240768094535, "grad_norm": 0.26222723722457886, "learning_rate": 9.951964527651189e-05, "loss": 0.2235, "step": 4084 }, { "epoch": 1.508493353028065, "grad_norm": 0.3005081117153168, "learning_rate": 9.949501170094839e-05, "loss": 0.2423, "step": 4085 }, { "epoch": 1.5088626292466765, "grad_norm": 0.30846738815307617, "learning_rate": 9.94703781253849e-05, "loss": 0.2833, "step": 4086 }, { "epoch": 1.509231905465288, "grad_norm": 0.24236111342906952, "learning_rate": 9.944574454982141e-05, "loss": 0.2138, "step": 4087 }, { "epoch": 1.5096011816838995, "grad_norm": 0.2984859347343445, "learning_rate": 9.942111097425791e-05, "loss": 0.2585, "step": 4088 }, { "epoch": 1.509970457902511, "grad_norm": 0.32192322611808777, "learning_rate": 9.939647739869442e-05, "loss": 0.2461, "step": 4089 }, { "epoch": 1.5103397341211227, "grad_norm": 0.3119538426399231, "learning_rate": 9.937184382313092e-05, "loss": 0.2687, "step": 4090 }, { "epoch": 1.5107090103397343, "grad_norm": 0.2933902144432068, "learning_rate": 9.934721024756744e-05, "loss": 0.2594, "step": 4091 }, { "epoch": 1.5110782865583456, "grad_norm": 0.2725570499897003, "learning_rate": 9.932257667200394e-05, "loss": 0.2278, "step": 4092 }, { "epoch": 1.511447562776957, "grad_norm": 0.23285800218582153, "learning_rate": 9.929794309644046e-05, "loss": 0.1939, "step": 4093 }, { "epoch": 1.5118168389955686, "grad_norm": 0.2728244960308075, "learning_rate": 9.927330952087696e-05, "loss": 0.2354, "step": 4094 }, { "epoch": 1.5121861152141802, "grad_norm": 0.2810523509979248, "learning_rate": 9.924867594531346e-05, "loss": 0.2171, "step": 4095 }, { "epoch": 1.5125553914327918, "grad_norm": 0.31245458126068115, "learning_rate": 9.922404236974997e-05, "loss": 0.2041, "step": 4096 }, { "epoch": 1.5129246676514032, "grad_norm": 0.27133268117904663, "learning_rate": 9.919940879418647e-05, "loss": 0.2438, "step": 4097 }, { "epoch": 1.5132939438700148, "grad_norm": 0.29600241780281067, "learning_rate": 9.917477521862299e-05, "loss": 0.2176, "step": 4098 }, { "epoch": 1.5136632200886262, "grad_norm": 0.26659178733825684, "learning_rate": 9.915014164305949e-05, "loss": 0.2302, "step": 4099 }, { "epoch": 1.5140324963072378, "grad_norm": 0.3164130449295044, "learning_rate": 9.9125508067496e-05, "loss": 0.2331, "step": 4100 }, { "epoch": 1.5140324963072378, "eval_loss": 0.271483451128006, "eval_runtime": 5.8555, "eval_samples_per_second": 8.539, "eval_steps_per_second": 1.195, "step": 4100 }, { "epoch": 1.5144017725258494, "grad_norm": 0.34098005294799805, "learning_rate": 9.91008744919325e-05, "loss": 0.2351, "step": 4101 }, { "epoch": 1.514771048744461, "grad_norm": 0.2760215401649475, "learning_rate": 9.9076240916369e-05, "loss": 0.2209, "step": 4102 }, { "epoch": 1.5151403249630724, "grad_norm": 0.2948487102985382, "learning_rate": 9.905160734080552e-05, "loss": 0.2412, "step": 4103 }, { "epoch": 1.5155096011816838, "grad_norm": 0.2646871507167816, "learning_rate": 9.902697376524202e-05, "loss": 0.2237, "step": 4104 }, { "epoch": 1.5158788774002954, "grad_norm": 0.28485599160194397, "learning_rate": 9.900234018967854e-05, "loss": 0.2455, "step": 4105 }, { "epoch": 1.516248153618907, "grad_norm": 0.33545544743537903, "learning_rate": 9.897770661411504e-05, "loss": 0.2699, "step": 4106 }, { "epoch": 1.5166174298375186, "grad_norm": 0.2689124643802643, "learning_rate": 9.895307303855155e-05, "loss": 0.2091, "step": 4107 }, { "epoch": 1.51698670605613, "grad_norm": 0.2539463937282562, "learning_rate": 9.892843946298805e-05, "loss": 0.2525, "step": 4108 }, { "epoch": 1.5173559822747416, "grad_norm": 0.23484019935131073, "learning_rate": 9.890380588742457e-05, "loss": 0.2118, "step": 4109 }, { "epoch": 1.517725258493353, "grad_norm": 0.3526381552219391, "learning_rate": 9.887917231186107e-05, "loss": 0.2821, "step": 4110 }, { "epoch": 1.5180945347119645, "grad_norm": 0.2914596498012543, "learning_rate": 9.885453873629757e-05, "loss": 0.2647, "step": 4111 }, { "epoch": 1.5184638109305761, "grad_norm": 0.3403332531452179, "learning_rate": 9.882990516073408e-05, "loss": 0.2353, "step": 4112 }, { "epoch": 1.5188330871491877, "grad_norm": 0.26303350925445557, "learning_rate": 9.880527158517059e-05, "loss": 0.2439, "step": 4113 }, { "epoch": 1.5192023633677991, "grad_norm": 0.330528199672699, "learning_rate": 9.87806380096071e-05, "loss": 0.2661, "step": 4114 }, { "epoch": 1.5195716395864105, "grad_norm": 0.2686528265476227, "learning_rate": 9.87560044340436e-05, "loss": 0.2218, "step": 4115 }, { "epoch": 1.519940915805022, "grad_norm": 0.24084657430648804, "learning_rate": 9.873137085848012e-05, "loss": 0.2061, "step": 4116 }, { "epoch": 1.5203101920236337, "grad_norm": 0.23300257325172424, "learning_rate": 9.870673728291662e-05, "loss": 0.1991, "step": 4117 }, { "epoch": 1.5206794682422453, "grad_norm": 0.23994757235050201, "learning_rate": 9.868210370735312e-05, "loss": 0.2217, "step": 4118 }, { "epoch": 1.5210487444608567, "grad_norm": 0.2562894821166992, "learning_rate": 9.865747013178963e-05, "loss": 0.2143, "step": 4119 }, { "epoch": 1.5214180206794683, "grad_norm": 0.26783227920532227, "learning_rate": 9.863283655622613e-05, "loss": 0.2375, "step": 4120 }, { "epoch": 1.5217872968980797, "grad_norm": 0.2599738538265228, "learning_rate": 9.860820298066265e-05, "loss": 0.2536, "step": 4121 }, { "epoch": 1.5221565731166913, "grad_norm": 0.24114684760570526, "learning_rate": 9.858356940509915e-05, "loss": 0.2085, "step": 4122 }, { "epoch": 1.5225258493353029, "grad_norm": 0.300831139087677, "learning_rate": 9.855893582953566e-05, "loss": 0.2434, "step": 4123 }, { "epoch": 1.5228951255539145, "grad_norm": 0.24409927427768707, "learning_rate": 9.853430225397217e-05, "loss": 0.2232, "step": 4124 }, { "epoch": 1.5232644017725259, "grad_norm": 0.2840551733970642, "learning_rate": 9.850966867840868e-05, "loss": 0.2477, "step": 4125 }, { "epoch": 1.5236336779911372, "grad_norm": 0.24531321227550507, "learning_rate": 9.848503510284518e-05, "loss": 0.1941, "step": 4126 }, { "epoch": 1.5240029542097489, "grad_norm": 0.3306686580181122, "learning_rate": 9.846040152728168e-05, "loss": 0.2586, "step": 4127 }, { "epoch": 1.5243722304283605, "grad_norm": 0.3042660355567932, "learning_rate": 9.84357679517182e-05, "loss": 0.2395, "step": 4128 }, { "epoch": 1.524741506646972, "grad_norm": 0.28639450669288635, "learning_rate": 9.84111343761547e-05, "loss": 0.2309, "step": 4129 }, { "epoch": 1.5251107828655834, "grad_norm": 0.27038899064064026, "learning_rate": 9.838650080059121e-05, "loss": 0.2459, "step": 4130 }, { "epoch": 1.5254800590841948, "grad_norm": 0.3015116751194, "learning_rate": 9.836186722502771e-05, "loss": 0.248, "step": 4131 }, { "epoch": 1.5258493353028064, "grad_norm": 0.27150559425354004, "learning_rate": 9.833723364946423e-05, "loss": 0.273, "step": 4132 }, { "epoch": 1.526218611521418, "grad_norm": 0.30560436844825745, "learning_rate": 9.831260007390073e-05, "loss": 0.2358, "step": 4133 }, { "epoch": 1.5265878877400296, "grad_norm": 0.23112213611602783, "learning_rate": 9.828796649833723e-05, "loss": 0.2131, "step": 4134 }, { "epoch": 1.5269571639586412, "grad_norm": 0.24395301938056946, "learning_rate": 9.826333292277374e-05, "loss": 0.2216, "step": 4135 }, { "epoch": 1.5273264401772526, "grad_norm": 0.2670608162879944, "learning_rate": 9.823869934721025e-05, "loss": 0.1996, "step": 4136 }, { "epoch": 1.527695716395864, "grad_norm": 0.2092277556657791, "learning_rate": 9.821406577164676e-05, "loss": 0.2079, "step": 4137 }, { "epoch": 1.5280649926144756, "grad_norm": 0.31602388620376587, "learning_rate": 9.818943219608326e-05, "loss": 0.2632, "step": 4138 }, { "epoch": 1.5284342688330872, "grad_norm": 0.3431403636932373, "learning_rate": 9.816479862051978e-05, "loss": 0.2848, "step": 4139 }, { "epoch": 1.5288035450516988, "grad_norm": 0.2444666624069214, "learning_rate": 9.814016504495628e-05, "loss": 0.1999, "step": 4140 }, { "epoch": 1.5291728212703102, "grad_norm": 0.2779955565929413, "learning_rate": 9.811553146939278e-05, "loss": 0.2132, "step": 4141 }, { "epoch": 1.5295420974889216, "grad_norm": 0.40997451543807983, "learning_rate": 9.809089789382929e-05, "loss": 0.2804, "step": 4142 }, { "epoch": 1.5299113737075332, "grad_norm": 0.23134274780750275, "learning_rate": 9.80662643182658e-05, "loss": 0.2427, "step": 4143 }, { "epoch": 1.5302806499261448, "grad_norm": 0.28587669134140015, "learning_rate": 9.804163074270231e-05, "loss": 0.2208, "step": 4144 }, { "epoch": 1.5306499261447564, "grad_norm": 0.254698783159256, "learning_rate": 9.801699716713881e-05, "loss": 0.2099, "step": 4145 }, { "epoch": 1.5310192023633677, "grad_norm": 0.28558284044265747, "learning_rate": 9.799236359157532e-05, "loss": 0.2553, "step": 4146 }, { "epoch": 1.5313884785819794, "grad_norm": 0.2878744602203369, "learning_rate": 9.796773001601183e-05, "loss": 0.229, "step": 4147 }, { "epoch": 1.5317577548005907, "grad_norm": 0.26767411828041077, "learning_rate": 9.794309644044834e-05, "loss": 0.2418, "step": 4148 }, { "epoch": 1.5321270310192023, "grad_norm": 0.2922110855579376, "learning_rate": 9.791846286488484e-05, "loss": 0.2326, "step": 4149 }, { "epoch": 1.532496307237814, "grad_norm": 0.31100940704345703, "learning_rate": 9.789382928932134e-05, "loss": 0.2941, "step": 4150 }, { "epoch": 1.532496307237814, "eval_loss": 0.26925957202911377, "eval_runtime": 5.8619, "eval_samples_per_second": 8.53, "eval_steps_per_second": 1.194, "step": 4150 }, { "epoch": 1.5328655834564255, "grad_norm": 0.2562064230442047, "learning_rate": 9.786919571375786e-05, "loss": 0.218, "step": 4151 }, { "epoch": 1.533234859675037, "grad_norm": 0.28819769620895386, "learning_rate": 9.784456213819436e-05, "loss": 0.237, "step": 4152 }, { "epoch": 1.5336041358936483, "grad_norm": 0.23982426524162292, "learning_rate": 9.781992856263087e-05, "loss": 0.2009, "step": 4153 }, { "epoch": 1.53397341211226, "grad_norm": 0.33476221561431885, "learning_rate": 9.779529498706737e-05, "loss": 0.2213, "step": 4154 }, { "epoch": 1.5343426883308715, "grad_norm": 0.24757163226604462, "learning_rate": 9.777066141150389e-05, "loss": 0.2138, "step": 4155 }, { "epoch": 1.534711964549483, "grad_norm": 0.24515816569328308, "learning_rate": 9.774602783594039e-05, "loss": 0.2073, "step": 4156 }, { "epoch": 1.5350812407680945, "grad_norm": 0.35198482871055603, "learning_rate": 9.772139426037689e-05, "loss": 0.2536, "step": 4157 }, { "epoch": 1.535450516986706, "grad_norm": 0.3029174506664276, "learning_rate": 9.76967606848134e-05, "loss": 0.2366, "step": 4158 }, { "epoch": 1.5358197932053175, "grad_norm": 0.25651854276657104, "learning_rate": 9.76721271092499e-05, "loss": 0.2193, "step": 4159 }, { "epoch": 1.536189069423929, "grad_norm": 0.2608349919319153, "learning_rate": 9.764749353368642e-05, "loss": 0.23, "step": 4160 }, { "epoch": 1.5365583456425407, "grad_norm": 0.2651025354862213, "learning_rate": 9.762285995812292e-05, "loss": 0.2139, "step": 4161 }, { "epoch": 1.5369276218611523, "grad_norm": 0.3118842840194702, "learning_rate": 9.759822638255944e-05, "loss": 0.2408, "step": 4162 }, { "epoch": 1.5372968980797637, "grad_norm": 0.29180708527565, "learning_rate": 9.757359280699594e-05, "loss": 0.2316, "step": 4163 }, { "epoch": 1.537666174298375, "grad_norm": 0.2783593237400055, "learning_rate": 9.754895923143245e-05, "loss": 0.2431, "step": 4164 }, { "epoch": 1.5380354505169866, "grad_norm": 0.2899194061756134, "learning_rate": 9.752432565586895e-05, "loss": 0.2407, "step": 4165 }, { "epoch": 1.5384047267355982, "grad_norm": 0.26361024379730225, "learning_rate": 9.749969208030545e-05, "loss": 0.248, "step": 4166 }, { "epoch": 1.5387740029542099, "grad_norm": 0.2489016205072403, "learning_rate": 9.747505850474197e-05, "loss": 0.225, "step": 4167 }, { "epoch": 1.5391432791728212, "grad_norm": 0.25891217589378357, "learning_rate": 9.745042492917847e-05, "loss": 0.2284, "step": 4168 }, { "epoch": 1.5395125553914328, "grad_norm": 0.2712463140487671, "learning_rate": 9.742579135361499e-05, "loss": 0.2313, "step": 4169 }, { "epoch": 1.5398818316100442, "grad_norm": 0.278853178024292, "learning_rate": 9.740115777805149e-05, "loss": 0.2315, "step": 4170 }, { "epoch": 1.5402511078286558, "grad_norm": 0.27925750613212585, "learning_rate": 9.7376524202488e-05, "loss": 0.2541, "step": 4171 }, { "epoch": 1.5406203840472674, "grad_norm": 0.3000519871711731, "learning_rate": 9.73518906269245e-05, "loss": 0.2243, "step": 4172 }, { "epoch": 1.540989660265879, "grad_norm": 0.2703240215778351, "learning_rate": 9.7327257051361e-05, "loss": 0.2324, "step": 4173 }, { "epoch": 1.5413589364844904, "grad_norm": 0.1922842413187027, "learning_rate": 9.730262347579752e-05, "loss": 0.1967, "step": 4174 }, { "epoch": 1.5417282127031018, "grad_norm": 0.3185964524745941, "learning_rate": 9.727798990023402e-05, "loss": 0.2745, "step": 4175 }, { "epoch": 1.5420974889217134, "grad_norm": 0.268771767616272, "learning_rate": 9.725335632467053e-05, "loss": 0.2343, "step": 4176 }, { "epoch": 1.542466765140325, "grad_norm": 0.31947267055511475, "learning_rate": 9.722872274910703e-05, "loss": 0.2212, "step": 4177 }, { "epoch": 1.5428360413589366, "grad_norm": 0.265396386384964, "learning_rate": 9.720408917354355e-05, "loss": 0.2335, "step": 4178 }, { "epoch": 1.543205317577548, "grad_norm": 0.2607923448085785, "learning_rate": 9.717945559798005e-05, "loss": 0.2071, "step": 4179 }, { "epoch": 1.5435745937961596, "grad_norm": 0.2686561346054077, "learning_rate": 9.715482202241656e-05, "loss": 0.2312, "step": 4180 }, { "epoch": 1.543943870014771, "grad_norm": 0.2784029245376587, "learning_rate": 9.713018844685307e-05, "loss": 0.2427, "step": 4181 }, { "epoch": 1.5443131462333826, "grad_norm": 0.33051469922065735, "learning_rate": 9.710555487128957e-05, "loss": 0.2471, "step": 4182 }, { "epoch": 1.5446824224519942, "grad_norm": 0.2989259958267212, "learning_rate": 9.708092129572608e-05, "loss": 0.2369, "step": 4183 }, { "epoch": 1.5450516986706058, "grad_norm": 0.28553035855293274, "learning_rate": 9.705628772016258e-05, "loss": 0.2339, "step": 4184 }, { "epoch": 1.5454209748892171, "grad_norm": 0.2696689963340759, "learning_rate": 9.70316541445991e-05, "loss": 0.2452, "step": 4185 }, { "epoch": 1.5457902511078285, "grad_norm": 0.27339741587638855, "learning_rate": 9.70070205690356e-05, "loss": 0.2686, "step": 4186 }, { "epoch": 1.5461595273264401, "grad_norm": 0.2567724883556366, "learning_rate": 9.698238699347211e-05, "loss": 0.2203, "step": 4187 }, { "epoch": 1.5465288035450517, "grad_norm": 0.26803749799728394, "learning_rate": 9.695775341790861e-05, "loss": 0.2262, "step": 4188 }, { "epoch": 1.5468980797636633, "grad_norm": 0.28053510189056396, "learning_rate": 9.693311984234512e-05, "loss": 0.236, "step": 4189 }, { "epoch": 1.5472673559822747, "grad_norm": 0.27617505192756653, "learning_rate": 9.690848626678163e-05, "loss": 0.2536, "step": 4190 }, { "epoch": 1.547636632200886, "grad_norm": 0.2775917649269104, "learning_rate": 9.688385269121813e-05, "loss": 0.2638, "step": 4191 }, { "epoch": 1.5480059084194977, "grad_norm": 0.23348627984523773, "learning_rate": 9.685921911565465e-05, "loss": 0.2177, "step": 4192 }, { "epoch": 1.5483751846381093, "grad_norm": 0.3035806715488434, "learning_rate": 9.683458554009115e-05, "loss": 0.2106, "step": 4193 }, { "epoch": 1.548744460856721, "grad_norm": 0.2474101036787033, "learning_rate": 9.680995196452766e-05, "loss": 0.2106, "step": 4194 }, { "epoch": 1.5491137370753325, "grad_norm": 0.2989620268344879, "learning_rate": 9.678531838896416e-05, "loss": 0.2498, "step": 4195 }, { "epoch": 1.549483013293944, "grad_norm": 0.2706199884414673, "learning_rate": 9.676068481340068e-05, "loss": 0.2492, "step": 4196 }, { "epoch": 1.5498522895125553, "grad_norm": 0.2636919617652893, "learning_rate": 9.673605123783718e-05, "loss": 0.2539, "step": 4197 }, { "epoch": 1.5502215657311669, "grad_norm": 0.25813478231430054, "learning_rate": 9.671141766227368e-05, "loss": 0.2344, "step": 4198 }, { "epoch": 1.5505908419497785, "grad_norm": 0.26605188846588135, "learning_rate": 9.66867840867102e-05, "loss": 0.2125, "step": 4199 }, { "epoch": 1.55096011816839, "grad_norm": 0.28155773878097534, "learning_rate": 9.66621505111467e-05, "loss": 0.2536, "step": 4200 }, { "epoch": 1.55096011816839, "eval_loss": 0.26491570472717285, "eval_runtime": 5.856, "eval_samples_per_second": 8.538, "eval_steps_per_second": 1.195, "step": 4200 }, { "epoch": 1.5513293943870015, "grad_norm": 0.27864086627960205, "learning_rate": 9.663751693558321e-05, "loss": 0.3091, "step": 4201 }, { "epoch": 1.5516986706056128, "grad_norm": 0.27152353525161743, "learning_rate": 9.661288336001971e-05, "loss": 0.2605, "step": 4202 }, { "epoch": 1.5520679468242244, "grad_norm": 0.27896204590797424, "learning_rate": 9.658824978445623e-05, "loss": 0.2688, "step": 4203 }, { "epoch": 1.552437223042836, "grad_norm": 0.2809586822986603, "learning_rate": 9.656361620889273e-05, "loss": 0.2211, "step": 4204 }, { "epoch": 1.5528064992614476, "grad_norm": 0.3071771264076233, "learning_rate": 9.653898263332923e-05, "loss": 0.25, "step": 4205 }, { "epoch": 1.553175775480059, "grad_norm": 0.23369184136390686, "learning_rate": 9.651434905776574e-05, "loss": 0.219, "step": 4206 }, { "epoch": 1.5535450516986706, "grad_norm": 0.27824506163597107, "learning_rate": 9.648971548220224e-05, "loss": 0.248, "step": 4207 }, { "epoch": 1.553914327917282, "grad_norm": 0.3109837472438812, "learning_rate": 9.646508190663876e-05, "loss": 0.2723, "step": 4208 }, { "epoch": 1.5542836041358936, "grad_norm": 0.322033166885376, "learning_rate": 9.644044833107526e-05, "loss": 0.2423, "step": 4209 }, { "epoch": 1.5546528803545052, "grad_norm": 0.2386239618062973, "learning_rate": 9.641581475551177e-05, "loss": 0.2235, "step": 4210 }, { "epoch": 1.5550221565731168, "grad_norm": 0.29537051916122437, "learning_rate": 9.639118117994827e-05, "loss": 0.1921, "step": 4211 }, { "epoch": 1.5553914327917282, "grad_norm": 0.2355179637670517, "learning_rate": 9.636654760438479e-05, "loss": 0.2129, "step": 4212 }, { "epoch": 1.5557607090103396, "grad_norm": 0.26878058910369873, "learning_rate": 9.634191402882129e-05, "loss": 0.2275, "step": 4213 }, { "epoch": 1.5561299852289512, "grad_norm": 0.5514440536499023, "learning_rate": 9.631728045325779e-05, "loss": 0.2754, "step": 4214 }, { "epoch": 1.5564992614475628, "grad_norm": 0.2654857039451599, "learning_rate": 9.62926468776943e-05, "loss": 0.2261, "step": 4215 }, { "epoch": 1.5568685376661744, "grad_norm": 0.25727444887161255, "learning_rate": 9.626801330213081e-05, "loss": 0.2622, "step": 4216 }, { "epoch": 1.5572378138847858, "grad_norm": 0.2708493769168854, "learning_rate": 9.624337972656732e-05, "loss": 0.2145, "step": 4217 }, { "epoch": 1.5576070901033974, "grad_norm": 0.30159351229667664, "learning_rate": 9.621874615100382e-05, "loss": 0.2645, "step": 4218 }, { "epoch": 1.5579763663220088, "grad_norm": 0.29582643508911133, "learning_rate": 9.619411257544034e-05, "loss": 0.3205, "step": 4219 }, { "epoch": 1.5583456425406204, "grad_norm": 0.2497013956308365, "learning_rate": 9.616947899987684e-05, "loss": 0.2433, "step": 4220 }, { "epoch": 1.558714918759232, "grad_norm": 0.2375938892364502, "learning_rate": 9.614484542431334e-05, "loss": 0.2153, "step": 4221 }, { "epoch": 1.5590841949778436, "grad_norm": 0.2669118046760559, "learning_rate": 9.612021184874985e-05, "loss": 0.2114, "step": 4222 }, { "epoch": 1.559453471196455, "grad_norm": 0.3310312330722809, "learning_rate": 9.609557827318636e-05, "loss": 0.2379, "step": 4223 }, { "epoch": 1.5598227474150663, "grad_norm": 0.2724936008453369, "learning_rate": 9.607094469762287e-05, "loss": 0.2079, "step": 4224 }, { "epoch": 1.560192023633678, "grad_norm": 0.326214075088501, "learning_rate": 9.604631112205937e-05, "loss": 0.233, "step": 4225 }, { "epoch": 1.5605612998522895, "grad_norm": 0.2730662524700165, "learning_rate": 9.602167754649589e-05, "loss": 0.2331, "step": 4226 }, { "epoch": 1.5609305760709011, "grad_norm": 0.2563663721084595, "learning_rate": 9.599704397093239e-05, "loss": 0.2211, "step": 4227 }, { "epoch": 1.5612998522895125, "grad_norm": 0.24609240889549255, "learning_rate": 9.597241039536889e-05, "loss": 0.2223, "step": 4228 }, { "epoch": 1.5616691285081241, "grad_norm": 0.2626035213470459, "learning_rate": 9.59477768198054e-05, "loss": 0.2328, "step": 4229 }, { "epoch": 1.5620384047267355, "grad_norm": 0.24416084587574005, "learning_rate": 9.59231432442419e-05, "loss": 0.2087, "step": 4230 }, { "epoch": 1.562407680945347, "grad_norm": 0.25252941250801086, "learning_rate": 9.589850966867842e-05, "loss": 0.2319, "step": 4231 }, { "epoch": 1.5627769571639587, "grad_norm": 0.2506401538848877, "learning_rate": 9.587387609311492e-05, "loss": 0.2246, "step": 4232 }, { "epoch": 1.5631462333825703, "grad_norm": 0.29225239157676697, "learning_rate": 9.584924251755143e-05, "loss": 0.234, "step": 4233 }, { "epoch": 1.5635155096011817, "grad_norm": 0.2837737500667572, "learning_rate": 9.582460894198794e-05, "loss": 0.2275, "step": 4234 }, { "epoch": 1.563884785819793, "grad_norm": 0.26541614532470703, "learning_rate": 9.579997536642445e-05, "loss": 0.232, "step": 4235 }, { "epoch": 1.5642540620384047, "grad_norm": 0.24503584206104279, "learning_rate": 9.577534179086095e-05, "loss": 0.2246, "step": 4236 }, { "epoch": 1.5646233382570163, "grad_norm": 0.282164067029953, "learning_rate": 9.575070821529745e-05, "loss": 0.2383, "step": 4237 }, { "epoch": 1.5649926144756279, "grad_norm": 0.2575039565563202, "learning_rate": 9.572607463973397e-05, "loss": 0.2191, "step": 4238 }, { "epoch": 1.5653618906942393, "grad_norm": 0.17597277462482452, "learning_rate": 9.570144106417047e-05, "loss": 0.1631, "step": 4239 }, { "epoch": 1.5657311669128509, "grad_norm": 0.3703117370605469, "learning_rate": 9.567680748860698e-05, "loss": 0.2321, "step": 4240 }, { "epoch": 1.5661004431314622, "grad_norm": 0.24686135351657867, "learning_rate": 9.565217391304348e-05, "loss": 0.2189, "step": 4241 }, { "epoch": 1.5664697193500738, "grad_norm": 0.28961747884750366, "learning_rate": 9.562754033748e-05, "loss": 0.2488, "step": 4242 }, { "epoch": 1.5668389955686854, "grad_norm": 0.25977790355682373, "learning_rate": 9.56029067619165e-05, "loss": 0.2177, "step": 4243 }, { "epoch": 1.567208271787297, "grad_norm": 0.27536264061927795, "learning_rate": 9.5578273186353e-05, "loss": 0.2142, "step": 4244 }, { "epoch": 1.5675775480059084, "grad_norm": 0.3315596580505371, "learning_rate": 9.555363961078951e-05, "loss": 0.2689, "step": 4245 }, { "epoch": 1.5679468242245198, "grad_norm": 0.35292044281959534, "learning_rate": 9.552900603522602e-05, "loss": 0.2434, "step": 4246 }, { "epoch": 1.5683161004431314, "grad_norm": 0.29545414447784424, "learning_rate": 9.550437245966253e-05, "loss": 0.2612, "step": 4247 }, { "epoch": 1.568685376661743, "grad_norm": 0.2903496026992798, "learning_rate": 9.547973888409903e-05, "loss": 0.2248, "step": 4248 }, { "epoch": 1.5690546528803546, "grad_norm": 0.3032342493534088, "learning_rate": 9.545510530853555e-05, "loss": 0.2754, "step": 4249 }, { "epoch": 1.569423929098966, "grad_norm": 0.23315784335136414, "learning_rate": 9.543047173297205e-05, "loss": 0.1979, "step": 4250 }, { "epoch": 1.569423929098966, "eval_loss": 0.26710450649261475, "eval_runtime": 5.8493, "eval_samples_per_second": 8.548, "eval_steps_per_second": 1.197, "step": 4250 }, { "epoch": 1.5697932053175776, "grad_norm": 0.24444057047367096, "learning_rate": 9.540583815740856e-05, "loss": 0.2093, "step": 4251 }, { "epoch": 1.570162481536189, "grad_norm": 0.30540236830711365, "learning_rate": 9.538120458184506e-05, "loss": 0.2497, "step": 4252 }, { "epoch": 1.5705317577548006, "grad_norm": 0.3088052272796631, "learning_rate": 9.535657100628156e-05, "loss": 0.2237, "step": 4253 }, { "epoch": 1.5709010339734122, "grad_norm": 0.3352174460887909, "learning_rate": 9.533193743071808e-05, "loss": 0.2535, "step": 4254 }, { "epoch": 1.5712703101920238, "grad_norm": 0.2540420889854431, "learning_rate": 9.530730385515458e-05, "loss": 0.2359, "step": 4255 }, { "epoch": 1.5716395864106352, "grad_norm": 0.2622866630554199, "learning_rate": 9.52826702795911e-05, "loss": 0.2152, "step": 4256 }, { "epoch": 1.5720088626292466, "grad_norm": 0.30007508397102356, "learning_rate": 9.52580367040276e-05, "loss": 0.2747, "step": 4257 }, { "epoch": 1.5723781388478582, "grad_norm": 0.3477882146835327, "learning_rate": 9.523340312846411e-05, "loss": 0.2534, "step": 4258 }, { "epoch": 1.5727474150664698, "grad_norm": 0.26416993141174316, "learning_rate": 9.520876955290061e-05, "loss": 0.184, "step": 4259 }, { "epoch": 1.5731166912850814, "grad_norm": 0.29270222783088684, "learning_rate": 9.518413597733711e-05, "loss": 0.2596, "step": 4260 }, { "epoch": 1.5734859675036927, "grad_norm": 0.2309563308954239, "learning_rate": 9.515950240177363e-05, "loss": 0.1776, "step": 4261 }, { "epoch": 1.5738552437223041, "grad_norm": 0.3029315173625946, "learning_rate": 9.513486882621013e-05, "loss": 0.2582, "step": 4262 }, { "epoch": 1.5742245199409157, "grad_norm": 0.27996766567230225, "learning_rate": 9.511023525064664e-05, "loss": 0.224, "step": 4263 }, { "epoch": 1.5745937961595273, "grad_norm": 0.25720569491386414, "learning_rate": 9.508560167508314e-05, "loss": 0.203, "step": 4264 }, { "epoch": 1.574963072378139, "grad_norm": 0.3253585696220398, "learning_rate": 9.506096809951966e-05, "loss": 0.2461, "step": 4265 }, { "epoch": 1.5753323485967505, "grad_norm": 0.30563682317733765, "learning_rate": 9.503633452395616e-05, "loss": 0.2321, "step": 4266 }, { "epoch": 1.575701624815362, "grad_norm": 0.26521727442741394, "learning_rate": 9.501170094839267e-05, "loss": 0.2599, "step": 4267 }, { "epoch": 1.5760709010339733, "grad_norm": 0.3069210350513458, "learning_rate": 9.498706737282918e-05, "loss": 0.245, "step": 4268 }, { "epoch": 1.576440177252585, "grad_norm": 0.2539284825325012, "learning_rate": 9.496243379726568e-05, "loss": 0.1993, "step": 4269 }, { "epoch": 1.5768094534711965, "grad_norm": 0.2633403539657593, "learning_rate": 9.493780022170219e-05, "loss": 0.2012, "step": 4270 }, { "epoch": 1.577178729689808, "grad_norm": 0.30276525020599365, "learning_rate": 9.491316664613869e-05, "loss": 0.2738, "step": 4271 }, { "epoch": 1.5775480059084195, "grad_norm": 0.24089419841766357, "learning_rate": 9.488853307057521e-05, "loss": 0.2213, "step": 4272 }, { "epoch": 1.5779172821270309, "grad_norm": 0.29699668288230896, "learning_rate": 9.486389949501171e-05, "loss": 0.2707, "step": 4273 }, { "epoch": 1.5782865583456425, "grad_norm": 0.2622053623199463, "learning_rate": 9.483926591944822e-05, "loss": 0.2418, "step": 4274 }, { "epoch": 1.578655834564254, "grad_norm": 0.29047226905822754, "learning_rate": 9.481463234388472e-05, "loss": 0.2224, "step": 4275 }, { "epoch": 1.5790251107828657, "grad_norm": 0.3342186510562897, "learning_rate": 9.478999876832122e-05, "loss": 0.2878, "step": 4276 }, { "epoch": 1.579394387001477, "grad_norm": 0.24895580112934113, "learning_rate": 9.476536519275774e-05, "loss": 0.1882, "step": 4277 }, { "epoch": 1.5797636632200887, "grad_norm": 0.3772604763507843, "learning_rate": 9.474073161719424e-05, "loss": 0.2985, "step": 4278 }, { "epoch": 1.5801329394387, "grad_norm": 0.260893315076828, "learning_rate": 9.471609804163076e-05, "loss": 0.2064, "step": 4279 }, { "epoch": 1.5805022156573116, "grad_norm": 0.27421626448631287, "learning_rate": 9.469146446606726e-05, "loss": 0.2387, "step": 4280 }, { "epoch": 1.5808714918759232, "grad_norm": 0.27801311016082764, "learning_rate": 9.466683089050377e-05, "loss": 0.2703, "step": 4281 }, { "epoch": 1.5812407680945348, "grad_norm": 0.2561623156070709, "learning_rate": 9.464219731494027e-05, "loss": 0.2304, "step": 4282 }, { "epoch": 1.5816100443131462, "grad_norm": 0.26812848448753357, "learning_rate": 9.461756373937679e-05, "loss": 0.231, "step": 4283 }, { "epoch": 1.5819793205317576, "grad_norm": 0.3009648025035858, "learning_rate": 9.459293016381329e-05, "loss": 0.2445, "step": 4284 }, { "epoch": 1.5823485967503692, "grad_norm": 0.29099470376968384, "learning_rate": 9.456829658824979e-05, "loss": 0.215, "step": 4285 }, { "epoch": 1.5827178729689808, "grad_norm": 0.2548910677433014, "learning_rate": 9.45436630126863e-05, "loss": 0.2143, "step": 4286 }, { "epoch": 1.5830871491875924, "grad_norm": 0.5736655592918396, "learning_rate": 9.45190294371228e-05, "loss": 0.2081, "step": 4287 }, { "epoch": 1.5834564254062038, "grad_norm": 0.2578420341014862, "learning_rate": 9.449439586155932e-05, "loss": 0.215, "step": 4288 }, { "epoch": 1.5838257016248154, "grad_norm": 0.3453446924686432, "learning_rate": 9.446976228599582e-05, "loss": 0.2783, "step": 4289 }, { "epoch": 1.5841949778434268, "grad_norm": 0.31030216813087463, "learning_rate": 9.444512871043233e-05, "loss": 0.2469, "step": 4290 }, { "epoch": 1.5845642540620384, "grad_norm": 0.283894807100296, "learning_rate": 9.442049513486884e-05, "loss": 0.2173, "step": 4291 }, { "epoch": 1.58493353028065, "grad_norm": 0.20466217398643494, "learning_rate": 9.439586155930534e-05, "loss": 0.1964, "step": 4292 }, { "epoch": 1.5853028064992616, "grad_norm": 0.33935806155204773, "learning_rate": 9.437122798374185e-05, "loss": 0.2731, "step": 4293 }, { "epoch": 1.585672082717873, "grad_norm": 0.3064669370651245, "learning_rate": 9.434659440817835e-05, "loss": 0.2202, "step": 4294 }, { "epoch": 1.5860413589364843, "grad_norm": 0.31215453147888184, "learning_rate": 9.432196083261487e-05, "loss": 0.2566, "step": 4295 }, { "epoch": 1.586410635155096, "grad_norm": 0.24009639024734497, "learning_rate": 9.429732725705137e-05, "loss": 0.1968, "step": 4296 }, { "epoch": 1.5867799113737076, "grad_norm": 0.34021276235580444, "learning_rate": 9.427269368148787e-05, "loss": 0.302, "step": 4297 }, { "epoch": 1.5871491875923192, "grad_norm": 0.2594415545463562, "learning_rate": 9.424806010592437e-05, "loss": 0.1936, "step": 4298 }, { "epoch": 1.5875184638109305, "grad_norm": 0.26243454217910767, "learning_rate": 9.422342653036089e-05, "loss": 0.2316, "step": 4299 }, { "epoch": 1.5878877400295421, "grad_norm": 0.25612276792526245, "learning_rate": 9.419879295479739e-05, "loss": 0.2599, "step": 4300 }, { "epoch": 1.5878877400295421, "eval_loss": 0.2657601833343506, "eval_runtime": 5.8652, "eval_samples_per_second": 8.525, "eval_steps_per_second": 1.193, "step": 4300 }, { "epoch": 1.5882570162481535, "grad_norm": 0.2628798186779022, "learning_rate": 9.41741593792339e-05, "loss": 0.2186, "step": 4301 }, { "epoch": 1.5886262924667651, "grad_norm": 0.3352354168891907, "learning_rate": 9.41495258036704e-05, "loss": 0.2316, "step": 4302 }, { "epoch": 1.5889955686853767, "grad_norm": 0.2656700313091278, "learning_rate": 9.41248922281069e-05, "loss": 0.2315, "step": 4303 }, { "epoch": 1.5893648449039883, "grad_norm": 0.291464239358902, "learning_rate": 9.410025865254342e-05, "loss": 0.2204, "step": 4304 }, { "epoch": 1.5897341211225997, "grad_norm": 0.2992652654647827, "learning_rate": 9.407562507697992e-05, "loss": 0.2159, "step": 4305 }, { "epoch": 1.590103397341211, "grad_norm": 0.2757866680622101, "learning_rate": 9.405099150141643e-05, "loss": 0.2129, "step": 4306 }, { "epoch": 1.5904726735598227, "grad_norm": 0.2500758171081543, "learning_rate": 9.402635792585293e-05, "loss": 0.2006, "step": 4307 }, { "epoch": 1.5908419497784343, "grad_norm": 0.2814030051231384, "learning_rate": 9.400172435028945e-05, "loss": 0.2456, "step": 4308 }, { "epoch": 1.591211225997046, "grad_norm": 0.37658214569091797, "learning_rate": 9.397709077472595e-05, "loss": 0.2899, "step": 4309 }, { "epoch": 1.5915805022156573, "grad_norm": 0.29137834906578064, "learning_rate": 9.395245719916245e-05, "loss": 0.2364, "step": 4310 }, { "epoch": 1.5919497784342689, "grad_norm": 0.28656837344169617, "learning_rate": 9.392782362359897e-05, "loss": 0.2448, "step": 4311 }, { "epoch": 1.5923190546528803, "grad_norm": 0.21193453669548035, "learning_rate": 9.390319004803547e-05, "loss": 0.1986, "step": 4312 }, { "epoch": 1.5926883308714919, "grad_norm": 0.22614000737667084, "learning_rate": 9.387855647247198e-05, "loss": 0.1926, "step": 4313 }, { "epoch": 1.5930576070901035, "grad_norm": 0.34158673882484436, "learning_rate": 9.385392289690848e-05, "loss": 0.2315, "step": 4314 }, { "epoch": 1.593426883308715, "grad_norm": 0.274847149848938, "learning_rate": 9.3829289321345e-05, "loss": 0.2177, "step": 4315 }, { "epoch": 1.5937961595273265, "grad_norm": 0.2810226380825043, "learning_rate": 9.38046557457815e-05, "loss": 0.2522, "step": 4316 }, { "epoch": 1.5941654357459378, "grad_norm": 0.33540499210357666, "learning_rate": 9.378002217021801e-05, "loss": 0.2631, "step": 4317 }, { "epoch": 1.5945347119645494, "grad_norm": 0.3179091215133667, "learning_rate": 9.375538859465451e-05, "loss": 0.2796, "step": 4318 }, { "epoch": 1.594903988183161, "grad_norm": 0.2694183588027954, "learning_rate": 9.373075501909102e-05, "loss": 0.2675, "step": 4319 }, { "epoch": 1.5952732644017726, "grad_norm": 0.26848018169403076, "learning_rate": 9.370612144352753e-05, "loss": 0.2381, "step": 4320 }, { "epoch": 1.595642540620384, "grad_norm": 0.27807292342185974, "learning_rate": 9.368148786796403e-05, "loss": 0.2211, "step": 4321 }, { "epoch": 1.5960118168389956, "grad_norm": 0.37645626068115234, "learning_rate": 9.365685429240055e-05, "loss": 0.3056, "step": 4322 }, { "epoch": 1.596381093057607, "grad_norm": 0.2805720567703247, "learning_rate": 9.363222071683705e-05, "loss": 0.226, "step": 4323 }, { "epoch": 1.5967503692762186, "grad_norm": 0.2529265284538269, "learning_rate": 9.360758714127356e-05, "loss": 0.2304, "step": 4324 }, { "epoch": 1.5971196454948302, "grad_norm": 0.2973952293395996, "learning_rate": 9.358295356571006e-05, "loss": 0.2078, "step": 4325 }, { "epoch": 1.5974889217134418, "grad_norm": 0.3685002624988556, "learning_rate": 9.355831999014656e-05, "loss": 0.2365, "step": 4326 }, { "epoch": 1.5978581979320532, "grad_norm": 0.30593231320381165, "learning_rate": 9.353368641458308e-05, "loss": 0.2397, "step": 4327 }, { "epoch": 1.5982274741506646, "grad_norm": 0.287945032119751, "learning_rate": 9.350905283901958e-05, "loss": 0.2171, "step": 4328 }, { "epoch": 1.5985967503692762, "grad_norm": 0.353727787733078, "learning_rate": 9.34844192634561e-05, "loss": 0.2962, "step": 4329 }, { "epoch": 1.5989660265878878, "grad_norm": 0.27644434571266174, "learning_rate": 9.34597856878926e-05, "loss": 0.2336, "step": 4330 }, { "epoch": 1.5993353028064994, "grad_norm": 0.30194446444511414, "learning_rate": 9.343515211232911e-05, "loss": 0.2742, "step": 4331 }, { "epoch": 1.5997045790251108, "grad_norm": 0.2946053743362427, "learning_rate": 9.341051853676561e-05, "loss": 0.2354, "step": 4332 }, { "epoch": 1.6000738552437221, "grad_norm": 0.28401628136634827, "learning_rate": 9.338588496120213e-05, "loss": 0.2406, "step": 4333 }, { "epoch": 1.6004431314623337, "grad_norm": 0.27871960401535034, "learning_rate": 9.336125138563863e-05, "loss": 0.2448, "step": 4334 }, { "epoch": 1.6008124076809453, "grad_norm": 0.29729315638542175, "learning_rate": 9.333661781007513e-05, "loss": 0.2727, "step": 4335 }, { "epoch": 1.601181683899557, "grad_norm": 0.2805337905883789, "learning_rate": 9.331198423451164e-05, "loss": 0.2661, "step": 4336 }, { "epoch": 1.6015509601181686, "grad_norm": 0.26619741320610046, "learning_rate": 9.328735065894814e-05, "loss": 0.2459, "step": 4337 }, { "epoch": 1.60192023633678, "grad_norm": 0.2586030662059784, "learning_rate": 9.326271708338466e-05, "loss": 0.2413, "step": 4338 }, { "epoch": 1.6022895125553913, "grad_norm": 0.28171306848526, "learning_rate": 9.323808350782116e-05, "loss": 0.2425, "step": 4339 }, { "epoch": 1.602658788774003, "grad_norm": 0.2730322480201721, "learning_rate": 9.321344993225767e-05, "loss": 0.231, "step": 4340 }, { "epoch": 1.6030280649926145, "grad_norm": 0.2533946633338928, "learning_rate": 9.318881635669417e-05, "loss": 0.2484, "step": 4341 }, { "epoch": 1.6033973412112261, "grad_norm": 0.3294993042945862, "learning_rate": 9.316418278113068e-05, "loss": 0.2564, "step": 4342 }, { "epoch": 1.6037666174298375, "grad_norm": 0.2796359360218048, "learning_rate": 9.313954920556719e-05, "loss": 0.2769, "step": 4343 }, { "epoch": 1.6041358936484489, "grad_norm": 0.3053548336029053, "learning_rate": 9.311491563000369e-05, "loss": 0.2712, "step": 4344 }, { "epoch": 1.6045051698670605, "grad_norm": 0.23783549666404724, "learning_rate": 9.30902820544402e-05, "loss": 0.2177, "step": 4345 }, { "epoch": 1.604874446085672, "grad_norm": 0.2943350076675415, "learning_rate": 9.306564847887671e-05, "loss": 0.2202, "step": 4346 }, { "epoch": 1.6052437223042837, "grad_norm": 0.29149776697158813, "learning_rate": 9.304101490331322e-05, "loss": 0.231, "step": 4347 }, { "epoch": 1.605612998522895, "grad_norm": 0.2750820219516754, "learning_rate": 9.301638132774972e-05, "loss": 0.2227, "step": 4348 }, { "epoch": 1.6059822747415067, "grad_norm": 0.2783328890800476, "learning_rate": 9.299174775218624e-05, "loss": 0.223, "step": 4349 }, { "epoch": 1.606351550960118, "grad_norm": 0.23471657931804657, "learning_rate": 9.296711417662274e-05, "loss": 0.2069, "step": 4350 }, { "epoch": 1.606351550960118, "eval_loss": 0.2663235068321228, "eval_runtime": 5.8608, "eval_samples_per_second": 8.531, "eval_steps_per_second": 1.194, "step": 4350 }, { "epoch": 1.6067208271787297, "grad_norm": 0.3521765470504761, "learning_rate": 9.294248060105924e-05, "loss": 0.2556, "step": 4351 }, { "epoch": 1.6070901033973413, "grad_norm": 0.307546466588974, "learning_rate": 9.291784702549575e-05, "loss": 0.2705, "step": 4352 }, { "epoch": 1.6074593796159529, "grad_norm": 0.31380757689476013, "learning_rate": 9.289321344993226e-05, "loss": 0.2548, "step": 4353 }, { "epoch": 1.6078286558345642, "grad_norm": 0.20991113781929016, "learning_rate": 9.286857987436877e-05, "loss": 0.1966, "step": 4354 }, { "epoch": 1.6081979320531756, "grad_norm": 0.2921874225139618, "learning_rate": 9.284394629880527e-05, "loss": 0.2539, "step": 4355 }, { "epoch": 1.6085672082717872, "grad_norm": 0.2973816692829132, "learning_rate": 9.281931272324179e-05, "loss": 0.2543, "step": 4356 }, { "epoch": 1.6089364844903988, "grad_norm": 0.23171468079090118, "learning_rate": 9.279467914767829e-05, "loss": 0.2032, "step": 4357 }, { "epoch": 1.6093057607090104, "grad_norm": 0.28898993134498596, "learning_rate": 9.277004557211479e-05, "loss": 0.2166, "step": 4358 }, { "epoch": 1.6096750369276218, "grad_norm": 0.3716563284397125, "learning_rate": 9.27454119965513e-05, "loss": 0.2859, "step": 4359 }, { "epoch": 1.6100443131462334, "grad_norm": 0.3015817403793335, "learning_rate": 9.27207784209878e-05, "loss": 0.2055, "step": 4360 }, { "epoch": 1.6104135893648448, "grad_norm": 0.26564592123031616, "learning_rate": 9.269614484542432e-05, "loss": 0.1959, "step": 4361 }, { "epoch": 1.6107828655834564, "grad_norm": 0.3186861574649811, "learning_rate": 9.267151126986082e-05, "loss": 0.26, "step": 4362 }, { "epoch": 1.611152141802068, "grad_norm": 0.28396859765052795, "learning_rate": 9.264687769429733e-05, "loss": 0.2461, "step": 4363 }, { "epoch": 1.6115214180206796, "grad_norm": 0.2832779288291931, "learning_rate": 9.262224411873384e-05, "loss": 0.2233, "step": 4364 }, { "epoch": 1.611890694239291, "grad_norm": 0.28067728877067566, "learning_rate": 9.259761054317035e-05, "loss": 0.243, "step": 4365 }, { "epoch": 1.6122599704579024, "grad_norm": 0.29506027698516846, "learning_rate": 9.257297696760685e-05, "loss": 0.2475, "step": 4366 }, { "epoch": 1.612629246676514, "grad_norm": 0.2841379940509796, "learning_rate": 9.254834339204335e-05, "loss": 0.243, "step": 4367 }, { "epoch": 1.6129985228951256, "grad_norm": 0.28466540575027466, "learning_rate": 9.252370981647987e-05, "loss": 0.2117, "step": 4368 }, { "epoch": 1.6133677991137372, "grad_norm": 0.32267874479293823, "learning_rate": 9.249907624091637e-05, "loss": 0.27, "step": 4369 }, { "epoch": 1.6137370753323486, "grad_norm": 0.2639078199863434, "learning_rate": 9.247444266535288e-05, "loss": 0.2213, "step": 4370 }, { "epoch": 1.6141063515509602, "grad_norm": 0.3134765625, "learning_rate": 9.244980908978938e-05, "loss": 0.2457, "step": 4371 }, { "epoch": 1.6144756277695715, "grad_norm": 0.2824104130268097, "learning_rate": 9.24251755142259e-05, "loss": 0.2116, "step": 4372 }, { "epoch": 1.6148449039881831, "grad_norm": 0.21312622725963593, "learning_rate": 9.24005419386624e-05, "loss": 0.197, "step": 4373 }, { "epoch": 1.6152141802067947, "grad_norm": 0.2423839569091797, "learning_rate": 9.23759083630989e-05, "loss": 0.2054, "step": 4374 }, { "epoch": 1.6155834564254064, "grad_norm": 0.2941286861896515, "learning_rate": 9.235127478753542e-05, "loss": 0.2178, "step": 4375 }, { "epoch": 1.6159527326440177, "grad_norm": 0.28134721517562866, "learning_rate": 9.232664121197192e-05, "loss": 0.2427, "step": 4376 }, { "epoch": 1.6163220088626291, "grad_norm": 0.2671574354171753, "learning_rate": 9.230200763640843e-05, "loss": 0.2322, "step": 4377 }, { "epoch": 1.6166912850812407, "grad_norm": 0.2510659992694855, "learning_rate": 9.227737406084493e-05, "loss": 0.2012, "step": 4378 }, { "epoch": 1.6170605612998523, "grad_norm": 0.28398624062538147, "learning_rate": 9.225274048528145e-05, "loss": 0.2313, "step": 4379 }, { "epoch": 1.617429837518464, "grad_norm": 0.260732501745224, "learning_rate": 9.222810690971795e-05, "loss": 0.2456, "step": 4380 }, { "epoch": 1.6177991137370753, "grad_norm": 0.255723774433136, "learning_rate": 9.220347333415445e-05, "loss": 0.1865, "step": 4381 }, { "epoch": 1.618168389955687, "grad_norm": 0.29838570952415466, "learning_rate": 9.217883975859096e-05, "loss": 0.2384, "step": 4382 }, { "epoch": 1.6185376661742983, "grad_norm": 0.3303617238998413, "learning_rate": 9.215420618302746e-05, "loss": 0.2708, "step": 4383 }, { "epoch": 1.6189069423929099, "grad_norm": 0.3967146873474121, "learning_rate": 9.212957260746398e-05, "loss": 0.2847, "step": 4384 }, { "epoch": 1.6192762186115215, "grad_norm": 0.2737744152545929, "learning_rate": 9.210493903190048e-05, "loss": 0.2434, "step": 4385 }, { "epoch": 1.619645494830133, "grad_norm": 0.3345085382461548, "learning_rate": 9.2080305456337e-05, "loss": 0.241, "step": 4386 }, { "epoch": 1.6200147710487445, "grad_norm": 0.298921138048172, "learning_rate": 9.20556718807735e-05, "loss": 0.2408, "step": 4387 }, { "epoch": 1.6203840472673559, "grad_norm": 0.2851993441581726, "learning_rate": 9.203103830521001e-05, "loss": 0.2264, "step": 4388 }, { "epoch": 1.6207533234859675, "grad_norm": 0.2866692841053009, "learning_rate": 9.200640472964651e-05, "loss": 0.1903, "step": 4389 }, { "epoch": 1.621122599704579, "grad_norm": 0.27634477615356445, "learning_rate": 9.198177115408301e-05, "loss": 0.1854, "step": 4390 }, { "epoch": 1.6214918759231907, "grad_norm": 0.27173739671707153, "learning_rate": 9.195713757851953e-05, "loss": 0.271, "step": 4391 }, { "epoch": 1.621861152141802, "grad_norm": 0.22114601731300354, "learning_rate": 9.193250400295603e-05, "loss": 0.1995, "step": 4392 }, { "epoch": 1.6222304283604134, "grad_norm": 0.25258347392082214, "learning_rate": 9.190787042739254e-05, "loss": 0.2447, "step": 4393 }, { "epoch": 1.622599704579025, "grad_norm": 0.3033640384674072, "learning_rate": 9.188323685182904e-05, "loss": 0.265, "step": 4394 }, { "epoch": 1.6229689807976366, "grad_norm": 0.2696057856082916, "learning_rate": 9.185860327626556e-05, "loss": 0.2213, "step": 4395 }, { "epoch": 1.6233382570162482, "grad_norm": 0.3131449222564697, "learning_rate": 9.183396970070206e-05, "loss": 0.2536, "step": 4396 }, { "epoch": 1.6237075332348598, "grad_norm": 0.28059932589530945, "learning_rate": 9.180933612513856e-05, "loss": 0.2064, "step": 4397 }, { "epoch": 1.6240768094534712, "grad_norm": 0.26027733087539673, "learning_rate": 9.178470254957508e-05, "loss": 0.2053, "step": 4398 }, { "epoch": 1.6244460856720826, "grad_norm": 0.27992936968803406, "learning_rate": 9.176006897401158e-05, "loss": 0.2565, "step": 4399 }, { "epoch": 1.6248153618906942, "grad_norm": 0.26828402280807495, "learning_rate": 9.173543539844809e-05, "loss": 0.253, "step": 4400 }, { "epoch": 1.6248153618906942, "eval_loss": 0.265991747379303, "eval_runtime": 5.8558, "eval_samples_per_second": 8.539, "eval_steps_per_second": 1.195, "step": 4400 }, { "epoch": 1.6251846381093058, "grad_norm": 0.285685271024704, "learning_rate": 9.171080182288459e-05, "loss": 0.2289, "step": 4401 }, { "epoch": 1.6255539143279174, "grad_norm": 0.2752537131309509, "learning_rate": 9.168616824732111e-05, "loss": 0.2482, "step": 4402 }, { "epoch": 1.6259231905465288, "grad_norm": 0.3914795219898224, "learning_rate": 9.166153467175761e-05, "loss": 0.304, "step": 4403 }, { "epoch": 1.6262924667651402, "grad_norm": 0.26765310764312744, "learning_rate": 9.163690109619412e-05, "loss": 0.2098, "step": 4404 }, { "epoch": 1.6266617429837518, "grad_norm": 0.2741362154483795, "learning_rate": 9.161226752063062e-05, "loss": 0.2248, "step": 4405 }, { "epoch": 1.6270310192023634, "grad_norm": 0.30598682165145874, "learning_rate": 9.158763394506713e-05, "loss": 0.2531, "step": 4406 }, { "epoch": 1.627400295420975, "grad_norm": 0.2636612057685852, "learning_rate": 9.156300036950364e-05, "loss": 0.2289, "step": 4407 }, { "epoch": 1.6277695716395866, "grad_norm": 0.283623069524765, "learning_rate": 9.153836679394014e-05, "loss": 0.2506, "step": 4408 }, { "epoch": 1.628138847858198, "grad_norm": 0.2700044512748718, "learning_rate": 9.151373321837666e-05, "loss": 0.1795, "step": 4409 }, { "epoch": 1.6285081240768093, "grad_norm": 0.25703635811805725, "learning_rate": 9.148909964281316e-05, "loss": 0.2389, "step": 4410 }, { "epoch": 1.628877400295421, "grad_norm": 0.28995630145072937, "learning_rate": 9.146446606724967e-05, "loss": 0.2339, "step": 4411 }, { "epoch": 1.6292466765140325, "grad_norm": 0.25523653626441956, "learning_rate": 9.143983249168617e-05, "loss": 0.2343, "step": 4412 }, { "epoch": 1.6296159527326441, "grad_norm": 0.36017218232154846, "learning_rate": 9.141519891612267e-05, "loss": 0.2713, "step": 4413 }, { "epoch": 1.6299852289512555, "grad_norm": 0.27270761132240295, "learning_rate": 9.139056534055919e-05, "loss": 0.2085, "step": 4414 }, { "epoch": 1.630354505169867, "grad_norm": 0.2864871621131897, "learning_rate": 9.136593176499569e-05, "loss": 0.2321, "step": 4415 }, { "epoch": 1.6307237813884785, "grad_norm": 0.25306713581085205, "learning_rate": 9.13412981894322e-05, "loss": 0.2117, "step": 4416 }, { "epoch": 1.6310930576070901, "grad_norm": 0.2895197570323944, "learning_rate": 9.13166646138687e-05, "loss": 0.2356, "step": 4417 }, { "epoch": 1.6314623338257017, "grad_norm": 0.2583214044570923, "learning_rate": 9.129203103830522e-05, "loss": 0.226, "step": 4418 }, { "epoch": 1.631831610044313, "grad_norm": 0.23806144297122955, "learning_rate": 9.126739746274172e-05, "loss": 0.2325, "step": 4419 }, { "epoch": 1.6322008862629247, "grad_norm": 0.2492138296365738, "learning_rate": 9.124276388717824e-05, "loss": 0.1986, "step": 4420 }, { "epoch": 1.632570162481536, "grad_norm": 0.2343548685312271, "learning_rate": 9.121813031161474e-05, "loss": 0.1947, "step": 4421 }, { "epoch": 1.6329394387001477, "grad_norm": 0.25394871830940247, "learning_rate": 9.119349673605124e-05, "loss": 0.2158, "step": 4422 }, { "epoch": 1.6333087149187593, "grad_norm": 0.33500936627388, "learning_rate": 9.116886316048775e-05, "loss": 0.2538, "step": 4423 }, { "epoch": 1.6336779911373709, "grad_norm": 0.2588845193386078, "learning_rate": 9.114422958492425e-05, "loss": 0.2169, "step": 4424 }, { "epoch": 1.6340472673559823, "grad_norm": 0.2695541977882385, "learning_rate": 9.111959600936077e-05, "loss": 0.2016, "step": 4425 }, { "epoch": 1.6344165435745936, "grad_norm": 0.3496728837490082, "learning_rate": 9.109496243379727e-05, "loss": 0.2643, "step": 4426 }, { "epoch": 1.6347858197932053, "grad_norm": 0.2309270054101944, "learning_rate": 9.107032885823378e-05, "loss": 0.1892, "step": 4427 }, { "epoch": 1.6351550960118169, "grad_norm": 0.29163655638694763, "learning_rate": 9.104569528267028e-05, "loss": 0.2454, "step": 4428 }, { "epoch": 1.6355243722304285, "grad_norm": 0.2710795998573303, "learning_rate": 9.102106170710679e-05, "loss": 0.2136, "step": 4429 }, { "epoch": 1.6358936484490398, "grad_norm": 0.2789679765701294, "learning_rate": 9.09964281315433e-05, "loss": 0.2397, "step": 4430 }, { "epoch": 1.6362629246676514, "grad_norm": 0.2854340970516205, "learning_rate": 9.09717945559798e-05, "loss": 0.231, "step": 4431 }, { "epoch": 1.6366322008862628, "grad_norm": 0.24197128415107727, "learning_rate": 9.094716098041632e-05, "loss": 0.1917, "step": 4432 }, { "epoch": 1.6370014771048744, "grad_norm": 0.35419929027557373, "learning_rate": 9.092252740485282e-05, "loss": 0.2305, "step": 4433 }, { "epoch": 1.637370753323486, "grad_norm": 0.28594160079956055, "learning_rate": 9.089789382928933e-05, "loss": 0.2398, "step": 4434 }, { "epoch": 1.6377400295420976, "grad_norm": 0.23706410825252533, "learning_rate": 9.087326025372583e-05, "loss": 0.2111, "step": 4435 }, { "epoch": 1.638109305760709, "grad_norm": 0.3336345851421356, "learning_rate": 9.084862667816235e-05, "loss": 0.3033, "step": 4436 }, { "epoch": 1.6384785819793204, "grad_norm": 0.25109750032424927, "learning_rate": 9.082399310259885e-05, "loss": 0.2006, "step": 4437 }, { "epoch": 1.638847858197932, "grad_norm": 0.247293621301651, "learning_rate": 9.079935952703535e-05, "loss": 0.224, "step": 4438 }, { "epoch": 1.6392171344165436, "grad_norm": 0.2591431140899658, "learning_rate": 9.077472595147186e-05, "loss": 0.2625, "step": 4439 }, { "epoch": 1.6395864106351552, "grad_norm": 0.2373046725988388, "learning_rate": 9.075009237590837e-05, "loss": 0.2196, "step": 4440 }, { "epoch": 1.6399556868537666, "grad_norm": 0.277576208114624, "learning_rate": 9.072545880034488e-05, "loss": 0.2159, "step": 4441 }, { "epoch": 1.6403249630723782, "grad_norm": 0.24450775980949402, "learning_rate": 9.070082522478138e-05, "loss": 0.2184, "step": 4442 }, { "epoch": 1.6406942392909896, "grad_norm": 0.2535002827644348, "learning_rate": 9.06761916492179e-05, "loss": 0.2281, "step": 4443 }, { "epoch": 1.6410635155096012, "grad_norm": 0.3445992171764374, "learning_rate": 9.06515580736544e-05, "loss": 0.2395, "step": 4444 }, { "epoch": 1.6414327917282128, "grad_norm": 0.347196102142334, "learning_rate": 9.06269244980909e-05, "loss": 0.2574, "step": 4445 }, { "epoch": 1.6418020679468244, "grad_norm": 0.2402815967798233, "learning_rate": 9.060229092252741e-05, "loss": 0.2097, "step": 4446 }, { "epoch": 1.6421713441654358, "grad_norm": 0.29701727628707886, "learning_rate": 9.057765734696391e-05, "loss": 0.2728, "step": 4447 }, { "epoch": 1.6425406203840471, "grad_norm": 0.2463408261537552, "learning_rate": 9.055302377140043e-05, "loss": 0.2204, "step": 4448 }, { "epoch": 1.6429098966026587, "grad_norm": 0.2880644202232361, "learning_rate": 9.052839019583693e-05, "loss": 0.2739, "step": 4449 }, { "epoch": 1.6432791728212703, "grad_norm": 0.27345898747444153, "learning_rate": 9.050375662027344e-05, "loss": 0.2796, "step": 4450 }, { "epoch": 1.6432791728212703, "eval_loss": 0.2640303075313568, "eval_runtime": 5.8618, "eval_samples_per_second": 8.53, "eval_steps_per_second": 1.194, "step": 4450 }, { "epoch": 1.643648449039882, "grad_norm": 0.25137606263160706, "learning_rate": 9.047912304470995e-05, "loss": 0.239, "step": 4451 }, { "epoch": 1.6440177252584933, "grad_norm": 0.2716238498687744, "learning_rate": 9.045448946914645e-05, "loss": 0.229, "step": 4452 }, { "epoch": 1.644387001477105, "grad_norm": 0.2977883517742157, "learning_rate": 9.042985589358296e-05, "loss": 0.2709, "step": 4453 }, { "epoch": 1.6447562776957163, "grad_norm": 0.2766415774822235, "learning_rate": 9.040522231801946e-05, "loss": 0.2337, "step": 4454 }, { "epoch": 1.645125553914328, "grad_norm": 0.2886631488800049, "learning_rate": 9.038058874245598e-05, "loss": 0.2359, "step": 4455 }, { "epoch": 1.6454948301329395, "grad_norm": 0.4014659523963928, "learning_rate": 9.035595516689248e-05, "loss": 0.2775, "step": 4456 }, { "epoch": 1.6458641063515511, "grad_norm": 0.24665050208568573, "learning_rate": 9.033132159132899e-05, "loss": 0.2314, "step": 4457 }, { "epoch": 1.6462333825701625, "grad_norm": 0.2761084735393524, "learning_rate": 9.03066880157655e-05, "loss": 0.2267, "step": 4458 }, { "epoch": 1.6466026587887739, "grad_norm": 0.25735220313072205, "learning_rate": 9.028205444020201e-05, "loss": 0.2106, "step": 4459 }, { "epoch": 1.6469719350073855, "grad_norm": 0.2363380342721939, "learning_rate": 9.025742086463851e-05, "loss": 0.207, "step": 4460 }, { "epoch": 1.647341211225997, "grad_norm": 0.32384592294692993, "learning_rate": 9.023278728907501e-05, "loss": 0.28, "step": 4461 }, { "epoch": 1.6477104874446087, "grad_norm": 0.2280770242214203, "learning_rate": 9.020815371351152e-05, "loss": 0.2146, "step": 4462 }, { "epoch": 1.64807976366322, "grad_norm": 0.24917244911193848, "learning_rate": 9.018352013794803e-05, "loss": 0.2427, "step": 4463 }, { "epoch": 1.6484490398818314, "grad_norm": 0.2832244634628296, "learning_rate": 9.015888656238454e-05, "loss": 0.2309, "step": 4464 }, { "epoch": 1.648818316100443, "grad_norm": 0.2867893576622009, "learning_rate": 9.013425298682104e-05, "loss": 0.2156, "step": 4465 }, { "epoch": 1.6491875923190547, "grad_norm": 0.30194559693336487, "learning_rate": 9.010961941125756e-05, "loss": 0.2429, "step": 4466 }, { "epoch": 1.6495568685376663, "grad_norm": 0.343932181596756, "learning_rate": 9.008498583569406e-05, "loss": 0.2461, "step": 4467 }, { "epoch": 1.6499261447562779, "grad_norm": 0.2459821254014969, "learning_rate": 9.006035226013056e-05, "loss": 0.2294, "step": 4468 }, { "epoch": 1.6502954209748892, "grad_norm": 0.227996826171875, "learning_rate": 9.003571868456707e-05, "loss": 0.1928, "step": 4469 }, { "epoch": 1.6506646971935006, "grad_norm": 0.25969910621643066, "learning_rate": 9.001108510900357e-05, "loss": 0.2261, "step": 4470 }, { "epoch": 1.6510339734121122, "grad_norm": 0.2653331458568573, "learning_rate": 8.998645153344009e-05, "loss": 0.2167, "step": 4471 }, { "epoch": 1.6514032496307238, "grad_norm": 0.22981014847755432, "learning_rate": 8.996181795787659e-05, "loss": 0.1892, "step": 4472 }, { "epoch": 1.6517725258493354, "grad_norm": 0.2953481674194336, "learning_rate": 8.99371843823131e-05, "loss": 0.2623, "step": 4473 }, { "epoch": 1.6521418020679468, "grad_norm": 0.26547351479530334, "learning_rate": 8.99125508067496e-05, "loss": 0.2397, "step": 4474 }, { "epoch": 1.6525110782865582, "grad_norm": 0.23823410272598267, "learning_rate": 8.988791723118612e-05, "loss": 0.1963, "step": 4475 }, { "epoch": 1.6528803545051698, "grad_norm": 0.25190117955207825, "learning_rate": 8.986328365562262e-05, "loss": 0.2078, "step": 4476 }, { "epoch": 1.6532496307237814, "grad_norm": 0.2754233777523041, "learning_rate": 8.983865008005912e-05, "loss": 0.2287, "step": 4477 }, { "epoch": 1.653618906942393, "grad_norm": 0.267267644405365, "learning_rate": 8.981401650449564e-05, "loss": 0.2004, "step": 4478 }, { "epoch": 1.6539881831610044, "grad_norm": 0.24427461624145508, "learning_rate": 8.978938292893214e-05, "loss": 0.1887, "step": 4479 }, { "epoch": 1.654357459379616, "grad_norm": 0.3076989948749542, "learning_rate": 8.976474935336865e-05, "loss": 0.2569, "step": 4480 }, { "epoch": 1.6547267355982274, "grad_norm": 0.2608012855052948, "learning_rate": 8.974011577780515e-05, "loss": 0.2135, "step": 4481 }, { "epoch": 1.655096011816839, "grad_norm": 0.294429749250412, "learning_rate": 8.971548220224167e-05, "loss": 0.2327, "step": 4482 }, { "epoch": 1.6554652880354506, "grad_norm": 0.30032092332839966, "learning_rate": 8.969084862667817e-05, "loss": 0.2462, "step": 4483 }, { "epoch": 1.6558345642540622, "grad_norm": 0.3220147490501404, "learning_rate": 8.966621505111467e-05, "loss": 0.2502, "step": 4484 }, { "epoch": 1.6562038404726735, "grad_norm": 0.23546169698238373, "learning_rate": 8.964158147555119e-05, "loss": 0.2258, "step": 4485 }, { "epoch": 1.656573116691285, "grad_norm": 0.2772619426250458, "learning_rate": 8.961694789998769e-05, "loss": 0.2546, "step": 4486 }, { "epoch": 1.6569423929098965, "grad_norm": 0.28485119342803955, "learning_rate": 8.95923143244242e-05, "loss": 0.2253, "step": 4487 }, { "epoch": 1.6573116691285081, "grad_norm": 0.2849203050136566, "learning_rate": 8.95676807488607e-05, "loss": 0.2222, "step": 4488 }, { "epoch": 1.6576809453471197, "grad_norm": 0.25608864426612854, "learning_rate": 8.954304717329722e-05, "loss": 0.211, "step": 4489 }, { "epoch": 1.6580502215657311, "grad_norm": 0.2595456838607788, "learning_rate": 8.951841359773372e-05, "loss": 0.2306, "step": 4490 }, { "epoch": 1.6584194977843427, "grad_norm": 0.32323145866394043, "learning_rate": 8.949378002217023e-05, "loss": 0.239, "step": 4491 }, { "epoch": 1.658788774002954, "grad_norm": 0.22644315659999847, "learning_rate": 8.946914644660673e-05, "loss": 0.1855, "step": 4492 }, { "epoch": 1.6591580502215657, "grad_norm": 0.2632284164428711, "learning_rate": 8.944451287104323e-05, "loss": 0.226, "step": 4493 }, { "epoch": 1.6595273264401773, "grad_norm": 0.3604695796966553, "learning_rate": 8.941987929547975e-05, "loss": 0.2408, "step": 4494 }, { "epoch": 1.659896602658789, "grad_norm": 0.2264641374349594, "learning_rate": 8.939524571991625e-05, "loss": 0.2295, "step": 4495 }, { "epoch": 1.6602658788774003, "grad_norm": 0.23985488712787628, "learning_rate": 8.937061214435276e-05, "loss": 0.2187, "step": 4496 }, { "epoch": 1.6606351550960117, "grad_norm": 0.27624571323394775, "learning_rate": 8.934597856878927e-05, "loss": 0.2037, "step": 4497 }, { "epoch": 1.6610044313146233, "grad_norm": 0.36503899097442627, "learning_rate": 8.932134499322578e-05, "loss": 0.3095, "step": 4498 }, { "epoch": 1.6613737075332349, "grad_norm": 0.2893148362636566, "learning_rate": 8.929671141766228e-05, "loss": 0.2332, "step": 4499 }, { "epoch": 1.6617429837518465, "grad_norm": 0.2621174156665802, "learning_rate": 8.927207784209878e-05, "loss": 0.2218, "step": 4500 }, { "epoch": 1.6617429837518465, "eval_loss": 0.25753548741340637, "eval_runtime": 5.8668, "eval_samples_per_second": 8.522, "eval_steps_per_second": 1.193, "step": 4500 }, { "epoch": 1.6621122599704579, "grad_norm": 0.25062716007232666, "learning_rate": 8.92474442665353e-05, "loss": 0.2219, "step": 4501 }, { "epoch": 1.6624815361890695, "grad_norm": 0.3061050772666931, "learning_rate": 8.92228106909718e-05, "loss": 0.2455, "step": 4502 }, { "epoch": 1.6628508124076808, "grad_norm": 0.272899329662323, "learning_rate": 8.919817711540831e-05, "loss": 0.2413, "step": 4503 }, { "epoch": 1.6632200886262924, "grad_norm": 0.4094938337802887, "learning_rate": 8.917354353984481e-05, "loss": 0.2343, "step": 4504 }, { "epoch": 1.663589364844904, "grad_norm": 0.25752800703048706, "learning_rate": 8.914890996428133e-05, "loss": 0.2151, "step": 4505 }, { "epoch": 1.6639586410635157, "grad_norm": 0.29961711168289185, "learning_rate": 8.912427638871783e-05, "loss": 0.2309, "step": 4506 }, { "epoch": 1.664327917282127, "grad_norm": 0.31224143505096436, "learning_rate": 8.909964281315434e-05, "loss": 0.2556, "step": 4507 }, { "epoch": 1.6646971935007384, "grad_norm": 0.2943941354751587, "learning_rate": 8.907500923759085e-05, "loss": 0.2823, "step": 4508 }, { "epoch": 1.66506646971935, "grad_norm": 0.22451508045196533, "learning_rate": 8.905037566202735e-05, "loss": 0.1987, "step": 4509 }, { "epoch": 1.6654357459379616, "grad_norm": 0.30652108788490295, "learning_rate": 8.902574208646386e-05, "loss": 0.2419, "step": 4510 }, { "epoch": 1.6658050221565732, "grad_norm": 0.2775302827358246, "learning_rate": 8.900110851090036e-05, "loss": 0.2633, "step": 4511 }, { "epoch": 1.6661742983751846, "grad_norm": 0.2944474518299103, "learning_rate": 8.897647493533688e-05, "loss": 0.2385, "step": 4512 }, { "epoch": 1.6665435745937962, "grad_norm": 0.24264825880527496, "learning_rate": 8.895184135977338e-05, "loss": 0.2241, "step": 4513 }, { "epoch": 1.6669128508124076, "grad_norm": 0.24153397977352142, "learning_rate": 8.892720778420989e-05, "loss": 0.2132, "step": 4514 }, { "epoch": 1.6672821270310192, "grad_norm": 0.27261120080947876, "learning_rate": 8.89025742086464e-05, "loss": 0.1873, "step": 4515 }, { "epoch": 1.6676514032496308, "grad_norm": 0.4182679057121277, "learning_rate": 8.88779406330829e-05, "loss": 0.2904, "step": 4516 }, { "epoch": 1.6680206794682424, "grad_norm": 0.3107384443283081, "learning_rate": 8.885330705751941e-05, "loss": 0.2524, "step": 4517 }, { "epoch": 1.6683899556868538, "grad_norm": 0.2381308525800705, "learning_rate": 8.882867348195591e-05, "loss": 0.2298, "step": 4518 }, { "epoch": 1.6687592319054652, "grad_norm": 0.30288082361221313, "learning_rate": 8.880403990639243e-05, "loss": 0.2311, "step": 4519 }, { "epoch": 1.6691285081240768, "grad_norm": 0.2832752466201782, "learning_rate": 8.877940633082893e-05, "loss": 0.224, "step": 4520 }, { "epoch": 1.6694977843426884, "grad_norm": 0.24044345319271088, "learning_rate": 8.875477275526544e-05, "loss": 0.2119, "step": 4521 }, { "epoch": 1.6698670605613, "grad_norm": 0.24596406519412994, "learning_rate": 8.873013917970194e-05, "loss": 0.2175, "step": 4522 }, { "epoch": 1.6702363367799113, "grad_norm": 0.2973885238170624, "learning_rate": 8.870550560413846e-05, "loss": 0.2842, "step": 4523 }, { "epoch": 1.670605612998523, "grad_norm": 0.29346567392349243, "learning_rate": 8.868087202857496e-05, "loss": 0.2474, "step": 4524 }, { "epoch": 1.6709748892171343, "grad_norm": 0.26802217960357666, "learning_rate": 8.865623845301146e-05, "loss": 0.2219, "step": 4525 }, { "epoch": 1.671344165435746, "grad_norm": 0.2719583809375763, "learning_rate": 8.863160487744797e-05, "loss": 0.2357, "step": 4526 }, { "epoch": 1.6717134416543575, "grad_norm": 0.2853793203830719, "learning_rate": 8.860697130188447e-05, "loss": 0.238, "step": 4527 }, { "epoch": 1.6720827178729691, "grad_norm": 0.3272760808467865, "learning_rate": 8.858233772632099e-05, "loss": 0.2365, "step": 4528 }, { "epoch": 1.6724519940915805, "grad_norm": 0.23719562590122223, "learning_rate": 8.855770415075748e-05, "loss": 0.2154, "step": 4529 }, { "epoch": 1.672821270310192, "grad_norm": 0.30899596214294434, "learning_rate": 8.853307057519399e-05, "loss": 0.2706, "step": 4530 }, { "epoch": 1.6731905465288035, "grad_norm": 0.2901187241077423, "learning_rate": 8.850843699963049e-05, "loss": 0.2165, "step": 4531 }, { "epoch": 1.673559822747415, "grad_norm": 0.21402893960475922, "learning_rate": 8.848380342406701e-05, "loss": 0.1978, "step": 4532 }, { "epoch": 1.6739290989660267, "grad_norm": 0.23306670784950256, "learning_rate": 8.845916984850351e-05, "loss": 0.2095, "step": 4533 }, { "epoch": 1.674298375184638, "grad_norm": 0.3484254777431488, "learning_rate": 8.843453627294001e-05, "loss": 0.2216, "step": 4534 }, { "epoch": 1.6746676514032495, "grad_norm": 0.26495423913002014, "learning_rate": 8.840990269737652e-05, "loss": 0.1954, "step": 4535 }, { "epoch": 1.675036927621861, "grad_norm": 0.2986883819103241, "learning_rate": 8.838526912181303e-05, "loss": 0.2434, "step": 4536 }, { "epoch": 1.6754062038404727, "grad_norm": 0.35357728600502014, "learning_rate": 8.836063554624954e-05, "loss": 0.2083, "step": 4537 }, { "epoch": 1.6757754800590843, "grad_norm": 0.2808247208595276, "learning_rate": 8.833600197068604e-05, "loss": 0.2356, "step": 4538 }, { "epoch": 1.6761447562776959, "grad_norm": 0.30437755584716797, "learning_rate": 8.831136839512256e-05, "loss": 0.2718, "step": 4539 }, { "epoch": 1.6765140324963073, "grad_norm": 0.25901874899864197, "learning_rate": 8.828673481955906e-05, "loss": 0.1908, "step": 4540 }, { "epoch": 1.6768833087149186, "grad_norm": 0.22702591121196747, "learning_rate": 8.826210124399557e-05, "loss": 0.1957, "step": 4541 }, { "epoch": 1.6772525849335302, "grad_norm": 0.32408803701400757, "learning_rate": 8.823746766843207e-05, "loss": 0.2264, "step": 4542 }, { "epoch": 1.6776218611521418, "grad_norm": 0.2785623371601105, "learning_rate": 8.821283409286857e-05, "loss": 0.2275, "step": 4543 }, { "epoch": 1.6779911373707534, "grad_norm": 0.2758251428604126, "learning_rate": 8.818820051730509e-05, "loss": 0.2208, "step": 4544 }, { "epoch": 1.6783604135893648, "grad_norm": 0.25355419516563416, "learning_rate": 8.816356694174159e-05, "loss": 0.2387, "step": 4545 }, { "epoch": 1.6787296898079762, "grad_norm": 0.24101129174232483, "learning_rate": 8.81389333661781e-05, "loss": 0.1959, "step": 4546 }, { "epoch": 1.6790989660265878, "grad_norm": 0.27683645486831665, "learning_rate": 8.81142997906146e-05, "loss": 0.2197, "step": 4547 }, { "epoch": 1.6794682422451994, "grad_norm": 0.25528275966644287, "learning_rate": 8.808966621505112e-05, "loss": 0.209, "step": 4548 }, { "epoch": 1.679837518463811, "grad_norm": 0.2559058368206024, "learning_rate": 8.806503263948762e-05, "loss": 0.1847, "step": 4549 }, { "epoch": 1.6802067946824224, "grad_norm": 0.30595335364341736, "learning_rate": 8.804039906392412e-05, "loss": 0.2645, "step": 4550 }, { "epoch": 1.6802067946824224, "eval_loss": 0.2636949419975281, "eval_runtime": 5.8582, "eval_samples_per_second": 8.535, "eval_steps_per_second": 1.195, "step": 4550 }, { "epoch": 1.680576070901034, "grad_norm": 0.27721020579338074, "learning_rate": 8.801576548836064e-05, "loss": 0.2319, "step": 4551 }, { "epoch": 1.6809453471196454, "grad_norm": 0.2904880940914154, "learning_rate": 8.799113191279714e-05, "loss": 0.2272, "step": 4552 }, { "epoch": 1.681314623338257, "grad_norm": 0.293811172246933, "learning_rate": 8.796649833723365e-05, "loss": 0.2521, "step": 4553 }, { "epoch": 1.6816838995568686, "grad_norm": 0.26758673787117004, "learning_rate": 8.794186476167015e-05, "loss": 0.2028, "step": 4554 }, { "epoch": 1.6820531757754802, "grad_norm": 0.27245649695396423, "learning_rate": 8.791723118610667e-05, "loss": 0.2471, "step": 4555 }, { "epoch": 1.6824224519940916, "grad_norm": 0.27987295389175415, "learning_rate": 8.789259761054317e-05, "loss": 0.2391, "step": 4556 }, { "epoch": 1.682791728212703, "grad_norm": 0.2732281982898712, "learning_rate": 8.786796403497968e-05, "loss": 0.268, "step": 4557 }, { "epoch": 1.6831610044313146, "grad_norm": 0.2969675362110138, "learning_rate": 8.784333045941618e-05, "loss": 0.2581, "step": 4558 }, { "epoch": 1.6835302806499262, "grad_norm": 0.41811317205429077, "learning_rate": 8.781869688385269e-05, "loss": 0.3046, "step": 4559 }, { "epoch": 1.6838995568685378, "grad_norm": 0.2814512848854065, "learning_rate": 8.77940633082892e-05, "loss": 0.2348, "step": 4560 }, { "epoch": 1.6842688330871491, "grad_norm": 0.2987380027770996, "learning_rate": 8.77694297327257e-05, "loss": 0.232, "step": 4561 }, { "epoch": 1.6846381093057607, "grad_norm": 0.3873238265514374, "learning_rate": 8.774479615716222e-05, "loss": 0.2471, "step": 4562 }, { "epoch": 1.6850073855243721, "grad_norm": 0.28666067123413086, "learning_rate": 8.772016258159872e-05, "loss": 0.2145, "step": 4563 }, { "epoch": 1.6853766617429837, "grad_norm": 0.28261733055114746, "learning_rate": 8.769552900603523e-05, "loss": 0.2161, "step": 4564 }, { "epoch": 1.6857459379615953, "grad_norm": 0.29818618297576904, "learning_rate": 8.767089543047173e-05, "loss": 0.2175, "step": 4565 }, { "epoch": 1.686115214180207, "grad_norm": 0.265331506729126, "learning_rate": 8.764626185490823e-05, "loss": 0.1898, "step": 4566 }, { "epoch": 1.6864844903988183, "grad_norm": 0.24167734384536743, "learning_rate": 8.762162827934475e-05, "loss": 0.2373, "step": 4567 }, { "epoch": 1.6868537666174297, "grad_norm": 0.2740474045276642, "learning_rate": 8.759699470378125e-05, "loss": 0.2291, "step": 4568 }, { "epoch": 1.6872230428360413, "grad_norm": 0.2038453072309494, "learning_rate": 8.757236112821776e-05, "loss": 0.1733, "step": 4569 }, { "epoch": 1.687592319054653, "grad_norm": 0.2524508535861969, "learning_rate": 8.754772755265427e-05, "loss": 0.2104, "step": 4570 }, { "epoch": 1.6879615952732645, "grad_norm": 0.24824324250221252, "learning_rate": 8.752309397709078e-05, "loss": 0.2092, "step": 4571 }, { "epoch": 1.6883308714918759, "grad_norm": 0.2843952476978302, "learning_rate": 8.749846040152728e-05, "loss": 0.2103, "step": 4572 }, { "epoch": 1.6887001477104875, "grad_norm": 0.2717718482017517, "learning_rate": 8.74738268259638e-05, "loss": 0.2165, "step": 4573 }, { "epoch": 1.6890694239290989, "grad_norm": 0.2635425627231598, "learning_rate": 8.74491932504003e-05, "loss": 0.1931, "step": 4574 }, { "epoch": 1.6894387001477105, "grad_norm": 0.23467527329921722, "learning_rate": 8.74245596748368e-05, "loss": 0.2161, "step": 4575 }, { "epoch": 1.689807976366322, "grad_norm": 0.25482845306396484, "learning_rate": 8.739992609927331e-05, "loss": 0.2205, "step": 4576 }, { "epoch": 1.6901772525849337, "grad_norm": 0.2853875160217285, "learning_rate": 8.737529252370981e-05, "loss": 0.2433, "step": 4577 }, { "epoch": 1.690546528803545, "grad_norm": 0.24550633132457733, "learning_rate": 8.735065894814633e-05, "loss": 0.2229, "step": 4578 }, { "epoch": 1.6909158050221564, "grad_norm": 0.2933661937713623, "learning_rate": 8.732602537258283e-05, "loss": 0.245, "step": 4579 }, { "epoch": 1.691285081240768, "grad_norm": 0.28975528478622437, "learning_rate": 8.730139179701934e-05, "loss": 0.2349, "step": 4580 }, { "epoch": 1.6916543574593796, "grad_norm": 0.23025333881378174, "learning_rate": 8.727675822145585e-05, "loss": 0.185, "step": 4581 }, { "epoch": 1.6920236336779912, "grad_norm": 0.30284446477890015, "learning_rate": 8.725212464589235e-05, "loss": 0.2706, "step": 4582 }, { "epoch": 1.6923929098966026, "grad_norm": 0.2338089495897293, "learning_rate": 8.722749107032886e-05, "loss": 0.2031, "step": 4583 }, { "epoch": 1.6927621861152142, "grad_norm": 0.3378913700580597, "learning_rate": 8.720285749476536e-05, "loss": 0.2438, "step": 4584 }, { "epoch": 1.6931314623338256, "grad_norm": 0.2701866328716278, "learning_rate": 8.717822391920188e-05, "loss": 0.232, "step": 4585 }, { "epoch": 1.6935007385524372, "grad_norm": 0.2801609933376312, "learning_rate": 8.715359034363838e-05, "loss": 0.236, "step": 4586 }, { "epoch": 1.6938700147710488, "grad_norm": 0.21978351473808289, "learning_rate": 8.712895676807489e-05, "loss": 0.19, "step": 4587 }, { "epoch": 1.6942392909896604, "grad_norm": 0.3093269169330597, "learning_rate": 8.71043231925114e-05, "loss": 0.2268, "step": 4588 }, { "epoch": 1.6946085672082718, "grad_norm": 0.31482091546058655, "learning_rate": 8.707968961694791e-05, "loss": 0.2468, "step": 4589 }, { "epoch": 1.6949778434268832, "grad_norm": 0.27576708793640137, "learning_rate": 8.705505604138441e-05, "loss": 0.2307, "step": 4590 }, { "epoch": 1.6953471196454948, "grad_norm": 0.2709692716598511, "learning_rate": 8.703042246582091e-05, "loss": 0.2458, "step": 4591 }, { "epoch": 1.6957163958641064, "grad_norm": 0.21820217370986938, "learning_rate": 8.700578889025742e-05, "loss": 0.2004, "step": 4592 }, { "epoch": 1.696085672082718, "grad_norm": 0.27680477499961853, "learning_rate": 8.698115531469393e-05, "loss": 0.2415, "step": 4593 }, { "epoch": 1.6964549483013294, "grad_norm": 0.29179415106773376, "learning_rate": 8.695652173913044e-05, "loss": 0.2502, "step": 4594 }, { "epoch": 1.696824224519941, "grad_norm": 0.24993900954723358, "learning_rate": 8.693188816356694e-05, "loss": 0.2044, "step": 4595 }, { "epoch": 1.6971935007385524, "grad_norm": 0.2431059628725052, "learning_rate": 8.690725458800346e-05, "loss": 0.2139, "step": 4596 }, { "epoch": 1.697562776957164, "grad_norm": 0.26724016666412354, "learning_rate": 8.688262101243996e-05, "loss": 0.22, "step": 4597 }, { "epoch": 1.6979320531757756, "grad_norm": 0.33457979559898376, "learning_rate": 8.685798743687646e-05, "loss": 0.2887, "step": 4598 }, { "epoch": 1.6983013293943872, "grad_norm": 0.40689313411712646, "learning_rate": 8.683335386131297e-05, "loss": 0.2475, "step": 4599 }, { "epoch": 1.6986706056129985, "grad_norm": 0.2402809113264084, "learning_rate": 8.680872028574947e-05, "loss": 0.1933, "step": 4600 }, { "epoch": 1.6986706056129985, "eval_loss": 0.26016414165496826, "eval_runtime": 5.8648, "eval_samples_per_second": 8.525, "eval_steps_per_second": 1.194, "step": 4600 }, { "epoch": 1.69903988183161, "grad_norm": 0.2964528799057007, "learning_rate": 8.678408671018599e-05, "loss": 0.2453, "step": 4601 }, { "epoch": 1.6994091580502215, "grad_norm": 0.28239455819129944, "learning_rate": 8.675945313462249e-05, "loss": 0.2347, "step": 4602 }, { "epoch": 1.6997784342688331, "grad_norm": 0.24995984137058258, "learning_rate": 8.6734819559059e-05, "loss": 0.2098, "step": 4603 }, { "epoch": 1.7001477104874447, "grad_norm": 0.3133336901664734, "learning_rate": 8.67101859834955e-05, "loss": 0.2285, "step": 4604 }, { "epoch": 1.700516986706056, "grad_norm": 0.2739347517490387, "learning_rate": 8.668555240793201e-05, "loss": 0.2346, "step": 4605 }, { "epoch": 1.7008862629246675, "grad_norm": 0.3014254570007324, "learning_rate": 8.666091883236852e-05, "loss": 0.2354, "step": 4606 }, { "epoch": 1.701255539143279, "grad_norm": 0.26933860778808594, "learning_rate": 8.663628525680502e-05, "loss": 0.2298, "step": 4607 }, { "epoch": 1.7016248153618907, "grad_norm": 0.2875133156776428, "learning_rate": 8.661165168124154e-05, "loss": 0.2197, "step": 4608 }, { "epoch": 1.7019940915805023, "grad_norm": 0.26846441626548767, "learning_rate": 8.658701810567804e-05, "loss": 0.2189, "step": 4609 }, { "epoch": 1.702363367799114, "grad_norm": 0.2907378673553467, "learning_rate": 8.656238453011455e-05, "loss": 0.226, "step": 4610 }, { "epoch": 1.7027326440177253, "grad_norm": 0.3114635646343231, "learning_rate": 8.653775095455105e-05, "loss": 0.2558, "step": 4611 }, { "epoch": 1.7031019202363367, "grad_norm": 0.2575693726539612, "learning_rate": 8.651311737898757e-05, "loss": 0.2317, "step": 4612 }, { "epoch": 1.7034711964549483, "grad_norm": 0.24758966267108917, "learning_rate": 8.648848380342407e-05, "loss": 0.1889, "step": 4613 }, { "epoch": 1.7038404726735599, "grad_norm": 0.28485026955604553, "learning_rate": 8.646385022786057e-05, "loss": 0.2215, "step": 4614 }, { "epoch": 1.7042097488921715, "grad_norm": 0.2999398410320282, "learning_rate": 8.643921665229709e-05, "loss": 0.2394, "step": 4615 }, { "epoch": 1.7045790251107829, "grad_norm": 0.2679917514324188, "learning_rate": 8.641458307673359e-05, "loss": 0.2249, "step": 4616 }, { "epoch": 1.7049483013293942, "grad_norm": 0.24768604338169098, "learning_rate": 8.63899495011701e-05, "loss": 0.2063, "step": 4617 }, { "epoch": 1.7053175775480058, "grad_norm": 0.31003034114837646, "learning_rate": 8.63653159256066e-05, "loss": 0.2638, "step": 4618 }, { "epoch": 1.7056868537666174, "grad_norm": 0.2762015163898468, "learning_rate": 8.634068235004312e-05, "loss": 0.2248, "step": 4619 }, { "epoch": 1.706056129985229, "grad_norm": 0.28417354822158813, "learning_rate": 8.631604877447962e-05, "loss": 0.2207, "step": 4620 }, { "epoch": 1.7064254062038404, "grad_norm": 0.24697527289390564, "learning_rate": 8.629141519891612e-05, "loss": 0.1839, "step": 4621 }, { "epoch": 1.706794682422452, "grad_norm": 0.239962637424469, "learning_rate": 8.626678162335263e-05, "loss": 0.1851, "step": 4622 }, { "epoch": 1.7071639586410634, "grad_norm": 0.2700730860233307, "learning_rate": 8.624214804778913e-05, "loss": 0.2132, "step": 4623 }, { "epoch": 1.707533234859675, "grad_norm": 0.3187514543533325, "learning_rate": 8.621751447222565e-05, "loss": 0.241, "step": 4624 }, { "epoch": 1.7079025110782866, "grad_norm": 0.33720922470092773, "learning_rate": 8.619288089666215e-05, "loss": 0.2846, "step": 4625 }, { "epoch": 1.7082717872968982, "grad_norm": 0.28173840045928955, "learning_rate": 8.616824732109867e-05, "loss": 0.1962, "step": 4626 }, { "epoch": 1.7086410635155096, "grad_norm": 0.24108169972896576, "learning_rate": 8.614361374553517e-05, "loss": 0.2244, "step": 4627 }, { "epoch": 1.709010339734121, "grad_norm": 0.2797034978866577, "learning_rate": 8.611898016997168e-05, "loss": 0.2327, "step": 4628 }, { "epoch": 1.7093796159527326, "grad_norm": 0.22712896764278412, "learning_rate": 8.609434659440818e-05, "loss": 0.2033, "step": 4629 }, { "epoch": 1.7097488921713442, "grad_norm": 0.22765718400478363, "learning_rate": 8.606971301884468e-05, "loss": 0.2111, "step": 4630 }, { "epoch": 1.7101181683899558, "grad_norm": 0.24808034300804138, "learning_rate": 8.60450794432812e-05, "loss": 0.231, "step": 4631 }, { "epoch": 1.7104874446085672, "grad_norm": 0.3694831132888794, "learning_rate": 8.60204458677177e-05, "loss": 0.2653, "step": 4632 }, { "epoch": 1.7108567208271788, "grad_norm": 0.2156008630990982, "learning_rate": 8.599581229215421e-05, "loss": 0.1874, "step": 4633 }, { "epoch": 1.7112259970457901, "grad_norm": 0.285347580909729, "learning_rate": 8.597117871659071e-05, "loss": 0.2487, "step": 4634 }, { "epoch": 1.7115952732644018, "grad_norm": 0.3789399564266205, "learning_rate": 8.594654514102723e-05, "loss": 0.2251, "step": 4635 }, { "epoch": 1.7119645494830134, "grad_norm": 0.3379298746585846, "learning_rate": 8.592191156546373e-05, "loss": 0.2731, "step": 4636 }, { "epoch": 1.712333825701625, "grad_norm": 0.22673995792865753, "learning_rate": 8.589727798990023e-05, "loss": 0.198, "step": 4637 }, { "epoch": 1.7127031019202363, "grad_norm": 0.393801748752594, "learning_rate": 8.587264441433675e-05, "loss": 0.2793, "step": 4638 }, { "epoch": 1.7130723781388477, "grad_norm": 0.27242693305015564, "learning_rate": 8.584801083877325e-05, "loss": 0.2253, "step": 4639 }, { "epoch": 1.7134416543574593, "grad_norm": 0.2874417304992676, "learning_rate": 8.582337726320976e-05, "loss": 0.2258, "step": 4640 }, { "epoch": 1.713810930576071, "grad_norm": 0.2719419002532959, "learning_rate": 8.579874368764626e-05, "loss": 0.2354, "step": 4641 }, { "epoch": 1.7141802067946825, "grad_norm": 0.28539299964904785, "learning_rate": 8.577411011208278e-05, "loss": 0.2446, "step": 4642 }, { "epoch": 1.714549483013294, "grad_norm": 0.2207421213388443, "learning_rate": 8.574947653651928e-05, "loss": 0.2098, "step": 4643 }, { "epoch": 1.7149187592319055, "grad_norm": 0.2763689458370209, "learning_rate": 8.57248429609558e-05, "loss": 0.1912, "step": 4644 }, { "epoch": 1.715288035450517, "grad_norm": 0.28716400265693665, "learning_rate": 8.57002093853923e-05, "loss": 0.2179, "step": 4645 }, { "epoch": 1.7156573116691285, "grad_norm": 0.28598442673683167, "learning_rate": 8.56755758098288e-05, "loss": 0.2423, "step": 4646 }, { "epoch": 1.71602658788774, "grad_norm": 0.2622353434562683, "learning_rate": 8.565094223426531e-05, "loss": 0.2281, "step": 4647 }, { "epoch": 1.7163958641063517, "grad_norm": 0.3016565144062042, "learning_rate": 8.562630865870181e-05, "loss": 0.2634, "step": 4648 }, { "epoch": 1.716765140324963, "grad_norm": 0.3286326825618744, "learning_rate": 8.560167508313833e-05, "loss": 0.2699, "step": 4649 }, { "epoch": 1.7171344165435745, "grad_norm": 0.2957834005355835, "learning_rate": 8.557704150757483e-05, "loss": 0.205, "step": 4650 }, { "epoch": 1.7171344165435745, "eval_loss": 0.2559199631214142, "eval_runtime": 5.8598, "eval_samples_per_second": 8.533, "eval_steps_per_second": 1.195, "step": 4650 }, { "epoch": 1.717503692762186, "grad_norm": 0.24889697134494781, "learning_rate": 8.555240793201134e-05, "loss": 0.196, "step": 4651 }, { "epoch": 1.7178729689807977, "grad_norm": 0.2743191421031952, "learning_rate": 8.552777435644784e-05, "loss": 0.2563, "step": 4652 }, { "epoch": 1.7182422451994093, "grad_norm": 0.27960363030433655, "learning_rate": 8.550314078088434e-05, "loss": 0.2355, "step": 4653 }, { "epoch": 1.7186115214180206, "grad_norm": 0.30967599153518677, "learning_rate": 8.547850720532086e-05, "loss": 0.2272, "step": 4654 }, { "epoch": 1.7189807976366323, "grad_norm": 0.2617909908294678, "learning_rate": 8.545387362975736e-05, "loss": 0.1914, "step": 4655 }, { "epoch": 1.7193500738552436, "grad_norm": 0.2783837914466858, "learning_rate": 8.542924005419387e-05, "loss": 0.2323, "step": 4656 }, { "epoch": 1.7197193500738552, "grad_norm": 0.39419353008270264, "learning_rate": 8.540460647863038e-05, "loss": 0.2028, "step": 4657 }, { "epoch": 1.7200886262924668, "grad_norm": 0.28697970509529114, "learning_rate": 8.537997290306689e-05, "loss": 0.2222, "step": 4658 }, { "epoch": 1.7204579025110784, "grad_norm": 0.29508844017982483, "learning_rate": 8.535533932750339e-05, "loss": 0.2427, "step": 4659 }, { "epoch": 1.7208271787296898, "grad_norm": 0.22784453630447388, "learning_rate": 8.53307057519399e-05, "loss": 0.1948, "step": 4660 }, { "epoch": 1.7211964549483012, "grad_norm": 0.26522296667099, "learning_rate": 8.53060721763764e-05, "loss": 0.2431, "step": 4661 }, { "epoch": 1.7215657311669128, "grad_norm": 0.3083113431930542, "learning_rate": 8.528143860081291e-05, "loss": 0.231, "step": 4662 }, { "epoch": 1.7219350073855244, "grad_norm": 0.2493920624256134, "learning_rate": 8.525680502524942e-05, "loss": 0.2212, "step": 4663 }, { "epoch": 1.722304283604136, "grad_norm": 0.24538254737854004, "learning_rate": 8.523217144968592e-05, "loss": 0.2517, "step": 4664 }, { "epoch": 1.7226735598227474, "grad_norm": 0.2629144489765167, "learning_rate": 8.520753787412244e-05, "loss": 0.2157, "step": 4665 }, { "epoch": 1.7230428360413588, "grad_norm": 0.22240932285785675, "learning_rate": 8.518290429855894e-05, "loss": 0.2052, "step": 4666 }, { "epoch": 1.7234121122599704, "grad_norm": 0.9502313137054443, "learning_rate": 8.515827072299545e-05, "loss": 0.2784, "step": 4667 }, { "epoch": 1.723781388478582, "grad_norm": 0.27637672424316406, "learning_rate": 8.513363714743195e-05, "loss": 0.2267, "step": 4668 }, { "epoch": 1.7241506646971936, "grad_norm": 0.307858943939209, "learning_rate": 8.510900357186846e-05, "loss": 0.2254, "step": 4669 }, { "epoch": 1.7245199409158052, "grad_norm": 0.31493571400642395, "learning_rate": 8.508436999630497e-05, "loss": 0.2913, "step": 4670 }, { "epoch": 1.7248892171344166, "grad_norm": 0.28521549701690674, "learning_rate": 8.505973642074147e-05, "loss": 0.2528, "step": 4671 }, { "epoch": 1.725258493353028, "grad_norm": 0.3285808265209198, "learning_rate": 8.503510284517799e-05, "loss": 0.2496, "step": 4672 }, { "epoch": 1.7256277695716395, "grad_norm": 0.27094313502311707, "learning_rate": 8.501046926961449e-05, "loss": 0.2302, "step": 4673 }, { "epoch": 1.7259970457902511, "grad_norm": 0.3215968906879425, "learning_rate": 8.4985835694051e-05, "loss": 0.245, "step": 4674 }, { "epoch": 1.7263663220088628, "grad_norm": 0.2997489273548126, "learning_rate": 8.49612021184875e-05, "loss": 0.2155, "step": 4675 }, { "epoch": 1.7267355982274741, "grad_norm": 0.3190780580043793, "learning_rate": 8.493656854292402e-05, "loss": 0.3006, "step": 4676 }, { "epoch": 1.7271048744460855, "grad_norm": 0.23564372956752777, "learning_rate": 8.491193496736052e-05, "loss": 0.2057, "step": 4677 }, { "epoch": 1.7274741506646971, "grad_norm": 0.27775612473487854, "learning_rate": 8.488730139179702e-05, "loss": 0.2498, "step": 4678 }, { "epoch": 1.7278434268833087, "grad_norm": 0.40059298276901245, "learning_rate": 8.486266781623353e-05, "loss": 0.3065, "step": 4679 }, { "epoch": 1.7282127031019203, "grad_norm": 0.25466418266296387, "learning_rate": 8.483803424067004e-05, "loss": 0.1927, "step": 4680 }, { "epoch": 1.7285819793205317, "grad_norm": 0.300536572933197, "learning_rate": 8.481340066510655e-05, "loss": 0.2383, "step": 4681 }, { "epoch": 1.7289512555391433, "grad_norm": 0.234557643532753, "learning_rate": 8.478876708954305e-05, "loss": 0.2091, "step": 4682 }, { "epoch": 1.7293205317577547, "grad_norm": 0.24355682730674744, "learning_rate": 8.476413351397957e-05, "loss": 0.2011, "step": 4683 }, { "epoch": 1.7296898079763663, "grad_norm": 0.28211480379104614, "learning_rate": 8.473949993841607e-05, "loss": 0.2605, "step": 4684 }, { "epoch": 1.730059084194978, "grad_norm": 0.27587154507637024, "learning_rate": 8.471486636285257e-05, "loss": 0.196, "step": 4685 }, { "epoch": 1.7304283604135895, "grad_norm": 0.2767505943775177, "learning_rate": 8.469023278728908e-05, "loss": 0.2479, "step": 4686 }, { "epoch": 1.7307976366322009, "grad_norm": 0.27924713492393494, "learning_rate": 8.466559921172558e-05, "loss": 0.226, "step": 4687 }, { "epoch": 1.7311669128508123, "grad_norm": 0.27785757184028625, "learning_rate": 8.46409656361621e-05, "loss": 0.2573, "step": 4688 }, { "epoch": 1.7315361890694239, "grad_norm": 0.28567370772361755, "learning_rate": 8.46163320605986e-05, "loss": 0.2321, "step": 4689 }, { "epoch": 1.7319054652880355, "grad_norm": 0.25874269008636475, "learning_rate": 8.459169848503511e-05, "loss": 0.2262, "step": 4690 }, { "epoch": 1.732274741506647, "grad_norm": 0.2584947645664215, "learning_rate": 8.456706490947162e-05, "loss": 0.2258, "step": 4691 }, { "epoch": 1.7326440177252584, "grad_norm": 0.29869017004966736, "learning_rate": 8.454243133390812e-05, "loss": 0.2681, "step": 4692 }, { "epoch": 1.73301329394387, "grad_norm": 0.27775976061820984, "learning_rate": 8.451779775834463e-05, "loss": 0.226, "step": 4693 }, { "epoch": 1.7333825701624814, "grad_norm": 0.2534462511539459, "learning_rate": 8.449316418278113e-05, "loss": 0.2036, "step": 4694 }, { "epoch": 1.733751846381093, "grad_norm": 0.2533627152442932, "learning_rate": 8.446853060721765e-05, "loss": 0.1978, "step": 4695 }, { "epoch": 1.7341211225997046, "grad_norm": 0.2667185366153717, "learning_rate": 8.444389703165415e-05, "loss": 0.2061, "step": 4696 }, { "epoch": 1.7344903988183162, "grad_norm": 0.23923826217651367, "learning_rate": 8.441926345609066e-05, "loss": 0.2153, "step": 4697 }, { "epoch": 1.7348596750369276, "grad_norm": 0.2720220386981964, "learning_rate": 8.439462988052716e-05, "loss": 0.1984, "step": 4698 }, { "epoch": 1.735228951255539, "grad_norm": 0.2901582717895508, "learning_rate": 8.436999630496368e-05, "loss": 0.285, "step": 4699 }, { "epoch": 1.7355982274741506, "grad_norm": 0.22563205659389496, "learning_rate": 8.434536272940018e-05, "loss": 0.1913, "step": 4700 }, { "epoch": 1.7355982274741506, "eval_loss": 0.2577421963214874, "eval_runtime": 5.8614, "eval_samples_per_second": 8.53, "eval_steps_per_second": 1.194, "step": 4700 }, { "epoch": 1.7359675036927622, "grad_norm": 0.28509747982025146, "learning_rate": 8.432072915383668e-05, "loss": 0.2285, "step": 4701 }, { "epoch": 1.7363367799113738, "grad_norm": 0.3507232367992401, "learning_rate": 8.42960955782732e-05, "loss": 0.2764, "step": 4702 }, { "epoch": 1.7367060561299852, "grad_norm": 0.28448420763015747, "learning_rate": 8.42714620027097e-05, "loss": 0.2002, "step": 4703 }, { "epoch": 1.7370753323485968, "grad_norm": 0.28723815083503723, "learning_rate": 8.424682842714621e-05, "loss": 0.193, "step": 4704 }, { "epoch": 1.7374446085672082, "grad_norm": 0.29733386635780334, "learning_rate": 8.422219485158271e-05, "loss": 0.2499, "step": 4705 }, { "epoch": 1.7378138847858198, "grad_norm": 0.27715975046157837, "learning_rate": 8.419756127601923e-05, "loss": 0.2344, "step": 4706 }, { "epoch": 1.7381831610044314, "grad_norm": 0.2789750397205353, "learning_rate": 8.417292770045573e-05, "loss": 0.2556, "step": 4707 }, { "epoch": 1.738552437223043, "grad_norm": 0.27307644486427307, "learning_rate": 8.414829412489223e-05, "loss": 0.2221, "step": 4708 }, { "epoch": 1.7389217134416544, "grad_norm": 0.288707435131073, "learning_rate": 8.412366054932874e-05, "loss": 0.2176, "step": 4709 }, { "epoch": 1.7392909896602657, "grad_norm": 0.3327498733997345, "learning_rate": 8.409902697376524e-05, "loss": 0.2652, "step": 4710 }, { "epoch": 1.7396602658788773, "grad_norm": 0.405509352684021, "learning_rate": 8.407439339820176e-05, "loss": 0.2141, "step": 4711 }, { "epoch": 1.740029542097489, "grad_norm": 0.29631343483924866, "learning_rate": 8.404975982263826e-05, "loss": 0.2259, "step": 4712 }, { "epoch": 1.7403988183161005, "grad_norm": 0.2549144923686981, "learning_rate": 8.402512624707477e-05, "loss": 0.2226, "step": 4713 }, { "epoch": 1.740768094534712, "grad_norm": 0.30277979373931885, "learning_rate": 8.400049267151128e-05, "loss": 0.226, "step": 4714 }, { "epoch": 1.7411373707533235, "grad_norm": 0.24072663486003876, "learning_rate": 8.397585909594779e-05, "loss": 0.2008, "step": 4715 }, { "epoch": 1.741506646971935, "grad_norm": 0.3189060688018799, "learning_rate": 8.395122552038429e-05, "loss": 0.2356, "step": 4716 }, { "epoch": 1.7418759231905465, "grad_norm": 0.2404140830039978, "learning_rate": 8.392659194482079e-05, "loss": 0.2212, "step": 4717 }, { "epoch": 1.7422451994091581, "grad_norm": 0.29535868763923645, "learning_rate": 8.390195836925731e-05, "loss": 0.2348, "step": 4718 }, { "epoch": 1.7426144756277697, "grad_norm": 0.27296799421310425, "learning_rate": 8.387732479369381e-05, "loss": 0.2101, "step": 4719 }, { "epoch": 1.742983751846381, "grad_norm": 0.2823295593261719, "learning_rate": 8.385269121813032e-05, "loss": 0.216, "step": 4720 }, { "epoch": 1.7433530280649925, "grad_norm": 0.3427070081233978, "learning_rate": 8.382805764256682e-05, "loss": 0.2699, "step": 4721 }, { "epoch": 1.743722304283604, "grad_norm": 0.26912540197372437, "learning_rate": 8.380342406700334e-05, "loss": 0.2049, "step": 4722 }, { "epoch": 1.7440915805022157, "grad_norm": 0.30528247356414795, "learning_rate": 8.377879049143984e-05, "loss": 0.2585, "step": 4723 }, { "epoch": 1.7444608567208273, "grad_norm": 0.2788606882095337, "learning_rate": 8.375415691587634e-05, "loss": 0.2297, "step": 4724 }, { "epoch": 1.7448301329394387, "grad_norm": 0.3505759537220001, "learning_rate": 8.372952334031286e-05, "loss": 0.2437, "step": 4725 }, { "epoch": 1.7451994091580503, "grad_norm": 0.2676331400871277, "learning_rate": 8.370488976474936e-05, "loss": 0.1927, "step": 4726 }, { "epoch": 1.7455686853766617, "grad_norm": 0.4100620746612549, "learning_rate": 8.368025618918587e-05, "loss": 0.3176, "step": 4727 }, { "epoch": 1.7459379615952733, "grad_norm": 0.25829556584358215, "learning_rate": 8.365562261362237e-05, "loss": 0.212, "step": 4728 }, { "epoch": 1.7463072378138849, "grad_norm": 0.23964823782444, "learning_rate": 8.363098903805889e-05, "loss": 0.2282, "step": 4729 }, { "epoch": 1.7466765140324965, "grad_norm": 0.2864070236682892, "learning_rate": 8.360635546249539e-05, "loss": 0.2392, "step": 4730 }, { "epoch": 1.7470457902511078, "grad_norm": 0.2946716547012329, "learning_rate": 8.35817218869319e-05, "loss": 0.2233, "step": 4731 }, { "epoch": 1.7474150664697192, "grad_norm": 0.27674975991249084, "learning_rate": 8.35570883113684e-05, "loss": 0.2156, "step": 4732 }, { "epoch": 1.7477843426883308, "grad_norm": 0.2801598608493805, "learning_rate": 8.35324547358049e-05, "loss": 0.2279, "step": 4733 }, { "epoch": 1.7481536189069424, "grad_norm": 0.25461244583129883, "learning_rate": 8.350782116024142e-05, "loss": 0.2174, "step": 4734 }, { "epoch": 1.748522895125554, "grad_norm": 0.29004809260368347, "learning_rate": 8.348318758467792e-05, "loss": 0.259, "step": 4735 }, { "epoch": 1.7488921713441654, "grad_norm": 0.37906742095947266, "learning_rate": 8.345855400911444e-05, "loss": 0.2609, "step": 4736 }, { "epoch": 1.7492614475627768, "grad_norm": 0.2857353389263153, "learning_rate": 8.343392043355094e-05, "loss": 0.2181, "step": 4737 }, { "epoch": 1.7496307237813884, "grad_norm": 0.2577114999294281, "learning_rate": 8.340928685798745e-05, "loss": 0.2202, "step": 4738 }, { "epoch": 1.75, "grad_norm": 0.36852386593818665, "learning_rate": 8.338465328242395e-05, "loss": 0.2747, "step": 4739 }, { "epoch": 1.7503692762186116, "grad_norm": 0.26849859952926636, "learning_rate": 8.336001970686045e-05, "loss": 0.2054, "step": 4740 }, { "epoch": 1.7507385524372232, "grad_norm": 0.24560749530792236, "learning_rate": 8.333538613129697e-05, "loss": 0.1925, "step": 4741 }, { "epoch": 1.7511078286558346, "grad_norm": 0.2803412973880768, "learning_rate": 8.331075255573347e-05, "loss": 0.2511, "step": 4742 }, { "epoch": 1.751477104874446, "grad_norm": 0.24951079487800598, "learning_rate": 8.328611898016998e-05, "loss": 0.2099, "step": 4743 }, { "epoch": 1.7518463810930576, "grad_norm": 0.2854885160923004, "learning_rate": 8.326148540460648e-05, "loss": 0.2514, "step": 4744 }, { "epoch": 1.7522156573116692, "grad_norm": 0.33116501569747925, "learning_rate": 8.3236851829043e-05, "loss": 0.2803, "step": 4745 }, { "epoch": 1.7525849335302808, "grad_norm": 0.23747555911540985, "learning_rate": 8.32122182534795e-05, "loss": 0.1952, "step": 4746 }, { "epoch": 1.7529542097488922, "grad_norm": 0.2971426248550415, "learning_rate": 8.318758467791602e-05, "loss": 0.2456, "step": 4747 }, { "epoch": 1.7533234859675035, "grad_norm": 0.2350413203239441, "learning_rate": 8.316295110235252e-05, "loss": 0.1831, "step": 4748 }, { "epoch": 1.7536927621861151, "grad_norm": 0.2742297947406769, "learning_rate": 8.313831752678902e-05, "loss": 0.2131, "step": 4749 }, { "epoch": 1.7540620384047267, "grad_norm": 0.2849682867527008, "learning_rate": 8.311368395122553e-05, "loss": 0.23, "step": 4750 }, { "epoch": 1.7540620384047267, "eval_loss": 0.25629734992980957, "eval_runtime": 5.8662, "eval_samples_per_second": 8.523, "eval_steps_per_second": 1.193, "step": 4750 }, { "epoch": 1.7544313146233383, "grad_norm": 0.2834409475326538, "learning_rate": 8.308905037566203e-05, "loss": 0.2206, "step": 4751 }, { "epoch": 1.7548005908419497, "grad_norm": 0.24136458337306976, "learning_rate": 8.306441680009855e-05, "loss": 0.2136, "step": 4752 }, { "epoch": 1.7551698670605613, "grad_norm": 0.28627488017082214, "learning_rate": 8.303978322453505e-05, "loss": 0.2252, "step": 4753 }, { "epoch": 1.7555391432791727, "grad_norm": 0.27557557821273804, "learning_rate": 8.301514964897156e-05, "loss": 0.2256, "step": 4754 }, { "epoch": 1.7559084194977843, "grad_norm": 0.28202036023139954, "learning_rate": 8.299051607340806e-05, "loss": 0.2478, "step": 4755 }, { "epoch": 1.756277695716396, "grad_norm": 0.2877001464366913, "learning_rate": 8.296588249784457e-05, "loss": 0.2426, "step": 4756 }, { "epoch": 1.7566469719350075, "grad_norm": 0.2511405646800995, "learning_rate": 8.294124892228108e-05, "loss": 0.2104, "step": 4757 }, { "epoch": 1.757016248153619, "grad_norm": 0.24227432906627655, "learning_rate": 8.291661534671758e-05, "loss": 0.2121, "step": 4758 }, { "epoch": 1.7573855243722303, "grad_norm": 0.2546199560165405, "learning_rate": 8.28919817711541e-05, "loss": 0.2281, "step": 4759 }, { "epoch": 1.7577548005908419, "grad_norm": 0.31248635053634644, "learning_rate": 8.28673481955906e-05, "loss": 0.2274, "step": 4760 }, { "epoch": 1.7581240768094535, "grad_norm": 0.2701592743396759, "learning_rate": 8.28427146200271e-05, "loss": 0.21, "step": 4761 }, { "epoch": 1.758493353028065, "grad_norm": 0.29244470596313477, "learning_rate": 8.28180810444636e-05, "loss": 0.2072, "step": 4762 }, { "epoch": 1.7588626292466765, "grad_norm": 0.25586065649986267, "learning_rate": 8.279344746890011e-05, "loss": 0.2153, "step": 4763 }, { "epoch": 1.759231905465288, "grad_norm": 0.3217445909976959, "learning_rate": 8.276881389333661e-05, "loss": 0.2646, "step": 4764 }, { "epoch": 1.7596011816838995, "grad_norm": 0.32226285338401794, "learning_rate": 8.274418031777313e-05, "loss": 0.2796, "step": 4765 }, { "epoch": 1.759970457902511, "grad_norm": 0.3295985758304596, "learning_rate": 8.271954674220963e-05, "loss": 0.2184, "step": 4766 }, { "epoch": 1.7603397341211227, "grad_norm": 0.28168031573295593, "learning_rate": 8.269491316664613e-05, "loss": 0.249, "step": 4767 }, { "epoch": 1.7607090103397343, "grad_norm": 0.2556081712245941, "learning_rate": 8.267027959108265e-05, "loss": 0.2415, "step": 4768 }, { "epoch": 1.7610782865583456, "grad_norm": 0.2822915315628052, "learning_rate": 8.264564601551915e-05, "loss": 0.2393, "step": 4769 }, { "epoch": 1.761447562776957, "grad_norm": 0.2232300341129303, "learning_rate": 8.262101243995566e-05, "loss": 0.1948, "step": 4770 }, { "epoch": 1.7618168389955686, "grad_norm": 0.2830291986465454, "learning_rate": 8.259637886439216e-05, "loss": 0.2162, "step": 4771 }, { "epoch": 1.7621861152141802, "grad_norm": 0.2080407440662384, "learning_rate": 8.257174528882868e-05, "loss": 0.1822, "step": 4772 }, { "epoch": 1.7625553914327918, "grad_norm": 0.27873337268829346, "learning_rate": 8.254711171326518e-05, "loss": 0.2165, "step": 4773 }, { "epoch": 1.7629246676514032, "grad_norm": 0.22472889721393585, "learning_rate": 8.252247813770168e-05, "loss": 0.2033, "step": 4774 }, { "epoch": 1.7632939438700148, "grad_norm": 0.24323783814907074, "learning_rate": 8.24978445621382e-05, "loss": 0.1833, "step": 4775 }, { "epoch": 1.7636632200886262, "grad_norm": 0.344291090965271, "learning_rate": 8.24732109865747e-05, "loss": 0.2771, "step": 4776 }, { "epoch": 1.7640324963072378, "grad_norm": 0.23993299901485443, "learning_rate": 8.244857741101121e-05, "loss": 0.2151, "step": 4777 }, { "epoch": 1.7644017725258494, "grad_norm": 0.2538295090198517, "learning_rate": 8.242394383544771e-05, "loss": 0.193, "step": 4778 }, { "epoch": 1.764771048744461, "grad_norm": 0.24399816989898682, "learning_rate": 8.239931025988423e-05, "loss": 0.2047, "step": 4779 }, { "epoch": 1.7651403249630724, "grad_norm": 0.27396318316459656, "learning_rate": 8.237467668432073e-05, "loss": 0.2298, "step": 4780 }, { "epoch": 1.7655096011816838, "grad_norm": 0.2602836489677429, "learning_rate": 8.235004310875724e-05, "loss": 0.2292, "step": 4781 }, { "epoch": 1.7658788774002954, "grad_norm": 0.3122258186340332, "learning_rate": 8.232540953319374e-05, "loss": 0.2347, "step": 4782 }, { "epoch": 1.766248153618907, "grad_norm": 0.21484318375587463, "learning_rate": 8.230077595763024e-05, "loss": 0.1888, "step": 4783 }, { "epoch": 1.7666174298375186, "grad_norm": 0.3026772439479828, "learning_rate": 8.227614238206676e-05, "loss": 0.2152, "step": 4784 }, { "epoch": 1.76698670605613, "grad_norm": 0.2766391634941101, "learning_rate": 8.225150880650326e-05, "loss": 0.2407, "step": 4785 }, { "epoch": 1.7673559822747416, "grad_norm": 0.3706916868686676, "learning_rate": 8.222687523093977e-05, "loss": 0.2933, "step": 4786 }, { "epoch": 1.767725258493353, "grad_norm": 0.2903095483779907, "learning_rate": 8.220224165537628e-05, "loss": 0.2039, "step": 4787 }, { "epoch": 1.7680945347119645, "grad_norm": 0.2527204155921936, "learning_rate": 8.217760807981279e-05, "loss": 0.206, "step": 4788 }, { "epoch": 1.7684638109305761, "grad_norm": 0.2885925769805908, "learning_rate": 8.215297450424929e-05, "loss": 0.2627, "step": 4789 }, { "epoch": 1.7688330871491877, "grad_norm": 0.2507830262184143, "learning_rate": 8.212834092868579e-05, "loss": 0.2128, "step": 4790 }, { "epoch": 1.7692023633677991, "grad_norm": 0.28362154960632324, "learning_rate": 8.210370735312231e-05, "loss": 0.2192, "step": 4791 }, { "epoch": 1.7695716395864105, "grad_norm": 0.2576613426208496, "learning_rate": 8.207907377755881e-05, "loss": 0.1932, "step": 4792 }, { "epoch": 1.769940915805022, "grad_norm": 0.27201154828071594, "learning_rate": 8.205444020199532e-05, "loss": 0.243, "step": 4793 }, { "epoch": 1.7703101920236337, "grad_norm": 0.24626173079013824, "learning_rate": 8.202980662643182e-05, "loss": 0.2322, "step": 4794 }, { "epoch": 1.7706794682422453, "grad_norm": 0.24108372628688812, "learning_rate": 8.200517305086834e-05, "loss": 0.2444, "step": 4795 }, { "epoch": 1.7710487444608567, "grad_norm": 0.3414941430091858, "learning_rate": 8.198053947530484e-05, "loss": 0.2722, "step": 4796 }, { "epoch": 1.7714180206794683, "grad_norm": 0.3452490270137787, "learning_rate": 8.195590589974135e-05, "loss": 0.2412, "step": 4797 }, { "epoch": 1.7717872968980797, "grad_norm": 0.3017958104610443, "learning_rate": 8.193127232417786e-05, "loss": 0.2278, "step": 4798 }, { "epoch": 1.7721565731166913, "grad_norm": 0.25969305634498596, "learning_rate": 8.190663874861436e-05, "loss": 0.2016, "step": 4799 }, { "epoch": 1.7725258493353029, "grad_norm": 0.24949228763580322, "learning_rate": 8.188200517305087e-05, "loss": 0.2216, "step": 4800 }, { "epoch": 1.7725258493353029, "eval_loss": 0.2572707235813141, "eval_runtime": 5.8665, "eval_samples_per_second": 8.523, "eval_steps_per_second": 1.193, "step": 4800 }, { "epoch": 1.7728951255539145, "grad_norm": 0.3111516833305359, "learning_rate": 8.185737159748737e-05, "loss": 0.2475, "step": 4801 }, { "epoch": 1.7732644017725259, "grad_norm": 0.26070913672447205, "learning_rate": 8.183273802192389e-05, "loss": 0.2392, "step": 4802 }, { "epoch": 1.7736336779911372, "grad_norm": 0.29763057827949524, "learning_rate": 8.180810444636039e-05, "loss": 0.2547, "step": 4803 }, { "epoch": 1.7740029542097489, "grad_norm": 0.342492938041687, "learning_rate": 8.17834708707969e-05, "loss": 0.2694, "step": 4804 }, { "epoch": 1.7743722304283605, "grad_norm": 0.34890803694725037, "learning_rate": 8.17588372952334e-05, "loss": 0.2875, "step": 4805 }, { "epoch": 1.774741506646972, "grad_norm": 0.2868111729621887, "learning_rate": 8.17342037196699e-05, "loss": 0.2002, "step": 4806 }, { "epoch": 1.7751107828655834, "grad_norm": 0.2373673915863037, "learning_rate": 8.170957014410642e-05, "loss": 0.2005, "step": 4807 }, { "epoch": 1.7754800590841948, "grad_norm": 0.30628499388694763, "learning_rate": 8.168493656854292e-05, "loss": 0.225, "step": 4808 }, { "epoch": 1.7758493353028064, "grad_norm": 0.2583736181259155, "learning_rate": 8.166030299297943e-05, "loss": 0.2286, "step": 4809 }, { "epoch": 1.776218611521418, "grad_norm": 0.2954586148262024, "learning_rate": 8.163566941741594e-05, "loss": 0.279, "step": 4810 }, { "epoch": 1.7765878877400296, "grad_norm": 0.23172208666801453, "learning_rate": 8.161103584185245e-05, "loss": 0.2113, "step": 4811 }, { "epoch": 1.7769571639586412, "grad_norm": 0.28336572647094727, "learning_rate": 8.158640226628895e-05, "loss": 0.2553, "step": 4812 }, { "epoch": 1.7773264401772526, "grad_norm": 0.3208393454551697, "learning_rate": 8.156176869072547e-05, "loss": 0.2283, "step": 4813 }, { "epoch": 1.777695716395864, "grad_norm": 0.30786389112472534, "learning_rate": 8.153713511516197e-05, "loss": 0.2594, "step": 4814 }, { "epoch": 1.7780649926144756, "grad_norm": 0.30445629358291626, "learning_rate": 8.151250153959847e-05, "loss": 0.276, "step": 4815 }, { "epoch": 1.7784342688330872, "grad_norm": 0.3171381652355194, "learning_rate": 8.148786796403498e-05, "loss": 0.2435, "step": 4816 }, { "epoch": 1.7788035450516988, "grad_norm": 0.36137646436691284, "learning_rate": 8.146323438847148e-05, "loss": 0.334, "step": 4817 }, { "epoch": 1.7791728212703102, "grad_norm": 0.3493507504463196, "learning_rate": 8.1438600812908e-05, "loss": 0.2908, "step": 4818 }, { "epoch": 1.7795420974889216, "grad_norm": 0.2983262836933136, "learning_rate": 8.14139672373445e-05, "loss": 0.2556, "step": 4819 }, { "epoch": 1.7799113737075332, "grad_norm": 0.23181068897247314, "learning_rate": 8.138933366178101e-05, "loss": 0.2151, "step": 4820 }, { "epoch": 1.7802806499261448, "grad_norm": 0.3134463429450989, "learning_rate": 8.136470008621752e-05, "loss": 0.247, "step": 4821 }, { "epoch": 1.7806499261447564, "grad_norm": 0.22138331830501556, "learning_rate": 8.134006651065402e-05, "loss": 0.1989, "step": 4822 }, { "epoch": 1.7810192023633677, "grad_norm": 0.30223992466926575, "learning_rate": 8.131543293509053e-05, "loss": 0.2792, "step": 4823 }, { "epoch": 1.7813884785819794, "grad_norm": 0.22743350267410278, "learning_rate": 8.129079935952703e-05, "loss": 0.2085, "step": 4824 }, { "epoch": 1.7817577548005907, "grad_norm": 0.23054760694503784, "learning_rate": 8.126616578396355e-05, "loss": 0.1987, "step": 4825 }, { "epoch": 1.7821270310192023, "grad_norm": 0.2348661571741104, "learning_rate": 8.124153220840005e-05, "loss": 0.2121, "step": 4826 }, { "epoch": 1.782496307237814, "grad_norm": 0.27090921998023987, "learning_rate": 8.121689863283656e-05, "loss": 0.2425, "step": 4827 }, { "epoch": 1.7828655834564255, "grad_norm": 0.27171093225479126, "learning_rate": 8.119226505727306e-05, "loss": 0.2258, "step": 4828 }, { "epoch": 1.783234859675037, "grad_norm": 0.29861265420913696, "learning_rate": 8.116763148170956e-05, "loss": 0.222, "step": 4829 }, { "epoch": 1.7836041358936483, "grad_norm": 0.3010806441307068, "learning_rate": 8.114299790614608e-05, "loss": 0.2617, "step": 4830 }, { "epoch": 1.78397341211226, "grad_norm": 0.30580946803092957, "learning_rate": 8.111836433058258e-05, "loss": 0.2289, "step": 4831 }, { "epoch": 1.7843426883308715, "grad_norm": 0.29021722078323364, "learning_rate": 8.10937307550191e-05, "loss": 0.2407, "step": 4832 }, { "epoch": 1.784711964549483, "grad_norm": 0.2731410264968872, "learning_rate": 8.10690971794556e-05, "loss": 0.2624, "step": 4833 }, { "epoch": 1.7850812407680945, "grad_norm": 0.24605174362659454, "learning_rate": 8.104446360389211e-05, "loss": 0.1958, "step": 4834 }, { "epoch": 1.785450516986706, "grad_norm": 0.22555018961429596, "learning_rate": 8.101983002832861e-05, "loss": 0.2166, "step": 4835 }, { "epoch": 1.7858197932053175, "grad_norm": 0.27406737208366394, "learning_rate": 8.099519645276513e-05, "loss": 0.2206, "step": 4836 }, { "epoch": 1.786189069423929, "grad_norm": 0.35636645555496216, "learning_rate": 8.097056287720163e-05, "loss": 0.208, "step": 4837 }, { "epoch": 1.7865583456425407, "grad_norm": 0.25280651450157166, "learning_rate": 8.094592930163813e-05, "loss": 0.2125, "step": 4838 }, { "epoch": 1.7869276218611523, "grad_norm": 0.27054616808891296, "learning_rate": 8.092129572607464e-05, "loss": 0.2327, "step": 4839 }, { "epoch": 1.7872968980797637, "grad_norm": 0.2454398274421692, "learning_rate": 8.089666215051114e-05, "loss": 0.2025, "step": 4840 }, { "epoch": 1.787666174298375, "grad_norm": 0.24820207059383392, "learning_rate": 8.087202857494766e-05, "loss": 0.1859, "step": 4841 }, { "epoch": 1.7880354505169866, "grad_norm": 0.2680160403251648, "learning_rate": 8.084739499938416e-05, "loss": 0.1961, "step": 4842 }, { "epoch": 1.7884047267355982, "grad_norm": 0.27619653940200806, "learning_rate": 8.082276142382068e-05, "loss": 0.2179, "step": 4843 }, { "epoch": 1.7887740029542099, "grad_norm": 0.2535490393638611, "learning_rate": 8.079812784825718e-05, "loss": 0.2169, "step": 4844 }, { "epoch": 1.7891432791728212, "grad_norm": 0.27932682633399963, "learning_rate": 8.077349427269368e-05, "loss": 0.2326, "step": 4845 }, { "epoch": 1.7895125553914328, "grad_norm": 0.2859097421169281, "learning_rate": 8.074886069713019e-05, "loss": 0.2111, "step": 4846 }, { "epoch": 1.7898818316100442, "grad_norm": 0.3570568263530731, "learning_rate": 8.072422712156669e-05, "loss": 0.268, "step": 4847 }, { "epoch": 1.7902511078286558, "grad_norm": 0.2741071879863739, "learning_rate": 8.069959354600321e-05, "loss": 0.2286, "step": 4848 }, { "epoch": 1.7906203840472674, "grad_norm": 0.3137156069278717, "learning_rate": 8.067495997043971e-05, "loss": 0.2646, "step": 4849 }, { "epoch": 1.790989660265879, "grad_norm": 0.300356388092041, "learning_rate": 8.065032639487622e-05, "loss": 0.2334, "step": 4850 }, { "epoch": 1.790989660265879, "eval_loss": 0.2527002692222595, "eval_runtime": 5.8733, "eval_samples_per_second": 8.513, "eval_steps_per_second": 1.192, "step": 4850 }, { "epoch": 1.7913589364844904, "grad_norm": 0.2975861430168152, "learning_rate": 8.062569281931272e-05, "loss": 0.2254, "step": 4851 }, { "epoch": 1.7917282127031018, "grad_norm": 0.2494833618402481, "learning_rate": 8.060105924374924e-05, "loss": 0.2048, "step": 4852 }, { "epoch": 1.7920974889217134, "grad_norm": 0.31400489807128906, "learning_rate": 8.057642566818574e-05, "loss": 0.1961, "step": 4853 }, { "epoch": 1.792466765140325, "grad_norm": 0.2428143173456192, "learning_rate": 8.055179209262224e-05, "loss": 0.2085, "step": 4854 }, { "epoch": 1.7928360413589366, "grad_norm": 0.28034815192222595, "learning_rate": 8.052715851705876e-05, "loss": 0.2512, "step": 4855 }, { "epoch": 1.793205317577548, "grad_norm": 0.2713885009288788, "learning_rate": 8.050252494149526e-05, "loss": 0.2438, "step": 4856 }, { "epoch": 1.7935745937961596, "grad_norm": 0.2918667793273926, "learning_rate": 8.047789136593177e-05, "loss": 0.2473, "step": 4857 }, { "epoch": 1.793943870014771, "grad_norm": 0.33813074231147766, "learning_rate": 8.045325779036827e-05, "loss": 0.2621, "step": 4858 }, { "epoch": 1.7943131462333826, "grad_norm": 0.29765844345092773, "learning_rate": 8.042862421480479e-05, "loss": 0.2367, "step": 4859 }, { "epoch": 1.7946824224519942, "grad_norm": 0.2806040048599243, "learning_rate": 8.040399063924129e-05, "loss": 0.2404, "step": 4860 }, { "epoch": 1.7950516986706058, "grad_norm": 0.32674238085746765, "learning_rate": 8.037935706367779e-05, "loss": 0.2396, "step": 4861 }, { "epoch": 1.7954209748892171, "grad_norm": 0.28877851366996765, "learning_rate": 8.03547234881143e-05, "loss": 0.2611, "step": 4862 }, { "epoch": 1.7957902511078285, "grad_norm": 0.2757430076599121, "learning_rate": 8.03300899125508e-05, "loss": 0.1673, "step": 4863 }, { "epoch": 1.7961595273264401, "grad_norm": 0.27084866166114807, "learning_rate": 8.030545633698732e-05, "loss": 0.2196, "step": 4864 }, { "epoch": 1.7965288035450517, "grad_norm": 0.2568441927433014, "learning_rate": 8.028082276142382e-05, "loss": 0.2232, "step": 4865 }, { "epoch": 1.7968980797636633, "grad_norm": 0.27764129638671875, "learning_rate": 8.025618918586034e-05, "loss": 0.264, "step": 4866 }, { "epoch": 1.7972673559822747, "grad_norm": 0.2663974463939667, "learning_rate": 8.023155561029684e-05, "loss": 0.2321, "step": 4867 }, { "epoch": 1.797636632200886, "grad_norm": 0.3205213248729706, "learning_rate": 8.020692203473335e-05, "loss": 0.2311, "step": 4868 }, { "epoch": 1.7980059084194977, "grad_norm": 0.2799781858921051, "learning_rate": 8.018228845916985e-05, "loss": 0.2061, "step": 4869 }, { "epoch": 1.7983751846381093, "grad_norm": 0.27617910504341125, "learning_rate": 8.015765488360635e-05, "loss": 0.2078, "step": 4870 }, { "epoch": 1.798744460856721, "grad_norm": 0.2966643273830414, "learning_rate": 8.013302130804287e-05, "loss": 0.1973, "step": 4871 }, { "epoch": 1.7991137370753325, "grad_norm": 0.2345452457666397, "learning_rate": 8.010838773247937e-05, "loss": 0.2251, "step": 4872 }, { "epoch": 1.799483013293944, "grad_norm": 0.2593734562397003, "learning_rate": 8.008375415691588e-05, "loss": 0.2127, "step": 4873 }, { "epoch": 1.7998522895125553, "grad_norm": 0.3477763533592224, "learning_rate": 8.005912058135238e-05, "loss": 0.2617, "step": 4874 }, { "epoch": 1.8002215657311669, "grad_norm": 0.29197418689727783, "learning_rate": 8.00344870057889e-05, "loss": 0.2382, "step": 4875 }, { "epoch": 1.8005908419497785, "grad_norm": 0.26208192110061646, "learning_rate": 8.00098534302254e-05, "loss": 0.2264, "step": 4876 }, { "epoch": 1.80096011816839, "grad_norm": 0.30810004472732544, "learning_rate": 7.99852198546619e-05, "loss": 0.2339, "step": 4877 }, { "epoch": 1.8013293943870015, "grad_norm": 0.31405359506607056, "learning_rate": 7.996058627909842e-05, "loss": 0.2072, "step": 4878 }, { "epoch": 1.8016986706056128, "grad_norm": 0.22917915880680084, "learning_rate": 7.993595270353492e-05, "loss": 0.2053, "step": 4879 }, { "epoch": 1.8020679468242244, "grad_norm": 0.23804986476898193, "learning_rate": 7.991131912797143e-05, "loss": 0.1675, "step": 4880 }, { "epoch": 1.802437223042836, "grad_norm": 0.24717144668102264, "learning_rate": 7.988668555240793e-05, "loss": 0.21, "step": 4881 }, { "epoch": 1.8028064992614476, "grad_norm": 0.2823777198791504, "learning_rate": 7.986205197684445e-05, "loss": 0.2119, "step": 4882 }, { "epoch": 1.803175775480059, "grad_norm": 0.27875393629074097, "learning_rate": 7.983741840128095e-05, "loss": 0.2538, "step": 4883 }, { "epoch": 1.8035450516986706, "grad_norm": 0.2991819381713867, "learning_rate": 7.981278482571746e-05, "loss": 0.2463, "step": 4884 }, { "epoch": 1.803914327917282, "grad_norm": 0.28031617403030396, "learning_rate": 7.978815125015396e-05, "loss": 0.2174, "step": 4885 }, { "epoch": 1.8042836041358936, "grad_norm": 0.2618853747844696, "learning_rate": 7.976351767459047e-05, "loss": 0.2306, "step": 4886 }, { "epoch": 1.8046528803545052, "grad_norm": 0.27993547916412354, "learning_rate": 7.973888409902698e-05, "loss": 0.2692, "step": 4887 }, { "epoch": 1.8050221565731168, "grad_norm": 0.29700198769569397, "learning_rate": 7.971425052346348e-05, "loss": 0.2188, "step": 4888 }, { "epoch": 1.8053914327917282, "grad_norm": 0.26962313055992126, "learning_rate": 7.96896169479e-05, "loss": 0.1874, "step": 4889 }, { "epoch": 1.8057607090103396, "grad_norm": 0.2779366672039032, "learning_rate": 7.96649833723365e-05, "loss": 0.212, "step": 4890 }, { "epoch": 1.8061299852289512, "grad_norm": 0.3361537754535675, "learning_rate": 7.964034979677301e-05, "loss": 0.2599, "step": 4891 }, { "epoch": 1.8064992614475628, "grad_norm": 0.2609734833240509, "learning_rate": 7.961571622120951e-05, "loss": 0.202, "step": 4892 }, { "epoch": 1.8068685376661744, "grad_norm": 0.28654301166534424, "learning_rate": 7.959108264564601e-05, "loss": 0.2053, "step": 4893 }, { "epoch": 1.8072378138847858, "grad_norm": 0.2660316526889801, "learning_rate": 7.956644907008253e-05, "loss": 0.2237, "step": 4894 }, { "epoch": 1.8076070901033974, "grad_norm": 0.28078603744506836, "learning_rate": 7.954181549451903e-05, "loss": 0.2098, "step": 4895 }, { "epoch": 1.8079763663220088, "grad_norm": 0.30883359909057617, "learning_rate": 7.951718191895554e-05, "loss": 0.2774, "step": 4896 }, { "epoch": 1.8083456425406204, "grad_norm": 0.2667382061481476, "learning_rate": 7.949254834339205e-05, "loss": 0.2029, "step": 4897 }, { "epoch": 1.808714918759232, "grad_norm": 0.24940231442451477, "learning_rate": 7.946791476782856e-05, "loss": 0.207, "step": 4898 }, { "epoch": 1.8090841949778436, "grad_norm": 0.2791793942451477, "learning_rate": 7.944328119226506e-05, "loss": 0.2278, "step": 4899 }, { "epoch": 1.809453471196455, "grad_norm": 0.30021530389785767, "learning_rate": 7.941864761670158e-05, "loss": 0.2268, "step": 4900 }, { "epoch": 1.809453471196455, "eval_loss": 0.25379928946495056, "eval_runtime": 5.8681, "eval_samples_per_second": 8.521, "eval_steps_per_second": 1.193, "step": 4900 }, { "epoch": 1.8098227474150663, "grad_norm": 0.27923181653022766, "learning_rate": 7.939401404113808e-05, "loss": 0.2709, "step": 4901 }, { "epoch": 1.810192023633678, "grad_norm": 0.26607418060302734, "learning_rate": 7.936938046557458e-05, "loss": 0.2299, "step": 4902 }, { "epoch": 1.8105612998522895, "grad_norm": 0.28474894165992737, "learning_rate": 7.934474689001109e-05, "loss": 0.2338, "step": 4903 }, { "epoch": 1.8109305760709011, "grad_norm": 0.3211461901664734, "learning_rate": 7.93201133144476e-05, "loss": 0.2629, "step": 4904 }, { "epoch": 1.8112998522895125, "grad_norm": 0.3524608612060547, "learning_rate": 7.929547973888411e-05, "loss": 0.2475, "step": 4905 }, { "epoch": 1.8116691285081241, "grad_norm": 0.3104952871799469, "learning_rate": 7.927084616332061e-05, "loss": 0.2325, "step": 4906 }, { "epoch": 1.8120384047267355, "grad_norm": 0.2545289099216461, "learning_rate": 7.924621258775712e-05, "loss": 0.234, "step": 4907 }, { "epoch": 1.812407680945347, "grad_norm": 0.37227439880371094, "learning_rate": 7.922157901219363e-05, "loss": 0.319, "step": 4908 }, { "epoch": 1.8127769571639587, "grad_norm": 0.23678454756736755, "learning_rate": 7.919694543663013e-05, "loss": 0.2029, "step": 4909 }, { "epoch": 1.8131462333825703, "grad_norm": 0.29453420639038086, "learning_rate": 7.917231186106664e-05, "loss": 0.2516, "step": 4910 }, { "epoch": 1.8135155096011817, "grad_norm": 0.298469215631485, "learning_rate": 7.914767828550314e-05, "loss": 0.2024, "step": 4911 }, { "epoch": 1.813884785819793, "grad_norm": 0.29158827662467957, "learning_rate": 7.912304470993966e-05, "loss": 0.2493, "step": 4912 }, { "epoch": 1.8142540620384047, "grad_norm": 0.36862993240356445, "learning_rate": 7.909841113437616e-05, "loss": 0.2848, "step": 4913 }, { "epoch": 1.8146233382570163, "grad_norm": 0.3295852541923523, "learning_rate": 7.907377755881267e-05, "loss": 0.2472, "step": 4914 }, { "epoch": 1.8149926144756279, "grad_norm": 0.2569772005081177, "learning_rate": 7.904914398324917e-05, "loss": 0.2247, "step": 4915 }, { "epoch": 1.8153618906942393, "grad_norm": 0.26352742314338684, "learning_rate": 7.902451040768567e-05, "loss": 0.2043, "step": 4916 }, { "epoch": 1.8157311669128509, "grad_norm": 0.2315554916858673, "learning_rate": 7.899987683212219e-05, "loss": 0.2121, "step": 4917 }, { "epoch": 1.8161004431314622, "grad_norm": 0.23188723623752594, "learning_rate": 7.897524325655869e-05, "loss": 0.1938, "step": 4918 }, { "epoch": 1.8164697193500738, "grad_norm": 0.2778877317905426, "learning_rate": 7.89506096809952e-05, "loss": 0.2, "step": 4919 }, { "epoch": 1.8168389955686854, "grad_norm": 0.2500912547111511, "learning_rate": 7.89259761054317e-05, "loss": 0.201, "step": 4920 }, { "epoch": 1.817208271787297, "grad_norm": 0.27916452288627625, "learning_rate": 7.890134252986822e-05, "loss": 0.2155, "step": 4921 }, { "epoch": 1.8175775480059084, "grad_norm": 0.24194051325321198, "learning_rate": 7.887670895430472e-05, "loss": 0.212, "step": 4922 }, { "epoch": 1.8179468242245198, "grad_norm": 0.2625460922718048, "learning_rate": 7.885207537874124e-05, "loss": 0.215, "step": 4923 }, { "epoch": 1.8183161004431314, "grad_norm": 0.30083608627319336, "learning_rate": 7.882744180317774e-05, "loss": 0.2235, "step": 4924 }, { "epoch": 1.818685376661743, "grad_norm": 0.24189461767673492, "learning_rate": 7.880280822761424e-05, "loss": 0.2138, "step": 4925 }, { "epoch": 1.8190546528803546, "grad_norm": 0.3320610821247101, "learning_rate": 7.877817465205075e-05, "loss": 0.256, "step": 4926 }, { "epoch": 1.819423929098966, "grad_norm": 0.31430310010910034, "learning_rate": 7.875354107648725e-05, "loss": 0.2411, "step": 4927 }, { "epoch": 1.8197932053175776, "grad_norm": 0.27846720814704895, "learning_rate": 7.872890750092377e-05, "loss": 0.2235, "step": 4928 }, { "epoch": 1.820162481536189, "grad_norm": 0.3182385265827179, "learning_rate": 7.870427392536027e-05, "loss": 0.253, "step": 4929 }, { "epoch": 1.8205317577548006, "grad_norm": 0.24252618849277496, "learning_rate": 7.867964034979678e-05, "loss": 0.2231, "step": 4930 }, { "epoch": 1.8209010339734122, "grad_norm": 0.2974163293838501, "learning_rate": 7.865500677423329e-05, "loss": 0.2217, "step": 4931 }, { "epoch": 1.8212703101920238, "grad_norm": 0.3330936133861542, "learning_rate": 7.863037319866979e-05, "loss": 0.2496, "step": 4932 }, { "epoch": 1.8216395864106352, "grad_norm": 0.23207621276378632, "learning_rate": 7.86057396231063e-05, "loss": 0.2205, "step": 4933 }, { "epoch": 1.8220088626292466, "grad_norm": 0.29393038153648376, "learning_rate": 7.85811060475428e-05, "loss": 0.2608, "step": 4934 }, { "epoch": 1.8223781388478582, "grad_norm": 0.32417479157447815, "learning_rate": 7.855647247197932e-05, "loss": 0.2421, "step": 4935 }, { "epoch": 1.8227474150664698, "grad_norm": 0.3282499313354492, "learning_rate": 7.853183889641582e-05, "loss": 0.2752, "step": 4936 }, { "epoch": 1.8231166912850814, "grad_norm": 0.2969152331352234, "learning_rate": 7.850720532085233e-05, "loss": 0.2614, "step": 4937 }, { "epoch": 1.8234859675036927, "grad_norm": 0.2571909427642822, "learning_rate": 7.848257174528883e-05, "loss": 0.1975, "step": 4938 }, { "epoch": 1.8238552437223041, "grad_norm": 0.3285469710826874, "learning_rate": 7.845793816972535e-05, "loss": 0.2396, "step": 4939 }, { "epoch": 1.8242245199409157, "grad_norm": 0.2782268524169922, "learning_rate": 7.843330459416185e-05, "loss": 0.1893, "step": 4940 }, { "epoch": 1.8245937961595273, "grad_norm": 0.25318440794944763, "learning_rate": 7.840867101859835e-05, "loss": 0.1934, "step": 4941 }, { "epoch": 1.824963072378139, "grad_norm": 0.2724683880805969, "learning_rate": 7.838403744303487e-05, "loss": 0.2136, "step": 4942 }, { "epoch": 1.8253323485967505, "grad_norm": 0.23463191092014313, "learning_rate": 7.835940386747137e-05, "loss": 0.1947, "step": 4943 }, { "epoch": 1.825701624815362, "grad_norm": 0.2644791007041931, "learning_rate": 7.833477029190788e-05, "loss": 0.2036, "step": 4944 }, { "epoch": 1.8260709010339733, "grad_norm": 0.32884877920150757, "learning_rate": 7.831013671634438e-05, "loss": 0.256, "step": 4945 }, { "epoch": 1.826440177252585, "grad_norm": 0.27714481949806213, "learning_rate": 7.82855031407809e-05, "loss": 0.22, "step": 4946 }, { "epoch": 1.8268094534711965, "grad_norm": 0.3246936798095703, "learning_rate": 7.82608695652174e-05, "loss": 0.2527, "step": 4947 }, { "epoch": 1.827178729689808, "grad_norm": 0.23440565168857574, "learning_rate": 7.82362359896539e-05, "loss": 0.1986, "step": 4948 }, { "epoch": 1.8275480059084195, "grad_norm": 0.22908934950828552, "learning_rate": 7.821160241409041e-05, "loss": 0.1855, "step": 4949 }, { "epoch": 1.8279172821270309, "grad_norm": 0.33465614914894104, "learning_rate": 7.818696883852691e-05, "loss": 0.2077, "step": 4950 }, { "epoch": 1.8279172821270309, "eval_loss": 0.2532367408275604, "eval_runtime": 5.8662, "eval_samples_per_second": 8.523, "eval_steps_per_second": 1.193, "step": 4950 }, { "epoch": 1.8282865583456425, "grad_norm": 0.22871074080467224, "learning_rate": 7.816233526296343e-05, "loss": 0.1997, "step": 4951 }, { "epoch": 1.828655834564254, "grad_norm": 0.28895342350006104, "learning_rate": 7.813770168739993e-05, "loss": 0.2432, "step": 4952 }, { "epoch": 1.8290251107828657, "grad_norm": 0.30292877554893494, "learning_rate": 7.811306811183645e-05, "loss": 0.2621, "step": 4953 }, { "epoch": 1.829394387001477, "grad_norm": 0.26060134172439575, "learning_rate": 7.808843453627295e-05, "loss": 0.1941, "step": 4954 }, { "epoch": 1.8297636632200887, "grad_norm": 0.29909786581993103, "learning_rate": 7.806380096070946e-05, "loss": 0.233, "step": 4955 }, { "epoch": 1.8301329394387, "grad_norm": 0.29001250863075256, "learning_rate": 7.803916738514596e-05, "loss": 0.2142, "step": 4956 }, { "epoch": 1.8305022156573116, "grad_norm": 0.30807846784591675, "learning_rate": 7.801453380958246e-05, "loss": 0.2308, "step": 4957 }, { "epoch": 1.8308714918759232, "grad_norm": 0.26788103580474854, "learning_rate": 7.798990023401898e-05, "loss": 0.2613, "step": 4958 }, { "epoch": 1.8312407680945348, "grad_norm": 0.24326567351818085, "learning_rate": 7.796526665845548e-05, "loss": 0.2056, "step": 4959 }, { "epoch": 1.8316100443131462, "grad_norm": 0.21416209638118744, "learning_rate": 7.7940633082892e-05, "loss": 0.2035, "step": 4960 }, { "epoch": 1.8319793205317576, "grad_norm": 0.28870993852615356, "learning_rate": 7.79159995073285e-05, "loss": 0.2139, "step": 4961 }, { "epoch": 1.8323485967503692, "grad_norm": 0.27659568190574646, "learning_rate": 7.789136593176501e-05, "loss": 0.2399, "step": 4962 }, { "epoch": 1.8327178729689808, "grad_norm": 0.2337316870689392, "learning_rate": 7.786673235620151e-05, "loss": 0.2157, "step": 4963 }, { "epoch": 1.8330871491875924, "grad_norm": 0.2649693489074707, "learning_rate": 7.784209878063801e-05, "loss": 0.2153, "step": 4964 }, { "epoch": 1.8334564254062038, "grad_norm": 0.3539893925189972, "learning_rate": 7.781746520507453e-05, "loss": 0.2891, "step": 4965 }, { "epoch": 1.8338257016248154, "grad_norm": 0.2172566056251526, "learning_rate": 7.779283162951103e-05, "loss": 0.1724, "step": 4966 }, { "epoch": 1.8341949778434268, "grad_norm": 0.24495668709278107, "learning_rate": 7.776819805394754e-05, "loss": 0.2153, "step": 4967 }, { "epoch": 1.8345642540620384, "grad_norm": 0.2146148979663849, "learning_rate": 7.774356447838404e-05, "loss": 0.1984, "step": 4968 }, { "epoch": 1.83493353028065, "grad_norm": 0.2669045627117157, "learning_rate": 7.771893090282056e-05, "loss": 0.2334, "step": 4969 }, { "epoch": 1.8353028064992616, "grad_norm": 0.2946428656578064, "learning_rate": 7.769429732725706e-05, "loss": 0.2593, "step": 4970 }, { "epoch": 1.835672082717873, "grad_norm": 0.21473877131938934, "learning_rate": 7.766966375169357e-05, "loss": 0.159, "step": 4971 }, { "epoch": 1.8360413589364843, "grad_norm": 0.2652877867221832, "learning_rate": 7.764503017613007e-05, "loss": 0.2641, "step": 4972 }, { "epoch": 1.836410635155096, "grad_norm": 0.30034440755844116, "learning_rate": 7.762039660056658e-05, "loss": 0.2464, "step": 4973 }, { "epoch": 1.8367799113737076, "grad_norm": 0.2821067273616791, "learning_rate": 7.759576302500309e-05, "loss": 0.2093, "step": 4974 }, { "epoch": 1.8371491875923192, "grad_norm": 0.2559252381324768, "learning_rate": 7.757112944943959e-05, "loss": 0.197, "step": 4975 }, { "epoch": 1.8375184638109305, "grad_norm": 0.2899158000946045, "learning_rate": 7.75464958738761e-05, "loss": 0.2441, "step": 4976 }, { "epoch": 1.8378877400295421, "grad_norm": 0.2630823254585266, "learning_rate": 7.752186229831261e-05, "loss": 0.2376, "step": 4977 }, { "epoch": 1.8382570162481535, "grad_norm": 0.25868675112724304, "learning_rate": 7.749722872274912e-05, "loss": 0.2282, "step": 4978 }, { "epoch": 1.8386262924667651, "grad_norm": 0.30225345492362976, "learning_rate": 7.747259514718562e-05, "loss": 0.2634, "step": 4979 }, { "epoch": 1.8389955686853767, "grad_norm": 0.3172706067562103, "learning_rate": 7.744796157162212e-05, "loss": 0.221, "step": 4980 }, { "epoch": 1.8393648449039883, "grad_norm": 0.2501530945301056, "learning_rate": 7.742332799605864e-05, "loss": 0.21, "step": 4981 }, { "epoch": 1.8397341211225997, "grad_norm": 0.2722257077693939, "learning_rate": 7.739869442049514e-05, "loss": 0.241, "step": 4982 }, { "epoch": 1.840103397341211, "grad_norm": 0.25727182626724243, "learning_rate": 7.737406084493165e-05, "loss": 0.223, "step": 4983 }, { "epoch": 1.8404726735598227, "grad_norm": 0.2605494260787964, "learning_rate": 7.734942726936816e-05, "loss": 0.1922, "step": 4984 }, { "epoch": 1.8408419497784343, "grad_norm": 0.2520425617694855, "learning_rate": 7.732479369380467e-05, "loss": 0.2214, "step": 4985 }, { "epoch": 1.841211225997046, "grad_norm": 0.2699052393436432, "learning_rate": 7.730016011824117e-05, "loss": 0.2277, "step": 4986 }, { "epoch": 1.8415805022156573, "grad_norm": 0.24361835420131683, "learning_rate": 7.727552654267769e-05, "loss": 0.2008, "step": 4987 }, { "epoch": 1.8419497784342689, "grad_norm": 0.2995036244392395, "learning_rate": 7.725089296711419e-05, "loss": 0.2413, "step": 4988 }, { "epoch": 1.8423190546528803, "grad_norm": 0.31261393427848816, "learning_rate": 7.722625939155069e-05, "loss": 0.2105, "step": 4989 }, { "epoch": 1.8426883308714919, "grad_norm": 0.3322518467903137, "learning_rate": 7.72016258159872e-05, "loss": 0.2204, "step": 4990 }, { "epoch": 1.8430576070901035, "grad_norm": 0.24115926027297974, "learning_rate": 7.71769922404237e-05, "loss": 0.2078, "step": 4991 }, { "epoch": 1.843426883308715, "grad_norm": 0.2732994258403778, "learning_rate": 7.715235866486022e-05, "loss": 0.2078, "step": 4992 }, { "epoch": 1.8437961595273265, "grad_norm": 0.26314637064933777, "learning_rate": 7.71277250892967e-05, "loss": 0.2212, "step": 4993 }, { "epoch": 1.8441654357459378, "grad_norm": 0.2788105010986328, "learning_rate": 7.710309151373322e-05, "loss": 0.2082, "step": 4994 }, { "epoch": 1.8445347119645494, "grad_norm": 0.2987094521522522, "learning_rate": 7.707845793816972e-05, "loss": 0.2213, "step": 4995 }, { "epoch": 1.844903988183161, "grad_norm": 0.29209956526756287, "learning_rate": 7.705382436260624e-05, "loss": 0.2482, "step": 4996 }, { "epoch": 1.8452732644017726, "grad_norm": 0.33971697092056274, "learning_rate": 7.702919078704274e-05, "loss": 0.2339, "step": 4997 }, { "epoch": 1.845642540620384, "grad_norm": 0.27986255288124084, "learning_rate": 7.700455721147924e-05, "loss": 0.2744, "step": 4998 }, { "epoch": 1.8460118168389956, "grad_norm": 0.2867780923843384, "learning_rate": 7.697992363591575e-05, "loss": 0.2315, "step": 4999 }, { "epoch": 1.846381093057607, "grad_norm": 0.26802054047584534, "learning_rate": 7.695529006035225e-05, "loss": 0.1738, "step": 5000 }, { "epoch": 1.846381093057607, "eval_loss": 0.25385627150535583, "eval_runtime": 5.861, "eval_samples_per_second": 8.531, "eval_steps_per_second": 1.194, "step": 5000 }, { "epoch": 1.8467503692762186, "grad_norm": 0.2724579870700836, "learning_rate": 7.693065648478877e-05, "loss": 0.2288, "step": 5001 }, { "epoch": 1.8471196454948302, "grad_norm": 0.2348249852657318, "learning_rate": 7.690602290922527e-05, "loss": 0.1983, "step": 5002 }, { "epoch": 1.8474889217134418, "grad_norm": 0.25820598006248474, "learning_rate": 7.688138933366178e-05, "loss": 0.2524, "step": 5003 }, { "epoch": 1.8478581979320532, "grad_norm": 0.27874141931533813, "learning_rate": 7.685675575809829e-05, "loss": 0.235, "step": 5004 }, { "epoch": 1.8482274741506646, "grad_norm": 0.24114708602428436, "learning_rate": 7.68321221825348e-05, "loss": 0.1999, "step": 5005 }, { "epoch": 1.8485967503692762, "grad_norm": 0.282846599817276, "learning_rate": 7.68074886069713e-05, "loss": 0.2693, "step": 5006 }, { "epoch": 1.8489660265878878, "grad_norm": 0.2789778709411621, "learning_rate": 7.67828550314078e-05, "loss": 0.2348, "step": 5007 }, { "epoch": 1.8493353028064994, "grad_norm": 0.30808788537979126, "learning_rate": 7.675822145584432e-05, "loss": 0.2801, "step": 5008 }, { "epoch": 1.8497045790251108, "grad_norm": 0.2590267062187195, "learning_rate": 7.673358788028082e-05, "loss": 0.2391, "step": 5009 }, { "epoch": 1.8500738552437221, "grad_norm": 0.3276561200618744, "learning_rate": 7.670895430471733e-05, "loss": 0.2578, "step": 5010 }, { "epoch": 1.8504431314623337, "grad_norm": 0.4562641680240631, "learning_rate": 7.668432072915383e-05, "loss": 0.2748, "step": 5011 }, { "epoch": 1.8508124076809453, "grad_norm": 0.24998047947883606, "learning_rate": 7.665968715359035e-05, "loss": 0.2161, "step": 5012 }, { "epoch": 1.851181683899557, "grad_norm": 0.2959882915019989, "learning_rate": 7.663505357802685e-05, "loss": 0.2769, "step": 5013 }, { "epoch": 1.8515509601181686, "grad_norm": 0.2754557728767395, "learning_rate": 7.661042000246335e-05, "loss": 0.2096, "step": 5014 }, { "epoch": 1.85192023633678, "grad_norm": 0.2741212844848633, "learning_rate": 7.658578642689986e-05, "loss": 0.2303, "step": 5015 }, { "epoch": 1.8522895125553913, "grad_norm": 0.2592034935951233, "learning_rate": 7.656115285133637e-05, "loss": 0.2072, "step": 5016 }, { "epoch": 1.852658788774003, "grad_norm": 0.27041250467300415, "learning_rate": 7.653651927577288e-05, "loss": 0.2168, "step": 5017 }, { "epoch": 1.8530280649926145, "grad_norm": 0.2315387725830078, "learning_rate": 7.651188570020938e-05, "loss": 0.1993, "step": 5018 }, { "epoch": 1.8533973412112261, "grad_norm": 0.2641536295413971, "learning_rate": 7.64872521246459e-05, "loss": 0.2271, "step": 5019 }, { "epoch": 1.8537666174298375, "grad_norm": 0.28545939922332764, "learning_rate": 7.64626185490824e-05, "loss": 0.2229, "step": 5020 }, { "epoch": 1.8541358936484489, "grad_norm": 0.2999255657196045, "learning_rate": 7.643798497351891e-05, "loss": 0.245, "step": 5021 }, { "epoch": 1.8545051698670605, "grad_norm": 0.25720641016960144, "learning_rate": 7.641335139795541e-05, "loss": 0.212, "step": 5022 }, { "epoch": 1.854874446085672, "grad_norm": 0.2911759316921234, "learning_rate": 7.638871782239191e-05, "loss": 0.2396, "step": 5023 }, { "epoch": 1.8552437223042837, "grad_norm": 0.2456437200307846, "learning_rate": 7.636408424682843e-05, "loss": 0.2242, "step": 5024 }, { "epoch": 1.855612998522895, "grad_norm": 0.23431618511676788, "learning_rate": 7.633945067126493e-05, "loss": 0.2165, "step": 5025 }, { "epoch": 1.8559822747415067, "grad_norm": 0.28363487124443054, "learning_rate": 7.631481709570144e-05, "loss": 0.2407, "step": 5026 }, { "epoch": 1.856351550960118, "grad_norm": 0.2409060150384903, "learning_rate": 7.629018352013795e-05, "loss": 0.2017, "step": 5027 }, { "epoch": 1.8567208271787297, "grad_norm": 0.3640720546245575, "learning_rate": 7.626554994457446e-05, "loss": 0.2299, "step": 5028 }, { "epoch": 1.8570901033973413, "grad_norm": 0.2626873850822449, "learning_rate": 7.624091636901096e-05, "loss": 0.1962, "step": 5029 }, { "epoch": 1.8574593796159529, "grad_norm": 0.2808993458747864, "learning_rate": 7.621628279344746e-05, "loss": 0.224, "step": 5030 }, { "epoch": 1.8578286558345642, "grad_norm": 0.23122955858707428, "learning_rate": 7.619164921788398e-05, "loss": 0.1914, "step": 5031 }, { "epoch": 1.8581979320531756, "grad_norm": 0.2904467284679413, "learning_rate": 7.616701564232048e-05, "loss": 0.2252, "step": 5032 }, { "epoch": 1.8585672082717872, "grad_norm": 0.27927500009536743, "learning_rate": 7.614238206675699e-05, "loss": 0.211, "step": 5033 }, { "epoch": 1.8589364844903988, "grad_norm": 0.3011045455932617, "learning_rate": 7.61177484911935e-05, "loss": 0.235, "step": 5034 }, { "epoch": 1.8593057607090104, "grad_norm": 0.28089842200279236, "learning_rate": 7.609311491563001e-05, "loss": 0.2617, "step": 5035 }, { "epoch": 1.8596750369276218, "grad_norm": 0.2582519054412842, "learning_rate": 7.606848134006651e-05, "loss": 0.2072, "step": 5036 }, { "epoch": 1.8600443131462334, "grad_norm": 0.3025238513946533, "learning_rate": 7.604384776450302e-05, "loss": 0.2946, "step": 5037 }, { "epoch": 1.8604135893648448, "grad_norm": 0.276224821805954, "learning_rate": 7.601921418893953e-05, "loss": 0.2209, "step": 5038 }, { "epoch": 1.8607828655834564, "grad_norm": 0.23579522967338562, "learning_rate": 7.599458061337603e-05, "loss": 0.206, "step": 5039 }, { "epoch": 1.861152141802068, "grad_norm": 0.29115140438079834, "learning_rate": 7.596994703781254e-05, "loss": 0.2361, "step": 5040 }, { "epoch": 1.8615214180206796, "grad_norm": 0.2666078209877014, "learning_rate": 7.594531346224904e-05, "loss": 0.2266, "step": 5041 }, { "epoch": 1.861890694239291, "grad_norm": 0.2762908637523651, "learning_rate": 7.592067988668556e-05, "loss": 0.2058, "step": 5042 }, { "epoch": 1.8622599704579024, "grad_norm": 0.2928195297718048, "learning_rate": 7.589604631112206e-05, "loss": 0.219, "step": 5043 }, { "epoch": 1.862629246676514, "grad_norm": 0.31124547123908997, "learning_rate": 7.587141273555857e-05, "loss": 0.256, "step": 5044 }, { "epoch": 1.8629985228951256, "grad_norm": 0.2583990693092346, "learning_rate": 7.584677915999507e-05, "loss": 0.2116, "step": 5045 }, { "epoch": 1.8633677991137372, "grad_norm": 0.291933536529541, "learning_rate": 7.582214558443157e-05, "loss": 0.2408, "step": 5046 }, { "epoch": 1.8637370753323486, "grad_norm": 0.28724706172943115, "learning_rate": 7.579751200886809e-05, "loss": 0.2101, "step": 5047 }, { "epoch": 1.8641063515509602, "grad_norm": 0.23857556283473969, "learning_rate": 7.577287843330459e-05, "loss": 0.2056, "step": 5048 }, { "epoch": 1.8644756277695715, "grad_norm": 0.29885801672935486, "learning_rate": 7.57482448577411e-05, "loss": 0.2172, "step": 5049 }, { "epoch": 1.8648449039881831, "grad_norm": 0.3281831741333008, "learning_rate": 7.57236112821776e-05, "loss": 0.2321, "step": 5050 }, { "epoch": 1.8648449039881831, "eval_loss": 0.2508034110069275, "eval_runtime": 5.8639, "eval_samples_per_second": 8.527, "eval_steps_per_second": 1.194, "step": 5050 }, { "epoch": 1.8652141802067947, "grad_norm": 0.3108578026294708, "learning_rate": 7.569897770661412e-05, "loss": 0.2422, "step": 5051 }, { "epoch": 1.8655834564254064, "grad_norm": 0.3225805163383484, "learning_rate": 7.567434413105062e-05, "loss": 0.2449, "step": 5052 }, { "epoch": 1.8659527326440177, "grad_norm": 0.23075884580612183, "learning_rate": 7.564971055548714e-05, "loss": 0.2048, "step": 5053 }, { "epoch": 1.8663220088626291, "grad_norm": 0.2535512149333954, "learning_rate": 7.562507697992364e-05, "loss": 0.2184, "step": 5054 }, { "epoch": 1.8666912850812407, "grad_norm": 0.21854400634765625, "learning_rate": 7.560044340436014e-05, "loss": 0.1803, "step": 5055 }, { "epoch": 1.8670605612998523, "grad_norm": 0.30747005343437195, "learning_rate": 7.557580982879665e-05, "loss": 0.2439, "step": 5056 }, { "epoch": 1.867429837518464, "grad_norm": 0.25755876302719116, "learning_rate": 7.555117625323315e-05, "loss": 0.216, "step": 5057 }, { "epoch": 1.8677991137370753, "grad_norm": 0.2670411765575409, "learning_rate": 7.552654267766967e-05, "loss": 0.2101, "step": 5058 }, { "epoch": 1.868168389955687, "grad_norm": 0.23956617712974548, "learning_rate": 7.550190910210617e-05, "loss": 0.2226, "step": 5059 }, { "epoch": 1.8685376661742983, "grad_norm": 0.24762064218521118, "learning_rate": 7.547727552654268e-05, "loss": 0.2119, "step": 5060 }, { "epoch": 1.8689069423929099, "grad_norm": 0.23967112600803375, "learning_rate": 7.545264195097919e-05, "loss": 0.2027, "step": 5061 }, { "epoch": 1.8692762186115215, "grad_norm": 0.25305482745170593, "learning_rate": 7.542800837541569e-05, "loss": 0.2391, "step": 5062 }, { "epoch": 1.869645494830133, "grad_norm": 0.3014960289001465, "learning_rate": 7.54033747998522e-05, "loss": 0.226, "step": 5063 }, { "epoch": 1.8700147710487445, "grad_norm": 0.3208469748497009, "learning_rate": 7.53787412242887e-05, "loss": 0.2194, "step": 5064 }, { "epoch": 1.8703840472673559, "grad_norm": 0.3345724642276764, "learning_rate": 7.535410764872522e-05, "loss": 0.252, "step": 5065 }, { "epoch": 1.8707533234859675, "grad_norm": 0.2663443386554718, "learning_rate": 7.532947407316172e-05, "loss": 0.2241, "step": 5066 }, { "epoch": 1.871122599704579, "grad_norm": 0.27943527698516846, "learning_rate": 7.530484049759823e-05, "loss": 0.2229, "step": 5067 }, { "epoch": 1.8714918759231907, "grad_norm": 0.26502811908721924, "learning_rate": 7.528020692203473e-05, "loss": 0.2471, "step": 5068 }, { "epoch": 1.871861152141802, "grad_norm": 0.3713107705116272, "learning_rate": 7.525557334647124e-05, "loss": 0.2697, "step": 5069 }, { "epoch": 1.8722304283604134, "grad_norm": 0.2940278947353363, "learning_rate": 7.523093977090775e-05, "loss": 0.2426, "step": 5070 }, { "epoch": 1.872599704579025, "grad_norm": 0.27475041151046753, "learning_rate": 7.520630619534425e-05, "loss": 0.2441, "step": 5071 }, { "epoch": 1.8729689807976366, "grad_norm": 0.3016396462917328, "learning_rate": 7.518167261978077e-05, "loss": 0.2287, "step": 5072 }, { "epoch": 1.8733382570162482, "grad_norm": 0.2879699170589447, "learning_rate": 7.515703904421727e-05, "loss": 0.2825, "step": 5073 }, { "epoch": 1.8737075332348598, "grad_norm": 0.32314956188201904, "learning_rate": 7.513240546865378e-05, "loss": 0.2476, "step": 5074 }, { "epoch": 1.8740768094534712, "grad_norm": 0.3410947322845459, "learning_rate": 7.510777189309028e-05, "loss": 0.2736, "step": 5075 }, { "epoch": 1.8744460856720826, "grad_norm": 0.30659565329551697, "learning_rate": 7.50831383175268e-05, "loss": 0.2425, "step": 5076 }, { "epoch": 1.8748153618906942, "grad_norm": 0.2589021623134613, "learning_rate": 7.50585047419633e-05, "loss": 0.2123, "step": 5077 }, { "epoch": 1.8751846381093058, "grad_norm": 0.2562973201274872, "learning_rate": 7.50338711663998e-05, "loss": 0.215, "step": 5078 }, { "epoch": 1.8755539143279174, "grad_norm": 0.26437804102897644, "learning_rate": 7.500923759083631e-05, "loss": 0.2137, "step": 5079 }, { "epoch": 1.8759231905465288, "grad_norm": 0.3351561427116394, "learning_rate": 7.498460401527282e-05, "loss": 0.2973, "step": 5080 }, { "epoch": 1.8762924667651402, "grad_norm": 0.26624685525894165, "learning_rate": 7.495997043970933e-05, "loss": 0.2211, "step": 5081 }, { "epoch": 1.8766617429837518, "grad_norm": 0.3059654235839844, "learning_rate": 7.493533686414583e-05, "loss": 0.2444, "step": 5082 }, { "epoch": 1.8770310192023634, "grad_norm": 0.2658170461654663, "learning_rate": 7.491070328858235e-05, "loss": 0.1953, "step": 5083 }, { "epoch": 1.877400295420975, "grad_norm": 0.21191762387752533, "learning_rate": 7.488606971301885e-05, "loss": 0.1851, "step": 5084 }, { "epoch": 1.8777695716395866, "grad_norm": 0.29865381121635437, "learning_rate": 7.486143613745535e-05, "loss": 0.2357, "step": 5085 }, { "epoch": 1.878138847858198, "grad_norm": 0.26615944504737854, "learning_rate": 7.483680256189186e-05, "loss": 0.2252, "step": 5086 }, { "epoch": 1.8785081240768093, "grad_norm": 0.27517449855804443, "learning_rate": 7.481216898632836e-05, "loss": 0.1947, "step": 5087 }, { "epoch": 1.878877400295421, "grad_norm": 0.2770524024963379, "learning_rate": 7.478753541076488e-05, "loss": 0.2113, "step": 5088 }, { "epoch": 1.8792466765140325, "grad_norm": 0.31695282459259033, "learning_rate": 7.476290183520138e-05, "loss": 0.2874, "step": 5089 }, { "epoch": 1.8796159527326441, "grad_norm": 0.2557409405708313, "learning_rate": 7.47382682596379e-05, "loss": 0.2178, "step": 5090 }, { "epoch": 1.8799852289512555, "grad_norm": 0.2620340585708618, "learning_rate": 7.47136346840744e-05, "loss": 0.2342, "step": 5091 }, { "epoch": 1.880354505169867, "grad_norm": 0.31770604848861694, "learning_rate": 7.468900110851091e-05, "loss": 0.1944, "step": 5092 }, { "epoch": 1.8807237813884785, "grad_norm": 0.27841833233833313, "learning_rate": 7.466436753294741e-05, "loss": 0.2305, "step": 5093 }, { "epoch": 1.8810930576070901, "grad_norm": 0.3052809536457062, "learning_rate": 7.463973395738391e-05, "loss": 0.2707, "step": 5094 }, { "epoch": 1.8814623338257017, "grad_norm": 0.2491428405046463, "learning_rate": 7.461510038182043e-05, "loss": 0.2319, "step": 5095 }, { "epoch": 1.881831610044313, "grad_norm": 0.25979429483413696, "learning_rate": 7.459046680625693e-05, "loss": 0.2145, "step": 5096 }, { "epoch": 1.8822008862629247, "grad_norm": 0.25400927662849426, "learning_rate": 7.456583323069344e-05, "loss": 0.2167, "step": 5097 }, { "epoch": 1.882570162481536, "grad_norm": 0.31190866231918335, "learning_rate": 7.454119965512994e-05, "loss": 0.1903, "step": 5098 }, { "epoch": 1.8829394387001477, "grad_norm": 0.3358284831047058, "learning_rate": 7.451656607956646e-05, "loss": 0.2728, "step": 5099 }, { "epoch": 1.8833087149187593, "grad_norm": 0.2279479205608368, "learning_rate": 7.449193250400296e-05, "loss": 0.181, "step": 5100 }, { "epoch": 1.8833087149187593, "eval_loss": 0.25126925110816956, "eval_runtime": 5.8627, "eval_samples_per_second": 8.529, "eval_steps_per_second": 1.194, "step": 5100 }, { "epoch": 1.8836779911373709, "grad_norm": 0.27150535583496094, "learning_rate": 7.446729892843946e-05, "loss": 0.2157, "step": 5101 }, { "epoch": 1.8840472673559823, "grad_norm": 0.2573368549346924, "learning_rate": 7.444266535287597e-05, "loss": 0.2291, "step": 5102 }, { "epoch": 1.8844165435745936, "grad_norm": 0.28329354524612427, "learning_rate": 7.441803177731248e-05, "loss": 0.2202, "step": 5103 }, { "epoch": 1.8847858197932053, "grad_norm": 0.30760088562965393, "learning_rate": 7.439339820174899e-05, "loss": 0.2167, "step": 5104 }, { "epoch": 1.8851550960118169, "grad_norm": 0.30110180377960205, "learning_rate": 7.436876462618549e-05, "loss": 0.2096, "step": 5105 }, { "epoch": 1.8855243722304285, "grad_norm": 0.2775033116340637, "learning_rate": 7.4344131050622e-05, "loss": 0.2281, "step": 5106 }, { "epoch": 1.8858936484490398, "grad_norm": 0.2764638364315033, "learning_rate": 7.431949747505851e-05, "loss": 0.1961, "step": 5107 }, { "epoch": 1.8862629246676514, "grad_norm": 0.24575462937355042, "learning_rate": 7.429486389949502e-05, "loss": 0.1838, "step": 5108 }, { "epoch": 1.8866322008862628, "grad_norm": 0.3272894322872162, "learning_rate": 7.427023032393152e-05, "loss": 0.2512, "step": 5109 }, { "epoch": 1.8870014771048744, "grad_norm": 0.2593172788619995, "learning_rate": 7.424559674836802e-05, "loss": 0.2513, "step": 5110 }, { "epoch": 1.887370753323486, "grad_norm": 0.25487715005874634, "learning_rate": 7.422096317280454e-05, "loss": 0.209, "step": 5111 }, { "epoch": 1.8877400295420976, "grad_norm": 0.26173579692840576, "learning_rate": 7.419632959724104e-05, "loss": 0.2293, "step": 5112 }, { "epoch": 1.888109305760709, "grad_norm": 0.3187013864517212, "learning_rate": 7.417169602167755e-05, "loss": 0.2772, "step": 5113 }, { "epoch": 1.8884785819793204, "grad_norm": 0.3143283724784851, "learning_rate": 7.414706244611406e-05, "loss": 0.1915, "step": 5114 }, { "epoch": 1.888847858197932, "grad_norm": 0.30088192224502563, "learning_rate": 7.412242887055057e-05, "loss": 0.2472, "step": 5115 }, { "epoch": 1.8892171344165436, "grad_norm": 0.2510607838630676, "learning_rate": 7.409779529498707e-05, "loss": 0.2135, "step": 5116 }, { "epoch": 1.8895864106351552, "grad_norm": 0.2626101076602936, "learning_rate": 7.407316171942357e-05, "loss": 0.2145, "step": 5117 }, { "epoch": 1.8899556868537666, "grad_norm": 0.2522996962070465, "learning_rate": 7.404852814386009e-05, "loss": 0.1827, "step": 5118 }, { "epoch": 1.8903249630723782, "grad_norm": 0.2846025824546814, "learning_rate": 7.402389456829659e-05, "loss": 0.2304, "step": 5119 }, { "epoch": 1.8906942392909896, "grad_norm": 0.33159270882606506, "learning_rate": 7.39992609927331e-05, "loss": 0.2637, "step": 5120 }, { "epoch": 1.8910635155096012, "grad_norm": 0.24131393432617188, "learning_rate": 7.39746274171696e-05, "loss": 0.1861, "step": 5121 }, { "epoch": 1.8914327917282128, "grad_norm": 0.25703150033950806, "learning_rate": 7.394999384160612e-05, "loss": 0.2022, "step": 5122 }, { "epoch": 1.8918020679468244, "grad_norm": 0.29193922877311707, "learning_rate": 7.392536026604262e-05, "loss": 0.2539, "step": 5123 }, { "epoch": 1.8921713441654358, "grad_norm": 0.3627915382385254, "learning_rate": 7.390072669047913e-05, "loss": 0.2959, "step": 5124 }, { "epoch": 1.8925406203840471, "grad_norm": 0.27202221751213074, "learning_rate": 7.387609311491564e-05, "loss": 0.1999, "step": 5125 }, { "epoch": 1.8929098966026587, "grad_norm": 0.23262353241443634, "learning_rate": 7.385145953935214e-05, "loss": 0.1994, "step": 5126 }, { "epoch": 1.8932791728212703, "grad_norm": 0.27949628233909607, "learning_rate": 7.382682596378865e-05, "loss": 0.2639, "step": 5127 }, { "epoch": 1.893648449039882, "grad_norm": 0.27095603942871094, "learning_rate": 7.380219238822515e-05, "loss": 0.2037, "step": 5128 }, { "epoch": 1.8940177252584933, "grad_norm": 0.3029235601425171, "learning_rate": 7.377755881266167e-05, "loss": 0.2282, "step": 5129 }, { "epoch": 1.894387001477105, "grad_norm": 0.32053259015083313, "learning_rate": 7.375292523709817e-05, "loss": 0.1963, "step": 5130 }, { "epoch": 1.8947562776957163, "grad_norm": 0.25598132610321045, "learning_rate": 7.372829166153468e-05, "loss": 0.2195, "step": 5131 }, { "epoch": 1.895125553914328, "grad_norm": 0.27937015891075134, "learning_rate": 7.370365808597118e-05, "loss": 0.2338, "step": 5132 }, { "epoch": 1.8954948301329395, "grad_norm": 0.28756415843963623, "learning_rate": 7.367902451040768e-05, "loss": 0.2117, "step": 5133 }, { "epoch": 1.8958641063515511, "grad_norm": 0.2926604747772217, "learning_rate": 7.36543909348442e-05, "loss": 0.2589, "step": 5134 }, { "epoch": 1.8962333825701625, "grad_norm": 0.28033578395843506, "learning_rate": 7.36297573592807e-05, "loss": 0.2024, "step": 5135 }, { "epoch": 1.8966026587887739, "grad_norm": 0.2677004039287567, "learning_rate": 7.360512378371721e-05, "loss": 0.2347, "step": 5136 }, { "epoch": 1.8969719350073855, "grad_norm": 0.25729888677597046, "learning_rate": 7.358049020815372e-05, "loss": 0.2258, "step": 5137 }, { "epoch": 1.897341211225997, "grad_norm": 0.28130093216896057, "learning_rate": 7.355585663259023e-05, "loss": 0.2339, "step": 5138 }, { "epoch": 1.8977104874446087, "grad_norm": 0.31810981035232544, "learning_rate": 7.353122305702673e-05, "loss": 0.2431, "step": 5139 }, { "epoch": 1.89807976366322, "grad_norm": 0.23127999901771545, "learning_rate": 7.350658948146323e-05, "loss": 0.1878, "step": 5140 }, { "epoch": 1.8984490398818314, "grad_norm": 0.3018636405467987, "learning_rate": 7.348195590589975e-05, "loss": 0.2358, "step": 5141 }, { "epoch": 1.898818316100443, "grad_norm": 0.25260666012763977, "learning_rate": 7.345732233033625e-05, "loss": 0.2065, "step": 5142 }, { "epoch": 1.8991875923190547, "grad_norm": 0.2912799119949341, "learning_rate": 7.343268875477276e-05, "loss": 0.2322, "step": 5143 }, { "epoch": 1.8995568685376663, "grad_norm": 0.3367374539375305, "learning_rate": 7.340805517920926e-05, "loss": 0.2929, "step": 5144 }, { "epoch": 1.8999261447562779, "grad_norm": 0.3466246724128723, "learning_rate": 7.338342160364578e-05, "loss": 0.277, "step": 5145 }, { "epoch": 1.9002954209748892, "grad_norm": 0.2413496971130371, "learning_rate": 7.335878802808228e-05, "loss": 0.1826, "step": 5146 }, { "epoch": 1.9006646971935006, "grad_norm": 0.22739775478839874, "learning_rate": 7.33341544525188e-05, "loss": 0.207, "step": 5147 }, { "epoch": 1.9010339734121122, "grad_norm": 0.28335708379745483, "learning_rate": 7.33095208769553e-05, "loss": 0.2324, "step": 5148 }, { "epoch": 1.9014032496307238, "grad_norm": 0.33643093705177307, "learning_rate": 7.32848873013918e-05, "loss": 0.2339, "step": 5149 }, { "epoch": 1.9017725258493354, "grad_norm": 0.31164810061454773, "learning_rate": 7.326025372582831e-05, "loss": 0.2206, "step": 5150 }, { "epoch": 1.9017725258493354, "eval_loss": 0.25461217761039734, "eval_runtime": 5.854, "eval_samples_per_second": 8.541, "eval_steps_per_second": 1.196, "step": 5150 }, { "epoch": 1.9021418020679468, "grad_norm": 0.2678544521331787, "learning_rate": 7.323562015026481e-05, "loss": 0.2198, "step": 5151 }, { "epoch": 1.9025110782865582, "grad_norm": 0.35127219557762146, "learning_rate": 7.321098657470133e-05, "loss": 0.2759, "step": 5152 }, { "epoch": 1.9028803545051698, "grad_norm": 0.2919602394104004, "learning_rate": 7.318635299913783e-05, "loss": 0.2515, "step": 5153 }, { "epoch": 1.9032496307237814, "grad_norm": 0.2930256426334381, "learning_rate": 7.316171942357434e-05, "loss": 0.2082, "step": 5154 }, { "epoch": 1.903618906942393, "grad_norm": 0.3079281747341156, "learning_rate": 7.313708584801084e-05, "loss": 0.246, "step": 5155 }, { "epoch": 1.9039881831610044, "grad_norm": 0.2257537692785263, "learning_rate": 7.311245227244734e-05, "loss": 0.2006, "step": 5156 }, { "epoch": 1.904357459379616, "grad_norm": 0.2506227493286133, "learning_rate": 7.308781869688386e-05, "loss": 0.1847, "step": 5157 }, { "epoch": 1.9047267355982274, "grad_norm": 0.2114635407924652, "learning_rate": 7.306318512132036e-05, "loss": 0.1858, "step": 5158 }, { "epoch": 1.905096011816839, "grad_norm": 0.26355382800102234, "learning_rate": 7.303855154575688e-05, "loss": 0.2101, "step": 5159 }, { "epoch": 1.9054652880354506, "grad_norm": 0.293997585773468, "learning_rate": 7.301391797019338e-05, "loss": 0.2402, "step": 5160 }, { "epoch": 1.9058345642540622, "grad_norm": 0.26220446825027466, "learning_rate": 7.298928439462989e-05, "loss": 0.2145, "step": 5161 }, { "epoch": 1.9062038404726735, "grad_norm": 0.2998253405094147, "learning_rate": 7.296465081906639e-05, "loss": 0.2333, "step": 5162 }, { "epoch": 1.906573116691285, "grad_norm": 0.24114952981472015, "learning_rate": 7.29400172435029e-05, "loss": 0.2071, "step": 5163 }, { "epoch": 1.9069423929098965, "grad_norm": 0.326474130153656, "learning_rate": 7.291538366793941e-05, "loss": 0.2944, "step": 5164 }, { "epoch": 1.9073116691285081, "grad_norm": 0.2474275529384613, "learning_rate": 7.289075009237591e-05, "loss": 0.1793, "step": 5165 }, { "epoch": 1.9076809453471197, "grad_norm": 0.286654531955719, "learning_rate": 7.286611651681242e-05, "loss": 0.2206, "step": 5166 }, { "epoch": 1.9080502215657311, "grad_norm": 0.23051148653030396, "learning_rate": 7.284148294124892e-05, "loss": 0.1843, "step": 5167 }, { "epoch": 1.9084194977843427, "grad_norm": 0.31883504986763, "learning_rate": 7.281684936568544e-05, "loss": 0.2534, "step": 5168 }, { "epoch": 1.908788774002954, "grad_norm": 0.27582499384880066, "learning_rate": 7.279221579012194e-05, "loss": 0.2059, "step": 5169 }, { "epoch": 1.9091580502215657, "grad_norm": 0.2696746587753296, "learning_rate": 7.276758221455846e-05, "loss": 0.2242, "step": 5170 }, { "epoch": 1.9095273264401773, "grad_norm": 0.23835311830043793, "learning_rate": 7.274294863899496e-05, "loss": 0.1761, "step": 5171 }, { "epoch": 1.909896602658789, "grad_norm": 0.21435067057609558, "learning_rate": 7.271831506343146e-05, "loss": 0.1664, "step": 5172 }, { "epoch": 1.9102658788774003, "grad_norm": 0.3019494116306305, "learning_rate": 7.269368148786797e-05, "loss": 0.2086, "step": 5173 }, { "epoch": 1.9106351550960117, "grad_norm": 0.3723675012588501, "learning_rate": 7.266904791230447e-05, "loss": 0.2816, "step": 5174 }, { "epoch": 1.9110044313146233, "grad_norm": 0.2309865802526474, "learning_rate": 7.264441433674099e-05, "loss": 0.1917, "step": 5175 }, { "epoch": 1.9113737075332349, "grad_norm": 0.3170361816883087, "learning_rate": 7.261978076117749e-05, "loss": 0.2658, "step": 5176 }, { "epoch": 1.9117429837518465, "grad_norm": 0.29325804114341736, "learning_rate": 7.2595147185614e-05, "loss": 0.2351, "step": 5177 }, { "epoch": 1.9121122599704579, "grad_norm": 0.340961217880249, "learning_rate": 7.25705136100505e-05, "loss": 0.2204, "step": 5178 }, { "epoch": 1.9124815361890695, "grad_norm": 0.37032073736190796, "learning_rate": 7.254588003448702e-05, "loss": 0.2669, "step": 5179 }, { "epoch": 1.9128508124076808, "grad_norm": 0.27405688166618347, "learning_rate": 7.252124645892352e-05, "loss": 0.2193, "step": 5180 }, { "epoch": 1.9132200886262924, "grad_norm": 0.2763373553752899, "learning_rate": 7.249661288336002e-05, "loss": 0.1861, "step": 5181 }, { "epoch": 1.913589364844904, "grad_norm": 0.22096115350723267, "learning_rate": 7.247197930779654e-05, "loss": 0.1934, "step": 5182 }, { "epoch": 1.9139586410635157, "grad_norm": 0.2649868428707123, "learning_rate": 7.244734573223304e-05, "loss": 0.2002, "step": 5183 }, { "epoch": 1.914327917282127, "grad_norm": 0.24385647475719452, "learning_rate": 7.242271215666955e-05, "loss": 0.1991, "step": 5184 }, { "epoch": 1.9146971935007384, "grad_norm": 0.25321164727211, "learning_rate": 7.239807858110605e-05, "loss": 0.2015, "step": 5185 }, { "epoch": 1.91506646971935, "grad_norm": 0.2865865230560303, "learning_rate": 7.237344500554257e-05, "loss": 0.2198, "step": 5186 }, { "epoch": 1.9154357459379616, "grad_norm": 0.2982091009616852, "learning_rate": 7.234881142997907e-05, "loss": 0.2113, "step": 5187 }, { "epoch": 1.9158050221565732, "grad_norm": 0.37226611375808716, "learning_rate": 7.232417785441557e-05, "loss": 0.2114, "step": 5188 }, { "epoch": 1.9161742983751846, "grad_norm": 0.29692623019218445, "learning_rate": 7.229954427885208e-05, "loss": 0.2616, "step": 5189 }, { "epoch": 1.9165435745937962, "grad_norm": 0.28733372688293457, "learning_rate": 7.227491070328859e-05, "loss": 0.2408, "step": 5190 }, { "epoch": 1.9169128508124076, "grad_norm": 0.23799575865268707, "learning_rate": 7.22502771277251e-05, "loss": 0.1799, "step": 5191 }, { "epoch": 1.9172821270310192, "grad_norm": 0.20326031744480133, "learning_rate": 7.22256435521616e-05, "loss": 0.1758, "step": 5192 }, { "epoch": 1.9176514032496308, "grad_norm": 0.2550643980503082, "learning_rate": 7.220100997659812e-05, "loss": 0.2171, "step": 5193 }, { "epoch": 1.9180206794682424, "grad_norm": 0.253792941570282, "learning_rate": 7.217637640103462e-05, "loss": 0.202, "step": 5194 }, { "epoch": 1.9183899556868538, "grad_norm": 0.24402236938476562, "learning_rate": 7.215174282547113e-05, "loss": 0.1828, "step": 5195 }, { "epoch": 1.9187592319054652, "grad_norm": 0.35768184065818787, "learning_rate": 7.212710924990763e-05, "loss": 0.2377, "step": 5196 }, { "epoch": 1.9191285081240768, "grad_norm": 0.2649560868740082, "learning_rate": 7.210247567434413e-05, "loss": 0.1984, "step": 5197 }, { "epoch": 1.9194977843426884, "grad_norm": 0.25678685307502747, "learning_rate": 7.207784209878065e-05, "loss": 0.1793, "step": 5198 }, { "epoch": 1.9198670605613, "grad_norm": 0.25464650988578796, "learning_rate": 7.205320852321715e-05, "loss": 0.1932, "step": 5199 }, { "epoch": 1.9202363367799113, "grad_norm": 0.26528725028038025, "learning_rate": 7.202857494765366e-05, "loss": 0.1838, "step": 5200 }, { "epoch": 1.9202363367799113, "eval_loss": 0.25316575169563293, "eval_runtime": 5.8554, "eval_samples_per_second": 8.539, "eval_steps_per_second": 1.195, "step": 5200 }, { "epoch": 1.920605612998523, "grad_norm": 0.26819831132888794, "learning_rate": 7.200394137209016e-05, "loss": 0.1863, "step": 5201 }, { "epoch": 1.9209748892171343, "grad_norm": 0.3028496205806732, "learning_rate": 7.197930779652668e-05, "loss": 0.2565, "step": 5202 }, { "epoch": 1.921344165435746, "grad_norm": 0.4224024713039398, "learning_rate": 7.195467422096318e-05, "loss": 0.2971, "step": 5203 }, { "epoch": 1.9217134416543575, "grad_norm": 0.29464343190193176, "learning_rate": 7.193004064539968e-05, "loss": 0.2084, "step": 5204 }, { "epoch": 1.9220827178729691, "grad_norm": 0.2331015020608902, "learning_rate": 7.19054070698362e-05, "loss": 0.202, "step": 5205 }, { "epoch": 1.9224519940915805, "grad_norm": 0.34567561745643616, "learning_rate": 7.18807734942727e-05, "loss": 0.2981, "step": 5206 }, { "epoch": 1.922821270310192, "grad_norm": 0.26549074053764343, "learning_rate": 7.185613991870921e-05, "loss": 0.2161, "step": 5207 }, { "epoch": 1.9231905465288035, "grad_norm": 0.22987030446529388, "learning_rate": 7.183150634314571e-05, "loss": 0.1869, "step": 5208 }, { "epoch": 1.923559822747415, "grad_norm": 0.23912249505519867, "learning_rate": 7.180687276758223e-05, "loss": 0.1894, "step": 5209 }, { "epoch": 1.9239290989660267, "grad_norm": 0.2937624454498291, "learning_rate": 7.178223919201873e-05, "loss": 0.2183, "step": 5210 }, { "epoch": 1.924298375184638, "grad_norm": 0.27470824122428894, "learning_rate": 7.175760561645524e-05, "loss": 0.2037, "step": 5211 }, { "epoch": 1.9246676514032495, "grad_norm": 0.2689036428928375, "learning_rate": 7.173297204089174e-05, "loss": 0.1985, "step": 5212 }, { "epoch": 1.925036927621861, "grad_norm": 0.22076676785945892, "learning_rate": 7.170833846532825e-05, "loss": 0.1907, "step": 5213 }, { "epoch": 1.9254062038404727, "grad_norm": 0.28454598784446716, "learning_rate": 7.168370488976476e-05, "loss": 0.2264, "step": 5214 }, { "epoch": 1.9257754800590843, "grad_norm": 0.3232066035270691, "learning_rate": 7.165907131420126e-05, "loss": 0.2165, "step": 5215 }, { "epoch": 1.9261447562776959, "grad_norm": 0.2641231417655945, "learning_rate": 7.163443773863778e-05, "loss": 0.1965, "step": 5216 }, { "epoch": 1.9265140324963073, "grad_norm": 0.25479385256767273, "learning_rate": 7.160980416307428e-05, "loss": 0.2045, "step": 5217 }, { "epoch": 1.9268833087149186, "grad_norm": 0.25852954387664795, "learning_rate": 7.158517058751079e-05, "loss": 0.2051, "step": 5218 }, { "epoch": 1.9272525849335302, "grad_norm": 0.25149908661842346, "learning_rate": 7.156053701194729e-05, "loss": 0.1825, "step": 5219 }, { "epoch": 1.9276218611521418, "grad_norm": 0.2664920687675476, "learning_rate": 7.15359034363838e-05, "loss": 0.2118, "step": 5220 }, { "epoch": 1.9279911373707534, "grad_norm": 0.31141358613967896, "learning_rate": 7.151126986082031e-05, "loss": 0.2056, "step": 5221 }, { "epoch": 1.9283604135893648, "grad_norm": 0.28500014543533325, "learning_rate": 7.148663628525681e-05, "loss": 0.2746, "step": 5222 }, { "epoch": 1.9287296898079762, "grad_norm": 0.30131691694259644, "learning_rate": 7.146200270969332e-05, "loss": 0.2218, "step": 5223 }, { "epoch": 1.9290989660265878, "grad_norm": 0.2767343819141388, "learning_rate": 7.143736913412983e-05, "loss": 0.2258, "step": 5224 }, { "epoch": 1.9294682422451994, "grad_norm": 0.2807043790817261, "learning_rate": 7.141273555856633e-05, "loss": 0.2179, "step": 5225 }, { "epoch": 1.929837518463811, "grad_norm": 0.24697333574295044, "learning_rate": 7.138810198300283e-05, "loss": 0.196, "step": 5226 }, { "epoch": 1.9302067946824224, "grad_norm": 0.26040416955947876, "learning_rate": 7.136346840743934e-05, "loss": 0.195, "step": 5227 }, { "epoch": 1.930576070901034, "grad_norm": 0.2872878313064575, "learning_rate": 7.133883483187584e-05, "loss": 0.215, "step": 5228 }, { "epoch": 1.9309453471196454, "grad_norm": 0.3290000855922699, "learning_rate": 7.131420125631236e-05, "loss": 0.2409, "step": 5229 }, { "epoch": 1.931314623338257, "grad_norm": 0.2693164348602295, "learning_rate": 7.128956768074886e-05, "loss": 0.2028, "step": 5230 }, { "epoch": 1.9316838995568686, "grad_norm": 0.33382052183151245, "learning_rate": 7.126493410518536e-05, "loss": 0.2364, "step": 5231 }, { "epoch": 1.9320531757754802, "grad_norm": 0.2566787898540497, "learning_rate": 7.124030052962187e-05, "loss": 0.2047, "step": 5232 }, { "epoch": 1.9324224519940916, "grad_norm": 0.2897343635559082, "learning_rate": 7.121566695405838e-05, "loss": 0.2402, "step": 5233 }, { "epoch": 1.932791728212703, "grad_norm": 0.2652934491634369, "learning_rate": 7.119103337849489e-05, "loss": 0.2394, "step": 5234 }, { "epoch": 1.9331610044313146, "grad_norm": 0.3066202700138092, "learning_rate": 7.116639980293139e-05, "loss": 0.2273, "step": 5235 }, { "epoch": 1.9335302806499262, "grad_norm": 0.2847360074520111, "learning_rate": 7.11417662273679e-05, "loss": 0.2526, "step": 5236 }, { "epoch": 1.9338995568685378, "grad_norm": 0.2967818081378937, "learning_rate": 7.111713265180441e-05, "loss": 0.257, "step": 5237 }, { "epoch": 1.9342688330871491, "grad_norm": 0.4042901396751404, "learning_rate": 7.109249907624091e-05, "loss": 0.2662, "step": 5238 }, { "epoch": 1.9346381093057607, "grad_norm": 0.3567046821117401, "learning_rate": 7.106786550067742e-05, "loss": 0.268, "step": 5239 }, { "epoch": 1.9350073855243721, "grad_norm": 0.2794325649738312, "learning_rate": 7.104323192511392e-05, "loss": 0.2427, "step": 5240 }, { "epoch": 1.9353766617429837, "grad_norm": 0.28259146213531494, "learning_rate": 7.101859834955044e-05, "loss": 0.2398, "step": 5241 }, { "epoch": 1.9357459379615953, "grad_norm": 0.19708877801895142, "learning_rate": 7.099396477398694e-05, "loss": 0.1614, "step": 5242 }, { "epoch": 1.936115214180207, "grad_norm": 0.23787015676498413, "learning_rate": 7.096933119842345e-05, "loss": 0.1789, "step": 5243 }, { "epoch": 1.9364844903988183, "grad_norm": 0.30355992913246155, "learning_rate": 7.094469762285996e-05, "loss": 0.2219, "step": 5244 }, { "epoch": 1.9368537666174297, "grad_norm": 0.2670661509037018, "learning_rate": 7.092006404729647e-05, "loss": 0.2096, "step": 5245 }, { "epoch": 1.9372230428360413, "grad_norm": 0.29191645979881287, "learning_rate": 7.089543047173297e-05, "loss": 0.2071, "step": 5246 }, { "epoch": 1.937592319054653, "grad_norm": 0.2873486876487732, "learning_rate": 7.087079689616947e-05, "loss": 0.2219, "step": 5247 }, { "epoch": 1.9379615952732645, "grad_norm": 0.2358044534921646, "learning_rate": 7.084616332060599e-05, "loss": 0.1702, "step": 5248 }, { "epoch": 1.9383308714918759, "grad_norm": 0.2574106752872467, "learning_rate": 7.082152974504249e-05, "loss": 0.2199, "step": 5249 }, { "epoch": 1.9387001477104875, "grad_norm": 0.28576961159706116, "learning_rate": 7.0796896169479e-05, "loss": 0.2223, "step": 5250 }, { "epoch": 1.9387001477104875, "eval_loss": 0.2517310678958893, "eval_runtime": 5.8581, "eval_samples_per_second": 8.535, "eval_steps_per_second": 1.195, "step": 5250 }, { "epoch": 1.9390694239290989, "grad_norm": 0.23987966775894165, "learning_rate": 7.07722625939155e-05, "loss": 0.2067, "step": 5251 }, { "epoch": 1.9394387001477105, "grad_norm": 0.24387197196483612, "learning_rate": 7.074762901835202e-05, "loss": 0.2108, "step": 5252 }, { "epoch": 1.939807976366322, "grad_norm": 0.24827134609222412, "learning_rate": 7.072299544278852e-05, "loss": 0.1944, "step": 5253 }, { "epoch": 1.9401772525849337, "grad_norm": 0.2801428735256195, "learning_rate": 7.069836186722502e-05, "loss": 0.2428, "step": 5254 }, { "epoch": 1.940546528803545, "grad_norm": 0.24826756119728088, "learning_rate": 7.067372829166154e-05, "loss": 0.2212, "step": 5255 }, { "epoch": 1.9409158050221564, "grad_norm": 0.4102340340614319, "learning_rate": 7.064909471609804e-05, "loss": 0.2307, "step": 5256 }, { "epoch": 1.941285081240768, "grad_norm": 0.3000921308994293, "learning_rate": 7.062446114053455e-05, "loss": 0.2384, "step": 5257 }, { "epoch": 1.9416543574593796, "grad_norm": 0.2311098873615265, "learning_rate": 7.059982756497105e-05, "loss": 0.192, "step": 5258 }, { "epoch": 1.9420236336779912, "grad_norm": 0.28601640462875366, "learning_rate": 7.057519398940757e-05, "loss": 0.2215, "step": 5259 }, { "epoch": 1.9423929098966026, "grad_norm": 0.3139057159423828, "learning_rate": 7.055056041384407e-05, "loss": 0.2216, "step": 5260 }, { "epoch": 1.9427621861152142, "grad_norm": 0.24312689900398254, "learning_rate": 7.052592683828058e-05, "loss": 0.1908, "step": 5261 }, { "epoch": 1.9431314623338256, "grad_norm": 0.27276453375816345, "learning_rate": 7.050129326271708e-05, "loss": 0.2196, "step": 5262 }, { "epoch": 1.9435007385524372, "grad_norm": 0.3077089786529541, "learning_rate": 7.047665968715358e-05, "loss": 0.2178, "step": 5263 }, { "epoch": 1.9438700147710488, "grad_norm": 0.27808618545532227, "learning_rate": 7.04520261115901e-05, "loss": 0.2101, "step": 5264 }, { "epoch": 1.9442392909896604, "grad_norm": 0.2939068078994751, "learning_rate": 7.04273925360266e-05, "loss": 0.2634, "step": 5265 }, { "epoch": 1.9446085672082718, "grad_norm": 0.3380891978740692, "learning_rate": 7.040275896046312e-05, "loss": 0.2587, "step": 5266 }, { "epoch": 1.9449778434268832, "grad_norm": 0.25648233294487, "learning_rate": 7.037812538489962e-05, "loss": 0.1687, "step": 5267 }, { "epoch": 1.9453471196454948, "grad_norm": 0.4467228353023529, "learning_rate": 7.035349180933613e-05, "loss": 0.3074, "step": 5268 }, { "epoch": 1.9457163958641064, "grad_norm": 0.2929859161376953, "learning_rate": 7.032885823377263e-05, "loss": 0.2413, "step": 5269 }, { "epoch": 1.946085672082718, "grad_norm": 0.27283674478530884, "learning_rate": 7.030422465820913e-05, "loss": 0.232, "step": 5270 }, { "epoch": 1.9464549483013294, "grad_norm": 0.7194961905479431, "learning_rate": 7.027959108264565e-05, "loss": 0.2689, "step": 5271 }, { "epoch": 1.946824224519941, "grad_norm": 0.29518184065818787, "learning_rate": 7.025495750708215e-05, "loss": 0.2365, "step": 5272 }, { "epoch": 1.9471935007385524, "grad_norm": 0.24883228540420532, "learning_rate": 7.023032393151866e-05, "loss": 0.2224, "step": 5273 }, { "epoch": 1.947562776957164, "grad_norm": 0.29781991243362427, "learning_rate": 7.020569035595516e-05, "loss": 0.2041, "step": 5274 }, { "epoch": 1.9479320531757756, "grad_norm": 0.30253690481185913, "learning_rate": 7.018105678039168e-05, "loss": 0.2135, "step": 5275 }, { "epoch": 1.9483013293943872, "grad_norm": 0.25308331847190857, "learning_rate": 7.015642320482818e-05, "loss": 0.2281, "step": 5276 }, { "epoch": 1.9486706056129985, "grad_norm": 0.25189143419265747, "learning_rate": 7.01317896292647e-05, "loss": 0.2486, "step": 5277 }, { "epoch": 1.94903988183161, "grad_norm": 0.33361706137657166, "learning_rate": 7.01071560537012e-05, "loss": 0.2502, "step": 5278 }, { "epoch": 1.9494091580502215, "grad_norm": 0.2573055326938629, "learning_rate": 7.00825224781377e-05, "loss": 0.2221, "step": 5279 }, { "epoch": 1.9497784342688331, "grad_norm": 0.2805079221725464, "learning_rate": 7.005788890257421e-05, "loss": 0.2358, "step": 5280 }, { "epoch": 1.9501477104874447, "grad_norm": 0.2887052297592163, "learning_rate": 7.003325532701071e-05, "loss": 0.2126, "step": 5281 }, { "epoch": 1.950516986706056, "grad_norm": 0.27408942580223083, "learning_rate": 7.000862175144723e-05, "loss": 0.1986, "step": 5282 }, { "epoch": 1.9508862629246675, "grad_norm": 0.29488441348075867, "learning_rate": 6.998398817588373e-05, "loss": 0.2137, "step": 5283 }, { "epoch": 1.951255539143279, "grad_norm": 0.2893330752849579, "learning_rate": 6.995935460032024e-05, "loss": 0.2705, "step": 5284 }, { "epoch": 1.9516248153618907, "grad_norm": 0.2756158411502838, "learning_rate": 6.993472102475674e-05, "loss": 0.2014, "step": 5285 }, { "epoch": 1.9519940915805023, "grad_norm": 0.30714696645736694, "learning_rate": 6.991008744919325e-05, "loss": 0.2513, "step": 5286 }, { "epoch": 1.952363367799114, "grad_norm": 0.28137123584747314, "learning_rate": 6.988545387362976e-05, "loss": 0.1936, "step": 5287 }, { "epoch": 1.9527326440177253, "grad_norm": 0.26411962509155273, "learning_rate": 6.986082029806626e-05, "loss": 0.2098, "step": 5288 }, { "epoch": 1.9531019202363367, "grad_norm": 0.3234144449234009, "learning_rate": 6.983618672250278e-05, "loss": 0.2091, "step": 5289 }, { "epoch": 1.9534711964549483, "grad_norm": 0.2658655345439911, "learning_rate": 6.981155314693928e-05, "loss": 0.229, "step": 5290 }, { "epoch": 1.9538404726735599, "grad_norm": 0.2824770212173462, "learning_rate": 6.978691957137579e-05, "loss": 0.2324, "step": 5291 }, { "epoch": 1.9542097488921715, "grad_norm": 0.34331393241882324, "learning_rate": 6.976228599581229e-05, "loss": 0.2794, "step": 5292 }, { "epoch": 1.9545790251107829, "grad_norm": 0.38514310121536255, "learning_rate": 6.97376524202488e-05, "loss": 0.2777, "step": 5293 }, { "epoch": 1.9549483013293942, "grad_norm": 0.3110654354095459, "learning_rate": 6.971301884468531e-05, "loss": 0.231, "step": 5294 }, { "epoch": 1.9553175775480058, "grad_norm": 0.29559555649757385, "learning_rate": 6.968838526912181e-05, "loss": 0.1947, "step": 5295 }, { "epoch": 1.9556868537666174, "grad_norm": 0.20234008133411407, "learning_rate": 6.966375169355832e-05, "loss": 0.1782, "step": 5296 }, { "epoch": 1.956056129985229, "grad_norm": 0.2966025471687317, "learning_rate": 6.963911811799482e-05, "loss": 0.1906, "step": 5297 }, { "epoch": 1.9564254062038404, "grad_norm": 0.26159605383872986, "learning_rate": 6.961448454243134e-05, "loss": 0.2147, "step": 5298 }, { "epoch": 1.956794682422452, "grad_norm": 0.23278647661209106, "learning_rate": 6.958985096686784e-05, "loss": 0.1919, "step": 5299 }, { "epoch": 1.9571639586410634, "grad_norm": 0.25554853677749634, "learning_rate": 6.956521739130436e-05, "loss": 0.1835, "step": 5300 }, { "epoch": 1.9571639586410634, "eval_loss": 0.25134599208831787, "eval_runtime": 5.8596, "eval_samples_per_second": 8.533, "eval_steps_per_second": 1.195, "step": 5300 }, { "epoch": 1.957533234859675, "grad_norm": 0.2563522756099701, "learning_rate": 6.954058381574086e-05, "loss": 0.2164, "step": 5301 }, { "epoch": 1.9579025110782866, "grad_norm": 0.24337248504161835, "learning_rate": 6.951595024017736e-05, "loss": 0.194, "step": 5302 }, { "epoch": 1.9582717872968982, "grad_norm": 0.26176807284355164, "learning_rate": 6.949131666461387e-05, "loss": 0.1842, "step": 5303 }, { "epoch": 1.9586410635155096, "grad_norm": 0.31088557839393616, "learning_rate": 6.946668308905037e-05, "loss": 0.2423, "step": 5304 }, { "epoch": 1.959010339734121, "grad_norm": 0.3026870787143707, "learning_rate": 6.944204951348689e-05, "loss": 0.2301, "step": 5305 }, { "epoch": 1.9593796159527326, "grad_norm": 0.24130895733833313, "learning_rate": 6.941741593792339e-05, "loss": 0.1882, "step": 5306 }, { "epoch": 1.9597488921713442, "grad_norm": 0.25114375352859497, "learning_rate": 6.93927823623599e-05, "loss": 0.1871, "step": 5307 }, { "epoch": 1.9601181683899558, "grad_norm": 0.29826170206069946, "learning_rate": 6.93681487867964e-05, "loss": 0.2672, "step": 5308 }, { "epoch": 1.9604874446085672, "grad_norm": 0.2573586702346802, "learning_rate": 6.93435152112329e-05, "loss": 0.2161, "step": 5309 }, { "epoch": 1.9608567208271788, "grad_norm": 0.2709135413169861, "learning_rate": 6.931888163566942e-05, "loss": 0.2192, "step": 5310 }, { "epoch": 1.9612259970457901, "grad_norm": 0.29191112518310547, "learning_rate": 6.929424806010592e-05, "loss": 0.1997, "step": 5311 }, { "epoch": 1.9615952732644018, "grad_norm": 0.2645627558231354, "learning_rate": 6.926961448454244e-05, "loss": 0.2374, "step": 5312 }, { "epoch": 1.9619645494830134, "grad_norm": 0.29544684290885925, "learning_rate": 6.924498090897894e-05, "loss": 0.2678, "step": 5313 }, { "epoch": 1.962333825701625, "grad_norm": 0.2862391471862793, "learning_rate": 6.922034733341545e-05, "loss": 0.2529, "step": 5314 }, { "epoch": 1.9627031019202363, "grad_norm": 0.29966267943382263, "learning_rate": 6.919571375785195e-05, "loss": 0.2454, "step": 5315 }, { "epoch": 1.9630723781388477, "grad_norm": 0.27755698561668396, "learning_rate": 6.917108018228847e-05, "loss": 0.2241, "step": 5316 }, { "epoch": 1.9634416543574593, "grad_norm": 0.2564755082130432, "learning_rate": 6.914644660672497e-05, "loss": 0.2023, "step": 5317 }, { "epoch": 1.963810930576071, "grad_norm": 0.37182116508483887, "learning_rate": 6.912181303116147e-05, "loss": 0.2837, "step": 5318 }, { "epoch": 1.9641802067946825, "grad_norm": 0.2805497348308563, "learning_rate": 6.909717945559798e-05, "loss": 0.2356, "step": 5319 }, { "epoch": 1.964549483013294, "grad_norm": 0.2930833101272583, "learning_rate": 6.907254588003449e-05, "loss": 0.2019, "step": 5320 }, { "epoch": 1.9649187592319055, "grad_norm": 0.24924765527248383, "learning_rate": 6.9047912304471e-05, "loss": 0.2044, "step": 5321 }, { "epoch": 1.965288035450517, "grad_norm": 0.26734644174575806, "learning_rate": 6.90232787289075e-05, "loss": 0.2452, "step": 5322 }, { "epoch": 1.9656573116691285, "grad_norm": 0.2351471185684204, "learning_rate": 6.899864515334402e-05, "loss": 0.1839, "step": 5323 }, { "epoch": 1.96602658788774, "grad_norm": 0.24871516227722168, "learning_rate": 6.897401157778052e-05, "loss": 0.1809, "step": 5324 }, { "epoch": 1.9663958641063517, "grad_norm": 0.24512727558612823, "learning_rate": 6.894937800221702e-05, "loss": 0.1941, "step": 5325 }, { "epoch": 1.966765140324963, "grad_norm": 0.2477511316537857, "learning_rate": 6.892474442665353e-05, "loss": 0.186, "step": 5326 }, { "epoch": 1.9671344165435745, "grad_norm": 0.2817474901676178, "learning_rate": 6.890011085109003e-05, "loss": 0.2411, "step": 5327 }, { "epoch": 1.967503692762186, "grad_norm": 0.26521196961402893, "learning_rate": 6.887547727552655e-05, "loss": 0.2075, "step": 5328 }, { "epoch": 1.9678729689807977, "grad_norm": 0.2950814366340637, "learning_rate": 6.885084369996305e-05, "loss": 0.2158, "step": 5329 }, { "epoch": 1.9682422451994093, "grad_norm": 0.25968989729881287, "learning_rate": 6.882621012439956e-05, "loss": 0.2009, "step": 5330 }, { "epoch": 1.9686115214180206, "grad_norm": 0.24506819248199463, "learning_rate": 6.880157654883607e-05, "loss": 0.2173, "step": 5331 }, { "epoch": 1.9689807976366323, "grad_norm": 0.26394718885421753, "learning_rate": 6.877694297327258e-05, "loss": 0.2073, "step": 5332 }, { "epoch": 1.9693500738552436, "grad_norm": 0.3314740061759949, "learning_rate": 6.875230939770908e-05, "loss": 0.2251, "step": 5333 }, { "epoch": 1.9697193500738552, "grad_norm": 0.23417414724826813, "learning_rate": 6.872767582214558e-05, "loss": 0.187, "step": 5334 }, { "epoch": 1.9700886262924668, "grad_norm": 0.2197985053062439, "learning_rate": 6.87030422465821e-05, "loss": 0.18, "step": 5335 }, { "epoch": 1.9704579025110784, "grad_norm": 0.3090750277042389, "learning_rate": 6.86784086710186e-05, "loss": 0.2503, "step": 5336 }, { "epoch": 1.9708271787296898, "grad_norm": 0.2460378110408783, "learning_rate": 6.865377509545511e-05, "loss": 0.1782, "step": 5337 }, { "epoch": 1.9711964549483012, "grad_norm": 0.2581346929073334, "learning_rate": 6.862914151989161e-05, "loss": 0.2076, "step": 5338 }, { "epoch": 1.9715657311669128, "grad_norm": 0.31432342529296875, "learning_rate": 6.860450794432813e-05, "loss": 0.2526, "step": 5339 }, { "epoch": 1.9719350073855244, "grad_norm": 0.24623006582260132, "learning_rate": 6.857987436876463e-05, "loss": 0.2085, "step": 5340 }, { "epoch": 1.972304283604136, "grad_norm": 0.2564246356487274, "learning_rate": 6.855524079320113e-05, "loss": 0.2047, "step": 5341 }, { "epoch": 1.9726735598227474, "grad_norm": 0.2790607511997223, "learning_rate": 6.853060721763764e-05, "loss": 0.1971, "step": 5342 }, { "epoch": 1.9730428360413588, "grad_norm": 0.3743658661842346, "learning_rate": 6.850597364207415e-05, "loss": 0.2582, "step": 5343 }, { "epoch": 1.9734121122599704, "grad_norm": 0.27603679895401, "learning_rate": 6.848134006651066e-05, "loss": 0.2305, "step": 5344 }, { "epoch": 1.973781388478582, "grad_norm": 0.23608393967151642, "learning_rate": 6.845670649094716e-05, "loss": 0.1668, "step": 5345 }, { "epoch": 1.9741506646971936, "grad_norm": 0.2994793653488159, "learning_rate": 6.843207291538368e-05, "loss": 0.2404, "step": 5346 }, { "epoch": 1.9745199409158052, "grad_norm": 0.25873616337776184, "learning_rate": 6.840743933982018e-05, "loss": 0.2167, "step": 5347 }, { "epoch": 1.9748892171344166, "grad_norm": 0.24730846285820007, "learning_rate": 6.838280576425669e-05, "loss": 0.2044, "step": 5348 }, { "epoch": 1.975258493353028, "grad_norm": 0.28112494945526123, "learning_rate": 6.835817218869319e-05, "loss": 0.1811, "step": 5349 }, { "epoch": 1.9756277695716395, "grad_norm": 0.2771839499473572, "learning_rate": 6.83335386131297e-05, "loss": 0.2312, "step": 5350 }, { "epoch": 1.9756277695716395, "eval_loss": 0.2520390450954437, "eval_runtime": 5.8584, "eval_samples_per_second": 8.535, "eval_steps_per_second": 1.195, "step": 5350 }, { "epoch": 1.9759970457902511, "grad_norm": 0.30305880308151245, "learning_rate": 6.830890503756621e-05, "loss": 0.2449, "step": 5351 }, { "epoch": 1.9763663220088628, "grad_norm": 0.5072112083435059, "learning_rate": 6.828427146200271e-05, "loss": 0.2536, "step": 5352 }, { "epoch": 1.9767355982274741, "grad_norm": 0.3998947739601135, "learning_rate": 6.825963788643922e-05, "loss": 0.2253, "step": 5353 }, { "epoch": 1.9771048744460855, "grad_norm": 0.2706662118434906, "learning_rate": 6.823500431087573e-05, "loss": 0.1938, "step": 5354 }, { "epoch": 1.9774741506646971, "grad_norm": 0.3075130581855774, "learning_rate": 6.821037073531224e-05, "loss": 0.236, "step": 5355 }, { "epoch": 1.9778434268833087, "grad_norm": 0.24399112164974213, "learning_rate": 6.818573715974874e-05, "loss": 0.225, "step": 5356 }, { "epoch": 1.9782127031019203, "grad_norm": 0.28036928176879883, "learning_rate": 6.816110358418524e-05, "loss": 0.2248, "step": 5357 }, { "epoch": 1.9785819793205317, "grad_norm": 0.2429688572883606, "learning_rate": 6.813647000862176e-05, "loss": 0.2194, "step": 5358 }, { "epoch": 1.9789512555391433, "grad_norm": 0.2006453573703766, "learning_rate": 6.811183643305826e-05, "loss": 0.1701, "step": 5359 }, { "epoch": 1.9793205317577547, "grad_norm": 0.2574852406978607, "learning_rate": 6.808720285749477e-05, "loss": 0.1866, "step": 5360 }, { "epoch": 1.9796898079763663, "grad_norm": 0.31463634967803955, "learning_rate": 6.806256928193127e-05, "loss": 0.2167, "step": 5361 }, { "epoch": 1.980059084194978, "grad_norm": 0.26930132508277893, "learning_rate": 6.803793570636779e-05, "loss": 0.2061, "step": 5362 }, { "epoch": 1.9804283604135895, "grad_norm": 0.30395740270614624, "learning_rate": 6.801330213080429e-05, "loss": 0.2296, "step": 5363 }, { "epoch": 1.9807976366322009, "grad_norm": 0.27030232548713684, "learning_rate": 6.79886685552408e-05, "loss": 0.2207, "step": 5364 }, { "epoch": 1.9811669128508123, "grad_norm": 0.2603955566883087, "learning_rate": 6.79640349796773e-05, "loss": 0.2224, "step": 5365 }, { "epoch": 1.9815361890694239, "grad_norm": 0.2569535970687866, "learning_rate": 6.79394014041138e-05, "loss": 0.2044, "step": 5366 }, { "epoch": 1.9819054652880355, "grad_norm": 0.28096988797187805, "learning_rate": 6.791476782855032e-05, "loss": 0.2207, "step": 5367 }, { "epoch": 1.982274741506647, "grad_norm": 0.2559007704257965, "learning_rate": 6.789013425298682e-05, "loss": 0.1836, "step": 5368 }, { "epoch": 1.9826440177252584, "grad_norm": 0.2963027060031891, "learning_rate": 6.786550067742334e-05, "loss": 0.2192, "step": 5369 }, { "epoch": 1.98301329394387, "grad_norm": 0.34823083877563477, "learning_rate": 6.784086710185984e-05, "loss": 0.2689, "step": 5370 }, { "epoch": 1.9833825701624814, "grad_norm": 0.30800661444664, "learning_rate": 6.781623352629635e-05, "loss": 0.27, "step": 5371 }, { "epoch": 1.983751846381093, "grad_norm": 0.29237210750579834, "learning_rate": 6.779159995073285e-05, "loss": 0.2415, "step": 5372 }, { "epoch": 1.9841211225997046, "grad_norm": 0.25844356417655945, "learning_rate": 6.776696637516935e-05, "loss": 0.1993, "step": 5373 }, { "epoch": 1.9844903988183162, "grad_norm": 0.3420666456222534, "learning_rate": 6.774233279960587e-05, "loss": 0.2425, "step": 5374 }, { "epoch": 1.9848596750369276, "grad_norm": 0.2594550549983978, "learning_rate": 6.771769922404237e-05, "loss": 0.2281, "step": 5375 }, { "epoch": 1.985228951255539, "grad_norm": 0.2631921172142029, "learning_rate": 6.769306564847889e-05, "loss": 0.208, "step": 5376 }, { "epoch": 1.9855982274741506, "grad_norm": 0.26732879877090454, "learning_rate": 6.766843207291539e-05, "loss": 0.2159, "step": 5377 }, { "epoch": 1.9859675036927622, "grad_norm": 0.24175330996513367, "learning_rate": 6.76437984973519e-05, "loss": 0.2031, "step": 5378 }, { "epoch": 1.9863367799113738, "grad_norm": 0.2626853585243225, "learning_rate": 6.76191649217884e-05, "loss": 0.21, "step": 5379 }, { "epoch": 1.9867060561299852, "grad_norm": 0.28533270955085754, "learning_rate": 6.75945313462249e-05, "loss": 0.2124, "step": 5380 }, { "epoch": 1.9870753323485968, "grad_norm": 0.284945011138916, "learning_rate": 6.756989777066142e-05, "loss": 0.2281, "step": 5381 }, { "epoch": 1.9874446085672082, "grad_norm": 0.38396868109703064, "learning_rate": 6.754526419509792e-05, "loss": 0.2384, "step": 5382 }, { "epoch": 1.9878138847858198, "grad_norm": 0.3203868567943573, "learning_rate": 6.752063061953443e-05, "loss": 0.241, "step": 5383 }, { "epoch": 1.9881831610044314, "grad_norm": 0.32469770312309265, "learning_rate": 6.749599704397093e-05, "loss": 0.2473, "step": 5384 }, { "epoch": 1.988552437223043, "grad_norm": 0.2803540527820587, "learning_rate": 6.747136346840745e-05, "loss": 0.2066, "step": 5385 }, { "epoch": 1.9889217134416544, "grad_norm": 0.3043845593929291, "learning_rate": 6.744672989284395e-05, "loss": 0.242, "step": 5386 }, { "epoch": 1.9892909896602657, "grad_norm": 0.2872565686702728, "learning_rate": 6.742209631728046e-05, "loss": 0.2266, "step": 5387 }, { "epoch": 1.9896602658788773, "grad_norm": 0.284017413854599, "learning_rate": 6.739746274171697e-05, "loss": 0.2524, "step": 5388 }, { "epoch": 1.990029542097489, "grad_norm": 0.2788662910461426, "learning_rate": 6.737282916615347e-05, "loss": 0.1927, "step": 5389 }, { "epoch": 1.9903988183161005, "grad_norm": 0.2847667634487152, "learning_rate": 6.734819559058998e-05, "loss": 0.2155, "step": 5390 }, { "epoch": 1.990768094534712, "grad_norm": 0.3192155361175537, "learning_rate": 6.732356201502648e-05, "loss": 0.2498, "step": 5391 }, { "epoch": 1.9911373707533235, "grad_norm": 0.28015974164009094, "learning_rate": 6.7298928439463e-05, "loss": 0.2315, "step": 5392 }, { "epoch": 1.991506646971935, "grad_norm": 0.23439064621925354, "learning_rate": 6.72742948638995e-05, "loss": 0.2264, "step": 5393 }, { "epoch": 1.9918759231905465, "grad_norm": 0.2239377647638321, "learning_rate": 6.724966128833601e-05, "loss": 0.1971, "step": 5394 }, { "epoch": 1.9922451994091581, "grad_norm": 0.28067389130592346, "learning_rate": 6.722502771277251e-05, "loss": 0.2239, "step": 5395 }, { "epoch": 1.9926144756277697, "grad_norm": 0.28197333216667175, "learning_rate": 6.720039413720902e-05, "loss": 0.2223, "step": 5396 }, { "epoch": 1.992983751846381, "grad_norm": 0.25778698921203613, "learning_rate": 6.717576056164553e-05, "loss": 0.2065, "step": 5397 }, { "epoch": 1.9933530280649925, "grad_norm": 0.2842516303062439, "learning_rate": 6.715112698608203e-05, "loss": 0.2293, "step": 5398 }, { "epoch": 1.993722304283604, "grad_norm": 0.23083434998989105, "learning_rate": 6.712649341051855e-05, "loss": 0.2006, "step": 5399 }, { "epoch": 1.9940915805022157, "grad_norm": 0.2679019272327423, "learning_rate": 6.710185983495505e-05, "loss": 0.2441, "step": 5400 }, { "epoch": 1.9940915805022157, "eval_loss": 0.24856449663639069, "eval_runtime": 5.8555, "eval_samples_per_second": 8.539, "eval_steps_per_second": 1.195, "step": 5400 }, { "epoch": 1.9944608567208273, "grad_norm": 0.2704894542694092, "learning_rate": 6.707722625939156e-05, "loss": 0.26, "step": 5401 }, { "epoch": 1.9948301329394387, "grad_norm": 0.28078630566596985, "learning_rate": 6.705259268382806e-05, "loss": 0.2296, "step": 5402 }, { "epoch": 1.9951994091580503, "grad_norm": 0.30929094552993774, "learning_rate": 6.702795910826458e-05, "loss": 0.2217, "step": 5403 }, { "epoch": 1.9955686853766617, "grad_norm": 0.2846240699291229, "learning_rate": 6.700332553270108e-05, "loss": 0.207, "step": 5404 }, { "epoch": 1.9959379615952733, "grad_norm": 0.25282275676727295, "learning_rate": 6.697869195713758e-05, "loss": 0.1919, "step": 5405 }, { "epoch": 1.9963072378138849, "grad_norm": 0.2732691764831543, "learning_rate": 6.69540583815741e-05, "loss": 0.2137, "step": 5406 }, { "epoch": 1.9966765140324965, "grad_norm": 0.316196084022522, "learning_rate": 6.69294248060106e-05, "loss": 0.239, "step": 5407 }, { "epoch": 1.9970457902511078, "grad_norm": 0.2614864110946655, "learning_rate": 6.690479123044711e-05, "loss": 0.2092, "step": 5408 }, { "epoch": 1.9974150664697192, "grad_norm": 0.30646562576293945, "learning_rate": 6.688015765488361e-05, "loss": 0.218, "step": 5409 }, { "epoch": 1.9977843426883308, "grad_norm": 0.29279881715774536, "learning_rate": 6.685552407932013e-05, "loss": 0.2254, "step": 5410 }, { "epoch": 1.9981536189069424, "grad_norm": 0.22450609505176544, "learning_rate": 6.683089050375663e-05, "loss": 0.1983, "step": 5411 }, { "epoch": 1.998522895125554, "grad_norm": 0.2599349617958069, "learning_rate": 6.680625692819313e-05, "loss": 0.1904, "step": 5412 }, { "epoch": 1.9988921713441654, "grad_norm": 0.23740121722221375, "learning_rate": 6.678162335262964e-05, "loss": 0.2221, "step": 5413 }, { "epoch": 1.9992614475627768, "grad_norm": 0.24713720381259918, "learning_rate": 6.675698977706614e-05, "loss": 0.2003, "step": 5414 }, { "epoch": 1.9996307237813884, "grad_norm": 0.3049919903278351, "learning_rate": 6.673235620150266e-05, "loss": 0.2308, "step": 5415 }, { "epoch": 2.0, "grad_norm": 0.3003247082233429, "learning_rate": 6.670772262593916e-05, "loss": 0.2386, "step": 5416 }, { "epoch": 2.0003692762186116, "grad_norm": 0.23018822073936462, "learning_rate": 6.668308905037567e-05, "loss": 0.149, "step": 5417 }, { "epoch": 2.000738552437223, "grad_norm": 0.21735535562038422, "learning_rate": 6.665845547481217e-05, "loss": 0.1752, "step": 5418 }, { "epoch": 2.0011078286558344, "grad_norm": 0.21702949702739716, "learning_rate": 6.663382189924869e-05, "loss": 0.1806, "step": 5419 }, { "epoch": 2.001477104874446, "grad_norm": 0.22789764404296875, "learning_rate": 6.660918832368519e-05, "loss": 0.2246, "step": 5420 }, { "epoch": 2.0018463810930576, "grad_norm": 0.22062279284000397, "learning_rate": 6.658455474812169e-05, "loss": 0.1985, "step": 5421 }, { "epoch": 2.002215657311669, "grad_norm": 0.18831095099449158, "learning_rate": 6.65599211725582e-05, "loss": 0.15, "step": 5422 }, { "epoch": 2.0025849335302808, "grad_norm": 0.25476741790771484, "learning_rate": 6.653528759699471e-05, "loss": 0.1663, "step": 5423 }, { "epoch": 2.0029542097488924, "grad_norm": 0.2441297024488449, "learning_rate": 6.651065402143122e-05, "loss": 0.1603, "step": 5424 }, { "epoch": 2.0033234859675035, "grad_norm": 0.25700199604034424, "learning_rate": 6.648602044586772e-05, "loss": 0.2059, "step": 5425 }, { "epoch": 2.003692762186115, "grad_norm": 0.25105035305023193, "learning_rate": 6.646138687030424e-05, "loss": 0.1579, "step": 5426 }, { "epoch": 2.0040620384047267, "grad_norm": 0.24316802620887756, "learning_rate": 6.643675329474074e-05, "loss": 0.1654, "step": 5427 }, { "epoch": 2.0044313146233383, "grad_norm": 0.21353621780872345, "learning_rate": 6.641211971917724e-05, "loss": 0.1283, "step": 5428 }, { "epoch": 2.00480059084195, "grad_norm": 0.2258773148059845, "learning_rate": 6.638748614361375e-05, "loss": 0.158, "step": 5429 }, { "epoch": 2.005169867060561, "grad_norm": 0.22706244885921478, "learning_rate": 6.636285256805026e-05, "loss": 0.1829, "step": 5430 }, { "epoch": 2.0055391432791727, "grad_norm": 0.252373069524765, "learning_rate": 6.633821899248677e-05, "loss": 0.1573, "step": 5431 }, { "epoch": 2.0059084194977843, "grad_norm": 0.2511407136917114, "learning_rate": 6.631358541692327e-05, "loss": 0.1481, "step": 5432 }, { "epoch": 2.006277695716396, "grad_norm": 0.19744746387004852, "learning_rate": 6.628895184135979e-05, "loss": 0.1547, "step": 5433 }, { "epoch": 2.0066469719350075, "grad_norm": 0.2841181755065918, "learning_rate": 6.626431826579629e-05, "loss": 0.1925, "step": 5434 }, { "epoch": 2.007016248153619, "grad_norm": 0.260434627532959, "learning_rate": 6.62396846902328e-05, "loss": 0.1855, "step": 5435 }, { "epoch": 2.0073855243722303, "grad_norm": 0.21629783511161804, "learning_rate": 6.62150511146693e-05, "loss": 0.1538, "step": 5436 }, { "epoch": 2.007754800590842, "grad_norm": 0.25323525071144104, "learning_rate": 6.61904175391058e-05, "loss": 0.1468, "step": 5437 }, { "epoch": 2.0081240768094535, "grad_norm": 0.2551720440387726, "learning_rate": 6.616578396354232e-05, "loss": 0.18, "step": 5438 }, { "epoch": 2.008493353028065, "grad_norm": 0.30982598662376404, "learning_rate": 6.614115038797882e-05, "loss": 0.1966, "step": 5439 }, { "epoch": 2.0088626292466767, "grad_norm": 0.2552310526371002, "learning_rate": 6.611651681241533e-05, "loss": 0.1756, "step": 5440 }, { "epoch": 2.009231905465288, "grad_norm": 0.3050388991832733, "learning_rate": 6.609188323685184e-05, "loss": 0.1782, "step": 5441 }, { "epoch": 2.0096011816838995, "grad_norm": 0.28946226835250854, "learning_rate": 6.606724966128835e-05, "loss": 0.1919, "step": 5442 }, { "epoch": 2.009970457902511, "grad_norm": 0.36026933789253235, "learning_rate": 6.604261608572485e-05, "loss": 0.1804, "step": 5443 }, { "epoch": 2.0103397341211227, "grad_norm": 0.22443272173404694, "learning_rate": 6.601798251016135e-05, "loss": 0.1823, "step": 5444 }, { "epoch": 2.0107090103397343, "grad_norm": 0.24858684837818146, "learning_rate": 6.599334893459787e-05, "loss": 0.1645, "step": 5445 }, { "epoch": 2.011078286558346, "grad_norm": 0.2676042914390564, "learning_rate": 6.596871535903437e-05, "loss": 0.1724, "step": 5446 }, { "epoch": 2.011447562776957, "grad_norm": 0.28836727142333984, "learning_rate": 6.594408178347088e-05, "loss": 0.193, "step": 5447 }, { "epoch": 2.0118168389955686, "grad_norm": 0.2760215997695923, "learning_rate": 6.591944820790738e-05, "loss": 0.1856, "step": 5448 }, { "epoch": 2.0121861152141802, "grad_norm": 0.29110774397850037, "learning_rate": 6.58948146323439e-05, "loss": 0.1709, "step": 5449 }, { "epoch": 2.012555391432792, "grad_norm": 0.2904491722583771, "learning_rate": 6.58701810567804e-05, "loss": 0.1596, "step": 5450 }, { "epoch": 2.012555391432792, "eval_loss": 0.2550851106643677, "eval_runtime": 5.8496, "eval_samples_per_second": 8.548, "eval_steps_per_second": 1.197, "step": 5450 }, { "epoch": 2.0129246676514034, "grad_norm": 0.3089062571525574, "learning_rate": 6.58455474812169e-05, "loss": 0.167, "step": 5451 }, { "epoch": 2.0132939438700146, "grad_norm": 0.2824161648750305, "learning_rate": 6.582091390565341e-05, "loss": 0.1735, "step": 5452 }, { "epoch": 2.013663220088626, "grad_norm": 0.23326873779296875, "learning_rate": 6.579628033008992e-05, "loss": 0.1657, "step": 5453 }, { "epoch": 2.014032496307238, "grad_norm": 0.27446603775024414, "learning_rate": 6.577164675452643e-05, "loss": 0.1713, "step": 5454 }, { "epoch": 2.0144017725258494, "grad_norm": 0.3505135476589203, "learning_rate": 6.574701317896293e-05, "loss": 0.1818, "step": 5455 }, { "epoch": 2.014771048744461, "grad_norm": 0.26045656204223633, "learning_rate": 6.572237960339945e-05, "loss": 0.1604, "step": 5456 }, { "epoch": 2.015140324963072, "grad_norm": 0.33755213022232056, "learning_rate": 6.569774602783593e-05, "loss": 0.1634, "step": 5457 }, { "epoch": 2.0155096011816838, "grad_norm": 0.2607596814632416, "learning_rate": 6.567311245227245e-05, "loss": 0.181, "step": 5458 }, { "epoch": 2.0158788774002954, "grad_norm": 0.33283165097236633, "learning_rate": 6.564847887670895e-05, "loss": 0.1899, "step": 5459 }, { "epoch": 2.016248153618907, "grad_norm": 0.24610725045204163, "learning_rate": 6.562384530114546e-05, "loss": 0.1495, "step": 5460 }, { "epoch": 2.0166174298375186, "grad_norm": 0.2705758810043335, "learning_rate": 6.559921172558197e-05, "loss": 0.1493, "step": 5461 }, { "epoch": 2.01698670605613, "grad_norm": 0.2775185704231262, "learning_rate": 6.557457815001847e-05, "loss": 0.1896, "step": 5462 }, { "epoch": 2.0173559822747413, "grad_norm": 0.23754863440990448, "learning_rate": 6.554994457445498e-05, "loss": 0.1584, "step": 5463 }, { "epoch": 2.017725258493353, "grad_norm": 0.25853317975997925, "learning_rate": 6.552531099889148e-05, "loss": 0.1713, "step": 5464 }, { "epoch": 2.0180945347119645, "grad_norm": 0.2862049341201782, "learning_rate": 6.5500677423328e-05, "loss": 0.1737, "step": 5465 }, { "epoch": 2.018463810930576, "grad_norm": 0.252202570438385, "learning_rate": 6.54760438477645e-05, "loss": 0.1961, "step": 5466 }, { "epoch": 2.0188330871491877, "grad_norm": 0.3276780843734741, "learning_rate": 6.545141027220101e-05, "loss": 0.1951, "step": 5467 }, { "epoch": 2.019202363367799, "grad_norm": 0.2186700999736786, "learning_rate": 6.542677669663751e-05, "loss": 0.1542, "step": 5468 }, { "epoch": 2.0195716395864105, "grad_norm": 0.263078510761261, "learning_rate": 6.540214312107403e-05, "loss": 0.1858, "step": 5469 }, { "epoch": 2.019940915805022, "grad_norm": 0.2864452004432678, "learning_rate": 6.537750954551053e-05, "loss": 0.2138, "step": 5470 }, { "epoch": 2.0203101920236337, "grad_norm": 0.2598871886730194, "learning_rate": 6.535287596994703e-05, "loss": 0.179, "step": 5471 }, { "epoch": 2.0206794682422453, "grad_norm": 0.25144585967063904, "learning_rate": 6.532824239438355e-05, "loss": 0.1509, "step": 5472 }, { "epoch": 2.021048744460857, "grad_norm": 0.2702910304069519, "learning_rate": 6.530360881882005e-05, "loss": 0.1444, "step": 5473 }, { "epoch": 2.021418020679468, "grad_norm": 0.3086947202682495, "learning_rate": 6.527897524325656e-05, "loss": 0.1638, "step": 5474 }, { "epoch": 2.0217872968980797, "grad_norm": 0.22952225804328918, "learning_rate": 6.525434166769306e-05, "loss": 0.1438, "step": 5475 }, { "epoch": 2.0221565731166913, "grad_norm": 0.23374100029468536, "learning_rate": 6.522970809212958e-05, "loss": 0.1483, "step": 5476 }, { "epoch": 2.022525849335303, "grad_norm": 0.2832000255584717, "learning_rate": 6.520507451656608e-05, "loss": 0.1914, "step": 5477 }, { "epoch": 2.0228951255539145, "grad_norm": 0.23163892328739166, "learning_rate": 6.518044094100258e-05, "loss": 0.1499, "step": 5478 }, { "epoch": 2.0232644017725256, "grad_norm": 0.27283135056495667, "learning_rate": 6.51558073654391e-05, "loss": 0.1714, "step": 5479 }, { "epoch": 2.0236336779911372, "grad_norm": 0.20658475160598755, "learning_rate": 6.51311737898756e-05, "loss": 0.1277, "step": 5480 }, { "epoch": 2.024002954209749, "grad_norm": 0.25312045216560364, "learning_rate": 6.510654021431211e-05, "loss": 0.1585, "step": 5481 }, { "epoch": 2.0243722304283605, "grad_norm": 0.27742844820022583, "learning_rate": 6.508190663874861e-05, "loss": 0.1957, "step": 5482 }, { "epoch": 2.024741506646972, "grad_norm": 0.2213859111070633, "learning_rate": 6.505727306318512e-05, "loss": 0.1625, "step": 5483 }, { "epoch": 2.0251107828655837, "grad_norm": 0.24842393398284912, "learning_rate": 6.503263948762163e-05, "loss": 0.1702, "step": 5484 }, { "epoch": 2.025480059084195, "grad_norm": 0.3059009313583374, "learning_rate": 6.500800591205814e-05, "loss": 0.1568, "step": 5485 }, { "epoch": 2.0258493353028064, "grad_norm": 0.2236127257347107, "learning_rate": 6.498337233649464e-05, "loss": 0.1861, "step": 5486 }, { "epoch": 2.026218611521418, "grad_norm": 0.27241796255111694, "learning_rate": 6.495873876093114e-05, "loss": 0.1783, "step": 5487 }, { "epoch": 2.0265878877400296, "grad_norm": 0.27785786986351013, "learning_rate": 6.493410518536766e-05, "loss": 0.1795, "step": 5488 }, { "epoch": 2.0269571639586412, "grad_norm": 0.27707716822624207, "learning_rate": 6.490947160980416e-05, "loss": 0.1861, "step": 5489 }, { "epoch": 2.0273264401772524, "grad_norm": 0.23874297738075256, "learning_rate": 6.488483803424067e-05, "loss": 0.1639, "step": 5490 }, { "epoch": 2.027695716395864, "grad_norm": 0.30940431356430054, "learning_rate": 6.486020445867717e-05, "loss": 0.1895, "step": 5491 }, { "epoch": 2.0280649926144756, "grad_norm": 0.25607529282569885, "learning_rate": 6.483557088311369e-05, "loss": 0.1672, "step": 5492 }, { "epoch": 2.028434268833087, "grad_norm": 0.2539975345134735, "learning_rate": 6.481093730755019e-05, "loss": 0.1549, "step": 5493 }, { "epoch": 2.028803545051699, "grad_norm": 0.26156702637672424, "learning_rate": 6.478630373198669e-05, "loss": 0.1491, "step": 5494 }, { "epoch": 2.0291728212703104, "grad_norm": 0.24374203383922577, "learning_rate": 6.47616701564232e-05, "loss": 0.14, "step": 5495 }, { "epoch": 2.0295420974889216, "grad_norm": 0.2342911958694458, "learning_rate": 6.47370365808597e-05, "loss": 0.1472, "step": 5496 }, { "epoch": 2.029911373707533, "grad_norm": 0.21979846060276031, "learning_rate": 6.471240300529622e-05, "loss": 0.1593, "step": 5497 }, { "epoch": 2.0302806499261448, "grad_norm": 0.22841975092887878, "learning_rate": 6.468776942973272e-05, "loss": 0.1472, "step": 5498 }, { "epoch": 2.0306499261447564, "grad_norm": 0.31238147616386414, "learning_rate": 6.466313585416924e-05, "loss": 0.1901, "step": 5499 }, { "epoch": 2.031019202363368, "grad_norm": 0.24541234970092773, "learning_rate": 6.463850227860574e-05, "loss": 0.1618, "step": 5500 }, { "epoch": 2.031019202363368, "eval_loss": 0.2564271092414856, "eval_runtime": 5.8692, "eval_samples_per_second": 8.519, "eval_steps_per_second": 1.193, "step": 5500 }, { "epoch": 2.031388478581979, "grad_norm": 0.2768562138080597, "learning_rate": 6.461386870304225e-05, "loss": 0.1751, "step": 5501 }, { "epoch": 2.0317577548005907, "grad_norm": 0.2732270658016205, "learning_rate": 6.458923512747875e-05, "loss": 0.1786, "step": 5502 }, { "epoch": 2.0321270310192023, "grad_norm": 0.296970933675766, "learning_rate": 6.456460155191526e-05, "loss": 0.1761, "step": 5503 }, { "epoch": 2.032496307237814, "grad_norm": 0.2972950339317322, "learning_rate": 6.453996797635177e-05, "loss": 0.1774, "step": 5504 }, { "epoch": 2.0328655834564255, "grad_norm": 0.24320095777511597, "learning_rate": 6.451533440078827e-05, "loss": 0.1708, "step": 5505 }, { "epoch": 2.033234859675037, "grad_norm": 0.2769164443016052, "learning_rate": 6.449070082522479e-05, "loss": 0.1649, "step": 5506 }, { "epoch": 2.0336041358936483, "grad_norm": 0.2682860195636749, "learning_rate": 6.446606724966129e-05, "loss": 0.1511, "step": 5507 }, { "epoch": 2.03397341211226, "grad_norm": 0.22829866409301758, "learning_rate": 6.44414336740978e-05, "loss": 0.1419, "step": 5508 }, { "epoch": 2.0343426883308715, "grad_norm": 0.34367161989212036, "learning_rate": 6.44168000985343e-05, "loss": 0.1817, "step": 5509 }, { "epoch": 2.034711964549483, "grad_norm": 0.2511090636253357, "learning_rate": 6.43921665229708e-05, "loss": 0.1587, "step": 5510 }, { "epoch": 2.0350812407680947, "grad_norm": 0.30627578496932983, "learning_rate": 6.436753294740732e-05, "loss": 0.1792, "step": 5511 }, { "epoch": 2.035450516986706, "grad_norm": 0.26363179087638855, "learning_rate": 6.434289937184382e-05, "loss": 0.1718, "step": 5512 }, { "epoch": 2.0358197932053175, "grad_norm": 0.2435271292924881, "learning_rate": 6.431826579628033e-05, "loss": 0.162, "step": 5513 }, { "epoch": 2.036189069423929, "grad_norm": 0.23334969580173492, "learning_rate": 6.429363222071683e-05, "loss": 0.1541, "step": 5514 }, { "epoch": 2.0365583456425407, "grad_norm": 0.22303305566310883, "learning_rate": 6.426899864515335e-05, "loss": 0.1788, "step": 5515 }, { "epoch": 2.0369276218611523, "grad_norm": 0.25421905517578125, "learning_rate": 6.424436506958985e-05, "loss": 0.1942, "step": 5516 }, { "epoch": 2.037296898079764, "grad_norm": 0.21403385698795319, "learning_rate": 6.421973149402637e-05, "loss": 0.1451, "step": 5517 }, { "epoch": 2.037666174298375, "grad_norm": 0.2948196232318878, "learning_rate": 6.419509791846287e-05, "loss": 0.1988, "step": 5518 }, { "epoch": 2.0380354505169866, "grad_norm": 0.2889154553413391, "learning_rate": 6.417046434289937e-05, "loss": 0.1902, "step": 5519 }, { "epoch": 2.0384047267355982, "grad_norm": 0.2615763545036316, "learning_rate": 6.414583076733588e-05, "loss": 0.1752, "step": 5520 }, { "epoch": 2.03877400295421, "grad_norm": 0.2170265167951584, "learning_rate": 6.412119719177238e-05, "loss": 0.1562, "step": 5521 }, { "epoch": 2.0391432791728215, "grad_norm": 0.26524242758750916, "learning_rate": 6.40965636162089e-05, "loss": 0.151, "step": 5522 }, { "epoch": 2.0395125553914326, "grad_norm": 0.2578350007534027, "learning_rate": 6.40719300406454e-05, "loss": 0.1652, "step": 5523 }, { "epoch": 2.039881831610044, "grad_norm": 0.24465960264205933, "learning_rate": 6.404729646508191e-05, "loss": 0.1788, "step": 5524 }, { "epoch": 2.040251107828656, "grad_norm": 0.27955836057662964, "learning_rate": 6.402266288951841e-05, "loss": 0.1638, "step": 5525 }, { "epoch": 2.0406203840472674, "grad_norm": 0.3361818492412567, "learning_rate": 6.399802931395492e-05, "loss": 0.1654, "step": 5526 }, { "epoch": 2.040989660265879, "grad_norm": 0.3011032044887543, "learning_rate": 6.397339573839143e-05, "loss": 0.1583, "step": 5527 }, { "epoch": 2.04135893648449, "grad_norm": 0.34659239649772644, "learning_rate": 6.394876216282793e-05, "loss": 0.2047, "step": 5528 }, { "epoch": 2.041728212703102, "grad_norm": 0.24762535095214844, "learning_rate": 6.392412858726445e-05, "loss": 0.1635, "step": 5529 }, { "epoch": 2.0420974889217134, "grad_norm": 0.25023153424263, "learning_rate": 6.389949501170095e-05, "loss": 0.165, "step": 5530 }, { "epoch": 2.042466765140325, "grad_norm": 0.34106186032295227, "learning_rate": 6.387486143613746e-05, "loss": 0.19, "step": 5531 }, { "epoch": 2.0428360413589366, "grad_norm": 0.21632955968379974, "learning_rate": 6.385022786057396e-05, "loss": 0.1568, "step": 5532 }, { "epoch": 2.043205317577548, "grad_norm": 0.24720235168933868, "learning_rate": 6.382559428501046e-05, "loss": 0.1443, "step": 5533 }, { "epoch": 2.0435745937961594, "grad_norm": 0.23382613062858582, "learning_rate": 6.380096070944698e-05, "loss": 0.1661, "step": 5534 }, { "epoch": 2.043943870014771, "grad_norm": 0.277658075094223, "learning_rate": 6.377632713388348e-05, "loss": 0.1561, "step": 5535 }, { "epoch": 2.0443131462333826, "grad_norm": 0.2528332769870758, "learning_rate": 6.375169355832e-05, "loss": 0.1645, "step": 5536 }, { "epoch": 2.044682422451994, "grad_norm": 0.30018797516822815, "learning_rate": 6.37270599827565e-05, "loss": 0.1701, "step": 5537 }, { "epoch": 2.0450516986706058, "grad_norm": 0.2822677195072174, "learning_rate": 6.370242640719301e-05, "loss": 0.1809, "step": 5538 }, { "epoch": 2.045420974889217, "grad_norm": 0.2654092013835907, "learning_rate": 6.367779283162951e-05, "loss": 0.1632, "step": 5539 }, { "epoch": 2.0457902511078285, "grad_norm": 0.3735482394695282, "learning_rate": 6.365315925606603e-05, "loss": 0.1651, "step": 5540 }, { "epoch": 2.04615952732644, "grad_norm": 0.2744656205177307, "learning_rate": 6.362852568050253e-05, "loss": 0.1583, "step": 5541 }, { "epoch": 2.0465288035450517, "grad_norm": 0.275022029876709, "learning_rate": 6.360389210493903e-05, "loss": 0.1627, "step": 5542 }, { "epoch": 2.0468980797636633, "grad_norm": 0.2592197060585022, "learning_rate": 6.357925852937554e-05, "loss": 0.1931, "step": 5543 }, { "epoch": 2.047267355982275, "grad_norm": 0.27321985363960266, "learning_rate": 6.355462495381204e-05, "loss": 0.1649, "step": 5544 }, { "epoch": 2.047636632200886, "grad_norm": 0.2739521265029907, "learning_rate": 6.352999137824856e-05, "loss": 0.1475, "step": 5545 }, { "epoch": 2.0480059084194977, "grad_norm": 0.36737266182899475, "learning_rate": 6.350535780268506e-05, "loss": 0.1709, "step": 5546 }, { "epoch": 2.0483751846381093, "grad_norm": 0.22715000808238983, "learning_rate": 6.348072422712157e-05, "loss": 0.1704, "step": 5547 }, { "epoch": 2.048744460856721, "grad_norm": 0.31132203340530396, "learning_rate": 6.345609065155808e-05, "loss": 0.1977, "step": 5548 }, { "epoch": 2.0491137370753325, "grad_norm": 0.3072618842124939, "learning_rate": 6.343145707599458e-05, "loss": 0.1749, "step": 5549 }, { "epoch": 2.0494830132939437, "grad_norm": 0.2475789487361908, "learning_rate": 6.340682350043109e-05, "loss": 0.1598, "step": 5550 }, { "epoch": 2.0494830132939437, "eval_loss": 0.25815775990486145, "eval_runtime": 5.8602, "eval_samples_per_second": 8.532, "eval_steps_per_second": 1.194, "step": 5550 }, { "epoch": 2.0498522895125553, "grad_norm": 0.23874615132808685, "learning_rate": 6.338218992486759e-05, "loss": 0.1459, "step": 5551 }, { "epoch": 2.050221565731167, "grad_norm": 0.25430697202682495, "learning_rate": 6.33575563493041e-05, "loss": 0.192, "step": 5552 }, { "epoch": 2.0505908419497785, "grad_norm": 0.2793968915939331, "learning_rate": 6.333292277374061e-05, "loss": 0.1772, "step": 5553 }, { "epoch": 2.05096011816839, "grad_norm": 0.26261192560195923, "learning_rate": 6.330828919817712e-05, "loss": 0.1658, "step": 5554 }, { "epoch": 2.0513293943870017, "grad_norm": 0.3066626489162445, "learning_rate": 6.328365562261362e-05, "loss": 0.1926, "step": 5555 }, { "epoch": 2.051698670605613, "grad_norm": 0.2415955513715744, "learning_rate": 6.325902204705014e-05, "loss": 0.1712, "step": 5556 }, { "epoch": 2.0520679468242244, "grad_norm": 0.2773614525794983, "learning_rate": 6.323438847148664e-05, "loss": 0.1604, "step": 5557 }, { "epoch": 2.052437223042836, "grad_norm": 0.2865541875362396, "learning_rate": 6.320975489592314e-05, "loss": 0.1712, "step": 5558 }, { "epoch": 2.0528064992614476, "grad_norm": 0.351630836725235, "learning_rate": 6.318512132035965e-05, "loss": 0.1914, "step": 5559 }, { "epoch": 2.0531757754800593, "grad_norm": 0.2456756830215454, "learning_rate": 6.316048774479616e-05, "loss": 0.1666, "step": 5560 }, { "epoch": 2.0535450516986704, "grad_norm": 0.23966416716575623, "learning_rate": 6.313585416923267e-05, "loss": 0.1635, "step": 5561 }, { "epoch": 2.053914327917282, "grad_norm": 0.2779577970504761, "learning_rate": 6.311122059366917e-05, "loss": 0.158, "step": 5562 }, { "epoch": 2.0542836041358936, "grad_norm": 0.31444647908210754, "learning_rate": 6.308658701810569e-05, "loss": 0.1747, "step": 5563 }, { "epoch": 2.054652880354505, "grad_norm": 0.2886969745159149, "learning_rate": 6.306195344254219e-05, "loss": 0.185, "step": 5564 }, { "epoch": 2.055022156573117, "grad_norm": 0.2612532377243042, "learning_rate": 6.303731986697869e-05, "loss": 0.1785, "step": 5565 }, { "epoch": 2.0553914327917284, "grad_norm": 0.22525450587272644, "learning_rate": 6.30126862914152e-05, "loss": 0.1506, "step": 5566 }, { "epoch": 2.0557607090103396, "grad_norm": 0.2780967354774475, "learning_rate": 6.29880527158517e-05, "loss": 0.1709, "step": 5567 }, { "epoch": 2.056129985228951, "grad_norm": 0.29679059982299805, "learning_rate": 6.296341914028822e-05, "loss": 0.1633, "step": 5568 }, { "epoch": 2.056499261447563, "grad_norm": 0.27485817670822144, "learning_rate": 6.293878556472472e-05, "loss": 0.1782, "step": 5569 }, { "epoch": 2.0568685376661744, "grad_norm": 0.2389979511499405, "learning_rate": 6.291415198916123e-05, "loss": 0.1636, "step": 5570 }, { "epoch": 2.057237813884786, "grad_norm": 0.2602463662624359, "learning_rate": 6.288951841359774e-05, "loss": 0.1719, "step": 5571 }, { "epoch": 2.057607090103397, "grad_norm": 0.25759291648864746, "learning_rate": 6.286488483803425e-05, "loss": 0.1322, "step": 5572 }, { "epoch": 2.0579763663220088, "grad_norm": 0.24198909103870392, "learning_rate": 6.284025126247075e-05, "loss": 0.1609, "step": 5573 }, { "epoch": 2.0583456425406204, "grad_norm": 0.301605224609375, "learning_rate": 6.281561768690725e-05, "loss": 0.1893, "step": 5574 }, { "epoch": 2.058714918759232, "grad_norm": 0.3183390498161316, "learning_rate": 6.279098411134377e-05, "loss": 0.1738, "step": 5575 }, { "epoch": 2.0590841949778436, "grad_norm": 0.27507612109184265, "learning_rate": 6.276635053578027e-05, "loss": 0.176, "step": 5576 }, { "epoch": 2.059453471196455, "grad_norm": 0.2369697540998459, "learning_rate": 6.274171696021678e-05, "loss": 0.1693, "step": 5577 }, { "epoch": 2.0598227474150663, "grad_norm": 0.2782268524169922, "learning_rate": 6.271708338465328e-05, "loss": 0.1729, "step": 5578 }, { "epoch": 2.060192023633678, "grad_norm": 0.2308182567358017, "learning_rate": 6.26924498090898e-05, "loss": 0.1646, "step": 5579 }, { "epoch": 2.0605612998522895, "grad_norm": 0.30158933997154236, "learning_rate": 6.26678162335263e-05, "loss": 0.1658, "step": 5580 }, { "epoch": 2.060930576070901, "grad_norm": 0.21813644468784332, "learning_rate": 6.26431826579628e-05, "loss": 0.1644, "step": 5581 }, { "epoch": 2.0612998522895127, "grad_norm": 0.21864870190620422, "learning_rate": 6.261854908239932e-05, "loss": 0.1573, "step": 5582 }, { "epoch": 2.061669128508124, "grad_norm": 0.27184343338012695, "learning_rate": 6.259391550683582e-05, "loss": 0.1883, "step": 5583 }, { "epoch": 2.0620384047267355, "grad_norm": 0.2512473464012146, "learning_rate": 6.256928193127233e-05, "loss": 0.1885, "step": 5584 }, { "epoch": 2.062407680945347, "grad_norm": 0.25281256437301636, "learning_rate": 6.254464835570883e-05, "loss": 0.1699, "step": 5585 }, { "epoch": 2.0627769571639587, "grad_norm": 0.25659340620040894, "learning_rate": 6.252001478014535e-05, "loss": 0.179, "step": 5586 }, { "epoch": 2.0631462333825703, "grad_norm": 0.2874915897846222, "learning_rate": 6.249538120458185e-05, "loss": 0.1606, "step": 5587 }, { "epoch": 2.0635155096011815, "grad_norm": 0.24694329500198364, "learning_rate": 6.247074762901836e-05, "loss": 0.1731, "step": 5588 }, { "epoch": 2.063884785819793, "grad_norm": 0.3023461401462555, "learning_rate": 6.244611405345486e-05, "loss": 0.199, "step": 5589 }, { "epoch": 2.0642540620384047, "grad_norm": 0.2268141806125641, "learning_rate": 6.242148047789136e-05, "loss": 0.1469, "step": 5590 }, { "epoch": 2.0646233382570163, "grad_norm": 0.2839057743549347, "learning_rate": 6.239684690232788e-05, "loss": 0.2117, "step": 5591 }, { "epoch": 2.064992614475628, "grad_norm": 0.24757793545722961, "learning_rate": 6.237221332676438e-05, "loss": 0.1649, "step": 5592 }, { "epoch": 2.0653618906942395, "grad_norm": 0.2589331865310669, "learning_rate": 6.23475797512009e-05, "loss": 0.1732, "step": 5593 }, { "epoch": 2.0657311669128506, "grad_norm": 0.2764144539833069, "learning_rate": 6.23229461756374e-05, "loss": 0.1644, "step": 5594 }, { "epoch": 2.0661004431314622, "grad_norm": 0.27270591259002686, "learning_rate": 6.229831260007391e-05, "loss": 0.1705, "step": 5595 }, { "epoch": 2.066469719350074, "grad_norm": 0.2604628801345825, "learning_rate": 6.227367902451041e-05, "loss": 0.1793, "step": 5596 }, { "epoch": 2.0668389955686854, "grad_norm": 0.255330890417099, "learning_rate": 6.224904544894691e-05, "loss": 0.1455, "step": 5597 }, { "epoch": 2.067208271787297, "grad_norm": 0.23751473426818848, "learning_rate": 6.222441187338343e-05, "loss": 0.1654, "step": 5598 }, { "epoch": 2.067577548005908, "grad_norm": 0.27682632207870483, "learning_rate": 6.219977829781993e-05, "loss": 0.1736, "step": 5599 }, { "epoch": 2.06794682422452, "grad_norm": 0.24163684248924255, "learning_rate": 6.217514472225644e-05, "loss": 0.1672, "step": 5600 }, { "epoch": 2.06794682422452, "eval_loss": 0.260868102312088, "eval_runtime": 5.8622, "eval_samples_per_second": 8.529, "eval_steps_per_second": 1.194, "step": 5600 }, { "epoch": 2.0683161004431314, "grad_norm": 0.25598397850990295, "learning_rate": 6.215051114669294e-05, "loss": 0.1682, "step": 5601 }, { "epoch": 2.068685376661743, "grad_norm": 0.30159834027290344, "learning_rate": 6.212587757112946e-05, "loss": 0.2142, "step": 5602 }, { "epoch": 2.0690546528803546, "grad_norm": 0.25344955921173096, "learning_rate": 6.210124399556596e-05, "loss": 0.1854, "step": 5603 }, { "epoch": 2.069423929098966, "grad_norm": 0.24966850876808167, "learning_rate": 6.207661042000246e-05, "loss": 0.1933, "step": 5604 }, { "epoch": 2.0697932053175774, "grad_norm": 0.2880747616291046, "learning_rate": 6.205197684443898e-05, "loss": 0.1898, "step": 5605 }, { "epoch": 2.070162481536189, "grad_norm": 0.2810298800468445, "learning_rate": 6.202734326887548e-05, "loss": 0.1896, "step": 5606 }, { "epoch": 2.0705317577548006, "grad_norm": 0.2763424217700958, "learning_rate": 6.200270969331199e-05, "loss": 0.1501, "step": 5607 }, { "epoch": 2.070901033973412, "grad_norm": 0.36380735039711, "learning_rate": 6.197807611774849e-05, "loss": 0.1919, "step": 5608 }, { "epoch": 2.071270310192024, "grad_norm": 0.28891077637672424, "learning_rate": 6.195344254218501e-05, "loss": 0.1598, "step": 5609 }, { "epoch": 2.071639586410635, "grad_norm": 0.23979365825653076, "learning_rate": 6.192880896662151e-05, "loss": 0.1534, "step": 5610 }, { "epoch": 2.0720088626292466, "grad_norm": 0.3012255132198334, "learning_rate": 6.190417539105802e-05, "loss": 0.1798, "step": 5611 }, { "epoch": 2.072378138847858, "grad_norm": 0.27794837951660156, "learning_rate": 6.187954181549452e-05, "loss": 0.1948, "step": 5612 }, { "epoch": 2.0727474150664698, "grad_norm": 0.2556271255016327, "learning_rate": 6.185490823993103e-05, "loss": 0.1652, "step": 5613 }, { "epoch": 2.0731166912850814, "grad_norm": 0.3252602219581604, "learning_rate": 6.183027466436754e-05, "loss": 0.1782, "step": 5614 }, { "epoch": 2.073485967503693, "grad_norm": 0.32153990864753723, "learning_rate": 6.180564108880404e-05, "loss": 0.1885, "step": 5615 }, { "epoch": 2.073855243722304, "grad_norm": 0.3016199469566345, "learning_rate": 6.178100751324056e-05, "loss": 0.1746, "step": 5616 }, { "epoch": 2.0742245199409157, "grad_norm": 0.25367647409439087, "learning_rate": 6.175637393767706e-05, "loss": 0.1647, "step": 5617 }, { "epoch": 2.0745937961595273, "grad_norm": 0.34137043356895447, "learning_rate": 6.173174036211357e-05, "loss": 0.1779, "step": 5618 }, { "epoch": 2.074963072378139, "grad_norm": 0.26231876015663147, "learning_rate": 6.170710678655007e-05, "loss": 0.1695, "step": 5619 }, { "epoch": 2.0753323485967505, "grad_norm": 0.26675739884376526, "learning_rate": 6.168247321098657e-05, "loss": 0.1706, "step": 5620 }, { "epoch": 2.0757016248153617, "grad_norm": 0.3152809739112854, "learning_rate": 6.165783963542309e-05, "loss": 0.1905, "step": 5621 }, { "epoch": 2.0760709010339733, "grad_norm": 0.24944278597831726, "learning_rate": 6.163320605985959e-05, "loss": 0.1589, "step": 5622 }, { "epoch": 2.076440177252585, "grad_norm": 0.23552517592906952, "learning_rate": 6.16085724842961e-05, "loss": 0.1459, "step": 5623 }, { "epoch": 2.0768094534711965, "grad_norm": 0.26706674695014954, "learning_rate": 6.15839389087326e-05, "loss": 0.1648, "step": 5624 }, { "epoch": 2.077178729689808, "grad_norm": 0.28164857625961304, "learning_rate": 6.155930533316912e-05, "loss": 0.1594, "step": 5625 }, { "epoch": 2.0775480059084197, "grad_norm": 0.2577347457408905, "learning_rate": 6.153467175760562e-05, "loss": 0.1957, "step": 5626 }, { "epoch": 2.077917282127031, "grad_norm": 0.29985523223876953, "learning_rate": 6.151003818204214e-05, "loss": 0.2031, "step": 5627 }, { "epoch": 2.0782865583456425, "grad_norm": 0.3037504255771637, "learning_rate": 6.148540460647864e-05, "loss": 0.1784, "step": 5628 }, { "epoch": 2.078655834564254, "grad_norm": 0.2820575535297394, "learning_rate": 6.146077103091514e-05, "loss": 0.1661, "step": 5629 }, { "epoch": 2.0790251107828657, "grad_norm": 0.30463945865631104, "learning_rate": 6.143613745535165e-05, "loss": 0.2109, "step": 5630 }, { "epoch": 2.0793943870014773, "grad_norm": 0.26707136631011963, "learning_rate": 6.141150387978815e-05, "loss": 0.1838, "step": 5631 }, { "epoch": 2.0797636632200884, "grad_norm": 0.29655009508132935, "learning_rate": 6.138687030422467e-05, "loss": 0.165, "step": 5632 }, { "epoch": 2.0801329394387, "grad_norm": 0.24325284361839294, "learning_rate": 6.136223672866117e-05, "loss": 0.1723, "step": 5633 }, { "epoch": 2.0805022156573116, "grad_norm": 0.26495853066444397, "learning_rate": 6.133760315309768e-05, "loss": 0.1832, "step": 5634 }, { "epoch": 2.0808714918759232, "grad_norm": 0.21448932588100433, "learning_rate": 6.131296957753418e-05, "loss": 0.1364, "step": 5635 }, { "epoch": 2.081240768094535, "grad_norm": 0.2707633376121521, "learning_rate": 6.128833600197069e-05, "loss": 0.1593, "step": 5636 }, { "epoch": 2.0816100443131464, "grad_norm": 0.3235558271408081, "learning_rate": 6.12637024264072e-05, "loss": 0.1714, "step": 5637 }, { "epoch": 2.0819793205317576, "grad_norm": 0.24738089740276337, "learning_rate": 6.12390688508437e-05, "loss": 0.1734, "step": 5638 }, { "epoch": 2.082348596750369, "grad_norm": 0.2667107880115509, "learning_rate": 6.121443527528022e-05, "loss": 0.1825, "step": 5639 }, { "epoch": 2.082717872968981, "grad_norm": 0.307449609041214, "learning_rate": 6.118980169971672e-05, "loss": 0.1724, "step": 5640 }, { "epoch": 2.0830871491875924, "grad_norm": 0.25481271743774414, "learning_rate": 6.116516812415323e-05, "loss": 0.1765, "step": 5641 }, { "epoch": 2.083456425406204, "grad_norm": 0.22381843626499176, "learning_rate": 6.114053454858973e-05, "loss": 0.1563, "step": 5642 }, { "epoch": 2.083825701624815, "grad_norm": 0.26986509561538696, "learning_rate": 6.111590097302625e-05, "loss": 0.1697, "step": 5643 }, { "epoch": 2.0841949778434268, "grad_norm": 0.22092558443546295, "learning_rate": 6.109126739746275e-05, "loss": 0.1517, "step": 5644 }, { "epoch": 2.0845642540620384, "grad_norm": 0.29334571957588196, "learning_rate": 6.106663382189925e-05, "loss": 0.1742, "step": 5645 }, { "epoch": 2.08493353028065, "grad_norm": 0.29605746269226074, "learning_rate": 6.104200024633576e-05, "loss": 0.1656, "step": 5646 }, { "epoch": 2.0853028064992616, "grad_norm": 0.28364232182502747, "learning_rate": 6.101736667077227e-05, "loss": 0.1706, "step": 5647 }, { "epoch": 2.085672082717873, "grad_norm": 0.21887627243995667, "learning_rate": 6.099273309520877e-05, "loss": 0.1366, "step": 5648 }, { "epoch": 2.0860413589364843, "grad_norm": 0.2478788197040558, "learning_rate": 6.096809951964528e-05, "loss": 0.1793, "step": 5649 }, { "epoch": 2.086410635155096, "grad_norm": 0.27544811367988586, "learning_rate": 6.094346594408179e-05, "loss": 0.2005, "step": 5650 }, { "epoch": 2.086410635155096, "eval_loss": 0.25888141989707947, "eval_runtime": 5.8564, "eval_samples_per_second": 8.538, "eval_steps_per_second": 1.195, "step": 5650 }, { "epoch": 2.0867799113737076, "grad_norm": 0.24575506150722504, "learning_rate": 6.09188323685183e-05, "loss": 0.1667, "step": 5651 }, { "epoch": 2.087149187592319, "grad_norm": 0.236062154173851, "learning_rate": 6.0894198792954805e-05, "loss": 0.1763, "step": 5652 }, { "epoch": 2.0875184638109308, "grad_norm": 0.245724618434906, "learning_rate": 6.086956521739131e-05, "loss": 0.165, "step": 5653 }, { "epoch": 2.087887740029542, "grad_norm": 0.22567129135131836, "learning_rate": 6.084493164182782e-05, "loss": 0.1586, "step": 5654 }, { "epoch": 2.0882570162481535, "grad_norm": 0.2369643896818161, "learning_rate": 6.082029806626433e-05, "loss": 0.1555, "step": 5655 }, { "epoch": 2.088626292466765, "grad_norm": 0.2373889833688736, "learning_rate": 6.079566449070083e-05, "loss": 0.1789, "step": 5656 }, { "epoch": 2.0889955686853767, "grad_norm": 0.2397778481245041, "learning_rate": 6.077103091513734e-05, "loss": 0.1715, "step": 5657 }, { "epoch": 2.0893648449039883, "grad_norm": 0.28299954533576965, "learning_rate": 6.0746397339573845e-05, "loss": 0.182, "step": 5658 }, { "epoch": 2.0897341211226, "grad_norm": 0.26077738404273987, "learning_rate": 6.072176376401035e-05, "loss": 0.1925, "step": 5659 }, { "epoch": 2.090103397341211, "grad_norm": 0.26217538118362427, "learning_rate": 6.069713018844686e-05, "loss": 0.1806, "step": 5660 }, { "epoch": 2.0904726735598227, "grad_norm": 0.2796938717365265, "learning_rate": 6.067249661288337e-05, "loss": 0.176, "step": 5661 }, { "epoch": 2.0908419497784343, "grad_norm": 0.2654780149459839, "learning_rate": 6.0647863037319877e-05, "loss": 0.1719, "step": 5662 }, { "epoch": 2.091211225997046, "grad_norm": 0.35504603385925293, "learning_rate": 6.0623229461756384e-05, "loss": 0.1886, "step": 5663 }, { "epoch": 2.0915805022156575, "grad_norm": 0.2887597680091858, "learning_rate": 6.0598595886192886e-05, "loss": 0.1772, "step": 5664 }, { "epoch": 2.0919497784342687, "grad_norm": 0.22546793520450592, "learning_rate": 6.057396231062939e-05, "loss": 0.1572, "step": 5665 }, { "epoch": 2.0923190546528803, "grad_norm": 0.29904258251190186, "learning_rate": 6.05493287350659e-05, "loss": 0.2133, "step": 5666 }, { "epoch": 2.092688330871492, "grad_norm": 0.24293597042560577, "learning_rate": 6.052469515950241e-05, "loss": 0.1823, "step": 5667 }, { "epoch": 2.0930576070901035, "grad_norm": 0.24957305192947388, "learning_rate": 6.050006158393892e-05, "loss": 0.1585, "step": 5668 }, { "epoch": 2.093426883308715, "grad_norm": 0.28926122188568115, "learning_rate": 6.0475428008375425e-05, "loss": 0.1833, "step": 5669 }, { "epoch": 2.0937961595273262, "grad_norm": 0.2231256365776062, "learning_rate": 6.045079443281193e-05, "loss": 0.1712, "step": 5670 }, { "epoch": 2.094165435745938, "grad_norm": 0.22925357520580292, "learning_rate": 6.042616085724844e-05, "loss": 0.166, "step": 5671 }, { "epoch": 2.0945347119645494, "grad_norm": 0.2851234972476959, "learning_rate": 6.040152728168494e-05, "loss": 0.1831, "step": 5672 }, { "epoch": 2.094903988183161, "grad_norm": 0.28612107038497925, "learning_rate": 6.037689370612145e-05, "loss": 0.1596, "step": 5673 }, { "epoch": 2.0952732644017726, "grad_norm": 0.26201173663139343, "learning_rate": 6.035226013055796e-05, "loss": 0.1571, "step": 5674 }, { "epoch": 2.0956425406203842, "grad_norm": 0.3124045431613922, "learning_rate": 6.0327626554994465e-05, "loss": 0.1723, "step": 5675 }, { "epoch": 2.0960118168389954, "grad_norm": 0.33962711691856384, "learning_rate": 6.030299297943097e-05, "loss": 0.1974, "step": 5676 }, { "epoch": 2.096381093057607, "grad_norm": 0.28157955408096313, "learning_rate": 6.027835940386748e-05, "loss": 0.1745, "step": 5677 }, { "epoch": 2.0967503692762186, "grad_norm": 0.27035462856292725, "learning_rate": 6.025372582830399e-05, "loss": 0.161, "step": 5678 }, { "epoch": 2.09711964549483, "grad_norm": 0.26044461131095886, "learning_rate": 6.022909225274049e-05, "loss": 0.1587, "step": 5679 }, { "epoch": 2.097488921713442, "grad_norm": 0.25345587730407715, "learning_rate": 6.0204458677177e-05, "loss": 0.1576, "step": 5680 }, { "epoch": 2.097858197932053, "grad_norm": 0.3048918843269348, "learning_rate": 6.0179825101613506e-05, "loss": 0.1781, "step": 5681 }, { "epoch": 2.0982274741506646, "grad_norm": 0.29959362745285034, "learning_rate": 6.0155191526050014e-05, "loss": 0.1477, "step": 5682 }, { "epoch": 2.098596750369276, "grad_norm": 0.38074469566345215, "learning_rate": 6.013055795048652e-05, "loss": 0.1791, "step": 5683 }, { "epoch": 2.098966026587888, "grad_norm": 0.278363436460495, "learning_rate": 6.010592437492303e-05, "loss": 0.1654, "step": 5684 }, { "epoch": 2.0993353028064994, "grad_norm": 0.28001198172569275, "learning_rate": 6.008129079935954e-05, "loss": 0.1688, "step": 5685 }, { "epoch": 2.099704579025111, "grad_norm": 0.32029715180397034, "learning_rate": 6.0056657223796045e-05, "loss": 0.1629, "step": 5686 }, { "epoch": 2.100073855243722, "grad_norm": 0.2780267298221588, "learning_rate": 6.0032023648232546e-05, "loss": 0.1759, "step": 5687 }, { "epoch": 2.1004431314623337, "grad_norm": 0.2489921599626541, "learning_rate": 6.0007390072669054e-05, "loss": 0.1575, "step": 5688 }, { "epoch": 2.1008124076809453, "grad_norm": 0.2580353915691376, "learning_rate": 5.9982756497105555e-05, "loss": 0.1509, "step": 5689 }, { "epoch": 2.101181683899557, "grad_norm": 0.24058429896831512, "learning_rate": 5.9958122921542056e-05, "loss": 0.1673, "step": 5690 }, { "epoch": 2.1015509601181686, "grad_norm": 0.2796524167060852, "learning_rate": 5.9933489345978564e-05, "loss": 0.1593, "step": 5691 }, { "epoch": 2.1019202363367797, "grad_norm": 0.2787950336933136, "learning_rate": 5.990885577041507e-05, "loss": 0.1819, "step": 5692 }, { "epoch": 2.1022895125553913, "grad_norm": 0.2514388859272003, "learning_rate": 5.988422219485158e-05, "loss": 0.1902, "step": 5693 }, { "epoch": 2.102658788774003, "grad_norm": 0.2770799696445465, "learning_rate": 5.985958861928809e-05, "loss": 0.1737, "step": 5694 }, { "epoch": 2.1030280649926145, "grad_norm": 0.2315564602613449, "learning_rate": 5.9834955043724595e-05, "loss": 0.1452, "step": 5695 }, { "epoch": 2.103397341211226, "grad_norm": 0.29391729831695557, "learning_rate": 5.98103214681611e-05, "loss": 0.2115, "step": 5696 }, { "epoch": 2.1037666174298377, "grad_norm": 0.2725535035133362, "learning_rate": 5.978568789259761e-05, "loss": 0.1654, "step": 5697 }, { "epoch": 2.104135893648449, "grad_norm": 0.24598264694213867, "learning_rate": 5.976105431703411e-05, "loss": 0.1526, "step": 5698 }, { "epoch": 2.1045051698670605, "grad_norm": 0.28126004338264465, "learning_rate": 5.973642074147062e-05, "loss": 0.1583, "step": 5699 }, { "epoch": 2.104874446085672, "grad_norm": 0.3185833692550659, "learning_rate": 5.971178716590713e-05, "loss": 0.2104, "step": 5700 }, { "epoch": 2.104874446085672, "eval_loss": 0.26025980710983276, "eval_runtime": 5.8698, "eval_samples_per_second": 8.518, "eval_steps_per_second": 1.193, "step": 5700 }, { "epoch": 2.1052437223042837, "grad_norm": 0.25908419489860535, "learning_rate": 5.9687153590343636e-05, "loss": 0.1657, "step": 5701 }, { "epoch": 2.1056129985228953, "grad_norm": 0.2654663324356079, "learning_rate": 5.9662520014780144e-05, "loss": 0.1687, "step": 5702 }, { "epoch": 2.1059822747415065, "grad_norm": 0.2636914849281311, "learning_rate": 5.963788643921665e-05, "loss": 0.1869, "step": 5703 }, { "epoch": 2.106351550960118, "grad_norm": 0.3741796314716339, "learning_rate": 5.961325286365316e-05, "loss": 0.2066, "step": 5704 }, { "epoch": 2.1067208271787297, "grad_norm": 0.2857508659362793, "learning_rate": 5.958861928808967e-05, "loss": 0.2045, "step": 5705 }, { "epoch": 2.1070901033973413, "grad_norm": 0.30921655893325806, "learning_rate": 5.956398571252617e-05, "loss": 0.1772, "step": 5706 }, { "epoch": 2.107459379615953, "grad_norm": 0.2541576027870178, "learning_rate": 5.9539352136962676e-05, "loss": 0.1938, "step": 5707 }, { "epoch": 2.1078286558345645, "grad_norm": 0.2691689133644104, "learning_rate": 5.9514718561399184e-05, "loss": 0.1625, "step": 5708 }, { "epoch": 2.1081979320531756, "grad_norm": 0.2552073895931244, "learning_rate": 5.949008498583569e-05, "loss": 0.1543, "step": 5709 }, { "epoch": 2.1085672082717872, "grad_norm": 0.2687050700187683, "learning_rate": 5.94654514102722e-05, "loss": 0.1543, "step": 5710 }, { "epoch": 2.108936484490399, "grad_norm": 0.29799342155456543, "learning_rate": 5.944081783470871e-05, "loss": 0.1869, "step": 5711 }, { "epoch": 2.1093057607090104, "grad_norm": 0.2936353087425232, "learning_rate": 5.9416184259145216e-05, "loss": 0.1764, "step": 5712 }, { "epoch": 2.109675036927622, "grad_norm": 0.28229206800460815, "learning_rate": 5.939155068358172e-05, "loss": 0.1837, "step": 5713 }, { "epoch": 2.110044313146233, "grad_norm": 0.3060760498046875, "learning_rate": 5.9366917108018224e-05, "loss": 0.1811, "step": 5714 }, { "epoch": 2.110413589364845, "grad_norm": 0.26369336247444153, "learning_rate": 5.934228353245473e-05, "loss": 0.1574, "step": 5715 }, { "epoch": 2.1107828655834564, "grad_norm": 0.26808932423591614, "learning_rate": 5.931764995689124e-05, "loss": 0.1754, "step": 5716 }, { "epoch": 2.111152141802068, "grad_norm": 0.28503137826919556, "learning_rate": 5.929301638132775e-05, "loss": 0.176, "step": 5717 }, { "epoch": 2.1115214180206796, "grad_norm": 0.22209247946739197, "learning_rate": 5.9268382805764256e-05, "loss": 0.1708, "step": 5718 }, { "epoch": 2.1118906942392908, "grad_norm": 0.25102144479751587, "learning_rate": 5.9243749230200764e-05, "loss": 0.1665, "step": 5719 }, { "epoch": 2.1122599704579024, "grad_norm": 0.24346455931663513, "learning_rate": 5.921911565463727e-05, "loss": 0.1555, "step": 5720 }, { "epoch": 2.112629246676514, "grad_norm": 0.2438468486070633, "learning_rate": 5.919448207907378e-05, "loss": 0.153, "step": 5721 }, { "epoch": 2.1129985228951256, "grad_norm": 0.2507217228412628, "learning_rate": 5.916984850351028e-05, "loss": 0.1757, "step": 5722 }, { "epoch": 2.113367799113737, "grad_norm": 0.23745280504226685, "learning_rate": 5.914521492794679e-05, "loss": 0.1485, "step": 5723 }, { "epoch": 2.113737075332349, "grad_norm": 0.30220702290534973, "learning_rate": 5.9120581352383296e-05, "loss": 0.1832, "step": 5724 }, { "epoch": 2.11410635155096, "grad_norm": 0.28973954916000366, "learning_rate": 5.9095947776819804e-05, "loss": 0.1827, "step": 5725 }, { "epoch": 2.1144756277695715, "grad_norm": 0.23914393782615662, "learning_rate": 5.907131420125631e-05, "loss": 0.1597, "step": 5726 }, { "epoch": 2.114844903988183, "grad_norm": 0.33213070034980774, "learning_rate": 5.904668062569282e-05, "loss": 0.1586, "step": 5727 }, { "epoch": 2.1152141802067947, "grad_norm": 0.25139182806015015, "learning_rate": 5.902204705012933e-05, "loss": 0.1668, "step": 5728 }, { "epoch": 2.1155834564254064, "grad_norm": 0.19673173129558563, "learning_rate": 5.8997413474565836e-05, "loss": 0.1407, "step": 5729 }, { "epoch": 2.1159527326440175, "grad_norm": 0.25255340337753296, "learning_rate": 5.897277989900234e-05, "loss": 0.1603, "step": 5730 }, { "epoch": 2.116322008862629, "grad_norm": 0.22645212709903717, "learning_rate": 5.8948146323438845e-05, "loss": 0.1504, "step": 5731 }, { "epoch": 2.1166912850812407, "grad_norm": 0.2959424555301666, "learning_rate": 5.892351274787535e-05, "loss": 0.1824, "step": 5732 }, { "epoch": 2.1170605612998523, "grad_norm": 0.3307090103626251, "learning_rate": 5.889887917231186e-05, "loss": 0.1943, "step": 5733 }, { "epoch": 2.117429837518464, "grad_norm": 0.31114888191223145, "learning_rate": 5.887424559674837e-05, "loss": 0.1661, "step": 5734 }, { "epoch": 2.1177991137370755, "grad_norm": 0.28659382462501526, "learning_rate": 5.8849612021184876e-05, "loss": 0.1703, "step": 5735 }, { "epoch": 2.1181683899556867, "grad_norm": 0.25932013988494873, "learning_rate": 5.8824978445621384e-05, "loss": 0.1631, "step": 5736 }, { "epoch": 2.1185376661742983, "grad_norm": 0.27428874373435974, "learning_rate": 5.880034487005789e-05, "loss": 0.1679, "step": 5737 }, { "epoch": 2.11890694239291, "grad_norm": 0.2641359269618988, "learning_rate": 5.877571129449439e-05, "loss": 0.1571, "step": 5738 }, { "epoch": 2.1192762186115215, "grad_norm": 0.23932605981826782, "learning_rate": 5.87510777189309e-05, "loss": 0.1538, "step": 5739 }, { "epoch": 2.119645494830133, "grad_norm": 0.3590959906578064, "learning_rate": 5.872644414336741e-05, "loss": 0.1934, "step": 5740 }, { "epoch": 2.1200147710487443, "grad_norm": 0.23727695643901825, "learning_rate": 5.8701810567803916e-05, "loss": 0.1663, "step": 5741 }, { "epoch": 2.120384047267356, "grad_norm": 0.2921147048473358, "learning_rate": 5.8677176992240424e-05, "loss": 0.1569, "step": 5742 }, { "epoch": 2.1207533234859675, "grad_norm": 0.2539224326610565, "learning_rate": 5.865254341667693e-05, "loss": 0.148, "step": 5743 }, { "epoch": 2.121122599704579, "grad_norm": 0.29284903407096863, "learning_rate": 5.862790984111344e-05, "loss": 0.1742, "step": 5744 }, { "epoch": 2.1214918759231907, "grad_norm": 0.2959270477294922, "learning_rate": 5.860327626554995e-05, "loss": 0.1724, "step": 5745 }, { "epoch": 2.1218611521418023, "grad_norm": 0.28707221150398254, "learning_rate": 5.857864268998645e-05, "loss": 0.1881, "step": 5746 }, { "epoch": 2.1222304283604134, "grad_norm": 0.2809945046901703, "learning_rate": 5.855400911442296e-05, "loss": 0.1885, "step": 5747 }, { "epoch": 2.122599704579025, "grad_norm": 0.2609975337982178, "learning_rate": 5.8529375538859465e-05, "loss": 0.1779, "step": 5748 }, { "epoch": 2.1229689807976366, "grad_norm": 0.25174084305763245, "learning_rate": 5.850474196329597e-05, "loss": 0.1698, "step": 5749 }, { "epoch": 2.1233382570162482, "grad_norm": 0.260440468788147, "learning_rate": 5.848010838773248e-05, "loss": 0.1646, "step": 5750 }, { "epoch": 2.1233382570162482, "eval_loss": 0.2585288882255554, "eval_runtime": 5.8601, "eval_samples_per_second": 8.532, "eval_steps_per_second": 1.195, "step": 5750 }, { "epoch": 2.12370753323486, "grad_norm": 0.27335822582244873, "learning_rate": 5.845547481216899e-05, "loss": 0.1782, "step": 5751 }, { "epoch": 2.124076809453471, "grad_norm": 0.2632400393486023, "learning_rate": 5.8430841236605496e-05, "loss": 0.1629, "step": 5752 }, { "epoch": 2.1244460856720826, "grad_norm": 0.24932514131069183, "learning_rate": 5.8406207661042e-05, "loss": 0.1559, "step": 5753 }, { "epoch": 2.124815361890694, "grad_norm": 0.28199252486228943, "learning_rate": 5.8381574085478505e-05, "loss": 0.1625, "step": 5754 }, { "epoch": 2.125184638109306, "grad_norm": 0.22602632641792297, "learning_rate": 5.835694050991501e-05, "loss": 0.1477, "step": 5755 }, { "epoch": 2.1255539143279174, "grad_norm": 0.24132585525512695, "learning_rate": 5.833230693435152e-05, "loss": 0.1722, "step": 5756 }, { "epoch": 2.125923190546529, "grad_norm": 0.2727733850479126, "learning_rate": 5.830767335878803e-05, "loss": 0.217, "step": 5757 }, { "epoch": 2.12629246676514, "grad_norm": 0.24890835583209991, "learning_rate": 5.8283039783224537e-05, "loss": 0.1613, "step": 5758 }, { "epoch": 2.1266617429837518, "grad_norm": 0.26370131969451904, "learning_rate": 5.8258406207661044e-05, "loss": 0.1726, "step": 5759 }, { "epoch": 2.1270310192023634, "grad_norm": 0.28788357973098755, "learning_rate": 5.823377263209755e-05, "loss": 0.2016, "step": 5760 }, { "epoch": 2.127400295420975, "grad_norm": 0.27368101477622986, "learning_rate": 5.8209139056534053e-05, "loss": 0.1735, "step": 5761 }, { "epoch": 2.1277695716395866, "grad_norm": 0.24496977031230927, "learning_rate": 5.818450548097056e-05, "loss": 0.1596, "step": 5762 }, { "epoch": 2.1281388478581977, "grad_norm": 0.25704407691955566, "learning_rate": 5.815987190540707e-05, "loss": 0.1717, "step": 5763 }, { "epoch": 2.1285081240768093, "grad_norm": 0.23020388185977936, "learning_rate": 5.813523832984358e-05, "loss": 0.1651, "step": 5764 }, { "epoch": 2.128877400295421, "grad_norm": 0.3059319257736206, "learning_rate": 5.8110604754280085e-05, "loss": 0.1856, "step": 5765 }, { "epoch": 2.1292466765140325, "grad_norm": 0.24818597733974457, "learning_rate": 5.808597117871659e-05, "loss": 0.183, "step": 5766 }, { "epoch": 2.129615952732644, "grad_norm": 0.2819104492664337, "learning_rate": 5.80613376031531e-05, "loss": 0.1699, "step": 5767 }, { "epoch": 2.1299852289512557, "grad_norm": 0.24581509828567505, "learning_rate": 5.803670402758961e-05, "loss": 0.1971, "step": 5768 }, { "epoch": 2.130354505169867, "grad_norm": 0.30326130986213684, "learning_rate": 5.801207045202611e-05, "loss": 0.2084, "step": 5769 }, { "epoch": 2.1307237813884785, "grad_norm": 0.2676447331905365, "learning_rate": 5.798743687646262e-05, "loss": 0.1895, "step": 5770 }, { "epoch": 2.13109305760709, "grad_norm": 0.23784242570400238, "learning_rate": 5.7962803300899125e-05, "loss": 0.1491, "step": 5771 }, { "epoch": 2.1314623338257017, "grad_norm": 0.2932337522506714, "learning_rate": 5.793816972533563e-05, "loss": 0.1595, "step": 5772 }, { "epoch": 2.1318316100443133, "grad_norm": 0.22179383039474487, "learning_rate": 5.791353614977214e-05, "loss": 0.1439, "step": 5773 }, { "epoch": 2.1322008862629245, "grad_norm": 0.3005230128765106, "learning_rate": 5.788890257420865e-05, "loss": 0.1892, "step": 5774 }, { "epoch": 2.132570162481536, "grad_norm": 0.2520924210548401, "learning_rate": 5.786426899864516e-05, "loss": 0.1728, "step": 5775 }, { "epoch": 2.1329394387001477, "grad_norm": 0.31041574478149414, "learning_rate": 5.7839635423081665e-05, "loss": 0.1919, "step": 5776 }, { "epoch": 2.1333087149187593, "grad_norm": 0.20861373841762543, "learning_rate": 5.7815001847518166e-05, "loss": 0.1528, "step": 5777 }, { "epoch": 2.133677991137371, "grad_norm": 0.2767115831375122, "learning_rate": 5.7790368271954674e-05, "loss": 0.1721, "step": 5778 }, { "epoch": 2.1340472673559825, "grad_norm": 0.30877935886383057, "learning_rate": 5.776573469639118e-05, "loss": 0.1567, "step": 5779 }, { "epoch": 2.1344165435745936, "grad_norm": 0.2880702614784241, "learning_rate": 5.774110112082769e-05, "loss": 0.1788, "step": 5780 }, { "epoch": 2.1347858197932053, "grad_norm": 0.30518677830696106, "learning_rate": 5.77164675452642e-05, "loss": 0.1861, "step": 5781 }, { "epoch": 2.135155096011817, "grad_norm": 0.2239362746477127, "learning_rate": 5.7691833969700705e-05, "loss": 0.1443, "step": 5782 }, { "epoch": 2.1355243722304285, "grad_norm": 0.2619790732860565, "learning_rate": 5.766720039413721e-05, "loss": 0.1874, "step": 5783 }, { "epoch": 2.13589364844904, "grad_norm": 0.2446897327899933, "learning_rate": 5.764256681857372e-05, "loss": 0.16, "step": 5784 }, { "epoch": 2.136262924667651, "grad_norm": 0.2709537148475647, "learning_rate": 5.761793324301022e-05, "loss": 0.1859, "step": 5785 }, { "epoch": 2.136632200886263, "grad_norm": 0.20826995372772217, "learning_rate": 5.759329966744673e-05, "loss": 0.1308, "step": 5786 }, { "epoch": 2.1370014771048744, "grad_norm": 0.31698474287986755, "learning_rate": 5.756866609188324e-05, "loss": 0.192, "step": 5787 }, { "epoch": 2.137370753323486, "grad_norm": 0.23542068898677826, "learning_rate": 5.7544032516319745e-05, "loss": 0.1529, "step": 5788 }, { "epoch": 2.1377400295420976, "grad_norm": 0.2614317834377289, "learning_rate": 5.751939894075625e-05, "loss": 0.183, "step": 5789 }, { "epoch": 2.1381093057607092, "grad_norm": 0.2492826133966446, "learning_rate": 5.749476536519276e-05, "loss": 0.1744, "step": 5790 }, { "epoch": 2.1384785819793204, "grad_norm": 0.2894151210784912, "learning_rate": 5.747013178962927e-05, "loss": 0.1757, "step": 5791 }, { "epoch": 2.138847858197932, "grad_norm": 0.2812666594982147, "learning_rate": 5.744549821406578e-05, "loss": 0.1722, "step": 5792 }, { "epoch": 2.1392171344165436, "grad_norm": 0.2668617069721222, "learning_rate": 5.742086463850228e-05, "loss": 0.1902, "step": 5793 }, { "epoch": 2.139586410635155, "grad_norm": 0.2633315324783325, "learning_rate": 5.7396231062938786e-05, "loss": 0.1762, "step": 5794 }, { "epoch": 2.139955686853767, "grad_norm": 0.3152179718017578, "learning_rate": 5.7371597487375294e-05, "loss": 0.173, "step": 5795 }, { "epoch": 2.140324963072378, "grad_norm": 0.26436901092529297, "learning_rate": 5.73469639118118e-05, "loss": 0.1651, "step": 5796 }, { "epoch": 2.1406942392909896, "grad_norm": 0.239777609705925, "learning_rate": 5.732233033624831e-05, "loss": 0.151, "step": 5797 }, { "epoch": 2.141063515509601, "grad_norm": 0.2569291889667511, "learning_rate": 5.729769676068482e-05, "loss": 0.1573, "step": 5798 }, { "epoch": 2.1414327917282128, "grad_norm": 0.2817891538143158, "learning_rate": 5.7273063185121325e-05, "loss": 0.1643, "step": 5799 }, { "epoch": 2.1418020679468244, "grad_norm": 0.2648109793663025, "learning_rate": 5.724842960955783e-05, "loss": 0.1826, "step": 5800 }, { "epoch": 2.1418020679468244, "eval_loss": 0.25734588503837585, "eval_runtime": 5.8587, "eval_samples_per_second": 8.534, "eval_steps_per_second": 1.195, "step": 5800 }, { "epoch": 2.142171344165436, "grad_norm": 0.2397744059562683, "learning_rate": 5.7223796033994334e-05, "loss": 0.1669, "step": 5801 }, { "epoch": 2.142540620384047, "grad_norm": 0.2661373019218445, "learning_rate": 5.719916245843084e-05, "loss": 0.1676, "step": 5802 }, { "epoch": 2.1429098966026587, "grad_norm": 0.21583642065525055, "learning_rate": 5.717452888286735e-05, "loss": 0.156, "step": 5803 }, { "epoch": 2.1432791728212703, "grad_norm": 0.23864522576332092, "learning_rate": 5.714989530730386e-05, "loss": 0.154, "step": 5804 }, { "epoch": 2.143648449039882, "grad_norm": 0.23954026401042938, "learning_rate": 5.7125261731740365e-05, "loss": 0.1631, "step": 5805 }, { "epoch": 2.1440177252584935, "grad_norm": 0.2587621510028839, "learning_rate": 5.710062815617687e-05, "loss": 0.1845, "step": 5806 }, { "epoch": 2.1443870014771047, "grad_norm": 0.34114527702331543, "learning_rate": 5.707599458061338e-05, "loss": 0.1747, "step": 5807 }, { "epoch": 2.1447562776957163, "grad_norm": 0.25066620111465454, "learning_rate": 5.705136100504989e-05, "loss": 0.1615, "step": 5808 }, { "epoch": 2.145125553914328, "grad_norm": 0.3193507492542267, "learning_rate": 5.702672742948639e-05, "loss": 0.1563, "step": 5809 }, { "epoch": 2.1454948301329395, "grad_norm": 0.2463066577911377, "learning_rate": 5.70020938539229e-05, "loss": 0.1512, "step": 5810 }, { "epoch": 2.145864106351551, "grad_norm": 0.26367413997650146, "learning_rate": 5.6977460278359406e-05, "loss": 0.1619, "step": 5811 }, { "epoch": 2.1462333825701623, "grad_norm": 0.22284801304340363, "learning_rate": 5.6952826702795914e-05, "loss": 0.1516, "step": 5812 }, { "epoch": 2.146602658788774, "grad_norm": 0.2699580788612366, "learning_rate": 5.692819312723242e-05, "loss": 0.1846, "step": 5813 }, { "epoch": 2.1469719350073855, "grad_norm": 0.332487553358078, "learning_rate": 5.690355955166893e-05, "loss": 0.1928, "step": 5814 }, { "epoch": 2.147341211225997, "grad_norm": 0.27749398350715637, "learning_rate": 5.687892597610544e-05, "loss": 0.1653, "step": 5815 }, { "epoch": 2.1477104874446087, "grad_norm": 0.22983166575431824, "learning_rate": 5.6854292400541945e-05, "loss": 0.139, "step": 5816 }, { "epoch": 2.1480797636632203, "grad_norm": 0.2825463116168976, "learning_rate": 5.6829658824978446e-05, "loss": 0.1898, "step": 5817 }, { "epoch": 2.1484490398818314, "grad_norm": 0.2563636302947998, "learning_rate": 5.6805025249414954e-05, "loss": 0.1649, "step": 5818 }, { "epoch": 2.148818316100443, "grad_norm": 0.24772876501083374, "learning_rate": 5.678039167385146e-05, "loss": 0.1769, "step": 5819 }, { "epoch": 2.1491875923190547, "grad_norm": 0.24235022068023682, "learning_rate": 5.675575809828797e-05, "loss": 0.1478, "step": 5820 }, { "epoch": 2.1495568685376663, "grad_norm": 0.24281422793865204, "learning_rate": 5.673112452272448e-05, "loss": 0.1721, "step": 5821 }, { "epoch": 2.149926144756278, "grad_norm": 0.1965787410736084, "learning_rate": 5.6706490947160986e-05, "loss": 0.155, "step": 5822 }, { "epoch": 2.150295420974889, "grad_norm": 0.2670396566390991, "learning_rate": 5.6681857371597493e-05, "loss": 0.1879, "step": 5823 }, { "epoch": 2.1506646971935006, "grad_norm": 0.2415950447320938, "learning_rate": 5.6657223796034e-05, "loss": 0.1527, "step": 5824 }, { "epoch": 2.151033973412112, "grad_norm": 0.23829121887683868, "learning_rate": 5.66325902204705e-05, "loss": 0.1563, "step": 5825 }, { "epoch": 2.151403249630724, "grad_norm": 0.27524328231811523, "learning_rate": 5.660795664490701e-05, "loss": 0.1787, "step": 5826 }, { "epoch": 2.1517725258493354, "grad_norm": 0.27747201919555664, "learning_rate": 5.658332306934352e-05, "loss": 0.1871, "step": 5827 }, { "epoch": 2.152141802067947, "grad_norm": 0.3109496533870697, "learning_rate": 5.6558689493780026e-05, "loss": 0.1947, "step": 5828 }, { "epoch": 2.152511078286558, "grad_norm": 0.29694050550460815, "learning_rate": 5.6534055918216534e-05, "loss": 0.1835, "step": 5829 }, { "epoch": 2.15288035450517, "grad_norm": 0.3042897582054138, "learning_rate": 5.650942234265304e-05, "loss": 0.1931, "step": 5830 }, { "epoch": 2.1532496307237814, "grad_norm": 0.3608880639076233, "learning_rate": 5.648478876708955e-05, "loss": 0.1923, "step": 5831 }, { "epoch": 2.153618906942393, "grad_norm": 0.2922372817993164, "learning_rate": 5.646015519152605e-05, "loss": 0.1953, "step": 5832 }, { "epoch": 2.1539881831610046, "grad_norm": 0.23728720843791962, "learning_rate": 5.643552161596256e-05, "loss": 0.1562, "step": 5833 }, { "epoch": 2.1543574593796158, "grad_norm": 0.24144093692302704, "learning_rate": 5.6410888040399066e-05, "loss": 0.1655, "step": 5834 }, { "epoch": 2.1547267355982274, "grad_norm": 0.2800363600254059, "learning_rate": 5.6386254464835574e-05, "loss": 0.1776, "step": 5835 }, { "epoch": 2.155096011816839, "grad_norm": 0.3141731917858124, "learning_rate": 5.636162088927208e-05, "loss": 0.1852, "step": 5836 }, { "epoch": 2.1554652880354506, "grad_norm": 0.284499853849411, "learning_rate": 5.633698731370859e-05, "loss": 0.1824, "step": 5837 }, { "epoch": 2.155834564254062, "grad_norm": 0.29822319746017456, "learning_rate": 5.63123537381451e-05, "loss": 0.167, "step": 5838 }, { "epoch": 2.1562038404726733, "grad_norm": 0.30732518434524536, "learning_rate": 5.6287720162581606e-05, "loss": 0.1819, "step": 5839 }, { "epoch": 2.156573116691285, "grad_norm": 0.2756134867668152, "learning_rate": 5.626308658701811e-05, "loss": 0.1527, "step": 5840 }, { "epoch": 2.1569423929098965, "grad_norm": 0.27252382040023804, "learning_rate": 5.6238453011454615e-05, "loss": 0.1897, "step": 5841 }, { "epoch": 2.157311669128508, "grad_norm": 0.27324724197387695, "learning_rate": 5.621381943589112e-05, "loss": 0.1805, "step": 5842 }, { "epoch": 2.1576809453471197, "grad_norm": 0.2699751555919647, "learning_rate": 5.618918586032763e-05, "loss": 0.1842, "step": 5843 }, { "epoch": 2.1580502215657313, "grad_norm": 0.22370193898677826, "learning_rate": 5.616455228476414e-05, "loss": 0.1603, "step": 5844 }, { "epoch": 2.1584194977843425, "grad_norm": 0.272770494222641, "learning_rate": 5.6139918709200646e-05, "loss": 0.1885, "step": 5845 }, { "epoch": 2.158788774002954, "grad_norm": 0.30952098965644836, "learning_rate": 5.6115285133637154e-05, "loss": 0.1865, "step": 5846 }, { "epoch": 2.1591580502215657, "grad_norm": 0.23765036463737488, "learning_rate": 5.609065155807366e-05, "loss": 0.1527, "step": 5847 }, { "epoch": 2.1595273264401773, "grad_norm": 0.3192552328109741, "learning_rate": 5.606601798251016e-05, "loss": 0.1782, "step": 5848 }, { "epoch": 2.159896602658789, "grad_norm": 0.25643131136894226, "learning_rate": 5.604138440694667e-05, "loss": 0.1742, "step": 5849 }, { "epoch": 2.1602658788774, "grad_norm": 0.25116267800331116, "learning_rate": 5.601675083138318e-05, "loss": 0.1644, "step": 5850 }, { "epoch": 2.1602658788774, "eval_loss": 0.25662660598754883, "eval_runtime": 5.8517, "eval_samples_per_second": 8.545, "eval_steps_per_second": 1.196, "step": 5850 }, { "epoch": 2.1606351550960117, "grad_norm": 0.23183858394622803, "learning_rate": 5.5992117255819687e-05, "loss": 0.1669, "step": 5851 }, { "epoch": 2.1610044313146233, "grad_norm": 0.23837564885616302, "learning_rate": 5.5967483680256194e-05, "loss": 0.1608, "step": 5852 }, { "epoch": 2.161373707533235, "grad_norm": 0.2505916655063629, "learning_rate": 5.59428501046927e-05, "loss": 0.1722, "step": 5853 }, { "epoch": 2.1617429837518465, "grad_norm": 0.21763832867145538, "learning_rate": 5.591821652912921e-05, "loss": 0.1438, "step": 5854 }, { "epoch": 2.162112259970458, "grad_norm": 0.2366868257522583, "learning_rate": 5.589358295356572e-05, "loss": 0.169, "step": 5855 }, { "epoch": 2.1624815361890692, "grad_norm": 0.23833197355270386, "learning_rate": 5.586894937800222e-05, "loss": 0.1854, "step": 5856 }, { "epoch": 2.162850812407681, "grad_norm": 0.31813618540763855, "learning_rate": 5.584431580243873e-05, "loss": 0.1752, "step": 5857 }, { "epoch": 2.1632200886262924, "grad_norm": 0.27261093258857727, "learning_rate": 5.5819682226875235e-05, "loss": 0.1701, "step": 5858 }, { "epoch": 2.163589364844904, "grad_norm": 0.31021595001220703, "learning_rate": 5.579504865131174e-05, "loss": 0.1774, "step": 5859 }, { "epoch": 2.1639586410635157, "grad_norm": 0.2569142282009125, "learning_rate": 5.577041507574825e-05, "loss": 0.1765, "step": 5860 }, { "epoch": 2.164327917282127, "grad_norm": 0.24459010362625122, "learning_rate": 5.574578150018476e-05, "loss": 0.1658, "step": 5861 }, { "epoch": 2.1646971935007384, "grad_norm": 0.27214524149894714, "learning_rate": 5.5721147924621266e-05, "loss": 0.1574, "step": 5862 }, { "epoch": 2.16506646971935, "grad_norm": 0.3365628719329834, "learning_rate": 5.5696514349057774e-05, "loss": 0.1833, "step": 5863 }, { "epoch": 2.1654357459379616, "grad_norm": 0.24673442542552948, "learning_rate": 5.5671880773494275e-05, "loss": 0.1724, "step": 5864 }, { "epoch": 2.1658050221565732, "grad_norm": 0.28301650285720825, "learning_rate": 5.564724719793078e-05, "loss": 0.1741, "step": 5865 }, { "epoch": 2.166174298375185, "grad_norm": 0.25149691104888916, "learning_rate": 5.562261362236729e-05, "loss": 0.1668, "step": 5866 }, { "epoch": 2.166543574593796, "grad_norm": 0.2798976004123688, "learning_rate": 5.55979800468038e-05, "loss": 0.1699, "step": 5867 }, { "epoch": 2.1669128508124076, "grad_norm": 0.31019899249076843, "learning_rate": 5.557334647124031e-05, "loss": 0.183, "step": 5868 }, { "epoch": 2.167282127031019, "grad_norm": 0.3291718363761902, "learning_rate": 5.5548712895676815e-05, "loss": 0.1629, "step": 5869 }, { "epoch": 2.167651403249631, "grad_norm": 0.23700349032878876, "learning_rate": 5.552407932011332e-05, "loss": 0.1634, "step": 5870 }, { "epoch": 2.1680206794682424, "grad_norm": 0.29189571738243103, "learning_rate": 5.549944574454983e-05, "loss": 0.1852, "step": 5871 }, { "epoch": 2.1683899556868536, "grad_norm": 0.23060524463653564, "learning_rate": 5.547481216898633e-05, "loss": 0.1377, "step": 5872 }, { "epoch": 2.168759231905465, "grad_norm": 0.2611374258995056, "learning_rate": 5.545017859342284e-05, "loss": 0.1765, "step": 5873 }, { "epoch": 2.1691285081240768, "grad_norm": 0.2506032884120941, "learning_rate": 5.542554501785935e-05, "loss": 0.1471, "step": 5874 }, { "epoch": 2.1694977843426884, "grad_norm": 0.280585378408432, "learning_rate": 5.5400911442295855e-05, "loss": 0.158, "step": 5875 }, { "epoch": 2.1698670605613, "grad_norm": 0.27997133135795593, "learning_rate": 5.537627786673236e-05, "loss": 0.1544, "step": 5876 }, { "epoch": 2.1702363367799116, "grad_norm": 0.2693052589893341, "learning_rate": 5.535164429116887e-05, "loss": 0.1932, "step": 5877 }, { "epoch": 2.1706056129985227, "grad_norm": 0.2868502736091614, "learning_rate": 5.532701071560538e-05, "loss": 0.1742, "step": 5878 }, { "epoch": 2.1709748892171343, "grad_norm": 0.2899225652217865, "learning_rate": 5.5302377140041886e-05, "loss": 0.2052, "step": 5879 }, { "epoch": 2.171344165435746, "grad_norm": 0.2309410274028778, "learning_rate": 5.527774356447839e-05, "loss": 0.1585, "step": 5880 }, { "epoch": 2.1717134416543575, "grad_norm": 0.2919321060180664, "learning_rate": 5.5253109988914895e-05, "loss": 0.1513, "step": 5881 }, { "epoch": 2.172082717872969, "grad_norm": 0.27619990706443787, "learning_rate": 5.52284764133514e-05, "loss": 0.1836, "step": 5882 }, { "epoch": 2.1724519940915803, "grad_norm": 0.21038885414600372, "learning_rate": 5.520384283778791e-05, "loss": 0.1528, "step": 5883 }, { "epoch": 2.172821270310192, "grad_norm": 0.26871126890182495, "learning_rate": 5.517920926222442e-05, "loss": 0.1784, "step": 5884 }, { "epoch": 2.1731905465288035, "grad_norm": 0.2990822196006775, "learning_rate": 5.515457568666093e-05, "loss": 0.1697, "step": 5885 }, { "epoch": 2.173559822747415, "grad_norm": 0.27839839458465576, "learning_rate": 5.5129942111097435e-05, "loss": 0.18, "step": 5886 }, { "epoch": 2.1739290989660267, "grad_norm": 0.30338016152381897, "learning_rate": 5.510530853553394e-05, "loss": 0.2025, "step": 5887 }, { "epoch": 2.1742983751846383, "grad_norm": 0.25437456369400024, "learning_rate": 5.5080674959970444e-05, "loss": 0.1733, "step": 5888 }, { "epoch": 2.1746676514032495, "grad_norm": 0.26489147543907166, "learning_rate": 5.505604138440695e-05, "loss": 0.1791, "step": 5889 }, { "epoch": 2.175036927621861, "grad_norm": 0.24482989311218262, "learning_rate": 5.503140780884346e-05, "loss": 0.1542, "step": 5890 }, { "epoch": 2.1754062038404727, "grad_norm": 0.24775661528110504, "learning_rate": 5.500677423327997e-05, "loss": 0.1643, "step": 5891 }, { "epoch": 2.1757754800590843, "grad_norm": 0.2753461003303528, "learning_rate": 5.4982140657716475e-05, "loss": 0.1722, "step": 5892 }, { "epoch": 2.176144756277696, "grad_norm": 0.2582942545413971, "learning_rate": 5.495750708215298e-05, "loss": 0.1795, "step": 5893 }, { "epoch": 2.176514032496307, "grad_norm": 0.279904305934906, "learning_rate": 5.493287350658949e-05, "loss": 0.1601, "step": 5894 }, { "epoch": 2.1768833087149186, "grad_norm": 0.25517791509628296, "learning_rate": 5.4908239931026e-05, "loss": 0.1567, "step": 5895 }, { "epoch": 2.1772525849335302, "grad_norm": 0.2075134962797165, "learning_rate": 5.48836063554625e-05, "loss": 0.1467, "step": 5896 }, { "epoch": 2.177621861152142, "grad_norm": 0.3045804798603058, "learning_rate": 5.485897277989901e-05, "loss": 0.1694, "step": 5897 }, { "epoch": 2.1779911373707534, "grad_norm": 0.3269966244697571, "learning_rate": 5.4834339204335515e-05, "loss": 0.1689, "step": 5898 }, { "epoch": 2.178360413589365, "grad_norm": 0.2516946792602539, "learning_rate": 5.480970562877202e-05, "loss": 0.1531, "step": 5899 }, { "epoch": 2.178729689807976, "grad_norm": 0.2638711929321289, "learning_rate": 5.478507205320853e-05, "loss": 0.1558, "step": 5900 }, { "epoch": 2.178729689807976, "eval_loss": 0.2542784810066223, "eval_runtime": 5.8598, "eval_samples_per_second": 8.533, "eval_steps_per_second": 1.195, "step": 5900 }, { "epoch": 2.179098966026588, "grad_norm": 0.22851087152957916, "learning_rate": 5.476043847764504e-05, "loss": 0.1753, "step": 5901 }, { "epoch": 2.1794682422451994, "grad_norm": 0.31952738761901855, "learning_rate": 5.473580490208155e-05, "loss": 0.1792, "step": 5902 }, { "epoch": 2.179837518463811, "grad_norm": 0.26568403840065, "learning_rate": 5.4711171326518055e-05, "loss": 0.1605, "step": 5903 }, { "epoch": 2.1802067946824226, "grad_norm": 0.3061986565589905, "learning_rate": 5.4686537750954556e-05, "loss": 0.1841, "step": 5904 }, { "epoch": 2.180576070901034, "grad_norm": 0.2837606966495514, "learning_rate": 5.4661904175391064e-05, "loss": 0.1786, "step": 5905 }, { "epoch": 2.1809453471196454, "grad_norm": 0.27201905846595764, "learning_rate": 5.463727059982757e-05, "loss": 0.1671, "step": 5906 }, { "epoch": 2.181314623338257, "grad_norm": 0.2528875470161438, "learning_rate": 5.461263702426408e-05, "loss": 0.1815, "step": 5907 }, { "epoch": 2.1816838995568686, "grad_norm": 0.2863306999206543, "learning_rate": 5.458800344870059e-05, "loss": 0.1689, "step": 5908 }, { "epoch": 2.18205317577548, "grad_norm": 0.32362571358680725, "learning_rate": 5.4563369873137095e-05, "loss": 0.2014, "step": 5909 }, { "epoch": 2.182422451994092, "grad_norm": 0.2716105878353119, "learning_rate": 5.45387362975736e-05, "loss": 0.1761, "step": 5910 }, { "epoch": 2.182791728212703, "grad_norm": 0.23163507878780365, "learning_rate": 5.4514102722010104e-05, "loss": 0.1653, "step": 5911 }, { "epoch": 2.1831610044313146, "grad_norm": 0.26279935240745544, "learning_rate": 5.448946914644661e-05, "loss": 0.1685, "step": 5912 }, { "epoch": 2.183530280649926, "grad_norm": 0.2372507005929947, "learning_rate": 5.446483557088312e-05, "loss": 0.1571, "step": 5913 }, { "epoch": 2.1838995568685378, "grad_norm": 0.3401440382003784, "learning_rate": 5.444020199531963e-05, "loss": 0.188, "step": 5914 }, { "epoch": 2.1842688330871494, "grad_norm": 0.24209219217300415, "learning_rate": 5.4415568419756136e-05, "loss": 0.1577, "step": 5915 }, { "epoch": 2.1846381093057605, "grad_norm": 0.28312069177627563, "learning_rate": 5.4390934844192643e-05, "loss": 0.1601, "step": 5916 }, { "epoch": 2.185007385524372, "grad_norm": 0.24116961658000946, "learning_rate": 5.436630126862915e-05, "loss": 0.1768, "step": 5917 }, { "epoch": 2.1853766617429837, "grad_norm": 0.2630855441093445, "learning_rate": 5.434166769306566e-05, "loss": 0.1607, "step": 5918 }, { "epoch": 2.1857459379615953, "grad_norm": 0.31574612855911255, "learning_rate": 5.431703411750216e-05, "loss": 0.1635, "step": 5919 }, { "epoch": 2.186115214180207, "grad_norm": 0.31741586327552795, "learning_rate": 5.429240054193867e-05, "loss": 0.1883, "step": 5920 }, { "epoch": 2.1864844903988185, "grad_norm": 0.26064226031303406, "learning_rate": 5.426776696637517e-05, "loss": 0.1612, "step": 5921 }, { "epoch": 2.1868537666174297, "grad_norm": 0.23602989315986633, "learning_rate": 5.424313339081167e-05, "loss": 0.1526, "step": 5922 }, { "epoch": 2.1872230428360413, "grad_norm": 0.22026140987873077, "learning_rate": 5.421849981524818e-05, "loss": 0.1462, "step": 5923 }, { "epoch": 2.187592319054653, "grad_norm": 0.3276820778846741, "learning_rate": 5.4193866239684686e-05, "loss": 0.1931, "step": 5924 }, { "epoch": 2.1879615952732645, "grad_norm": 0.2706556022167206, "learning_rate": 5.4169232664121194e-05, "loss": 0.1658, "step": 5925 }, { "epoch": 2.188330871491876, "grad_norm": 0.27734729647636414, "learning_rate": 5.41445990885577e-05, "loss": 0.1862, "step": 5926 }, { "epoch": 2.1887001477104873, "grad_norm": 0.2646738886833191, "learning_rate": 5.411996551299421e-05, "loss": 0.1732, "step": 5927 }, { "epoch": 2.189069423929099, "grad_norm": 0.25028637051582336, "learning_rate": 5.409533193743072e-05, "loss": 0.1798, "step": 5928 }, { "epoch": 2.1894387001477105, "grad_norm": 0.2716618478298187, "learning_rate": 5.4070698361867225e-05, "loss": 0.2028, "step": 5929 }, { "epoch": 2.189807976366322, "grad_norm": 0.26107484102249146, "learning_rate": 5.4046064786303726e-05, "loss": 0.1683, "step": 5930 }, { "epoch": 2.1901772525849337, "grad_norm": 0.2878517806529999, "learning_rate": 5.4021431210740234e-05, "loss": 0.1751, "step": 5931 }, { "epoch": 2.1905465288035453, "grad_norm": 0.2957293391227722, "learning_rate": 5.399679763517674e-05, "loss": 0.1894, "step": 5932 }, { "epoch": 2.1909158050221564, "grad_norm": 0.2170945703983307, "learning_rate": 5.397216405961325e-05, "loss": 0.1485, "step": 5933 }, { "epoch": 2.191285081240768, "grad_norm": 0.2723937928676605, "learning_rate": 5.394753048404976e-05, "loss": 0.1754, "step": 5934 }, { "epoch": 2.1916543574593796, "grad_norm": 0.30582040548324585, "learning_rate": 5.3922896908486266e-05, "loss": 0.1846, "step": 5935 }, { "epoch": 2.1920236336779912, "grad_norm": 0.2360096573829651, "learning_rate": 5.3898263332922774e-05, "loss": 0.165, "step": 5936 }, { "epoch": 2.192392909896603, "grad_norm": 0.2892259955406189, "learning_rate": 5.387362975735928e-05, "loss": 0.2004, "step": 5937 }, { "epoch": 2.192762186115214, "grad_norm": 0.26588162779808044, "learning_rate": 5.384899618179578e-05, "loss": 0.1754, "step": 5938 }, { "epoch": 2.1931314623338256, "grad_norm": 0.2676050364971161, "learning_rate": 5.382436260623229e-05, "loss": 0.1792, "step": 5939 }, { "epoch": 2.193500738552437, "grad_norm": 0.30062136054039, "learning_rate": 5.37997290306688e-05, "loss": 0.1775, "step": 5940 }, { "epoch": 2.193870014771049, "grad_norm": 0.24869313836097717, "learning_rate": 5.3775095455105306e-05, "loss": 0.1585, "step": 5941 }, { "epoch": 2.1942392909896604, "grad_norm": 0.23664288222789764, "learning_rate": 5.3750461879541814e-05, "loss": 0.1431, "step": 5942 }, { "epoch": 2.194608567208272, "grad_norm": 0.28411614894866943, "learning_rate": 5.372582830397832e-05, "loss": 0.1885, "step": 5943 }, { "epoch": 2.194977843426883, "grad_norm": 0.23681482672691345, "learning_rate": 5.370119472841483e-05, "loss": 0.1702, "step": 5944 }, { "epoch": 2.195347119645495, "grad_norm": 0.26534298062324524, "learning_rate": 5.367656115285134e-05, "loss": 0.1538, "step": 5945 }, { "epoch": 2.1957163958641064, "grad_norm": 0.25928986072540283, "learning_rate": 5.365192757728784e-05, "loss": 0.1779, "step": 5946 }, { "epoch": 2.196085672082718, "grad_norm": 0.26320552825927734, "learning_rate": 5.3627294001724347e-05, "loss": 0.1569, "step": 5947 }, { "epoch": 2.1964549483013296, "grad_norm": 0.22869595885276794, "learning_rate": 5.3602660426160854e-05, "loss": 0.166, "step": 5948 }, { "epoch": 2.1968242245199407, "grad_norm": 0.3253004848957062, "learning_rate": 5.357802685059736e-05, "loss": 0.1927, "step": 5949 }, { "epoch": 2.1971935007385524, "grad_norm": 0.3554944396018982, "learning_rate": 5.355339327503387e-05, "loss": 0.1755, "step": 5950 }, { "epoch": 2.1971935007385524, "eval_loss": 0.25651848316192627, "eval_runtime": 5.8591, "eval_samples_per_second": 8.534, "eval_steps_per_second": 1.195, "step": 5950 }, { "epoch": 2.197562776957164, "grad_norm": 0.23495326936244965, "learning_rate": 5.352875969947038e-05, "loss": 0.158, "step": 5951 }, { "epoch": 2.1979320531757756, "grad_norm": 0.2928256392478943, "learning_rate": 5.3504126123906886e-05, "loss": 0.1874, "step": 5952 }, { "epoch": 2.198301329394387, "grad_norm": 0.264809250831604, "learning_rate": 5.3479492548343394e-05, "loss": 0.1728, "step": 5953 }, { "epoch": 2.1986706056129983, "grad_norm": 0.2503385543823242, "learning_rate": 5.3454858972779895e-05, "loss": 0.172, "step": 5954 }, { "epoch": 2.19903988183161, "grad_norm": 0.2616809010505676, "learning_rate": 5.34302253972164e-05, "loss": 0.1511, "step": 5955 }, { "epoch": 2.1994091580502215, "grad_norm": 0.23035597801208496, "learning_rate": 5.340559182165291e-05, "loss": 0.1684, "step": 5956 }, { "epoch": 2.199778434268833, "grad_norm": 0.2465926557779312, "learning_rate": 5.338095824608942e-05, "loss": 0.1698, "step": 5957 }, { "epoch": 2.2001477104874447, "grad_norm": 0.27567368745803833, "learning_rate": 5.3356324670525926e-05, "loss": 0.1859, "step": 5958 }, { "epoch": 2.2005169867060563, "grad_norm": 0.22160843014717102, "learning_rate": 5.3331691094962434e-05, "loss": 0.1516, "step": 5959 }, { "epoch": 2.2008862629246675, "grad_norm": 0.26636406779289246, "learning_rate": 5.330705751939894e-05, "loss": 0.1644, "step": 5960 }, { "epoch": 2.201255539143279, "grad_norm": 0.29620641469955444, "learning_rate": 5.328242394383545e-05, "loss": 0.1603, "step": 5961 }, { "epoch": 2.2016248153618907, "grad_norm": 0.2895677089691162, "learning_rate": 5.325779036827195e-05, "loss": 0.1809, "step": 5962 }, { "epoch": 2.2019940915805023, "grad_norm": 0.28737372159957886, "learning_rate": 5.323315679270846e-05, "loss": 0.1832, "step": 5963 }, { "epoch": 2.202363367799114, "grad_norm": 0.28137853741645813, "learning_rate": 5.320852321714497e-05, "loss": 0.1738, "step": 5964 }, { "epoch": 2.202732644017725, "grad_norm": 0.38717013597488403, "learning_rate": 5.3183889641581475e-05, "loss": 0.1862, "step": 5965 }, { "epoch": 2.2031019202363367, "grad_norm": 0.27989840507507324, "learning_rate": 5.315925606601798e-05, "loss": 0.1471, "step": 5966 }, { "epoch": 2.2034711964549483, "grad_norm": 0.23981954157352448, "learning_rate": 5.313462249045449e-05, "loss": 0.1699, "step": 5967 }, { "epoch": 2.20384047267356, "grad_norm": 0.28785717487335205, "learning_rate": 5.3109988914891e-05, "loss": 0.1627, "step": 5968 }, { "epoch": 2.2042097488921715, "grad_norm": 0.3268805742263794, "learning_rate": 5.3085355339327506e-05, "loss": 0.193, "step": 5969 }, { "epoch": 2.2045790251107826, "grad_norm": 0.25777700543403625, "learning_rate": 5.306072176376401e-05, "loss": 0.1678, "step": 5970 }, { "epoch": 2.2049483013293942, "grad_norm": 0.26376500725746155, "learning_rate": 5.3036088188200515e-05, "loss": 0.1632, "step": 5971 }, { "epoch": 2.205317577548006, "grad_norm": 0.25371983647346497, "learning_rate": 5.301145461263702e-05, "loss": 0.1668, "step": 5972 }, { "epoch": 2.2056868537666174, "grad_norm": 0.22691114246845245, "learning_rate": 5.298682103707353e-05, "loss": 0.145, "step": 5973 }, { "epoch": 2.206056129985229, "grad_norm": 0.24083566665649414, "learning_rate": 5.296218746151004e-05, "loss": 0.1603, "step": 5974 }, { "epoch": 2.2064254062038406, "grad_norm": 0.228380486369133, "learning_rate": 5.2937553885946546e-05, "loss": 0.1398, "step": 5975 }, { "epoch": 2.206794682422452, "grad_norm": 0.30277687311172485, "learning_rate": 5.2912920310383054e-05, "loss": 0.1856, "step": 5976 }, { "epoch": 2.2071639586410634, "grad_norm": 0.303791880607605, "learning_rate": 5.288828673481956e-05, "loss": 0.163, "step": 5977 }, { "epoch": 2.207533234859675, "grad_norm": 0.266146183013916, "learning_rate": 5.286365315925606e-05, "loss": 0.1871, "step": 5978 }, { "epoch": 2.2079025110782866, "grad_norm": 0.290078341960907, "learning_rate": 5.283901958369257e-05, "loss": 0.1705, "step": 5979 }, { "epoch": 2.208271787296898, "grad_norm": 0.2652760446071625, "learning_rate": 5.281438600812908e-05, "loss": 0.156, "step": 5980 }, { "epoch": 2.2086410635155094, "grad_norm": 0.286885142326355, "learning_rate": 5.278975243256559e-05, "loss": 0.1833, "step": 5981 }, { "epoch": 2.209010339734121, "grad_norm": 0.26764151453971863, "learning_rate": 5.2765118857002095e-05, "loss": 0.1834, "step": 5982 }, { "epoch": 2.2093796159527326, "grad_norm": 0.24872265756130219, "learning_rate": 5.27404852814386e-05, "loss": 0.1637, "step": 5983 }, { "epoch": 2.209748892171344, "grad_norm": 0.2747139632701874, "learning_rate": 5.271585170587511e-05, "loss": 0.1703, "step": 5984 }, { "epoch": 2.210118168389956, "grad_norm": 0.2682111859321594, "learning_rate": 5.269121813031161e-05, "loss": 0.1697, "step": 5985 }, { "epoch": 2.2104874446085674, "grad_norm": 0.3948352038860321, "learning_rate": 5.266658455474812e-05, "loss": 0.1763, "step": 5986 }, { "epoch": 2.2108567208271785, "grad_norm": 0.27390846610069275, "learning_rate": 5.264195097918463e-05, "loss": 0.1678, "step": 5987 }, { "epoch": 2.21122599704579, "grad_norm": 0.24934548139572144, "learning_rate": 5.2617317403621135e-05, "loss": 0.1761, "step": 5988 }, { "epoch": 2.2115952732644018, "grad_norm": 0.2849212884902954, "learning_rate": 5.259268382805764e-05, "loss": 0.1843, "step": 5989 }, { "epoch": 2.2119645494830134, "grad_norm": 0.25760313868522644, "learning_rate": 5.256805025249415e-05, "loss": 0.1454, "step": 5990 }, { "epoch": 2.212333825701625, "grad_norm": 0.253659725189209, "learning_rate": 5.254341667693066e-05, "loss": 0.1763, "step": 5991 }, { "epoch": 2.212703101920236, "grad_norm": 0.2626623511314392, "learning_rate": 5.2518783101367167e-05, "loss": 0.162, "step": 5992 }, { "epoch": 2.2130723781388477, "grad_norm": 0.2705555260181427, "learning_rate": 5.249414952580367e-05, "loss": 0.1734, "step": 5993 }, { "epoch": 2.2134416543574593, "grad_norm": 0.2433822602033615, "learning_rate": 5.2469515950240175e-05, "loss": 0.1614, "step": 5994 }, { "epoch": 2.213810930576071, "grad_norm": 0.27496635913848877, "learning_rate": 5.244488237467668e-05, "loss": 0.1639, "step": 5995 }, { "epoch": 2.2141802067946825, "grad_norm": 0.29041311144828796, "learning_rate": 5.242024879911319e-05, "loss": 0.1496, "step": 5996 }, { "epoch": 2.214549483013294, "grad_norm": 0.2603946030139923, "learning_rate": 5.23956152235497e-05, "loss": 0.1553, "step": 5997 }, { "epoch": 2.2149187592319053, "grad_norm": 0.30235958099365234, "learning_rate": 5.237098164798621e-05, "loss": 0.1983, "step": 5998 }, { "epoch": 2.215288035450517, "grad_norm": 0.23368962109088898, "learning_rate": 5.2346348072422715e-05, "loss": 0.1538, "step": 5999 }, { "epoch": 2.2156573116691285, "grad_norm": 0.3310740888118744, "learning_rate": 5.232171449685922e-05, "loss": 0.1808, "step": 6000 }, { "epoch": 2.2156573116691285, "eval_loss": 0.253889262676239, "eval_runtime": 5.8684, "eval_samples_per_second": 8.52, "eval_steps_per_second": 1.193, "step": 6000 }, { "epoch": 2.21602658788774, "grad_norm": 0.24267062544822693, "learning_rate": 5.2297080921295724e-05, "loss": 0.1824, "step": 6001 }, { "epoch": 2.2163958641063517, "grad_norm": 0.29777005314826965, "learning_rate": 5.227244734573223e-05, "loss": 0.1793, "step": 6002 }, { "epoch": 2.216765140324963, "grad_norm": 0.2265225499868393, "learning_rate": 5.224781377016874e-05, "loss": 0.1516, "step": 6003 }, { "epoch": 2.2171344165435745, "grad_norm": 0.2195178121328354, "learning_rate": 5.222318019460525e-05, "loss": 0.1521, "step": 6004 }, { "epoch": 2.217503692762186, "grad_norm": 0.2303364872932434, "learning_rate": 5.2198546619041755e-05, "loss": 0.1489, "step": 6005 }, { "epoch": 2.2178729689807977, "grad_norm": 0.29687920212745667, "learning_rate": 5.217391304347826e-05, "loss": 0.1876, "step": 6006 }, { "epoch": 2.2182422451994093, "grad_norm": 0.22180317342281342, "learning_rate": 5.214927946791477e-05, "loss": 0.1512, "step": 6007 }, { "epoch": 2.218611521418021, "grad_norm": 0.2409660518169403, "learning_rate": 5.212464589235128e-05, "loss": 0.1672, "step": 6008 }, { "epoch": 2.218980797636632, "grad_norm": 0.2430661916732788, "learning_rate": 5.210001231678778e-05, "loss": 0.1739, "step": 6009 }, { "epoch": 2.2193500738552436, "grad_norm": 0.30108213424682617, "learning_rate": 5.207537874122429e-05, "loss": 0.1691, "step": 6010 }, { "epoch": 2.2197193500738552, "grad_norm": 0.24237754940986633, "learning_rate": 5.2050745165660796e-05, "loss": 0.155, "step": 6011 }, { "epoch": 2.220088626292467, "grad_norm": 0.2368919849395752, "learning_rate": 5.2026111590097303e-05, "loss": 0.16, "step": 6012 }, { "epoch": 2.2204579025110784, "grad_norm": 0.24591723084449768, "learning_rate": 5.200147801453381e-05, "loss": 0.1673, "step": 6013 }, { "epoch": 2.2208271787296896, "grad_norm": 0.26719069480895996, "learning_rate": 5.197684443897032e-05, "loss": 0.1538, "step": 6014 }, { "epoch": 2.221196454948301, "grad_norm": 0.23640255630016327, "learning_rate": 5.195221086340683e-05, "loss": 0.173, "step": 6015 }, { "epoch": 2.221565731166913, "grad_norm": 0.2909172475337982, "learning_rate": 5.1927577287843335e-05, "loss": 0.1675, "step": 6016 }, { "epoch": 2.2219350073855244, "grad_norm": 0.27148836851119995, "learning_rate": 5.1902943712279836e-05, "loss": 0.1623, "step": 6017 }, { "epoch": 2.222304283604136, "grad_norm": 0.24277964234352112, "learning_rate": 5.1878310136716344e-05, "loss": 0.1567, "step": 6018 }, { "epoch": 2.2226735598227476, "grad_norm": 0.26548653841018677, "learning_rate": 5.185367656115285e-05, "loss": 0.1698, "step": 6019 }, { "epoch": 2.2230428360413588, "grad_norm": 0.3038986623287201, "learning_rate": 5.182904298558936e-05, "loss": 0.1572, "step": 6020 }, { "epoch": 2.2234121122599704, "grad_norm": 0.2512723505496979, "learning_rate": 5.180440941002587e-05, "loss": 0.1566, "step": 6021 }, { "epoch": 2.223781388478582, "grad_norm": 0.2630036473274231, "learning_rate": 5.1779775834462375e-05, "loss": 0.1776, "step": 6022 }, { "epoch": 2.2241506646971936, "grad_norm": 0.244293212890625, "learning_rate": 5.175514225889888e-05, "loss": 0.1666, "step": 6023 }, { "epoch": 2.224519940915805, "grad_norm": 0.25000420212745667, "learning_rate": 5.173050868333539e-05, "loss": 0.1597, "step": 6024 }, { "epoch": 2.2248892171344163, "grad_norm": 0.2320886105298996, "learning_rate": 5.170587510777189e-05, "loss": 0.1598, "step": 6025 }, { "epoch": 2.225258493353028, "grad_norm": 0.2409050315618515, "learning_rate": 5.16812415322084e-05, "loss": 0.1599, "step": 6026 }, { "epoch": 2.2256277695716395, "grad_norm": 0.24685463309288025, "learning_rate": 5.165660795664491e-05, "loss": 0.1644, "step": 6027 }, { "epoch": 2.225997045790251, "grad_norm": 0.28407877683639526, "learning_rate": 5.1631974381081416e-05, "loss": 0.1783, "step": 6028 }, { "epoch": 2.2263663220088628, "grad_norm": 0.23314602673053741, "learning_rate": 5.1607340805517924e-05, "loss": 0.1688, "step": 6029 }, { "epoch": 2.2267355982274744, "grad_norm": 0.26370736956596375, "learning_rate": 5.158270722995443e-05, "loss": 0.1768, "step": 6030 }, { "epoch": 2.2271048744460855, "grad_norm": 0.33216115832328796, "learning_rate": 5.155807365439094e-05, "loss": 0.1904, "step": 6031 }, { "epoch": 2.227474150664697, "grad_norm": 0.24514667689800262, "learning_rate": 5.153344007882745e-05, "loss": 0.1595, "step": 6032 }, { "epoch": 2.2278434268833087, "grad_norm": 0.2976848781108856, "learning_rate": 5.150880650326395e-05, "loss": 0.178, "step": 6033 }, { "epoch": 2.2282127031019203, "grad_norm": 0.2994171380996704, "learning_rate": 5.1484172927700456e-05, "loss": 0.1824, "step": 6034 }, { "epoch": 2.228581979320532, "grad_norm": 0.2807120084762573, "learning_rate": 5.1459539352136964e-05, "loss": 0.1507, "step": 6035 }, { "epoch": 2.228951255539143, "grad_norm": 0.25743022561073303, "learning_rate": 5.143490577657347e-05, "loss": 0.1567, "step": 6036 }, { "epoch": 2.2293205317577547, "grad_norm": 0.31996509432792664, "learning_rate": 5.141027220100998e-05, "loss": 0.2078, "step": 6037 }, { "epoch": 2.2296898079763663, "grad_norm": 0.2824748158454895, "learning_rate": 5.138563862544649e-05, "loss": 0.154, "step": 6038 }, { "epoch": 2.230059084194978, "grad_norm": 0.3212464153766632, "learning_rate": 5.1361005049882995e-05, "loss": 0.1818, "step": 6039 }, { "epoch": 2.2304283604135895, "grad_norm": 0.33793872594833374, "learning_rate": 5.13363714743195e-05, "loss": 0.1721, "step": 6040 }, { "epoch": 2.230797636632201, "grad_norm": 0.3192881941795349, "learning_rate": 5.1311737898756004e-05, "loss": 0.1755, "step": 6041 }, { "epoch": 2.2311669128508123, "grad_norm": 0.3159840404987335, "learning_rate": 5.128710432319251e-05, "loss": 0.1873, "step": 6042 }, { "epoch": 2.231536189069424, "grad_norm": 0.29445603489875793, "learning_rate": 5.126247074762902e-05, "loss": 0.1717, "step": 6043 }, { "epoch": 2.2319054652880355, "grad_norm": 0.3303259313106537, "learning_rate": 5.123783717206553e-05, "loss": 0.1725, "step": 6044 }, { "epoch": 2.232274741506647, "grad_norm": 0.21008101105690002, "learning_rate": 5.1213203596502036e-05, "loss": 0.1375, "step": 6045 }, { "epoch": 2.2326440177252587, "grad_norm": 0.403527170419693, "learning_rate": 5.1188570020938544e-05, "loss": 0.206, "step": 6046 }, { "epoch": 2.23301329394387, "grad_norm": 0.273002028465271, "learning_rate": 5.116393644537505e-05, "loss": 0.1686, "step": 6047 }, { "epoch": 2.2333825701624814, "grad_norm": 0.32015368342399597, "learning_rate": 5.113930286981156e-05, "loss": 0.1722, "step": 6048 }, { "epoch": 2.233751846381093, "grad_norm": 0.2369040995836258, "learning_rate": 5.111466929424806e-05, "loss": 0.1585, "step": 6049 }, { "epoch": 2.2341211225997046, "grad_norm": 0.2641754150390625, "learning_rate": 5.109003571868457e-05, "loss": 0.1575, "step": 6050 }, { "epoch": 2.2341211225997046, "eval_loss": 0.2539427876472473, "eval_runtime": 5.8532, "eval_samples_per_second": 8.542, "eval_steps_per_second": 1.196, "step": 6050 }, { "epoch": 2.2344903988183162, "grad_norm": 0.26975658535957336, "learning_rate": 5.1065402143121076e-05, "loss": 0.1604, "step": 6051 }, { "epoch": 2.234859675036928, "grad_norm": 0.27304011583328247, "learning_rate": 5.1040768567557584e-05, "loss": 0.1463, "step": 6052 }, { "epoch": 2.235228951255539, "grad_norm": 0.2763857841491699, "learning_rate": 5.101613499199409e-05, "loss": 0.1648, "step": 6053 }, { "epoch": 2.2355982274741506, "grad_norm": 0.23335494101047516, "learning_rate": 5.09915014164306e-05, "loss": 0.1489, "step": 6054 }, { "epoch": 2.235967503692762, "grad_norm": 0.22589267790317535, "learning_rate": 5.096686784086711e-05, "loss": 0.1691, "step": 6055 }, { "epoch": 2.236336779911374, "grad_norm": 0.27718567848205566, "learning_rate": 5.0942234265303616e-05, "loss": 0.1582, "step": 6056 }, { "epoch": 2.2367060561299854, "grad_norm": 0.2392417937517166, "learning_rate": 5.091760068974012e-05, "loss": 0.164, "step": 6057 }, { "epoch": 2.2370753323485966, "grad_norm": 0.22366121411323547, "learning_rate": 5.0892967114176625e-05, "loss": 0.18, "step": 6058 }, { "epoch": 2.237444608567208, "grad_norm": 0.24311357736587524, "learning_rate": 5.086833353861313e-05, "loss": 0.1526, "step": 6059 }, { "epoch": 2.2378138847858198, "grad_norm": 0.2334897220134735, "learning_rate": 5.084369996304964e-05, "loss": 0.1628, "step": 6060 }, { "epoch": 2.2381831610044314, "grad_norm": 0.28294169902801514, "learning_rate": 5.081906638748615e-05, "loss": 0.1767, "step": 6061 }, { "epoch": 2.238552437223043, "grad_norm": 0.24123069643974304, "learning_rate": 5.0794432811922656e-05, "loss": 0.1596, "step": 6062 }, { "epoch": 2.2389217134416546, "grad_norm": 0.26982197165489197, "learning_rate": 5.0769799236359164e-05, "loss": 0.1668, "step": 6063 }, { "epoch": 2.2392909896602657, "grad_norm": 0.3898751735687256, "learning_rate": 5.0745165660795665e-05, "loss": 0.1759, "step": 6064 }, { "epoch": 2.2396602658788773, "grad_norm": 0.2820669114589691, "learning_rate": 5.072053208523217e-05, "loss": 0.1955, "step": 6065 }, { "epoch": 2.240029542097489, "grad_norm": 0.259836882352829, "learning_rate": 5.069589850966868e-05, "loss": 0.1891, "step": 6066 }, { "epoch": 2.2403988183161005, "grad_norm": 0.25741878151893616, "learning_rate": 5.067126493410519e-05, "loss": 0.1621, "step": 6067 }, { "epoch": 2.240768094534712, "grad_norm": 0.24973520636558533, "learning_rate": 5.0646631358541696e-05, "loss": 0.1695, "step": 6068 }, { "epoch": 2.2411373707533233, "grad_norm": 0.3279678523540497, "learning_rate": 5.0621997782978204e-05, "loss": 0.1746, "step": 6069 }, { "epoch": 2.241506646971935, "grad_norm": 0.22707240283489227, "learning_rate": 5.059736420741471e-05, "loss": 0.1552, "step": 6070 }, { "epoch": 2.2418759231905465, "grad_norm": 0.2978220283985138, "learning_rate": 5.057273063185122e-05, "loss": 0.1682, "step": 6071 }, { "epoch": 2.242245199409158, "grad_norm": 0.26274436712265015, "learning_rate": 5.054809705628772e-05, "loss": 0.1682, "step": 6072 }, { "epoch": 2.2426144756277697, "grad_norm": 0.22878243029117584, "learning_rate": 5.052346348072423e-05, "loss": 0.1488, "step": 6073 }, { "epoch": 2.2429837518463813, "grad_norm": 0.3393493592739105, "learning_rate": 5.049882990516074e-05, "loss": 0.1931, "step": 6074 }, { "epoch": 2.2433530280649925, "grad_norm": 0.21743762493133545, "learning_rate": 5.0474196329597245e-05, "loss": 0.159, "step": 6075 }, { "epoch": 2.243722304283604, "grad_norm": 0.28104040026664734, "learning_rate": 5.044956275403375e-05, "loss": 0.167, "step": 6076 }, { "epoch": 2.2440915805022157, "grad_norm": 0.2588372528553009, "learning_rate": 5.042492917847026e-05, "loss": 0.1864, "step": 6077 }, { "epoch": 2.2444608567208273, "grad_norm": 0.29319703578948975, "learning_rate": 5.040029560290677e-05, "loss": 0.1803, "step": 6078 }, { "epoch": 2.244830132939439, "grad_norm": 0.29294025897979736, "learning_rate": 5.0375662027343276e-05, "loss": 0.1739, "step": 6079 }, { "epoch": 2.24519940915805, "grad_norm": 0.30922091007232666, "learning_rate": 5.035102845177978e-05, "loss": 0.1935, "step": 6080 }, { "epoch": 2.2455686853766617, "grad_norm": 0.2539861500263214, "learning_rate": 5.0326394876216285e-05, "loss": 0.1616, "step": 6081 }, { "epoch": 2.2459379615952733, "grad_norm": 0.22313763201236725, "learning_rate": 5.030176130065279e-05, "loss": 0.156, "step": 6082 }, { "epoch": 2.246307237813885, "grad_norm": 0.2427612692117691, "learning_rate": 5.02771277250893e-05, "loss": 0.1639, "step": 6083 }, { "epoch": 2.2466765140324965, "grad_norm": 0.2386532723903656, "learning_rate": 5.025249414952581e-05, "loss": 0.1675, "step": 6084 }, { "epoch": 2.2470457902511076, "grad_norm": 0.2651612162590027, "learning_rate": 5.0227860573962317e-05, "loss": 0.1824, "step": 6085 }, { "epoch": 2.2474150664697192, "grad_norm": 0.24895986914634705, "learning_rate": 5.0203226998398824e-05, "loss": 0.1803, "step": 6086 }, { "epoch": 2.247784342688331, "grad_norm": 0.2946280837059021, "learning_rate": 5.017859342283533e-05, "loss": 0.1774, "step": 6087 }, { "epoch": 2.2481536189069424, "grad_norm": 0.29397857189178467, "learning_rate": 5.015395984727183e-05, "loss": 0.1732, "step": 6088 }, { "epoch": 2.248522895125554, "grad_norm": 0.2765266001224518, "learning_rate": 5.012932627170834e-05, "loss": 0.1527, "step": 6089 }, { "epoch": 2.2488921713441656, "grad_norm": 0.2427031695842743, "learning_rate": 5.010469269614485e-05, "loss": 0.1592, "step": 6090 }, { "epoch": 2.249261447562777, "grad_norm": 0.30422443151474, "learning_rate": 5.008005912058136e-05, "loss": 0.1687, "step": 6091 }, { "epoch": 2.2496307237813884, "grad_norm": 0.263476699590683, "learning_rate": 5.0055425545017865e-05, "loss": 0.1861, "step": 6092 }, { "epoch": 2.25, "grad_norm": 0.26989686489105225, "learning_rate": 5.003079196945437e-05, "loss": 0.1977, "step": 6093 }, { "epoch": 2.2503692762186116, "grad_norm": 0.23634803295135498, "learning_rate": 5.000615839389088e-05, "loss": 0.1761, "step": 6094 }, { "epoch": 2.250738552437223, "grad_norm": 0.24952930212020874, "learning_rate": 4.998152481832738e-05, "loss": 0.1675, "step": 6095 }, { "epoch": 2.251107828655835, "grad_norm": 0.30477458238601685, "learning_rate": 4.995689124276389e-05, "loss": 0.1797, "step": 6096 }, { "epoch": 2.251477104874446, "grad_norm": 0.2522794008255005, "learning_rate": 4.993225766720039e-05, "loss": 0.1598, "step": 6097 }, { "epoch": 2.2518463810930576, "grad_norm": 0.211015984416008, "learning_rate": 4.99076240916369e-05, "loss": 0.1571, "step": 6098 }, { "epoch": 2.252215657311669, "grad_norm": 0.235479936003685, "learning_rate": 4.9882990516073406e-05, "loss": 0.1747, "step": 6099 }, { "epoch": 2.2525849335302808, "grad_norm": 0.32731950283050537, "learning_rate": 4.9858356940509914e-05, "loss": 0.1731, "step": 6100 }, { "epoch": 2.2525849335302808, "eval_loss": 0.2540842294692993, "eval_runtime": 5.8529, "eval_samples_per_second": 8.543, "eval_steps_per_second": 1.196, "step": 6100 }, { "epoch": 2.252954209748892, "grad_norm": 0.2917706370353699, "learning_rate": 4.983372336494642e-05, "loss": 0.1781, "step": 6101 }, { "epoch": 2.2533234859675035, "grad_norm": 0.22985264658927917, "learning_rate": 4.980908978938293e-05, "loss": 0.1669, "step": 6102 }, { "epoch": 2.253692762186115, "grad_norm": 0.25897547602653503, "learning_rate": 4.978445621381944e-05, "loss": 0.154, "step": 6103 }, { "epoch": 2.2540620384047267, "grad_norm": 0.26246926188468933, "learning_rate": 4.9759822638255946e-05, "loss": 0.162, "step": 6104 }, { "epoch": 2.2544313146233383, "grad_norm": 0.3654504120349884, "learning_rate": 4.973518906269245e-05, "loss": 0.189, "step": 6105 }, { "epoch": 2.25480059084195, "grad_norm": 0.26094144582748413, "learning_rate": 4.9710555487128955e-05, "loss": 0.1553, "step": 6106 }, { "epoch": 2.255169867060561, "grad_norm": 0.2783089876174927, "learning_rate": 4.968592191156546e-05, "loss": 0.1651, "step": 6107 }, { "epoch": 2.2555391432791727, "grad_norm": 0.27150285243988037, "learning_rate": 4.966128833600197e-05, "loss": 0.1931, "step": 6108 }, { "epoch": 2.2559084194977843, "grad_norm": 0.2587626874446869, "learning_rate": 4.963665476043848e-05, "loss": 0.1698, "step": 6109 }, { "epoch": 2.256277695716396, "grad_norm": 0.24936683475971222, "learning_rate": 4.9612021184874986e-05, "loss": 0.1808, "step": 6110 }, { "epoch": 2.2566469719350075, "grad_norm": 0.26469147205352783, "learning_rate": 4.9587387609311494e-05, "loss": 0.167, "step": 6111 }, { "epoch": 2.2570162481536187, "grad_norm": 0.27131637930870056, "learning_rate": 4.9562754033748e-05, "loss": 0.1636, "step": 6112 }, { "epoch": 2.2573855243722303, "grad_norm": 0.2650572955608368, "learning_rate": 4.95381204581845e-05, "loss": 0.151, "step": 6113 }, { "epoch": 2.257754800590842, "grad_norm": 0.2983565032482147, "learning_rate": 4.951348688262101e-05, "loss": 0.1865, "step": 6114 }, { "epoch": 2.2581240768094535, "grad_norm": 0.3544551134109497, "learning_rate": 4.948885330705752e-05, "loss": 0.1936, "step": 6115 }, { "epoch": 2.258493353028065, "grad_norm": 0.24496768414974213, "learning_rate": 4.9464219731494026e-05, "loss": 0.1865, "step": 6116 }, { "epoch": 2.2588626292466767, "grad_norm": 0.33322036266326904, "learning_rate": 4.9439586155930534e-05, "loss": 0.2055, "step": 6117 }, { "epoch": 2.259231905465288, "grad_norm": 0.2713990807533264, "learning_rate": 4.941495258036704e-05, "loss": 0.1818, "step": 6118 }, { "epoch": 2.2596011816838995, "grad_norm": 0.24263006448745728, "learning_rate": 4.939031900480355e-05, "loss": 0.18, "step": 6119 }, { "epoch": 2.259970457902511, "grad_norm": 0.25105422735214233, "learning_rate": 4.936568542924006e-05, "loss": 0.1578, "step": 6120 }, { "epoch": 2.2603397341211227, "grad_norm": 0.24577294290065765, "learning_rate": 4.934105185367656e-05, "loss": 0.1683, "step": 6121 }, { "epoch": 2.2607090103397343, "grad_norm": 0.24745669960975647, "learning_rate": 4.931641827811307e-05, "loss": 0.1599, "step": 6122 }, { "epoch": 2.2610782865583454, "grad_norm": 0.3026842772960663, "learning_rate": 4.9291784702549575e-05, "loss": 0.1932, "step": 6123 }, { "epoch": 2.261447562776957, "grad_norm": 0.2283596694469452, "learning_rate": 4.926715112698608e-05, "loss": 0.1585, "step": 6124 }, { "epoch": 2.2618168389955686, "grad_norm": 0.22106395661830902, "learning_rate": 4.924251755142259e-05, "loss": 0.1582, "step": 6125 }, { "epoch": 2.2621861152141802, "grad_norm": 0.29688745737075806, "learning_rate": 4.92178839758591e-05, "loss": 0.1856, "step": 6126 }, { "epoch": 2.262555391432792, "grad_norm": 0.2598349153995514, "learning_rate": 4.9193250400295606e-05, "loss": 0.1875, "step": 6127 }, { "epoch": 2.2629246676514034, "grad_norm": 0.2807743549346924, "learning_rate": 4.9168616824732114e-05, "loss": 0.1562, "step": 6128 }, { "epoch": 2.2632939438700146, "grad_norm": 0.23551684617996216, "learning_rate": 4.9143983249168615e-05, "loss": 0.1652, "step": 6129 }, { "epoch": 2.263663220088626, "grad_norm": 0.25193148851394653, "learning_rate": 4.911934967360512e-05, "loss": 0.1721, "step": 6130 }, { "epoch": 2.264032496307238, "grad_norm": 0.2823428511619568, "learning_rate": 4.909471609804163e-05, "loss": 0.1756, "step": 6131 }, { "epoch": 2.2644017725258494, "grad_norm": 0.26386234164237976, "learning_rate": 4.907008252247814e-05, "loss": 0.1638, "step": 6132 }, { "epoch": 2.264771048744461, "grad_norm": 0.27456212043762207, "learning_rate": 4.9045448946914647e-05, "loss": 0.1479, "step": 6133 }, { "epoch": 2.265140324963072, "grad_norm": 0.29135406017303467, "learning_rate": 4.9020815371351154e-05, "loss": 0.1744, "step": 6134 }, { "epoch": 2.2655096011816838, "grad_norm": 0.2479601353406906, "learning_rate": 4.899618179578766e-05, "loss": 0.1544, "step": 6135 }, { "epoch": 2.2658788774002954, "grad_norm": 0.29454588890075684, "learning_rate": 4.897154822022417e-05, "loss": 0.1915, "step": 6136 }, { "epoch": 2.266248153618907, "grad_norm": 0.26884835958480835, "learning_rate": 4.894691464466067e-05, "loss": 0.1669, "step": 6137 }, { "epoch": 2.2666174298375186, "grad_norm": 0.3065447509288788, "learning_rate": 4.892228106909718e-05, "loss": 0.1798, "step": 6138 }, { "epoch": 2.26698670605613, "grad_norm": 0.28422340750694275, "learning_rate": 4.889764749353369e-05, "loss": 0.1703, "step": 6139 }, { "epoch": 2.2673559822747413, "grad_norm": 0.280860036611557, "learning_rate": 4.8873013917970195e-05, "loss": 0.159, "step": 6140 }, { "epoch": 2.267725258493353, "grad_norm": 0.24607494473457336, "learning_rate": 4.88483803424067e-05, "loss": 0.1894, "step": 6141 }, { "epoch": 2.2680945347119645, "grad_norm": 0.27865591645240784, "learning_rate": 4.882374676684321e-05, "loss": 0.1778, "step": 6142 }, { "epoch": 2.268463810930576, "grad_norm": 0.25750765204429626, "learning_rate": 4.879911319127972e-05, "loss": 0.1715, "step": 6143 }, { "epoch": 2.2688330871491877, "grad_norm": 0.27004292607307434, "learning_rate": 4.8774479615716226e-05, "loss": 0.1729, "step": 6144 }, { "epoch": 2.269202363367799, "grad_norm": 0.29048722982406616, "learning_rate": 4.874984604015273e-05, "loss": 0.1731, "step": 6145 }, { "epoch": 2.2695716395864105, "grad_norm": 0.23353642225265503, "learning_rate": 4.8725212464589235e-05, "loss": 0.1484, "step": 6146 }, { "epoch": 2.269940915805022, "grad_norm": 0.2740772068500519, "learning_rate": 4.870057888902574e-05, "loss": 0.186, "step": 6147 }, { "epoch": 2.2703101920236337, "grad_norm": 0.24230247735977173, "learning_rate": 4.867594531346225e-05, "loss": 0.1568, "step": 6148 }, { "epoch": 2.2706794682422453, "grad_norm": 0.23569877445697784, "learning_rate": 4.865131173789876e-05, "loss": 0.1614, "step": 6149 }, { "epoch": 2.271048744460857, "grad_norm": 0.2888610363006592, "learning_rate": 4.862667816233527e-05, "loss": 0.1862, "step": 6150 }, { "epoch": 2.271048744460857, "eval_loss": 0.25195202231407166, "eval_runtime": 5.8615, "eval_samples_per_second": 8.53, "eval_steps_per_second": 1.194, "step": 6150 }, { "epoch": 2.271418020679468, "grad_norm": 0.2675657272338867, "learning_rate": 4.8602044586771775e-05, "loss": 0.1789, "step": 6151 }, { "epoch": 2.2717872968980797, "grad_norm": 0.29622554779052734, "learning_rate": 4.857741101120828e-05, "loss": 0.1676, "step": 6152 }, { "epoch": 2.2721565731166913, "grad_norm": 0.23971012234687805, "learning_rate": 4.8552777435644783e-05, "loss": 0.1561, "step": 6153 }, { "epoch": 2.272525849335303, "grad_norm": 0.28567540645599365, "learning_rate": 4.852814386008129e-05, "loss": 0.1601, "step": 6154 }, { "epoch": 2.2728951255539145, "grad_norm": 0.28229108452796936, "learning_rate": 4.85035102845178e-05, "loss": 0.1641, "step": 6155 }, { "epoch": 2.2732644017725256, "grad_norm": 0.23102128505706787, "learning_rate": 4.847887670895431e-05, "loss": 0.1663, "step": 6156 }, { "epoch": 2.2736336779911372, "grad_norm": 0.28751540184020996, "learning_rate": 4.8454243133390815e-05, "loss": 0.1815, "step": 6157 }, { "epoch": 2.274002954209749, "grad_norm": 0.2625499963760376, "learning_rate": 4.842960955782732e-05, "loss": 0.1755, "step": 6158 }, { "epoch": 2.2743722304283605, "grad_norm": 0.3934956192970276, "learning_rate": 4.840497598226383e-05, "loss": 0.1736, "step": 6159 }, { "epoch": 2.274741506646972, "grad_norm": 0.3167099356651306, "learning_rate": 4.838034240670034e-05, "loss": 0.1881, "step": 6160 }, { "epoch": 2.2751107828655837, "grad_norm": 0.2751583158969879, "learning_rate": 4.835570883113684e-05, "loss": 0.1548, "step": 6161 }, { "epoch": 2.275480059084195, "grad_norm": 0.28156930208206177, "learning_rate": 4.833107525557335e-05, "loss": 0.182, "step": 6162 }, { "epoch": 2.2758493353028064, "grad_norm": 0.2643374502658844, "learning_rate": 4.8306441680009855e-05, "loss": 0.1917, "step": 6163 }, { "epoch": 2.276218611521418, "grad_norm": 0.20647065341472626, "learning_rate": 4.828180810444636e-05, "loss": 0.1459, "step": 6164 }, { "epoch": 2.2765878877400296, "grad_norm": 0.26006636023521423, "learning_rate": 4.825717452888287e-05, "loss": 0.1585, "step": 6165 }, { "epoch": 2.2769571639586412, "grad_norm": 0.22283130884170532, "learning_rate": 4.823254095331938e-05, "loss": 0.1454, "step": 6166 }, { "epoch": 2.2773264401772524, "grad_norm": 0.281406432390213, "learning_rate": 4.820790737775589e-05, "loss": 0.1745, "step": 6167 }, { "epoch": 2.277695716395864, "grad_norm": 0.276691734790802, "learning_rate": 4.8183273802192395e-05, "loss": 0.1617, "step": 6168 }, { "epoch": 2.2780649926144756, "grad_norm": 0.30159685015678406, "learning_rate": 4.8158640226628896e-05, "loss": 0.1844, "step": 6169 }, { "epoch": 2.278434268833087, "grad_norm": 0.2677542269229889, "learning_rate": 4.8134006651065404e-05, "loss": 0.1617, "step": 6170 }, { "epoch": 2.278803545051699, "grad_norm": 0.28805914521217346, "learning_rate": 4.810937307550191e-05, "loss": 0.1471, "step": 6171 }, { "epoch": 2.2791728212703104, "grad_norm": 0.33055534958839417, "learning_rate": 4.808473949993842e-05, "loss": 0.1987, "step": 6172 }, { "epoch": 2.2795420974889216, "grad_norm": 0.5852456092834473, "learning_rate": 4.806010592437493e-05, "loss": 0.1839, "step": 6173 }, { "epoch": 2.279911373707533, "grad_norm": 0.2516915798187256, "learning_rate": 4.8035472348811435e-05, "loss": 0.1661, "step": 6174 }, { "epoch": 2.2802806499261448, "grad_norm": 0.2683897614479065, "learning_rate": 4.801083877324794e-05, "loss": 0.1737, "step": 6175 }, { "epoch": 2.2806499261447564, "grad_norm": 0.3504439890384674, "learning_rate": 4.7986205197684444e-05, "loss": 0.1915, "step": 6176 }, { "epoch": 2.281019202363368, "grad_norm": 0.26182207465171814, "learning_rate": 4.796157162212095e-05, "loss": 0.1838, "step": 6177 }, { "epoch": 2.281388478581979, "grad_norm": 0.25246095657348633, "learning_rate": 4.793693804655746e-05, "loss": 0.164, "step": 6178 }, { "epoch": 2.2817577548005907, "grad_norm": 0.22957351803779602, "learning_rate": 4.791230447099397e-05, "loss": 0.1564, "step": 6179 }, { "epoch": 2.2821270310192023, "grad_norm": 0.22464561462402344, "learning_rate": 4.7887670895430475e-05, "loss": 0.1524, "step": 6180 }, { "epoch": 2.282496307237814, "grad_norm": 0.26672062277793884, "learning_rate": 4.786303731986698e-05, "loss": 0.1687, "step": 6181 }, { "epoch": 2.2828655834564255, "grad_norm": 0.28550687432289124, "learning_rate": 4.783840374430349e-05, "loss": 0.1827, "step": 6182 }, { "epoch": 2.283234859675037, "grad_norm": 0.28829067945480347, "learning_rate": 4.781377016874e-05, "loss": 0.1431, "step": 6183 }, { "epoch": 2.2836041358936483, "grad_norm": 0.21833986043930054, "learning_rate": 4.77891365931765e-05, "loss": 0.1484, "step": 6184 }, { "epoch": 2.28397341211226, "grad_norm": 0.2626877725124359, "learning_rate": 4.776450301761301e-05, "loss": 0.1972, "step": 6185 }, { "epoch": 2.2843426883308715, "grad_norm": 0.24253219366073608, "learning_rate": 4.7739869442049516e-05, "loss": 0.1679, "step": 6186 }, { "epoch": 2.284711964549483, "grad_norm": 0.27723631262779236, "learning_rate": 4.7715235866486024e-05, "loss": 0.1592, "step": 6187 }, { "epoch": 2.2850812407680947, "grad_norm": 0.27834993600845337, "learning_rate": 4.769060229092253e-05, "loss": 0.1805, "step": 6188 }, { "epoch": 2.285450516986706, "grad_norm": 0.2531612515449524, "learning_rate": 4.766596871535904e-05, "loss": 0.1718, "step": 6189 }, { "epoch": 2.2858197932053175, "grad_norm": 0.2874826490879059, "learning_rate": 4.764133513979555e-05, "loss": 0.1774, "step": 6190 }, { "epoch": 2.286189069423929, "grad_norm": 0.3089540898799896, "learning_rate": 4.7616701564232055e-05, "loss": 0.1838, "step": 6191 }, { "epoch": 2.2865583456425407, "grad_norm": 0.2890220880508423, "learning_rate": 4.7592067988668556e-05, "loss": 0.1656, "step": 6192 }, { "epoch": 2.2869276218611523, "grad_norm": 0.27009257674217224, "learning_rate": 4.7567434413105064e-05, "loss": 0.174, "step": 6193 }, { "epoch": 2.287296898079764, "grad_norm": 0.2783883512020111, "learning_rate": 4.754280083754157e-05, "loss": 0.1623, "step": 6194 }, { "epoch": 2.287666174298375, "grad_norm": 0.2568540871143341, "learning_rate": 4.751816726197808e-05, "loss": 0.159, "step": 6195 }, { "epoch": 2.2880354505169866, "grad_norm": 0.32755327224731445, "learning_rate": 4.749353368641459e-05, "loss": 0.1919, "step": 6196 }, { "epoch": 2.2884047267355982, "grad_norm": 0.2393258959054947, "learning_rate": 4.7468900110851096e-05, "loss": 0.1569, "step": 6197 }, { "epoch": 2.28877400295421, "grad_norm": 0.3174670338630676, "learning_rate": 4.7444266535287603e-05, "loss": 0.1557, "step": 6198 }, { "epoch": 2.2891432791728215, "grad_norm": 0.2505471706390381, "learning_rate": 4.741963295972411e-05, "loss": 0.1769, "step": 6199 }, { "epoch": 2.2895125553914326, "grad_norm": 0.3370268642902374, "learning_rate": 4.739499938416061e-05, "loss": 0.1755, "step": 6200 }, { "epoch": 2.2895125553914326, "eval_loss": 0.2525298297405243, "eval_runtime": 5.8482, "eval_samples_per_second": 8.55, "eval_steps_per_second": 1.197, "step": 6200 }, { "epoch": 2.289881831610044, "grad_norm": 0.24336397647857666, "learning_rate": 4.737036580859712e-05, "loss": 0.1814, "step": 6201 }, { "epoch": 2.290251107828656, "grad_norm": 0.3346516788005829, "learning_rate": 4.734573223303363e-05, "loss": 0.1619, "step": 6202 }, { "epoch": 2.2906203840472674, "grad_norm": 0.2932000756263733, "learning_rate": 4.7321098657470136e-05, "loss": 0.1747, "step": 6203 }, { "epoch": 2.290989660265879, "grad_norm": 0.23745404183864594, "learning_rate": 4.7296465081906644e-05, "loss": 0.1513, "step": 6204 }, { "epoch": 2.2913589364844906, "grad_norm": 0.23634615540504456, "learning_rate": 4.727183150634315e-05, "loss": 0.1635, "step": 6205 }, { "epoch": 2.291728212703102, "grad_norm": 0.30578646063804626, "learning_rate": 4.724719793077966e-05, "loss": 0.1649, "step": 6206 }, { "epoch": 2.2920974889217134, "grad_norm": 0.26579129695892334, "learning_rate": 4.722256435521617e-05, "loss": 0.171, "step": 6207 }, { "epoch": 2.292466765140325, "grad_norm": 0.2875668406486511, "learning_rate": 4.719793077965267e-05, "loss": 0.1525, "step": 6208 }, { "epoch": 2.2928360413589366, "grad_norm": 0.26967793703079224, "learning_rate": 4.7173297204089176e-05, "loss": 0.1565, "step": 6209 }, { "epoch": 2.293205317577548, "grad_norm": 0.3409324884414673, "learning_rate": 4.7148663628525684e-05, "loss": 0.1633, "step": 6210 }, { "epoch": 2.2935745937961594, "grad_norm": 0.2611065208911896, "learning_rate": 4.7124030052962185e-05, "loss": 0.1584, "step": 6211 }, { "epoch": 2.293943870014771, "grad_norm": 0.2830992639064789, "learning_rate": 4.709939647739869e-05, "loss": 0.1608, "step": 6212 }, { "epoch": 2.2943131462333826, "grad_norm": 0.2753259539604187, "learning_rate": 4.70747629018352e-05, "loss": 0.145, "step": 6213 }, { "epoch": 2.294682422451994, "grad_norm": 0.3336687386035919, "learning_rate": 4.705012932627171e-05, "loss": 0.2059, "step": 6214 }, { "epoch": 2.2950516986706058, "grad_norm": 0.2711421549320221, "learning_rate": 4.702549575070822e-05, "loss": 0.1802, "step": 6215 }, { "epoch": 2.2954209748892174, "grad_norm": 0.28786736726760864, "learning_rate": 4.7000862175144725e-05, "loss": 0.1621, "step": 6216 }, { "epoch": 2.2957902511078285, "grad_norm": 0.27143457531929016, "learning_rate": 4.6976228599581226e-05, "loss": 0.1553, "step": 6217 }, { "epoch": 2.29615952732644, "grad_norm": 0.27764955163002014, "learning_rate": 4.6951595024017734e-05, "loss": 0.1783, "step": 6218 }, { "epoch": 2.2965288035450517, "grad_norm": 0.2730041742324829, "learning_rate": 4.692696144845424e-05, "loss": 0.1578, "step": 6219 }, { "epoch": 2.2968980797636633, "grad_norm": 0.20962485671043396, "learning_rate": 4.690232787289075e-05, "loss": 0.1599, "step": 6220 }, { "epoch": 2.2972673559822745, "grad_norm": 0.3561626076698303, "learning_rate": 4.687769429732726e-05, "loss": 0.2005, "step": 6221 }, { "epoch": 2.297636632200886, "grad_norm": 0.4217565953731537, "learning_rate": 4.6853060721763765e-05, "loss": 0.1888, "step": 6222 }, { "epoch": 2.2980059084194977, "grad_norm": 0.31085893511772156, "learning_rate": 4.682842714620027e-05, "loss": 0.1546, "step": 6223 }, { "epoch": 2.2983751846381093, "grad_norm": 0.2842569947242737, "learning_rate": 4.680379357063678e-05, "loss": 0.1618, "step": 6224 }, { "epoch": 2.298744460856721, "grad_norm": 0.30500495433807373, "learning_rate": 4.677915999507328e-05, "loss": 0.1971, "step": 6225 }, { "epoch": 2.2991137370753325, "grad_norm": 0.2780814468860626, "learning_rate": 4.675452641950979e-05, "loss": 0.1876, "step": 6226 }, { "epoch": 2.299483013293944, "grad_norm": 0.2767498791217804, "learning_rate": 4.67298928439463e-05, "loss": 0.1827, "step": 6227 }, { "epoch": 2.2998522895125553, "grad_norm": 0.2901836633682251, "learning_rate": 4.6705259268382805e-05, "loss": 0.1693, "step": 6228 }, { "epoch": 2.300221565731167, "grad_norm": 0.2251076102256775, "learning_rate": 4.668062569281931e-05, "loss": 0.1655, "step": 6229 }, { "epoch": 2.3005908419497785, "grad_norm": 0.30336886644363403, "learning_rate": 4.665599211725582e-05, "loss": 0.2055, "step": 6230 }, { "epoch": 2.30096011816839, "grad_norm": 0.2422707974910736, "learning_rate": 4.663135854169233e-05, "loss": 0.1484, "step": 6231 }, { "epoch": 2.3013293943870012, "grad_norm": 0.2562599182128906, "learning_rate": 4.660672496612884e-05, "loss": 0.1779, "step": 6232 }, { "epoch": 2.301698670605613, "grad_norm": 0.24225102365016937, "learning_rate": 4.658209139056534e-05, "loss": 0.1754, "step": 6233 }, { "epoch": 2.3020679468242244, "grad_norm": 0.32815083861351013, "learning_rate": 4.6557457815001846e-05, "loss": 0.1776, "step": 6234 }, { "epoch": 2.302437223042836, "grad_norm": 0.2695056200027466, "learning_rate": 4.6532824239438354e-05, "loss": 0.1567, "step": 6235 }, { "epoch": 2.3028064992614476, "grad_norm": 0.26233479380607605, "learning_rate": 4.650819066387486e-05, "loss": 0.1554, "step": 6236 }, { "epoch": 2.3031757754800593, "grad_norm": 0.31833940744400024, "learning_rate": 4.648355708831137e-05, "loss": 0.1921, "step": 6237 }, { "epoch": 2.303545051698671, "grad_norm": 0.2691422700881958, "learning_rate": 4.645892351274788e-05, "loss": 0.1708, "step": 6238 }, { "epoch": 2.303914327917282, "grad_norm": 0.2862132489681244, "learning_rate": 4.6434289937184385e-05, "loss": 0.144, "step": 6239 }, { "epoch": 2.3042836041358936, "grad_norm": 0.2821369767189026, "learning_rate": 4.640965636162089e-05, "loss": 0.1917, "step": 6240 }, { "epoch": 2.304652880354505, "grad_norm": 0.4799966514110565, "learning_rate": 4.6385022786057394e-05, "loss": 0.183, "step": 6241 }, { "epoch": 2.305022156573117, "grad_norm": 0.32981500029563904, "learning_rate": 4.63603892104939e-05, "loss": 0.1887, "step": 6242 }, { "epoch": 2.305391432791728, "grad_norm": 0.2521057724952698, "learning_rate": 4.633575563493041e-05, "loss": 0.181, "step": 6243 }, { "epoch": 2.3057607090103396, "grad_norm": 0.2086019515991211, "learning_rate": 4.631112205936692e-05, "loss": 0.1515, "step": 6244 }, { "epoch": 2.306129985228951, "grad_norm": 0.2835588753223419, "learning_rate": 4.6286488483803426e-05, "loss": 0.1608, "step": 6245 }, { "epoch": 2.306499261447563, "grad_norm": 0.22256755828857422, "learning_rate": 4.6261854908239933e-05, "loss": 0.1443, "step": 6246 }, { "epoch": 2.3068685376661744, "grad_norm": 0.2678048312664032, "learning_rate": 4.623722133267644e-05, "loss": 0.1637, "step": 6247 }, { "epoch": 2.307237813884786, "grad_norm": 0.2963012754917145, "learning_rate": 4.621258775711295e-05, "loss": 0.177, "step": 6248 }, { "epoch": 2.307607090103397, "grad_norm": 0.28620731830596924, "learning_rate": 4.618795418154945e-05, "loss": 0.1728, "step": 6249 }, { "epoch": 2.3079763663220088, "grad_norm": 0.2526327073574066, "learning_rate": 4.616332060598596e-05, "loss": 0.1739, "step": 6250 }, { "epoch": 2.3079763663220088, "eval_loss": 0.25240325927734375, "eval_runtime": 5.8578, "eval_samples_per_second": 8.536, "eval_steps_per_second": 1.195, "step": 6250 }, { "epoch": 2.3083456425406204, "grad_norm": 0.22743669152259827, "learning_rate": 4.6138687030422466e-05, "loss": 0.1491, "step": 6251 }, { "epoch": 2.308714918759232, "grad_norm": 0.26746290922164917, "learning_rate": 4.6114053454858974e-05, "loss": 0.1698, "step": 6252 }, { "epoch": 2.3090841949778436, "grad_norm": 0.31007710099220276, "learning_rate": 4.608941987929548e-05, "loss": 0.1797, "step": 6253 }, { "epoch": 2.3094534711964547, "grad_norm": 0.23614712059497833, "learning_rate": 4.606478630373199e-05, "loss": 0.1617, "step": 6254 }, { "epoch": 2.3098227474150663, "grad_norm": 0.29411062598228455, "learning_rate": 4.60401527281685e-05, "loss": 0.1738, "step": 6255 }, { "epoch": 2.310192023633678, "grad_norm": 0.27511805295944214, "learning_rate": 4.6015519152605005e-05, "loss": 0.1942, "step": 6256 }, { "epoch": 2.3105612998522895, "grad_norm": 0.2956949472427368, "learning_rate": 4.5990885577041506e-05, "loss": 0.1611, "step": 6257 }, { "epoch": 2.310930576070901, "grad_norm": 0.26745113730430603, "learning_rate": 4.5966252001478014e-05, "loss": 0.1787, "step": 6258 }, { "epoch": 2.3112998522895127, "grad_norm": 0.3227623999118805, "learning_rate": 4.594161842591452e-05, "loss": 0.1649, "step": 6259 }, { "epoch": 2.311669128508124, "grad_norm": 0.2465844452381134, "learning_rate": 4.591698485035103e-05, "loss": 0.1651, "step": 6260 }, { "epoch": 2.3120384047267355, "grad_norm": 0.33145004510879517, "learning_rate": 4.589235127478754e-05, "loss": 0.1713, "step": 6261 }, { "epoch": 2.312407680945347, "grad_norm": 0.2592099606990814, "learning_rate": 4.5867717699224046e-05, "loss": 0.1615, "step": 6262 }, { "epoch": 2.3127769571639587, "grad_norm": 0.3428521454334259, "learning_rate": 4.5843084123660554e-05, "loss": 0.1756, "step": 6263 }, { "epoch": 2.3131462333825703, "grad_norm": 0.23452892899513245, "learning_rate": 4.581845054809706e-05, "loss": 0.1468, "step": 6264 }, { "epoch": 2.3135155096011815, "grad_norm": 0.2303593009710312, "learning_rate": 4.579381697253356e-05, "loss": 0.1649, "step": 6265 }, { "epoch": 2.313884785819793, "grad_norm": 0.3098956048488617, "learning_rate": 4.576918339697007e-05, "loss": 0.2381, "step": 6266 }, { "epoch": 2.3142540620384047, "grad_norm": 0.28998488187789917, "learning_rate": 4.574454982140658e-05, "loss": 0.1796, "step": 6267 }, { "epoch": 2.3146233382570163, "grad_norm": 0.32793858647346497, "learning_rate": 4.5719916245843086e-05, "loss": 0.2072, "step": 6268 }, { "epoch": 2.314992614475628, "grad_norm": 0.2523498833179474, "learning_rate": 4.5695282670279594e-05, "loss": 0.1693, "step": 6269 }, { "epoch": 2.3153618906942395, "grad_norm": 0.2589319050312042, "learning_rate": 4.56706490947161e-05, "loss": 0.1561, "step": 6270 }, { "epoch": 2.3157311669128506, "grad_norm": 0.24488919973373413, "learning_rate": 4.564601551915261e-05, "loss": 0.1719, "step": 6271 }, { "epoch": 2.3161004431314622, "grad_norm": 0.28501635789871216, "learning_rate": 4.562138194358912e-05, "loss": 0.1895, "step": 6272 }, { "epoch": 2.316469719350074, "grad_norm": 0.24768377840518951, "learning_rate": 4.559674836802562e-05, "loss": 0.1545, "step": 6273 }, { "epoch": 2.3168389955686854, "grad_norm": 0.33171817660331726, "learning_rate": 4.5572114792462127e-05, "loss": 0.1704, "step": 6274 }, { "epoch": 2.317208271787297, "grad_norm": 0.3058539628982544, "learning_rate": 4.5547481216898634e-05, "loss": 0.1707, "step": 6275 }, { "epoch": 2.317577548005908, "grad_norm": 0.2971458435058594, "learning_rate": 4.552284764133514e-05, "loss": 0.2085, "step": 6276 }, { "epoch": 2.31794682422452, "grad_norm": 0.33643579483032227, "learning_rate": 4.549821406577165e-05, "loss": 0.193, "step": 6277 }, { "epoch": 2.3183161004431314, "grad_norm": 0.26456019282341003, "learning_rate": 4.547358049020816e-05, "loss": 0.171, "step": 6278 }, { "epoch": 2.318685376661743, "grad_norm": 0.287834107875824, "learning_rate": 4.5448946914644666e-05, "loss": 0.1846, "step": 6279 }, { "epoch": 2.3190546528803546, "grad_norm": 0.36195677518844604, "learning_rate": 4.5424313339081174e-05, "loss": 0.173, "step": 6280 }, { "epoch": 2.319423929098966, "grad_norm": 0.27335676550865173, "learning_rate": 4.5399679763517675e-05, "loss": 0.1781, "step": 6281 }, { "epoch": 2.3197932053175774, "grad_norm": 0.21281443536281586, "learning_rate": 4.537504618795418e-05, "loss": 0.1551, "step": 6282 }, { "epoch": 2.320162481536189, "grad_norm": 0.30202749371528625, "learning_rate": 4.535041261239069e-05, "loss": 0.1994, "step": 6283 }, { "epoch": 2.3205317577548006, "grad_norm": 0.2695281207561493, "learning_rate": 4.53257790368272e-05, "loss": 0.1846, "step": 6284 }, { "epoch": 2.320901033973412, "grad_norm": 0.25889331102371216, "learning_rate": 4.5301145461263706e-05, "loss": 0.1472, "step": 6285 }, { "epoch": 2.321270310192024, "grad_norm": 0.30311331152915955, "learning_rate": 4.5276511885700214e-05, "loss": 0.1949, "step": 6286 }, { "epoch": 2.321639586410635, "grad_norm": 0.2517571449279785, "learning_rate": 4.525187831013672e-05, "loss": 0.1459, "step": 6287 }, { "epoch": 2.3220088626292466, "grad_norm": 0.28349751234054565, "learning_rate": 4.522724473457322e-05, "loss": 0.1683, "step": 6288 }, { "epoch": 2.322378138847858, "grad_norm": 0.3386727273464203, "learning_rate": 4.520261115900973e-05, "loss": 0.1611, "step": 6289 }, { "epoch": 2.3227474150664698, "grad_norm": 0.2999275326728821, "learning_rate": 4.517797758344624e-05, "loss": 0.1829, "step": 6290 }, { "epoch": 2.3231166912850814, "grad_norm": 0.2655416429042816, "learning_rate": 4.515334400788275e-05, "loss": 0.191, "step": 6291 }, { "epoch": 2.323485967503693, "grad_norm": 0.2688995897769928, "learning_rate": 4.5128710432319254e-05, "loss": 0.1574, "step": 6292 }, { "epoch": 2.323855243722304, "grad_norm": 0.30712631344795227, "learning_rate": 4.510407685675576e-05, "loss": 0.184, "step": 6293 }, { "epoch": 2.3242245199409157, "grad_norm": 0.29097869992256165, "learning_rate": 4.507944328119227e-05, "loss": 0.1767, "step": 6294 }, { "epoch": 2.3245937961595273, "grad_norm": 0.2865394651889801, "learning_rate": 4.505480970562878e-05, "loss": 0.189, "step": 6295 }, { "epoch": 2.324963072378139, "grad_norm": 0.25940632820129395, "learning_rate": 4.503017613006528e-05, "loss": 0.1749, "step": 6296 }, { "epoch": 2.3253323485967505, "grad_norm": 0.3576686382293701, "learning_rate": 4.500554255450179e-05, "loss": 0.2159, "step": 6297 }, { "epoch": 2.3257016248153617, "grad_norm": 0.27279672026634216, "learning_rate": 4.4980908978938295e-05, "loss": 0.1692, "step": 6298 }, { "epoch": 2.3260709010339733, "grad_norm": 0.30613669753074646, "learning_rate": 4.49562754033748e-05, "loss": 0.1649, "step": 6299 }, { "epoch": 2.326440177252585, "grad_norm": 0.24356256425380707, "learning_rate": 4.493164182781131e-05, "loss": 0.1635, "step": 6300 }, { "epoch": 2.326440177252585, "eval_loss": 0.2532932758331299, "eval_runtime": 5.8654, "eval_samples_per_second": 8.525, "eval_steps_per_second": 1.193, "step": 6300 }, { "epoch": 2.3268094534711965, "grad_norm": 0.23789072036743164, "learning_rate": 4.490700825224782e-05, "loss": 0.1688, "step": 6301 }, { "epoch": 2.327178729689808, "grad_norm": 0.2563260793685913, "learning_rate": 4.4882374676684326e-05, "loss": 0.1625, "step": 6302 }, { "epoch": 2.3275480059084197, "grad_norm": 0.22482311725616455, "learning_rate": 4.4857741101120834e-05, "loss": 0.162, "step": 6303 }, { "epoch": 2.327917282127031, "grad_norm": 0.2803346812725067, "learning_rate": 4.4833107525557335e-05, "loss": 0.1773, "step": 6304 }, { "epoch": 2.3282865583456425, "grad_norm": 0.2905280292034149, "learning_rate": 4.480847394999384e-05, "loss": 0.1988, "step": 6305 }, { "epoch": 2.328655834564254, "grad_norm": 0.3134540617465973, "learning_rate": 4.478384037443035e-05, "loss": 0.1677, "step": 6306 }, { "epoch": 2.3290251107828657, "grad_norm": 0.2726571261882782, "learning_rate": 4.475920679886686e-05, "loss": 0.1652, "step": 6307 }, { "epoch": 2.3293943870014773, "grad_norm": 0.2989545166492462, "learning_rate": 4.473457322330337e-05, "loss": 0.1719, "step": 6308 }, { "epoch": 2.3297636632200884, "grad_norm": 0.3618376851081848, "learning_rate": 4.4709939647739875e-05, "loss": 0.2088, "step": 6309 }, { "epoch": 2.3301329394387, "grad_norm": 0.28249844908714294, "learning_rate": 4.468530607217638e-05, "loss": 0.1671, "step": 6310 }, { "epoch": 2.3305022156573116, "grad_norm": 0.25478148460388184, "learning_rate": 4.466067249661289e-05, "loss": 0.1461, "step": 6311 }, { "epoch": 2.3308714918759232, "grad_norm": 0.2577178180217743, "learning_rate": 4.463603892104939e-05, "loss": 0.1387, "step": 6312 }, { "epoch": 2.331240768094535, "grad_norm": 0.24544434249401093, "learning_rate": 4.46114053454859e-05, "loss": 0.1641, "step": 6313 }, { "epoch": 2.3316100443131464, "grad_norm": 0.31728222966194153, "learning_rate": 4.458677176992241e-05, "loss": 0.1959, "step": 6314 }, { "epoch": 2.3319793205317576, "grad_norm": 0.27827325463294983, "learning_rate": 4.4562138194358915e-05, "loss": 0.1544, "step": 6315 }, { "epoch": 2.332348596750369, "grad_norm": 0.2744172513484955, "learning_rate": 4.453750461879542e-05, "loss": 0.1644, "step": 6316 }, { "epoch": 2.332717872968981, "grad_norm": 0.2671150267124176, "learning_rate": 4.451287104323193e-05, "loss": 0.1753, "step": 6317 }, { "epoch": 2.3330871491875924, "grad_norm": 0.2949393093585968, "learning_rate": 4.448823746766844e-05, "loss": 0.1829, "step": 6318 }, { "epoch": 2.333456425406204, "grad_norm": 0.3001161515712738, "learning_rate": 4.4463603892104946e-05, "loss": 0.1976, "step": 6319 }, { "epoch": 2.333825701624815, "grad_norm": 0.327101469039917, "learning_rate": 4.443897031654145e-05, "loss": 0.1831, "step": 6320 }, { "epoch": 2.3341949778434268, "grad_norm": 0.2728175222873688, "learning_rate": 4.4414336740977955e-05, "loss": 0.1485, "step": 6321 }, { "epoch": 2.3345642540620384, "grad_norm": 0.303019642829895, "learning_rate": 4.438970316541446e-05, "loss": 0.1778, "step": 6322 }, { "epoch": 2.33493353028065, "grad_norm": 0.2969949543476105, "learning_rate": 4.436506958985097e-05, "loss": 0.1812, "step": 6323 }, { "epoch": 2.3353028064992616, "grad_norm": 0.2519605755805969, "learning_rate": 4.434043601428748e-05, "loss": 0.1629, "step": 6324 }, { "epoch": 2.335672082717873, "grad_norm": 0.2607879340648651, "learning_rate": 4.431580243872399e-05, "loss": 0.1566, "step": 6325 }, { "epoch": 2.3360413589364843, "grad_norm": 0.21427521109580994, "learning_rate": 4.4291168863160495e-05, "loss": 0.1424, "step": 6326 }, { "epoch": 2.336410635155096, "grad_norm": 0.2910623550415039, "learning_rate": 4.4266535287596996e-05, "loss": 0.1723, "step": 6327 }, { "epoch": 2.3367799113737076, "grad_norm": 0.24531933665275574, "learning_rate": 4.4241901712033504e-05, "loss": 0.1562, "step": 6328 }, { "epoch": 2.337149187592319, "grad_norm": 0.2238311618566513, "learning_rate": 4.4217268136470005e-05, "loss": 0.134, "step": 6329 }, { "epoch": 2.3375184638109308, "grad_norm": 0.2861112356185913, "learning_rate": 4.419263456090651e-05, "loss": 0.1606, "step": 6330 }, { "epoch": 2.337887740029542, "grad_norm": 0.24773359298706055, "learning_rate": 4.416800098534302e-05, "loss": 0.1816, "step": 6331 }, { "epoch": 2.3382570162481535, "grad_norm": 0.22608470916748047, "learning_rate": 4.414336740977953e-05, "loss": 0.1573, "step": 6332 }, { "epoch": 2.338626292466765, "grad_norm": 0.2583619952201843, "learning_rate": 4.4118733834216036e-05, "loss": 0.1374, "step": 6333 }, { "epoch": 2.3389955686853767, "grad_norm": 0.29897114634513855, "learning_rate": 4.4094100258652544e-05, "loss": 0.1787, "step": 6334 }, { "epoch": 2.3393648449039883, "grad_norm": 0.2301228940486908, "learning_rate": 4.406946668308905e-05, "loss": 0.1517, "step": 6335 }, { "epoch": 2.3397341211226, "grad_norm": 0.2524890601634979, "learning_rate": 4.404483310752556e-05, "loss": 0.172, "step": 6336 }, { "epoch": 2.340103397341211, "grad_norm": 0.25114837288856506, "learning_rate": 4.402019953196206e-05, "loss": 0.1762, "step": 6337 }, { "epoch": 2.3404726735598227, "grad_norm": 0.2742963135242462, "learning_rate": 4.399556595639857e-05, "loss": 0.1658, "step": 6338 }, { "epoch": 2.3408419497784343, "grad_norm": 0.24793750047683716, "learning_rate": 4.397093238083508e-05, "loss": 0.1616, "step": 6339 }, { "epoch": 2.341211225997046, "grad_norm": 0.29155686497688293, "learning_rate": 4.3946298805271585e-05, "loss": 0.156, "step": 6340 }, { "epoch": 2.3415805022156575, "grad_norm": 0.2763091027736664, "learning_rate": 4.392166522970809e-05, "loss": 0.181, "step": 6341 }, { "epoch": 2.3419497784342687, "grad_norm": 0.3174024224281311, "learning_rate": 4.38970316541446e-05, "loss": 0.1653, "step": 6342 }, { "epoch": 2.3423190546528803, "grad_norm": 0.28082993626594543, "learning_rate": 4.387239807858111e-05, "loss": 0.1505, "step": 6343 }, { "epoch": 2.342688330871492, "grad_norm": 0.247702494263649, "learning_rate": 4.3847764503017616e-05, "loss": 0.1567, "step": 6344 }, { "epoch": 2.3430576070901035, "grad_norm": 0.3166281282901764, "learning_rate": 4.382313092745412e-05, "loss": 0.1635, "step": 6345 }, { "epoch": 2.343426883308715, "grad_norm": 0.32806655764579773, "learning_rate": 4.3798497351890625e-05, "loss": 0.1911, "step": 6346 }, { "epoch": 2.3437961595273267, "grad_norm": 0.2769249379634857, "learning_rate": 4.377386377632713e-05, "loss": 0.1724, "step": 6347 }, { "epoch": 2.344165435745938, "grad_norm": 0.2766216993331909, "learning_rate": 4.374923020076364e-05, "loss": 0.173, "step": 6348 }, { "epoch": 2.3445347119645494, "grad_norm": 0.2586168944835663, "learning_rate": 4.372459662520015e-05, "loss": 0.1417, "step": 6349 }, { "epoch": 2.344903988183161, "grad_norm": 0.2673698961734772, "learning_rate": 4.3699963049636656e-05, "loss": 0.177, "step": 6350 }, { "epoch": 2.344903988183161, "eval_loss": 0.2545863091945648, "eval_runtime": 5.8601, "eval_samples_per_second": 8.532, "eval_steps_per_second": 1.195, "step": 6350 }, { "epoch": 2.3452732644017726, "grad_norm": 0.29882508516311646, "learning_rate": 4.3675329474073164e-05, "loss": 0.162, "step": 6351 }, { "epoch": 2.345642540620384, "grad_norm": 0.24953065812587738, "learning_rate": 4.365069589850967e-05, "loss": 0.1657, "step": 6352 }, { "epoch": 2.3460118168389954, "grad_norm": 0.2816498279571533, "learning_rate": 4.362606232294617e-05, "loss": 0.1632, "step": 6353 }, { "epoch": 2.346381093057607, "grad_norm": 0.2919892370700836, "learning_rate": 4.360142874738268e-05, "loss": 0.1675, "step": 6354 }, { "epoch": 2.3467503692762186, "grad_norm": 0.29279571771621704, "learning_rate": 4.357679517181919e-05, "loss": 0.1769, "step": 6355 }, { "epoch": 2.34711964549483, "grad_norm": 0.3148755431175232, "learning_rate": 4.35521615962557e-05, "loss": 0.1889, "step": 6356 }, { "epoch": 2.347488921713442, "grad_norm": 0.3025984764099121, "learning_rate": 4.3527528020692205e-05, "loss": 0.1729, "step": 6357 }, { "epoch": 2.3478581979320534, "grad_norm": 0.2500268816947937, "learning_rate": 4.350289444512871e-05, "loss": 0.1416, "step": 6358 }, { "epoch": 2.3482274741506646, "grad_norm": 0.3548707962036133, "learning_rate": 4.347826086956522e-05, "loss": 0.1898, "step": 6359 }, { "epoch": 2.348596750369276, "grad_norm": 0.36047059297561646, "learning_rate": 4.345362729400173e-05, "loss": 0.1478, "step": 6360 }, { "epoch": 2.348966026587888, "grad_norm": 0.3263508081436157, "learning_rate": 4.342899371843823e-05, "loss": 0.207, "step": 6361 }, { "epoch": 2.3493353028064994, "grad_norm": 0.24649988114833832, "learning_rate": 4.340436014287474e-05, "loss": 0.1609, "step": 6362 }, { "epoch": 2.3497045790251105, "grad_norm": 0.309300035238266, "learning_rate": 4.3379726567311245e-05, "loss": 0.1729, "step": 6363 }, { "epoch": 2.350073855243722, "grad_norm": 0.21511535346508026, "learning_rate": 4.335509299174775e-05, "loss": 0.1617, "step": 6364 }, { "epoch": 2.3504431314623337, "grad_norm": 0.3180568516254425, "learning_rate": 4.333045941618426e-05, "loss": 0.1806, "step": 6365 }, { "epoch": 2.3508124076809453, "grad_norm": 0.2766928970813751, "learning_rate": 4.330582584062077e-05, "loss": 0.1844, "step": 6366 }, { "epoch": 2.351181683899557, "grad_norm": 0.2802768051624298, "learning_rate": 4.3281192265057276e-05, "loss": 0.1818, "step": 6367 }, { "epoch": 2.3515509601181686, "grad_norm": 0.25994670391082764, "learning_rate": 4.3256558689493784e-05, "loss": 0.178, "step": 6368 }, { "epoch": 2.35192023633678, "grad_norm": 0.28026604652404785, "learning_rate": 4.3231925113930285e-05, "loss": 0.1942, "step": 6369 }, { "epoch": 2.3522895125553913, "grad_norm": 0.27653589844703674, "learning_rate": 4.320729153836679e-05, "loss": 0.187, "step": 6370 }, { "epoch": 2.352658788774003, "grad_norm": 0.24260860681533813, "learning_rate": 4.31826579628033e-05, "loss": 0.1558, "step": 6371 }, { "epoch": 2.3530280649926145, "grad_norm": 0.2591858208179474, "learning_rate": 4.315802438723981e-05, "loss": 0.1645, "step": 6372 }, { "epoch": 2.353397341211226, "grad_norm": 0.2710878252983093, "learning_rate": 4.313339081167632e-05, "loss": 0.1624, "step": 6373 }, { "epoch": 2.3537666174298373, "grad_norm": 0.29834455251693726, "learning_rate": 4.3108757236112825e-05, "loss": 0.1775, "step": 6374 }, { "epoch": 2.354135893648449, "grad_norm": 0.25172194838523865, "learning_rate": 4.308412366054933e-05, "loss": 0.1781, "step": 6375 }, { "epoch": 2.3545051698670605, "grad_norm": 0.23107177019119263, "learning_rate": 4.305949008498584e-05, "loss": 0.1469, "step": 6376 }, { "epoch": 2.354874446085672, "grad_norm": 0.2343122661113739, "learning_rate": 4.303485650942234e-05, "loss": 0.1449, "step": 6377 }, { "epoch": 2.3552437223042837, "grad_norm": 0.24231816828250885, "learning_rate": 4.301022293385885e-05, "loss": 0.1738, "step": 6378 }, { "epoch": 2.3556129985228953, "grad_norm": 0.29539164900779724, "learning_rate": 4.298558935829536e-05, "loss": 0.1668, "step": 6379 }, { "epoch": 2.3559822747415065, "grad_norm": 0.24808554351329803, "learning_rate": 4.2960955782731865e-05, "loss": 0.155, "step": 6380 }, { "epoch": 2.356351550960118, "grad_norm": 0.2660427689552307, "learning_rate": 4.293632220716837e-05, "loss": 0.1648, "step": 6381 }, { "epoch": 2.3567208271787297, "grad_norm": 0.280545175075531, "learning_rate": 4.291168863160488e-05, "loss": 0.1594, "step": 6382 }, { "epoch": 2.3570901033973413, "grad_norm": 0.31870830059051514, "learning_rate": 4.288705505604139e-05, "loss": 0.1636, "step": 6383 }, { "epoch": 2.357459379615953, "grad_norm": 0.24920117855072021, "learning_rate": 4.28624214804779e-05, "loss": 0.1608, "step": 6384 }, { "epoch": 2.357828655834564, "grad_norm": 0.24905510246753693, "learning_rate": 4.28377879049144e-05, "loss": 0.136, "step": 6385 }, { "epoch": 2.3581979320531756, "grad_norm": 0.2614610195159912, "learning_rate": 4.2813154329350906e-05, "loss": 0.1624, "step": 6386 }, { "epoch": 2.3585672082717872, "grad_norm": 0.24701111018657684, "learning_rate": 4.2788520753787413e-05, "loss": 0.1609, "step": 6387 }, { "epoch": 2.358936484490399, "grad_norm": 0.3295655846595764, "learning_rate": 4.276388717822392e-05, "loss": 0.1809, "step": 6388 }, { "epoch": 2.3593057607090104, "grad_norm": 0.3034295439720154, "learning_rate": 4.273925360266043e-05, "loss": 0.193, "step": 6389 }, { "epoch": 2.359675036927622, "grad_norm": 0.26090866327285767, "learning_rate": 4.271462002709694e-05, "loss": 0.1548, "step": 6390 }, { "epoch": 2.360044313146233, "grad_norm": 0.30005943775177, "learning_rate": 4.2689986451533445e-05, "loss": 0.1803, "step": 6391 }, { "epoch": 2.360413589364845, "grad_norm": 0.2783083915710449, "learning_rate": 4.266535287596995e-05, "loss": 0.1706, "step": 6392 }, { "epoch": 2.3607828655834564, "grad_norm": 0.303497850894928, "learning_rate": 4.2640719300406454e-05, "loss": 0.1955, "step": 6393 }, { "epoch": 2.361152141802068, "grad_norm": 0.22162476181983948, "learning_rate": 4.261608572484296e-05, "loss": 0.1579, "step": 6394 }, { "epoch": 2.3615214180206796, "grad_norm": 0.24626381695270538, "learning_rate": 4.259145214927947e-05, "loss": 0.1659, "step": 6395 }, { "epoch": 2.3618906942392908, "grad_norm": 0.22730812430381775, "learning_rate": 4.256681857371598e-05, "loss": 0.1514, "step": 6396 }, { "epoch": 2.3622599704579024, "grad_norm": 0.2827454209327698, "learning_rate": 4.2542184998152485e-05, "loss": 0.1878, "step": 6397 }, { "epoch": 2.362629246676514, "grad_norm": 0.29262617230415344, "learning_rate": 4.251755142258899e-05, "loss": 0.1694, "step": 6398 }, { "epoch": 2.3629985228951256, "grad_norm": 0.2770841717720032, "learning_rate": 4.24929178470255e-05, "loss": 0.1689, "step": 6399 }, { "epoch": 2.363367799113737, "grad_norm": 0.24018266797065735, "learning_rate": 4.246828427146201e-05, "loss": 0.1864, "step": 6400 }, { "epoch": 2.363367799113737, "eval_loss": 0.2547506093978882, "eval_runtime": 5.8503, "eval_samples_per_second": 8.547, "eval_steps_per_second": 1.197, "step": 6400 }, { "epoch": 2.363737075332349, "grad_norm": 0.2783015966415405, "learning_rate": 4.244365069589851e-05, "loss": 0.1675, "step": 6401 }, { "epoch": 2.36410635155096, "grad_norm": 0.28610220551490784, "learning_rate": 4.241901712033502e-05, "loss": 0.195, "step": 6402 }, { "epoch": 2.3644756277695715, "grad_norm": 0.24822600185871124, "learning_rate": 4.2394383544771526e-05, "loss": 0.1515, "step": 6403 }, { "epoch": 2.364844903988183, "grad_norm": 0.2633461654186249, "learning_rate": 4.2369749969208034e-05, "loss": 0.1539, "step": 6404 }, { "epoch": 2.3652141802067947, "grad_norm": 0.25599509477615356, "learning_rate": 4.234511639364454e-05, "loss": 0.1757, "step": 6405 }, { "epoch": 2.3655834564254064, "grad_norm": 0.34993016719818115, "learning_rate": 4.232048281808105e-05, "loss": 0.1865, "step": 6406 }, { "epoch": 2.3659527326440175, "grad_norm": 0.26258376240730286, "learning_rate": 4.229584924251756e-05, "loss": 0.1523, "step": 6407 }, { "epoch": 2.366322008862629, "grad_norm": 0.27636638283729553, "learning_rate": 4.227121566695406e-05, "loss": 0.1512, "step": 6408 }, { "epoch": 2.3666912850812407, "grad_norm": 0.3100382089614868, "learning_rate": 4.2246582091390566e-05, "loss": 0.187, "step": 6409 }, { "epoch": 2.3670605612998523, "grad_norm": 0.2492201328277588, "learning_rate": 4.2221948515827074e-05, "loss": 0.1529, "step": 6410 }, { "epoch": 2.367429837518464, "grad_norm": 0.22442567348480225, "learning_rate": 4.219731494026358e-05, "loss": 0.1607, "step": 6411 }, { "epoch": 2.3677991137370755, "grad_norm": 0.3340201675891876, "learning_rate": 4.217268136470009e-05, "loss": 0.1775, "step": 6412 }, { "epoch": 2.3681683899556867, "grad_norm": 0.2450280338525772, "learning_rate": 4.21480477891366e-05, "loss": 0.1723, "step": 6413 }, { "epoch": 2.3685376661742983, "grad_norm": 0.22480812668800354, "learning_rate": 4.2123414213573105e-05, "loss": 0.1559, "step": 6414 }, { "epoch": 2.36890694239291, "grad_norm": 0.24426712095737457, "learning_rate": 4.209878063800961e-05, "loss": 0.1476, "step": 6415 }, { "epoch": 2.3692762186115215, "grad_norm": 0.2631894052028656, "learning_rate": 4.2074147062446114e-05, "loss": 0.1911, "step": 6416 }, { "epoch": 2.369645494830133, "grad_norm": 0.2455267757177353, "learning_rate": 4.204951348688262e-05, "loss": 0.1828, "step": 6417 }, { "epoch": 2.3700147710487443, "grad_norm": 0.38169267773628235, "learning_rate": 4.202487991131913e-05, "loss": 0.1904, "step": 6418 }, { "epoch": 2.370384047267356, "grad_norm": 0.3039427101612091, "learning_rate": 4.200024633575564e-05, "loss": 0.1731, "step": 6419 }, { "epoch": 2.3707533234859675, "grad_norm": 0.27263590693473816, "learning_rate": 4.1975612760192146e-05, "loss": 0.1798, "step": 6420 }, { "epoch": 2.371122599704579, "grad_norm": 0.2618173360824585, "learning_rate": 4.1950979184628654e-05, "loss": 0.1492, "step": 6421 }, { "epoch": 2.3714918759231907, "grad_norm": 0.2843448221683502, "learning_rate": 4.192634560906516e-05, "loss": 0.1614, "step": 6422 }, { "epoch": 2.3718611521418023, "grad_norm": 0.2256832867860794, "learning_rate": 4.190171203350167e-05, "loss": 0.1657, "step": 6423 }, { "epoch": 2.3722304283604134, "grad_norm": 0.29430991411209106, "learning_rate": 4.187707845793817e-05, "loss": 0.1744, "step": 6424 }, { "epoch": 2.372599704579025, "grad_norm": 0.2796902358531952, "learning_rate": 4.185244488237468e-05, "loss": 0.1913, "step": 6425 }, { "epoch": 2.3729689807976366, "grad_norm": 0.2929346561431885, "learning_rate": 4.1827811306811186e-05, "loss": 0.1928, "step": 6426 }, { "epoch": 2.3733382570162482, "grad_norm": 0.46698009967803955, "learning_rate": 4.1803177731247694e-05, "loss": 0.1973, "step": 6427 }, { "epoch": 2.37370753323486, "grad_norm": 0.2522735297679901, "learning_rate": 4.17785441556842e-05, "loss": 0.1671, "step": 6428 }, { "epoch": 2.374076809453471, "grad_norm": 0.3110278248786926, "learning_rate": 4.175391058012071e-05, "loss": 0.1913, "step": 6429 }, { "epoch": 2.3744460856720826, "grad_norm": 0.3266526758670807, "learning_rate": 4.172927700455722e-05, "loss": 0.1873, "step": 6430 }, { "epoch": 2.374815361890694, "grad_norm": 0.2453799694776535, "learning_rate": 4.1704643428993726e-05, "loss": 0.1551, "step": 6431 }, { "epoch": 2.375184638109306, "grad_norm": 0.24761134386062622, "learning_rate": 4.168000985343023e-05, "loss": 0.159, "step": 6432 }, { "epoch": 2.3755539143279174, "grad_norm": 0.26183828711509705, "learning_rate": 4.1655376277866734e-05, "loss": 0.155, "step": 6433 }, { "epoch": 2.375923190546529, "grad_norm": 0.24739377200603485, "learning_rate": 4.163074270230324e-05, "loss": 0.1538, "step": 6434 }, { "epoch": 2.37629246676514, "grad_norm": 0.2409384697675705, "learning_rate": 4.160610912673975e-05, "loss": 0.1684, "step": 6435 }, { "epoch": 2.3766617429837518, "grad_norm": 0.28293392062187195, "learning_rate": 4.158147555117626e-05, "loss": 0.1654, "step": 6436 }, { "epoch": 2.3770310192023634, "grad_norm": 0.31228867173194885, "learning_rate": 4.1556841975612766e-05, "loss": 0.1648, "step": 6437 }, { "epoch": 2.377400295420975, "grad_norm": 0.2207242101430893, "learning_rate": 4.1532208400049274e-05, "loss": 0.155, "step": 6438 }, { "epoch": 2.3777695716395866, "grad_norm": 0.256783664226532, "learning_rate": 4.150757482448578e-05, "loss": 0.1706, "step": 6439 }, { "epoch": 2.3781388478581977, "grad_norm": 0.238917276263237, "learning_rate": 4.148294124892228e-05, "loss": 0.1494, "step": 6440 }, { "epoch": 2.3785081240768093, "grad_norm": 0.24925558269023895, "learning_rate": 4.145830767335879e-05, "loss": 0.1684, "step": 6441 }, { "epoch": 2.378877400295421, "grad_norm": 0.2319122850894928, "learning_rate": 4.14336740977953e-05, "loss": 0.1691, "step": 6442 }, { "epoch": 2.3792466765140325, "grad_norm": 0.32297733426094055, "learning_rate": 4.14090405222318e-05, "loss": 0.1691, "step": 6443 }, { "epoch": 2.379615952732644, "grad_norm": 0.3200797140598297, "learning_rate": 4.138440694666831e-05, "loss": 0.1854, "step": 6444 }, { "epoch": 2.3799852289512557, "grad_norm": 0.31530579924583435, "learning_rate": 4.1359773371104815e-05, "loss": 0.19, "step": 6445 }, { "epoch": 2.380354505169867, "grad_norm": 0.2863091826438904, "learning_rate": 4.133513979554132e-05, "loss": 0.1689, "step": 6446 }, { "epoch": 2.3807237813884785, "grad_norm": 0.2543277442455292, "learning_rate": 4.131050621997783e-05, "loss": 0.1662, "step": 6447 }, { "epoch": 2.38109305760709, "grad_norm": 0.2751203179359436, "learning_rate": 4.128587264441434e-05, "loss": 0.1662, "step": 6448 }, { "epoch": 2.3814623338257017, "grad_norm": 0.27884477376937866, "learning_rate": 4.126123906885084e-05, "loss": 0.1769, "step": 6449 }, { "epoch": 2.3818316100443133, "grad_norm": 0.29122814536094666, "learning_rate": 4.123660549328735e-05, "loss": 0.1839, "step": 6450 }, { "epoch": 2.3818316100443133, "eval_loss": 0.25416862964630127, "eval_runtime": 5.849, "eval_samples_per_second": 8.549, "eval_steps_per_second": 1.197, "step": 6450 }, { "epoch": 2.3822008862629245, "grad_norm": 0.42802420258522034, "learning_rate": 4.1211971917723856e-05, "loss": 0.1906, "step": 6451 }, { "epoch": 2.382570162481536, "grad_norm": 0.29035428166389465, "learning_rate": 4.1187338342160364e-05, "loss": 0.1753, "step": 6452 }, { "epoch": 2.3829394387001477, "grad_norm": 0.22821110486984253, "learning_rate": 4.116270476659687e-05, "loss": 0.1688, "step": 6453 }, { "epoch": 2.3833087149187593, "grad_norm": 0.2822718918323517, "learning_rate": 4.113807119103338e-05, "loss": 0.1432, "step": 6454 }, { "epoch": 2.383677991137371, "grad_norm": 0.2678989768028259, "learning_rate": 4.111343761546989e-05, "loss": 0.1625, "step": 6455 }, { "epoch": 2.3840472673559825, "grad_norm": 0.29239994287490845, "learning_rate": 4.1088804039906395e-05, "loss": 0.1765, "step": 6456 }, { "epoch": 2.3844165435745936, "grad_norm": 0.23325029015541077, "learning_rate": 4.1064170464342896e-05, "loss": 0.1488, "step": 6457 }, { "epoch": 2.3847858197932053, "grad_norm": 0.27885130047798157, "learning_rate": 4.1039536888779404e-05, "loss": 0.1868, "step": 6458 }, { "epoch": 2.385155096011817, "grad_norm": 0.21699808537960052, "learning_rate": 4.101490331321591e-05, "loss": 0.1384, "step": 6459 }, { "epoch": 2.3855243722304285, "grad_norm": 0.2940004765987396, "learning_rate": 4.099026973765242e-05, "loss": 0.1505, "step": 6460 }, { "epoch": 2.38589364844904, "grad_norm": 0.2605478763580322, "learning_rate": 4.096563616208893e-05, "loss": 0.1713, "step": 6461 }, { "epoch": 2.386262924667651, "grad_norm": 0.23694145679473877, "learning_rate": 4.0941002586525435e-05, "loss": 0.1476, "step": 6462 }, { "epoch": 2.386632200886263, "grad_norm": 0.3237950801849365, "learning_rate": 4.091636901096194e-05, "loss": 0.1542, "step": 6463 }, { "epoch": 2.3870014771048744, "grad_norm": 0.2576119601726532, "learning_rate": 4.089173543539845e-05, "loss": 0.1495, "step": 6464 }, { "epoch": 2.387370753323486, "grad_norm": 0.3325550854206085, "learning_rate": 4.086710185983495e-05, "loss": 0.1724, "step": 6465 }, { "epoch": 2.3877400295420976, "grad_norm": 0.3270372748374939, "learning_rate": 4.084246828427146e-05, "loss": 0.1901, "step": 6466 }, { "epoch": 2.3881093057607092, "grad_norm": 0.2555522918701172, "learning_rate": 4.081783470870797e-05, "loss": 0.1482, "step": 6467 }, { "epoch": 2.3884785819793204, "grad_norm": 0.3293311893939972, "learning_rate": 4.0793201133144476e-05, "loss": 0.191, "step": 6468 }, { "epoch": 2.388847858197932, "grad_norm": 0.3073927164077759, "learning_rate": 4.0768567557580984e-05, "loss": 0.1628, "step": 6469 }, { "epoch": 2.3892171344165436, "grad_norm": 0.24122124910354614, "learning_rate": 4.074393398201749e-05, "loss": 0.1705, "step": 6470 }, { "epoch": 2.389586410635155, "grad_norm": 0.318040132522583, "learning_rate": 4.0719300406454e-05, "loss": 0.2037, "step": 6471 }, { "epoch": 2.389955686853767, "grad_norm": 0.2927151322364807, "learning_rate": 4.069466683089051e-05, "loss": 0.1584, "step": 6472 }, { "epoch": 2.390324963072378, "grad_norm": 0.28697019815444946, "learning_rate": 4.067003325532701e-05, "loss": 0.1612, "step": 6473 }, { "epoch": 2.3906942392909896, "grad_norm": 0.2633149027824402, "learning_rate": 4.0645399679763516e-05, "loss": 0.1724, "step": 6474 }, { "epoch": 2.391063515509601, "grad_norm": 0.26802918314933777, "learning_rate": 4.0620766104200024e-05, "loss": 0.1721, "step": 6475 }, { "epoch": 2.3914327917282128, "grad_norm": 0.31744685769081116, "learning_rate": 4.059613252863653e-05, "loss": 0.1588, "step": 6476 }, { "epoch": 2.3918020679468244, "grad_norm": 0.28282245993614197, "learning_rate": 4.057149895307304e-05, "loss": 0.1743, "step": 6477 }, { "epoch": 2.392171344165436, "grad_norm": 0.280818372964859, "learning_rate": 4.054686537750955e-05, "loss": 0.1662, "step": 6478 }, { "epoch": 2.392540620384047, "grad_norm": 0.26216059923171997, "learning_rate": 4.0522231801946056e-05, "loss": 0.1816, "step": 6479 }, { "epoch": 2.3929098966026587, "grad_norm": 0.39850226044654846, "learning_rate": 4.0497598226382563e-05, "loss": 0.1928, "step": 6480 }, { "epoch": 2.3932791728212703, "grad_norm": 0.29623129963874817, "learning_rate": 4.0472964650819064e-05, "loss": 0.1813, "step": 6481 }, { "epoch": 2.393648449039882, "grad_norm": 0.41195112466812134, "learning_rate": 4.044833107525557e-05, "loss": 0.1932, "step": 6482 }, { "epoch": 2.3940177252584935, "grad_norm": 0.32941770553588867, "learning_rate": 4.042369749969208e-05, "loss": 0.1854, "step": 6483 }, { "epoch": 2.3943870014771047, "grad_norm": 0.26806557178497314, "learning_rate": 4.039906392412859e-05, "loss": 0.1528, "step": 6484 }, { "epoch": 2.3947562776957163, "grad_norm": 0.2809225916862488, "learning_rate": 4.0374430348565096e-05, "loss": 0.1575, "step": 6485 }, { "epoch": 2.395125553914328, "grad_norm": 0.30107736587524414, "learning_rate": 4.0349796773001604e-05, "loss": 0.1745, "step": 6486 }, { "epoch": 2.3954948301329395, "grad_norm": 0.2642914950847626, "learning_rate": 4.032516319743811e-05, "loss": 0.1542, "step": 6487 }, { "epoch": 2.395864106351551, "grad_norm": 0.26644206047058105, "learning_rate": 4.030052962187462e-05, "loss": 0.1712, "step": 6488 }, { "epoch": 2.3962333825701627, "grad_norm": 0.2601780295372009, "learning_rate": 4.027589604631112e-05, "loss": 0.1609, "step": 6489 }, { "epoch": 2.396602658788774, "grad_norm": 0.2623216509819031, "learning_rate": 4.025126247074763e-05, "loss": 0.1742, "step": 6490 }, { "epoch": 2.3969719350073855, "grad_norm": 0.23525527119636536, "learning_rate": 4.0226628895184136e-05, "loss": 0.1732, "step": 6491 }, { "epoch": 2.397341211225997, "grad_norm": 0.2936418354511261, "learning_rate": 4.0201995319620644e-05, "loss": 0.1575, "step": 6492 }, { "epoch": 2.3977104874446087, "grad_norm": 0.21576356887817383, "learning_rate": 4.017736174405715e-05, "loss": 0.1478, "step": 6493 }, { "epoch": 2.39807976366322, "grad_norm": 0.283373087644577, "learning_rate": 4.015272816849366e-05, "loss": 0.1716, "step": 6494 }, { "epoch": 2.3984490398818314, "grad_norm": 0.24093297123908997, "learning_rate": 4.012809459293017e-05, "loss": 0.1582, "step": 6495 }, { "epoch": 2.398818316100443, "grad_norm": 0.3448116183280945, "learning_rate": 4.0103461017366676e-05, "loss": 0.1793, "step": 6496 }, { "epoch": 2.3991875923190547, "grad_norm": 0.27197304368019104, "learning_rate": 4.007882744180318e-05, "loss": 0.1705, "step": 6497 }, { "epoch": 2.3995568685376663, "grad_norm": 0.2958301305770874, "learning_rate": 4.0054193866239685e-05, "loss": 0.1585, "step": 6498 }, { "epoch": 2.399926144756278, "grad_norm": 0.23141135275363922, "learning_rate": 4.002956029067619e-05, "loss": 0.1442, "step": 6499 }, { "epoch": 2.4002954209748895, "grad_norm": 0.27087700366973877, "learning_rate": 4.00049267151127e-05, "loss": 0.1608, "step": 6500 }, { "epoch": 2.4002954209748895, "eval_loss": 0.254189670085907, "eval_runtime": 5.8593, "eval_samples_per_second": 8.533, "eval_steps_per_second": 1.195, "step": 6500 }, { "epoch": 2.4006646971935006, "grad_norm": 0.3575212061405182, "learning_rate": 3.998029313954921e-05, "loss": 0.1717, "step": 6501 }, { "epoch": 2.401033973412112, "grad_norm": 0.2458181381225586, "learning_rate": 3.9955659563985716e-05, "loss": 0.1431, "step": 6502 }, { "epoch": 2.401403249630724, "grad_norm": 0.2927877902984619, "learning_rate": 3.9931025988422224e-05, "loss": 0.1897, "step": 6503 }, { "epoch": 2.4017725258493354, "grad_norm": 0.2803095877170563, "learning_rate": 3.990639241285873e-05, "loss": 0.1416, "step": 6504 }, { "epoch": 2.4021418020679466, "grad_norm": 0.28265419602394104, "learning_rate": 3.988175883729523e-05, "loss": 0.1776, "step": 6505 }, { "epoch": 2.402511078286558, "grad_norm": 0.2402728646993637, "learning_rate": 3.985712526173174e-05, "loss": 0.1624, "step": 6506 }, { "epoch": 2.40288035450517, "grad_norm": 0.26712340116500854, "learning_rate": 3.983249168616825e-05, "loss": 0.1635, "step": 6507 }, { "epoch": 2.4032496307237814, "grad_norm": 0.2670974135398865, "learning_rate": 3.9807858110604756e-05, "loss": 0.1851, "step": 6508 }, { "epoch": 2.403618906942393, "grad_norm": 0.27897876501083374, "learning_rate": 3.9783224535041264e-05, "loss": 0.1751, "step": 6509 }, { "epoch": 2.4039881831610046, "grad_norm": 0.2822982966899872, "learning_rate": 3.975859095947777e-05, "loss": 0.1765, "step": 6510 }, { "epoch": 2.404357459379616, "grad_norm": 0.23303090035915375, "learning_rate": 3.973395738391428e-05, "loss": 0.1455, "step": 6511 }, { "epoch": 2.4047267355982274, "grad_norm": 0.3002856373786926, "learning_rate": 3.970932380835079e-05, "loss": 0.1817, "step": 6512 }, { "epoch": 2.405096011816839, "grad_norm": 0.29189005494117737, "learning_rate": 3.968469023278729e-05, "loss": 0.1643, "step": 6513 }, { "epoch": 2.4054652880354506, "grad_norm": 0.240583598613739, "learning_rate": 3.96600566572238e-05, "loss": 0.1375, "step": 6514 }, { "epoch": 2.405834564254062, "grad_norm": 0.30848589539527893, "learning_rate": 3.9635423081660305e-05, "loss": 0.181, "step": 6515 }, { "epoch": 2.4062038404726733, "grad_norm": 0.26245778799057007, "learning_rate": 3.961078950609681e-05, "loss": 0.1549, "step": 6516 }, { "epoch": 2.406573116691285, "grad_norm": 0.27827879786491394, "learning_rate": 3.958615593053332e-05, "loss": 0.1605, "step": 6517 }, { "epoch": 2.4069423929098965, "grad_norm": 0.2480623573064804, "learning_rate": 3.956152235496983e-05, "loss": 0.1574, "step": 6518 }, { "epoch": 2.407311669128508, "grad_norm": 0.22942690551280975, "learning_rate": 3.9536888779406336e-05, "loss": 0.1683, "step": 6519 }, { "epoch": 2.4076809453471197, "grad_norm": 0.24263231456279755, "learning_rate": 3.951225520384284e-05, "loss": 0.1525, "step": 6520 }, { "epoch": 2.4080502215657313, "grad_norm": 0.2355821132659912, "learning_rate": 3.9487621628279345e-05, "loss": 0.1488, "step": 6521 }, { "epoch": 2.4084194977843425, "grad_norm": 0.3792753219604492, "learning_rate": 3.946298805271585e-05, "loss": 0.1787, "step": 6522 }, { "epoch": 2.408788774002954, "grad_norm": 0.2381313145160675, "learning_rate": 3.943835447715236e-05, "loss": 0.1646, "step": 6523 }, { "epoch": 2.4091580502215657, "grad_norm": 0.34551018476486206, "learning_rate": 3.941372090158887e-05, "loss": 0.168, "step": 6524 }, { "epoch": 2.4095273264401773, "grad_norm": 0.34995561838150024, "learning_rate": 3.9389087326025377e-05, "loss": 0.2189, "step": 6525 }, { "epoch": 2.409896602658789, "grad_norm": 0.3129412531852722, "learning_rate": 3.9364453750461884e-05, "loss": 0.1812, "step": 6526 }, { "epoch": 2.4102658788774, "grad_norm": 0.2410811483860016, "learning_rate": 3.933982017489839e-05, "loss": 0.1596, "step": 6527 }, { "epoch": 2.4106351550960117, "grad_norm": 0.3152921497821808, "learning_rate": 3.9315186599334893e-05, "loss": 0.2059, "step": 6528 }, { "epoch": 2.4110044313146233, "grad_norm": 0.2665098309516907, "learning_rate": 3.92905530237714e-05, "loss": 0.1669, "step": 6529 }, { "epoch": 2.411373707533235, "grad_norm": 0.2058529108762741, "learning_rate": 3.926591944820791e-05, "loss": 0.1531, "step": 6530 }, { "epoch": 2.4117429837518465, "grad_norm": 0.2851026654243469, "learning_rate": 3.924128587264442e-05, "loss": 0.1792, "step": 6531 }, { "epoch": 2.412112259970458, "grad_norm": 0.39070722460746765, "learning_rate": 3.9216652297080925e-05, "loss": 0.1588, "step": 6532 }, { "epoch": 2.4124815361890692, "grad_norm": 0.33281734585762024, "learning_rate": 3.919201872151743e-05, "loss": 0.1958, "step": 6533 }, { "epoch": 2.412850812407681, "grad_norm": 0.3490018844604492, "learning_rate": 3.916738514595394e-05, "loss": 0.1819, "step": 6534 }, { "epoch": 2.4132200886262924, "grad_norm": 0.2670537531375885, "learning_rate": 3.914275157039045e-05, "loss": 0.1873, "step": 6535 }, { "epoch": 2.413589364844904, "grad_norm": 0.2629680633544922, "learning_rate": 3.911811799482695e-05, "loss": 0.1654, "step": 6536 }, { "epoch": 2.4139586410635157, "grad_norm": 0.2617724537849426, "learning_rate": 3.909348441926346e-05, "loss": 0.1445, "step": 6537 }, { "epoch": 2.414327917282127, "grad_norm": 0.31672918796539307, "learning_rate": 3.9068850843699965e-05, "loss": 0.1756, "step": 6538 }, { "epoch": 2.4146971935007384, "grad_norm": 0.32156234979629517, "learning_rate": 3.904421726813647e-05, "loss": 0.1832, "step": 6539 }, { "epoch": 2.41506646971935, "grad_norm": 0.2983616590499878, "learning_rate": 3.901958369257298e-05, "loss": 0.1646, "step": 6540 }, { "epoch": 2.4154357459379616, "grad_norm": 0.24603486061096191, "learning_rate": 3.899495011700949e-05, "loss": 0.155, "step": 6541 }, { "epoch": 2.4158050221565732, "grad_norm": 0.25211790204048157, "learning_rate": 3.8970316541446e-05, "loss": 0.1562, "step": 6542 }, { "epoch": 2.416174298375185, "grad_norm": 0.25376376509666443, "learning_rate": 3.8945682965882505e-05, "loss": 0.1617, "step": 6543 }, { "epoch": 2.416543574593796, "grad_norm": 0.26741838455200195, "learning_rate": 3.8921049390319006e-05, "loss": 0.1598, "step": 6544 }, { "epoch": 2.4169128508124076, "grad_norm": 0.2889264225959778, "learning_rate": 3.8896415814755514e-05, "loss": 0.176, "step": 6545 }, { "epoch": 2.417282127031019, "grad_norm": 0.2853246033191681, "learning_rate": 3.887178223919202e-05, "loss": 0.1771, "step": 6546 }, { "epoch": 2.417651403249631, "grad_norm": 0.2737523913383484, "learning_rate": 3.884714866362853e-05, "loss": 0.1648, "step": 6547 }, { "epoch": 2.4180206794682424, "grad_norm": 0.28757739067077637, "learning_rate": 3.882251508806504e-05, "loss": 0.1731, "step": 6548 }, { "epoch": 2.4183899556868536, "grad_norm": 0.24770046770572662, "learning_rate": 3.8797881512501545e-05, "loss": 0.1468, "step": 6549 }, { "epoch": 2.418759231905465, "grad_norm": 0.285557359457016, "learning_rate": 3.877324793693805e-05, "loss": 0.1983, "step": 6550 }, { "epoch": 2.418759231905465, "eval_loss": 0.25278323888778687, "eval_runtime": 5.8555, "eval_samples_per_second": 8.539, "eval_steps_per_second": 1.195, "step": 6550 }, { "epoch": 2.4191285081240768, "grad_norm": 0.24747471511363983, "learning_rate": 3.874861436137456e-05, "loss": 0.174, "step": 6551 }, { "epoch": 2.4194977843426884, "grad_norm": 0.2874555289745331, "learning_rate": 3.872398078581106e-05, "loss": 0.1753, "step": 6552 }, { "epoch": 2.4198670605613, "grad_norm": 0.25234147906303406, "learning_rate": 3.869934721024757e-05, "loss": 0.1617, "step": 6553 }, { "epoch": 2.4202363367799116, "grad_norm": 0.29611289501190186, "learning_rate": 3.867471363468408e-05, "loss": 0.168, "step": 6554 }, { "epoch": 2.4206056129985227, "grad_norm": 0.3307589888572693, "learning_rate": 3.8650080059120585e-05, "loss": 0.1818, "step": 6555 }, { "epoch": 2.4209748892171343, "grad_norm": 0.2812064290046692, "learning_rate": 3.862544648355709e-05, "loss": 0.1712, "step": 6556 }, { "epoch": 2.421344165435746, "grad_norm": 0.26278820633888245, "learning_rate": 3.86008129079936e-05, "loss": 0.166, "step": 6557 }, { "epoch": 2.4217134416543575, "grad_norm": 0.2416645884513855, "learning_rate": 3.857617933243011e-05, "loss": 0.1666, "step": 6558 }, { "epoch": 2.422082717872969, "grad_norm": 0.2979586720466614, "learning_rate": 3.855154575686661e-05, "loss": 0.1706, "step": 6559 }, { "epoch": 2.4224519940915803, "grad_norm": 0.22586217522621155, "learning_rate": 3.852691218130312e-05, "loss": 0.1656, "step": 6560 }, { "epoch": 2.422821270310192, "grad_norm": 0.26606011390686035, "learning_rate": 3.850227860573962e-05, "loss": 0.1705, "step": 6561 }, { "epoch": 2.4231905465288035, "grad_norm": 0.2324720025062561, "learning_rate": 3.847764503017613e-05, "loss": 0.1503, "step": 6562 }, { "epoch": 2.423559822747415, "grad_norm": 0.28596732020378113, "learning_rate": 3.8453011454612635e-05, "loss": 0.139, "step": 6563 }, { "epoch": 2.4239290989660267, "grad_norm": 0.2692070007324219, "learning_rate": 3.842837787904914e-05, "loss": 0.1846, "step": 6564 }, { "epoch": 2.4242983751846383, "grad_norm": 0.27173539996147156, "learning_rate": 3.840374430348565e-05, "loss": 0.1612, "step": 6565 }, { "epoch": 2.4246676514032495, "grad_norm": 0.2125542312860489, "learning_rate": 3.837911072792216e-05, "loss": 0.1514, "step": 6566 }, { "epoch": 2.425036927621861, "grad_norm": 0.27082082629203796, "learning_rate": 3.8354477152358666e-05, "loss": 0.1656, "step": 6567 }, { "epoch": 2.4254062038404727, "grad_norm": 0.2991025745868683, "learning_rate": 3.8329843576795174e-05, "loss": 0.1816, "step": 6568 }, { "epoch": 2.4257754800590843, "grad_norm": 0.29907068610191345, "learning_rate": 3.8305210001231675e-05, "loss": 0.1679, "step": 6569 }, { "epoch": 2.426144756277696, "grad_norm": 0.25698262453079224, "learning_rate": 3.828057642566818e-05, "loss": 0.1555, "step": 6570 }, { "epoch": 2.426514032496307, "grad_norm": 0.36471596360206604, "learning_rate": 3.825594285010469e-05, "loss": 0.2193, "step": 6571 }, { "epoch": 2.4268833087149186, "grad_norm": 0.2976161241531372, "learning_rate": 3.82313092745412e-05, "loss": 0.147, "step": 6572 }, { "epoch": 2.4272525849335302, "grad_norm": 0.28336474299430847, "learning_rate": 3.8206675698977707e-05, "loss": 0.1466, "step": 6573 }, { "epoch": 2.427621861152142, "grad_norm": 0.28433650732040405, "learning_rate": 3.8182042123414214e-05, "loss": 0.1735, "step": 6574 }, { "epoch": 2.4279911373707534, "grad_norm": 0.2402041107416153, "learning_rate": 3.815740854785072e-05, "loss": 0.1596, "step": 6575 }, { "epoch": 2.428360413589365, "grad_norm": 0.2426626980304718, "learning_rate": 3.813277497228723e-05, "loss": 0.1512, "step": 6576 }, { "epoch": 2.428729689807976, "grad_norm": 0.26151448488235474, "learning_rate": 3.810814139672373e-05, "loss": 0.1678, "step": 6577 }, { "epoch": 2.429098966026588, "grad_norm": 0.2676202356815338, "learning_rate": 3.808350782116024e-05, "loss": 0.1432, "step": 6578 }, { "epoch": 2.4294682422451994, "grad_norm": 0.22569593787193298, "learning_rate": 3.805887424559675e-05, "loss": 0.1599, "step": 6579 }, { "epoch": 2.429837518463811, "grad_norm": 0.27633586525917053, "learning_rate": 3.8034240670033255e-05, "loss": 0.169, "step": 6580 }, { "epoch": 2.4302067946824226, "grad_norm": 0.2126288115978241, "learning_rate": 3.800960709446976e-05, "loss": 0.1438, "step": 6581 }, { "epoch": 2.430576070901034, "grad_norm": 0.30395951867103577, "learning_rate": 3.798497351890627e-05, "loss": 0.182, "step": 6582 }, { "epoch": 2.4309453471196454, "grad_norm": 0.3230239748954773, "learning_rate": 3.796033994334278e-05, "loss": 0.1505, "step": 6583 }, { "epoch": 2.431314623338257, "grad_norm": 0.46397629380226135, "learning_rate": 3.7935706367779286e-05, "loss": 0.2115, "step": 6584 }, { "epoch": 2.4316838995568686, "grad_norm": 0.260724276304245, "learning_rate": 3.791107279221579e-05, "loss": 0.1585, "step": 6585 }, { "epoch": 2.43205317577548, "grad_norm": 0.2859072685241699, "learning_rate": 3.7886439216652295e-05, "loss": 0.165, "step": 6586 }, { "epoch": 2.432422451994092, "grad_norm": 0.24744963645935059, "learning_rate": 3.78618056410888e-05, "loss": 0.1487, "step": 6587 }, { "epoch": 2.432791728212703, "grad_norm": 0.22620677947998047, "learning_rate": 3.783717206552531e-05, "loss": 0.1647, "step": 6588 }, { "epoch": 2.4331610044313146, "grad_norm": 0.2779679000377655, "learning_rate": 3.781253848996182e-05, "loss": 0.2016, "step": 6589 }, { "epoch": 2.433530280649926, "grad_norm": 0.33739492297172546, "learning_rate": 3.778790491439833e-05, "loss": 0.1703, "step": 6590 }, { "epoch": 2.4338995568685378, "grad_norm": 0.2833116054534912, "learning_rate": 3.7763271338834835e-05, "loss": 0.1663, "step": 6591 }, { "epoch": 2.4342688330871494, "grad_norm": 0.26829156279563904, "learning_rate": 3.773863776327134e-05, "loss": 0.1442, "step": 6592 }, { "epoch": 2.4346381093057605, "grad_norm": 0.27546414732933044, "learning_rate": 3.7714004187707844e-05, "loss": 0.1851, "step": 6593 }, { "epoch": 2.435007385524372, "grad_norm": 0.22478356957435608, "learning_rate": 3.768937061214435e-05, "loss": 0.1704, "step": 6594 }, { "epoch": 2.4353766617429837, "grad_norm": 0.31221652030944824, "learning_rate": 3.766473703658086e-05, "loss": 0.1902, "step": 6595 }, { "epoch": 2.4357459379615953, "grad_norm": 0.29108351469039917, "learning_rate": 3.764010346101737e-05, "loss": 0.1539, "step": 6596 }, { "epoch": 2.436115214180207, "grad_norm": 0.2577853798866272, "learning_rate": 3.7615469885453875e-05, "loss": 0.1775, "step": 6597 }, { "epoch": 2.4364844903988185, "grad_norm": 0.29579925537109375, "learning_rate": 3.759083630989038e-05, "loss": 0.1762, "step": 6598 }, { "epoch": 2.4368537666174297, "grad_norm": 0.2631477415561676, "learning_rate": 3.756620273432689e-05, "loss": 0.1661, "step": 6599 }, { "epoch": 2.4372230428360413, "grad_norm": 0.2739509344100952, "learning_rate": 3.75415691587634e-05, "loss": 0.1698, "step": 6600 }, { "epoch": 2.4372230428360413, "eval_loss": 0.25389644503593445, "eval_runtime": 5.8427, "eval_samples_per_second": 8.558, "eval_steps_per_second": 1.198, "step": 6600 }, { "epoch": 2.437592319054653, "grad_norm": 0.36615848541259766, "learning_rate": 3.75169355831999e-05, "loss": 0.1729, "step": 6601 }, { "epoch": 2.4379615952732645, "grad_norm": 0.27296942472457886, "learning_rate": 3.749230200763641e-05, "loss": 0.1624, "step": 6602 }, { "epoch": 2.438330871491876, "grad_norm": 0.27478161454200745, "learning_rate": 3.7467668432072915e-05, "loss": 0.1428, "step": 6603 }, { "epoch": 2.4387001477104873, "grad_norm": 0.28324443101882935, "learning_rate": 3.744303485650942e-05, "loss": 0.1883, "step": 6604 }, { "epoch": 2.439069423929099, "grad_norm": 0.29894813895225525, "learning_rate": 3.741840128094593e-05, "loss": 0.1899, "step": 6605 }, { "epoch": 2.4394387001477105, "grad_norm": 0.3002159595489502, "learning_rate": 3.739376770538244e-05, "loss": 0.1571, "step": 6606 }, { "epoch": 2.439807976366322, "grad_norm": 0.274058073759079, "learning_rate": 3.736913412981895e-05, "loss": 0.1624, "step": 6607 }, { "epoch": 2.4401772525849337, "grad_norm": 0.27943941950798035, "learning_rate": 3.7344500554255455e-05, "loss": 0.1609, "step": 6608 }, { "epoch": 2.4405465288035453, "grad_norm": 0.24010521173477173, "learning_rate": 3.7319866978691956e-05, "loss": 0.1383, "step": 6609 }, { "epoch": 2.4409158050221564, "grad_norm": 0.28225481510162354, "learning_rate": 3.7295233403128464e-05, "loss": 0.172, "step": 6610 }, { "epoch": 2.441285081240768, "grad_norm": 0.26481708884239197, "learning_rate": 3.727059982756497e-05, "loss": 0.1562, "step": 6611 }, { "epoch": 2.4416543574593796, "grad_norm": 0.2832199037075043, "learning_rate": 3.724596625200148e-05, "loss": 0.1851, "step": 6612 }, { "epoch": 2.4420236336779912, "grad_norm": 0.2782345712184906, "learning_rate": 3.722133267643799e-05, "loss": 0.1718, "step": 6613 }, { "epoch": 2.442392909896603, "grad_norm": 0.28615009784698486, "learning_rate": 3.7196699100874495e-05, "loss": 0.1611, "step": 6614 }, { "epoch": 2.442762186115214, "grad_norm": 0.30237308144569397, "learning_rate": 3.7172065525311e-05, "loss": 0.1778, "step": 6615 }, { "epoch": 2.4431314623338256, "grad_norm": 0.26544320583343506, "learning_rate": 3.714743194974751e-05, "loss": 0.162, "step": 6616 }, { "epoch": 2.443500738552437, "grad_norm": 0.2903136610984802, "learning_rate": 3.712279837418401e-05, "loss": 0.1553, "step": 6617 }, { "epoch": 2.443870014771049, "grad_norm": 0.26397714018821716, "learning_rate": 3.709816479862052e-05, "loss": 0.1678, "step": 6618 }, { "epoch": 2.4442392909896604, "grad_norm": 0.2656189799308777, "learning_rate": 3.707353122305703e-05, "loss": 0.1562, "step": 6619 }, { "epoch": 2.444608567208272, "grad_norm": 0.2826843857765198, "learning_rate": 3.7048897647493536e-05, "loss": 0.1817, "step": 6620 }, { "epoch": 2.444977843426883, "grad_norm": 0.2653213441371918, "learning_rate": 3.7024264071930043e-05, "loss": 0.1675, "step": 6621 }, { "epoch": 2.445347119645495, "grad_norm": 0.30040302872657776, "learning_rate": 3.699963049636655e-05, "loss": 0.1999, "step": 6622 }, { "epoch": 2.4457163958641064, "grad_norm": 0.30907782912254333, "learning_rate": 3.697499692080306e-05, "loss": 0.1861, "step": 6623 }, { "epoch": 2.446085672082718, "grad_norm": 0.2911081612110138, "learning_rate": 3.695036334523957e-05, "loss": 0.1661, "step": 6624 }, { "epoch": 2.446454948301329, "grad_norm": 0.2689531743526459, "learning_rate": 3.692572976967607e-05, "loss": 0.1687, "step": 6625 }, { "epoch": 2.4468242245199407, "grad_norm": 0.3597908914089203, "learning_rate": 3.6901096194112576e-05, "loss": 0.203, "step": 6626 }, { "epoch": 2.4471935007385524, "grad_norm": 0.2764434814453125, "learning_rate": 3.6876462618549084e-05, "loss": 0.1758, "step": 6627 }, { "epoch": 2.447562776957164, "grad_norm": 0.25817057490348816, "learning_rate": 3.685182904298559e-05, "loss": 0.158, "step": 6628 }, { "epoch": 2.4479320531757756, "grad_norm": 0.27736422419548035, "learning_rate": 3.68271954674221e-05, "loss": 0.1543, "step": 6629 }, { "epoch": 2.448301329394387, "grad_norm": 0.3356666564941406, "learning_rate": 3.680256189185861e-05, "loss": 0.172, "step": 6630 }, { "epoch": 2.4486706056129988, "grad_norm": 0.3232395350933075, "learning_rate": 3.6777928316295115e-05, "loss": 0.2051, "step": 6631 }, { "epoch": 2.44903988183161, "grad_norm": 0.3052434027194977, "learning_rate": 3.6753294740731616e-05, "loss": 0.1914, "step": 6632 }, { "epoch": 2.4494091580502215, "grad_norm": 0.3377300500869751, "learning_rate": 3.6728661165168124e-05, "loss": 0.1866, "step": 6633 }, { "epoch": 2.449778434268833, "grad_norm": 0.2937794327735901, "learning_rate": 3.670402758960463e-05, "loss": 0.1442, "step": 6634 }, { "epoch": 2.4501477104874447, "grad_norm": 0.37502825260162354, "learning_rate": 3.667939401404114e-05, "loss": 0.1719, "step": 6635 }, { "epoch": 2.450516986706056, "grad_norm": 0.25513651967048645, "learning_rate": 3.665476043847765e-05, "loss": 0.1497, "step": 6636 }, { "epoch": 2.4508862629246675, "grad_norm": 0.2909369468688965, "learning_rate": 3.6630126862914156e-05, "loss": 0.1646, "step": 6637 }, { "epoch": 2.451255539143279, "grad_norm": 0.24133659899234772, "learning_rate": 3.6605493287350664e-05, "loss": 0.1582, "step": 6638 }, { "epoch": 2.4516248153618907, "grad_norm": 0.27683207392692566, "learning_rate": 3.658085971178717e-05, "loss": 0.1535, "step": 6639 }, { "epoch": 2.4519940915805023, "grad_norm": 0.2554895877838135, "learning_rate": 3.655622613622367e-05, "loss": 0.1641, "step": 6640 }, { "epoch": 2.452363367799114, "grad_norm": 0.3127109408378601, "learning_rate": 3.653159256066018e-05, "loss": 0.1911, "step": 6641 }, { "epoch": 2.4527326440177255, "grad_norm": 0.28088274598121643, "learning_rate": 3.650695898509669e-05, "loss": 0.184, "step": 6642 }, { "epoch": 2.4531019202363367, "grad_norm": 0.29056841135025024, "learning_rate": 3.6482325409533196e-05, "loss": 0.1691, "step": 6643 }, { "epoch": 2.4534711964549483, "grad_norm": 0.24302606284618378, "learning_rate": 3.6457691833969704e-05, "loss": 0.1638, "step": 6644 }, { "epoch": 2.45384047267356, "grad_norm": 0.2458467334508896, "learning_rate": 3.643305825840621e-05, "loss": 0.1537, "step": 6645 }, { "epoch": 2.4542097488921715, "grad_norm": 0.3112828731536865, "learning_rate": 3.640842468284272e-05, "loss": 0.1683, "step": 6646 }, { "epoch": 2.4545790251107826, "grad_norm": 0.2735547423362732, "learning_rate": 3.638379110727923e-05, "loss": 0.1795, "step": 6647 }, { "epoch": 2.4549483013293942, "grad_norm": 0.2628132700920105, "learning_rate": 3.635915753171573e-05, "loss": 0.1769, "step": 6648 }, { "epoch": 2.455317577548006, "grad_norm": 0.3441419303417206, "learning_rate": 3.6334523956152236e-05, "loss": 0.1899, "step": 6649 }, { "epoch": 2.4556868537666174, "grad_norm": 0.31522902846336365, "learning_rate": 3.6309890380588744e-05, "loss": 0.2296, "step": 6650 }, { "epoch": 2.4556868537666174, "eval_loss": 0.2550058662891388, "eval_runtime": 5.8477, "eval_samples_per_second": 8.55, "eval_steps_per_second": 1.197, "step": 6650 }, { "epoch": 2.456056129985229, "grad_norm": 0.24651025235652924, "learning_rate": 3.628525680502525e-05, "loss": 0.1598, "step": 6651 }, { "epoch": 2.4564254062038406, "grad_norm": 0.28866273164749146, "learning_rate": 3.626062322946176e-05, "loss": 0.1813, "step": 6652 }, { "epoch": 2.456794682422452, "grad_norm": 0.39629366993904114, "learning_rate": 3.623598965389827e-05, "loss": 0.189, "step": 6653 }, { "epoch": 2.4571639586410634, "grad_norm": 0.29161256551742554, "learning_rate": 3.6211356078334776e-05, "loss": 0.1619, "step": 6654 }, { "epoch": 2.457533234859675, "grad_norm": 0.21428142488002777, "learning_rate": 3.6186722502771284e-05, "loss": 0.1619, "step": 6655 }, { "epoch": 2.4579025110782866, "grad_norm": 0.23037759959697723, "learning_rate": 3.6162088927207785e-05, "loss": 0.1636, "step": 6656 }, { "epoch": 2.458271787296898, "grad_norm": 0.25886794924736023, "learning_rate": 3.613745535164429e-05, "loss": 0.1746, "step": 6657 }, { "epoch": 2.4586410635155094, "grad_norm": 0.27223896980285645, "learning_rate": 3.61128217760808e-05, "loss": 0.1576, "step": 6658 }, { "epoch": 2.459010339734121, "grad_norm": 0.31890416145324707, "learning_rate": 3.608818820051731e-05, "loss": 0.1725, "step": 6659 }, { "epoch": 2.4593796159527326, "grad_norm": 0.2658154368400574, "learning_rate": 3.6063554624953816e-05, "loss": 0.1582, "step": 6660 }, { "epoch": 2.459748892171344, "grad_norm": 0.2270040065050125, "learning_rate": 3.6038921049390324e-05, "loss": 0.1548, "step": 6661 }, { "epoch": 2.460118168389956, "grad_norm": 0.29490986466407776, "learning_rate": 3.601428747382683e-05, "loss": 0.1735, "step": 6662 }, { "epoch": 2.4604874446085674, "grad_norm": 0.23410217463970184, "learning_rate": 3.598965389826334e-05, "loss": 0.1563, "step": 6663 }, { "epoch": 2.4608567208271785, "grad_norm": 0.2288871556520462, "learning_rate": 3.596502032269984e-05, "loss": 0.1553, "step": 6664 }, { "epoch": 2.46122599704579, "grad_norm": 0.2695292830467224, "learning_rate": 3.594038674713635e-05, "loss": 0.1783, "step": 6665 }, { "epoch": 2.4615952732644018, "grad_norm": 0.24455784261226654, "learning_rate": 3.5915753171572857e-05, "loss": 0.1736, "step": 6666 }, { "epoch": 2.4619645494830134, "grad_norm": 0.28997039794921875, "learning_rate": 3.5891119596009364e-05, "loss": 0.175, "step": 6667 }, { "epoch": 2.462333825701625, "grad_norm": 0.2736116051673889, "learning_rate": 3.586648602044587e-05, "loss": 0.1661, "step": 6668 }, { "epoch": 2.462703101920236, "grad_norm": 0.23794032633304596, "learning_rate": 3.584185244488238e-05, "loss": 0.1631, "step": 6669 }, { "epoch": 2.4630723781388477, "grad_norm": 0.2668376564979553, "learning_rate": 3.581721886931889e-05, "loss": 0.1798, "step": 6670 }, { "epoch": 2.4634416543574593, "grad_norm": 0.25068479776382446, "learning_rate": 3.5792585293755396e-05, "loss": 0.1536, "step": 6671 }, { "epoch": 2.463810930576071, "grad_norm": 0.28217265009880066, "learning_rate": 3.57679517181919e-05, "loss": 0.1651, "step": 6672 }, { "epoch": 2.4641802067946825, "grad_norm": 0.2928770184516907, "learning_rate": 3.5743318142628405e-05, "loss": 0.1854, "step": 6673 }, { "epoch": 2.464549483013294, "grad_norm": 0.2543809115886688, "learning_rate": 3.571868456706491e-05, "loss": 0.1467, "step": 6674 }, { "epoch": 2.4649187592319053, "grad_norm": 0.2677014470100403, "learning_rate": 3.5694050991501414e-05, "loss": 0.1836, "step": 6675 }, { "epoch": 2.465288035450517, "grad_norm": 0.24781964719295502, "learning_rate": 3.566941741593792e-05, "loss": 0.1594, "step": 6676 }, { "epoch": 2.4656573116691285, "grad_norm": 0.2676795721054077, "learning_rate": 3.564478384037443e-05, "loss": 0.1631, "step": 6677 }, { "epoch": 2.46602658788774, "grad_norm": 0.23839810490608215, "learning_rate": 3.562015026481094e-05, "loss": 0.145, "step": 6678 }, { "epoch": 2.4663958641063517, "grad_norm": 0.2979888916015625, "learning_rate": 3.5595516689247445e-05, "loss": 0.1834, "step": 6679 }, { "epoch": 2.466765140324963, "grad_norm": 0.2651112377643585, "learning_rate": 3.557088311368395e-05, "loss": 0.1575, "step": 6680 }, { "epoch": 2.4671344165435745, "grad_norm": 0.32368481159210205, "learning_rate": 3.5546249538120454e-05, "loss": 0.1976, "step": 6681 }, { "epoch": 2.467503692762186, "grad_norm": 0.2911125421524048, "learning_rate": 3.552161596255696e-05, "loss": 0.1685, "step": 6682 }, { "epoch": 2.4678729689807977, "grad_norm": 0.3357445299625397, "learning_rate": 3.549698238699347e-05, "loss": 0.1976, "step": 6683 }, { "epoch": 2.4682422451994093, "grad_norm": 0.3251952826976776, "learning_rate": 3.547234881142998e-05, "loss": 0.2145, "step": 6684 }, { "epoch": 2.468611521418021, "grad_norm": 0.2840099334716797, "learning_rate": 3.5447715235866486e-05, "loss": 0.1705, "step": 6685 }, { "epoch": 2.468980797636632, "grad_norm": 0.25191012024879456, "learning_rate": 3.5423081660302994e-05, "loss": 0.1557, "step": 6686 }, { "epoch": 2.4693500738552436, "grad_norm": 0.27351176738739014, "learning_rate": 3.53984480847395e-05, "loss": 0.1552, "step": 6687 }, { "epoch": 2.4697193500738552, "grad_norm": 0.3187665641307831, "learning_rate": 3.537381450917601e-05, "loss": 0.1968, "step": 6688 }, { "epoch": 2.470088626292467, "grad_norm": 0.24932366609573364, "learning_rate": 3.534918093361251e-05, "loss": 0.1718, "step": 6689 }, { "epoch": 2.4704579025110784, "grad_norm": 0.2778686583042145, "learning_rate": 3.532454735804902e-05, "loss": 0.1613, "step": 6690 }, { "epoch": 2.4708271787296896, "grad_norm": 0.22028721868991852, "learning_rate": 3.5299913782485526e-05, "loss": 0.1686, "step": 6691 }, { "epoch": 2.471196454948301, "grad_norm": 0.30668461322784424, "learning_rate": 3.5275280206922034e-05, "loss": 0.1736, "step": 6692 }, { "epoch": 2.471565731166913, "grad_norm": 0.3277914524078369, "learning_rate": 3.525064663135854e-05, "loss": 0.1869, "step": 6693 }, { "epoch": 2.4719350073855244, "grad_norm": 0.28965499997138977, "learning_rate": 3.522601305579505e-05, "loss": 0.1597, "step": 6694 }, { "epoch": 2.472304283604136, "grad_norm": 0.2699742317199707, "learning_rate": 3.520137948023156e-05, "loss": 0.181, "step": 6695 }, { "epoch": 2.4726735598227476, "grad_norm": 0.24233360588550568, "learning_rate": 3.5176745904668065e-05, "loss": 0.1433, "step": 6696 }, { "epoch": 2.4730428360413588, "grad_norm": 0.32054582238197327, "learning_rate": 3.5152112329104566e-05, "loss": 0.1659, "step": 6697 }, { "epoch": 2.4734121122599704, "grad_norm": 0.2580467164516449, "learning_rate": 3.5127478753541074e-05, "loss": 0.1577, "step": 6698 }, { "epoch": 2.473781388478582, "grad_norm": 0.2700013816356659, "learning_rate": 3.510284517797758e-05, "loss": 0.1625, "step": 6699 }, { "epoch": 2.4741506646971936, "grad_norm": 0.2962573766708374, "learning_rate": 3.507821160241409e-05, "loss": 0.1878, "step": 6700 }, { "epoch": 2.4741506646971936, "eval_loss": 0.25414416193962097, "eval_runtime": 5.8547, "eval_samples_per_second": 8.54, "eval_steps_per_second": 1.196, "step": 6700 }, { "epoch": 2.474519940915805, "grad_norm": 0.2274537831544876, "learning_rate": 3.50535780268506e-05, "loss": 0.1547, "step": 6701 }, { "epoch": 2.4748892171344163, "grad_norm": 0.22232764959335327, "learning_rate": 3.5028944451287106e-05, "loss": 0.1338, "step": 6702 }, { "epoch": 2.475258493353028, "grad_norm": 0.253587007522583, "learning_rate": 3.5004310875723614e-05, "loss": 0.1682, "step": 6703 }, { "epoch": 2.4756277695716395, "grad_norm": 0.22102652490139008, "learning_rate": 3.497967730016012e-05, "loss": 0.1486, "step": 6704 }, { "epoch": 2.475997045790251, "grad_norm": 0.2886812686920166, "learning_rate": 3.495504372459662e-05, "loss": 0.1773, "step": 6705 }, { "epoch": 2.4763663220088628, "grad_norm": 0.2909698188304901, "learning_rate": 3.493041014903313e-05, "loss": 0.1819, "step": 6706 }, { "epoch": 2.4767355982274744, "grad_norm": 0.2511613368988037, "learning_rate": 3.490577657346964e-05, "loss": 0.1841, "step": 6707 }, { "epoch": 2.4771048744460855, "grad_norm": 0.28402179479599, "learning_rate": 3.4881142997906146e-05, "loss": 0.1709, "step": 6708 }, { "epoch": 2.477474150664697, "grad_norm": 0.24826565384864807, "learning_rate": 3.4856509422342654e-05, "loss": 0.1599, "step": 6709 }, { "epoch": 2.4778434268833087, "grad_norm": 0.28324204683303833, "learning_rate": 3.483187584677916e-05, "loss": 0.1763, "step": 6710 }, { "epoch": 2.4782127031019203, "grad_norm": 0.26067647337913513, "learning_rate": 3.480724227121567e-05, "loss": 0.1672, "step": 6711 }, { "epoch": 2.478581979320532, "grad_norm": 0.2550486624240875, "learning_rate": 3.478260869565218e-05, "loss": 0.1672, "step": 6712 }, { "epoch": 2.478951255539143, "grad_norm": 0.3083842992782593, "learning_rate": 3.475797512008868e-05, "loss": 0.1878, "step": 6713 }, { "epoch": 2.4793205317577547, "grad_norm": 0.32653459906578064, "learning_rate": 3.4733341544525187e-05, "loss": 0.1996, "step": 6714 }, { "epoch": 2.4796898079763663, "grad_norm": 0.24006037414073944, "learning_rate": 3.4708707968961694e-05, "loss": 0.1663, "step": 6715 }, { "epoch": 2.480059084194978, "grad_norm": 0.27943891286849976, "learning_rate": 3.46840743933982e-05, "loss": 0.1777, "step": 6716 }, { "epoch": 2.4804283604135895, "grad_norm": 0.2611193358898163, "learning_rate": 3.465944081783471e-05, "loss": 0.1849, "step": 6717 }, { "epoch": 2.480797636632201, "grad_norm": 0.2858585715293884, "learning_rate": 3.463480724227122e-05, "loss": 0.1606, "step": 6718 }, { "epoch": 2.4811669128508123, "grad_norm": 0.28120550513267517, "learning_rate": 3.4610173666707726e-05, "loss": 0.1864, "step": 6719 }, { "epoch": 2.481536189069424, "grad_norm": 0.26465487480163574, "learning_rate": 3.4585540091144234e-05, "loss": 0.1571, "step": 6720 }, { "epoch": 2.4819054652880355, "grad_norm": 0.29018014669418335, "learning_rate": 3.4560906515580735e-05, "loss": 0.1748, "step": 6721 }, { "epoch": 2.482274741506647, "grad_norm": 0.24420779943466187, "learning_rate": 3.453627294001724e-05, "loss": 0.1589, "step": 6722 }, { "epoch": 2.4826440177252587, "grad_norm": 0.2638775110244751, "learning_rate": 3.451163936445375e-05, "loss": 0.1472, "step": 6723 }, { "epoch": 2.48301329394387, "grad_norm": 0.3645317554473877, "learning_rate": 3.448700578889026e-05, "loss": 0.1584, "step": 6724 }, { "epoch": 2.4833825701624814, "grad_norm": 0.2596351206302643, "learning_rate": 3.4462372213326766e-05, "loss": 0.1658, "step": 6725 }, { "epoch": 2.483751846381093, "grad_norm": 0.24260061979293823, "learning_rate": 3.4437738637763274e-05, "loss": 0.1519, "step": 6726 }, { "epoch": 2.4841211225997046, "grad_norm": 0.26800626516342163, "learning_rate": 3.441310506219978e-05, "loss": 0.1424, "step": 6727 }, { "epoch": 2.4844903988183162, "grad_norm": 0.313998818397522, "learning_rate": 3.438847148663629e-05, "loss": 0.1915, "step": 6728 }, { "epoch": 2.484859675036928, "grad_norm": 0.2721114158630371, "learning_rate": 3.436383791107279e-05, "loss": 0.1569, "step": 6729 }, { "epoch": 2.485228951255539, "grad_norm": 0.29268237948417664, "learning_rate": 3.43392043355093e-05, "loss": 0.1669, "step": 6730 }, { "epoch": 2.4855982274741506, "grad_norm": 0.2563331425189972, "learning_rate": 3.431457075994581e-05, "loss": 0.1583, "step": 6731 }, { "epoch": 2.485967503692762, "grad_norm": 0.3128591477870941, "learning_rate": 3.4289937184382315e-05, "loss": 0.1856, "step": 6732 }, { "epoch": 2.486336779911374, "grad_norm": 0.28177300095558167, "learning_rate": 3.426530360881882e-05, "loss": 0.1829, "step": 6733 }, { "epoch": 2.4867060561299854, "grad_norm": 0.26359695196151733, "learning_rate": 3.424067003325533e-05, "loss": 0.1609, "step": 6734 }, { "epoch": 2.4870753323485966, "grad_norm": 0.29498758912086487, "learning_rate": 3.421603645769184e-05, "loss": 0.1838, "step": 6735 }, { "epoch": 2.487444608567208, "grad_norm": 0.2749401926994324, "learning_rate": 3.4191402882128346e-05, "loss": 0.1724, "step": 6736 }, { "epoch": 2.4878138847858198, "grad_norm": 0.2710501551628113, "learning_rate": 3.416676930656485e-05, "loss": 0.1519, "step": 6737 }, { "epoch": 2.4881831610044314, "grad_norm": 0.24372336268424988, "learning_rate": 3.4142135731001355e-05, "loss": 0.1658, "step": 6738 }, { "epoch": 2.488552437223043, "grad_norm": 0.3472476005554199, "learning_rate": 3.411750215543786e-05, "loss": 0.1919, "step": 6739 }, { "epoch": 2.4889217134416546, "grad_norm": 0.28122809529304504, "learning_rate": 3.409286857987437e-05, "loss": 0.1785, "step": 6740 }, { "epoch": 2.4892909896602657, "grad_norm": 0.3095254600048065, "learning_rate": 3.406823500431088e-05, "loss": 0.1736, "step": 6741 }, { "epoch": 2.4896602658788773, "grad_norm": 0.2329387664794922, "learning_rate": 3.4043601428747386e-05, "loss": 0.1461, "step": 6742 }, { "epoch": 2.490029542097489, "grad_norm": 0.2703981399536133, "learning_rate": 3.4018967853183894e-05, "loss": 0.1653, "step": 6743 }, { "epoch": 2.4903988183161005, "grad_norm": 0.29018229246139526, "learning_rate": 3.39943342776204e-05, "loss": 0.1673, "step": 6744 }, { "epoch": 2.490768094534712, "grad_norm": 0.2717873156070709, "learning_rate": 3.39697007020569e-05, "loss": 0.1883, "step": 6745 }, { "epoch": 2.4911373707533233, "grad_norm": 0.32639437913894653, "learning_rate": 3.394506712649341e-05, "loss": 0.1821, "step": 6746 }, { "epoch": 2.491506646971935, "grad_norm": 0.29910680651664734, "learning_rate": 3.392043355092992e-05, "loss": 0.1558, "step": 6747 }, { "epoch": 2.4918759231905465, "grad_norm": 0.2277168482542038, "learning_rate": 3.389579997536643e-05, "loss": 0.1595, "step": 6748 }, { "epoch": 2.492245199409158, "grad_norm": 0.26746252179145813, "learning_rate": 3.3871166399802935e-05, "loss": 0.1786, "step": 6749 }, { "epoch": 2.4926144756277697, "grad_norm": 0.263317346572876, "learning_rate": 3.384653282423944e-05, "loss": 0.1937, "step": 6750 }, { "epoch": 2.4926144756277697, "eval_loss": 0.2532287538051605, "eval_runtime": 5.8572, "eval_samples_per_second": 8.537, "eval_steps_per_second": 1.195, "step": 6750 }, { "epoch": 2.4929837518463813, "grad_norm": 0.26293596625328064, "learning_rate": 3.382189924867595e-05, "loss": 0.1526, "step": 6751 }, { "epoch": 2.4933530280649925, "grad_norm": 0.2595967948436737, "learning_rate": 3.379726567311245e-05, "loss": 0.1743, "step": 6752 }, { "epoch": 2.493722304283604, "grad_norm": 0.3618583381175995, "learning_rate": 3.377263209754896e-05, "loss": 0.1784, "step": 6753 }, { "epoch": 2.4940915805022157, "grad_norm": 0.27984780073165894, "learning_rate": 3.374799852198547e-05, "loss": 0.1768, "step": 6754 }, { "epoch": 2.4944608567208273, "grad_norm": 0.26132699847221375, "learning_rate": 3.3723364946421975e-05, "loss": 0.1723, "step": 6755 }, { "epoch": 2.494830132939439, "grad_norm": 0.3069780468940735, "learning_rate": 3.369873137085848e-05, "loss": 0.1671, "step": 6756 }, { "epoch": 2.49519940915805, "grad_norm": 0.30445432662963867, "learning_rate": 3.367409779529499e-05, "loss": 0.1514, "step": 6757 }, { "epoch": 2.4955686853766617, "grad_norm": 0.25964003801345825, "learning_rate": 3.36494642197315e-05, "loss": 0.1521, "step": 6758 }, { "epoch": 2.4959379615952733, "grad_norm": 0.2943986654281616, "learning_rate": 3.3624830644168007e-05, "loss": 0.1674, "step": 6759 }, { "epoch": 2.496307237813885, "grad_norm": 0.33864790201187134, "learning_rate": 3.360019706860451e-05, "loss": 0.1912, "step": 6760 }, { "epoch": 2.4966765140324965, "grad_norm": 0.26427775621414185, "learning_rate": 3.3575563493041016e-05, "loss": 0.1651, "step": 6761 }, { "epoch": 2.497045790251108, "grad_norm": 0.30823659896850586, "learning_rate": 3.355092991747752e-05, "loss": 0.1531, "step": 6762 }, { "epoch": 2.4974150664697192, "grad_norm": 0.2890568673610687, "learning_rate": 3.352629634191403e-05, "loss": 0.1771, "step": 6763 }, { "epoch": 2.497784342688331, "grad_norm": 0.2550124526023865, "learning_rate": 3.350166276635054e-05, "loss": 0.1583, "step": 6764 }, { "epoch": 2.4981536189069424, "grad_norm": 0.22946004569530487, "learning_rate": 3.347702919078705e-05, "loss": 0.1192, "step": 6765 }, { "epoch": 2.498522895125554, "grad_norm": 0.24715645611286163, "learning_rate": 3.3452395615223555e-05, "loss": 0.1557, "step": 6766 }, { "epoch": 2.498892171344165, "grad_norm": 0.21769702434539795, "learning_rate": 3.342776203966006e-05, "loss": 0.1542, "step": 6767 }, { "epoch": 2.499261447562777, "grad_norm": 0.2710095942020416, "learning_rate": 3.3403128464096564e-05, "loss": 0.1518, "step": 6768 }, { "epoch": 2.4996307237813884, "grad_norm": 0.2763799726963043, "learning_rate": 3.337849488853307e-05, "loss": 0.1562, "step": 6769 }, { "epoch": 2.5, "grad_norm": 0.28227436542510986, "learning_rate": 3.335386131296958e-05, "loss": 0.1817, "step": 6770 }, { "epoch": 2.5003692762186116, "grad_norm": 0.2575850784778595, "learning_rate": 3.332922773740609e-05, "loss": 0.1575, "step": 6771 }, { "epoch": 2.500738552437223, "grad_norm": 0.3194829523563385, "learning_rate": 3.3304594161842595e-05, "loss": 0.1685, "step": 6772 }, { "epoch": 2.501107828655835, "grad_norm": 0.31217771768569946, "learning_rate": 3.32799605862791e-05, "loss": 0.1753, "step": 6773 }, { "epoch": 2.501477104874446, "grad_norm": 0.24038547277450562, "learning_rate": 3.325532701071561e-05, "loss": 0.1728, "step": 6774 }, { "epoch": 2.5018463810930576, "grad_norm": 0.26459741592407227, "learning_rate": 3.323069343515212e-05, "loss": 0.1743, "step": 6775 }, { "epoch": 2.502215657311669, "grad_norm": 0.2336292862892151, "learning_rate": 3.320605985958862e-05, "loss": 0.1599, "step": 6776 }, { "epoch": 2.5025849335302808, "grad_norm": 0.29795417189598083, "learning_rate": 3.318142628402513e-05, "loss": 0.1805, "step": 6777 }, { "epoch": 2.502954209748892, "grad_norm": 0.25662103295326233, "learning_rate": 3.3156792708461636e-05, "loss": 0.1557, "step": 6778 }, { "epoch": 2.5033234859675035, "grad_norm": 0.29031679034233093, "learning_rate": 3.3132159132898144e-05, "loss": 0.1543, "step": 6779 }, { "epoch": 2.503692762186115, "grad_norm": 0.23315390944480896, "learning_rate": 3.310752555733465e-05, "loss": 0.1558, "step": 6780 }, { "epoch": 2.5040620384047267, "grad_norm": 0.3676280677318573, "learning_rate": 3.308289198177116e-05, "loss": 0.1895, "step": 6781 }, { "epoch": 2.5044313146233383, "grad_norm": 0.32318052649497986, "learning_rate": 3.305825840620767e-05, "loss": 0.1566, "step": 6782 }, { "epoch": 2.50480059084195, "grad_norm": 0.2895708978176117, "learning_rate": 3.3033624830644175e-05, "loss": 0.1607, "step": 6783 }, { "epoch": 2.5051698670605616, "grad_norm": 0.23804035782814026, "learning_rate": 3.3008991255080676e-05, "loss": 0.1519, "step": 6784 }, { "epoch": 2.5055391432791727, "grad_norm": 0.2892438471317291, "learning_rate": 3.2984357679517184e-05, "loss": 0.1555, "step": 6785 }, { "epoch": 2.5059084194977843, "grad_norm": 0.2570165693759918, "learning_rate": 3.295972410395369e-05, "loss": 0.1674, "step": 6786 }, { "epoch": 2.506277695716396, "grad_norm": 0.24235954880714417, "learning_rate": 3.29350905283902e-05, "loss": 0.1566, "step": 6787 }, { "epoch": 2.5066469719350075, "grad_norm": 0.2593287527561188, "learning_rate": 3.291045695282671e-05, "loss": 0.1506, "step": 6788 }, { "epoch": 2.5070162481536187, "grad_norm": 0.2729012370109558, "learning_rate": 3.2885823377263215e-05, "loss": 0.1516, "step": 6789 }, { "epoch": 2.5073855243722303, "grad_norm": 0.26710566878318787, "learning_rate": 3.286118980169972e-05, "loss": 0.1633, "step": 6790 }, { "epoch": 2.507754800590842, "grad_norm": 0.23136357963085175, "learning_rate": 3.2836556226136224e-05, "loss": 0.1524, "step": 6791 }, { "epoch": 2.5081240768094535, "grad_norm": 0.23863689601421356, "learning_rate": 3.281192265057273e-05, "loss": 0.1726, "step": 6792 }, { "epoch": 2.508493353028065, "grad_norm": 0.3121388554573059, "learning_rate": 3.278728907500923e-05, "loss": 0.1622, "step": 6793 }, { "epoch": 2.5088626292466767, "grad_norm": 0.22068506479263306, "learning_rate": 3.276265549944574e-05, "loss": 0.1393, "step": 6794 }, { "epoch": 2.5092319054652883, "grad_norm": 0.27219098806381226, "learning_rate": 3.273802192388225e-05, "loss": 0.1447, "step": 6795 }, { "epoch": 2.5096011816838995, "grad_norm": 0.273375928401947, "learning_rate": 3.271338834831876e-05, "loss": 0.1769, "step": 6796 }, { "epoch": 2.509970457902511, "grad_norm": 0.27035534381866455, "learning_rate": 3.2688754772755265e-05, "loss": 0.1785, "step": 6797 }, { "epoch": 2.5103397341211227, "grad_norm": 0.24310755729675293, "learning_rate": 3.266412119719177e-05, "loss": 0.1549, "step": 6798 }, { "epoch": 2.5107090103397343, "grad_norm": 0.2548632323741913, "learning_rate": 3.263948762162828e-05, "loss": 0.1599, "step": 6799 }, { "epoch": 2.5110782865583454, "grad_norm": 0.2806619107723236, "learning_rate": 3.261485404606479e-05, "loss": 0.1869, "step": 6800 }, { "epoch": 2.5110782865583454, "eval_loss": 0.2532981038093567, "eval_runtime": 5.8583, "eval_samples_per_second": 8.535, "eval_steps_per_second": 1.195, "step": 6800 }, { "epoch": 2.511447562776957, "grad_norm": 0.2640089988708496, "learning_rate": 3.259022047050129e-05, "loss": 0.1515, "step": 6801 }, { "epoch": 2.5118168389955686, "grad_norm": 0.32389867305755615, "learning_rate": 3.25655868949378e-05, "loss": 0.1627, "step": 6802 }, { "epoch": 2.5121861152141802, "grad_norm": 0.2391839176416397, "learning_rate": 3.2540953319374305e-05, "loss": 0.1321, "step": 6803 }, { "epoch": 2.512555391432792, "grad_norm": 0.3072627782821655, "learning_rate": 3.251631974381081e-05, "loss": 0.172, "step": 6804 }, { "epoch": 2.5129246676514034, "grad_norm": 0.2833667993545532, "learning_rate": 3.249168616824732e-05, "loss": 0.1767, "step": 6805 }, { "epoch": 2.513293943870015, "grad_norm": 0.26352235674858093, "learning_rate": 3.246705259268383e-05, "loss": 0.167, "step": 6806 }, { "epoch": 2.513663220088626, "grad_norm": 0.2721315920352936, "learning_rate": 3.2442419017120337e-05, "loss": 0.1652, "step": 6807 }, { "epoch": 2.514032496307238, "grad_norm": 0.25515738129615784, "learning_rate": 3.2417785441556844e-05, "loss": 0.1619, "step": 6808 }, { "epoch": 2.5144017725258494, "grad_norm": 0.2895338535308838, "learning_rate": 3.2393151865993346e-05, "loss": 0.2061, "step": 6809 }, { "epoch": 2.514771048744461, "grad_norm": 0.2826431393623352, "learning_rate": 3.236851829042985e-05, "loss": 0.1617, "step": 6810 }, { "epoch": 2.515140324963072, "grad_norm": 0.24174825847148895, "learning_rate": 3.234388471486636e-05, "loss": 0.1544, "step": 6811 }, { "epoch": 2.5155096011816838, "grad_norm": 0.23029516637325287, "learning_rate": 3.231925113930287e-05, "loss": 0.1458, "step": 6812 }, { "epoch": 2.5158788774002954, "grad_norm": 0.2716422975063324, "learning_rate": 3.229461756373938e-05, "loss": 0.141, "step": 6813 }, { "epoch": 2.516248153618907, "grad_norm": 0.2675827443599701, "learning_rate": 3.2269983988175885e-05, "loss": 0.1579, "step": 6814 }, { "epoch": 2.5166174298375186, "grad_norm": 0.24630804359912872, "learning_rate": 3.224535041261239e-05, "loss": 0.1351, "step": 6815 }, { "epoch": 2.51698670605613, "grad_norm": 0.2974760830402374, "learning_rate": 3.22207168370489e-05, "loss": 0.1833, "step": 6816 }, { "epoch": 2.5173559822747418, "grad_norm": 0.26326414942741394, "learning_rate": 3.21960832614854e-05, "loss": 0.1732, "step": 6817 }, { "epoch": 2.517725258493353, "grad_norm": 0.2742905616760254, "learning_rate": 3.217144968592191e-05, "loss": 0.1999, "step": 6818 }, { "epoch": 2.5180945347119645, "grad_norm": 0.26032716035842896, "learning_rate": 3.214681611035842e-05, "loss": 0.1575, "step": 6819 }, { "epoch": 2.518463810930576, "grad_norm": 0.2454797774553299, "learning_rate": 3.2122182534794925e-05, "loss": 0.1503, "step": 6820 }, { "epoch": 2.5188330871491877, "grad_norm": 0.2847702205181122, "learning_rate": 3.209754895923143e-05, "loss": 0.2125, "step": 6821 }, { "epoch": 2.519202363367799, "grad_norm": 0.24653670191764832, "learning_rate": 3.207291538366794e-05, "loss": 0.1612, "step": 6822 }, { "epoch": 2.5195716395864105, "grad_norm": 0.2155052125453949, "learning_rate": 3.204828180810445e-05, "loss": 0.166, "step": 6823 }, { "epoch": 2.519940915805022, "grad_norm": 0.2415972352027893, "learning_rate": 3.202364823254096e-05, "loss": 0.1758, "step": 6824 }, { "epoch": 2.5203101920236337, "grad_norm": 0.26860833168029785, "learning_rate": 3.199901465697746e-05, "loss": 0.164, "step": 6825 }, { "epoch": 2.5206794682422453, "grad_norm": 0.3211625814437866, "learning_rate": 3.1974381081413966e-05, "loss": 0.1647, "step": 6826 }, { "epoch": 2.521048744460857, "grad_norm": 0.21640796959400177, "learning_rate": 3.1949747505850474e-05, "loss": 0.1677, "step": 6827 }, { "epoch": 2.5214180206794685, "grad_norm": 0.23410780727863312, "learning_rate": 3.192511393028698e-05, "loss": 0.1508, "step": 6828 }, { "epoch": 2.5217872968980797, "grad_norm": 0.28496286273002625, "learning_rate": 3.190048035472349e-05, "loss": 0.1855, "step": 6829 }, { "epoch": 2.5221565731166913, "grad_norm": 0.23215197026729584, "learning_rate": 3.187584677916e-05, "loss": 0.1513, "step": 6830 }, { "epoch": 2.522525849335303, "grad_norm": 0.27465683221817017, "learning_rate": 3.1851213203596505e-05, "loss": 0.1772, "step": 6831 }, { "epoch": 2.5228951255539145, "grad_norm": 0.28835806250572205, "learning_rate": 3.182657962803301e-05, "loss": 0.162, "step": 6832 }, { "epoch": 2.5232644017725256, "grad_norm": 0.24527624249458313, "learning_rate": 3.1801946052469514e-05, "loss": 0.1605, "step": 6833 }, { "epoch": 2.5236336779911372, "grad_norm": 0.30247774720191956, "learning_rate": 3.177731247690602e-05, "loss": 0.1915, "step": 6834 }, { "epoch": 2.524002954209749, "grad_norm": 0.25916823744773865, "learning_rate": 3.175267890134253e-05, "loss": 0.1831, "step": 6835 }, { "epoch": 2.5243722304283605, "grad_norm": 0.31635192036628723, "learning_rate": 3.172804532577904e-05, "loss": 0.1859, "step": 6836 }, { "epoch": 2.524741506646972, "grad_norm": 0.2717903256416321, "learning_rate": 3.1703411750215545e-05, "loss": 0.1466, "step": 6837 }, { "epoch": 2.5251107828655837, "grad_norm": 0.2232266366481781, "learning_rate": 3.167877817465205e-05, "loss": 0.1437, "step": 6838 }, { "epoch": 2.525480059084195, "grad_norm": 0.2692876160144806, "learning_rate": 3.165414459908856e-05, "loss": 0.1638, "step": 6839 }, { "epoch": 2.5258493353028064, "grad_norm": 0.2996908128261566, "learning_rate": 3.162951102352507e-05, "loss": 0.1624, "step": 6840 }, { "epoch": 2.526218611521418, "grad_norm": 0.27915018796920776, "learning_rate": 3.160487744796157e-05, "loss": 0.1588, "step": 6841 }, { "epoch": 2.5265878877400296, "grad_norm": 0.25712302327156067, "learning_rate": 3.158024387239808e-05, "loss": 0.1813, "step": 6842 }, { "epoch": 2.5269571639586412, "grad_norm": 0.2830292582511902, "learning_rate": 3.1555610296834586e-05, "loss": 0.2051, "step": 6843 }, { "epoch": 2.5273264401772524, "grad_norm": 0.31948432326316833, "learning_rate": 3.1530976721271094e-05, "loss": 0.1748, "step": 6844 }, { "epoch": 2.527695716395864, "grad_norm": 0.279136598110199, "learning_rate": 3.15063431457076e-05, "loss": 0.1643, "step": 6845 }, { "epoch": 2.5280649926144756, "grad_norm": 0.284660279750824, "learning_rate": 3.148170957014411e-05, "loss": 0.166, "step": 6846 }, { "epoch": 2.528434268833087, "grad_norm": 0.3152167499065399, "learning_rate": 3.145707599458062e-05, "loss": 0.16, "step": 6847 }, { "epoch": 2.528803545051699, "grad_norm": 0.26597416400909424, "learning_rate": 3.1432442419017125e-05, "loss": 0.189, "step": 6848 }, { "epoch": 2.5291728212703104, "grad_norm": 0.2872331738471985, "learning_rate": 3.1407808843453626e-05, "loss": 0.1528, "step": 6849 }, { "epoch": 2.5295420974889216, "grad_norm": 0.2678552567958832, "learning_rate": 3.1383175267890134e-05, "loss": 0.1842, "step": 6850 }, { "epoch": 2.5295420974889216, "eval_loss": 0.2541038990020752, "eval_runtime": 5.8479, "eval_samples_per_second": 8.55, "eval_steps_per_second": 1.197, "step": 6850 }, { "epoch": 2.529911373707533, "grad_norm": 0.2524714469909668, "learning_rate": 3.135854169232664e-05, "loss": 0.1701, "step": 6851 }, { "epoch": 2.5302806499261448, "grad_norm": 0.3068491220474243, "learning_rate": 3.133390811676315e-05, "loss": 0.1635, "step": 6852 }, { "epoch": 2.5306499261447564, "grad_norm": 0.30152344703674316, "learning_rate": 3.130927454119966e-05, "loss": 0.162, "step": 6853 }, { "epoch": 2.5310192023633675, "grad_norm": 0.2503848671913147, "learning_rate": 3.1284640965636165e-05, "loss": 0.1773, "step": 6854 }, { "epoch": 2.531388478581979, "grad_norm": 0.281986266374588, "learning_rate": 3.126000739007267e-05, "loss": 0.1672, "step": 6855 }, { "epoch": 2.5317577548005907, "grad_norm": 0.29041624069213867, "learning_rate": 3.123537381450918e-05, "loss": 0.1654, "step": 6856 }, { "epoch": 2.5321270310192023, "grad_norm": 0.28015342354774475, "learning_rate": 3.121074023894568e-05, "loss": 0.1472, "step": 6857 }, { "epoch": 2.532496307237814, "grad_norm": 0.27722394466400146, "learning_rate": 3.118610666338219e-05, "loss": 0.1665, "step": 6858 }, { "epoch": 2.5328655834564255, "grad_norm": 0.29715755581855774, "learning_rate": 3.11614730878187e-05, "loss": 0.17, "step": 6859 }, { "epoch": 2.533234859675037, "grad_norm": 0.3651770353317261, "learning_rate": 3.1136839512255206e-05, "loss": 0.1896, "step": 6860 }, { "epoch": 2.5336041358936483, "grad_norm": 0.2718464136123657, "learning_rate": 3.1112205936691714e-05, "loss": 0.1653, "step": 6861 }, { "epoch": 2.53397341211226, "grad_norm": 0.22726774215698242, "learning_rate": 3.108757236112822e-05, "loss": 0.1566, "step": 6862 }, { "epoch": 2.5343426883308715, "grad_norm": 0.2652910053730011, "learning_rate": 3.106293878556473e-05, "loss": 0.1467, "step": 6863 }, { "epoch": 2.534711964549483, "grad_norm": 0.22904013097286224, "learning_rate": 3.103830521000123e-05, "loss": 0.1445, "step": 6864 }, { "epoch": 2.5350812407680943, "grad_norm": 0.33844199776649475, "learning_rate": 3.101367163443774e-05, "loss": 0.1885, "step": 6865 }, { "epoch": 2.535450516986706, "grad_norm": 0.24840067327022552, "learning_rate": 3.0989038058874246e-05, "loss": 0.1435, "step": 6866 }, { "epoch": 2.5358197932053175, "grad_norm": 0.31841030716896057, "learning_rate": 3.0964404483310754e-05, "loss": 0.1425, "step": 6867 }, { "epoch": 2.536189069423929, "grad_norm": 0.2922656536102295, "learning_rate": 3.093977090774726e-05, "loss": 0.1536, "step": 6868 }, { "epoch": 2.5365583456425407, "grad_norm": 0.318960577249527, "learning_rate": 3.091513733218377e-05, "loss": 0.1776, "step": 6869 }, { "epoch": 2.5369276218611523, "grad_norm": 0.2846389710903168, "learning_rate": 3.089050375662028e-05, "loss": 0.1624, "step": 6870 }, { "epoch": 2.537296898079764, "grad_norm": 0.30565300583839417, "learning_rate": 3.0865870181056786e-05, "loss": 0.1778, "step": 6871 }, { "epoch": 2.537666174298375, "grad_norm": 0.24448475241661072, "learning_rate": 3.084123660549329e-05, "loss": 0.1628, "step": 6872 }, { "epoch": 2.5380354505169866, "grad_norm": 0.30311644077301025, "learning_rate": 3.0816603029929795e-05, "loss": 0.1563, "step": 6873 }, { "epoch": 2.5384047267355982, "grad_norm": 0.2623513340950012, "learning_rate": 3.07919694543663e-05, "loss": 0.1935, "step": 6874 }, { "epoch": 2.53877400295421, "grad_norm": 0.2626889646053314, "learning_rate": 3.076733587880281e-05, "loss": 0.1687, "step": 6875 }, { "epoch": 2.539143279172821, "grad_norm": 0.3092837333679199, "learning_rate": 3.074270230323932e-05, "loss": 0.1792, "step": 6876 }, { "epoch": 2.5395125553914326, "grad_norm": 0.30462512373924255, "learning_rate": 3.0718068727675826e-05, "loss": 0.1886, "step": 6877 }, { "epoch": 2.539881831610044, "grad_norm": 0.3408518135547638, "learning_rate": 3.0693435152112334e-05, "loss": 0.1922, "step": 6878 }, { "epoch": 2.540251107828656, "grad_norm": 0.2713509798049927, "learning_rate": 3.066880157654884e-05, "loss": 0.1636, "step": 6879 }, { "epoch": 2.5406203840472674, "grad_norm": 0.31693035364151, "learning_rate": 3.064416800098534e-05, "loss": 0.1506, "step": 6880 }, { "epoch": 2.540989660265879, "grad_norm": 0.25451111793518066, "learning_rate": 3.061953442542185e-05, "loss": 0.1571, "step": 6881 }, { "epoch": 2.5413589364844906, "grad_norm": 0.2906498610973358, "learning_rate": 3.059490084985836e-05, "loss": 0.156, "step": 6882 }, { "epoch": 2.541728212703102, "grad_norm": 0.30169451236724854, "learning_rate": 3.0570267274294866e-05, "loss": 0.1458, "step": 6883 }, { "epoch": 2.5420974889217134, "grad_norm": 0.41555091738700867, "learning_rate": 3.0545633698731374e-05, "loss": 0.1743, "step": 6884 }, { "epoch": 2.542466765140325, "grad_norm": 0.2576081156730652, "learning_rate": 3.052100012316788e-05, "loss": 0.1802, "step": 6885 }, { "epoch": 2.5428360413589366, "grad_norm": 0.2756846249103546, "learning_rate": 3.0496366547604387e-05, "loss": 0.149, "step": 6886 }, { "epoch": 2.5432053175775478, "grad_norm": 0.30317094922065735, "learning_rate": 3.0471732972040894e-05, "loss": 0.162, "step": 6887 }, { "epoch": 2.5435745937961594, "grad_norm": 0.2918936014175415, "learning_rate": 3.0447099396477402e-05, "loss": 0.1668, "step": 6888 }, { "epoch": 2.543943870014771, "grad_norm": 0.35052746534347534, "learning_rate": 3.042246582091391e-05, "loss": 0.1899, "step": 6889 }, { "epoch": 2.5443131462333826, "grad_norm": 0.24106165766716003, "learning_rate": 3.0397832245350415e-05, "loss": 0.1453, "step": 6890 }, { "epoch": 2.544682422451994, "grad_norm": 0.3153793513774872, "learning_rate": 3.0373198669786923e-05, "loss": 0.1972, "step": 6891 }, { "epoch": 2.5450516986706058, "grad_norm": 0.2803950309753418, "learning_rate": 3.034856509422343e-05, "loss": 0.1575, "step": 6892 }, { "epoch": 2.5454209748892174, "grad_norm": 0.2646368145942688, "learning_rate": 3.0323931518659938e-05, "loss": 0.1513, "step": 6893 }, { "epoch": 2.5457902511078285, "grad_norm": 0.257588654756546, "learning_rate": 3.0299297943096443e-05, "loss": 0.181, "step": 6894 }, { "epoch": 2.54615952732644, "grad_norm": 0.26178112626075745, "learning_rate": 3.027466436753295e-05, "loss": 0.1427, "step": 6895 }, { "epoch": 2.5465288035450517, "grad_norm": 0.24951210618019104, "learning_rate": 3.025003079196946e-05, "loss": 0.1484, "step": 6896 }, { "epoch": 2.5468980797636633, "grad_norm": 0.27104729413986206, "learning_rate": 3.0225397216405966e-05, "loss": 0.1462, "step": 6897 }, { "epoch": 2.5472673559822745, "grad_norm": 0.31783542037010193, "learning_rate": 3.020076364084247e-05, "loss": 0.177, "step": 6898 }, { "epoch": 2.547636632200886, "grad_norm": 0.29903313517570496, "learning_rate": 3.017613006527898e-05, "loss": 0.17, "step": 6899 }, { "epoch": 2.5480059084194977, "grad_norm": 0.28590208292007446, "learning_rate": 3.0151496489715487e-05, "loss": 0.1775, "step": 6900 }, { "epoch": 2.5480059084194977, "eval_loss": 0.2523267865180969, "eval_runtime": 5.8514, "eval_samples_per_second": 8.545, "eval_steps_per_second": 1.196, "step": 6900 }, { "epoch": 2.5483751846381093, "grad_norm": 0.28440597653388977, "learning_rate": 3.0126862914151994e-05, "loss": 0.174, "step": 6901 }, { "epoch": 2.548744460856721, "grad_norm": 0.2633342742919922, "learning_rate": 3.01022293385885e-05, "loss": 0.1592, "step": 6902 }, { "epoch": 2.5491137370753325, "grad_norm": 0.3877559304237366, "learning_rate": 3.0077595763025007e-05, "loss": 0.1874, "step": 6903 }, { "epoch": 2.549483013293944, "grad_norm": 0.2826557755470276, "learning_rate": 3.0052962187461515e-05, "loss": 0.1771, "step": 6904 }, { "epoch": 2.5498522895125553, "grad_norm": 0.2666883170604706, "learning_rate": 3.0028328611898022e-05, "loss": 0.1739, "step": 6905 }, { "epoch": 2.550221565731167, "grad_norm": 0.26083993911743164, "learning_rate": 3.0003695036334527e-05, "loss": 0.1679, "step": 6906 }, { "epoch": 2.5505908419497785, "grad_norm": 0.2829050123691559, "learning_rate": 2.9979061460771028e-05, "loss": 0.1608, "step": 6907 }, { "epoch": 2.55096011816839, "grad_norm": 0.2269107848405838, "learning_rate": 2.9954427885207536e-05, "loss": 0.1648, "step": 6908 }, { "epoch": 2.5513293943870012, "grad_norm": 0.35450270771980286, "learning_rate": 2.9929794309644044e-05, "loss": 0.1604, "step": 6909 }, { "epoch": 2.551698670605613, "grad_norm": 0.6864081621170044, "learning_rate": 2.990516073408055e-05, "loss": 0.1959, "step": 6910 }, { "epoch": 2.5520679468242244, "grad_norm": 0.2551758289337158, "learning_rate": 2.9880527158517056e-05, "loss": 0.1619, "step": 6911 }, { "epoch": 2.552437223042836, "grad_norm": 0.3041122853755951, "learning_rate": 2.9855893582953564e-05, "loss": 0.164, "step": 6912 }, { "epoch": 2.5528064992614476, "grad_norm": 0.2321958690881729, "learning_rate": 2.9831260007390072e-05, "loss": 0.1592, "step": 6913 }, { "epoch": 2.5531757754800593, "grad_norm": 0.23633337020874023, "learning_rate": 2.980662643182658e-05, "loss": 0.1605, "step": 6914 }, { "epoch": 2.553545051698671, "grad_norm": 0.3471091389656067, "learning_rate": 2.9781992856263084e-05, "loss": 0.1808, "step": 6915 }, { "epoch": 2.553914327917282, "grad_norm": 0.25309813022613525, "learning_rate": 2.9757359280699592e-05, "loss": 0.1688, "step": 6916 }, { "epoch": 2.5542836041358936, "grad_norm": 0.2922281324863434, "learning_rate": 2.97327257051361e-05, "loss": 0.171, "step": 6917 }, { "epoch": 2.554652880354505, "grad_norm": 0.3181562125682831, "learning_rate": 2.9708092129572608e-05, "loss": 0.1846, "step": 6918 }, { "epoch": 2.555022156573117, "grad_norm": 0.23554129898548126, "learning_rate": 2.9683458554009112e-05, "loss": 0.1368, "step": 6919 }, { "epoch": 2.555391432791728, "grad_norm": 0.24824005365371704, "learning_rate": 2.965882497844562e-05, "loss": 0.1592, "step": 6920 }, { "epoch": 2.5557607090103396, "grad_norm": 0.34159722924232483, "learning_rate": 2.9634191402882128e-05, "loss": 0.1719, "step": 6921 }, { "epoch": 2.556129985228951, "grad_norm": 0.278098464012146, "learning_rate": 2.9609557827318636e-05, "loss": 0.1653, "step": 6922 }, { "epoch": 2.556499261447563, "grad_norm": 0.21666814386844635, "learning_rate": 2.958492425175514e-05, "loss": 0.1458, "step": 6923 }, { "epoch": 2.5568685376661744, "grad_norm": 0.26115965843200684, "learning_rate": 2.9560290676191648e-05, "loss": 0.1719, "step": 6924 }, { "epoch": 2.557237813884786, "grad_norm": 0.2852897346019745, "learning_rate": 2.9535657100628156e-05, "loss": 0.149, "step": 6925 }, { "epoch": 2.5576070901033976, "grad_norm": 0.27047446370124817, "learning_rate": 2.9511023525064664e-05, "loss": 0.182, "step": 6926 }, { "epoch": 2.5579763663220088, "grad_norm": 0.2714018225669861, "learning_rate": 2.948638994950117e-05, "loss": 0.1663, "step": 6927 }, { "epoch": 2.5583456425406204, "grad_norm": 0.3140382170677185, "learning_rate": 2.9461756373937676e-05, "loss": 0.1656, "step": 6928 }, { "epoch": 2.558714918759232, "grad_norm": 0.3167521357536316, "learning_rate": 2.9437122798374184e-05, "loss": 0.1816, "step": 6929 }, { "epoch": 2.5590841949778436, "grad_norm": 0.24346637725830078, "learning_rate": 2.9412489222810692e-05, "loss": 0.1469, "step": 6930 }, { "epoch": 2.5594534711964547, "grad_norm": 0.28615519404411316, "learning_rate": 2.9387855647247196e-05, "loss": 0.1812, "step": 6931 }, { "epoch": 2.5598227474150663, "grad_norm": 0.2468711882829666, "learning_rate": 2.9363222071683704e-05, "loss": 0.1302, "step": 6932 }, { "epoch": 2.560192023633678, "grad_norm": 0.27801457047462463, "learning_rate": 2.9338588496120212e-05, "loss": 0.1717, "step": 6933 }, { "epoch": 2.5605612998522895, "grad_norm": 0.3109159469604492, "learning_rate": 2.931395492055672e-05, "loss": 0.1731, "step": 6934 }, { "epoch": 2.560930576070901, "grad_norm": 0.24855497479438782, "learning_rate": 2.9289321344993224e-05, "loss": 0.1478, "step": 6935 }, { "epoch": 2.5612998522895127, "grad_norm": 0.2573830485343933, "learning_rate": 2.9264687769429732e-05, "loss": 0.1615, "step": 6936 }, { "epoch": 2.5616691285081243, "grad_norm": 0.21040625870227814, "learning_rate": 2.924005419386624e-05, "loss": 0.1706, "step": 6937 }, { "epoch": 2.5620384047267355, "grad_norm": 0.3306921124458313, "learning_rate": 2.9215420618302748e-05, "loss": 0.1848, "step": 6938 }, { "epoch": 2.562407680945347, "grad_norm": 0.3369297981262207, "learning_rate": 2.9190787042739253e-05, "loss": 0.1613, "step": 6939 }, { "epoch": 2.5627769571639587, "grad_norm": 0.2304317057132721, "learning_rate": 2.916615346717576e-05, "loss": 0.1691, "step": 6940 }, { "epoch": 2.5631462333825703, "grad_norm": 0.26104792952537537, "learning_rate": 2.9141519891612268e-05, "loss": 0.1739, "step": 6941 }, { "epoch": 2.5635155096011815, "grad_norm": 0.25000765919685364, "learning_rate": 2.9116886316048776e-05, "loss": 0.1554, "step": 6942 }, { "epoch": 2.563884785819793, "grad_norm": 0.2503049075603485, "learning_rate": 2.909225274048528e-05, "loss": 0.1479, "step": 6943 }, { "epoch": 2.5642540620384047, "grad_norm": 0.31189024448394775, "learning_rate": 2.906761916492179e-05, "loss": 0.1662, "step": 6944 }, { "epoch": 2.5646233382570163, "grad_norm": 0.2347639799118042, "learning_rate": 2.9042985589358296e-05, "loss": 0.158, "step": 6945 }, { "epoch": 2.564992614475628, "grad_norm": 0.24234919250011444, "learning_rate": 2.9018352013794804e-05, "loss": 0.1726, "step": 6946 }, { "epoch": 2.5653618906942395, "grad_norm": 0.2983647584915161, "learning_rate": 2.899371843823131e-05, "loss": 0.162, "step": 6947 }, { "epoch": 2.565731166912851, "grad_norm": 0.2836242616176605, "learning_rate": 2.8969084862667817e-05, "loss": 0.1636, "step": 6948 }, { "epoch": 2.5661004431314622, "grad_norm": 0.2699525058269501, "learning_rate": 2.8944451287104324e-05, "loss": 0.1683, "step": 6949 }, { "epoch": 2.566469719350074, "grad_norm": 0.32303038239479065, "learning_rate": 2.8919817711540832e-05, "loss": 0.1778, "step": 6950 }, { "epoch": 2.566469719350074, "eval_loss": 0.2497435361146927, "eval_runtime": 5.8575, "eval_samples_per_second": 8.536, "eval_steps_per_second": 1.195, "step": 6950 }, { "epoch": 2.5668389955686854, "grad_norm": 0.2764846384525299, "learning_rate": 2.8895184135977337e-05, "loss": 0.1607, "step": 6951 }, { "epoch": 2.567208271787297, "grad_norm": 0.2732771039009094, "learning_rate": 2.8870550560413845e-05, "loss": 0.1574, "step": 6952 }, { "epoch": 2.567577548005908, "grad_norm": 0.31445491313934326, "learning_rate": 2.8845916984850352e-05, "loss": 0.1827, "step": 6953 }, { "epoch": 2.56794682422452, "grad_norm": 0.24035432934761047, "learning_rate": 2.882128340928686e-05, "loss": 0.1553, "step": 6954 }, { "epoch": 2.5683161004431314, "grad_norm": 0.2663872539997101, "learning_rate": 2.8796649833723365e-05, "loss": 0.1644, "step": 6955 }, { "epoch": 2.568685376661743, "grad_norm": 0.27982500195503235, "learning_rate": 2.8772016258159873e-05, "loss": 0.1553, "step": 6956 }, { "epoch": 2.5690546528803546, "grad_norm": 0.5572454333305359, "learning_rate": 2.874738268259638e-05, "loss": 0.1526, "step": 6957 }, { "epoch": 2.569423929098966, "grad_norm": 0.2863433361053467, "learning_rate": 2.872274910703289e-05, "loss": 0.1588, "step": 6958 }, { "epoch": 2.569793205317578, "grad_norm": 0.3161817491054535, "learning_rate": 2.8698115531469393e-05, "loss": 0.222, "step": 6959 }, { "epoch": 2.570162481536189, "grad_norm": 0.28113853931427, "learning_rate": 2.86734819559059e-05, "loss": 0.1817, "step": 6960 }, { "epoch": 2.5705317577548006, "grad_norm": 0.23445942997932434, "learning_rate": 2.864884838034241e-05, "loss": 0.1497, "step": 6961 }, { "epoch": 2.570901033973412, "grad_norm": 0.3076701760292053, "learning_rate": 2.8624214804778916e-05, "loss": 0.1515, "step": 6962 }, { "epoch": 2.571270310192024, "grad_norm": 0.26474785804748535, "learning_rate": 2.859958122921542e-05, "loss": 0.1587, "step": 6963 }, { "epoch": 2.571639586410635, "grad_norm": 0.3620382249355316, "learning_rate": 2.857494765365193e-05, "loss": 0.1699, "step": 6964 }, { "epoch": 2.5720088626292466, "grad_norm": 0.2817908227443695, "learning_rate": 2.8550314078088437e-05, "loss": 0.1565, "step": 6965 }, { "epoch": 2.572378138847858, "grad_norm": 0.27126121520996094, "learning_rate": 2.8525680502524945e-05, "loss": 0.174, "step": 6966 }, { "epoch": 2.5727474150664698, "grad_norm": 0.27937519550323486, "learning_rate": 2.850104692696145e-05, "loss": 0.1773, "step": 6967 }, { "epoch": 2.5731166912850814, "grad_norm": 0.24077384173870087, "learning_rate": 2.8476413351397957e-05, "loss": 0.1395, "step": 6968 }, { "epoch": 2.573485967503693, "grad_norm": 0.26432493329048157, "learning_rate": 2.8451779775834465e-05, "loss": 0.177, "step": 6969 }, { "epoch": 2.573855243722304, "grad_norm": 0.2882770299911499, "learning_rate": 2.8427146200270973e-05, "loss": 0.17, "step": 6970 }, { "epoch": 2.5742245199409157, "grad_norm": 0.24378398060798645, "learning_rate": 2.8402512624707477e-05, "loss": 0.1655, "step": 6971 }, { "epoch": 2.5745937961595273, "grad_norm": 0.27990424633026123, "learning_rate": 2.8377879049143985e-05, "loss": 0.1644, "step": 6972 }, { "epoch": 2.574963072378139, "grad_norm": 0.258092999458313, "learning_rate": 2.8353245473580493e-05, "loss": 0.1562, "step": 6973 }, { "epoch": 2.5753323485967505, "grad_norm": 0.2670189142227173, "learning_rate": 2.8328611898017e-05, "loss": 0.1574, "step": 6974 }, { "epoch": 2.5757016248153617, "grad_norm": 0.23461149632930756, "learning_rate": 2.8303978322453505e-05, "loss": 0.1608, "step": 6975 }, { "epoch": 2.5760709010339733, "grad_norm": 0.2981928884983063, "learning_rate": 2.8279344746890013e-05, "loss": 0.1704, "step": 6976 }, { "epoch": 2.576440177252585, "grad_norm": 0.27135443687438965, "learning_rate": 2.825471117132652e-05, "loss": 0.1626, "step": 6977 }, { "epoch": 2.5768094534711965, "grad_norm": 0.25236204266548157, "learning_rate": 2.8230077595763025e-05, "loss": 0.1479, "step": 6978 }, { "epoch": 2.577178729689808, "grad_norm": 0.26014214754104614, "learning_rate": 2.8205444020199533e-05, "loss": 0.156, "step": 6979 }, { "epoch": 2.5775480059084197, "grad_norm": 0.3150984048843384, "learning_rate": 2.818081044463604e-05, "loss": 0.1758, "step": 6980 }, { "epoch": 2.577917282127031, "grad_norm": 0.31969812512397766, "learning_rate": 2.815617686907255e-05, "loss": 0.1625, "step": 6981 }, { "epoch": 2.5782865583456425, "grad_norm": 0.25959834456443787, "learning_rate": 2.8131543293509053e-05, "loss": 0.1536, "step": 6982 }, { "epoch": 2.578655834564254, "grad_norm": 0.29799771308898926, "learning_rate": 2.810690971794556e-05, "loss": 0.1511, "step": 6983 }, { "epoch": 2.5790251107828657, "grad_norm": 0.28962835669517517, "learning_rate": 2.808227614238207e-05, "loss": 0.1573, "step": 6984 }, { "epoch": 2.579394387001477, "grad_norm": 0.25530558824539185, "learning_rate": 2.8057642566818577e-05, "loss": 0.1471, "step": 6985 }, { "epoch": 2.5797636632200884, "grad_norm": 0.24739280343055725, "learning_rate": 2.803300899125508e-05, "loss": 0.15, "step": 6986 }, { "epoch": 2.5801329394387, "grad_norm": 0.3226220905780792, "learning_rate": 2.800837541569159e-05, "loss": 0.1695, "step": 6987 }, { "epoch": 2.5805022156573116, "grad_norm": 0.2757609188556671, "learning_rate": 2.7983741840128097e-05, "loss": 0.1619, "step": 6988 }, { "epoch": 2.5808714918759232, "grad_norm": 0.3205515146255493, "learning_rate": 2.7959108264564605e-05, "loss": 0.1998, "step": 6989 }, { "epoch": 2.581240768094535, "grad_norm": 0.29517224431037903, "learning_rate": 2.793447468900111e-05, "loss": 0.1701, "step": 6990 }, { "epoch": 2.5816100443131464, "grad_norm": 0.2605873942375183, "learning_rate": 2.7909841113437617e-05, "loss": 0.1599, "step": 6991 }, { "epoch": 2.5819793205317576, "grad_norm": 0.3558996915817261, "learning_rate": 2.7885207537874125e-05, "loss": 0.1546, "step": 6992 }, { "epoch": 2.582348596750369, "grad_norm": 0.3165871500968933, "learning_rate": 2.7860573962310633e-05, "loss": 0.1983, "step": 6993 }, { "epoch": 2.582717872968981, "grad_norm": 0.264943391084671, "learning_rate": 2.7835940386747138e-05, "loss": 0.1712, "step": 6994 }, { "epoch": 2.5830871491875924, "grad_norm": 0.24953593313694, "learning_rate": 2.7811306811183645e-05, "loss": 0.1604, "step": 6995 }, { "epoch": 2.5834564254062036, "grad_norm": 0.2861347794532776, "learning_rate": 2.7786673235620153e-05, "loss": 0.1704, "step": 6996 }, { "epoch": 2.583825701624815, "grad_norm": 0.2639878988265991, "learning_rate": 2.776203966005666e-05, "loss": 0.1493, "step": 6997 }, { "epoch": 2.5841949778434268, "grad_norm": 0.26286205649375916, "learning_rate": 2.7737406084493166e-05, "loss": 0.156, "step": 6998 }, { "epoch": 2.5845642540620384, "grad_norm": 0.3136504590511322, "learning_rate": 2.7712772508929674e-05, "loss": 0.1745, "step": 6999 }, { "epoch": 2.58493353028065, "grad_norm": 0.23299533128738403, "learning_rate": 2.768813893336618e-05, "loss": 0.1485, "step": 7000 }, { "epoch": 2.58493353028065, "eval_loss": 0.24902725219726562, "eval_runtime": 5.8582, "eval_samples_per_second": 8.535, "eval_steps_per_second": 1.195, "step": 7000 }, { "epoch": 2.5853028064992616, "grad_norm": 0.25273215770721436, "learning_rate": 2.766350535780269e-05, "loss": 0.1335, "step": 7001 }, { "epoch": 2.585672082717873, "grad_norm": 0.25386619567871094, "learning_rate": 2.7638871782239194e-05, "loss": 0.149, "step": 7002 }, { "epoch": 2.5860413589364843, "grad_norm": 0.25805094838142395, "learning_rate": 2.76142382066757e-05, "loss": 0.1695, "step": 7003 }, { "epoch": 2.586410635155096, "grad_norm": 0.30421289801597595, "learning_rate": 2.758960463111221e-05, "loss": 0.1789, "step": 7004 }, { "epoch": 2.5867799113737076, "grad_norm": 0.2829640209674835, "learning_rate": 2.7564971055548717e-05, "loss": 0.1635, "step": 7005 }, { "epoch": 2.587149187592319, "grad_norm": 0.31826967000961304, "learning_rate": 2.7540337479985222e-05, "loss": 0.1885, "step": 7006 }, { "epoch": 2.5875184638109303, "grad_norm": 0.2631217837333679, "learning_rate": 2.751570390442173e-05, "loss": 0.1645, "step": 7007 }, { "epoch": 2.587887740029542, "grad_norm": 0.31671977043151855, "learning_rate": 2.7491070328858238e-05, "loss": 0.1714, "step": 7008 }, { "epoch": 2.5882570162481535, "grad_norm": 0.2654682993888855, "learning_rate": 2.7466436753294745e-05, "loss": 0.1562, "step": 7009 }, { "epoch": 2.588626292466765, "grad_norm": 0.2755813002586365, "learning_rate": 2.744180317773125e-05, "loss": 0.2074, "step": 7010 }, { "epoch": 2.5889955686853767, "grad_norm": 0.24998833239078522, "learning_rate": 2.7417169602167758e-05, "loss": 0.1542, "step": 7011 }, { "epoch": 2.5893648449039883, "grad_norm": 0.30420055985450745, "learning_rate": 2.7392536026604266e-05, "loss": 0.1582, "step": 7012 }, { "epoch": 2.5897341211226, "grad_norm": 0.29341813921928406, "learning_rate": 2.7367902451040773e-05, "loss": 0.1671, "step": 7013 }, { "epoch": 2.590103397341211, "grad_norm": 0.3467811644077301, "learning_rate": 2.7343268875477278e-05, "loss": 0.1561, "step": 7014 }, { "epoch": 2.5904726735598227, "grad_norm": 0.29726535081863403, "learning_rate": 2.7318635299913786e-05, "loss": 0.1703, "step": 7015 }, { "epoch": 2.5908419497784343, "grad_norm": 0.2435768097639084, "learning_rate": 2.7294001724350294e-05, "loss": 0.1588, "step": 7016 }, { "epoch": 2.591211225997046, "grad_norm": 0.3036586344242096, "learning_rate": 2.72693681487868e-05, "loss": 0.1948, "step": 7017 }, { "epoch": 2.591580502215657, "grad_norm": 0.29029715061187744, "learning_rate": 2.7244734573223306e-05, "loss": 0.1699, "step": 7018 }, { "epoch": 2.5919497784342687, "grad_norm": 0.3097158968448639, "learning_rate": 2.7220100997659814e-05, "loss": 0.192, "step": 7019 }, { "epoch": 2.5923190546528803, "grad_norm": 0.24611760675907135, "learning_rate": 2.7195467422096322e-05, "loss": 0.1487, "step": 7020 }, { "epoch": 2.592688330871492, "grad_norm": 0.27117979526519775, "learning_rate": 2.717083384653283e-05, "loss": 0.1662, "step": 7021 }, { "epoch": 2.5930576070901035, "grad_norm": 0.30515336990356445, "learning_rate": 2.7146200270969334e-05, "loss": 0.1606, "step": 7022 }, { "epoch": 2.593426883308715, "grad_norm": 0.25747841596603394, "learning_rate": 2.7121566695405835e-05, "loss": 0.1566, "step": 7023 }, { "epoch": 2.5937961595273267, "grad_norm": 0.28772181272506714, "learning_rate": 2.7096933119842343e-05, "loss": 0.1442, "step": 7024 }, { "epoch": 2.594165435745938, "grad_norm": 0.2716567814350128, "learning_rate": 2.707229954427885e-05, "loss": 0.164, "step": 7025 }, { "epoch": 2.5945347119645494, "grad_norm": 0.2506811022758484, "learning_rate": 2.704766596871536e-05, "loss": 0.1586, "step": 7026 }, { "epoch": 2.594903988183161, "grad_norm": 0.2633334994316101, "learning_rate": 2.7023032393151863e-05, "loss": 0.1517, "step": 7027 }, { "epoch": 2.5952732644017726, "grad_norm": 0.27225908637046814, "learning_rate": 2.699839881758837e-05, "loss": 0.1643, "step": 7028 }, { "epoch": 2.595642540620384, "grad_norm": 0.26902899146080017, "learning_rate": 2.697376524202488e-05, "loss": 0.1443, "step": 7029 }, { "epoch": 2.5960118168389954, "grad_norm": 0.2809258699417114, "learning_rate": 2.6949131666461387e-05, "loss": 0.167, "step": 7030 }, { "epoch": 2.596381093057607, "grad_norm": 0.33841001987457275, "learning_rate": 2.692449809089789e-05, "loss": 0.1795, "step": 7031 }, { "epoch": 2.5967503692762186, "grad_norm": 0.25426897406578064, "learning_rate": 2.68998645153344e-05, "loss": 0.1427, "step": 7032 }, { "epoch": 2.59711964549483, "grad_norm": 0.26426997780799866, "learning_rate": 2.6875230939770907e-05, "loss": 0.1493, "step": 7033 }, { "epoch": 2.597488921713442, "grad_norm": 0.32085245847702026, "learning_rate": 2.6850597364207415e-05, "loss": 0.1827, "step": 7034 }, { "epoch": 2.5978581979320534, "grad_norm": 0.24739843606948853, "learning_rate": 2.682596378864392e-05, "loss": 0.1459, "step": 7035 }, { "epoch": 2.5982274741506646, "grad_norm": 0.267869770526886, "learning_rate": 2.6801330213080427e-05, "loss": 0.1769, "step": 7036 }, { "epoch": 2.598596750369276, "grad_norm": 0.310130774974823, "learning_rate": 2.6776696637516935e-05, "loss": 0.1738, "step": 7037 }, { "epoch": 2.598966026587888, "grad_norm": 0.3026999831199646, "learning_rate": 2.6752063061953443e-05, "loss": 0.1742, "step": 7038 }, { "epoch": 2.5993353028064994, "grad_norm": 0.2886936068534851, "learning_rate": 2.6727429486389947e-05, "loss": 0.1384, "step": 7039 }, { "epoch": 2.5997045790251105, "grad_norm": 0.29449960589408875, "learning_rate": 2.6702795910826455e-05, "loss": 0.1704, "step": 7040 }, { "epoch": 2.600073855243722, "grad_norm": 0.31389927864074707, "learning_rate": 2.6678162335262963e-05, "loss": 0.1912, "step": 7041 }, { "epoch": 2.6004431314623337, "grad_norm": 0.3336137533187866, "learning_rate": 2.665352875969947e-05, "loss": 0.1662, "step": 7042 }, { "epoch": 2.6008124076809453, "grad_norm": 0.3057897984981537, "learning_rate": 2.6628895184135975e-05, "loss": 0.1711, "step": 7043 }, { "epoch": 2.601181683899557, "grad_norm": 0.26825442910194397, "learning_rate": 2.6604261608572483e-05, "loss": 0.1512, "step": 7044 }, { "epoch": 2.6015509601181686, "grad_norm": 0.30866551399230957, "learning_rate": 2.657962803300899e-05, "loss": 0.144, "step": 7045 }, { "epoch": 2.60192023633678, "grad_norm": 0.31755682826042175, "learning_rate": 2.65549944574455e-05, "loss": 0.1931, "step": 7046 }, { "epoch": 2.6022895125553913, "grad_norm": 0.3211946487426758, "learning_rate": 2.6530360881882004e-05, "loss": 0.1807, "step": 7047 }, { "epoch": 2.602658788774003, "grad_norm": 0.2673085331916809, "learning_rate": 2.650572730631851e-05, "loss": 0.168, "step": 7048 }, { "epoch": 2.6030280649926145, "grad_norm": 0.2836707830429077, "learning_rate": 2.648109373075502e-05, "loss": 0.1523, "step": 7049 }, { "epoch": 2.603397341211226, "grad_norm": 0.2749764323234558, "learning_rate": 2.6456460155191527e-05, "loss": 0.1709, "step": 7050 }, { "epoch": 2.603397341211226, "eval_loss": 0.24706825613975525, "eval_runtime": 5.8584, "eval_samples_per_second": 8.535, "eval_steps_per_second": 1.195, "step": 7050 }, { "epoch": 2.6037666174298373, "grad_norm": 0.3217591345310211, "learning_rate": 2.643182657962803e-05, "loss": 0.1623, "step": 7051 }, { "epoch": 2.604135893648449, "grad_norm": 0.26377376914024353, "learning_rate": 2.640719300406454e-05, "loss": 0.1687, "step": 7052 }, { "epoch": 2.6045051698670605, "grad_norm": 0.2643566429615021, "learning_rate": 2.6382559428501047e-05, "loss": 0.1493, "step": 7053 }, { "epoch": 2.604874446085672, "grad_norm": 0.304091215133667, "learning_rate": 2.6357925852937555e-05, "loss": 0.1999, "step": 7054 }, { "epoch": 2.6052437223042837, "grad_norm": 0.3017551302909851, "learning_rate": 2.633329227737406e-05, "loss": 0.1737, "step": 7055 }, { "epoch": 2.6056129985228953, "grad_norm": 0.28353890776634216, "learning_rate": 2.6308658701810568e-05, "loss": 0.1655, "step": 7056 }, { "epoch": 2.605982274741507, "grad_norm": 0.2601488530635834, "learning_rate": 2.6284025126247075e-05, "loss": 0.1713, "step": 7057 }, { "epoch": 2.606351550960118, "grad_norm": 0.3210563063621521, "learning_rate": 2.6259391550683583e-05, "loss": 0.1646, "step": 7058 }, { "epoch": 2.6067208271787297, "grad_norm": 0.2617020905017853, "learning_rate": 2.6234757975120088e-05, "loss": 0.182, "step": 7059 }, { "epoch": 2.6070901033973413, "grad_norm": 0.26443079113960266, "learning_rate": 2.6210124399556596e-05, "loss": 0.1558, "step": 7060 }, { "epoch": 2.607459379615953, "grad_norm": 0.3176937401294708, "learning_rate": 2.6185490823993103e-05, "loss": 0.1384, "step": 7061 }, { "epoch": 2.607828655834564, "grad_norm": 0.2367580235004425, "learning_rate": 2.616085724842961e-05, "loss": 0.1678, "step": 7062 }, { "epoch": 2.6081979320531756, "grad_norm": 0.21823006868362427, "learning_rate": 2.6136223672866116e-05, "loss": 0.1319, "step": 7063 }, { "epoch": 2.6085672082717872, "grad_norm": 0.2675396502017975, "learning_rate": 2.6111590097302624e-05, "loss": 0.1648, "step": 7064 }, { "epoch": 2.608936484490399, "grad_norm": 0.24858689308166504, "learning_rate": 2.608695652173913e-05, "loss": 0.1692, "step": 7065 }, { "epoch": 2.6093057607090104, "grad_norm": 0.2861214578151703, "learning_rate": 2.606232294617564e-05, "loss": 0.1682, "step": 7066 }, { "epoch": 2.609675036927622, "grad_norm": 0.3085658550262451, "learning_rate": 2.6037689370612144e-05, "loss": 0.1773, "step": 7067 }, { "epoch": 2.6100443131462336, "grad_norm": 0.3473750948905945, "learning_rate": 2.6013055795048652e-05, "loss": 0.2119, "step": 7068 }, { "epoch": 2.610413589364845, "grad_norm": 0.29816731810569763, "learning_rate": 2.598842221948516e-05, "loss": 0.1709, "step": 7069 }, { "epoch": 2.6107828655834564, "grad_norm": 0.2761005163192749, "learning_rate": 2.5963788643921667e-05, "loss": 0.1499, "step": 7070 }, { "epoch": 2.611152141802068, "grad_norm": 0.403288334608078, "learning_rate": 2.5939155068358172e-05, "loss": 0.2018, "step": 7071 }, { "epoch": 2.6115214180206796, "grad_norm": 0.3042411208152771, "learning_rate": 2.591452149279468e-05, "loss": 0.1853, "step": 7072 }, { "epoch": 2.6118906942392908, "grad_norm": 0.2741173505783081, "learning_rate": 2.5889887917231188e-05, "loss": 0.1767, "step": 7073 }, { "epoch": 2.6122599704579024, "grad_norm": 0.28258514404296875, "learning_rate": 2.5865254341667696e-05, "loss": 0.1477, "step": 7074 }, { "epoch": 2.612629246676514, "grad_norm": 0.25004494190216064, "learning_rate": 2.58406207661042e-05, "loss": 0.1804, "step": 7075 }, { "epoch": 2.6129985228951256, "grad_norm": 0.24802225828170776, "learning_rate": 2.5815987190540708e-05, "loss": 0.1675, "step": 7076 }, { "epoch": 2.613367799113737, "grad_norm": 0.28887906670570374, "learning_rate": 2.5791353614977216e-05, "loss": 0.157, "step": 7077 }, { "epoch": 2.613737075332349, "grad_norm": 0.30445319414138794, "learning_rate": 2.5766720039413724e-05, "loss": 0.1905, "step": 7078 }, { "epoch": 2.6141063515509604, "grad_norm": 0.24961666762828827, "learning_rate": 2.5742086463850228e-05, "loss": 0.1533, "step": 7079 }, { "epoch": 2.6144756277695715, "grad_norm": 0.2499169558286667, "learning_rate": 2.5717452888286736e-05, "loss": 0.157, "step": 7080 }, { "epoch": 2.614844903988183, "grad_norm": 0.27537769079208374, "learning_rate": 2.5692819312723244e-05, "loss": 0.1714, "step": 7081 }, { "epoch": 2.6152141802067947, "grad_norm": 0.28446751832962036, "learning_rate": 2.566818573715975e-05, "loss": 0.1776, "step": 7082 }, { "epoch": 2.6155834564254064, "grad_norm": 0.3007389307022095, "learning_rate": 2.5643552161596256e-05, "loss": 0.1465, "step": 7083 }, { "epoch": 2.6159527326440175, "grad_norm": 0.23212134838104248, "learning_rate": 2.5618918586032764e-05, "loss": 0.142, "step": 7084 }, { "epoch": 2.616322008862629, "grad_norm": 0.30454760789871216, "learning_rate": 2.5594285010469272e-05, "loss": 0.1983, "step": 7085 }, { "epoch": 2.6166912850812407, "grad_norm": 0.3018072843551636, "learning_rate": 2.556965143490578e-05, "loss": 0.1812, "step": 7086 }, { "epoch": 2.6170605612998523, "grad_norm": 0.37997758388519287, "learning_rate": 2.5545017859342284e-05, "loss": 0.1549, "step": 7087 }, { "epoch": 2.617429837518464, "grad_norm": 0.2887950837612152, "learning_rate": 2.5520384283778792e-05, "loss": 0.1721, "step": 7088 }, { "epoch": 2.6177991137370755, "grad_norm": 0.2582404613494873, "learning_rate": 2.54957507082153e-05, "loss": 0.1622, "step": 7089 }, { "epoch": 2.618168389955687, "grad_norm": 0.22165557742118835, "learning_rate": 2.5471117132651808e-05, "loss": 0.1328, "step": 7090 }, { "epoch": 2.6185376661742983, "grad_norm": 0.25365129113197327, "learning_rate": 2.5446483557088312e-05, "loss": 0.1366, "step": 7091 }, { "epoch": 2.61890694239291, "grad_norm": 0.2626218795776367, "learning_rate": 2.542184998152482e-05, "loss": 0.1735, "step": 7092 }, { "epoch": 2.6192762186115215, "grad_norm": 0.2618752717971802, "learning_rate": 2.5397216405961328e-05, "loss": 0.1669, "step": 7093 }, { "epoch": 2.619645494830133, "grad_norm": 0.30638962984085083, "learning_rate": 2.5372582830397832e-05, "loss": 0.1928, "step": 7094 }, { "epoch": 2.6200147710487443, "grad_norm": 0.2914068400859833, "learning_rate": 2.534794925483434e-05, "loss": 0.158, "step": 7095 }, { "epoch": 2.620384047267356, "grad_norm": 0.33337917923927307, "learning_rate": 2.5323315679270848e-05, "loss": 0.1945, "step": 7096 }, { "epoch": 2.6207533234859675, "grad_norm": 0.2358565479516983, "learning_rate": 2.5298682103707356e-05, "loss": 0.1344, "step": 7097 }, { "epoch": 2.621122599704579, "grad_norm": 0.28325948119163513, "learning_rate": 2.527404852814386e-05, "loss": 0.1495, "step": 7098 }, { "epoch": 2.6214918759231907, "grad_norm": 0.26230067014694214, "learning_rate": 2.524941495258037e-05, "loss": 0.172, "step": 7099 }, { "epoch": 2.6218611521418023, "grad_norm": 0.26534080505371094, "learning_rate": 2.5224781377016876e-05, "loss": 0.1646, "step": 7100 }, { "epoch": 2.6218611521418023, "eval_loss": 0.24772045016288757, "eval_runtime": 5.8546, "eval_samples_per_second": 8.54, "eval_steps_per_second": 1.196, "step": 7100 }, { "epoch": 2.6222304283604134, "grad_norm": 0.24729250371456146, "learning_rate": 2.5200147801453384e-05, "loss": 0.1593, "step": 7101 }, { "epoch": 2.622599704579025, "grad_norm": 0.2703579366207123, "learning_rate": 2.517551422588989e-05, "loss": 0.1625, "step": 7102 }, { "epoch": 2.6229689807976366, "grad_norm": 0.2587558627128601, "learning_rate": 2.5150880650326396e-05, "loss": 0.1571, "step": 7103 }, { "epoch": 2.6233382570162482, "grad_norm": 0.3044690489768982, "learning_rate": 2.5126247074762904e-05, "loss": 0.1632, "step": 7104 }, { "epoch": 2.62370753323486, "grad_norm": 0.2837722897529602, "learning_rate": 2.5101613499199412e-05, "loss": 0.1731, "step": 7105 }, { "epoch": 2.624076809453471, "grad_norm": 0.2909303307533264, "learning_rate": 2.5076979923635917e-05, "loss": 0.1724, "step": 7106 }, { "epoch": 2.6244460856720826, "grad_norm": 0.2665870189666748, "learning_rate": 2.5052346348072425e-05, "loss": 0.141, "step": 7107 }, { "epoch": 2.624815361890694, "grad_norm": 0.23444689810276031, "learning_rate": 2.5027712772508932e-05, "loss": 0.1714, "step": 7108 }, { "epoch": 2.625184638109306, "grad_norm": 0.22857993841171265, "learning_rate": 2.500307919694544e-05, "loss": 0.1602, "step": 7109 }, { "epoch": 2.6255539143279174, "grad_norm": 0.25740647315979004, "learning_rate": 2.4978445621381945e-05, "loss": 0.1445, "step": 7110 }, { "epoch": 2.625923190546529, "grad_norm": 0.31387707591056824, "learning_rate": 2.495381204581845e-05, "loss": 0.1786, "step": 7111 }, { "epoch": 2.62629246676514, "grad_norm": 0.2700439989566803, "learning_rate": 2.4929178470254957e-05, "loss": 0.1473, "step": 7112 }, { "epoch": 2.6266617429837518, "grad_norm": 0.2955355644226074, "learning_rate": 2.4904544894691465e-05, "loss": 0.19, "step": 7113 }, { "epoch": 2.6270310192023634, "grad_norm": 0.254533976316452, "learning_rate": 2.4879911319127973e-05, "loss": 0.1704, "step": 7114 }, { "epoch": 2.627400295420975, "grad_norm": 0.34027546644210815, "learning_rate": 2.4855277743564477e-05, "loss": 0.2173, "step": 7115 }, { "epoch": 2.6277695716395866, "grad_norm": 0.30294856429100037, "learning_rate": 2.4830644168000985e-05, "loss": 0.1746, "step": 7116 }, { "epoch": 2.6281388478581977, "grad_norm": 0.24490997195243835, "learning_rate": 2.4806010592437493e-05, "loss": 0.1534, "step": 7117 }, { "epoch": 2.6285081240768093, "grad_norm": 0.29865172505378723, "learning_rate": 2.4781377016874e-05, "loss": 0.1525, "step": 7118 }, { "epoch": 2.628877400295421, "grad_norm": 0.40556472539901733, "learning_rate": 2.4756743441310505e-05, "loss": 0.1557, "step": 7119 }, { "epoch": 2.6292466765140325, "grad_norm": 0.23522064089775085, "learning_rate": 2.4732109865747013e-05, "loss": 0.1824, "step": 7120 }, { "epoch": 2.629615952732644, "grad_norm": 0.26868516206741333, "learning_rate": 2.470747629018352e-05, "loss": 0.1634, "step": 7121 }, { "epoch": 2.6299852289512557, "grad_norm": 0.3045238256454468, "learning_rate": 2.468284271462003e-05, "loss": 0.1677, "step": 7122 }, { "epoch": 2.630354505169867, "grad_norm": 0.3067895770072937, "learning_rate": 2.4658209139056533e-05, "loss": 0.148, "step": 7123 }, { "epoch": 2.6307237813884785, "grad_norm": 0.24589458107948303, "learning_rate": 2.463357556349304e-05, "loss": 0.148, "step": 7124 }, { "epoch": 2.63109305760709, "grad_norm": 0.2567375898361206, "learning_rate": 2.460894198792955e-05, "loss": 0.1354, "step": 7125 }, { "epoch": 2.6314623338257017, "grad_norm": 0.31593817472457886, "learning_rate": 2.4584308412366057e-05, "loss": 0.1535, "step": 7126 }, { "epoch": 2.631831610044313, "grad_norm": 0.29164353013038635, "learning_rate": 2.455967483680256e-05, "loss": 0.1975, "step": 7127 }, { "epoch": 2.6322008862629245, "grad_norm": 0.28976204991340637, "learning_rate": 2.453504126123907e-05, "loss": 0.155, "step": 7128 }, { "epoch": 2.632570162481536, "grad_norm": 0.3702680170536041, "learning_rate": 2.4510407685675577e-05, "loss": 0.1778, "step": 7129 }, { "epoch": 2.6329394387001477, "grad_norm": 0.31121230125427246, "learning_rate": 2.4485774110112085e-05, "loss": 0.1924, "step": 7130 }, { "epoch": 2.6333087149187593, "grad_norm": 0.25303661823272705, "learning_rate": 2.446114053454859e-05, "loss": 0.1539, "step": 7131 }, { "epoch": 2.633677991137371, "grad_norm": 0.36884987354278564, "learning_rate": 2.4436506958985097e-05, "loss": 0.1691, "step": 7132 }, { "epoch": 2.6340472673559825, "grad_norm": 0.2501956522464752, "learning_rate": 2.4411873383421605e-05, "loss": 0.164, "step": 7133 }, { "epoch": 2.6344165435745936, "grad_norm": 0.27940261363983154, "learning_rate": 2.4387239807858113e-05, "loss": 0.1801, "step": 7134 }, { "epoch": 2.6347858197932053, "grad_norm": 0.25051239132881165, "learning_rate": 2.4362606232294618e-05, "loss": 0.1685, "step": 7135 }, { "epoch": 2.635155096011817, "grad_norm": 0.2795883119106293, "learning_rate": 2.4337972656731125e-05, "loss": 0.1588, "step": 7136 }, { "epoch": 2.6355243722304285, "grad_norm": 0.2900752127170563, "learning_rate": 2.4313339081167633e-05, "loss": 0.1703, "step": 7137 }, { "epoch": 2.6358936484490396, "grad_norm": 0.25440940260887146, "learning_rate": 2.428870550560414e-05, "loss": 0.1677, "step": 7138 }, { "epoch": 2.636262924667651, "grad_norm": 0.27722442150115967, "learning_rate": 2.4264071930040646e-05, "loss": 0.169, "step": 7139 }, { "epoch": 2.636632200886263, "grad_norm": 0.29610511660575867, "learning_rate": 2.4239438354477154e-05, "loss": 0.1592, "step": 7140 }, { "epoch": 2.6370014771048744, "grad_norm": 0.24841229617595673, "learning_rate": 2.421480477891366e-05, "loss": 0.1768, "step": 7141 }, { "epoch": 2.637370753323486, "grad_norm": 0.2565562129020691, "learning_rate": 2.419017120335017e-05, "loss": 0.176, "step": 7142 }, { "epoch": 2.6377400295420976, "grad_norm": 0.31593239307403564, "learning_rate": 2.4165537627786674e-05, "loss": 0.1982, "step": 7143 }, { "epoch": 2.6381093057607092, "grad_norm": 0.2690623998641968, "learning_rate": 2.414090405222318e-05, "loss": 0.1671, "step": 7144 }, { "epoch": 2.6384785819793204, "grad_norm": 0.267892450094223, "learning_rate": 2.411627047665969e-05, "loss": 0.1896, "step": 7145 }, { "epoch": 2.638847858197932, "grad_norm": 0.30485016107559204, "learning_rate": 2.4091636901096197e-05, "loss": 0.1639, "step": 7146 }, { "epoch": 2.6392171344165436, "grad_norm": 0.26398298144340515, "learning_rate": 2.4067003325532702e-05, "loss": 0.1551, "step": 7147 }, { "epoch": 2.639586410635155, "grad_norm": 0.26881563663482666, "learning_rate": 2.404236974996921e-05, "loss": 0.1686, "step": 7148 }, { "epoch": 2.6399556868537664, "grad_norm": 0.22321288287639618, "learning_rate": 2.4017736174405718e-05, "loss": 0.1431, "step": 7149 }, { "epoch": 2.640324963072378, "grad_norm": 0.25838327407836914, "learning_rate": 2.3993102598842222e-05, "loss": 0.1816, "step": 7150 }, { "epoch": 2.640324963072378, "eval_loss": 0.2477797269821167, "eval_runtime": 5.8593, "eval_samples_per_second": 8.533, "eval_steps_per_second": 1.195, "step": 7150 }, { "epoch": 2.6406942392909896, "grad_norm": 0.2646441161632538, "learning_rate": 2.396846902327873e-05, "loss": 0.1665, "step": 7151 }, { "epoch": 2.641063515509601, "grad_norm": 0.2569630742073059, "learning_rate": 2.3943835447715238e-05, "loss": 0.184, "step": 7152 }, { "epoch": 2.6414327917282128, "grad_norm": 0.27376681566238403, "learning_rate": 2.3919201872151746e-05, "loss": 0.1328, "step": 7153 }, { "epoch": 2.6418020679468244, "grad_norm": 0.2927257716655731, "learning_rate": 2.389456829658825e-05, "loss": 0.1587, "step": 7154 }, { "epoch": 2.642171344165436, "grad_norm": 0.25532981753349304, "learning_rate": 2.3869934721024758e-05, "loss": 0.1672, "step": 7155 }, { "epoch": 2.642540620384047, "grad_norm": 0.2832915484905243, "learning_rate": 2.3845301145461266e-05, "loss": 0.175, "step": 7156 }, { "epoch": 2.6429098966026587, "grad_norm": 0.23710231482982635, "learning_rate": 2.3820667569897774e-05, "loss": 0.1667, "step": 7157 }, { "epoch": 2.6432791728212703, "grad_norm": 0.25296664237976074, "learning_rate": 2.3796033994334278e-05, "loss": 0.1429, "step": 7158 }, { "epoch": 2.643648449039882, "grad_norm": 0.3041204512119293, "learning_rate": 2.3771400418770786e-05, "loss": 0.1895, "step": 7159 }, { "epoch": 2.644017725258493, "grad_norm": 0.2530307471752167, "learning_rate": 2.3746766843207294e-05, "loss": 0.1636, "step": 7160 }, { "epoch": 2.6443870014771047, "grad_norm": 0.38362157344818115, "learning_rate": 2.3722133267643802e-05, "loss": 0.2049, "step": 7161 }, { "epoch": 2.6447562776957163, "grad_norm": 0.3008923828601837, "learning_rate": 2.3697499692080306e-05, "loss": 0.1864, "step": 7162 }, { "epoch": 2.645125553914328, "grad_norm": 0.2282281070947647, "learning_rate": 2.3672866116516814e-05, "loss": 0.1311, "step": 7163 }, { "epoch": 2.6454948301329395, "grad_norm": 0.26647505164146423, "learning_rate": 2.3648232540953322e-05, "loss": 0.1801, "step": 7164 }, { "epoch": 2.645864106351551, "grad_norm": 0.3231697082519531, "learning_rate": 2.362359896538983e-05, "loss": 0.1886, "step": 7165 }, { "epoch": 2.6462333825701627, "grad_norm": 0.273769348859787, "learning_rate": 2.3598965389826334e-05, "loss": 0.1553, "step": 7166 }, { "epoch": 2.646602658788774, "grad_norm": 0.27067431807518005, "learning_rate": 2.3574331814262842e-05, "loss": 0.171, "step": 7167 }, { "epoch": 2.6469719350073855, "grad_norm": 0.40708816051483154, "learning_rate": 2.3549698238699347e-05, "loss": 0.1617, "step": 7168 }, { "epoch": 2.647341211225997, "grad_norm": 0.2595626413822174, "learning_rate": 2.3525064663135854e-05, "loss": 0.157, "step": 7169 }, { "epoch": 2.6477104874446087, "grad_norm": 0.2784646451473236, "learning_rate": 2.3500431087572362e-05, "loss": 0.1714, "step": 7170 }, { "epoch": 2.64807976366322, "grad_norm": 0.25493499636650085, "learning_rate": 2.3475797512008867e-05, "loss": 0.1634, "step": 7171 }, { "epoch": 2.6484490398818314, "grad_norm": 0.39086082577705383, "learning_rate": 2.3451163936445375e-05, "loss": 0.1879, "step": 7172 }, { "epoch": 2.648818316100443, "grad_norm": 0.24589529633522034, "learning_rate": 2.3426530360881883e-05, "loss": 0.1736, "step": 7173 }, { "epoch": 2.6491875923190547, "grad_norm": 0.23601077497005463, "learning_rate": 2.340189678531839e-05, "loss": 0.1308, "step": 7174 }, { "epoch": 2.6495568685376663, "grad_norm": 0.27976614236831665, "learning_rate": 2.3377263209754895e-05, "loss": 0.1838, "step": 7175 }, { "epoch": 2.649926144756278, "grad_norm": 0.27918046712875366, "learning_rate": 2.3352629634191403e-05, "loss": 0.1508, "step": 7176 }, { "epoch": 2.6502954209748895, "grad_norm": 0.23547807335853577, "learning_rate": 2.332799605862791e-05, "loss": 0.1569, "step": 7177 }, { "epoch": 2.6506646971935006, "grad_norm": 0.22876212000846863, "learning_rate": 2.330336248306442e-05, "loss": 0.1628, "step": 7178 }, { "epoch": 2.651033973412112, "grad_norm": 0.2528270483016968, "learning_rate": 2.3278728907500923e-05, "loss": 0.1615, "step": 7179 }, { "epoch": 2.651403249630724, "grad_norm": 0.3297088146209717, "learning_rate": 2.325409533193743e-05, "loss": 0.1569, "step": 7180 }, { "epoch": 2.6517725258493354, "grad_norm": 0.25134512782096863, "learning_rate": 2.322946175637394e-05, "loss": 0.1637, "step": 7181 }, { "epoch": 2.6521418020679466, "grad_norm": 0.2733760476112366, "learning_rate": 2.3204828180810447e-05, "loss": 0.1667, "step": 7182 }, { "epoch": 2.652511078286558, "grad_norm": 0.27315571904182434, "learning_rate": 2.318019460524695e-05, "loss": 0.1457, "step": 7183 }, { "epoch": 2.65288035450517, "grad_norm": 0.29086950421333313, "learning_rate": 2.315556102968346e-05, "loss": 0.1365, "step": 7184 }, { "epoch": 2.6532496307237814, "grad_norm": 0.304569274187088, "learning_rate": 2.3130927454119967e-05, "loss": 0.1812, "step": 7185 }, { "epoch": 2.653618906942393, "grad_norm": 0.2263062745332718, "learning_rate": 2.3106293878556475e-05, "loss": 0.1373, "step": 7186 }, { "epoch": 2.6539881831610046, "grad_norm": 0.279864639043808, "learning_rate": 2.308166030299298e-05, "loss": 0.1707, "step": 7187 }, { "epoch": 2.654357459379616, "grad_norm": 0.2562095820903778, "learning_rate": 2.3057026727429487e-05, "loss": 0.1673, "step": 7188 }, { "epoch": 2.6547267355982274, "grad_norm": 0.323538601398468, "learning_rate": 2.3032393151865995e-05, "loss": 0.2028, "step": 7189 }, { "epoch": 2.655096011816839, "grad_norm": 0.3011215925216675, "learning_rate": 2.3007759576302503e-05, "loss": 0.1953, "step": 7190 }, { "epoch": 2.6554652880354506, "grad_norm": 0.23197126388549805, "learning_rate": 2.2983126000739007e-05, "loss": 0.1471, "step": 7191 }, { "epoch": 2.655834564254062, "grad_norm": 0.27711254358291626, "learning_rate": 2.2958492425175515e-05, "loss": 0.1467, "step": 7192 }, { "epoch": 2.6562038404726733, "grad_norm": 0.3001069128513336, "learning_rate": 2.2933858849612023e-05, "loss": 0.1821, "step": 7193 }, { "epoch": 2.656573116691285, "grad_norm": 0.3010239601135254, "learning_rate": 2.290922527404853e-05, "loss": 0.1761, "step": 7194 }, { "epoch": 2.6569423929098965, "grad_norm": 0.25081583857536316, "learning_rate": 2.2884591698485035e-05, "loss": 0.1618, "step": 7195 }, { "epoch": 2.657311669128508, "grad_norm": 0.2852584719657898, "learning_rate": 2.2859958122921543e-05, "loss": 0.1478, "step": 7196 }, { "epoch": 2.6576809453471197, "grad_norm": 0.25639086961746216, "learning_rate": 2.283532454735805e-05, "loss": 0.1531, "step": 7197 }, { "epoch": 2.6580502215657313, "grad_norm": 0.26445454359054565, "learning_rate": 2.281069097179456e-05, "loss": 0.1583, "step": 7198 }, { "epoch": 2.658419497784343, "grad_norm": 0.3616959750652313, "learning_rate": 2.2786057396231063e-05, "loss": 0.1756, "step": 7199 }, { "epoch": 2.658788774002954, "grad_norm": 0.2844349443912506, "learning_rate": 2.276142382066757e-05, "loss": 0.149, "step": 7200 }, { "epoch": 2.658788774002954, "eval_loss": 0.24778319895267487, "eval_runtime": 5.8637, "eval_samples_per_second": 8.527, "eval_steps_per_second": 1.194, "step": 7200 }, { "epoch": 2.6591580502215657, "grad_norm": 0.28954705595970154, "learning_rate": 2.273679024510408e-05, "loss": 0.1493, "step": 7201 }, { "epoch": 2.6595273264401773, "grad_norm": 0.2820834517478943, "learning_rate": 2.2712156669540587e-05, "loss": 0.1543, "step": 7202 }, { "epoch": 2.659896602658789, "grad_norm": 0.2745341360569, "learning_rate": 2.268752309397709e-05, "loss": 0.1665, "step": 7203 }, { "epoch": 2.6602658788774, "grad_norm": 0.3849048912525177, "learning_rate": 2.26628895184136e-05, "loss": 0.1738, "step": 7204 }, { "epoch": 2.6606351550960117, "grad_norm": 0.2253219187259674, "learning_rate": 2.2638255942850107e-05, "loss": 0.1553, "step": 7205 }, { "epoch": 2.6610044313146233, "grad_norm": 0.2824893295764923, "learning_rate": 2.261362236728661e-05, "loss": 0.1569, "step": 7206 }, { "epoch": 2.661373707533235, "grad_norm": 0.2944081723690033, "learning_rate": 2.258898879172312e-05, "loss": 0.172, "step": 7207 }, { "epoch": 2.6617429837518465, "grad_norm": 0.26585111021995544, "learning_rate": 2.2564355216159627e-05, "loss": 0.1612, "step": 7208 }, { "epoch": 2.662112259970458, "grad_norm": 0.27536478638648987, "learning_rate": 2.2539721640596135e-05, "loss": 0.1691, "step": 7209 }, { "epoch": 2.6624815361890697, "grad_norm": 0.27154725790023804, "learning_rate": 2.251508806503264e-05, "loss": 0.1582, "step": 7210 }, { "epoch": 2.662850812407681, "grad_norm": 0.2940751612186432, "learning_rate": 2.2490454489469147e-05, "loss": 0.1803, "step": 7211 }, { "epoch": 2.6632200886262924, "grad_norm": 0.2881784737110138, "learning_rate": 2.2465820913905655e-05, "loss": 0.1549, "step": 7212 }, { "epoch": 2.663589364844904, "grad_norm": 0.23567473888397217, "learning_rate": 2.2441187338342163e-05, "loss": 0.1376, "step": 7213 }, { "epoch": 2.6639586410635157, "grad_norm": 0.277631014585495, "learning_rate": 2.2416553762778668e-05, "loss": 0.1794, "step": 7214 }, { "epoch": 2.664327917282127, "grad_norm": 0.3504440188407898, "learning_rate": 2.2391920187215176e-05, "loss": 0.1703, "step": 7215 }, { "epoch": 2.6646971935007384, "grad_norm": 0.2590697705745697, "learning_rate": 2.2367286611651683e-05, "loss": 0.1416, "step": 7216 }, { "epoch": 2.66506646971935, "grad_norm": 0.30892354249954224, "learning_rate": 2.234265303608819e-05, "loss": 0.1894, "step": 7217 }, { "epoch": 2.6654357459379616, "grad_norm": 0.3599696457386017, "learning_rate": 2.2318019460524696e-05, "loss": 0.1867, "step": 7218 }, { "epoch": 2.6658050221565732, "grad_norm": 0.3020095229148865, "learning_rate": 2.2293385884961204e-05, "loss": 0.1898, "step": 7219 }, { "epoch": 2.666174298375185, "grad_norm": 0.303668737411499, "learning_rate": 2.226875230939771e-05, "loss": 0.1608, "step": 7220 }, { "epoch": 2.6665435745937964, "grad_norm": 0.29848936200141907, "learning_rate": 2.224411873383422e-05, "loss": 0.167, "step": 7221 }, { "epoch": 2.6669128508124076, "grad_norm": 0.25844043493270874, "learning_rate": 2.2219485158270724e-05, "loss": 0.1657, "step": 7222 }, { "epoch": 2.667282127031019, "grad_norm": 0.2853298485279083, "learning_rate": 2.219485158270723e-05, "loss": 0.1865, "step": 7223 }, { "epoch": 2.667651403249631, "grad_norm": 0.2674068212509155, "learning_rate": 2.217021800714374e-05, "loss": 0.1593, "step": 7224 }, { "epoch": 2.6680206794682424, "grad_norm": 0.33980315923690796, "learning_rate": 2.2145584431580247e-05, "loss": 0.1794, "step": 7225 }, { "epoch": 2.6683899556868536, "grad_norm": 0.39625829458236694, "learning_rate": 2.2120950856016752e-05, "loss": 0.1848, "step": 7226 }, { "epoch": 2.668759231905465, "grad_norm": 0.2665499448776245, "learning_rate": 2.2096317280453256e-05, "loss": 0.1799, "step": 7227 }, { "epoch": 2.6691285081240768, "grad_norm": 0.3573761582374573, "learning_rate": 2.2071683704889764e-05, "loss": 0.1758, "step": 7228 }, { "epoch": 2.6694977843426884, "grad_norm": 0.2860005497932434, "learning_rate": 2.2047050129326272e-05, "loss": 0.1766, "step": 7229 }, { "epoch": 2.6698670605613, "grad_norm": 0.26474326848983765, "learning_rate": 2.202241655376278e-05, "loss": 0.151, "step": 7230 }, { "epoch": 2.6702363367799116, "grad_norm": 0.28622764348983765, "learning_rate": 2.1997782978199284e-05, "loss": 0.1692, "step": 7231 }, { "epoch": 2.670605612998523, "grad_norm": 0.2745191752910614, "learning_rate": 2.1973149402635792e-05, "loss": 0.2144, "step": 7232 }, { "epoch": 2.6709748892171343, "grad_norm": 0.23318861424922943, "learning_rate": 2.19485158270723e-05, "loss": 0.1593, "step": 7233 }, { "epoch": 2.671344165435746, "grad_norm": 0.29776158928871155, "learning_rate": 2.1923882251508808e-05, "loss": 0.1944, "step": 7234 }, { "epoch": 2.6717134416543575, "grad_norm": 0.2800130546092987, "learning_rate": 2.1899248675945312e-05, "loss": 0.1723, "step": 7235 }, { "epoch": 2.672082717872969, "grad_norm": 0.24548450112342834, "learning_rate": 2.187461510038182e-05, "loss": 0.1523, "step": 7236 }, { "epoch": 2.6724519940915803, "grad_norm": 0.29334062337875366, "learning_rate": 2.1849981524818328e-05, "loss": 0.1614, "step": 7237 }, { "epoch": 2.672821270310192, "grad_norm": 0.2908742427825928, "learning_rate": 2.1825347949254836e-05, "loss": 0.1684, "step": 7238 }, { "epoch": 2.6731905465288035, "grad_norm": 0.3410395085811615, "learning_rate": 2.180071437369134e-05, "loss": 0.2048, "step": 7239 }, { "epoch": 2.673559822747415, "grad_norm": 0.30411624908447266, "learning_rate": 2.177608079812785e-05, "loss": 0.2078, "step": 7240 }, { "epoch": 2.6739290989660267, "grad_norm": 0.28259730339050293, "learning_rate": 2.1751447222564356e-05, "loss": 0.1524, "step": 7241 }, { "epoch": 2.6742983751846383, "grad_norm": 0.2348640412092209, "learning_rate": 2.1726813647000864e-05, "loss": 0.1495, "step": 7242 }, { "epoch": 2.6746676514032495, "grad_norm": 0.22753603756427765, "learning_rate": 2.170218007143737e-05, "loss": 0.1608, "step": 7243 }, { "epoch": 2.675036927621861, "grad_norm": 0.2376619130373001, "learning_rate": 2.1677546495873876e-05, "loss": 0.1511, "step": 7244 }, { "epoch": 2.6754062038404727, "grad_norm": 0.2439901977777481, "learning_rate": 2.1652912920310384e-05, "loss": 0.1396, "step": 7245 }, { "epoch": 2.6757754800590843, "grad_norm": 0.4259640574455261, "learning_rate": 2.1628279344746892e-05, "loss": 0.1726, "step": 7246 }, { "epoch": 2.676144756277696, "grad_norm": 0.3889214098453522, "learning_rate": 2.1603645769183397e-05, "loss": 0.1962, "step": 7247 }, { "epoch": 2.676514032496307, "grad_norm": 0.30397069454193115, "learning_rate": 2.1579012193619905e-05, "loss": 0.173, "step": 7248 }, { "epoch": 2.6768833087149186, "grad_norm": 0.28692227602005005, "learning_rate": 2.1554378618056412e-05, "loss": 0.1692, "step": 7249 }, { "epoch": 2.6772525849335302, "grad_norm": 0.28447920083999634, "learning_rate": 2.152974504249292e-05, "loss": 0.167, "step": 7250 }, { "epoch": 2.6772525849335302, "eval_loss": 0.24530024826526642, "eval_runtime": 5.8478, "eval_samples_per_second": 8.55, "eval_steps_per_second": 1.197, "step": 7250 }, { "epoch": 2.677621861152142, "grad_norm": 0.2785339951515198, "learning_rate": 2.1505111466929425e-05, "loss": 0.2095, "step": 7251 }, { "epoch": 2.6779911373707534, "grad_norm": 0.22790326178073883, "learning_rate": 2.1480477891365933e-05, "loss": 0.1717, "step": 7252 }, { "epoch": 2.678360413589365, "grad_norm": 0.29789429903030396, "learning_rate": 2.145584431580244e-05, "loss": 0.1673, "step": 7253 }, { "epoch": 2.678729689807976, "grad_norm": 0.24409626424312592, "learning_rate": 2.143121074023895e-05, "loss": 0.1377, "step": 7254 }, { "epoch": 2.679098966026588, "grad_norm": 0.2257269024848938, "learning_rate": 2.1406577164675453e-05, "loss": 0.154, "step": 7255 }, { "epoch": 2.6794682422451994, "grad_norm": 0.3350788950920105, "learning_rate": 2.138194358911196e-05, "loss": 0.1946, "step": 7256 }, { "epoch": 2.679837518463811, "grad_norm": 0.25675123929977417, "learning_rate": 2.135731001354847e-05, "loss": 0.1748, "step": 7257 }, { "epoch": 2.680206794682422, "grad_norm": 0.3013542890548706, "learning_rate": 2.1332676437984976e-05, "loss": 0.1649, "step": 7258 }, { "epoch": 2.680576070901034, "grad_norm": 0.27731192111968994, "learning_rate": 2.130804286242148e-05, "loss": 0.1722, "step": 7259 }, { "epoch": 2.6809453471196454, "grad_norm": 0.24194374680519104, "learning_rate": 2.128340928685799e-05, "loss": 0.1598, "step": 7260 }, { "epoch": 2.681314623338257, "grad_norm": 0.32079851627349854, "learning_rate": 2.1258775711294497e-05, "loss": 0.1722, "step": 7261 }, { "epoch": 2.6816838995568686, "grad_norm": 0.24001316726207733, "learning_rate": 2.1234142135731004e-05, "loss": 0.1541, "step": 7262 }, { "epoch": 2.68205317577548, "grad_norm": 0.36363720893859863, "learning_rate": 2.120950856016751e-05, "loss": 0.1742, "step": 7263 }, { "epoch": 2.682422451994092, "grad_norm": 0.33500853180885315, "learning_rate": 2.1184874984604017e-05, "loss": 0.2092, "step": 7264 }, { "epoch": 2.682791728212703, "grad_norm": 0.34495463967323303, "learning_rate": 2.1160241409040525e-05, "loss": 0.204, "step": 7265 }, { "epoch": 2.6831610044313146, "grad_norm": 0.24614976346492767, "learning_rate": 2.113560783347703e-05, "loss": 0.1542, "step": 7266 }, { "epoch": 2.683530280649926, "grad_norm": 0.26398226618766785, "learning_rate": 2.1110974257913537e-05, "loss": 0.1702, "step": 7267 }, { "epoch": 2.6838995568685378, "grad_norm": 0.3053957223892212, "learning_rate": 2.1086340682350045e-05, "loss": 0.1601, "step": 7268 }, { "epoch": 2.684268833087149, "grad_norm": 0.30904287099838257, "learning_rate": 2.1061707106786553e-05, "loss": 0.1975, "step": 7269 }, { "epoch": 2.6846381093057605, "grad_norm": 0.269345223903656, "learning_rate": 2.1037073531223057e-05, "loss": 0.1854, "step": 7270 }, { "epoch": 2.685007385524372, "grad_norm": 0.26761114597320557, "learning_rate": 2.1012439955659565e-05, "loss": 0.1706, "step": 7271 }, { "epoch": 2.6853766617429837, "grad_norm": 0.25350290536880493, "learning_rate": 2.0987806380096073e-05, "loss": 0.1604, "step": 7272 }, { "epoch": 2.6857459379615953, "grad_norm": 0.25855568051338196, "learning_rate": 2.096317280453258e-05, "loss": 0.1843, "step": 7273 }, { "epoch": 2.686115214180207, "grad_norm": 0.26478880643844604, "learning_rate": 2.0938539228969085e-05, "loss": 0.1578, "step": 7274 }, { "epoch": 2.6864844903988185, "grad_norm": 0.2511434853076935, "learning_rate": 2.0913905653405593e-05, "loss": 0.1706, "step": 7275 }, { "epoch": 2.6868537666174297, "grad_norm": 0.2608022689819336, "learning_rate": 2.08892720778421e-05, "loss": 0.1684, "step": 7276 }, { "epoch": 2.6872230428360413, "grad_norm": 0.2644890248775482, "learning_rate": 2.086463850227861e-05, "loss": 0.1594, "step": 7277 }, { "epoch": 2.687592319054653, "grad_norm": 0.31899401545524597, "learning_rate": 2.0840004926715113e-05, "loss": 0.1843, "step": 7278 }, { "epoch": 2.6879615952732645, "grad_norm": 0.2581588923931122, "learning_rate": 2.081537135115162e-05, "loss": 0.1698, "step": 7279 }, { "epoch": 2.6883308714918757, "grad_norm": 0.30643972754478455, "learning_rate": 2.079073777558813e-05, "loss": 0.1924, "step": 7280 }, { "epoch": 2.6887001477104873, "grad_norm": 0.24889962375164032, "learning_rate": 2.0766104200024637e-05, "loss": 0.1481, "step": 7281 }, { "epoch": 2.689069423929099, "grad_norm": 0.2676466107368469, "learning_rate": 2.074147062446114e-05, "loss": 0.1592, "step": 7282 }, { "epoch": 2.6894387001477105, "grad_norm": 0.2941732108592987, "learning_rate": 2.071683704889765e-05, "loss": 0.1657, "step": 7283 }, { "epoch": 2.689807976366322, "grad_norm": 0.3045092225074768, "learning_rate": 2.0692203473334154e-05, "loss": 0.1631, "step": 7284 }, { "epoch": 2.6901772525849337, "grad_norm": 0.24443352222442627, "learning_rate": 2.066756989777066e-05, "loss": 0.1507, "step": 7285 }, { "epoch": 2.6905465288035453, "grad_norm": 0.2525525391101837, "learning_rate": 2.064293632220717e-05, "loss": 0.1475, "step": 7286 }, { "epoch": 2.6909158050221564, "grad_norm": 0.26321685314178467, "learning_rate": 2.0618302746643674e-05, "loss": 0.1668, "step": 7287 }, { "epoch": 2.691285081240768, "grad_norm": 0.28742846846580505, "learning_rate": 2.0593669171080182e-05, "loss": 0.1562, "step": 7288 }, { "epoch": 2.6916543574593796, "grad_norm": 0.22838294506072998, "learning_rate": 2.056903559551669e-05, "loss": 0.1531, "step": 7289 }, { "epoch": 2.6920236336779912, "grad_norm": 0.22701244056224823, "learning_rate": 2.0544402019953198e-05, "loss": 0.1411, "step": 7290 }, { "epoch": 2.6923929098966024, "grad_norm": 0.27404719591140747, "learning_rate": 2.0519768444389702e-05, "loss": 0.1716, "step": 7291 }, { "epoch": 2.692762186115214, "grad_norm": 0.25083160400390625, "learning_rate": 2.049513486882621e-05, "loss": 0.1655, "step": 7292 }, { "epoch": 2.6931314623338256, "grad_norm": 0.23635615408420563, "learning_rate": 2.0470501293262718e-05, "loss": 0.1516, "step": 7293 }, { "epoch": 2.693500738552437, "grad_norm": 0.257691890001297, "learning_rate": 2.0445867717699226e-05, "loss": 0.147, "step": 7294 }, { "epoch": 2.693870014771049, "grad_norm": 0.27022585272789, "learning_rate": 2.042123414213573e-05, "loss": 0.1873, "step": 7295 }, { "epoch": 2.6942392909896604, "grad_norm": 0.2678462564945221, "learning_rate": 2.0396600566572238e-05, "loss": 0.1589, "step": 7296 }, { "epoch": 2.694608567208272, "grad_norm": 0.2651064991950989, "learning_rate": 2.0371966991008746e-05, "loss": 0.1668, "step": 7297 }, { "epoch": 2.694977843426883, "grad_norm": 0.28784722089767456, "learning_rate": 2.0347333415445254e-05, "loss": 0.1801, "step": 7298 }, { "epoch": 2.695347119645495, "grad_norm": 0.2588948607444763, "learning_rate": 2.0322699839881758e-05, "loss": 0.1739, "step": 7299 }, { "epoch": 2.6957163958641064, "grad_norm": 0.21159175038337708, "learning_rate": 2.0298066264318266e-05, "loss": 0.1383, "step": 7300 }, { "epoch": 2.6957163958641064, "eval_loss": 0.2470039576292038, "eval_runtime": 5.8534, "eval_samples_per_second": 8.542, "eval_steps_per_second": 1.196, "step": 7300 }, { "epoch": 2.696085672082718, "grad_norm": 0.2615770399570465, "learning_rate": 2.0273432688754774e-05, "loss": 0.1607, "step": 7301 }, { "epoch": 2.696454948301329, "grad_norm": 0.2541896402835846, "learning_rate": 2.0248799113191282e-05, "loss": 0.1464, "step": 7302 }, { "epoch": 2.6968242245199407, "grad_norm": 0.22565540671348572, "learning_rate": 2.0224165537627786e-05, "loss": 0.1452, "step": 7303 }, { "epoch": 2.6971935007385524, "grad_norm": 0.22654181718826294, "learning_rate": 2.0199531962064294e-05, "loss": 0.1684, "step": 7304 }, { "epoch": 2.697562776957164, "grad_norm": 0.2730237543582916, "learning_rate": 2.0174898386500802e-05, "loss": 0.1788, "step": 7305 }, { "epoch": 2.6979320531757756, "grad_norm": 0.24497225880622864, "learning_rate": 2.015026481093731e-05, "loss": 0.1749, "step": 7306 }, { "epoch": 2.698301329394387, "grad_norm": 0.27761363983154297, "learning_rate": 2.0125631235373814e-05, "loss": 0.1719, "step": 7307 }, { "epoch": 2.6986706056129988, "grad_norm": 0.25468045473098755, "learning_rate": 2.0100997659810322e-05, "loss": 0.175, "step": 7308 }, { "epoch": 2.69903988183161, "grad_norm": 0.23648743331432343, "learning_rate": 2.007636408424683e-05, "loss": 0.1639, "step": 7309 }, { "epoch": 2.6994091580502215, "grad_norm": 0.25075390934944153, "learning_rate": 2.0051730508683338e-05, "loss": 0.1551, "step": 7310 }, { "epoch": 2.699778434268833, "grad_norm": 0.2640094459056854, "learning_rate": 2.0027096933119842e-05, "loss": 0.1693, "step": 7311 }, { "epoch": 2.7001477104874447, "grad_norm": 0.2702077627182007, "learning_rate": 2.000246335755635e-05, "loss": 0.1599, "step": 7312 }, { "epoch": 2.700516986706056, "grad_norm": 0.25153788924217224, "learning_rate": 1.9977829781992858e-05, "loss": 0.1471, "step": 7313 }, { "epoch": 2.7008862629246675, "grad_norm": 0.2792302668094635, "learning_rate": 1.9953196206429366e-05, "loss": 0.1658, "step": 7314 }, { "epoch": 2.701255539143279, "grad_norm": 0.2735605239868164, "learning_rate": 1.992856263086587e-05, "loss": 0.1621, "step": 7315 }, { "epoch": 2.7016248153618907, "grad_norm": 0.3304271697998047, "learning_rate": 1.9903929055302378e-05, "loss": 0.1854, "step": 7316 }, { "epoch": 2.7019940915805023, "grad_norm": 0.31572744250297546, "learning_rate": 1.9879295479738886e-05, "loss": 0.1842, "step": 7317 }, { "epoch": 2.702363367799114, "grad_norm": 0.28819313645362854, "learning_rate": 1.9854661904175394e-05, "loss": 0.181, "step": 7318 }, { "epoch": 2.7027326440177255, "grad_norm": 0.20998355746269226, "learning_rate": 1.98300283286119e-05, "loss": 0.1389, "step": 7319 }, { "epoch": 2.7031019202363367, "grad_norm": 0.2347504198551178, "learning_rate": 1.9805394753048406e-05, "loss": 0.1474, "step": 7320 }, { "epoch": 2.7034711964549483, "grad_norm": 0.271361768245697, "learning_rate": 1.9780761177484914e-05, "loss": 0.1576, "step": 7321 }, { "epoch": 2.70384047267356, "grad_norm": 0.26428288221359253, "learning_rate": 1.975612760192142e-05, "loss": 0.1707, "step": 7322 }, { "epoch": 2.7042097488921715, "grad_norm": 0.2701990604400635, "learning_rate": 1.9731494026357927e-05, "loss": 0.1931, "step": 7323 }, { "epoch": 2.7045790251107826, "grad_norm": 0.2151554822921753, "learning_rate": 1.9706860450794434e-05, "loss": 0.1538, "step": 7324 }, { "epoch": 2.7049483013293942, "grad_norm": 0.4270957410335541, "learning_rate": 1.9682226875230942e-05, "loss": 0.1852, "step": 7325 }, { "epoch": 2.705317577548006, "grad_norm": 0.26941752433776855, "learning_rate": 1.9657593299667447e-05, "loss": 0.1573, "step": 7326 }, { "epoch": 2.7056868537666174, "grad_norm": 0.30312639474868774, "learning_rate": 1.9632959724103955e-05, "loss": 0.1752, "step": 7327 }, { "epoch": 2.706056129985229, "grad_norm": 0.436477392911911, "learning_rate": 1.9608326148540462e-05, "loss": 0.1593, "step": 7328 }, { "epoch": 2.7064254062038406, "grad_norm": 0.2763493061065674, "learning_rate": 1.958369257297697e-05, "loss": 0.1656, "step": 7329 }, { "epoch": 2.7067946824224522, "grad_norm": 0.30371522903442383, "learning_rate": 1.9559058997413475e-05, "loss": 0.1905, "step": 7330 }, { "epoch": 2.7071639586410634, "grad_norm": 0.30406200885772705, "learning_rate": 1.9534425421849983e-05, "loss": 0.1813, "step": 7331 }, { "epoch": 2.707533234859675, "grad_norm": 0.2895963191986084, "learning_rate": 1.950979184628649e-05, "loss": 0.1681, "step": 7332 }, { "epoch": 2.7079025110782866, "grad_norm": 0.23385562002658844, "learning_rate": 1.9485158270723e-05, "loss": 0.1351, "step": 7333 }, { "epoch": 2.708271787296898, "grad_norm": 0.2363734096288681, "learning_rate": 1.9460524695159503e-05, "loss": 0.1661, "step": 7334 }, { "epoch": 2.7086410635155094, "grad_norm": 0.2647867798805237, "learning_rate": 1.943589111959601e-05, "loss": 0.1799, "step": 7335 }, { "epoch": 2.709010339734121, "grad_norm": 0.25815582275390625, "learning_rate": 1.941125754403252e-05, "loss": 0.1675, "step": 7336 }, { "epoch": 2.7093796159527326, "grad_norm": 0.31180018186569214, "learning_rate": 1.9386623968469026e-05, "loss": 0.1898, "step": 7337 }, { "epoch": 2.709748892171344, "grad_norm": 0.29835644364356995, "learning_rate": 1.936199039290553e-05, "loss": 0.1743, "step": 7338 }, { "epoch": 2.710118168389956, "grad_norm": 0.247036874294281, "learning_rate": 1.933735681734204e-05, "loss": 0.1429, "step": 7339 }, { "epoch": 2.7104874446085674, "grad_norm": 0.22366099059581757, "learning_rate": 1.9312723241778547e-05, "loss": 0.1471, "step": 7340 }, { "epoch": 2.710856720827179, "grad_norm": 0.2728867828845978, "learning_rate": 1.9288089666215054e-05, "loss": 0.1505, "step": 7341 }, { "epoch": 2.71122599704579, "grad_norm": 0.27170485258102417, "learning_rate": 1.926345609065156e-05, "loss": 0.1776, "step": 7342 }, { "epoch": 2.7115952732644018, "grad_norm": 0.22643336653709412, "learning_rate": 1.9238822515088063e-05, "loss": 0.1323, "step": 7343 }, { "epoch": 2.7119645494830134, "grad_norm": 0.3507428765296936, "learning_rate": 1.921418893952457e-05, "loss": 0.1854, "step": 7344 }, { "epoch": 2.712333825701625, "grad_norm": 0.2567720413208008, "learning_rate": 1.918955536396108e-05, "loss": 0.1701, "step": 7345 }, { "epoch": 2.712703101920236, "grad_norm": 0.280446320772171, "learning_rate": 1.9164921788397587e-05, "loss": 0.1639, "step": 7346 }, { "epoch": 2.7130723781388477, "grad_norm": 0.23421820998191833, "learning_rate": 1.914028821283409e-05, "loss": 0.1595, "step": 7347 }, { "epoch": 2.7134416543574593, "grad_norm": 0.2823607325553894, "learning_rate": 1.91156546372706e-05, "loss": 0.1668, "step": 7348 }, { "epoch": 2.713810930576071, "grad_norm": 0.28469452261924744, "learning_rate": 1.9091021061707107e-05, "loss": 0.1685, "step": 7349 }, { "epoch": 2.7141802067946825, "grad_norm": 0.31783851981163025, "learning_rate": 1.9066387486143615e-05, "loss": 0.1937, "step": 7350 }, { "epoch": 2.7141802067946825, "eval_loss": 0.24873405694961548, "eval_runtime": 5.854, "eval_samples_per_second": 8.541, "eval_steps_per_second": 1.196, "step": 7350 }, { "epoch": 2.714549483013294, "grad_norm": 0.30338403582572937, "learning_rate": 1.904175391058012e-05, "loss": 0.1554, "step": 7351 }, { "epoch": 2.7149187592319057, "grad_norm": 0.2567841708660126, "learning_rate": 1.9017120335016627e-05, "loss": 0.1511, "step": 7352 }, { "epoch": 2.715288035450517, "grad_norm": 0.2727099657058716, "learning_rate": 1.8992486759453135e-05, "loss": 0.2034, "step": 7353 }, { "epoch": 2.7156573116691285, "grad_norm": 0.27551233768463135, "learning_rate": 1.8967853183889643e-05, "loss": 0.1578, "step": 7354 }, { "epoch": 2.71602658788774, "grad_norm": 0.25147584080696106, "learning_rate": 1.8943219608326148e-05, "loss": 0.1446, "step": 7355 }, { "epoch": 2.7163958641063517, "grad_norm": 0.27308040857315063, "learning_rate": 1.8918586032762656e-05, "loss": 0.159, "step": 7356 }, { "epoch": 2.716765140324963, "grad_norm": 0.30654796957969666, "learning_rate": 1.8893952457199163e-05, "loss": 0.1725, "step": 7357 }, { "epoch": 2.7171344165435745, "grad_norm": 0.4544277489185333, "learning_rate": 1.886931888163567e-05, "loss": 0.2442, "step": 7358 }, { "epoch": 2.717503692762186, "grad_norm": 0.3067072331905365, "learning_rate": 1.8844685306072176e-05, "loss": 0.1859, "step": 7359 }, { "epoch": 2.7178729689807977, "grad_norm": 0.24779309332370758, "learning_rate": 1.8820051730508684e-05, "loss": 0.1556, "step": 7360 }, { "epoch": 2.7182422451994093, "grad_norm": 0.29944664239883423, "learning_rate": 1.879541815494519e-05, "loss": 0.1543, "step": 7361 }, { "epoch": 2.718611521418021, "grad_norm": 0.2566697597503662, "learning_rate": 1.87707845793817e-05, "loss": 0.176, "step": 7362 }, { "epoch": 2.7189807976366325, "grad_norm": 0.26690617203712463, "learning_rate": 1.8746151003818204e-05, "loss": 0.1493, "step": 7363 }, { "epoch": 2.7193500738552436, "grad_norm": 0.25145423412323, "learning_rate": 1.872151742825471e-05, "loss": 0.1742, "step": 7364 }, { "epoch": 2.7197193500738552, "grad_norm": 0.258533775806427, "learning_rate": 1.869688385269122e-05, "loss": 0.172, "step": 7365 }, { "epoch": 2.720088626292467, "grad_norm": 0.24946877360343933, "learning_rate": 1.8672250277127727e-05, "loss": 0.1633, "step": 7366 }, { "epoch": 2.7204579025110784, "grad_norm": 0.2537067234516144, "learning_rate": 1.8647616701564232e-05, "loss": 0.1564, "step": 7367 }, { "epoch": 2.7208271787296896, "grad_norm": 0.3265269994735718, "learning_rate": 1.862298312600074e-05, "loss": 0.1767, "step": 7368 }, { "epoch": 2.721196454948301, "grad_norm": 0.3031497299671173, "learning_rate": 1.8598349550437248e-05, "loss": 0.1716, "step": 7369 }, { "epoch": 2.721565731166913, "grad_norm": 0.2572455108165741, "learning_rate": 1.8573715974873755e-05, "loss": 0.1475, "step": 7370 }, { "epoch": 2.7219350073855244, "grad_norm": 0.25037527084350586, "learning_rate": 1.854908239931026e-05, "loss": 0.161, "step": 7371 }, { "epoch": 2.722304283604136, "grad_norm": 0.3232778310775757, "learning_rate": 1.8524448823746768e-05, "loss": 0.1806, "step": 7372 }, { "epoch": 2.7226735598227476, "grad_norm": 0.2545141875743866, "learning_rate": 1.8499815248183276e-05, "loss": 0.1484, "step": 7373 }, { "epoch": 2.7230428360413588, "grad_norm": 0.2592243552207947, "learning_rate": 1.8475181672619783e-05, "loss": 0.1487, "step": 7374 }, { "epoch": 2.7234121122599704, "grad_norm": 0.29469770193099976, "learning_rate": 1.8450548097056288e-05, "loss": 0.1554, "step": 7375 }, { "epoch": 2.723781388478582, "grad_norm": 0.27507486939430237, "learning_rate": 1.8425914521492796e-05, "loss": 0.1729, "step": 7376 }, { "epoch": 2.7241506646971936, "grad_norm": 0.30150166153907776, "learning_rate": 1.8401280945929304e-05, "loss": 0.1543, "step": 7377 }, { "epoch": 2.724519940915805, "grad_norm": 0.27005499601364136, "learning_rate": 1.8376647370365808e-05, "loss": 0.1503, "step": 7378 }, { "epoch": 2.7248892171344163, "grad_norm": 0.2621898651123047, "learning_rate": 1.8352013794802316e-05, "loss": 0.1894, "step": 7379 }, { "epoch": 2.725258493353028, "grad_norm": 0.2828858196735382, "learning_rate": 1.8327380219238824e-05, "loss": 0.1733, "step": 7380 }, { "epoch": 2.7256277695716395, "grad_norm": 0.27500414848327637, "learning_rate": 1.8302746643675332e-05, "loss": 0.214, "step": 7381 }, { "epoch": 2.725997045790251, "grad_norm": 0.2737962305545807, "learning_rate": 1.8278113068111836e-05, "loss": 0.1658, "step": 7382 }, { "epoch": 2.7263663220088628, "grad_norm": 0.22681257128715515, "learning_rate": 1.8253479492548344e-05, "loss": 0.1812, "step": 7383 }, { "epoch": 2.7267355982274744, "grad_norm": 0.2882000505924225, "learning_rate": 1.8228845916984852e-05, "loss": 0.1827, "step": 7384 }, { "epoch": 2.7271048744460855, "grad_norm": 0.23809807002544403, "learning_rate": 1.820421234142136e-05, "loss": 0.1305, "step": 7385 }, { "epoch": 2.727474150664697, "grad_norm": 0.298092246055603, "learning_rate": 1.8179578765857864e-05, "loss": 0.1688, "step": 7386 }, { "epoch": 2.7278434268833087, "grad_norm": 0.2639222741127014, "learning_rate": 1.8154945190294372e-05, "loss": 0.1739, "step": 7387 }, { "epoch": 2.7282127031019203, "grad_norm": 0.2812730073928833, "learning_rate": 1.813031161473088e-05, "loss": 0.1453, "step": 7388 }, { "epoch": 2.7285819793205315, "grad_norm": 0.23684032261371613, "learning_rate": 1.8105678039167388e-05, "loss": 0.1694, "step": 7389 }, { "epoch": 2.728951255539143, "grad_norm": 0.2545001208782196, "learning_rate": 1.8081044463603892e-05, "loss": 0.1579, "step": 7390 }, { "epoch": 2.7293205317577547, "grad_norm": 0.3009130656719208, "learning_rate": 1.80564108880404e-05, "loss": 0.1695, "step": 7391 }, { "epoch": 2.7296898079763663, "grad_norm": 0.2816237509250641, "learning_rate": 1.8031777312476908e-05, "loss": 0.1671, "step": 7392 }, { "epoch": 2.730059084194978, "grad_norm": 0.23764042556285858, "learning_rate": 1.8007143736913416e-05, "loss": 0.1511, "step": 7393 }, { "epoch": 2.7304283604135895, "grad_norm": 0.3010806143283844, "learning_rate": 1.798251016134992e-05, "loss": 0.1841, "step": 7394 }, { "epoch": 2.730797636632201, "grad_norm": 0.29286661744117737, "learning_rate": 1.7957876585786428e-05, "loss": 0.1628, "step": 7395 }, { "epoch": 2.7311669128508123, "grad_norm": 0.25835737586021423, "learning_rate": 1.7933243010222936e-05, "loss": 0.1481, "step": 7396 }, { "epoch": 2.731536189069424, "grad_norm": 0.29753971099853516, "learning_rate": 1.7908609434659444e-05, "loss": 0.1715, "step": 7397 }, { "epoch": 2.7319054652880355, "grad_norm": 0.3576243817806244, "learning_rate": 1.788397585909595e-05, "loss": 0.1985, "step": 7398 }, { "epoch": 2.732274741506647, "grad_norm": 0.2470478117465973, "learning_rate": 1.7859342283532456e-05, "loss": 0.1491, "step": 7399 }, { "epoch": 2.7326440177252582, "grad_norm": 0.31921714544296265, "learning_rate": 1.783470870796896e-05, "loss": 0.1799, "step": 7400 }, { "epoch": 2.7326440177252582, "eval_loss": 0.24810636043548584, "eval_runtime": 5.8585, "eval_samples_per_second": 8.535, "eval_steps_per_second": 1.195, "step": 7400 }, { "epoch": 2.73301329394387, "grad_norm": 0.23354244232177734, "learning_rate": 1.781007513240547e-05, "loss": 0.1468, "step": 7401 }, { "epoch": 2.7333825701624814, "grad_norm": 0.2450440526008606, "learning_rate": 1.7785441556841977e-05, "loss": 0.1508, "step": 7402 }, { "epoch": 2.733751846381093, "grad_norm": 0.27826738357543945, "learning_rate": 1.776080798127848e-05, "loss": 0.1648, "step": 7403 }, { "epoch": 2.7341211225997046, "grad_norm": 0.28776785731315613, "learning_rate": 1.773617440571499e-05, "loss": 0.1778, "step": 7404 }, { "epoch": 2.7344903988183162, "grad_norm": 0.29564327001571655, "learning_rate": 1.7711540830151497e-05, "loss": 0.1558, "step": 7405 }, { "epoch": 2.734859675036928, "grad_norm": 0.2605178952217102, "learning_rate": 1.7686907254588005e-05, "loss": 0.1625, "step": 7406 }, { "epoch": 2.735228951255539, "grad_norm": 0.3336915969848633, "learning_rate": 1.766227367902451e-05, "loss": 0.1909, "step": 7407 }, { "epoch": 2.7355982274741506, "grad_norm": 0.2898995578289032, "learning_rate": 1.7637640103461017e-05, "loss": 0.1698, "step": 7408 }, { "epoch": 2.735967503692762, "grad_norm": 0.30992692708969116, "learning_rate": 1.7613006527897525e-05, "loss": 0.175, "step": 7409 }, { "epoch": 2.736336779911374, "grad_norm": 0.23993481695652008, "learning_rate": 1.7588372952334033e-05, "loss": 0.1604, "step": 7410 }, { "epoch": 2.736706056129985, "grad_norm": 0.2633693516254425, "learning_rate": 1.7563739376770537e-05, "loss": 0.1417, "step": 7411 }, { "epoch": 2.7370753323485966, "grad_norm": 0.30885404348373413, "learning_rate": 1.7539105801207045e-05, "loss": 0.1609, "step": 7412 }, { "epoch": 2.737444608567208, "grad_norm": 0.3195990324020386, "learning_rate": 1.7514472225643553e-05, "loss": 0.179, "step": 7413 }, { "epoch": 2.7378138847858198, "grad_norm": 0.2127685546875, "learning_rate": 1.748983865008006e-05, "loss": 0.1614, "step": 7414 }, { "epoch": 2.7381831610044314, "grad_norm": 0.2454635202884674, "learning_rate": 1.7465205074516565e-05, "loss": 0.1538, "step": 7415 }, { "epoch": 2.738552437223043, "grad_norm": 0.265864759683609, "learning_rate": 1.7440571498953073e-05, "loss": 0.162, "step": 7416 }, { "epoch": 2.7389217134416546, "grad_norm": 0.2266344577074051, "learning_rate": 1.741593792338958e-05, "loss": 0.1592, "step": 7417 }, { "epoch": 2.7392909896602657, "grad_norm": 0.28104716539382935, "learning_rate": 1.739130434782609e-05, "loss": 0.1689, "step": 7418 }, { "epoch": 2.7396602658788773, "grad_norm": 0.2466869205236435, "learning_rate": 1.7366670772262593e-05, "loss": 0.15, "step": 7419 }, { "epoch": 2.740029542097489, "grad_norm": 0.2613917589187622, "learning_rate": 1.73420371966991e-05, "loss": 0.1626, "step": 7420 }, { "epoch": 2.7403988183161005, "grad_norm": 0.3566192388534546, "learning_rate": 1.731740362113561e-05, "loss": 0.1522, "step": 7421 }, { "epoch": 2.7407680945347117, "grad_norm": 0.3045487701892853, "learning_rate": 1.7292770045572117e-05, "loss": 0.1572, "step": 7422 }, { "epoch": 2.7411373707533233, "grad_norm": 0.2889017164707184, "learning_rate": 1.726813647000862e-05, "loss": 0.1947, "step": 7423 }, { "epoch": 2.741506646971935, "grad_norm": 0.2684076428413391, "learning_rate": 1.724350289444513e-05, "loss": 0.1695, "step": 7424 }, { "epoch": 2.7418759231905465, "grad_norm": 0.3007463812828064, "learning_rate": 1.7218869318881637e-05, "loss": 0.1749, "step": 7425 }, { "epoch": 2.742245199409158, "grad_norm": 0.28021693229675293, "learning_rate": 1.7194235743318145e-05, "loss": 0.163, "step": 7426 }, { "epoch": 2.7426144756277697, "grad_norm": 0.32350239157676697, "learning_rate": 1.716960216775465e-05, "loss": 0.1792, "step": 7427 }, { "epoch": 2.7429837518463813, "grad_norm": 0.23683792352676392, "learning_rate": 1.7144968592191157e-05, "loss": 0.1555, "step": 7428 }, { "epoch": 2.7433530280649925, "grad_norm": 0.31598547101020813, "learning_rate": 1.7120335016627665e-05, "loss": 0.1817, "step": 7429 }, { "epoch": 2.743722304283604, "grad_norm": 0.29410770535469055, "learning_rate": 1.7095701441064173e-05, "loss": 0.1754, "step": 7430 }, { "epoch": 2.7440915805022157, "grad_norm": 0.3389756381511688, "learning_rate": 1.7071067865500677e-05, "loss": 0.192, "step": 7431 }, { "epoch": 2.7444608567208273, "grad_norm": 0.41600510478019714, "learning_rate": 1.7046434289937185e-05, "loss": 0.2017, "step": 7432 }, { "epoch": 2.7448301329394384, "grad_norm": 0.25756680965423584, "learning_rate": 1.7021800714373693e-05, "loss": 0.1537, "step": 7433 }, { "epoch": 2.74519940915805, "grad_norm": 0.2631795108318329, "learning_rate": 1.69971671388102e-05, "loss": 0.1586, "step": 7434 }, { "epoch": 2.7455686853766617, "grad_norm": 0.2646562159061432, "learning_rate": 1.6972533563246706e-05, "loss": 0.1676, "step": 7435 }, { "epoch": 2.7459379615952733, "grad_norm": 0.2485722452402115, "learning_rate": 1.6947899987683213e-05, "loss": 0.1466, "step": 7436 }, { "epoch": 2.746307237813885, "grad_norm": 0.28804153203964233, "learning_rate": 1.692326641211972e-05, "loss": 0.1761, "step": 7437 }, { "epoch": 2.7466765140324965, "grad_norm": 0.3488936722278595, "learning_rate": 1.6898632836556226e-05, "loss": 0.1878, "step": 7438 }, { "epoch": 2.747045790251108, "grad_norm": 0.2528108060359955, "learning_rate": 1.6873999260992734e-05, "loss": 0.1571, "step": 7439 }, { "epoch": 2.7474150664697192, "grad_norm": 0.2982749342918396, "learning_rate": 1.684936568542924e-05, "loss": 0.1861, "step": 7440 }, { "epoch": 2.747784342688331, "grad_norm": 0.3304920196533203, "learning_rate": 1.682473210986575e-05, "loss": 0.1772, "step": 7441 }, { "epoch": 2.7481536189069424, "grad_norm": 0.23332808911800385, "learning_rate": 1.6800098534302254e-05, "loss": 0.1465, "step": 7442 }, { "epoch": 2.748522895125554, "grad_norm": 0.24102748930454254, "learning_rate": 1.677546495873876e-05, "loss": 0.1623, "step": 7443 }, { "epoch": 2.748892171344165, "grad_norm": 0.3148341476917267, "learning_rate": 1.675083138317527e-05, "loss": 0.1796, "step": 7444 }, { "epoch": 2.749261447562777, "grad_norm": 0.29295220971107483, "learning_rate": 1.6726197807611777e-05, "loss": 0.1627, "step": 7445 }, { "epoch": 2.7496307237813884, "grad_norm": 0.2994075119495392, "learning_rate": 1.6701564232048282e-05, "loss": 0.1813, "step": 7446 }, { "epoch": 2.75, "grad_norm": 0.2487352341413498, "learning_rate": 1.667693065648479e-05, "loss": 0.1868, "step": 7447 }, { "epoch": 2.7503692762186116, "grad_norm": 0.2617223560810089, "learning_rate": 1.6652297080921298e-05, "loss": 0.1762, "step": 7448 }, { "epoch": 2.750738552437223, "grad_norm": 0.25977933406829834, "learning_rate": 1.6627663505357805e-05, "loss": 0.1609, "step": 7449 }, { "epoch": 2.751107828655835, "grad_norm": 0.30660438537597656, "learning_rate": 1.660302992979431e-05, "loss": 0.1684, "step": 7450 }, { "epoch": 2.751107828655835, "eval_loss": 0.24653255939483643, "eval_runtime": 5.8482, "eval_samples_per_second": 8.55, "eval_steps_per_second": 1.197, "step": 7450 }, { "epoch": 2.751477104874446, "grad_norm": 0.27738091349601746, "learning_rate": 1.6578396354230818e-05, "loss": 0.1523, "step": 7451 }, { "epoch": 2.7518463810930576, "grad_norm": 0.3045938313007355, "learning_rate": 1.6553762778667326e-05, "loss": 0.1769, "step": 7452 }, { "epoch": 2.752215657311669, "grad_norm": 0.2948349714279175, "learning_rate": 1.6529129203103834e-05, "loss": 0.1821, "step": 7453 }, { "epoch": 2.7525849335302808, "grad_norm": 0.26535508036613464, "learning_rate": 1.6504495627540338e-05, "loss": 0.1579, "step": 7454 }, { "epoch": 2.752954209748892, "grad_norm": 0.2517685294151306, "learning_rate": 1.6479862051976846e-05, "loss": 0.1482, "step": 7455 }, { "epoch": 2.7533234859675035, "grad_norm": 0.270142525434494, "learning_rate": 1.6455228476413354e-05, "loss": 0.1756, "step": 7456 }, { "epoch": 2.753692762186115, "grad_norm": 0.2596772015094757, "learning_rate": 1.643059490084986e-05, "loss": 0.142, "step": 7457 }, { "epoch": 2.7540620384047267, "grad_norm": 0.28529229760169983, "learning_rate": 1.6405961325286366e-05, "loss": 0.1777, "step": 7458 }, { "epoch": 2.7544313146233383, "grad_norm": 0.23346681892871857, "learning_rate": 1.638132774972287e-05, "loss": 0.1521, "step": 7459 }, { "epoch": 2.75480059084195, "grad_norm": 0.29147687554359436, "learning_rate": 1.635669417415938e-05, "loss": 0.147, "step": 7460 }, { "epoch": 2.7551698670605616, "grad_norm": 0.2928764820098877, "learning_rate": 1.6332060598595886e-05, "loss": 0.1672, "step": 7461 }, { "epoch": 2.7555391432791727, "grad_norm": 0.22330345213413239, "learning_rate": 1.6307427023032394e-05, "loss": 0.1646, "step": 7462 }, { "epoch": 2.7559084194977843, "grad_norm": 0.2907005250453949, "learning_rate": 1.62827934474689e-05, "loss": 0.1669, "step": 7463 }, { "epoch": 2.756277695716396, "grad_norm": 0.2795524299144745, "learning_rate": 1.6258159871905406e-05, "loss": 0.1631, "step": 7464 }, { "epoch": 2.7566469719350075, "grad_norm": 0.31589746475219727, "learning_rate": 1.6233526296341914e-05, "loss": 0.178, "step": 7465 }, { "epoch": 2.7570162481536187, "grad_norm": 0.286522775888443, "learning_rate": 1.6208892720778422e-05, "loss": 0.1748, "step": 7466 }, { "epoch": 2.7573855243722303, "grad_norm": 0.3060949742794037, "learning_rate": 1.6184259145214927e-05, "loss": 0.186, "step": 7467 }, { "epoch": 2.757754800590842, "grad_norm": 0.23353999853134155, "learning_rate": 1.6159625569651435e-05, "loss": 0.1462, "step": 7468 }, { "epoch": 2.7581240768094535, "grad_norm": 0.27649784088134766, "learning_rate": 1.6134991994087942e-05, "loss": 0.1785, "step": 7469 }, { "epoch": 2.758493353028065, "grad_norm": 0.21699804067611694, "learning_rate": 1.611035841852445e-05, "loss": 0.1535, "step": 7470 }, { "epoch": 2.7588626292466767, "grad_norm": 0.2576650381088257, "learning_rate": 1.6085724842960955e-05, "loss": 0.1636, "step": 7471 }, { "epoch": 2.7592319054652883, "grad_norm": 0.2834946811199188, "learning_rate": 1.6061091267397463e-05, "loss": 0.1712, "step": 7472 }, { "epoch": 2.7596011816838995, "grad_norm": 0.25979647040367126, "learning_rate": 1.603645769183397e-05, "loss": 0.155, "step": 7473 }, { "epoch": 2.759970457902511, "grad_norm": 0.24807441234588623, "learning_rate": 1.601182411627048e-05, "loss": 0.1637, "step": 7474 }, { "epoch": 2.7603397341211227, "grad_norm": 0.2989766299724579, "learning_rate": 1.5987190540706983e-05, "loss": 0.1793, "step": 7475 }, { "epoch": 2.7607090103397343, "grad_norm": 0.25783130526542664, "learning_rate": 1.596255696514349e-05, "loss": 0.1805, "step": 7476 }, { "epoch": 2.7610782865583454, "grad_norm": 0.2817245423793793, "learning_rate": 1.593792338958e-05, "loss": 0.1511, "step": 7477 }, { "epoch": 2.761447562776957, "grad_norm": 0.19131001830101013, "learning_rate": 1.5913289814016506e-05, "loss": 0.1427, "step": 7478 }, { "epoch": 2.7618168389955686, "grad_norm": 0.32190608978271484, "learning_rate": 1.588865623845301e-05, "loss": 0.1695, "step": 7479 }, { "epoch": 2.7621861152141802, "grad_norm": 0.2699021100997925, "learning_rate": 1.586402266288952e-05, "loss": 0.1511, "step": 7480 }, { "epoch": 2.762555391432792, "grad_norm": 0.24477988481521606, "learning_rate": 1.5839389087326027e-05, "loss": 0.1548, "step": 7481 }, { "epoch": 2.7629246676514034, "grad_norm": 0.3021427392959595, "learning_rate": 1.5814755511762534e-05, "loss": 0.17, "step": 7482 }, { "epoch": 2.763293943870015, "grad_norm": 0.29378512501716614, "learning_rate": 1.579012193619904e-05, "loss": 0.1601, "step": 7483 }, { "epoch": 2.763663220088626, "grad_norm": 0.24656188488006592, "learning_rate": 1.5765488360635547e-05, "loss": 0.1388, "step": 7484 }, { "epoch": 2.764032496307238, "grad_norm": 0.26288262009620667, "learning_rate": 1.5740854785072055e-05, "loss": 0.1836, "step": 7485 }, { "epoch": 2.7644017725258494, "grad_norm": 0.2798362374305725, "learning_rate": 1.5716221209508563e-05, "loss": 0.1408, "step": 7486 }, { "epoch": 2.764771048744461, "grad_norm": 0.26546165347099304, "learning_rate": 1.5691587633945067e-05, "loss": 0.1668, "step": 7487 }, { "epoch": 2.765140324963072, "grad_norm": 0.28876855969429016, "learning_rate": 1.5666954058381575e-05, "loss": 0.1567, "step": 7488 }, { "epoch": 2.7655096011816838, "grad_norm": 0.31431934237480164, "learning_rate": 1.5642320482818083e-05, "loss": 0.1786, "step": 7489 }, { "epoch": 2.7658788774002954, "grad_norm": 0.24657383561134338, "learning_rate": 1.561768690725459e-05, "loss": 0.1596, "step": 7490 }, { "epoch": 2.766248153618907, "grad_norm": 0.25895577669143677, "learning_rate": 1.5593053331691095e-05, "loss": 0.1704, "step": 7491 }, { "epoch": 2.7666174298375186, "grad_norm": 0.22705091536045074, "learning_rate": 1.5568419756127603e-05, "loss": 0.1386, "step": 7492 }, { "epoch": 2.76698670605613, "grad_norm": 0.30566298961639404, "learning_rate": 1.554378618056411e-05, "loss": 0.1654, "step": 7493 }, { "epoch": 2.7673559822747418, "grad_norm": 0.28241631388664246, "learning_rate": 1.5519152605000615e-05, "loss": 0.1643, "step": 7494 }, { "epoch": 2.767725258493353, "grad_norm": 0.25129958987236023, "learning_rate": 1.5494519029437123e-05, "loss": 0.1526, "step": 7495 }, { "epoch": 2.7680945347119645, "grad_norm": 0.26767459511756897, "learning_rate": 1.546988545387363e-05, "loss": 0.1447, "step": 7496 }, { "epoch": 2.768463810930576, "grad_norm": 0.2640560567378998, "learning_rate": 1.544525187831014e-05, "loss": 0.1578, "step": 7497 }, { "epoch": 2.7688330871491877, "grad_norm": 0.28000009059906006, "learning_rate": 1.5420618302746643e-05, "loss": 0.1627, "step": 7498 }, { "epoch": 2.769202363367799, "grad_norm": 0.3205571472644806, "learning_rate": 1.539598472718315e-05, "loss": 0.1918, "step": 7499 }, { "epoch": 2.7695716395864105, "grad_norm": 0.3204624354839325, "learning_rate": 1.537135115161966e-05, "loss": 0.1702, "step": 7500 }, { "epoch": 2.7695716395864105, "eval_loss": 0.2450689673423767, "eval_runtime": 5.8526, "eval_samples_per_second": 8.543, "eval_steps_per_second": 1.196, "step": 7500 }, { "epoch": 2.769940915805022, "grad_norm": 0.28050729632377625, "learning_rate": 1.5346717576056167e-05, "loss": 0.1719, "step": 7501 }, { "epoch": 2.7703101920236337, "grad_norm": 0.29927536845207214, "learning_rate": 1.532208400049267e-05, "loss": 0.1803, "step": 7502 }, { "epoch": 2.7706794682422453, "grad_norm": 0.27131029963493347, "learning_rate": 1.529745042492918e-05, "loss": 0.1562, "step": 7503 }, { "epoch": 2.771048744460857, "grad_norm": 0.2462558150291443, "learning_rate": 1.5272816849365687e-05, "loss": 0.1732, "step": 7504 }, { "epoch": 2.7714180206794685, "grad_norm": 0.3052695393562317, "learning_rate": 1.5248183273802193e-05, "loss": 0.182, "step": 7505 }, { "epoch": 2.7717872968980797, "grad_norm": 0.2922547459602356, "learning_rate": 1.5223549698238701e-05, "loss": 0.1652, "step": 7506 }, { "epoch": 2.7721565731166913, "grad_norm": 0.2786978781223297, "learning_rate": 1.5198916122675207e-05, "loss": 0.168, "step": 7507 }, { "epoch": 2.772525849335303, "grad_norm": 0.3016391694545746, "learning_rate": 1.5174282547111715e-05, "loss": 0.1726, "step": 7508 }, { "epoch": 2.7728951255539145, "grad_norm": 0.32139500975608826, "learning_rate": 1.5149648971548221e-05, "loss": 0.1961, "step": 7509 }, { "epoch": 2.7732644017725256, "grad_norm": 0.27514082193374634, "learning_rate": 1.512501539598473e-05, "loss": 0.1599, "step": 7510 }, { "epoch": 2.7736336779911372, "grad_norm": 0.24459095299243927, "learning_rate": 1.5100381820421235e-05, "loss": 0.1398, "step": 7511 }, { "epoch": 2.774002954209749, "grad_norm": 0.25074049830436707, "learning_rate": 1.5075748244857743e-05, "loss": 0.146, "step": 7512 }, { "epoch": 2.7743722304283605, "grad_norm": 0.2813052237033844, "learning_rate": 1.505111466929425e-05, "loss": 0.1514, "step": 7513 }, { "epoch": 2.774741506646972, "grad_norm": 0.29530805349349976, "learning_rate": 1.5026481093730757e-05, "loss": 0.1828, "step": 7514 }, { "epoch": 2.7751107828655837, "grad_norm": 0.2728615999221802, "learning_rate": 1.5001847518167263e-05, "loss": 0.1762, "step": 7515 }, { "epoch": 2.775480059084195, "grad_norm": 0.32301032543182373, "learning_rate": 1.4977213942603768e-05, "loss": 0.1744, "step": 7516 }, { "epoch": 2.7758493353028064, "grad_norm": 0.25327804684638977, "learning_rate": 1.4952580367040276e-05, "loss": 0.1542, "step": 7517 }, { "epoch": 2.776218611521418, "grad_norm": 0.3018170893192291, "learning_rate": 1.4927946791476782e-05, "loss": 0.1668, "step": 7518 }, { "epoch": 2.7765878877400296, "grad_norm": 0.24490094184875488, "learning_rate": 1.490331321591329e-05, "loss": 0.1478, "step": 7519 }, { "epoch": 2.7769571639586412, "grad_norm": 0.24875818192958832, "learning_rate": 1.4878679640349796e-05, "loss": 0.1579, "step": 7520 }, { "epoch": 2.7773264401772524, "grad_norm": 0.2581137716770172, "learning_rate": 1.4854046064786304e-05, "loss": 0.1937, "step": 7521 }, { "epoch": 2.777695716395864, "grad_norm": 0.30791181325912476, "learning_rate": 1.482941248922281e-05, "loss": 0.1687, "step": 7522 }, { "epoch": 2.7780649926144756, "grad_norm": 0.24371172487735748, "learning_rate": 1.4804778913659318e-05, "loss": 0.1589, "step": 7523 }, { "epoch": 2.778434268833087, "grad_norm": 0.21186070144176483, "learning_rate": 1.4780145338095824e-05, "loss": 0.1275, "step": 7524 }, { "epoch": 2.778803545051699, "grad_norm": 0.24601298570632935, "learning_rate": 1.4755511762532332e-05, "loss": 0.1428, "step": 7525 }, { "epoch": 2.7791728212703104, "grad_norm": 0.22553080320358276, "learning_rate": 1.4730878186968838e-05, "loss": 0.1385, "step": 7526 }, { "epoch": 2.7795420974889216, "grad_norm": 0.2764257490634918, "learning_rate": 1.4706244611405346e-05, "loss": 0.1826, "step": 7527 }, { "epoch": 2.779911373707533, "grad_norm": 0.2295643836259842, "learning_rate": 1.4681611035841852e-05, "loss": 0.1393, "step": 7528 }, { "epoch": 2.7802806499261448, "grad_norm": 0.25414568185806274, "learning_rate": 1.465697746027836e-05, "loss": 0.1711, "step": 7529 }, { "epoch": 2.7806499261447564, "grad_norm": 0.23428992927074432, "learning_rate": 1.4632343884714866e-05, "loss": 0.1539, "step": 7530 }, { "epoch": 2.7810192023633675, "grad_norm": 0.3167438209056854, "learning_rate": 1.4607710309151374e-05, "loss": 0.1805, "step": 7531 }, { "epoch": 2.781388478581979, "grad_norm": 0.26827964186668396, "learning_rate": 1.458307673358788e-05, "loss": 0.166, "step": 7532 }, { "epoch": 2.7817577548005907, "grad_norm": 0.2907755672931671, "learning_rate": 1.4558443158024388e-05, "loss": 0.1649, "step": 7533 }, { "epoch": 2.7821270310192023, "grad_norm": 0.2963904142379761, "learning_rate": 1.4533809582460894e-05, "loss": 0.1935, "step": 7534 }, { "epoch": 2.782496307237814, "grad_norm": 0.28386786580085754, "learning_rate": 1.4509176006897402e-05, "loss": 0.1479, "step": 7535 }, { "epoch": 2.7828655834564255, "grad_norm": 0.2962789833545685, "learning_rate": 1.4484542431333908e-05, "loss": 0.2001, "step": 7536 }, { "epoch": 2.783234859675037, "grad_norm": 0.23222845792770386, "learning_rate": 1.4459908855770416e-05, "loss": 0.1295, "step": 7537 }, { "epoch": 2.7836041358936483, "grad_norm": 0.3009457290172577, "learning_rate": 1.4435275280206922e-05, "loss": 0.163, "step": 7538 }, { "epoch": 2.78397341211226, "grad_norm": 0.3172808587551117, "learning_rate": 1.441064170464343e-05, "loss": 0.1642, "step": 7539 }, { "epoch": 2.7843426883308715, "grad_norm": 0.21474723517894745, "learning_rate": 1.4386008129079936e-05, "loss": 0.1401, "step": 7540 }, { "epoch": 2.784711964549483, "grad_norm": 0.2675321698188782, "learning_rate": 1.4361374553516444e-05, "loss": 0.1659, "step": 7541 }, { "epoch": 2.7850812407680943, "grad_norm": 0.30473241209983826, "learning_rate": 1.433674097795295e-05, "loss": 0.1672, "step": 7542 }, { "epoch": 2.785450516986706, "grad_norm": 0.24813975393772125, "learning_rate": 1.4312107402389458e-05, "loss": 0.1626, "step": 7543 }, { "epoch": 2.7858197932053175, "grad_norm": 0.2875143587589264, "learning_rate": 1.4287473826825964e-05, "loss": 0.1772, "step": 7544 }, { "epoch": 2.786189069423929, "grad_norm": 0.2905154824256897, "learning_rate": 1.4262840251262472e-05, "loss": 0.1586, "step": 7545 }, { "epoch": 2.7865583456425407, "grad_norm": 0.28693145513534546, "learning_rate": 1.4238206675698978e-05, "loss": 0.1635, "step": 7546 }, { "epoch": 2.7869276218611523, "grad_norm": 0.27007073163986206, "learning_rate": 1.4213573100135486e-05, "loss": 0.177, "step": 7547 }, { "epoch": 2.787296898079764, "grad_norm": 0.245378777384758, "learning_rate": 1.4188939524571992e-05, "loss": 0.1458, "step": 7548 }, { "epoch": 2.787666174298375, "grad_norm": 0.25880053639411926, "learning_rate": 1.41643059490085e-05, "loss": 0.178, "step": 7549 }, { "epoch": 2.7880354505169866, "grad_norm": 0.2758273184299469, "learning_rate": 1.4139672373445007e-05, "loss": 0.1639, "step": 7550 }, { "epoch": 2.7880354505169866, "eval_loss": 0.24555173516273499, "eval_runtime": 5.8616, "eval_samples_per_second": 8.53, "eval_steps_per_second": 1.194, "step": 7550 }, { "epoch": 2.7884047267355982, "grad_norm": 0.21397241950035095, "learning_rate": 1.4115038797881513e-05, "loss": 0.1413, "step": 7551 }, { "epoch": 2.78877400295421, "grad_norm": 0.3013254404067993, "learning_rate": 1.409040522231802e-05, "loss": 0.1707, "step": 7552 }, { "epoch": 2.789143279172821, "grad_norm": 0.2574707567691803, "learning_rate": 1.4065771646754527e-05, "loss": 0.1412, "step": 7553 }, { "epoch": 2.7895125553914326, "grad_norm": 0.29214608669281006, "learning_rate": 1.4041138071191035e-05, "loss": 0.1655, "step": 7554 }, { "epoch": 2.789881831610044, "grad_norm": 0.2661636769771576, "learning_rate": 1.401650449562754e-05, "loss": 0.1479, "step": 7555 }, { "epoch": 2.790251107828656, "grad_norm": 0.28535255789756775, "learning_rate": 1.3991870920064049e-05, "loss": 0.189, "step": 7556 }, { "epoch": 2.7906203840472674, "grad_norm": 0.28791603446006775, "learning_rate": 1.3967237344500555e-05, "loss": 0.165, "step": 7557 }, { "epoch": 2.790989660265879, "grad_norm": 0.2892257273197174, "learning_rate": 1.3942603768937063e-05, "loss": 0.1689, "step": 7558 }, { "epoch": 2.7913589364844906, "grad_norm": 0.2890913784503937, "learning_rate": 1.3917970193373569e-05, "loss": 0.2106, "step": 7559 }, { "epoch": 2.791728212703102, "grad_norm": 0.24501356482505798, "learning_rate": 1.3893336617810077e-05, "loss": 0.1439, "step": 7560 }, { "epoch": 2.7920974889217134, "grad_norm": 0.24362200498580933, "learning_rate": 1.3868703042246583e-05, "loss": 0.1476, "step": 7561 }, { "epoch": 2.792466765140325, "grad_norm": 0.2666977047920227, "learning_rate": 1.384406946668309e-05, "loss": 0.1734, "step": 7562 }, { "epoch": 2.7928360413589366, "grad_norm": 0.2529878318309784, "learning_rate": 1.3819435891119597e-05, "loss": 0.1729, "step": 7563 }, { "epoch": 2.7932053175775478, "grad_norm": 0.31165093183517456, "learning_rate": 1.3794802315556105e-05, "loss": 0.1924, "step": 7564 }, { "epoch": 2.7935745937961594, "grad_norm": 0.2584229111671448, "learning_rate": 1.3770168739992611e-05, "loss": 0.1505, "step": 7565 }, { "epoch": 2.793943870014771, "grad_norm": 0.23775674402713776, "learning_rate": 1.3745535164429119e-05, "loss": 0.1284, "step": 7566 }, { "epoch": 2.7943131462333826, "grad_norm": 0.2588885724544525, "learning_rate": 1.3720901588865625e-05, "loss": 0.1629, "step": 7567 }, { "epoch": 2.794682422451994, "grad_norm": 0.25051623582839966, "learning_rate": 1.3696268013302133e-05, "loss": 0.1449, "step": 7568 }, { "epoch": 2.7950516986706058, "grad_norm": 0.3355005979537964, "learning_rate": 1.3671634437738639e-05, "loss": 0.1843, "step": 7569 }, { "epoch": 2.7954209748892174, "grad_norm": 0.22919444739818573, "learning_rate": 1.3647000862175147e-05, "loss": 0.1554, "step": 7570 }, { "epoch": 2.7957902511078285, "grad_norm": 0.2616226375102997, "learning_rate": 1.3622367286611653e-05, "loss": 0.1497, "step": 7571 }, { "epoch": 2.79615952732644, "grad_norm": 0.28552570939064026, "learning_rate": 1.3597733711048161e-05, "loss": 0.1595, "step": 7572 }, { "epoch": 2.7965288035450517, "grad_norm": 0.344937801361084, "learning_rate": 1.3573100135484667e-05, "loss": 0.1861, "step": 7573 }, { "epoch": 2.7968980797636633, "grad_norm": 0.23454611003398895, "learning_rate": 1.3548466559921172e-05, "loss": 0.1539, "step": 7574 }, { "epoch": 2.7972673559822745, "grad_norm": 0.2591012120246887, "learning_rate": 1.352383298435768e-05, "loss": 0.1435, "step": 7575 }, { "epoch": 2.797636632200886, "grad_norm": 0.25938424468040466, "learning_rate": 1.3499199408794186e-05, "loss": 0.1684, "step": 7576 }, { "epoch": 2.7980059084194977, "grad_norm": 0.29442277550697327, "learning_rate": 1.3474565833230693e-05, "loss": 0.1721, "step": 7577 }, { "epoch": 2.7983751846381093, "grad_norm": 0.29727548360824585, "learning_rate": 1.34499322576672e-05, "loss": 0.1818, "step": 7578 }, { "epoch": 2.798744460856721, "grad_norm": 0.258197546005249, "learning_rate": 1.3425298682103707e-05, "loss": 0.1365, "step": 7579 }, { "epoch": 2.7991137370753325, "grad_norm": 0.2935611605644226, "learning_rate": 1.3400665106540214e-05, "loss": 0.1783, "step": 7580 }, { "epoch": 2.799483013293944, "grad_norm": 0.3015442490577698, "learning_rate": 1.3376031530976721e-05, "loss": 0.1652, "step": 7581 }, { "epoch": 2.7998522895125553, "grad_norm": 0.24497297406196594, "learning_rate": 1.3351397955413228e-05, "loss": 0.1634, "step": 7582 }, { "epoch": 2.800221565731167, "grad_norm": 0.29756438732147217, "learning_rate": 1.3326764379849736e-05, "loss": 0.138, "step": 7583 }, { "epoch": 2.8005908419497785, "grad_norm": 0.3196985721588135, "learning_rate": 1.3302130804286242e-05, "loss": 0.1776, "step": 7584 }, { "epoch": 2.80096011816839, "grad_norm": 0.33203697204589844, "learning_rate": 1.327749722872275e-05, "loss": 0.1944, "step": 7585 }, { "epoch": 2.8013293943870012, "grad_norm": 0.2631467580795288, "learning_rate": 1.3252863653159256e-05, "loss": 0.1636, "step": 7586 }, { "epoch": 2.801698670605613, "grad_norm": 0.277204692363739, "learning_rate": 1.3228230077595764e-05, "loss": 0.1718, "step": 7587 }, { "epoch": 2.8020679468242244, "grad_norm": 0.30720534920692444, "learning_rate": 1.320359650203227e-05, "loss": 0.1636, "step": 7588 }, { "epoch": 2.802437223042836, "grad_norm": 0.22924008965492249, "learning_rate": 1.3178962926468778e-05, "loss": 0.1341, "step": 7589 }, { "epoch": 2.8028064992614476, "grad_norm": 0.251715749502182, "learning_rate": 1.3154329350905284e-05, "loss": 0.1614, "step": 7590 }, { "epoch": 2.8031757754800593, "grad_norm": 0.26300671696662903, "learning_rate": 1.3129695775341792e-05, "loss": 0.139, "step": 7591 }, { "epoch": 2.803545051698671, "grad_norm": 0.3070409595966339, "learning_rate": 1.3105062199778298e-05, "loss": 0.2011, "step": 7592 }, { "epoch": 2.803914327917282, "grad_norm": 0.2788148522377014, "learning_rate": 1.3080428624214806e-05, "loss": 0.14, "step": 7593 }, { "epoch": 2.8042836041358936, "grad_norm": 0.28043505549430847, "learning_rate": 1.3055795048651312e-05, "loss": 0.1805, "step": 7594 }, { "epoch": 2.804652880354505, "grad_norm": 0.2902803421020508, "learning_rate": 1.303116147308782e-05, "loss": 0.1716, "step": 7595 }, { "epoch": 2.805022156573117, "grad_norm": 0.2920861542224884, "learning_rate": 1.3006527897524326e-05, "loss": 0.1676, "step": 7596 }, { "epoch": 2.805391432791728, "grad_norm": 0.2428940236568451, "learning_rate": 1.2981894321960834e-05, "loss": 0.1607, "step": 7597 }, { "epoch": 2.8057607090103396, "grad_norm": 0.28528156876564026, "learning_rate": 1.295726074639734e-05, "loss": 0.1677, "step": 7598 }, { "epoch": 2.806129985228951, "grad_norm": 0.22906848788261414, "learning_rate": 1.2932627170833848e-05, "loss": 0.1376, "step": 7599 }, { "epoch": 2.806499261447563, "grad_norm": 0.27055057883262634, "learning_rate": 1.2907993595270354e-05, "loss": 0.1574, "step": 7600 }, { "epoch": 2.806499261447563, "eval_loss": 0.2451067417860031, "eval_runtime": 5.8595, "eval_samples_per_second": 8.533, "eval_steps_per_second": 1.195, "step": 7600 }, { "epoch": 2.8068685376661744, "grad_norm": 0.2410304844379425, "learning_rate": 1.2883360019706862e-05, "loss": 0.1491, "step": 7601 }, { "epoch": 2.807237813884786, "grad_norm": 0.31664612889289856, "learning_rate": 1.2858726444143368e-05, "loss": 0.1627, "step": 7602 }, { "epoch": 2.8076070901033976, "grad_norm": 0.23882780969142914, "learning_rate": 1.2834092868579876e-05, "loss": 0.1438, "step": 7603 }, { "epoch": 2.8079763663220088, "grad_norm": 0.34225931763648987, "learning_rate": 1.2809459293016382e-05, "loss": 0.2023, "step": 7604 }, { "epoch": 2.8083456425406204, "grad_norm": 0.23802785575389862, "learning_rate": 1.278482571745289e-05, "loss": 0.1465, "step": 7605 }, { "epoch": 2.808714918759232, "grad_norm": 0.26142677664756775, "learning_rate": 1.2760192141889396e-05, "loss": 0.1704, "step": 7606 }, { "epoch": 2.8090841949778436, "grad_norm": 0.32098034024238586, "learning_rate": 1.2735558566325904e-05, "loss": 0.1825, "step": 7607 }, { "epoch": 2.8094534711964547, "grad_norm": 0.2660562992095947, "learning_rate": 1.271092499076241e-05, "loss": 0.1822, "step": 7608 }, { "epoch": 2.8098227474150663, "grad_norm": 0.2180064469575882, "learning_rate": 1.2686291415198916e-05, "loss": 0.1456, "step": 7609 }, { "epoch": 2.810192023633678, "grad_norm": 0.33249959349632263, "learning_rate": 1.2661657839635424e-05, "loss": 0.1766, "step": 7610 }, { "epoch": 2.8105612998522895, "grad_norm": 0.28326165676116943, "learning_rate": 1.263702426407193e-05, "loss": 0.1694, "step": 7611 }, { "epoch": 2.810930576070901, "grad_norm": 0.27733269333839417, "learning_rate": 1.2612390688508438e-05, "loss": 0.161, "step": 7612 }, { "epoch": 2.8112998522895127, "grad_norm": 0.31507349014282227, "learning_rate": 1.2587757112944944e-05, "loss": 0.2159, "step": 7613 }, { "epoch": 2.8116691285081243, "grad_norm": 0.2528786361217499, "learning_rate": 1.2563123537381452e-05, "loss": 0.1788, "step": 7614 }, { "epoch": 2.8120384047267355, "grad_norm": 0.2537676990032196, "learning_rate": 1.2538489961817958e-05, "loss": 0.1526, "step": 7615 }, { "epoch": 2.812407680945347, "grad_norm": 0.2868138253688812, "learning_rate": 1.2513856386254466e-05, "loss": 0.1538, "step": 7616 }, { "epoch": 2.8127769571639587, "grad_norm": 0.29323476552963257, "learning_rate": 1.2489222810690972e-05, "loss": 0.1685, "step": 7617 }, { "epoch": 2.8131462333825703, "grad_norm": 0.3060005009174347, "learning_rate": 1.2464589235127479e-05, "loss": 0.2033, "step": 7618 }, { "epoch": 2.8135155096011815, "grad_norm": 0.23000088334083557, "learning_rate": 1.2439955659563986e-05, "loss": 0.136, "step": 7619 }, { "epoch": 2.813884785819793, "grad_norm": 0.27344846725463867, "learning_rate": 1.2415322084000493e-05, "loss": 0.1613, "step": 7620 }, { "epoch": 2.8142540620384047, "grad_norm": 0.23537056148052216, "learning_rate": 1.2390688508437e-05, "loss": 0.1402, "step": 7621 }, { "epoch": 2.8146233382570163, "grad_norm": 0.2935500741004944, "learning_rate": 1.2366054932873507e-05, "loss": 0.1817, "step": 7622 }, { "epoch": 2.814992614475628, "grad_norm": 0.23089289665222168, "learning_rate": 1.2341421357310014e-05, "loss": 0.1562, "step": 7623 }, { "epoch": 2.8153618906942395, "grad_norm": 0.3644351065158844, "learning_rate": 1.231678778174652e-05, "loss": 0.1977, "step": 7624 }, { "epoch": 2.815731166912851, "grad_norm": 0.24921384453773499, "learning_rate": 1.2292154206183028e-05, "loss": 0.1506, "step": 7625 }, { "epoch": 2.8161004431314622, "grad_norm": 0.2631310224533081, "learning_rate": 1.2267520630619535e-05, "loss": 0.1496, "step": 7626 }, { "epoch": 2.816469719350074, "grad_norm": 0.2318798452615738, "learning_rate": 1.2242887055056043e-05, "loss": 0.1553, "step": 7627 }, { "epoch": 2.8168389955686854, "grad_norm": 0.2602575123310089, "learning_rate": 1.2218253479492549e-05, "loss": 0.1398, "step": 7628 }, { "epoch": 2.817208271787297, "grad_norm": 0.24905669689178467, "learning_rate": 1.2193619903929057e-05, "loss": 0.1685, "step": 7629 }, { "epoch": 2.817577548005908, "grad_norm": 0.28452542424201965, "learning_rate": 1.2168986328365563e-05, "loss": 0.1547, "step": 7630 }, { "epoch": 2.81794682422452, "grad_norm": 0.24822600185871124, "learning_rate": 1.214435275280207e-05, "loss": 0.1566, "step": 7631 }, { "epoch": 2.8183161004431314, "grad_norm": 0.23183251917362213, "learning_rate": 1.2119719177238577e-05, "loss": 0.134, "step": 7632 }, { "epoch": 2.818685376661743, "grad_norm": 0.3169245421886444, "learning_rate": 1.2095085601675085e-05, "loss": 0.1871, "step": 7633 }, { "epoch": 2.8190546528803546, "grad_norm": 0.24495553970336914, "learning_rate": 1.207045202611159e-05, "loss": 0.175, "step": 7634 }, { "epoch": 2.819423929098966, "grad_norm": 0.25172358751296997, "learning_rate": 1.2045818450548099e-05, "loss": 0.153, "step": 7635 }, { "epoch": 2.819793205317578, "grad_norm": 0.30053114891052246, "learning_rate": 1.2021184874984605e-05, "loss": 0.1693, "step": 7636 }, { "epoch": 2.820162481536189, "grad_norm": 0.3276084065437317, "learning_rate": 1.1996551299421111e-05, "loss": 0.1903, "step": 7637 }, { "epoch": 2.8205317577548006, "grad_norm": 0.30875363945961, "learning_rate": 1.1971917723857619e-05, "loss": 0.1865, "step": 7638 }, { "epoch": 2.820901033973412, "grad_norm": 0.22612623870372772, "learning_rate": 1.1947284148294125e-05, "loss": 0.1506, "step": 7639 }, { "epoch": 2.821270310192024, "grad_norm": 0.3551541268825531, "learning_rate": 1.1922650572730633e-05, "loss": 0.1901, "step": 7640 }, { "epoch": 2.821639586410635, "grad_norm": 0.2712530493736267, "learning_rate": 1.1898016997167139e-05, "loss": 0.1415, "step": 7641 }, { "epoch": 2.8220088626292466, "grad_norm": 0.3051557242870331, "learning_rate": 1.1873383421603647e-05, "loss": 0.1637, "step": 7642 }, { "epoch": 2.822378138847858, "grad_norm": 0.26176202297210693, "learning_rate": 1.1848749846040153e-05, "loss": 0.1679, "step": 7643 }, { "epoch": 2.8227474150664698, "grad_norm": 0.3065352737903595, "learning_rate": 1.1824116270476661e-05, "loss": 0.1692, "step": 7644 }, { "epoch": 2.8231166912850814, "grad_norm": 0.2914688289165497, "learning_rate": 1.1799482694913167e-05, "loss": 0.1423, "step": 7645 }, { "epoch": 2.823485967503693, "grad_norm": 0.2894628643989563, "learning_rate": 1.1774849119349673e-05, "loss": 0.1845, "step": 7646 }, { "epoch": 2.823855243722304, "grad_norm": 0.276998907327652, "learning_rate": 1.1750215543786181e-05, "loss": 0.1652, "step": 7647 }, { "epoch": 2.8242245199409157, "grad_norm": 0.24740327894687653, "learning_rate": 1.1725581968222687e-05, "loss": 0.1609, "step": 7648 }, { "epoch": 2.8245937961595273, "grad_norm": 0.28068864345550537, "learning_rate": 1.1700948392659195e-05, "loss": 0.1745, "step": 7649 }, { "epoch": 2.824963072378139, "grad_norm": 0.24768218398094177, "learning_rate": 1.1676314817095701e-05, "loss": 0.1664, "step": 7650 }, { "epoch": 2.824963072378139, "eval_loss": 0.24496839940547943, "eval_runtime": 5.8515, "eval_samples_per_second": 8.545, "eval_steps_per_second": 1.196, "step": 7650 }, { "epoch": 2.8253323485967505, "grad_norm": 0.265293151140213, "learning_rate": 1.165168124153221e-05, "loss": 0.1646, "step": 7651 }, { "epoch": 2.8257016248153617, "grad_norm": 0.2964051365852356, "learning_rate": 1.1627047665968715e-05, "loss": 0.1589, "step": 7652 }, { "epoch": 2.8260709010339733, "grad_norm": 0.26073381304740906, "learning_rate": 1.1602414090405223e-05, "loss": 0.1537, "step": 7653 }, { "epoch": 2.826440177252585, "grad_norm": 0.2753491997718811, "learning_rate": 1.157778051484173e-05, "loss": 0.1553, "step": 7654 }, { "epoch": 2.8268094534711965, "grad_norm": 0.31869035959243774, "learning_rate": 1.1553146939278237e-05, "loss": 0.1874, "step": 7655 }, { "epoch": 2.827178729689808, "grad_norm": 0.2545974552631378, "learning_rate": 1.1528513363714743e-05, "loss": 0.1361, "step": 7656 }, { "epoch": 2.8275480059084197, "grad_norm": 0.224674791097641, "learning_rate": 1.1503879788151251e-05, "loss": 0.1322, "step": 7657 }, { "epoch": 2.827917282127031, "grad_norm": 0.3442631661891937, "learning_rate": 1.1479246212587757e-05, "loss": 0.1972, "step": 7658 }, { "epoch": 2.8282865583456425, "grad_norm": 0.306291401386261, "learning_rate": 1.1454612637024265e-05, "loss": 0.1617, "step": 7659 }, { "epoch": 2.828655834564254, "grad_norm": 0.2775784432888031, "learning_rate": 1.1429979061460772e-05, "loss": 0.1678, "step": 7660 }, { "epoch": 2.8290251107828657, "grad_norm": 0.304565966129303, "learning_rate": 1.140534548589728e-05, "loss": 0.1939, "step": 7661 }, { "epoch": 2.829394387001477, "grad_norm": 0.3083300292491913, "learning_rate": 1.1380711910333786e-05, "loss": 0.1578, "step": 7662 }, { "epoch": 2.8297636632200884, "grad_norm": 0.30755531787872314, "learning_rate": 1.1356078334770293e-05, "loss": 0.1722, "step": 7663 }, { "epoch": 2.8301329394387, "grad_norm": 0.31816384196281433, "learning_rate": 1.13314447592068e-05, "loss": 0.1657, "step": 7664 }, { "epoch": 2.8305022156573116, "grad_norm": 0.25910794734954834, "learning_rate": 1.1306811183643306e-05, "loss": 0.1611, "step": 7665 }, { "epoch": 2.8308714918759232, "grad_norm": 0.24290092289447784, "learning_rate": 1.1282177608079814e-05, "loss": 0.1554, "step": 7666 }, { "epoch": 2.831240768094535, "grad_norm": 0.2623692750930786, "learning_rate": 1.125754403251632e-05, "loss": 0.1542, "step": 7667 }, { "epoch": 2.8316100443131464, "grad_norm": 0.25557443499565125, "learning_rate": 1.1232910456952828e-05, "loss": 0.1371, "step": 7668 }, { "epoch": 2.8319793205317576, "grad_norm": 0.3583131432533264, "learning_rate": 1.1208276881389334e-05, "loss": 0.1724, "step": 7669 }, { "epoch": 2.832348596750369, "grad_norm": 0.3200792372226715, "learning_rate": 1.1183643305825842e-05, "loss": 0.1807, "step": 7670 }, { "epoch": 2.832717872968981, "grad_norm": 0.27036356925964355, "learning_rate": 1.1159009730262348e-05, "loss": 0.1542, "step": 7671 }, { "epoch": 2.8330871491875924, "grad_norm": 0.30485615134239197, "learning_rate": 1.1134376154698856e-05, "loss": 0.1657, "step": 7672 }, { "epoch": 2.8334564254062036, "grad_norm": 0.27574265003204346, "learning_rate": 1.1109742579135362e-05, "loss": 0.1862, "step": 7673 }, { "epoch": 2.833825701624815, "grad_norm": 0.23522606492042542, "learning_rate": 1.108510900357187e-05, "loss": 0.1312, "step": 7674 }, { "epoch": 2.8341949778434268, "grad_norm": 0.32631388306617737, "learning_rate": 1.1060475428008376e-05, "loss": 0.1852, "step": 7675 }, { "epoch": 2.8345642540620384, "grad_norm": 0.2653501629829407, "learning_rate": 1.1035841852444882e-05, "loss": 0.1735, "step": 7676 }, { "epoch": 2.83493353028065, "grad_norm": 0.297904372215271, "learning_rate": 1.101120827688139e-05, "loss": 0.1549, "step": 7677 }, { "epoch": 2.8353028064992616, "grad_norm": 0.2230168879032135, "learning_rate": 1.0986574701317896e-05, "loss": 0.1477, "step": 7678 }, { "epoch": 2.835672082717873, "grad_norm": 0.3190532624721527, "learning_rate": 1.0961941125754404e-05, "loss": 0.1696, "step": 7679 }, { "epoch": 2.8360413589364843, "grad_norm": 0.2887504994869232, "learning_rate": 1.093730755019091e-05, "loss": 0.1545, "step": 7680 }, { "epoch": 2.836410635155096, "grad_norm": 0.31410741806030273, "learning_rate": 1.0912673974627418e-05, "loss": 0.1713, "step": 7681 }, { "epoch": 2.8367799113737076, "grad_norm": 0.36739683151245117, "learning_rate": 1.0888040399063924e-05, "loss": 0.1798, "step": 7682 }, { "epoch": 2.837149187592319, "grad_norm": 0.2992699146270752, "learning_rate": 1.0863406823500432e-05, "loss": 0.1807, "step": 7683 }, { "epoch": 2.8375184638109303, "grad_norm": 0.33379828929901123, "learning_rate": 1.0838773247936938e-05, "loss": 0.1869, "step": 7684 }, { "epoch": 2.837887740029542, "grad_norm": 0.31527650356292725, "learning_rate": 1.0814139672373446e-05, "loss": 0.1814, "step": 7685 }, { "epoch": 2.8382570162481535, "grad_norm": 0.27176427841186523, "learning_rate": 1.0789506096809952e-05, "loss": 0.1584, "step": 7686 }, { "epoch": 2.838626292466765, "grad_norm": 0.2806834876537323, "learning_rate": 1.076487252124646e-05, "loss": 0.1611, "step": 7687 }, { "epoch": 2.8389955686853767, "grad_norm": 0.20951002836227417, "learning_rate": 1.0740238945682966e-05, "loss": 0.1532, "step": 7688 }, { "epoch": 2.8393648449039883, "grad_norm": 0.25673583149909973, "learning_rate": 1.0715605370119474e-05, "loss": 0.1572, "step": 7689 }, { "epoch": 2.8397341211226, "grad_norm": 0.29751238226890564, "learning_rate": 1.069097179455598e-05, "loss": 0.1856, "step": 7690 }, { "epoch": 2.840103397341211, "grad_norm": 0.23384396731853485, "learning_rate": 1.0666338218992488e-05, "loss": 0.1369, "step": 7691 }, { "epoch": 2.8404726735598227, "grad_norm": 0.2613724172115326, "learning_rate": 1.0641704643428994e-05, "loss": 0.1743, "step": 7692 }, { "epoch": 2.8408419497784343, "grad_norm": 0.3152853548526764, "learning_rate": 1.0617071067865502e-05, "loss": 0.172, "step": 7693 }, { "epoch": 2.841211225997046, "grad_norm": 0.3350229859352112, "learning_rate": 1.0592437492302008e-05, "loss": 0.1894, "step": 7694 }, { "epoch": 2.841580502215657, "grad_norm": 0.29419779777526855, "learning_rate": 1.0567803916738515e-05, "loss": 0.1826, "step": 7695 }, { "epoch": 2.8419497784342687, "grad_norm": 0.2597488462924957, "learning_rate": 1.0543170341175022e-05, "loss": 0.1489, "step": 7696 }, { "epoch": 2.8423190546528803, "grad_norm": 0.3090327978134155, "learning_rate": 1.0518536765611529e-05, "loss": 0.1644, "step": 7697 }, { "epoch": 2.842688330871492, "grad_norm": 0.29547038674354553, "learning_rate": 1.0493903190048036e-05, "loss": 0.145, "step": 7698 }, { "epoch": 2.8430576070901035, "grad_norm": 0.3017313778400421, "learning_rate": 1.0469269614484543e-05, "loss": 0.2036, "step": 7699 }, { "epoch": 2.843426883308715, "grad_norm": 0.3118973970413208, "learning_rate": 1.044463603892105e-05, "loss": 0.1856, "step": 7700 }, { "epoch": 2.843426883308715, "eval_loss": 0.2449484020471573, "eval_runtime": 5.8622, "eval_samples_per_second": 8.529, "eval_steps_per_second": 1.194, "step": 7700 }, { "epoch": 2.8437961595273267, "grad_norm": 0.28383857011795044, "learning_rate": 1.0420002463357557e-05, "loss": 0.1486, "step": 7701 }, { "epoch": 2.844165435745938, "grad_norm": 0.31991198658943176, "learning_rate": 1.0395368887794065e-05, "loss": 0.19, "step": 7702 }, { "epoch": 2.8445347119645494, "grad_norm": 0.24715940654277802, "learning_rate": 1.037073531223057e-05, "loss": 0.1355, "step": 7703 }, { "epoch": 2.844903988183161, "grad_norm": 0.2981330156326294, "learning_rate": 1.0346101736667077e-05, "loss": 0.1694, "step": 7704 }, { "epoch": 2.8452732644017726, "grad_norm": 0.23327438533306122, "learning_rate": 1.0321468161103585e-05, "loss": 0.1678, "step": 7705 }, { "epoch": 2.845642540620384, "grad_norm": 0.26901179552078247, "learning_rate": 1.0296834585540091e-05, "loss": 0.168, "step": 7706 }, { "epoch": 2.8460118168389954, "grad_norm": 0.34340429306030273, "learning_rate": 1.0272201009976599e-05, "loss": 0.168, "step": 7707 }, { "epoch": 2.846381093057607, "grad_norm": 0.22643637657165527, "learning_rate": 1.0247567434413105e-05, "loss": 0.1397, "step": 7708 }, { "epoch": 2.8467503692762186, "grad_norm": 0.28315696120262146, "learning_rate": 1.0222933858849613e-05, "loss": 0.172, "step": 7709 }, { "epoch": 2.84711964549483, "grad_norm": 0.3400122821331024, "learning_rate": 1.0198300283286119e-05, "loss": 0.1544, "step": 7710 }, { "epoch": 2.847488921713442, "grad_norm": 0.3144240081310272, "learning_rate": 1.0173666707722627e-05, "loss": 0.1875, "step": 7711 }, { "epoch": 2.8478581979320534, "grad_norm": 0.25010183453559875, "learning_rate": 1.0149033132159133e-05, "loss": 0.1324, "step": 7712 }, { "epoch": 2.8482274741506646, "grad_norm": 0.37234553694725037, "learning_rate": 1.0124399556595641e-05, "loss": 0.2089, "step": 7713 }, { "epoch": 2.848596750369276, "grad_norm": 0.2791849970817566, "learning_rate": 1.0099765981032147e-05, "loss": 0.1557, "step": 7714 }, { "epoch": 2.848966026587888, "grad_norm": 0.2452075034379959, "learning_rate": 1.0075132405468655e-05, "loss": 0.1613, "step": 7715 }, { "epoch": 2.8493353028064994, "grad_norm": 0.28953418135643005, "learning_rate": 1.0050498829905161e-05, "loss": 0.1569, "step": 7716 }, { "epoch": 2.8497045790251105, "grad_norm": 0.30108311772346497, "learning_rate": 1.0025865254341669e-05, "loss": 0.1732, "step": 7717 }, { "epoch": 2.850073855243722, "grad_norm": 0.2359277755022049, "learning_rate": 1.0001231678778175e-05, "loss": 0.1729, "step": 7718 }, { "epoch": 2.8504431314623337, "grad_norm": 0.26641425490379333, "learning_rate": 9.976598103214683e-06, "loss": 0.1648, "step": 7719 }, { "epoch": 2.8508124076809453, "grad_norm": 0.27465370297431946, "learning_rate": 9.951964527651189e-06, "loss": 0.1641, "step": 7720 }, { "epoch": 2.851181683899557, "grad_norm": 0.24329142272472382, "learning_rate": 9.927330952087697e-06, "loss": 0.1469, "step": 7721 }, { "epoch": 2.8515509601181686, "grad_norm": 0.31522512435913086, "learning_rate": 9.902697376524203e-06, "loss": 0.1745, "step": 7722 }, { "epoch": 2.85192023633678, "grad_norm": 0.21894076466560364, "learning_rate": 9.87806380096071e-06, "loss": 0.1442, "step": 7723 }, { "epoch": 2.8522895125553913, "grad_norm": 0.30204495787620544, "learning_rate": 9.853430225397217e-06, "loss": 0.1674, "step": 7724 }, { "epoch": 2.852658788774003, "grad_norm": 0.27711284160614014, "learning_rate": 9.828796649833723e-06, "loss": 0.1433, "step": 7725 }, { "epoch": 2.8530280649926145, "grad_norm": 0.22089818120002747, "learning_rate": 9.804163074270231e-06, "loss": 0.1426, "step": 7726 }, { "epoch": 2.853397341211226, "grad_norm": 0.26761361956596375, "learning_rate": 9.779529498706737e-06, "loss": 0.1617, "step": 7727 }, { "epoch": 2.8537666174298373, "grad_norm": 0.37248560786247253, "learning_rate": 9.754895923143245e-06, "loss": 0.1982, "step": 7728 }, { "epoch": 2.854135893648449, "grad_norm": 0.23078663647174835, "learning_rate": 9.730262347579751e-06, "loss": 0.149, "step": 7729 }, { "epoch": 2.8545051698670605, "grad_norm": 0.3424321711063385, "learning_rate": 9.70562877201626e-06, "loss": 0.1956, "step": 7730 }, { "epoch": 2.854874446085672, "grad_norm": 0.25236353278160095, "learning_rate": 9.680995196452765e-06, "loss": 0.1605, "step": 7731 }, { "epoch": 2.8552437223042837, "grad_norm": 0.26007863879203796, "learning_rate": 9.656361620889273e-06, "loss": 0.1469, "step": 7732 }, { "epoch": 2.8556129985228953, "grad_norm": 0.2958352267742157, "learning_rate": 9.63172804532578e-06, "loss": 0.1836, "step": 7733 }, { "epoch": 2.855982274741507, "grad_norm": 0.2829585671424866, "learning_rate": 9.607094469762286e-06, "loss": 0.1508, "step": 7734 }, { "epoch": 2.856351550960118, "grad_norm": 0.2982698082923889, "learning_rate": 9.582460894198794e-06, "loss": 0.1783, "step": 7735 }, { "epoch": 2.8567208271787297, "grad_norm": 0.27404478192329407, "learning_rate": 9.5578273186353e-06, "loss": 0.1642, "step": 7736 }, { "epoch": 2.8570901033973413, "grad_norm": 0.23169001936912537, "learning_rate": 9.533193743071808e-06, "loss": 0.1622, "step": 7737 }, { "epoch": 2.857459379615953, "grad_norm": 0.2677990198135376, "learning_rate": 9.508560167508314e-06, "loss": 0.1703, "step": 7738 }, { "epoch": 2.857828655834564, "grad_norm": 0.33130165934562683, "learning_rate": 9.483926591944822e-06, "loss": 0.1868, "step": 7739 }, { "epoch": 2.8581979320531756, "grad_norm": 0.23286062479019165, "learning_rate": 9.459293016381328e-06, "loss": 0.1857, "step": 7740 }, { "epoch": 2.8585672082717872, "grad_norm": 0.25445547699928284, "learning_rate": 9.434659440817836e-06, "loss": 0.1432, "step": 7741 }, { "epoch": 2.858936484490399, "grad_norm": 0.4026498794555664, "learning_rate": 9.410025865254342e-06, "loss": 0.1975, "step": 7742 }, { "epoch": 2.8593057607090104, "grad_norm": 0.32120877504348755, "learning_rate": 9.38539228969085e-06, "loss": 0.1682, "step": 7743 }, { "epoch": 2.859675036927622, "grad_norm": 0.2434127777814865, "learning_rate": 9.360758714127356e-06, "loss": 0.1505, "step": 7744 }, { "epoch": 2.8600443131462336, "grad_norm": 0.2673363983631134, "learning_rate": 9.336125138563864e-06, "loss": 0.1641, "step": 7745 }, { "epoch": 2.860413589364845, "grad_norm": 0.2814323902130127, "learning_rate": 9.31149156300037e-06, "loss": 0.1733, "step": 7746 }, { "epoch": 2.8607828655834564, "grad_norm": 0.252278208732605, "learning_rate": 9.286857987436878e-06, "loss": 0.173, "step": 7747 }, { "epoch": 2.861152141802068, "grad_norm": 0.27015420794487, "learning_rate": 9.262224411873384e-06, "loss": 0.1658, "step": 7748 }, { "epoch": 2.8615214180206796, "grad_norm": 0.27484989166259766, "learning_rate": 9.237590836309892e-06, "loss": 0.164, "step": 7749 }, { "epoch": 2.8618906942392908, "grad_norm": 0.2758505046367645, "learning_rate": 9.212957260746398e-06, "loss": 0.1608, "step": 7750 }, { "epoch": 2.8618906942392908, "eval_loss": 0.24487631022930145, "eval_runtime": 5.8611, "eval_samples_per_second": 8.531, "eval_steps_per_second": 1.194, "step": 7750 }, { "epoch": 2.8622599704579024, "grad_norm": 0.30068379640579224, "learning_rate": 9.188323685182904e-06, "loss": 0.1789, "step": 7751 }, { "epoch": 2.862629246676514, "grad_norm": 0.28493115305900574, "learning_rate": 9.163690109619412e-06, "loss": 0.1527, "step": 7752 }, { "epoch": 2.8629985228951256, "grad_norm": 0.23973438143730164, "learning_rate": 9.139056534055918e-06, "loss": 0.1501, "step": 7753 }, { "epoch": 2.863367799113737, "grad_norm": 0.29730549454689026, "learning_rate": 9.114422958492426e-06, "loss": 0.1762, "step": 7754 }, { "epoch": 2.863737075332349, "grad_norm": 0.32645919919013977, "learning_rate": 9.089789382928932e-06, "loss": 0.1873, "step": 7755 }, { "epoch": 2.8641063515509604, "grad_norm": 0.26671698689460754, "learning_rate": 9.06515580736544e-06, "loss": 0.1432, "step": 7756 }, { "epoch": 2.8644756277695715, "grad_norm": 0.27620962262153625, "learning_rate": 9.040522231801946e-06, "loss": 0.1566, "step": 7757 }, { "epoch": 2.864844903988183, "grad_norm": 0.30255046486854553, "learning_rate": 9.015888656238454e-06, "loss": 0.1774, "step": 7758 }, { "epoch": 2.8652141802067947, "grad_norm": 0.2471076399087906, "learning_rate": 8.99125508067496e-06, "loss": 0.1641, "step": 7759 }, { "epoch": 2.8655834564254064, "grad_norm": 0.26162055134773254, "learning_rate": 8.966621505111468e-06, "loss": 0.1522, "step": 7760 }, { "epoch": 2.8659527326440175, "grad_norm": 0.27141493558883667, "learning_rate": 8.941987929547974e-06, "loss": 0.1752, "step": 7761 }, { "epoch": 2.866322008862629, "grad_norm": 0.2501200735569, "learning_rate": 8.91735435398448e-06, "loss": 0.1583, "step": 7762 }, { "epoch": 2.8666912850812407, "grad_norm": 0.24008040130138397, "learning_rate": 8.892720778420988e-06, "loss": 0.1421, "step": 7763 }, { "epoch": 2.8670605612998523, "grad_norm": 0.27715250849723816, "learning_rate": 8.868087202857494e-06, "loss": 0.1786, "step": 7764 }, { "epoch": 2.867429837518464, "grad_norm": 0.32403644919395447, "learning_rate": 8.843453627294002e-06, "loss": 0.1633, "step": 7765 }, { "epoch": 2.8677991137370755, "grad_norm": 0.3056527078151703, "learning_rate": 8.818820051730508e-06, "loss": 0.1882, "step": 7766 }, { "epoch": 2.868168389955687, "grad_norm": 0.28270432353019714, "learning_rate": 8.794186476167016e-06, "loss": 0.1688, "step": 7767 }, { "epoch": 2.8685376661742983, "grad_norm": 0.25626882910728455, "learning_rate": 8.769552900603523e-06, "loss": 0.1562, "step": 7768 }, { "epoch": 2.86890694239291, "grad_norm": 0.2904073894023895, "learning_rate": 8.74491932504003e-06, "loss": 0.1684, "step": 7769 }, { "epoch": 2.8692762186115215, "grad_norm": 0.3247445821762085, "learning_rate": 8.720285749476537e-06, "loss": 0.1763, "step": 7770 }, { "epoch": 2.869645494830133, "grad_norm": 0.28118449449539185, "learning_rate": 8.695652173913044e-06, "loss": 0.1753, "step": 7771 }, { "epoch": 2.8700147710487443, "grad_norm": 0.2690677046775818, "learning_rate": 8.67101859834955e-06, "loss": 0.1362, "step": 7772 }, { "epoch": 2.870384047267356, "grad_norm": 0.2865660786628723, "learning_rate": 8.646385022786058e-06, "loss": 0.1805, "step": 7773 }, { "epoch": 2.8707533234859675, "grad_norm": 0.23917242884635925, "learning_rate": 8.621751447222565e-06, "loss": 0.1732, "step": 7774 }, { "epoch": 2.871122599704579, "grad_norm": 0.23305903375148773, "learning_rate": 8.597117871659072e-06, "loss": 0.1614, "step": 7775 }, { "epoch": 2.8714918759231907, "grad_norm": 0.25651511549949646, "learning_rate": 8.572484296095579e-06, "loss": 0.1778, "step": 7776 }, { "epoch": 2.8718611521418023, "grad_norm": 0.252269983291626, "learning_rate": 8.547850720532087e-06, "loss": 0.1585, "step": 7777 }, { "epoch": 2.8722304283604134, "grad_norm": 0.2591896057128906, "learning_rate": 8.523217144968593e-06, "loss": 0.1766, "step": 7778 }, { "epoch": 2.872599704579025, "grad_norm": 0.26428085565567017, "learning_rate": 8.4985835694051e-06, "loss": 0.1432, "step": 7779 }, { "epoch": 2.8729689807976366, "grad_norm": 0.2558910846710205, "learning_rate": 8.473949993841607e-06, "loss": 0.1451, "step": 7780 }, { "epoch": 2.8733382570162482, "grad_norm": 0.37068071961402893, "learning_rate": 8.449316418278113e-06, "loss": 0.2287, "step": 7781 }, { "epoch": 2.87370753323486, "grad_norm": 0.21412429213523865, "learning_rate": 8.42468284271462e-06, "loss": 0.1313, "step": 7782 }, { "epoch": 2.874076809453471, "grad_norm": 0.278902530670166, "learning_rate": 8.400049267151127e-06, "loss": 0.1754, "step": 7783 }, { "epoch": 2.8744460856720826, "grad_norm": 0.27167651057243347, "learning_rate": 8.375415691587635e-06, "loss": 0.1608, "step": 7784 }, { "epoch": 2.874815361890694, "grad_norm": 0.236875981092453, "learning_rate": 8.350782116024141e-06, "loss": 0.1775, "step": 7785 }, { "epoch": 2.875184638109306, "grad_norm": 0.23661479353904724, "learning_rate": 8.326148540460649e-06, "loss": 0.1473, "step": 7786 }, { "epoch": 2.8755539143279174, "grad_norm": 0.27548691630363464, "learning_rate": 8.301514964897155e-06, "loss": 0.1524, "step": 7787 }, { "epoch": 2.875923190546529, "grad_norm": 0.19834963977336884, "learning_rate": 8.276881389333663e-06, "loss": 0.141, "step": 7788 }, { "epoch": 2.87629246676514, "grad_norm": 0.23704633116722107, "learning_rate": 8.252247813770169e-06, "loss": 0.1583, "step": 7789 }, { "epoch": 2.8766617429837518, "grad_norm": 0.2559584677219391, "learning_rate": 8.227614238206677e-06, "loss": 0.1426, "step": 7790 }, { "epoch": 2.8770310192023634, "grad_norm": 0.35006478428840637, "learning_rate": 8.202980662643183e-06, "loss": 0.2012, "step": 7791 }, { "epoch": 2.877400295420975, "grad_norm": 0.27662691473960876, "learning_rate": 8.17834708707969e-06, "loss": 0.1644, "step": 7792 }, { "epoch": 2.8777695716395866, "grad_norm": 0.2628805637359619, "learning_rate": 8.153713511516197e-06, "loss": 0.1658, "step": 7793 }, { "epoch": 2.8781388478581977, "grad_norm": 0.24730327725410461, "learning_rate": 8.129079935952703e-06, "loss": 0.1697, "step": 7794 }, { "epoch": 2.8785081240768093, "grad_norm": 0.2437671720981598, "learning_rate": 8.104446360389211e-06, "loss": 0.1514, "step": 7795 }, { "epoch": 2.878877400295421, "grad_norm": 0.2743469774723053, "learning_rate": 8.079812784825717e-06, "loss": 0.1659, "step": 7796 }, { "epoch": 2.8792466765140325, "grad_norm": 0.29006099700927734, "learning_rate": 8.055179209262225e-06, "loss": 0.1617, "step": 7797 }, { "epoch": 2.879615952732644, "grad_norm": 0.2561509311199188, "learning_rate": 8.030545633698731e-06, "loss": 0.1404, "step": 7798 }, { "epoch": 2.8799852289512557, "grad_norm": 0.269986093044281, "learning_rate": 8.00591205813524e-06, "loss": 0.1521, "step": 7799 }, { "epoch": 2.880354505169867, "grad_norm": 0.23220254480838776, "learning_rate": 7.981278482571745e-06, "loss": 0.1503, "step": 7800 }, { "epoch": 2.880354505169867, "eval_loss": 0.24527710676193237, "eval_runtime": 5.8643, "eval_samples_per_second": 8.526, "eval_steps_per_second": 1.194, "step": 7800 }, { "epoch": 2.8807237813884785, "grad_norm": 0.2728011906147003, "learning_rate": 7.956644907008253e-06, "loss": 0.1579, "step": 7801 }, { "epoch": 2.88109305760709, "grad_norm": 0.24128302931785583, "learning_rate": 7.93201133144476e-06, "loss": 0.127, "step": 7802 }, { "epoch": 2.8814623338257017, "grad_norm": 0.2598339319229126, "learning_rate": 7.907377755881267e-06, "loss": 0.1582, "step": 7803 }, { "epoch": 2.881831610044313, "grad_norm": 0.30559489130973816, "learning_rate": 7.882744180317773e-06, "loss": 0.1755, "step": 7804 }, { "epoch": 2.8822008862629245, "grad_norm": 0.25795549154281616, "learning_rate": 7.858110604754281e-06, "loss": 0.173, "step": 7805 }, { "epoch": 2.882570162481536, "grad_norm": 0.30176594853401184, "learning_rate": 7.833477029190787e-06, "loss": 0.1725, "step": 7806 }, { "epoch": 2.8829394387001477, "grad_norm": 0.2542456388473511, "learning_rate": 7.808843453627295e-06, "loss": 0.1563, "step": 7807 }, { "epoch": 2.8833087149187593, "grad_norm": 0.26239079236984253, "learning_rate": 7.784209878063801e-06, "loss": 0.1751, "step": 7808 }, { "epoch": 2.883677991137371, "grad_norm": 0.25571733713150024, "learning_rate": 7.759576302500308e-06, "loss": 0.1665, "step": 7809 }, { "epoch": 2.8840472673559825, "grad_norm": 0.24270211160182953, "learning_rate": 7.734942726936816e-06, "loss": 0.173, "step": 7810 }, { "epoch": 2.8844165435745936, "grad_norm": 0.20862916111946106, "learning_rate": 7.710309151373322e-06, "loss": 0.1416, "step": 7811 }, { "epoch": 2.8847858197932053, "grad_norm": 0.2596467137336731, "learning_rate": 7.68567557580983e-06, "loss": 0.1754, "step": 7812 }, { "epoch": 2.885155096011817, "grad_norm": 0.3205491602420807, "learning_rate": 7.661042000246336e-06, "loss": 0.1826, "step": 7813 }, { "epoch": 2.8855243722304285, "grad_norm": 0.25602033734321594, "learning_rate": 7.636408424682844e-06, "loss": 0.1502, "step": 7814 }, { "epoch": 2.8858936484490396, "grad_norm": 0.261232852935791, "learning_rate": 7.611774849119351e-06, "loss": 0.1537, "step": 7815 }, { "epoch": 2.886262924667651, "grad_norm": 0.27913686633110046, "learning_rate": 7.587141273555858e-06, "loss": 0.1434, "step": 7816 }, { "epoch": 2.886632200886263, "grad_norm": 0.25788408517837524, "learning_rate": 7.562507697992365e-06, "loss": 0.1756, "step": 7817 }, { "epoch": 2.8870014771048744, "grad_norm": 0.2901060879230499, "learning_rate": 7.537874122428872e-06, "loss": 0.1842, "step": 7818 }, { "epoch": 2.887370753323486, "grad_norm": 0.2752688527107239, "learning_rate": 7.513240546865379e-06, "loss": 0.1801, "step": 7819 }, { "epoch": 2.8877400295420976, "grad_norm": 0.26344338059425354, "learning_rate": 7.488606971301884e-06, "loss": 0.1793, "step": 7820 }, { "epoch": 2.8881093057607092, "grad_norm": 0.2962002754211426, "learning_rate": 7.463973395738391e-06, "loss": 0.1896, "step": 7821 }, { "epoch": 2.8884785819793204, "grad_norm": 0.27300742268562317, "learning_rate": 7.439339820174898e-06, "loss": 0.1587, "step": 7822 }, { "epoch": 2.888847858197932, "grad_norm": 0.25111380219459534, "learning_rate": 7.414706244611405e-06, "loss": 0.1764, "step": 7823 }, { "epoch": 2.8892171344165436, "grad_norm": 0.2565227746963501, "learning_rate": 7.390072669047912e-06, "loss": 0.1544, "step": 7824 }, { "epoch": 2.889586410635155, "grad_norm": 0.27793753147125244, "learning_rate": 7.365439093484419e-06, "loss": 0.1584, "step": 7825 }, { "epoch": 2.8899556868537664, "grad_norm": 0.2671755850315094, "learning_rate": 7.340805517920926e-06, "loss": 0.1458, "step": 7826 }, { "epoch": 2.890324963072378, "grad_norm": 0.26163250207901, "learning_rate": 7.316171942357433e-06, "loss": 0.1607, "step": 7827 }, { "epoch": 2.8906942392909896, "grad_norm": 0.29872065782546997, "learning_rate": 7.29153836679394e-06, "loss": 0.1563, "step": 7828 }, { "epoch": 2.891063515509601, "grad_norm": 0.24086235463619232, "learning_rate": 7.266904791230447e-06, "loss": 0.1411, "step": 7829 }, { "epoch": 2.8914327917282128, "grad_norm": 0.2317669689655304, "learning_rate": 7.242271215666954e-06, "loss": 0.1401, "step": 7830 }, { "epoch": 2.8918020679468244, "grad_norm": 0.24406473338603973, "learning_rate": 7.217637640103461e-06, "loss": 0.1528, "step": 7831 }, { "epoch": 2.892171344165436, "grad_norm": 0.24135088920593262, "learning_rate": 7.193004064539968e-06, "loss": 0.1949, "step": 7832 }, { "epoch": 2.892540620384047, "grad_norm": 0.305979460477829, "learning_rate": 7.168370488976475e-06, "loss": 0.1796, "step": 7833 }, { "epoch": 2.8929098966026587, "grad_norm": 0.2573777437210083, "learning_rate": 7.143736913412982e-06, "loss": 0.1656, "step": 7834 }, { "epoch": 2.8932791728212703, "grad_norm": 0.2556525766849518, "learning_rate": 7.119103337849489e-06, "loss": 0.1697, "step": 7835 }, { "epoch": 2.893648449039882, "grad_norm": 0.2672213613986969, "learning_rate": 7.094469762285996e-06, "loss": 0.1663, "step": 7836 }, { "epoch": 2.894017725258493, "grad_norm": 0.2884562015533447, "learning_rate": 7.069836186722503e-06, "loss": 0.1838, "step": 7837 }, { "epoch": 2.8943870014771047, "grad_norm": 0.27062246203422546, "learning_rate": 7.04520261115901e-06, "loss": 0.1629, "step": 7838 }, { "epoch": 2.8947562776957163, "grad_norm": 0.2844317853450775, "learning_rate": 7.020569035595517e-06, "loss": 0.1759, "step": 7839 }, { "epoch": 2.895125553914328, "grad_norm": 0.2508607506752014, "learning_rate": 6.995935460032024e-06, "loss": 0.167, "step": 7840 }, { "epoch": 2.8954948301329395, "grad_norm": 0.23912623524665833, "learning_rate": 6.971301884468531e-06, "loss": 0.1729, "step": 7841 }, { "epoch": 2.895864106351551, "grad_norm": 0.24652552604675293, "learning_rate": 6.946668308905038e-06, "loss": 0.157, "step": 7842 }, { "epoch": 2.8962333825701627, "grad_norm": 0.3296917676925659, "learning_rate": 6.922034733341545e-06, "loss": 0.172, "step": 7843 }, { "epoch": 2.896602658788774, "grad_norm": 0.2623459994792938, "learning_rate": 6.897401157778052e-06, "loss": 0.165, "step": 7844 }, { "epoch": 2.8969719350073855, "grad_norm": 0.19621816277503967, "learning_rate": 6.872767582214559e-06, "loss": 0.1286, "step": 7845 }, { "epoch": 2.897341211225997, "grad_norm": 0.31013402342796326, "learning_rate": 6.848134006651066e-06, "loss": 0.164, "step": 7846 }, { "epoch": 2.8977104874446087, "grad_norm": 0.2559880018234253, "learning_rate": 6.823500431087573e-06, "loss": 0.1622, "step": 7847 }, { "epoch": 2.89807976366322, "grad_norm": 0.31077656149864197, "learning_rate": 6.7988668555240804e-06, "loss": 0.1863, "step": 7848 }, { "epoch": 2.8984490398818314, "grad_norm": 0.29186683893203735, "learning_rate": 6.774233279960586e-06, "loss": 0.1564, "step": 7849 }, { "epoch": 2.898818316100443, "grad_norm": 0.35916706919670105, "learning_rate": 6.749599704397093e-06, "loss": 0.1862, "step": 7850 }, { "epoch": 2.898818316100443, "eval_loss": 0.2451774775981903, "eval_runtime": 5.8467, "eval_samples_per_second": 8.552, "eval_steps_per_second": 1.197, "step": 7850 }, { "epoch": 2.8991875923190547, "grad_norm": 0.3264926075935364, "learning_rate": 6.7249661288336e-06, "loss": 0.1918, "step": 7851 }, { "epoch": 2.8995568685376663, "grad_norm": 0.248603954911232, "learning_rate": 6.700332553270107e-06, "loss": 0.1441, "step": 7852 }, { "epoch": 2.899926144756278, "grad_norm": 0.23523470759391785, "learning_rate": 6.675698977706614e-06, "loss": 0.1565, "step": 7853 }, { "epoch": 2.9002954209748895, "grad_norm": 0.29151037335395813, "learning_rate": 6.651065402143121e-06, "loss": 0.1666, "step": 7854 }, { "epoch": 2.9006646971935006, "grad_norm": 0.2865246832370758, "learning_rate": 6.626431826579628e-06, "loss": 0.1907, "step": 7855 }, { "epoch": 2.901033973412112, "grad_norm": 0.29374998807907104, "learning_rate": 6.601798251016135e-06, "loss": 0.1568, "step": 7856 }, { "epoch": 2.901403249630724, "grad_norm": 0.23669621348381042, "learning_rate": 6.577164675452642e-06, "loss": 0.1433, "step": 7857 }, { "epoch": 2.9017725258493354, "grad_norm": 0.28078481554985046, "learning_rate": 6.552531099889149e-06, "loss": 0.1436, "step": 7858 }, { "epoch": 2.9021418020679466, "grad_norm": 0.24589957296848297, "learning_rate": 6.527897524325656e-06, "loss": 0.1601, "step": 7859 }, { "epoch": 2.902511078286558, "grad_norm": 0.2863220274448395, "learning_rate": 6.503263948762163e-06, "loss": 0.1764, "step": 7860 }, { "epoch": 2.90288035450517, "grad_norm": 0.29853659868240356, "learning_rate": 6.47863037319867e-06, "loss": 0.1868, "step": 7861 }, { "epoch": 2.9032496307237814, "grad_norm": 0.30052450299263, "learning_rate": 6.453996797635177e-06, "loss": 0.1709, "step": 7862 }, { "epoch": 2.903618906942393, "grad_norm": 0.34883439540863037, "learning_rate": 6.429363222071684e-06, "loss": 0.1976, "step": 7863 }, { "epoch": 2.9039881831610046, "grad_norm": 0.2708515226840973, "learning_rate": 6.404729646508191e-06, "loss": 0.1768, "step": 7864 }, { "epoch": 2.904357459379616, "grad_norm": 0.24446143209934235, "learning_rate": 6.380096070944698e-06, "loss": 0.144, "step": 7865 }, { "epoch": 2.9047267355982274, "grad_norm": 0.2536332309246063, "learning_rate": 6.355462495381205e-06, "loss": 0.1518, "step": 7866 }, { "epoch": 2.905096011816839, "grad_norm": 0.2545977532863617, "learning_rate": 6.330828919817712e-06, "loss": 0.1563, "step": 7867 }, { "epoch": 2.9054652880354506, "grad_norm": 0.3001493513584137, "learning_rate": 6.306195344254219e-06, "loss": 0.1725, "step": 7868 }, { "epoch": 2.905834564254062, "grad_norm": 0.34021979570388794, "learning_rate": 6.281561768690726e-06, "loss": 0.1953, "step": 7869 }, { "epoch": 2.9062038404726733, "grad_norm": 0.2688051760196686, "learning_rate": 6.256928193127233e-06, "loss": 0.1415, "step": 7870 }, { "epoch": 2.906573116691285, "grad_norm": 0.28454554080963135, "learning_rate": 6.232294617563739e-06, "loss": 0.1682, "step": 7871 }, { "epoch": 2.9069423929098965, "grad_norm": 0.2471422255039215, "learning_rate": 6.207661042000246e-06, "loss": 0.1544, "step": 7872 }, { "epoch": 2.907311669128508, "grad_norm": 0.29998016357421875, "learning_rate": 6.183027466436753e-06, "loss": 0.1577, "step": 7873 }, { "epoch": 2.9076809453471197, "grad_norm": 0.33225128054618835, "learning_rate": 6.15839389087326e-06, "loss": 0.1755, "step": 7874 }, { "epoch": 2.9080502215657313, "grad_norm": 0.3006001114845276, "learning_rate": 6.133760315309767e-06, "loss": 0.1706, "step": 7875 }, { "epoch": 2.908419497784343, "grad_norm": 0.2629902958869934, "learning_rate": 6.109126739746274e-06, "loss": 0.17, "step": 7876 }, { "epoch": 2.908788774002954, "grad_norm": 0.2456972301006317, "learning_rate": 6.084493164182781e-06, "loss": 0.1604, "step": 7877 }, { "epoch": 2.9091580502215657, "grad_norm": 0.283809632062912, "learning_rate": 6.059859588619288e-06, "loss": 0.1541, "step": 7878 }, { "epoch": 2.9095273264401773, "grad_norm": 0.2507307231426239, "learning_rate": 6.035226013055795e-06, "loss": 0.1523, "step": 7879 }, { "epoch": 2.909896602658789, "grad_norm": 0.25242727994918823, "learning_rate": 6.010592437492302e-06, "loss": 0.1497, "step": 7880 }, { "epoch": 2.9102658788774, "grad_norm": 0.27692046761512756, "learning_rate": 5.9859588619288094e-06, "loss": 0.1539, "step": 7881 }, { "epoch": 2.9106351550960117, "grad_norm": 0.26622235774993896, "learning_rate": 5.9613252863653164e-06, "loss": 0.1691, "step": 7882 }, { "epoch": 2.9110044313146233, "grad_norm": 0.21173381805419922, "learning_rate": 5.9366917108018235e-06, "loss": 0.1424, "step": 7883 }, { "epoch": 2.911373707533235, "grad_norm": 0.26915299892425537, "learning_rate": 5.9120581352383305e-06, "loss": 0.1662, "step": 7884 }, { "epoch": 2.9117429837518465, "grad_norm": 0.28042080998420715, "learning_rate": 5.887424559674837e-06, "loss": 0.1239, "step": 7885 }, { "epoch": 2.912112259970458, "grad_norm": 0.2873269319534302, "learning_rate": 5.862790984111344e-06, "loss": 0.169, "step": 7886 }, { "epoch": 2.9124815361890697, "grad_norm": 0.2641640603542328, "learning_rate": 5.838157408547851e-06, "loss": 0.1602, "step": 7887 }, { "epoch": 2.912850812407681, "grad_norm": 0.288686603307724, "learning_rate": 5.813523832984358e-06, "loss": 0.1896, "step": 7888 }, { "epoch": 2.9132200886262924, "grad_norm": 0.2504146099090576, "learning_rate": 5.788890257420865e-06, "loss": 0.1641, "step": 7889 }, { "epoch": 2.913589364844904, "grad_norm": 0.2569611966609955, "learning_rate": 5.764256681857372e-06, "loss": 0.1658, "step": 7890 }, { "epoch": 2.9139586410635157, "grad_norm": 0.24803724884986877, "learning_rate": 5.739623106293879e-06, "loss": 0.1536, "step": 7891 }, { "epoch": 2.914327917282127, "grad_norm": 0.2211741954088211, "learning_rate": 5.714989530730386e-06, "loss": 0.1344, "step": 7892 }, { "epoch": 2.9146971935007384, "grad_norm": 0.3000262975692749, "learning_rate": 5.690355955166893e-06, "loss": 0.174, "step": 7893 }, { "epoch": 2.91506646971935, "grad_norm": 0.2736184298992157, "learning_rate": 5.6657223796034e-06, "loss": 0.1526, "step": 7894 }, { "epoch": 2.9154357459379616, "grad_norm": 0.30402082204818726, "learning_rate": 5.641088804039907e-06, "loss": 0.1604, "step": 7895 }, { "epoch": 2.9158050221565732, "grad_norm": 0.3075074851512909, "learning_rate": 5.616455228476414e-06, "loss": 0.1956, "step": 7896 }, { "epoch": 2.916174298375185, "grad_norm": 0.27981919050216675, "learning_rate": 5.591821652912921e-06, "loss": 0.1562, "step": 7897 }, { "epoch": 2.9165435745937964, "grad_norm": 0.2536432445049286, "learning_rate": 5.567188077349428e-06, "loss": 0.1686, "step": 7898 }, { "epoch": 2.9169128508124076, "grad_norm": 0.23276817798614502, "learning_rate": 5.542554501785935e-06, "loss": 0.1457, "step": 7899 }, { "epoch": 2.917282127031019, "grad_norm": 0.31259551644325256, "learning_rate": 5.517920926222441e-06, "loss": 0.1826, "step": 7900 }, { "epoch": 2.917282127031019, "eval_loss": 0.24526342749595642, "eval_runtime": 5.8635, "eval_samples_per_second": 8.527, "eval_steps_per_second": 1.194, "step": 7900 }, { "epoch": 2.917651403249631, "grad_norm": 0.25709274411201477, "learning_rate": 5.493287350658948e-06, "loss": 0.1402, "step": 7901 }, { "epoch": 2.9180206794682424, "grad_norm": 0.29496487975120544, "learning_rate": 5.468653775095455e-06, "loss": 0.2003, "step": 7902 }, { "epoch": 2.9183899556868536, "grad_norm": 0.26852312684059143, "learning_rate": 5.444020199531962e-06, "loss": 0.1718, "step": 7903 }, { "epoch": 2.918759231905465, "grad_norm": 0.24652555584907532, "learning_rate": 5.419386623968469e-06, "loss": 0.1455, "step": 7904 }, { "epoch": 2.9191285081240768, "grad_norm": 0.2851690649986267, "learning_rate": 5.394753048404976e-06, "loss": 0.1607, "step": 7905 }, { "epoch": 2.9194977843426884, "grad_norm": 0.26648518443107605, "learning_rate": 5.370119472841483e-06, "loss": 0.1223, "step": 7906 }, { "epoch": 2.9198670605613, "grad_norm": 0.2626606523990631, "learning_rate": 5.34548589727799e-06, "loss": 0.1591, "step": 7907 }, { "epoch": 2.9202363367799116, "grad_norm": 0.2880263030529022, "learning_rate": 5.320852321714497e-06, "loss": 0.1542, "step": 7908 }, { "epoch": 2.920605612998523, "grad_norm": 0.2149953842163086, "learning_rate": 5.296218746151004e-06, "loss": 0.1494, "step": 7909 }, { "epoch": 2.9209748892171343, "grad_norm": 0.26451849937438965, "learning_rate": 5.271585170587511e-06, "loss": 0.1557, "step": 7910 }, { "epoch": 2.921344165435746, "grad_norm": 0.28757619857788086, "learning_rate": 5.246951595024018e-06, "loss": 0.1918, "step": 7911 }, { "epoch": 2.9217134416543575, "grad_norm": 0.24943974614143372, "learning_rate": 5.222318019460525e-06, "loss": 0.1522, "step": 7912 }, { "epoch": 2.922082717872969, "grad_norm": 0.2756868004798889, "learning_rate": 5.197684443897032e-06, "loss": 0.166, "step": 7913 }, { "epoch": 2.9224519940915803, "grad_norm": 0.2607571482658386, "learning_rate": 5.1730508683335384e-06, "loss": 0.1528, "step": 7914 }, { "epoch": 2.922821270310192, "grad_norm": 0.28976476192474365, "learning_rate": 5.1484172927700454e-06, "loss": 0.1536, "step": 7915 }, { "epoch": 2.9231905465288035, "grad_norm": 0.29689449071884155, "learning_rate": 5.1237837172065525e-06, "loss": 0.1605, "step": 7916 }, { "epoch": 2.923559822747415, "grad_norm": 0.2582589387893677, "learning_rate": 5.0991501416430595e-06, "loss": 0.1748, "step": 7917 }, { "epoch": 2.9239290989660267, "grad_norm": 0.2462097406387329, "learning_rate": 5.0745165660795665e-06, "loss": 0.1457, "step": 7918 }, { "epoch": 2.9242983751846383, "grad_norm": 0.27874815464019775, "learning_rate": 5.0498829905160735e-06, "loss": 0.1626, "step": 7919 }, { "epoch": 2.9246676514032495, "grad_norm": 0.24357378482818604, "learning_rate": 5.0252494149525805e-06, "loss": 0.1652, "step": 7920 }, { "epoch": 2.925036927621861, "grad_norm": 0.35426661372184753, "learning_rate": 5.0006158393890875e-06, "loss": 0.1592, "step": 7921 }, { "epoch": 2.9254062038404727, "grad_norm": 0.2852160632610321, "learning_rate": 4.9759822638255946e-06, "loss": 0.1733, "step": 7922 }, { "epoch": 2.9257754800590843, "grad_norm": 0.2774888873100281, "learning_rate": 4.9513486882621016e-06, "loss": 0.1716, "step": 7923 }, { "epoch": 2.926144756277696, "grad_norm": 0.2971727252006531, "learning_rate": 4.926715112698609e-06, "loss": 0.1473, "step": 7924 }, { "epoch": 2.926514032496307, "grad_norm": 0.25789374113082886, "learning_rate": 4.902081537135116e-06, "loss": 0.1656, "step": 7925 }, { "epoch": 2.9268833087149186, "grad_norm": 0.28932541608810425, "learning_rate": 4.877447961571623e-06, "loss": 0.1701, "step": 7926 }, { "epoch": 2.9272525849335302, "grad_norm": 0.30476686358451843, "learning_rate": 4.85281438600813e-06, "loss": 0.1594, "step": 7927 }, { "epoch": 2.927621861152142, "grad_norm": 0.2554304003715515, "learning_rate": 4.828180810444637e-06, "loss": 0.1516, "step": 7928 }, { "epoch": 2.9279911373707534, "grad_norm": 0.3235216438770294, "learning_rate": 4.803547234881143e-06, "loss": 0.1785, "step": 7929 }, { "epoch": 2.928360413589365, "grad_norm": 0.26630640029907227, "learning_rate": 4.77891365931765e-06, "loss": 0.1499, "step": 7930 }, { "epoch": 2.928729689807976, "grad_norm": 0.2978435754776001, "learning_rate": 4.754280083754157e-06, "loss": 0.1697, "step": 7931 }, { "epoch": 2.929098966026588, "grad_norm": 0.2978525161743164, "learning_rate": 4.729646508190664e-06, "loss": 0.1674, "step": 7932 }, { "epoch": 2.9294682422451994, "grad_norm": 0.3201218247413635, "learning_rate": 4.705012932627171e-06, "loss": 0.201, "step": 7933 }, { "epoch": 2.929837518463811, "grad_norm": 0.27196231484413147, "learning_rate": 4.680379357063678e-06, "loss": 0.162, "step": 7934 }, { "epoch": 2.930206794682422, "grad_norm": 0.28881552815437317, "learning_rate": 4.655745781500185e-06, "loss": 0.1804, "step": 7935 }, { "epoch": 2.930576070901034, "grad_norm": 0.2389811873435974, "learning_rate": 4.631112205936692e-06, "loss": 0.158, "step": 7936 }, { "epoch": 2.9309453471196454, "grad_norm": 0.25321751832962036, "learning_rate": 4.606478630373199e-06, "loss": 0.1604, "step": 7937 }, { "epoch": 2.931314623338257, "grad_norm": 0.27187028527259827, "learning_rate": 4.581845054809706e-06, "loss": 0.178, "step": 7938 }, { "epoch": 2.9316838995568686, "grad_norm": 0.2438676357269287, "learning_rate": 4.557211479246213e-06, "loss": 0.1566, "step": 7939 }, { "epoch": 2.93205317577548, "grad_norm": 0.2593154013156891, "learning_rate": 4.53257790368272e-06, "loss": 0.1763, "step": 7940 }, { "epoch": 2.932422451994092, "grad_norm": 0.2560650408267975, "learning_rate": 4.507944328119227e-06, "loss": 0.145, "step": 7941 }, { "epoch": 2.932791728212703, "grad_norm": 0.25635337829589844, "learning_rate": 4.483310752555734e-06, "loss": 0.1524, "step": 7942 }, { "epoch": 2.9331610044313146, "grad_norm": 0.23034490644931793, "learning_rate": 4.45867717699224e-06, "loss": 0.1479, "step": 7943 }, { "epoch": 2.933530280649926, "grad_norm": 0.25828734040260315, "learning_rate": 4.434043601428747e-06, "loss": 0.1819, "step": 7944 }, { "epoch": 2.9338995568685378, "grad_norm": 0.30721622705459595, "learning_rate": 4.409410025865254e-06, "loss": 0.1782, "step": 7945 }, { "epoch": 2.934268833087149, "grad_norm": 0.2951761484146118, "learning_rate": 4.384776450301761e-06, "loss": 0.1616, "step": 7946 }, { "epoch": 2.9346381093057605, "grad_norm": 0.2366657704114914, "learning_rate": 4.360142874738268e-06, "loss": 0.1546, "step": 7947 }, { "epoch": 2.935007385524372, "grad_norm": 0.30123084783554077, "learning_rate": 4.335509299174775e-06, "loss": 0.2079, "step": 7948 }, { "epoch": 2.9353766617429837, "grad_norm": 0.2627218961715698, "learning_rate": 4.310875723611282e-06, "loss": 0.1651, "step": 7949 }, { "epoch": 2.9357459379615953, "grad_norm": 0.30095189809799194, "learning_rate": 4.286242148047789e-06, "loss": 0.1615, "step": 7950 }, { "epoch": 2.9357459379615953, "eval_loss": 0.24511057138442993, "eval_runtime": 5.8646, "eval_samples_per_second": 8.526, "eval_steps_per_second": 1.194, "step": 7950 }, { "epoch": 2.936115214180207, "grad_norm": 0.320188969373703, "learning_rate": 4.261608572484296e-06, "loss": 0.1659, "step": 7951 }, { "epoch": 2.9364844903988185, "grad_norm": 0.25572508573532104, "learning_rate": 4.236974996920803e-06, "loss": 0.1477, "step": 7952 }, { "epoch": 2.9368537666174297, "grad_norm": 0.27010729908943176, "learning_rate": 4.21234142135731e-06, "loss": 0.1732, "step": 7953 }, { "epoch": 2.9372230428360413, "grad_norm": 0.32194846868515015, "learning_rate": 4.187707845793817e-06, "loss": 0.159, "step": 7954 }, { "epoch": 2.937592319054653, "grad_norm": 0.2527812123298645, "learning_rate": 4.163074270230324e-06, "loss": 0.1726, "step": 7955 }, { "epoch": 2.9379615952732645, "grad_norm": 0.2795100808143616, "learning_rate": 4.138440694666831e-06, "loss": 0.1746, "step": 7956 }, { "epoch": 2.9383308714918757, "grad_norm": 0.2902645766735077, "learning_rate": 4.1138071191033384e-06, "loss": 0.1647, "step": 7957 }, { "epoch": 2.9387001477104873, "grad_norm": 0.27305081486701965, "learning_rate": 4.089173543539845e-06, "loss": 0.1497, "step": 7958 }, { "epoch": 2.939069423929099, "grad_norm": 0.2554760277271271, "learning_rate": 4.064539967976352e-06, "loss": 0.1392, "step": 7959 }, { "epoch": 2.9394387001477105, "grad_norm": 0.25168025493621826, "learning_rate": 4.039906392412859e-06, "loss": 0.1522, "step": 7960 }, { "epoch": 2.939807976366322, "grad_norm": 0.22683395445346832, "learning_rate": 4.015272816849366e-06, "loss": 0.139, "step": 7961 }, { "epoch": 2.9401772525849337, "grad_norm": 0.2923617959022522, "learning_rate": 3.990639241285873e-06, "loss": 0.1802, "step": 7962 }, { "epoch": 2.9405465288035453, "grad_norm": 0.2750331163406372, "learning_rate": 3.96600566572238e-06, "loss": 0.1759, "step": 7963 }, { "epoch": 2.9409158050221564, "grad_norm": 0.3098607063293457, "learning_rate": 3.941372090158887e-06, "loss": 0.1703, "step": 7964 }, { "epoch": 2.941285081240768, "grad_norm": 0.2954683303833008, "learning_rate": 3.916738514595394e-06, "loss": 0.1844, "step": 7965 }, { "epoch": 2.9416543574593796, "grad_norm": 0.3084189295768738, "learning_rate": 3.892104939031901e-06, "loss": 0.165, "step": 7966 }, { "epoch": 2.9420236336779912, "grad_norm": 0.24966798722743988, "learning_rate": 3.867471363468408e-06, "loss": 0.1672, "step": 7967 }, { "epoch": 2.9423929098966024, "grad_norm": 0.2882799804210663, "learning_rate": 3.842837787904915e-06, "loss": 0.1438, "step": 7968 }, { "epoch": 2.942762186115214, "grad_norm": 0.2638078033924103, "learning_rate": 3.818204212341422e-06, "loss": 0.1667, "step": 7969 }, { "epoch": 2.9431314623338256, "grad_norm": 0.24424761533737183, "learning_rate": 3.793570636777929e-06, "loss": 0.1593, "step": 7970 }, { "epoch": 2.943500738552437, "grad_norm": 0.29140397906303406, "learning_rate": 3.768937061214436e-06, "loss": 0.1558, "step": 7971 }, { "epoch": 2.943870014771049, "grad_norm": 0.23234650492668152, "learning_rate": 3.744303485650942e-06, "loss": 0.1472, "step": 7972 }, { "epoch": 2.9442392909896604, "grad_norm": 0.2452017366886139, "learning_rate": 3.719669910087449e-06, "loss": 0.1529, "step": 7973 }, { "epoch": 2.944608567208272, "grad_norm": 0.255310982465744, "learning_rate": 3.695036334523956e-06, "loss": 0.1496, "step": 7974 }, { "epoch": 2.944977843426883, "grad_norm": 0.24070237576961517, "learning_rate": 3.670402758960463e-06, "loss": 0.1656, "step": 7975 }, { "epoch": 2.945347119645495, "grad_norm": 0.3284929096698761, "learning_rate": 3.64576918339697e-06, "loss": 0.1964, "step": 7976 }, { "epoch": 2.9457163958641064, "grad_norm": 0.28253570199012756, "learning_rate": 3.621135607833477e-06, "loss": 0.1747, "step": 7977 }, { "epoch": 2.946085672082718, "grad_norm": 0.3123980760574341, "learning_rate": 3.596502032269984e-06, "loss": 0.1751, "step": 7978 }, { "epoch": 2.946454948301329, "grad_norm": 0.2887688875198364, "learning_rate": 3.571868456706491e-06, "loss": 0.1769, "step": 7979 }, { "epoch": 2.9468242245199407, "grad_norm": 0.2751558721065521, "learning_rate": 3.547234881142998e-06, "loss": 0.1515, "step": 7980 }, { "epoch": 2.9471935007385524, "grad_norm": 0.24743711948394775, "learning_rate": 3.522601305579505e-06, "loss": 0.1494, "step": 7981 }, { "epoch": 2.947562776957164, "grad_norm": 0.2867875099182129, "learning_rate": 3.497967730016012e-06, "loss": 0.1838, "step": 7982 }, { "epoch": 2.9479320531757756, "grad_norm": 0.29121991991996765, "learning_rate": 3.473334154452519e-06, "loss": 0.1603, "step": 7983 }, { "epoch": 2.948301329394387, "grad_norm": 0.3091704547405243, "learning_rate": 3.448700578889026e-06, "loss": 0.1553, "step": 7984 }, { "epoch": 2.9486706056129988, "grad_norm": 0.3139994740486145, "learning_rate": 3.424067003325533e-06, "loss": 0.1502, "step": 7985 }, { "epoch": 2.94903988183161, "grad_norm": 0.25884008407592773, "learning_rate": 3.3994334277620402e-06, "loss": 0.1555, "step": 7986 }, { "epoch": 2.9494091580502215, "grad_norm": 0.2872859537601471, "learning_rate": 3.3747998521985464e-06, "loss": 0.1579, "step": 7987 }, { "epoch": 2.949778434268833, "grad_norm": 0.258348286151886, "learning_rate": 3.3501662766350534e-06, "loss": 0.1594, "step": 7988 }, { "epoch": 2.9501477104874447, "grad_norm": 0.26962390542030334, "learning_rate": 3.3255327010715604e-06, "loss": 0.162, "step": 7989 }, { "epoch": 2.950516986706056, "grad_norm": 0.23087744414806366, "learning_rate": 3.3008991255080674e-06, "loss": 0.1421, "step": 7990 }, { "epoch": 2.9508862629246675, "grad_norm": 0.31266406178474426, "learning_rate": 3.2762655499445745e-06, "loss": 0.194, "step": 7991 }, { "epoch": 2.951255539143279, "grad_norm": 0.26265013217926025, "learning_rate": 3.2516319743810815e-06, "loss": 0.1632, "step": 7992 }, { "epoch": 2.9516248153618907, "grad_norm": 0.2959122657775879, "learning_rate": 3.2269983988175885e-06, "loss": 0.1748, "step": 7993 }, { "epoch": 2.9519940915805023, "grad_norm": 0.23374541103839874, "learning_rate": 3.2023648232540955e-06, "loss": 0.1464, "step": 7994 }, { "epoch": 2.952363367799114, "grad_norm": 0.26890239119529724, "learning_rate": 3.1777312476906025e-06, "loss": 0.1401, "step": 7995 }, { "epoch": 2.9527326440177255, "grad_norm": 0.2581759989261627, "learning_rate": 3.1530976721271095e-06, "loss": 0.1599, "step": 7996 }, { "epoch": 2.9531019202363367, "grad_norm": 0.32330840826034546, "learning_rate": 3.1284640965636165e-06, "loss": 0.1658, "step": 7997 }, { "epoch": 2.9534711964549483, "grad_norm": 0.3487650752067566, "learning_rate": 3.103830521000123e-06, "loss": 0.1904, "step": 7998 }, { "epoch": 2.95384047267356, "grad_norm": 0.27576979994773865, "learning_rate": 3.07919694543663e-06, "loss": 0.1683, "step": 7999 }, { "epoch": 2.9542097488921715, "grad_norm": 0.2911146283149719, "learning_rate": 3.054563369873137e-06, "loss": 0.1644, "step": 8000 }, { "epoch": 2.9542097488921715, "eval_loss": 0.24570205807685852, "eval_runtime": 5.8577, "eval_samples_per_second": 8.536, "eval_steps_per_second": 1.195, "step": 8000 }, { "epoch": 2.9545790251107826, "grad_norm": 0.26587924361228943, "learning_rate": 3.029929794309644e-06, "loss": 0.1499, "step": 8001 }, { "epoch": 2.9549483013293942, "grad_norm": 0.2555754780769348, "learning_rate": 3.005296218746151e-06, "loss": 0.1484, "step": 8002 }, { "epoch": 2.955317577548006, "grad_norm": 0.2865438163280487, "learning_rate": 2.9806626431826582e-06, "loss": 0.1627, "step": 8003 }, { "epoch": 2.9556868537666174, "grad_norm": 0.2447013258934021, "learning_rate": 2.9560290676191652e-06, "loss": 0.1359, "step": 8004 }, { "epoch": 2.956056129985229, "grad_norm": 0.30044007301330566, "learning_rate": 2.931395492055672e-06, "loss": 0.1935, "step": 8005 }, { "epoch": 2.9564254062038406, "grad_norm": 0.26513710618019104, "learning_rate": 2.906761916492179e-06, "loss": 0.1506, "step": 8006 }, { "epoch": 2.9567946824224522, "grad_norm": 0.24062731862068176, "learning_rate": 2.882128340928686e-06, "loss": 0.1558, "step": 8007 }, { "epoch": 2.9571639586410634, "grad_norm": 0.2982660233974457, "learning_rate": 2.857494765365193e-06, "loss": 0.1877, "step": 8008 }, { "epoch": 2.957533234859675, "grad_norm": 0.2812516987323761, "learning_rate": 2.8328611898017e-06, "loss": 0.1619, "step": 8009 }, { "epoch": 2.9579025110782866, "grad_norm": 0.23445898294448853, "learning_rate": 2.808227614238207e-06, "loss": 0.1538, "step": 8010 }, { "epoch": 2.958271787296898, "grad_norm": 0.2310696393251419, "learning_rate": 2.783594038674714e-06, "loss": 0.159, "step": 8011 }, { "epoch": 2.9586410635155094, "grad_norm": 0.24844327569007874, "learning_rate": 2.7589604631112205e-06, "loss": 0.158, "step": 8012 }, { "epoch": 2.959010339734121, "grad_norm": 0.24663670361042023, "learning_rate": 2.7343268875477275e-06, "loss": 0.1501, "step": 8013 }, { "epoch": 2.9593796159527326, "grad_norm": 0.2664964497089386, "learning_rate": 2.7096933119842346e-06, "loss": 0.1495, "step": 8014 }, { "epoch": 2.959748892171344, "grad_norm": 0.2691749632358551, "learning_rate": 2.6850597364207416e-06, "loss": 0.1565, "step": 8015 }, { "epoch": 2.960118168389956, "grad_norm": 0.31154337525367737, "learning_rate": 2.6604261608572486e-06, "loss": 0.1707, "step": 8016 }, { "epoch": 2.9604874446085674, "grad_norm": 0.2657756209373474, "learning_rate": 2.6357925852937556e-06, "loss": 0.1654, "step": 8017 }, { "epoch": 2.960856720827179, "grad_norm": 0.3763958811759949, "learning_rate": 2.6111590097302626e-06, "loss": 0.1668, "step": 8018 }, { "epoch": 2.96122599704579, "grad_norm": 0.3201150596141815, "learning_rate": 2.5865254341667692e-06, "loss": 0.1882, "step": 8019 }, { "epoch": 2.9615952732644018, "grad_norm": 0.269149512052536, "learning_rate": 2.5618918586032762e-06, "loss": 0.1537, "step": 8020 }, { "epoch": 2.9619645494830134, "grad_norm": 0.23237310349941254, "learning_rate": 2.5372582830397832e-06, "loss": 0.1524, "step": 8021 }, { "epoch": 2.962333825701625, "grad_norm": 0.24856051802635193, "learning_rate": 2.5126247074762903e-06, "loss": 0.1653, "step": 8022 }, { "epoch": 2.962703101920236, "grad_norm": 0.23548021912574768, "learning_rate": 2.4879911319127973e-06, "loss": 0.1574, "step": 8023 }, { "epoch": 2.9630723781388477, "grad_norm": 0.21869832277297974, "learning_rate": 2.4633575563493043e-06, "loss": 0.1356, "step": 8024 }, { "epoch": 2.9634416543574593, "grad_norm": 0.290967732667923, "learning_rate": 2.4387239807858113e-06, "loss": 0.1619, "step": 8025 }, { "epoch": 2.963810930576071, "grad_norm": 0.3094252645969391, "learning_rate": 2.4140904052223183e-06, "loss": 0.1964, "step": 8026 }, { "epoch": 2.9641802067946825, "grad_norm": 0.3074129521846771, "learning_rate": 2.389456829658825e-06, "loss": 0.1939, "step": 8027 }, { "epoch": 2.964549483013294, "grad_norm": 0.2699299156665802, "learning_rate": 2.364823254095332e-06, "loss": 0.1629, "step": 8028 }, { "epoch": 2.9649187592319057, "grad_norm": 0.267343133687973, "learning_rate": 2.340189678531839e-06, "loss": 0.1527, "step": 8029 }, { "epoch": 2.965288035450517, "grad_norm": 0.2700563669204712, "learning_rate": 2.315556102968346e-06, "loss": 0.1518, "step": 8030 }, { "epoch": 2.9656573116691285, "grad_norm": 0.21548229455947876, "learning_rate": 2.290922527404853e-06, "loss": 0.1378, "step": 8031 }, { "epoch": 2.96602658788774, "grad_norm": 0.2287013977766037, "learning_rate": 2.26628895184136e-06, "loss": 0.154, "step": 8032 }, { "epoch": 2.9663958641063517, "grad_norm": 0.30055034160614014, "learning_rate": 2.241655376277867e-06, "loss": 0.1648, "step": 8033 }, { "epoch": 2.966765140324963, "grad_norm": 0.2905229330062866, "learning_rate": 2.2170218007143736e-06, "loss": 0.178, "step": 8034 }, { "epoch": 2.9671344165435745, "grad_norm": 0.32654109597206116, "learning_rate": 2.1923882251508806e-06, "loss": 0.1646, "step": 8035 }, { "epoch": 2.967503692762186, "grad_norm": 0.3208484947681427, "learning_rate": 2.1677546495873876e-06, "loss": 0.1558, "step": 8036 }, { "epoch": 2.9678729689807977, "grad_norm": 0.3065069019794464, "learning_rate": 2.1431210740238947e-06, "loss": 0.1592, "step": 8037 }, { "epoch": 2.9682422451994093, "grad_norm": 0.2624164819717407, "learning_rate": 2.1184874984604017e-06, "loss": 0.1423, "step": 8038 }, { "epoch": 2.968611521418021, "grad_norm": 0.2898831069469452, "learning_rate": 2.0938539228969087e-06, "loss": 0.1625, "step": 8039 }, { "epoch": 2.9689807976366325, "grad_norm": 0.3115295171737671, "learning_rate": 2.0692203473334157e-06, "loss": 0.1741, "step": 8040 }, { "epoch": 2.9693500738552436, "grad_norm": 0.23843203485012054, "learning_rate": 2.0445867717699223e-06, "loss": 0.1444, "step": 8041 }, { "epoch": 2.9697193500738552, "grad_norm": 0.266390860080719, "learning_rate": 2.0199531962064293e-06, "loss": 0.1737, "step": 8042 }, { "epoch": 2.970088626292467, "grad_norm": 0.2281162291765213, "learning_rate": 1.9953196206429363e-06, "loss": 0.1385, "step": 8043 }, { "epoch": 2.9704579025110784, "grad_norm": 0.3282414972782135, "learning_rate": 1.9706860450794434e-06, "loss": 0.1923, "step": 8044 }, { "epoch": 2.9708271787296896, "grad_norm": 0.33279144763946533, "learning_rate": 1.9460524695159504e-06, "loss": 0.1817, "step": 8045 }, { "epoch": 2.971196454948301, "grad_norm": 0.26364079117774963, "learning_rate": 1.9214188939524574e-06, "loss": 0.1515, "step": 8046 }, { "epoch": 2.971565731166913, "grad_norm": 0.28969061374664307, "learning_rate": 1.8967853183889644e-06, "loss": 0.1623, "step": 8047 }, { "epoch": 2.9719350073855244, "grad_norm": 0.28264951705932617, "learning_rate": 1.872151742825471e-06, "loss": 0.1681, "step": 8048 }, { "epoch": 2.972304283604136, "grad_norm": 0.25010421872138977, "learning_rate": 1.847518167261978e-06, "loss": 0.1468, "step": 8049 }, { "epoch": 2.9726735598227476, "grad_norm": 0.31697016954421997, "learning_rate": 1.822884591698485e-06, "loss": 0.171, "step": 8050 }, { "epoch": 2.9726735598227476, "eval_loss": 0.2450605034828186, "eval_runtime": 5.8673, "eval_samples_per_second": 8.522, "eval_steps_per_second": 1.193, "step": 8050 }, { "epoch": 2.9730428360413588, "grad_norm": 0.24555766582489014, "learning_rate": 1.798251016134992e-06, "loss": 0.1479, "step": 8051 }, { "epoch": 2.9734121122599704, "grad_norm": 0.2706266939640045, "learning_rate": 1.773617440571499e-06, "loss": 0.1783, "step": 8052 }, { "epoch": 2.973781388478582, "grad_norm": 0.313739538192749, "learning_rate": 1.748983865008006e-06, "loss": 0.174, "step": 8053 }, { "epoch": 2.9741506646971936, "grad_norm": 0.30564966797828674, "learning_rate": 1.724350289444513e-06, "loss": 0.181, "step": 8054 }, { "epoch": 2.974519940915805, "grad_norm": 0.27199769020080566, "learning_rate": 1.6997167138810201e-06, "loss": 0.1927, "step": 8055 }, { "epoch": 2.9748892171344163, "grad_norm": 0.27665677666664124, "learning_rate": 1.6750831383175267e-06, "loss": 0.1662, "step": 8056 }, { "epoch": 2.975258493353028, "grad_norm": 0.28751057386398315, "learning_rate": 1.6504495627540337e-06, "loss": 0.1612, "step": 8057 }, { "epoch": 2.9756277695716395, "grad_norm": 0.27947041392326355, "learning_rate": 1.6258159871905407e-06, "loss": 0.1823, "step": 8058 }, { "epoch": 2.975997045790251, "grad_norm": 0.2704101502895355, "learning_rate": 1.6011824116270478e-06, "loss": 0.1662, "step": 8059 }, { "epoch": 2.9763663220088628, "grad_norm": 0.23133864998817444, "learning_rate": 1.5765488360635548e-06, "loss": 0.1445, "step": 8060 }, { "epoch": 2.9767355982274744, "grad_norm": 0.258308082818985, "learning_rate": 1.5519152605000616e-06, "loss": 0.167, "step": 8061 }, { "epoch": 2.9771048744460855, "grad_norm": 0.2986674904823303, "learning_rate": 1.5272816849365686e-06, "loss": 0.1473, "step": 8062 }, { "epoch": 2.977474150664697, "grad_norm": 0.283969521522522, "learning_rate": 1.5026481093730756e-06, "loss": 0.1635, "step": 8063 }, { "epoch": 2.9778434268833087, "grad_norm": 0.21613147854804993, "learning_rate": 1.4780145338095826e-06, "loss": 0.1524, "step": 8064 }, { "epoch": 2.9782127031019203, "grad_norm": 0.23958474397659302, "learning_rate": 1.4533809582460894e-06, "loss": 0.1532, "step": 8065 }, { "epoch": 2.9785819793205315, "grad_norm": 0.23401552438735962, "learning_rate": 1.4287473826825964e-06, "loss": 0.1561, "step": 8066 }, { "epoch": 2.978951255539143, "grad_norm": 0.31438690423965454, "learning_rate": 1.4041138071191035e-06, "loss": 0.1793, "step": 8067 }, { "epoch": 2.9793205317577547, "grad_norm": 0.2550419569015503, "learning_rate": 1.3794802315556103e-06, "loss": 0.1614, "step": 8068 }, { "epoch": 2.9796898079763663, "grad_norm": 0.260769248008728, "learning_rate": 1.3548466559921173e-06, "loss": 0.1554, "step": 8069 }, { "epoch": 2.980059084194978, "grad_norm": 0.3034195303916931, "learning_rate": 1.3302130804286243e-06, "loss": 0.168, "step": 8070 }, { "epoch": 2.9804283604135895, "grad_norm": 0.3023551106452942, "learning_rate": 1.3055795048651313e-06, "loss": 0.1526, "step": 8071 }, { "epoch": 2.980797636632201, "grad_norm": 0.2257782369852066, "learning_rate": 1.2809459293016381e-06, "loss": 0.1723, "step": 8072 }, { "epoch": 2.9811669128508123, "grad_norm": 0.3689556419849396, "learning_rate": 1.2563123537381451e-06, "loss": 0.1608, "step": 8073 }, { "epoch": 2.981536189069424, "grad_norm": 0.2841677963733673, "learning_rate": 1.2316787781746521e-06, "loss": 0.1873, "step": 8074 }, { "epoch": 2.9819054652880355, "grad_norm": 0.2721441984176636, "learning_rate": 1.2070452026111592e-06, "loss": 0.1349, "step": 8075 }, { "epoch": 2.982274741506647, "grad_norm": 0.23719848692417145, "learning_rate": 1.182411627047666e-06, "loss": 0.1715, "step": 8076 }, { "epoch": 2.9826440177252582, "grad_norm": 0.32348963618278503, "learning_rate": 1.157778051484173e-06, "loss": 0.1813, "step": 8077 }, { "epoch": 2.98301329394387, "grad_norm": 0.2904803454875946, "learning_rate": 1.13314447592068e-06, "loss": 0.1822, "step": 8078 }, { "epoch": 2.9833825701624814, "grad_norm": 0.27401435375213623, "learning_rate": 1.1085109003571868e-06, "loss": 0.1683, "step": 8079 }, { "epoch": 2.983751846381093, "grad_norm": 0.3362328112125397, "learning_rate": 1.0838773247936938e-06, "loss": 0.2015, "step": 8080 }, { "epoch": 2.9841211225997046, "grad_norm": 0.2640010416507721, "learning_rate": 1.0592437492302008e-06, "loss": 0.1428, "step": 8081 }, { "epoch": 2.9844903988183162, "grad_norm": 0.293450266122818, "learning_rate": 1.0346101736667079e-06, "loss": 0.1591, "step": 8082 }, { "epoch": 2.984859675036928, "grad_norm": 0.23264862596988678, "learning_rate": 1.0099765981032147e-06, "loss": 0.1231, "step": 8083 }, { "epoch": 2.985228951255539, "grad_norm": 0.2444784939289093, "learning_rate": 9.853430225397217e-07, "loss": 0.145, "step": 8084 }, { "epoch": 2.9855982274741506, "grad_norm": 0.39302757382392883, "learning_rate": 9.607094469762287e-07, "loss": 0.1942, "step": 8085 }, { "epoch": 2.985967503692762, "grad_norm": 0.23764002323150635, "learning_rate": 9.360758714127355e-07, "loss": 0.1602, "step": 8086 }, { "epoch": 2.986336779911374, "grad_norm": 0.2889630198478699, "learning_rate": 9.114422958492425e-07, "loss": 0.1732, "step": 8087 }, { "epoch": 2.986706056129985, "grad_norm": 0.32426342368125916, "learning_rate": 8.868087202857495e-07, "loss": 0.1864, "step": 8088 }, { "epoch": 2.9870753323485966, "grad_norm": 0.2573675513267517, "learning_rate": 8.621751447222565e-07, "loss": 0.1499, "step": 8089 }, { "epoch": 2.987444608567208, "grad_norm": 0.2688586711883545, "learning_rate": 8.375415691587634e-07, "loss": 0.147, "step": 8090 }, { "epoch": 2.9878138847858198, "grad_norm": 0.27188539505004883, "learning_rate": 8.129079935952704e-07, "loss": 0.1597, "step": 8091 }, { "epoch": 2.9881831610044314, "grad_norm": 0.2932220697402954, "learning_rate": 7.882744180317774e-07, "loss": 0.1718, "step": 8092 }, { "epoch": 2.988552437223043, "grad_norm": 0.30258461833000183, "learning_rate": 7.636408424682843e-07, "loss": 0.1796, "step": 8093 }, { "epoch": 2.9889217134416546, "grad_norm": 0.24038194119930267, "learning_rate": 7.390072669047913e-07, "loss": 0.132, "step": 8094 }, { "epoch": 2.9892909896602657, "grad_norm": 0.2573639750480652, "learning_rate": 7.143736913412982e-07, "loss": 0.1578, "step": 8095 }, { "epoch": 2.9896602658788773, "grad_norm": 0.3152335286140442, "learning_rate": 6.897401157778051e-07, "loss": 0.1636, "step": 8096 }, { "epoch": 2.990029542097489, "grad_norm": 0.22796498239040375, "learning_rate": 6.651065402143121e-07, "loss": 0.1319, "step": 8097 }, { "epoch": 2.9903988183161005, "grad_norm": 0.24346081912517548, "learning_rate": 6.404729646508191e-07, "loss": 0.1606, "step": 8098 }, { "epoch": 2.9907680945347117, "grad_norm": 0.27753716707229614, "learning_rate": 6.158393890873261e-07, "loss": 0.1547, "step": 8099 }, { "epoch": 2.9911373707533233, "grad_norm": 0.24621686339378357, "learning_rate": 5.91205813523833e-07, "loss": 0.1403, "step": 8100 }, { "epoch": 2.9911373707533233, "eval_loss": 0.24522972106933594, "eval_runtime": 5.8569, "eval_samples_per_second": 8.537, "eval_steps_per_second": 1.195, "step": 8100 }, { "epoch": 2.991506646971935, "grad_norm": 0.2303771823644638, "learning_rate": 5.6657223796034e-07, "loss": 0.1515, "step": 8101 }, { "epoch": 2.9918759231905465, "grad_norm": 0.2339046150445938, "learning_rate": 5.419386623968469e-07, "loss": 0.1266, "step": 8102 }, { "epoch": 2.992245199409158, "grad_norm": 0.32565292716026306, "learning_rate": 5.173050868333539e-07, "loss": 0.1642, "step": 8103 }, { "epoch": 2.9926144756277697, "grad_norm": 0.2603965997695923, "learning_rate": 4.926715112698608e-07, "loss": 0.1598, "step": 8104 }, { "epoch": 2.9929837518463813, "grad_norm": 0.2606295049190521, "learning_rate": 4.6803793570636775e-07, "loss": 0.1962, "step": 8105 }, { "epoch": 2.9933530280649925, "grad_norm": 0.2298162877559662, "learning_rate": 4.4340436014287476e-07, "loss": 0.1472, "step": 8106 }, { "epoch": 2.993722304283604, "grad_norm": 0.28554925322532654, "learning_rate": 4.187707845793817e-07, "loss": 0.1993, "step": 8107 }, { "epoch": 2.9940915805022157, "grad_norm": 0.2721603214740753, "learning_rate": 3.941372090158887e-07, "loss": 0.1705, "step": 8108 }, { "epoch": 2.9944608567208273, "grad_norm": 0.2951487600803375, "learning_rate": 3.6950363345239565e-07, "loss": 0.18, "step": 8109 }, { "epoch": 2.9948301329394384, "grad_norm": 0.27289626002311707, "learning_rate": 3.4487005788890257e-07, "loss": 0.1754, "step": 8110 }, { "epoch": 2.99519940915805, "grad_norm": 0.29108723998069763, "learning_rate": 3.2023648232540953e-07, "loss": 0.1602, "step": 8111 }, { "epoch": 2.9955686853766617, "grad_norm": 0.2703995704650879, "learning_rate": 2.956029067619165e-07, "loss": 0.1659, "step": 8112 }, { "epoch": 2.9959379615952733, "grad_norm": 0.23298288881778717, "learning_rate": 2.7096933119842346e-07, "loss": 0.1263, "step": 8113 }, { "epoch": 2.996307237813885, "grad_norm": 0.20616547763347626, "learning_rate": 2.463357556349304e-07, "loss": 0.1347, "step": 8114 }, { "epoch": 2.9966765140324965, "grad_norm": 0.2639867067337036, "learning_rate": 2.2170218007143738e-07, "loss": 0.146, "step": 8115 }, { "epoch": 2.997045790251108, "grad_norm": 0.3198007047176361, "learning_rate": 1.9706860450794435e-07, "loss": 0.2015, "step": 8116 }, { "epoch": 2.9974150664697192, "grad_norm": 0.24881775677204132, "learning_rate": 1.7243502894445128e-07, "loss": 0.1598, "step": 8117 }, { "epoch": 2.997784342688331, "grad_norm": 0.24045133590698242, "learning_rate": 1.4780145338095825e-07, "loss": 0.1345, "step": 8118 }, { "epoch": 2.9981536189069424, "grad_norm": 0.29275083541870117, "learning_rate": 1.231678778174652e-07, "loss": 0.1775, "step": 8119 }, { "epoch": 2.998522895125554, "grad_norm": 0.33159106969833374, "learning_rate": 9.853430225397217e-08, "loss": 0.1872, "step": 8120 }, { "epoch": 2.998892171344165, "grad_norm": 0.2544606626033783, "learning_rate": 7.390072669047912e-08, "loss": 0.1503, "step": 8121 }, { "epoch": 2.999261447562777, "grad_norm": 0.24590566754341125, "learning_rate": 4.9267151126986086e-08, "loss": 0.1568, "step": 8122 }, { "epoch": 2.9996307237813884, "grad_norm": 0.2536267638206482, "learning_rate": 2.4633575563493043e-08, "loss": 0.1525, "step": 8123 }, { "epoch": 3.0, "grad_norm": 0.32975172996520996, "learning_rate": 0.0, "loss": 0.1622, "step": 8124 } ], "logging_steps": 1, "max_steps": 8124, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.03894823567147e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }