{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9969018932874354, "eval_steps": 500, "global_step": 2178, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001376936316695353, "grad_norm": 6.1170735359191895, "learning_rate": 6.060606060606061e-07, "loss": 4.2914, "mean_token_accuracy": 0.3011375442147255, "num_tokens": 327335.0, "step": 1 }, { "epoch": 0.002753872633390706, "grad_norm": 10.659546852111816, "learning_rate": 1.2121212121212122e-06, "loss": 4.9262, "mean_token_accuracy": 0.2556382417678833, "num_tokens": 535993.0, "step": 2 }, { "epoch": 0.004130808950086058, "grad_norm": 13.748114585876465, "learning_rate": 1.8181818181818183e-06, "loss": 5.231, "mean_token_accuracy": 0.2342421542853117, "num_tokens": 712534.0, "step": 3 }, { "epoch": 0.005507745266781412, "grad_norm": 16.56269645690918, "learning_rate": 2.4242424242424244e-06, "loss": 5.5218, "mean_token_accuracy": 0.20879356376826763, "num_tokens": 870770.0, "step": 4 }, { "epoch": 0.0068846815834767644, "grad_norm": 20.039382934570312, "learning_rate": 3.0303030303030305e-06, "loss": 5.8364, "mean_token_accuracy": 0.18375738337635994, "num_tokens": 1016298.0, "step": 5 }, { "epoch": 0.008261617900172116, "grad_norm": 26.17948341369629, "learning_rate": 3.6363636363636366e-06, "loss": 6.1569, "mean_token_accuracy": 0.15754800289869308, "num_tokens": 1146968.0, "step": 6 }, { "epoch": 0.00963855421686747, "grad_norm": 5.590318202972412, "learning_rate": 4.242424242424243e-06, "loss": 4.2369, "mean_token_accuracy": 0.2513838531449437, "num_tokens": 1443830.0, "step": 7 }, { "epoch": 0.011015490533562823, "grad_norm": 8.490633964538574, "learning_rate": 4.848484848484849e-06, "loss": 4.6317, "mean_token_accuracy": 0.2664322331547737, "num_tokens": 1667243.0, "step": 8 }, { "epoch": 0.012392426850258176, "grad_norm": 7.378446578979492, "learning_rate": 5.4545454545454545e-06, "loss": 4.5733, "mean_token_accuracy": 0.28765494003891945, "num_tokens": 1850165.0, "step": 9 }, { "epoch": 0.013769363166953529, "grad_norm": 8.26639175415039, "learning_rate": 6.060606060606061e-06, "loss": 4.688, "mean_token_accuracy": 0.2813313528895378, "num_tokens": 2012517.0, "step": 10 }, { "epoch": 0.015146299483648882, "grad_norm": 8.842890739440918, "learning_rate": 6.666666666666667e-06, "loss": 4.7173, "mean_token_accuracy": 0.2718987576663494, "num_tokens": 2161289.0, "step": 11 }, { "epoch": 0.016523235800344233, "grad_norm": 9.959588050842285, "learning_rate": 7.272727272727273e-06, "loss": 4.37, "mean_token_accuracy": 0.3368013799190521, "num_tokens": 2296876.0, "step": 12 }, { "epoch": 0.017900172117039585, "grad_norm": 2.6649534702301025, "learning_rate": 7.87878787878788e-06, "loss": 3.8364, "mean_token_accuracy": 0.35284729674458504, "num_tokens": 2551520.0, "step": 13 }, { "epoch": 0.01927710843373494, "grad_norm": 2.641235113143921, "learning_rate": 8.484848484848486e-06, "loss": 3.9591, "mean_token_accuracy": 0.34351223334670067, "num_tokens": 2792980.0, "step": 14 }, { "epoch": 0.020654044750430294, "grad_norm": 2.882537603378296, "learning_rate": 9.090909090909091e-06, "loss": 3.924, "mean_token_accuracy": 0.3523600399494171, "num_tokens": 2984118.0, "step": 15 }, { "epoch": 0.022030981067125647, "grad_norm": 2.1470162868499756, "learning_rate": 9.696969696969698e-06, "loss": 3.8433, "mean_token_accuracy": 0.36025111749768257, "num_tokens": 3151043.0, "step": 16 }, { "epoch": 0.023407917383821, "grad_norm": 2.5478034019470215, "learning_rate": 1.0303030303030304e-05, "loss": 3.8078, "mean_token_accuracy": 0.36437464877963066, "num_tokens": 3303159.0, "step": 17 }, { "epoch": 0.024784853700516352, "grad_norm": 2.9914867877960205, "learning_rate": 1.0909090909090909e-05, "loss": 3.7083, "mean_token_accuracy": 0.3877281956374645, "num_tokens": 3442609.0, "step": 18 }, { "epoch": 0.026161790017211705, "grad_norm": 2.3635497093200684, "learning_rate": 1.1515151515151517e-05, "loss": 3.537, "mean_token_accuracy": 0.38807569071650505, "num_tokens": 3650273.0, "step": 19 }, { "epoch": 0.027538726333907058, "grad_norm": 1.674182653427124, "learning_rate": 1.2121212121212122e-05, "loss": 3.7883, "mean_token_accuracy": 0.34599847719073296, "num_tokens": 3920449.0, "step": 20 }, { "epoch": 0.02891566265060241, "grad_norm": 1.8661237955093384, "learning_rate": 1.2727272727272728e-05, "loss": 3.742, "mean_token_accuracy": 0.355143491178751, "num_tokens": 4120212.0, "step": 21 }, { "epoch": 0.030292598967297763, "grad_norm": 1.4890022277832031, "learning_rate": 1.3333333333333333e-05, "loss": 3.6502, "mean_token_accuracy": 0.3710937909781933, "num_tokens": 4292230.0, "step": 22 }, { "epoch": 0.03166953528399311, "grad_norm": 1.5397722721099854, "learning_rate": 1.3939393939393942e-05, "loss": 3.5805, "mean_token_accuracy": 0.38628287240862846, "num_tokens": 4447392.0, "step": 23 }, { "epoch": 0.033046471600688465, "grad_norm": 2.4324872493743896, "learning_rate": 1.4545454545454546e-05, "loss": 3.5303, "mean_token_accuracy": 0.3951631784439087, "num_tokens": 4590005.0, "step": 24 }, { "epoch": 0.03442340791738382, "grad_norm": 7.23522424697876, "learning_rate": 1.5151515151515153e-05, "loss": 3.4399, "mean_token_accuracy": 0.42313380539417267, "num_tokens": 4714638.0, "step": 25 }, { "epoch": 0.03580034423407917, "grad_norm": 1.0757200717926025, "learning_rate": 1.575757575757576e-05, "loss": 3.5252, "mean_token_accuracy": 0.3763487972319126, "num_tokens": 5045223.0, "step": 26 }, { "epoch": 0.03717728055077452, "grad_norm": 1.147684097290039, "learning_rate": 1.6363636363636366e-05, "loss": 3.6524, "mean_token_accuracy": 0.3689238876104355, "num_tokens": 5255226.0, "step": 27 }, { "epoch": 0.03855421686746988, "grad_norm": 1.357102870941162, "learning_rate": 1.6969696969696972e-05, "loss": 3.5329, "mean_token_accuracy": 0.38678500428795815, "num_tokens": 5432410.0, "step": 28 }, { "epoch": 0.039931153184165236, "grad_norm": 1.286051869392395, "learning_rate": 1.7575757575757576e-05, "loss": 3.442, "mean_token_accuracy": 0.40385453775525093, "num_tokens": 5591005.0, "step": 29 }, { "epoch": 0.04130808950086059, "grad_norm": 1.6013883352279663, "learning_rate": 1.8181818181818182e-05, "loss": 3.3502, "mean_token_accuracy": 0.418246079236269, "num_tokens": 5736535.0, "step": 30 }, { "epoch": 0.04268502581755594, "grad_norm": 2.5897297859191895, "learning_rate": 1.8787878787878792e-05, "loss": 3.185, "mean_token_accuracy": 0.44275902211666107, "num_tokens": 5866876.0, "step": 31 }, { "epoch": 0.044061962134251294, "grad_norm": 1.1504346132278442, "learning_rate": 1.9393939393939395e-05, "loss": 3.3967, "mean_token_accuracy": 0.4218350239098072, "num_tokens": 6160504.0, "step": 32 }, { "epoch": 0.045438898450946646, "grad_norm": 1.1350502967834473, "learning_rate": 2e-05, "loss": 3.5826, "mean_token_accuracy": 0.373053427785635, "num_tokens": 6384571.0, "step": 33 }, { "epoch": 0.046815834767642, "grad_norm": 1.1703006029129028, "learning_rate": 2.0606060606060608e-05, "loss": 3.4834, "mean_token_accuracy": 0.3872351236641407, "num_tokens": 6568493.0, "step": 34 }, { "epoch": 0.04819277108433735, "grad_norm": 1.157688021659851, "learning_rate": 2.121212121212121e-05, "loss": 3.3709, "mean_token_accuracy": 0.4116259403526783, "num_tokens": 6731269.0, "step": 35 }, { "epoch": 0.049569707401032705, "grad_norm": 1.4632712602615356, "learning_rate": 2.1818181818181818e-05, "loss": 3.229, "mean_token_accuracy": 0.43330618739128113, "num_tokens": 6880583.0, "step": 36 }, { "epoch": 0.05094664371772806, "grad_norm": 1.6523449420928955, "learning_rate": 2.2424242424242424e-05, "loss": 3.1328, "mean_token_accuracy": 0.4496758282184601, "num_tokens": 7016691.0, "step": 37 }, { "epoch": 0.05232358003442341, "grad_norm": 1.029861569404602, "learning_rate": 2.3030303030303034e-05, "loss": 3.2078, "mean_token_accuracy": 0.46514255926012993, "num_tokens": 7269984.0, "step": 38 }, { "epoch": 0.05370051635111876, "grad_norm": 0.9922376275062561, "learning_rate": 2.363636363636364e-05, "loss": 3.4512, "mean_token_accuracy": 0.39229144901037216, "num_tokens": 7510064.0, "step": 39 }, { "epoch": 0.055077452667814115, "grad_norm": 1.0325181484222412, "learning_rate": 2.4242424242424244e-05, "loss": 3.4101, "mean_token_accuracy": 0.39499303326010704, "num_tokens": 7700172.0, "step": 40 }, { "epoch": 0.05645438898450947, "grad_norm": 1.1973949670791626, "learning_rate": 2.484848484848485e-05, "loss": 3.2588, "mean_token_accuracy": 0.4236433543264866, "num_tokens": 7866579.0, "step": 41 }, { "epoch": 0.05783132530120482, "grad_norm": 1.1953175067901611, "learning_rate": 2.5454545454545457e-05, "loss": 3.1275, "mean_token_accuracy": 0.44583262130618095, "num_tokens": 8018251.0, "step": 42 }, { "epoch": 0.059208261617900174, "grad_norm": 1.6865251064300537, "learning_rate": 2.6060606060606063e-05, "loss": 3.0513, "mean_token_accuracy": 0.45363081991672516, "num_tokens": 8157216.0, "step": 43 }, { "epoch": 0.060585197934595526, "grad_norm": 1.2735155820846558, "learning_rate": 2.6666666666666667e-05, "loss": 3.034, "mean_token_accuracy": 0.49388210102915764, "num_tokens": 8366210.0, "step": 44 }, { "epoch": 0.06196213425129088, "grad_norm": 0.9794257283210754, "learning_rate": 2.7272727272727273e-05, "loss": 3.4092, "mean_token_accuracy": 0.3990417644381523, "num_tokens": 8632099.0, "step": 45 }, { "epoch": 0.06333907056798622, "grad_norm": 1.0446025133132935, "learning_rate": 2.7878787878787883e-05, "loss": 3.3861, "mean_token_accuracy": 0.4003394581377506, "num_tokens": 8830409.0, "step": 46 }, { "epoch": 0.06471600688468158, "grad_norm": 1.3185311555862427, "learning_rate": 2.848484848484849e-05, "loss": 3.2841, "mean_token_accuracy": 0.4185495004057884, "num_tokens": 9001157.0, "step": 47 }, { "epoch": 0.06609294320137693, "grad_norm": 1.4278438091278076, "learning_rate": 2.9090909090909093e-05, "loss": 3.1057, "mean_token_accuracy": 0.4466322995722294, "num_tokens": 9155978.0, "step": 48 }, { "epoch": 0.06746987951807229, "grad_norm": 1.3549339771270752, "learning_rate": 2.96969696969697e-05, "loss": 2.988, "mean_token_accuracy": 0.46312542632222176, "num_tokens": 9298583.0, "step": 49 }, { "epoch": 0.06884681583476764, "grad_norm": 2.283503293991089, "learning_rate": 3.0303030303030306e-05, "loss": 2.8224, "mean_token_accuracy": 0.5069313831627369, "num_tokens": 9422920.0, "step": 50 }, { "epoch": 0.070223752151463, "grad_norm": 1.3720706701278687, "learning_rate": 3.090909090909091e-05, "loss": 3.2607, "mean_token_accuracy": 0.4121282249689102, "num_tokens": 9745944.0, "step": 51 }, { "epoch": 0.07160068846815834, "grad_norm": 1.0494626760482788, "learning_rate": 3.151515151515152e-05, "loss": 3.3873, "mean_token_accuracy": 0.40015388280153275, "num_tokens": 9955716.0, "step": 52 }, { "epoch": 0.0729776247848537, "grad_norm": 1.1622111797332764, "learning_rate": 3.212121212121212e-05, "loss": 3.2562, "mean_token_accuracy": 0.4189477413892746, "num_tokens": 10132931.0, "step": 53 }, { "epoch": 0.07435456110154905, "grad_norm": 1.354245662689209, "learning_rate": 3.272727272727273e-05, "loss": 3.0815, "mean_token_accuracy": 0.4434211589396, "num_tokens": 10291659.0, "step": 54 }, { "epoch": 0.0757314974182444, "grad_norm": 1.6016267538070679, "learning_rate": 3.3333333333333335e-05, "loss": 2.95, "mean_token_accuracy": 0.4665900021791458, "num_tokens": 10437771.0, "step": 55 }, { "epoch": 0.07710843373493977, "grad_norm": 1.5412572622299194, "learning_rate": 3.3939393939393945e-05, "loss": 2.7839, "mean_token_accuracy": 0.49913161620497704, "num_tokens": 10569407.0, "step": 56 }, { "epoch": 0.07848537005163511, "grad_norm": 1.5170376300811768, "learning_rate": 3.454545454545455e-05, "loss": 3.122, "mean_token_accuracy": 0.45425254479050636, "num_tokens": 10862140.0, "step": 57 }, { "epoch": 0.07986230636833047, "grad_norm": 1.049173355102539, "learning_rate": 3.515151515151515e-05, "loss": 3.3338, "mean_token_accuracy": 0.40747813880443573, "num_tokens": 11086382.0, "step": 58 }, { "epoch": 0.08123924268502582, "grad_norm": 1.2531183958053589, "learning_rate": 3.575757575757576e-05, "loss": 3.216, "mean_token_accuracy": 0.4268627129495144, "num_tokens": 11270201.0, "step": 59 }, { "epoch": 0.08261617900172118, "grad_norm": 1.412578821182251, "learning_rate": 3.6363636363636364e-05, "loss": 3.0755, "mean_token_accuracy": 0.44798268377780914, "num_tokens": 11433031.0, "step": 60 }, { "epoch": 0.08399311531841652, "grad_norm": 1.3975803852081299, "learning_rate": 3.6969696969696974e-05, "loss": 2.9062, "mean_token_accuracy": 0.47047606855630875, "num_tokens": 11581858.0, "step": 61 }, { "epoch": 0.08537005163511188, "grad_norm": 1.316818118095398, "learning_rate": 3.7575757575757584e-05, "loss": 2.7432, "mean_token_accuracy": 0.4985319674015045, "num_tokens": 11717380.0, "step": 62 }, { "epoch": 0.08674698795180723, "grad_norm": 2.001642942428589, "learning_rate": 3.818181818181819e-05, "loss": 3.1496, "mean_token_accuracy": 0.47866738960146904, "num_tokens": 11974574.0, "step": 63 }, { "epoch": 0.08812392426850259, "grad_norm": 2.274289131164551, "learning_rate": 3.878787878787879e-05, "loss": 3.4154, "mean_token_accuracy": 0.3945396728813648, "num_tokens": 12217405.0, "step": 64 }, { "epoch": 0.08950086058519793, "grad_norm": 1.7582924365997314, "learning_rate": 3.93939393939394e-05, "loss": 3.2969, "mean_token_accuracy": 0.40971318632364273, "num_tokens": 12408749.0, "step": 65 }, { "epoch": 0.09087779690189329, "grad_norm": 1.5182671546936035, "learning_rate": 4e-05, "loss": 3.0945, "mean_token_accuracy": 0.4418698735535145, "num_tokens": 12575969.0, "step": 66 }, { "epoch": 0.09225473321858864, "grad_norm": 1.7919774055480957, "learning_rate": 3.9999977873545065e-05, "loss": 2.9333, "mean_token_accuracy": 0.46524856984615326, "num_tokens": 12728358.0, "step": 67 }, { "epoch": 0.093631669535284, "grad_norm": 1.9320465326309204, "learning_rate": 3.999991149422921e-05, "loss": 2.8163, "mean_token_accuracy": 0.4865446351468563, "num_tokens": 12867983.0, "step": 68 }, { "epoch": 0.09500860585197934, "grad_norm": 1.9418874979019165, "learning_rate": 3.999980086219931e-05, "loss": 2.8936, "mean_token_accuracy": 0.5090156830847263, "num_tokens": 13076698.0, "step": 69 }, { "epoch": 0.0963855421686747, "grad_norm": 1.3485099077224731, "learning_rate": 3.9999645977700165e-05, "loss": 3.2909, "mean_token_accuracy": 0.4138975478708744, "num_tokens": 13345435.0, "step": 70 }, { "epoch": 0.09776247848537005, "grad_norm": 1.2804901599884033, "learning_rate": 3.9999446841074465e-05, "loss": 3.2549, "mean_token_accuracy": 0.4170922040939331, "num_tokens": 13546114.0, "step": 71 }, { "epoch": 0.09913941480206541, "grad_norm": 1.502502202987671, "learning_rate": 3.999920345276283e-05, "loss": 3.1153, "mean_token_accuracy": 0.43394744396209717, "num_tokens": 13718182.0, "step": 72 }, { "epoch": 0.10051635111876076, "grad_norm": 1.5412629842758179, "learning_rate": 3.9998915813303806e-05, "loss": 2.891, "mean_token_accuracy": 0.4719800315797329, "num_tokens": 13873589.0, "step": 73 }, { "epoch": 0.10189328743545611, "grad_norm": 1.618108868598938, "learning_rate": 3.999858392333382e-05, "loss": 2.7543, "mean_token_accuracy": 0.4962281361222267, "num_tokens": 14016592.0, "step": 74 }, { "epoch": 0.10327022375215146, "grad_norm": 1.9174052476882935, "learning_rate": 3.999820778358724e-05, "loss": 2.6526, "mean_token_accuracy": 0.5274976417422295, "num_tokens": 14141206.0, "step": 75 }, { "epoch": 0.10464716006884682, "grad_norm": 1.4943766593933105, "learning_rate": 3.9997787394896324e-05, "loss": 3.1503, "mean_token_accuracy": 0.4229189530014992, "num_tokens": 14469584.0, "step": 76 }, { "epoch": 0.10602409638554217, "grad_norm": 1.1858155727386475, "learning_rate": 3.9997322758191244e-05, "loss": 3.2892, "mean_token_accuracy": 0.4108862318098545, "num_tokens": 14681353.0, "step": 77 }, { "epoch": 0.10740103270223753, "grad_norm": 1.2566277980804443, "learning_rate": 3.999681387450007e-05, "loss": 3.1278, "mean_token_accuracy": 0.4359424263238907, "num_tokens": 14859255.0, "step": 78 }, { "epoch": 0.10877796901893287, "grad_norm": 1.2456536293029785, "learning_rate": 3.999626074494879e-05, "loss": 2.9232, "mean_token_accuracy": 0.4665600135922432, "num_tokens": 15018871.0, "step": 79 }, { "epoch": 0.11015490533562823, "grad_norm": 1.3415398597717285, "learning_rate": 3.999566337076128e-05, "loss": 2.7716, "mean_token_accuracy": 0.49212757498025894, "num_tokens": 15165135.0, "step": 80 }, { "epoch": 0.11153184165232358, "grad_norm": 1.690281629562378, "learning_rate": 3.999502175325932e-05, "loss": 2.5981, "mean_token_accuracy": 0.5234777852892876, "num_tokens": 15296636.0, "step": 81 }, { "epoch": 0.11290877796901894, "grad_norm": 1.783535122871399, "learning_rate": 3.999433589386259e-05, "loss": 3.1024, "mean_token_accuracy": 0.44836249202489853, "num_tokens": 15590515.0, "step": 82 }, { "epoch": 0.11428571428571428, "grad_norm": 1.3803547620773315, "learning_rate": 3.999360579408863e-05, "loss": 3.2842, "mean_token_accuracy": 0.40831541642546654, "num_tokens": 15814153.0, "step": 83 }, { "epoch": 0.11566265060240964, "grad_norm": 1.1726012229919434, "learning_rate": 3.999283145555291e-05, "loss": 3.1306, "mean_token_accuracy": 0.43546949699521065, "num_tokens": 15996905.0, "step": 84 }, { "epoch": 0.11703958691910499, "grad_norm": 1.5922788381576538, "learning_rate": 3.9992012879968765e-05, "loss": 2.9024, "mean_token_accuracy": 0.4712630398571491, "num_tokens": 16159218.0, "step": 85 }, { "epoch": 0.11841652323580035, "grad_norm": 1.3921949863433838, "learning_rate": 3.999115006914741e-05, "loss": 2.7741, "mean_token_accuracy": 0.490768201649189, "num_tokens": 16308116.0, "step": 86 }, { "epoch": 0.11979345955249569, "grad_norm": 1.4682472944259644, "learning_rate": 3.999024302499794e-05, "loss": 2.6256, "mean_token_accuracy": 0.5166518725454807, "num_tokens": 16444038.0, "step": 87 }, { "epoch": 0.12117039586919105, "grad_norm": 1.6512724161148071, "learning_rate": 3.9989291749527314e-05, "loss": 2.8862, "mean_token_accuracy": 0.49898210912942886, "num_tokens": 16700711.0, "step": 88 }, { "epoch": 0.1225473321858864, "grad_norm": 1.082132339477539, "learning_rate": 3.998829624484038e-05, "loss": 3.2317, "mean_token_accuracy": 0.4210398681461811, "num_tokens": 16942496.0, "step": 89 }, { "epoch": 0.12392426850258176, "grad_norm": 1.1800761222839355, "learning_rate": 3.998725651313984e-05, "loss": 3.1222, "mean_token_accuracy": 0.4369179457426071, "num_tokens": 17133075.0, "step": 90 }, { "epoch": 0.12530120481927712, "grad_norm": 1.2172211408615112, "learning_rate": 3.998617255672623e-05, "loss": 2.9118, "mean_token_accuracy": 0.4687006361782551, "num_tokens": 17300054.0, "step": 91 }, { "epoch": 0.12667814113597245, "grad_norm": 1.214022159576416, "learning_rate": 3.998504437799799e-05, "loss": 2.7254, "mean_token_accuracy": 0.499465461820364, "num_tokens": 17452319.0, "step": 92 }, { "epoch": 0.1280550774526678, "grad_norm": 1.2541497945785522, "learning_rate": 3.998387197945135e-05, "loss": 2.5854, "mean_token_accuracy": 0.5150745622813702, "num_tokens": 17591865.0, "step": 93 }, { "epoch": 0.12943201376936317, "grad_norm": 1.8602919578552246, "learning_rate": 3.9982655363680436e-05, "loss": 2.7917, "mean_token_accuracy": 0.5266587473452091, "num_tokens": 17802233.0, "step": 94 }, { "epoch": 0.13080895008605853, "grad_norm": 1.329105257987976, "learning_rate": 3.998139453337718e-05, "loss": 3.1903, "mean_token_accuracy": 0.42970065400004387, "num_tokens": 18071795.0, "step": 95 }, { "epoch": 0.13218588640275386, "grad_norm": 1.2074801921844482, "learning_rate": 3.9980089491331344e-05, "loss": 3.1608, "mean_token_accuracy": 0.4344066306948662, "num_tokens": 18271396.0, "step": 96 }, { "epoch": 0.13356282271944922, "grad_norm": 1.2836717367172241, "learning_rate": 3.997874024043053e-05, "loss": 2.978, "mean_token_accuracy": 0.45694268494844437, "num_tokens": 18443179.0, "step": 97 }, { "epoch": 0.13493975903614458, "grad_norm": 1.2843568325042725, "learning_rate": 3.9977346783660165e-05, "loss": 2.7224, "mean_token_accuracy": 0.4963863864541054, "num_tokens": 18598468.0, "step": 98 }, { "epoch": 0.13631669535283994, "grad_norm": 1.2547967433929443, "learning_rate": 3.997590912410345e-05, "loss": 2.5827, "mean_token_accuracy": 0.5230307504534721, "num_tokens": 18740961.0, "step": 99 }, { "epoch": 0.13769363166953527, "grad_norm": 1.727879524230957, "learning_rate": 3.997442726494143e-05, "loss": 2.4895, "mean_token_accuracy": 0.5500398725271225, "num_tokens": 18865461.0, "step": 100 }, { "epoch": 0.13907056798623063, "grad_norm": 1.48578679561615, "learning_rate": 3.997290120945294e-05, "loss": 3.0296, "mean_token_accuracy": 0.43875638023018837, "num_tokens": 19199134.0, "step": 101 }, { "epoch": 0.140447504302926, "grad_norm": 1.156681776046753, "learning_rate": 3.997133096101458e-05, "loss": 3.1527, "mean_token_accuracy": 0.4271148592233658, "num_tokens": 19411013.0, "step": 102 }, { "epoch": 0.14182444061962135, "grad_norm": 1.2317570447921753, "learning_rate": 3.996971652310077e-05, "loss": 2.9644, "mean_token_accuracy": 0.4576675780117512, "num_tokens": 19589609.0, "step": 103 }, { "epoch": 0.14320137693631668, "grad_norm": 1.2452635765075684, "learning_rate": 3.996805789928368e-05, "loss": 2.7349, "mean_token_accuracy": 0.49770377203822136, "num_tokens": 19749065.0, "step": 104 }, { "epoch": 0.14457831325301204, "grad_norm": 1.3307114839553833, "learning_rate": 3.996635509323327e-05, "loss": 2.5619, "mean_token_accuracy": 0.5193930715322495, "num_tokens": 19895146.0, "step": 105 }, { "epoch": 0.1459552495697074, "grad_norm": 1.5277968645095825, "learning_rate": 3.996460810871723e-05, "loss": 2.4535, "mean_token_accuracy": 0.5457143932580948, "num_tokens": 20026678.0, "step": 106 }, { "epoch": 0.14733218588640276, "grad_norm": 1.521219253540039, "learning_rate": 3.996281694960103e-05, "loss": 2.9294, "mean_token_accuracy": 0.47624121978878975, "num_tokens": 20323137.0, "step": 107 }, { "epoch": 0.1487091222030981, "grad_norm": 1.208099365234375, "learning_rate": 3.9960981619847856e-05, "loss": 3.1125, "mean_token_accuracy": 0.437701728194952, "num_tokens": 20549234.0, "step": 108 }, { "epoch": 0.15008605851979345, "grad_norm": 1.2402607202529907, "learning_rate": 3.995910212351865e-05, "loss": 2.9822, "mean_token_accuracy": 0.458964291960001, "num_tokens": 20733689.0, "step": 109 }, { "epoch": 0.1514629948364888, "grad_norm": 1.2783458232879639, "learning_rate": 3.995717846477207e-05, "loss": 2.7682, "mean_token_accuracy": 0.48851340264081955, "num_tokens": 20896358.0, "step": 110 }, { "epoch": 0.15283993115318417, "grad_norm": 1.2558504343032837, "learning_rate": 3.99552106478645e-05, "loss": 2.553, "mean_token_accuracy": 0.5236981362104416, "num_tokens": 21045196.0, "step": 111 }, { "epoch": 0.15421686746987953, "grad_norm": 1.4791960716247559, "learning_rate": 3.995319867715001e-05, "loss": 2.4863, "mean_token_accuracy": 0.5309383049607277, "num_tokens": 21181043.0, "step": 112 }, { "epoch": 0.15559380378657486, "grad_norm": 1.6049681901931763, "learning_rate": 3.9951142557080375e-05, "loss": 2.8579, "mean_token_accuracy": 0.5141473487019539, "num_tokens": 21437381.0, "step": 113 }, { "epoch": 0.15697074010327022, "grad_norm": 1.1697379350662231, "learning_rate": 3.994904229220507e-05, "loss": 3.1232, "mean_token_accuracy": 0.43365732580423355, "num_tokens": 21681378.0, "step": 114 }, { "epoch": 0.15834767641996558, "grad_norm": 1.2635104656219482, "learning_rate": 3.9946897887171244e-05, "loss": 3.0063, "mean_token_accuracy": 0.4514121040701866, "num_tokens": 21873702.0, "step": 115 }, { "epoch": 0.15972461273666094, "grad_norm": 1.2445106506347656, "learning_rate": 3.994470934672368e-05, "loss": 2.79, "mean_token_accuracy": 0.48482318222522736, "num_tokens": 22041062.0, "step": 116 }, { "epoch": 0.16110154905335627, "grad_norm": 1.2905422449111938, "learning_rate": 3.9942476675704854e-05, "loss": 2.583, "mean_token_accuracy": 0.5192235261201859, "num_tokens": 22193115.0, "step": 117 }, { "epoch": 0.16247848537005163, "grad_norm": 1.3919048309326172, "learning_rate": 3.9940199879054884e-05, "loss": 2.4655, "mean_token_accuracy": 0.5374803096055984, "num_tokens": 22332448.0, "step": 118 }, { "epoch": 0.163855421686747, "grad_norm": 1.3141354322433472, "learning_rate": 3.9937878961811504e-05, "loss": 2.5722, "mean_token_accuracy": 0.5540702044963837, "num_tokens": 22540756.0, "step": 119 }, { "epoch": 0.16523235800344235, "grad_norm": 1.095293402671814, "learning_rate": 3.993551392911009e-05, "loss": 3.1242, "mean_token_accuracy": 0.4376874193549156, "num_tokens": 22810498.0, "step": 120 }, { "epoch": 0.16660929432013769, "grad_norm": 1.0800588130950928, "learning_rate": 3.993310478618361e-05, "loss": 3.0056, "mean_token_accuracy": 0.4532472863793373, "num_tokens": 23010640.0, "step": 121 }, { "epoch": 0.16798623063683304, "grad_norm": 1.1548457145690918, "learning_rate": 3.993065153836265e-05, "loss": 2.8199, "mean_token_accuracy": 0.4794413484632969, "num_tokens": 23182504.0, "step": 122 }, { "epoch": 0.1693631669535284, "grad_norm": 1.251437783241272, "learning_rate": 3.9928154191075375e-05, "loss": 2.6346, "mean_token_accuracy": 0.51049018278718, "num_tokens": 23337833.0, "step": 123 }, { "epoch": 0.17074010327022376, "grad_norm": 1.2779113054275513, "learning_rate": 3.9925612749847527e-05, "loss": 2.4496, "mean_token_accuracy": 0.5421527549624443, "num_tokens": 23480645.0, "step": 124 }, { "epoch": 0.1721170395869191, "grad_norm": 1.5884668827056885, "learning_rate": 3.9923027220302425e-05, "loss": 2.3761, "mean_token_accuracy": 0.5633906200528145, "num_tokens": 23605027.0, "step": 125 }, { "epoch": 0.17349397590361446, "grad_norm": 2.0900213718414307, "learning_rate": 3.9920397608160925e-05, "loss": 2.9842, "mean_token_accuracy": 0.44503533840179443, "num_tokens": 23930619.0, "step": 126 }, { "epoch": 0.17487091222030982, "grad_norm": 1.7500511407852173, "learning_rate": 3.991772391924142e-05, "loss": 3.1, "mean_token_accuracy": 0.4385366588830948, "num_tokens": 24140468.0, "step": 127 }, { "epoch": 0.17624784853700518, "grad_norm": 1.5554214715957642, "learning_rate": 3.991500615945983e-05, "loss": 2.8897, "mean_token_accuracy": 0.4732564836740494, "num_tokens": 24316861.0, "step": 128 }, { "epoch": 0.1776247848537005, "grad_norm": 1.4286441802978516, "learning_rate": 3.991224433482961e-05, "loss": 2.6964, "mean_token_accuracy": 0.5017295368015766, "num_tokens": 24475183.0, "step": 129 }, { "epoch": 0.17900172117039587, "grad_norm": 1.4087226390838623, "learning_rate": 3.9909438451461695e-05, "loss": 2.4573, "mean_token_accuracy": 0.5422694906592369, "num_tokens": 24620710.0, "step": 130 }, { "epoch": 0.18037865748709123, "grad_norm": 1.5818346738815308, "learning_rate": 3.99065885155645e-05, "loss": 2.3483, "mean_token_accuracy": 0.5621213242411613, "num_tokens": 24751689.0, "step": 131 }, { "epoch": 0.18175559380378659, "grad_norm": 1.4020298719406128, "learning_rate": 3.990369453344394e-05, "loss": 2.8081, "mean_token_accuracy": 0.4964730441570282, "num_tokens": 25047666.0, "step": 132 }, { "epoch": 0.18313253012048192, "grad_norm": 1.0241284370422363, "learning_rate": 3.990075651150336e-05, "loss": 3.1029, "mean_token_accuracy": 0.4365879110991955, "num_tokens": 25271196.0, "step": 133 }, { "epoch": 0.18450946643717728, "grad_norm": 1.1354044675827026, "learning_rate": 3.9897774456243555e-05, "loss": 2.8884, "mean_token_accuracy": 0.4657990485429764, "num_tokens": 25454744.0, "step": 134 }, { "epoch": 0.18588640275387264, "grad_norm": 1.2209124565124512, "learning_rate": 3.989474837426277e-05, "loss": 2.6915, "mean_token_accuracy": 0.5010011680424213, "num_tokens": 25617044.0, "step": 135 }, { "epoch": 0.187263339070568, "grad_norm": 1.1696401834487915, "learning_rate": 3.9891678272256646e-05, "loss": 2.4542, "mean_token_accuracy": 0.5355884060263634, "num_tokens": 25766082.0, "step": 136 }, { "epoch": 0.18864027538726333, "grad_norm": 1.2649626731872559, "learning_rate": 3.988856415701823e-05, "loss": 2.3023, "mean_token_accuracy": 0.5671580955386162, "num_tokens": 25901767.0, "step": 137 }, { "epoch": 0.1900172117039587, "grad_norm": 1.4093657732009888, "learning_rate": 3.9885406035437953e-05, "loss": 2.7506, "mean_token_accuracy": 0.5313058495521545, "num_tokens": 26160466.0, "step": 138 }, { "epoch": 0.19139414802065405, "grad_norm": 1.0992358922958374, "learning_rate": 3.988220391450361e-05, "loss": 3.0729, "mean_token_accuracy": 0.44413863494992256, "num_tokens": 26404533.0, "step": 139 }, { "epoch": 0.1927710843373494, "grad_norm": 1.0670437812805176, "learning_rate": 3.987895780130039e-05, "loss": 2.8968, "mean_token_accuracy": 0.4696607105433941, "num_tokens": 26595707.0, "step": 140 }, { "epoch": 0.19414802065404474, "grad_norm": 1.1093326807022095, "learning_rate": 3.987566770301076e-05, "loss": 2.7027, "mean_token_accuracy": 0.504904005676508, "num_tokens": 26762959.0, "step": 141 }, { "epoch": 0.1955249569707401, "grad_norm": 1.1726257801055908, "learning_rate": 3.987233362691455e-05, "loss": 2.4716, "mean_token_accuracy": 0.5397141054272652, "num_tokens": 26914825.0, "step": 142 }, { "epoch": 0.19690189328743546, "grad_norm": 1.175426721572876, "learning_rate": 3.986895558038889e-05, "loss": 2.3363, "mean_token_accuracy": 0.5584604367613792, "num_tokens": 27054256.0, "step": 143 }, { "epoch": 0.19827882960413082, "grad_norm": 1.4846247434616089, "learning_rate": 3.98655335709082e-05, "loss": 2.5307, "mean_token_accuracy": 0.5590794757008553, "num_tokens": 27261349.0, "step": 144 }, { "epoch": 0.19965576592082615, "grad_norm": 1.0601589679718018, "learning_rate": 3.986206760604418e-05, "loss": 2.9802, "mean_token_accuracy": 0.45722198858857155, "num_tokens": 27529458.0, "step": 145 }, { "epoch": 0.2010327022375215, "grad_norm": 1.107832431793213, "learning_rate": 3.9858557693465766e-05, "loss": 2.9438, "mean_token_accuracy": 0.4623042568564415, "num_tokens": 27730428.0, "step": 146 }, { "epoch": 0.20240963855421687, "grad_norm": 1.189303994178772, "learning_rate": 3.985500384093917e-05, "loss": 2.7374, "mean_token_accuracy": 0.49236099421977997, "num_tokens": 27903239.0, "step": 147 }, { "epoch": 0.20378657487091223, "grad_norm": 1.214866042137146, "learning_rate": 3.9851406056327785e-05, "loss": 2.4915, "mean_token_accuracy": 0.5351542979478836, "num_tokens": 28058872.0, "step": 148 }, { "epoch": 0.20516351118760756, "grad_norm": 1.1789809465408325, "learning_rate": 3.984776434759225e-05, "loss": 2.3177, "mean_token_accuracy": 0.5622964054346085, "num_tokens": 28201597.0, "step": 149 }, { "epoch": 0.20654044750430292, "grad_norm": 1.69223153591156, "learning_rate": 3.984407872279037e-05, "loss": 2.279, "mean_token_accuracy": 0.5818121880292892, "num_tokens": 28326240.0, "step": 150 }, { "epoch": 0.20791738382099828, "grad_norm": 1.329267144203186, "learning_rate": 3.9840349190077134e-05, "loss": 2.8356, "mean_token_accuracy": 0.46922897547483444, "num_tokens": 28651068.0, "step": 151 }, { "epoch": 0.20929432013769364, "grad_norm": 1.318422794342041, "learning_rate": 3.983657575770466e-05, "loss": 3.0002, "mean_token_accuracy": 0.4543772302567959, "num_tokens": 28861795.0, "step": 152 }, { "epoch": 0.21067125645438897, "grad_norm": 1.4330617189407349, "learning_rate": 3.983275843402222e-05, "loss": 2.8329, "mean_token_accuracy": 0.4805055074393749, "num_tokens": 29039564.0, "step": 153 }, { "epoch": 0.21204819277108433, "grad_norm": 1.2919504642486572, "learning_rate": 3.982889722747621e-05, "loss": 2.5063, "mean_token_accuracy": 0.5300585851073265, "num_tokens": 29198443.0, "step": 154 }, { "epoch": 0.2134251290877797, "grad_norm": 1.206323504447937, "learning_rate": 3.9824992146610104e-05, "loss": 2.3003, "mean_token_accuracy": 0.565208375453949, "num_tokens": 29344681.0, "step": 155 }, { "epoch": 0.21480206540447505, "grad_norm": 1.6402798891067505, "learning_rate": 3.982104320006446e-05, "loss": 2.2588, "mean_token_accuracy": 0.5762916803359985, "num_tokens": 29476418.0, "step": 156 }, { "epoch": 0.21617900172117038, "grad_norm": 1.5212482213974, "learning_rate": 3.9817050396576894e-05, "loss": 2.7962, "mean_token_accuracy": 0.4977217987179756, "num_tokens": 29771358.0, "step": 157 }, { "epoch": 0.21755593803786574, "grad_norm": 1.1356741189956665, "learning_rate": 3.9813013744982074e-05, "loss": 2.9839, "mean_token_accuracy": 0.45220955088734627, "num_tokens": 29997063.0, "step": 158 }, { "epoch": 0.2189328743545611, "grad_norm": 1.156484842300415, "learning_rate": 3.9808933254211665e-05, "loss": 2.847, "mean_token_accuracy": 0.46941187232732773, "num_tokens": 30181480.0, "step": 159 }, { "epoch": 0.22030981067125646, "grad_norm": 1.1849952936172485, "learning_rate": 3.9804808933294367e-05, "loss": 2.6016, "mean_token_accuracy": 0.5147284120321274, "num_tokens": 30344735.0, "step": 160 }, { "epoch": 0.2216867469879518, "grad_norm": 1.190155029296875, "learning_rate": 3.980064079135583e-05, "loss": 2.3564, "mean_token_accuracy": 0.5556826516985893, "num_tokens": 30493878.0, "step": 161 }, { "epoch": 0.22306368330464715, "grad_norm": 1.3447614908218384, "learning_rate": 3.979642883761866e-05, "loss": 2.2609, "mean_token_accuracy": 0.5694480910897255, "num_tokens": 30629352.0, "step": 162 }, { "epoch": 0.2244406196213425, "grad_norm": 1.4340946674346924, "learning_rate": 3.9792173081402436e-05, "loss": 2.5943, "mean_token_accuracy": 0.5412869080901146, "num_tokens": 30888819.0, "step": 163 }, { "epoch": 0.22581755593803787, "grad_norm": 1.1763209104537964, "learning_rate": 3.9787873532123626e-05, "loss": 2.9802, "mean_token_accuracy": 0.45600955933332443, "num_tokens": 31132031.0, "step": 164 }, { "epoch": 0.22719449225473323, "grad_norm": 1.092564344406128, "learning_rate": 3.978353019929562e-05, "loss": 2.8546, "mean_token_accuracy": 0.4769134223461151, "num_tokens": 31324425.0, "step": 165 }, { "epoch": 0.22857142857142856, "grad_norm": 1.0953699350357056, "learning_rate": 3.977914309252867e-05, "loss": 2.6139, "mean_token_accuracy": 0.5165294483304024, "num_tokens": 31492277.0, "step": 166 }, { "epoch": 0.22994836488812392, "grad_norm": 1.1644035577774048, "learning_rate": 3.977471222152988e-05, "loss": 2.3927, "mean_token_accuracy": 0.5534327477216721, "num_tokens": 31645118.0, "step": 167 }, { "epoch": 0.23132530120481928, "grad_norm": 1.2917240858078003, "learning_rate": 3.977023759610321e-05, "loss": 2.2197, "mean_token_accuracy": 0.5778639391064644, "num_tokens": 31784905.0, "step": 168 }, { "epoch": 0.23270223752151464, "grad_norm": 1.577622413635254, "learning_rate": 3.976571922614941e-05, "loss": 2.5409, "mean_token_accuracy": 0.5701095275580883, "num_tokens": 31992085.0, "step": 169 }, { "epoch": 0.23407917383820998, "grad_norm": 1.2725235223770142, "learning_rate": 3.9761157121666034e-05, "loss": 2.9768, "mean_token_accuracy": 0.45868420600891113, "num_tokens": 32259080.0, "step": 170 }, { "epoch": 0.23545611015490533, "grad_norm": 1.082217812538147, "learning_rate": 3.9756551292747405e-05, "loss": 2.8792, "mean_token_accuracy": 0.4727262631058693, "num_tokens": 32458969.0, "step": 171 }, { "epoch": 0.2368330464716007, "grad_norm": 1.0830609798431396, "learning_rate": 3.9751901749584576e-05, "loss": 2.655, "mean_token_accuracy": 0.5083842761814594, "num_tokens": 32630510.0, "step": 172 }, { "epoch": 0.23820998278829605, "grad_norm": 1.2041423320770264, "learning_rate": 3.974720850246535e-05, "loss": 2.3802, "mean_token_accuracy": 0.5507602244615555, "num_tokens": 32785687.0, "step": 173 }, { "epoch": 0.23958691910499139, "grad_norm": 1.235414981842041, "learning_rate": 3.974247156177423e-05, "loss": 2.2306, "mean_token_accuracy": 0.5778275653719902, "num_tokens": 32928442.0, "step": 174 }, { "epoch": 0.24096385542168675, "grad_norm": 1.6115895509719849, "learning_rate": 3.973769093799236e-05, "loss": 2.2087, "mean_token_accuracy": 0.5911441370844841, "num_tokens": 33052944.0, "step": 175 }, { "epoch": 0.2423407917383821, "grad_norm": 1.5748047828674316, "learning_rate": 3.9732866641697586e-05, "loss": 2.8022, "mean_token_accuracy": 0.4748939946293831, "num_tokens": 33385370.0, "step": 176 }, { "epoch": 0.24371772805507746, "grad_norm": 1.179655909538269, "learning_rate": 3.9727998683564355e-05, "loss": 2.9066, "mean_token_accuracy": 0.4698343314230442, "num_tokens": 33595529.0, "step": 177 }, { "epoch": 0.2450946643717728, "grad_norm": 1.2804673910140991, "learning_rate": 3.972308707436374e-05, "loss": 2.6789, "mean_token_accuracy": 0.5036469213664532, "num_tokens": 33772961.0, "step": 178 }, { "epoch": 0.24647160068846816, "grad_norm": 1.4240158796310425, "learning_rate": 3.971813182496338e-05, "loss": 2.4648, "mean_token_accuracy": 0.5381069630384445, "num_tokens": 33931884.0, "step": 179 }, { "epoch": 0.24784853700516352, "grad_norm": 1.358250617980957, "learning_rate": 3.9713132946327494e-05, "loss": 2.2707, "mean_token_accuracy": 0.5676298290491104, "num_tokens": 34078245.0, "step": 180 }, { "epoch": 0.24922547332185888, "grad_norm": 1.4165507555007935, "learning_rate": 3.970809044951683e-05, "loss": 2.1418, "mean_token_accuracy": 0.592598021030426, "num_tokens": 34209838.0, "step": 181 }, { "epoch": 0.25060240963855424, "grad_norm": 1.4032273292541504, "learning_rate": 3.970300434568864e-05, "loss": 2.6762, "mean_token_accuracy": 0.514252245426178, "num_tokens": 34503118.0, "step": 182 }, { "epoch": 0.25197934595524957, "grad_norm": 1.1530755758285522, "learning_rate": 3.9697874646096675e-05, "loss": 2.9344, "mean_token_accuracy": 0.4632447026669979, "num_tokens": 34726733.0, "step": 183 }, { "epoch": 0.2533562822719449, "grad_norm": 1.3412889242172241, "learning_rate": 3.969270136209114e-05, "loss": 2.765, "mean_token_accuracy": 0.4843039773404598, "num_tokens": 34910547.0, "step": 184 }, { "epoch": 0.2547332185886403, "grad_norm": 1.3048303127288818, "learning_rate": 3.968748450511867e-05, "loss": 2.5325, "mean_token_accuracy": 0.5263201296329498, "num_tokens": 35073057.0, "step": 185 }, { "epoch": 0.2561101549053356, "grad_norm": 1.2190852165222168, "learning_rate": 3.968222408672232e-05, "loss": 2.2777, "mean_token_accuracy": 0.5692800432443619, "num_tokens": 35221879.0, "step": 186 }, { "epoch": 0.257487091222031, "grad_norm": 1.2192051410675049, "learning_rate": 3.967692011854155e-05, "loss": 2.1469, "mean_token_accuracy": 0.5913272276520729, "num_tokens": 35357496.0, "step": 187 }, { "epoch": 0.25886402753872634, "grad_norm": 1.887019395828247, "learning_rate": 3.967157261231215e-05, "loss": 2.7086, "mean_token_accuracy": 0.5381492264568806, "num_tokens": 35615741.0, "step": 188 }, { "epoch": 0.26024096385542167, "grad_norm": 1.5546183586120605, "learning_rate": 3.9666181579866244e-05, "loss": 2.9684, "mean_token_accuracy": 0.4551645554602146, "num_tokens": 35856340.0, "step": 189 }, { "epoch": 0.26161790017211706, "grad_norm": 1.441490650177002, "learning_rate": 3.966074703313229e-05, "loss": 2.8228, "mean_token_accuracy": 0.4812959171831608, "num_tokens": 36046901.0, "step": 190 }, { "epoch": 0.2629948364888124, "grad_norm": 1.3563860654830933, "learning_rate": 3.965526898413502e-05, "loss": 2.549, "mean_token_accuracy": 0.5227860286831856, "num_tokens": 36214032.0, "step": 191 }, { "epoch": 0.2643717728055077, "grad_norm": 1.1912411451339722, "learning_rate": 3.964974744499539e-05, "loss": 2.2718, "mean_token_accuracy": 0.5703057497739792, "num_tokens": 36366096.0, "step": 192 }, { "epoch": 0.2657487091222031, "grad_norm": 1.3855469226837158, "learning_rate": 3.9644182427930626e-05, "loss": 2.2028, "mean_token_accuracy": 0.5877008885145187, "num_tokens": 36505554.0, "step": 193 }, { "epoch": 0.26712564543889844, "grad_norm": 1.250901699066162, "learning_rate": 3.963857394525413e-05, "loss": 2.3548, "mean_token_accuracy": 0.5893898010253906, "num_tokens": 36713122.0, "step": 194 }, { "epoch": 0.2685025817555938, "grad_norm": 1.05732262134552, "learning_rate": 3.963292200937551e-05, "loss": 2.889, "mean_token_accuracy": 0.4682162404060364, "num_tokens": 36980692.0, "step": 195 }, { "epoch": 0.26987951807228916, "grad_norm": 0.9622476100921631, "learning_rate": 3.9627226632800456e-05, "loss": 2.8178, "mean_token_accuracy": 0.4801763519644737, "num_tokens": 37181297.0, "step": 196 }, { "epoch": 0.2712564543889845, "grad_norm": 1.109504222869873, "learning_rate": 3.962148782813085e-05, "loss": 2.5936, "mean_token_accuracy": 0.5161283686757088, "num_tokens": 37353823.0, "step": 197 }, { "epoch": 0.2726333907056799, "grad_norm": 1.1259183883666992, "learning_rate": 3.961570560806461e-05, "loss": 2.3341, "mean_token_accuracy": 0.5625190287828445, "num_tokens": 37509548.0, "step": 198 }, { "epoch": 0.2740103270223752, "grad_norm": 1.1394364833831787, "learning_rate": 3.960987998539576e-05, "loss": 2.154, "mean_token_accuracy": 0.5888020843267441, "num_tokens": 37652460.0, "step": 199 }, { "epoch": 0.27538726333907054, "grad_norm": 1.3892855644226074, "learning_rate": 3.960401097301432e-05, "loss": 2.1269, "mean_token_accuracy": 0.6053685620427132, "num_tokens": 37777472.0, "step": 200 }, { "epoch": 0.27676419965576593, "grad_norm": 1.2384331226348877, "learning_rate": 3.959809858390634e-05, "loss": 2.7234, "mean_token_accuracy": 0.48917973041534424, "num_tokens": 38105262.0, "step": 201 }, { "epoch": 0.27814113597246126, "grad_norm": 1.0585963726043701, "learning_rate": 3.959214283115385e-05, "loss": 2.854, "mean_token_accuracy": 0.4792133942246437, "num_tokens": 38315051.0, "step": 202 }, { "epoch": 0.27951807228915665, "grad_norm": 1.0563774108886719, "learning_rate": 3.958614372793481e-05, "loss": 2.626, "mean_token_accuracy": 0.5140238702297211, "num_tokens": 38492296.0, "step": 203 }, { "epoch": 0.280895008605852, "grad_norm": 1.1138699054718018, "learning_rate": 3.9580101287523105e-05, "loss": 2.3688, "mean_token_accuracy": 0.553908035159111, "num_tokens": 38651171.0, "step": 204 }, { "epoch": 0.2822719449225473, "grad_norm": 1.1257667541503906, "learning_rate": 3.9574015523288524e-05, "loss": 2.179, "mean_token_accuracy": 0.5857122093439102, "num_tokens": 38797101.0, "step": 205 }, { "epoch": 0.2836488812392427, "grad_norm": 1.3983314037322998, "learning_rate": 3.95678864486967e-05, "loss": 2.0781, "mean_token_accuracy": 0.6106612384319305, "num_tokens": 38928254.0, "step": 206 }, { "epoch": 0.28502581755593803, "grad_norm": 1.3800560235977173, "learning_rate": 3.95617140773091e-05, "loss": 2.6497, "mean_token_accuracy": 0.5195152722299099, "num_tokens": 39219807.0, "step": 207 }, { "epoch": 0.28640275387263336, "grad_norm": 1.067930817604065, "learning_rate": 3.9555498422783e-05, "loss": 2.8217, "mean_token_accuracy": 0.48062824830412865, "num_tokens": 39442817.0, "step": 208 }, { "epoch": 0.28777969018932875, "grad_norm": 1.0888906717300415, "learning_rate": 3.954923949887144e-05, "loss": 2.656, "mean_token_accuracy": 0.5078318975865841, "num_tokens": 39626080.0, "step": 209 }, { "epoch": 0.2891566265060241, "grad_norm": 1.1856508255004883, "learning_rate": 3.954293731942319e-05, "loss": 2.4099, "mean_token_accuracy": 0.5501784980297089, "num_tokens": 39788495.0, "step": 210 }, { "epoch": 0.29053356282271947, "grad_norm": 1.1790673732757568, "learning_rate": 3.953659189838275e-05, "loss": 2.1923, "mean_token_accuracy": 0.5851127281785011, "num_tokens": 39937354.0, "step": 211 }, { "epoch": 0.2919104991394148, "grad_norm": 1.2646734714508057, "learning_rate": 3.9530203249790285e-05, "loss": 2.0688, "mean_token_accuracy": 0.6017857939004898, "num_tokens": 40073061.0, "step": 212 }, { "epoch": 0.29328743545611013, "grad_norm": 1.4980369806289673, "learning_rate": 3.95237713877816e-05, "loss": 2.5226, "mean_token_accuracy": 0.5656174942851067, "num_tokens": 40330587.0, "step": 213 }, { "epoch": 0.2946643717728055, "grad_norm": 1.06264328956604, "learning_rate": 3.951729632658814e-05, "loss": 2.8141, "mean_token_accuracy": 0.4829620122909546, "num_tokens": 40575475.0, "step": 214 }, { "epoch": 0.29604130808950085, "grad_norm": 1.1562366485595703, "learning_rate": 3.951077808053691e-05, "loss": 2.6625, "mean_token_accuracy": 0.5038036294281483, "num_tokens": 40768137.0, "step": 215 }, { "epoch": 0.2974182444061962, "grad_norm": 1.2008684873580933, "learning_rate": 3.950421666405048e-05, "loss": 2.501, "mean_token_accuracy": 0.5332116261124611, "num_tokens": 40936223.0, "step": 216 }, { "epoch": 0.2987951807228916, "grad_norm": 1.1684116125106812, "learning_rate": 3.949761209164694e-05, "loss": 2.2407, "mean_token_accuracy": 0.5752494633197784, "num_tokens": 41088804.0, "step": 217 }, { "epoch": 0.3001721170395869, "grad_norm": 1.1647323369979858, "learning_rate": 3.949096437793986e-05, "loss": 2.0704, "mean_token_accuracy": 0.6005954220890999, "num_tokens": 41228557.0, "step": 218 }, { "epoch": 0.3015490533562823, "grad_norm": 1.5959327220916748, "learning_rate": 3.948427353763829e-05, "loss": 2.3599, "mean_token_accuracy": 0.5867909118533134, "num_tokens": 41436583.0, "step": 219 }, { "epoch": 0.3029259896729776, "grad_norm": 1.3064759969711304, "learning_rate": 3.9477539585546676e-05, "loss": 2.8243, "mean_token_accuracy": 0.48164089396595955, "num_tokens": 41706506.0, "step": 220 }, { "epoch": 0.30430292598967296, "grad_norm": 1.2213923931121826, "learning_rate": 3.947076253656487e-05, "loss": 2.776, "mean_token_accuracy": 0.4907907396554947, "num_tokens": 41907023.0, "step": 221 }, { "epoch": 0.30567986230636834, "grad_norm": 1.2093830108642578, "learning_rate": 3.946394240568807e-05, "loss": 2.5021, "mean_token_accuracy": 0.5326531082391739, "num_tokens": 42079499.0, "step": 222 }, { "epoch": 0.3070567986230637, "grad_norm": 1.1815457344055176, "learning_rate": 3.9457079208006824e-05, "loss": 2.2916, "mean_token_accuracy": 0.5713094845414162, "num_tokens": 42235106.0, "step": 223 }, { "epoch": 0.30843373493975906, "grad_norm": 1.2978312969207764, "learning_rate": 3.9450172958706944e-05, "loss": 2.0645, "mean_token_accuracy": 0.6024539470672607, "num_tokens": 42378103.0, "step": 224 }, { "epoch": 0.3098106712564544, "grad_norm": 1.5836273431777954, "learning_rate": 3.944322367306951e-05, "loss": 2.0991, "mean_token_accuracy": 0.6009828597307205, "num_tokens": 42502622.0, "step": 225 }, { "epoch": 0.3111876075731497, "grad_norm": 1.36110258102417, "learning_rate": 3.9436231366470836e-05, "loss": 2.6366, "mean_token_accuracy": 0.4946838654577732, "num_tokens": 42829170.0, "step": 226 }, { "epoch": 0.3125645438898451, "grad_norm": 1.0856045484542847, "learning_rate": 3.9429196054382416e-05, "loss": 2.7977, "mean_token_accuracy": 0.48547808825969696, "num_tokens": 43038947.0, "step": 227 }, { "epoch": 0.31394148020654045, "grad_norm": 1.1839022636413574, "learning_rate": 3.942211775237089e-05, "loss": 2.5565, "mean_token_accuracy": 0.5204941220581532, "num_tokens": 43216167.0, "step": 228 }, { "epoch": 0.3153184165232358, "grad_norm": 1.3504794836044312, "learning_rate": 3.941499647609805e-05, "loss": 2.3028, "mean_token_accuracy": 0.5642237439751625, "num_tokens": 43374849.0, "step": 229 }, { "epoch": 0.31669535283993117, "grad_norm": 1.2924957275390625, "learning_rate": 3.9407832241320744e-05, "loss": 2.1479, "mean_token_accuracy": 0.5949679538607597, "num_tokens": 43520880.0, "step": 230 }, { "epoch": 0.3180722891566265, "grad_norm": 1.4180775880813599, "learning_rate": 3.940062506389089e-05, "loss": 1.992, "mean_token_accuracy": 0.6209570020437241, "num_tokens": 43652259.0, "step": 231 }, { "epoch": 0.3194492254733219, "grad_norm": 1.396475076675415, "learning_rate": 3.9393374959755404e-05, "loss": 2.6079, "mean_token_accuracy": 0.5255482085049152, "num_tokens": 43948482.0, "step": 232 }, { "epoch": 0.3208261617900172, "grad_norm": 1.1166287660598755, "learning_rate": 3.9386081944956204e-05, "loss": 2.7953, "mean_token_accuracy": 0.4842718131840229, "num_tokens": 44173230.0, "step": 233 }, { "epoch": 0.32220309810671255, "grad_norm": 1.1817964315414429, "learning_rate": 3.937874603563015e-05, "loss": 2.6179, "mean_token_accuracy": 0.5114640854299068, "num_tokens": 44356988.0, "step": 234 }, { "epoch": 0.32358003442340794, "grad_norm": 1.2425671815872192, "learning_rate": 3.937136724800901e-05, "loss": 2.3796, "mean_token_accuracy": 0.5513224229216576, "num_tokens": 44519613.0, "step": 235 }, { "epoch": 0.32495697074010327, "grad_norm": 1.1989129781723022, "learning_rate": 3.936394559841941e-05, "loss": 2.1353, "mean_token_accuracy": 0.590060368180275, "num_tokens": 44668554.0, "step": 236 }, { "epoch": 0.3263339070567986, "grad_norm": 1.1932027339935303, "learning_rate": 3.935648110328285e-05, "loss": 2.0283, "mean_token_accuracy": 0.6138491854071617, "num_tokens": 44804380.0, "step": 237 }, { "epoch": 0.327710843373494, "grad_norm": 1.6239839792251587, "learning_rate": 3.93489737791156e-05, "loss": 2.5325, "mean_token_accuracy": 0.5599273107945919, "num_tokens": 45066684.0, "step": 238 }, { "epoch": 0.3290877796901893, "grad_norm": 1.393256664276123, "learning_rate": 3.9341423642528706e-05, "loss": 2.8183, "mean_token_accuracy": 0.48138194158673286, "num_tokens": 45309123.0, "step": 239 }, { "epoch": 0.3304647160068847, "grad_norm": 1.3356724977493286, "learning_rate": 3.933383071022795e-05, "loss": 2.6757, "mean_token_accuracy": 0.5031194016337395, "num_tokens": 45500422.0, "step": 240 }, { "epoch": 0.33184165232358004, "grad_norm": 1.1816999912261963, "learning_rate": 3.93261949990138e-05, "loss": 2.4264, "mean_token_accuracy": 0.5443164110183716, "num_tokens": 45667598.0, "step": 241 }, { "epoch": 0.33321858864027537, "grad_norm": 1.1898326873779297, "learning_rate": 3.931851652578137e-05, "loss": 2.1658, "mean_token_accuracy": 0.5876059457659721, "num_tokens": 45819554.0, "step": 242 }, { "epoch": 0.33459552495697076, "grad_norm": 1.3557837009429932, "learning_rate": 3.93107953075204e-05, "loss": 2.048, "mean_token_accuracy": 0.6078234761953354, "num_tokens": 45959188.0, "step": 243 }, { "epoch": 0.3359724612736661, "grad_norm": 1.3583718538284302, "learning_rate": 3.930303136131522e-05, "loss": 2.2693, "mean_token_accuracy": 0.605956681072712, "num_tokens": 46169168.0, "step": 244 }, { "epoch": 0.3373493975903614, "grad_norm": 1.0663001537322998, "learning_rate": 3.929522470434467e-05, "loss": 2.784, "mean_token_accuracy": 0.48334869369864464, "num_tokens": 46437512.0, "step": 245 }, { "epoch": 0.3387263339070568, "grad_norm": 0.9955912828445435, "learning_rate": 3.928737535388214e-05, "loss": 2.6911, "mean_token_accuracy": 0.5011942237615585, "num_tokens": 46637160.0, "step": 246 }, { "epoch": 0.34010327022375214, "grad_norm": 1.0531190633773804, "learning_rate": 3.9279483327295444e-05, "loss": 2.4954, "mean_token_accuracy": 0.5344988629221916, "num_tokens": 46808970.0, "step": 247 }, { "epoch": 0.34148020654044753, "grad_norm": 1.0564167499542236, "learning_rate": 3.927154864204684e-05, "loss": 2.1863, "mean_token_accuracy": 0.5835342183709145, "num_tokens": 46964475.0, "step": 248 }, { "epoch": 0.34285714285714286, "grad_norm": 1.0955716371536255, "learning_rate": 3.9263571315692976e-05, "loss": 2.0229, "mean_token_accuracy": 0.6132109984755516, "num_tokens": 47107527.0, "step": 249 }, { "epoch": 0.3442340791738382, "grad_norm": 1.3675479888916016, "learning_rate": 3.925555136588484e-05, "loss": 1.9776, "mean_token_accuracy": 0.6219081059098244, "num_tokens": 47232016.0, "step": 250 }, { "epoch": 0.3456110154905336, "grad_norm": 1.2782949209213257, "learning_rate": 3.924748881036776e-05, "loss": 2.6287, "mean_token_accuracy": 0.501963946968317, "num_tokens": 47564086.0, "step": 251 }, { "epoch": 0.3469879518072289, "grad_norm": 1.0462466478347778, "learning_rate": 3.923938366698129e-05, "loss": 2.7321, "mean_token_accuracy": 0.4957784228026867, "num_tokens": 47776774.0, "step": 252 }, { "epoch": 0.34836488812392424, "grad_norm": 1.0608729124069214, "learning_rate": 3.9231235953659244e-05, "loss": 2.5201, "mean_token_accuracy": 0.5316049829125404, "num_tokens": 47955165.0, "step": 253 }, { "epoch": 0.34974182444061963, "grad_norm": 1.1228233575820923, "learning_rate": 3.922304568842963e-05, "loss": 2.2216, "mean_token_accuracy": 0.5769041180610657, "num_tokens": 48114167.0, "step": 254 }, { "epoch": 0.35111876075731496, "grad_norm": 1.1466913223266602, "learning_rate": 3.921481288941459e-05, "loss": 2.0443, "mean_token_accuracy": 0.6103585213422775, "num_tokens": 48260167.0, "step": 255 }, { "epoch": 0.35249569707401035, "grad_norm": 1.3203984498977661, "learning_rate": 3.9206537574830405e-05, "loss": 1.9653, "mean_token_accuracy": 0.6288933679461479, "num_tokens": 48391255.0, "step": 256 }, { "epoch": 0.3538726333907057, "grad_norm": 1.0986576080322266, "learning_rate": 3.91982197629874e-05, "loss": 2.5, "mean_token_accuracy": 0.5402591675519943, "num_tokens": 48691462.0, "step": 257 }, { "epoch": 0.355249569707401, "grad_norm": 0.9941425919532776, "learning_rate": 3.9189859472289956e-05, "loss": 2.7461, "mean_token_accuracy": 0.4946933686733246, "num_tokens": 48916750.0, "step": 258 }, { "epoch": 0.3566265060240964, "grad_norm": 1.0305546522140503, "learning_rate": 3.9181456721236415e-05, "loss": 2.5433, "mean_token_accuracy": 0.5262442156672478, "num_tokens": 49100873.0, "step": 259 }, { "epoch": 0.35800344234079173, "grad_norm": 1.0793207883834839, "learning_rate": 3.91730115284191e-05, "loss": 2.3132, "mean_token_accuracy": 0.5650309026241302, "num_tokens": 49263820.0, "step": 260 }, { "epoch": 0.35938037865748707, "grad_norm": 1.0759823322296143, "learning_rate": 3.9164523912524224e-05, "loss": 2.0832, "mean_token_accuracy": 0.6024623662233353, "num_tokens": 49413147.0, "step": 261 }, { "epoch": 0.36075731497418245, "grad_norm": 1.1710199117660522, "learning_rate": 3.915599389233187e-05, "loss": 1.9606, "mean_token_accuracy": 0.6240565925836563, "num_tokens": 49548988.0, "step": 262 }, { "epoch": 0.3621342512908778, "grad_norm": 1.42637038230896, "learning_rate": 3.914742148671597e-05, "loss": 2.4935, "mean_token_accuracy": 0.5686254687607288, "num_tokens": 49799570.0, "step": 263 }, { "epoch": 0.36351118760757317, "grad_norm": 1.0119879245758057, "learning_rate": 3.913880671464418e-05, "loss": 2.76, "mean_token_accuracy": 0.4931732267141342, "num_tokens": 50038970.0, "step": 264 }, { "epoch": 0.3648881239242685, "grad_norm": 1.1310069561004639, "learning_rate": 3.913014959517797e-05, "loss": 2.5984, "mean_token_accuracy": 0.5205608233809471, "num_tokens": 50228479.0, "step": 265 }, { "epoch": 0.36626506024096384, "grad_norm": 1.1469693183898926, "learning_rate": 3.912145014747245e-05, "loss": 2.3255, "mean_token_accuracy": 0.5634697526693344, "num_tokens": 50394637.0, "step": 266 }, { "epoch": 0.3676419965576592, "grad_norm": 1.1192415952682495, "learning_rate": 3.911270839077644e-05, "loss": 2.1086, "mean_token_accuracy": 0.5974500626325607, "num_tokens": 50546088.0, "step": 267 }, { "epoch": 0.36901893287435455, "grad_norm": 1.1765974760055542, "learning_rate": 3.910392434443233e-05, "loss": 1.9873, "mean_token_accuracy": 0.6153800711035728, "num_tokens": 50685051.0, "step": 268 }, { "epoch": 0.3703958691910499, "grad_norm": 1.4013806581497192, "learning_rate": 3.909509802787611e-05, "loss": 2.2168, "mean_token_accuracy": 0.6079353019595146, "num_tokens": 50892922.0, "step": 269 }, { "epoch": 0.3717728055077453, "grad_norm": 1.1515238285064697, "learning_rate": 3.908622946063728e-05, "loss": 2.7311, "mean_token_accuracy": 0.4936913959681988, "num_tokens": 51163781.0, "step": 270 }, { "epoch": 0.3731497418244406, "grad_norm": 1.2183271646499634, "learning_rate": 3.9077318662338845e-05, "loss": 2.6589, "mean_token_accuracy": 0.5077950581908226, "num_tokens": 51365191.0, "step": 271 }, { "epoch": 0.374526678141136, "grad_norm": 1.2194842100143433, "learning_rate": 3.906836565269724e-05, "loss": 2.3994, "mean_token_accuracy": 0.5463950261473656, "num_tokens": 51538288.0, "step": 272 }, { "epoch": 0.3759036144578313, "grad_norm": 1.1399606466293335, "learning_rate": 3.9059370451522295e-05, "loss": 2.1423, "mean_token_accuracy": 0.5891644209623337, "num_tokens": 51694466.0, "step": 273 }, { "epoch": 0.37728055077452666, "grad_norm": 1.1407681703567505, "learning_rate": 3.9050333078717216e-05, "loss": 1.9577, "mean_token_accuracy": 0.6174056231975555, "num_tokens": 51837720.0, "step": 274 }, { "epoch": 0.37865748709122204, "grad_norm": 1.5275992155075073, "learning_rate": 3.9041253554278486e-05, "loss": 1.9594, "mean_token_accuracy": 0.6307981014251709, "num_tokens": 51962946.0, "step": 275 }, { "epoch": 0.3800344234079174, "grad_norm": 1.5140687227249146, "learning_rate": 3.903213189829589e-05, "loss": 2.574, "mean_token_accuracy": 0.5071983374655247, "num_tokens": 52289375.0, "step": 276 }, { "epoch": 0.38141135972461276, "grad_norm": 1.2608453035354614, "learning_rate": 3.902296813095241e-05, "loss": 2.7468, "mean_token_accuracy": 0.49460677430033684, "num_tokens": 52500456.0, "step": 277 }, { "epoch": 0.3827882960413081, "grad_norm": 1.1157128810882568, "learning_rate": 3.901376227252422e-05, "loss": 2.4723, "mean_token_accuracy": 0.5356764271855354, "num_tokens": 52677785.0, "step": 278 }, { "epoch": 0.38416523235800343, "grad_norm": 1.1913294792175293, "learning_rate": 3.900451434338062e-05, "loss": 2.1715, "mean_token_accuracy": 0.5836664363741875, "num_tokens": 52836834.0, "step": 279 }, { "epoch": 0.3855421686746988, "grad_norm": 1.374477744102478, "learning_rate": 3.8995224363984e-05, "loss": 2.017, "mean_token_accuracy": 0.6114011034369469, "num_tokens": 52982553.0, "step": 280 }, { "epoch": 0.38691910499139415, "grad_norm": 1.428402066230774, "learning_rate": 3.8985892354889776e-05, "loss": 1.9209, "mean_token_accuracy": 0.630968488752842, "num_tokens": 53112804.0, "step": 281 }, { "epoch": 0.3882960413080895, "grad_norm": 1.4463852643966675, "learning_rate": 3.8976518336746396e-05, "loss": 2.4741, "mean_token_accuracy": 0.5422002524137497, "num_tokens": 53408748.0, "step": 282 }, { "epoch": 0.38967297762478487, "grad_norm": 1.1173595190048218, "learning_rate": 3.8967102330295226e-05, "loss": 2.7065, "mean_token_accuracy": 0.4989328794181347, "num_tokens": 53632874.0, "step": 283 }, { "epoch": 0.3910499139414802, "grad_norm": 1.094970703125, "learning_rate": 3.895764435637056e-05, "loss": 2.5337, "mean_token_accuracy": 0.5279142633080482, "num_tokens": 53817094.0, "step": 284 }, { "epoch": 0.3924268502581756, "grad_norm": 1.1576502323150635, "learning_rate": 3.894814443589954e-05, "loss": 2.2834, "mean_token_accuracy": 0.566967599093914, "num_tokens": 53980404.0, "step": 285 }, { "epoch": 0.3938037865748709, "grad_norm": 1.252246618270874, "learning_rate": 3.893860258990212e-05, "loss": 2.0467, "mean_token_accuracy": 0.6037642732262611, "num_tokens": 54129477.0, "step": 286 }, { "epoch": 0.39518072289156625, "grad_norm": 1.3317843675613403, "learning_rate": 3.892901883949101e-05, "loss": 1.9232, "mean_token_accuracy": 0.6271429136395454, "num_tokens": 54265287.0, "step": 287 }, { "epoch": 0.39655765920826164, "grad_norm": 1.9433021545410156, "learning_rate": 3.8919393205871676e-05, "loss": 2.4729, "mean_token_accuracy": 0.5678318254649639, "num_tokens": 54517782.0, "step": 288 }, { "epoch": 0.39793459552495697, "grad_norm": 1.6669316291809082, "learning_rate": 3.890972571034222e-05, "loss": 2.7989, "mean_token_accuracy": 0.48223431408405304, "num_tokens": 54760048.0, "step": 289 }, { "epoch": 0.3993115318416523, "grad_norm": 1.3475779294967651, "learning_rate": 3.890001637429337e-05, "loss": 2.6044, "mean_token_accuracy": 0.511832058429718, "num_tokens": 54952026.0, "step": 290 }, { "epoch": 0.4006884681583477, "grad_norm": 1.2411102056503296, "learning_rate": 3.889026521920847e-05, "loss": 2.3515, "mean_token_accuracy": 0.5567047521471977, "num_tokens": 55119421.0, "step": 291 }, { "epoch": 0.402065404475043, "grad_norm": 1.285650610923767, "learning_rate": 3.888047226666335e-05, "loss": 2.046, "mean_token_accuracy": 0.6114402785897255, "num_tokens": 55271821.0, "step": 292 }, { "epoch": 0.4034423407917384, "grad_norm": 1.4079782962799072, "learning_rate": 3.887063753832635e-05, "loss": 1.9539, "mean_token_accuracy": 0.6207842603325844, "num_tokens": 55411482.0, "step": 293 }, { "epoch": 0.40481927710843374, "grad_norm": 1.3700056076049805, "learning_rate": 3.886076105595825e-05, "loss": 2.1728, "mean_token_accuracy": 0.6168380081653595, "num_tokens": 55622241.0, "step": 294 }, { "epoch": 0.40619621342512907, "grad_norm": 1.0644291639328003, "learning_rate": 3.885084284141218e-05, "loss": 2.6747, "mean_token_accuracy": 0.5016192942857742, "num_tokens": 55892001.0, "step": 295 }, { "epoch": 0.40757314974182446, "grad_norm": 1.1584410667419434, "learning_rate": 3.8840882916633645e-05, "loss": 2.6034, "mean_token_accuracy": 0.5190937370061874, "num_tokens": 56091588.0, "step": 296 }, { "epoch": 0.4089500860585198, "grad_norm": 1.2445054054260254, "learning_rate": 3.883088130366042e-05, "loss": 2.3622, "mean_token_accuracy": 0.5567063391208649, "num_tokens": 56263163.0, "step": 297 }, { "epoch": 0.4103270223752151, "grad_norm": 1.16017484664917, "learning_rate": 3.882083802462254e-05, "loss": 2.1018, "mean_token_accuracy": 0.5977874398231506, "num_tokens": 56418414.0, "step": 298 }, { "epoch": 0.4117039586919105, "grad_norm": 1.1289631128311157, "learning_rate": 3.881075310174222e-05, "loss": 1.9062, "mean_token_accuracy": 0.6324700713157654, "num_tokens": 56561200.0, "step": 299 }, { "epoch": 0.41308089500860584, "grad_norm": 1.4063467979431152, "learning_rate": 3.88006265573338e-05, "loss": 1.9077, "mean_token_accuracy": 0.63576440513134, "num_tokens": 56685759.0, "step": 300 }, { "epoch": 0.41445783132530123, "grad_norm": 1.5332578420639038, "learning_rate": 3.879045841380377e-05, "loss": 2.5277, "mean_token_accuracy": 0.5204072222113609, "num_tokens": 57008564.0, "step": 301 }, { "epoch": 0.41583476764199656, "grad_norm": 1.1377595663070679, "learning_rate": 3.878024869365059e-05, "loss": 2.6917, "mean_token_accuracy": 0.508054431527853, "num_tokens": 57220152.0, "step": 302 }, { "epoch": 0.4172117039586919, "grad_norm": 1.1077525615692139, "learning_rate": 3.876999741946478e-05, "loss": 2.4178, "mean_token_accuracy": 0.5467412918806076, "num_tokens": 57397893.0, "step": 303 }, { "epoch": 0.4185886402753873, "grad_norm": 1.1673871278762817, "learning_rate": 3.8759704613928754e-05, "loss": 2.1402, "mean_token_accuracy": 0.5913644433021545, "num_tokens": 57556744.0, "step": 304 }, { "epoch": 0.4199655765920826, "grad_norm": 1.173911452293396, "learning_rate": 3.874937029981685e-05, "loss": 1.9598, "mean_token_accuracy": 0.6220569834113121, "num_tokens": 57702278.0, "step": 305 }, { "epoch": 0.42134251290877794, "grad_norm": 1.27133309841156, "learning_rate": 3.873899449999524e-05, "loss": 1.8393, "mean_token_accuracy": 0.645699992775917, "num_tokens": 57833377.0, "step": 306 }, { "epoch": 0.42271944922547333, "grad_norm": 1.4697576761245728, "learning_rate": 3.872857723742189e-05, "loss": 2.4597, "mean_token_accuracy": 0.5472123511135578, "num_tokens": 58126031.0, "step": 307 }, { "epoch": 0.42409638554216866, "grad_norm": 1.1394779682159424, "learning_rate": 3.871811853514652e-05, "loss": 2.6662, "mean_token_accuracy": 0.5077186748385429, "num_tokens": 58348136.0, "step": 308 }, { "epoch": 0.42547332185886405, "grad_norm": 1.1395163536071777, "learning_rate": 3.870761841631051e-05, "loss": 2.4643, "mean_token_accuracy": 0.5380654633045197, "num_tokens": 58531314.0, "step": 309 }, { "epoch": 0.4268502581755594, "grad_norm": 1.219191312789917, "learning_rate": 3.869707690414692e-05, "loss": 2.2522, "mean_token_accuracy": 0.5741859599947929, "num_tokens": 58693569.0, "step": 310 }, { "epoch": 0.4282271944922547, "grad_norm": 1.2060235738754272, "learning_rate": 3.8686494021980376e-05, "loss": 1.9759, "mean_token_accuracy": 0.6171657294034958, "num_tokens": 58842182.0, "step": 311 }, { "epoch": 0.4296041308089501, "grad_norm": 1.2168887853622437, "learning_rate": 3.867586979322703e-05, "loss": 1.8513, "mean_token_accuracy": 0.6430638283491135, "num_tokens": 58977314.0, "step": 312 }, { "epoch": 0.43098106712564543, "grad_norm": 1.3314592838287354, "learning_rate": 3.866520424139455e-05, "loss": 2.3797, "mean_token_accuracy": 0.5823260992765427, "num_tokens": 59230488.0, "step": 313 }, { "epoch": 0.43235800344234077, "grad_norm": 1.0316308736801147, "learning_rate": 3.865449739008202e-05, "loss": 2.6765, "mean_token_accuracy": 0.5044165439903736, "num_tokens": 59471412.0, "step": 314 }, { "epoch": 0.43373493975903615, "grad_norm": 1.1425864696502686, "learning_rate": 3.8643749262979896e-05, "loss": 2.5153, "mean_token_accuracy": 0.5291946306824684, "num_tokens": 59662014.0, "step": 315 }, { "epoch": 0.4351118760757315, "grad_norm": 1.1880890130996704, "learning_rate": 3.8632959883869985e-05, "loss": 2.2505, "mean_token_accuracy": 0.5715990290045738, "num_tokens": 59828653.0, "step": 316 }, { "epoch": 0.4364888123924269, "grad_norm": 1.1730512380599976, "learning_rate": 3.8622129276625345e-05, "loss": 2.0203, "mean_token_accuracy": 0.6127811521291733, "num_tokens": 59980646.0, "step": 317 }, { "epoch": 0.4378657487091222, "grad_norm": 1.167614459991455, "learning_rate": 3.861125746521028e-05, "loss": 1.8815, "mean_token_accuracy": 0.6351702809333801, "num_tokens": 60120367.0, "step": 318 }, { "epoch": 0.43924268502581754, "grad_norm": 1.374399185180664, "learning_rate": 3.860034447368024e-05, "loss": 2.202, "mean_token_accuracy": 0.6178101450204849, "num_tokens": 60328611.0, "step": 319 }, { "epoch": 0.4406196213425129, "grad_norm": 1.1421829462051392, "learning_rate": 3.858939032618184e-05, "loss": 2.6359, "mean_token_accuracy": 0.5092440620064735, "num_tokens": 60598743.0, "step": 320 }, { "epoch": 0.44199655765920826, "grad_norm": 1.1470688581466675, "learning_rate": 3.8578395046952686e-05, "loss": 2.5965, "mean_token_accuracy": 0.5178424827754498, "num_tokens": 60800214.0, "step": 321 }, { "epoch": 0.4433734939759036, "grad_norm": 1.1126904487609863, "learning_rate": 3.856735866032145e-05, "loss": 2.3266, "mean_token_accuracy": 0.5593061447143555, "num_tokens": 60972436.0, "step": 322 }, { "epoch": 0.444750430292599, "grad_norm": 1.1097062826156616, "learning_rate": 3.855628119070775e-05, "loss": 2.0554, "mean_token_accuracy": 0.6080252379179001, "num_tokens": 61127803.0, "step": 323 }, { "epoch": 0.4461273666092943, "grad_norm": 1.1265738010406494, "learning_rate": 3.85451626626221e-05, "loss": 1.8777, "mean_token_accuracy": 0.6346163600683212, "num_tokens": 61270497.0, "step": 324 }, { "epoch": 0.4475043029259897, "grad_norm": 1.3671690225601196, "learning_rate": 3.853400310066584e-05, "loss": 1.87, "mean_token_accuracy": 0.6416526138782501, "num_tokens": 61394962.0, "step": 325 }, { "epoch": 0.448881239242685, "grad_norm": 1.6358295679092407, "learning_rate": 3.8522802529531146e-05, "loss": 2.5149, "mean_token_accuracy": 0.5204578451812267, "num_tokens": 61725087.0, "step": 326 }, { "epoch": 0.45025817555938036, "grad_norm": 1.2489683628082275, "learning_rate": 3.85115609740009e-05, "loss": 2.6615, "mean_token_accuracy": 0.5079396292567253, "num_tokens": 61935865.0, "step": 327 }, { "epoch": 0.45163511187607575, "grad_norm": 1.1859657764434814, "learning_rate": 3.850027845894868e-05, "loss": 2.3973, "mean_token_accuracy": 0.5492968037724495, "num_tokens": 62113409.0, "step": 328 }, { "epoch": 0.4530120481927711, "grad_norm": 1.1950750350952148, "learning_rate": 3.84889550093387e-05, "loss": 2.1358, "mean_token_accuracy": 0.5911310464143753, "num_tokens": 62272500.0, "step": 329 }, { "epoch": 0.45438898450946646, "grad_norm": 1.193273901939392, "learning_rate": 3.8477590650225735e-05, "loss": 1.8838, "mean_token_accuracy": 0.6347566321492195, "num_tokens": 62418829.0, "step": 330 }, { "epoch": 0.4557659208261618, "grad_norm": 1.4162752628326416, "learning_rate": 3.846618540675509e-05, "loss": 1.8155, "mean_token_accuracy": 0.6521291509270668, "num_tokens": 62550532.0, "step": 331 }, { "epoch": 0.45714285714285713, "grad_norm": 1.2214502096176147, "learning_rate": 3.84547393041625e-05, "loss": 2.3983, "mean_token_accuracy": 0.5548037365078926, "num_tokens": 62846428.0, "step": 332 }, { "epoch": 0.4585197934595525, "grad_norm": 0.9276536107063293, "learning_rate": 3.8443252367774164e-05, "loss": 2.6342, "mean_token_accuracy": 0.511383481323719, "num_tokens": 63070547.0, "step": 333 }, { "epoch": 0.45989672977624785, "grad_norm": 1.0392072200775146, "learning_rate": 3.843172462300658e-05, "loss": 2.4005, "mean_token_accuracy": 0.5492786094546318, "num_tokens": 63254763.0, "step": 334 }, { "epoch": 0.4612736660929432, "grad_norm": 1.107399582862854, "learning_rate": 3.842015609536657e-05, "loss": 2.1411, "mean_token_accuracy": 0.5903339684009552, "num_tokens": 63417679.0, "step": 335 }, { "epoch": 0.46265060240963857, "grad_norm": 1.1386661529541016, "learning_rate": 3.8408546810451176e-05, "loss": 1.9382, "mean_token_accuracy": 0.6253572553396225, "num_tokens": 63566509.0, "step": 336 }, { "epoch": 0.4640275387263339, "grad_norm": 1.1869807243347168, "learning_rate": 3.8396896793947645e-05, "loss": 1.8033, "mean_token_accuracy": 0.6471054255962372, "num_tokens": 63702049.0, "step": 337 }, { "epoch": 0.4654044750430293, "grad_norm": 1.2317746877670288, "learning_rate": 3.8385206071633315e-05, "loss": 2.2545, "mean_token_accuracy": 0.5969243720173836, "num_tokens": 63956679.0, "step": 338 }, { "epoch": 0.4667814113597246, "grad_norm": 0.9732272624969482, "learning_rate": 3.837347466937562e-05, "loss": 2.6312, "mean_token_accuracy": 0.5144596695899963, "num_tokens": 64197550.0, "step": 339 }, { "epoch": 0.46815834767641995, "grad_norm": 1.063503623008728, "learning_rate": 3.8361702613131996e-05, "loss": 2.4565, "mean_token_accuracy": 0.5400962606072426, "num_tokens": 64387997.0, "step": 340 }, { "epoch": 0.46953528399311534, "grad_norm": 1.1035934686660767, "learning_rate": 3.834988992894983e-05, "loss": 2.2058, "mean_token_accuracy": 0.5828883945941925, "num_tokens": 64554984.0, "step": 341 }, { "epoch": 0.47091222030981067, "grad_norm": 1.1325132846832275, "learning_rate": 3.8338036642966396e-05, "loss": 1.9558, "mean_token_accuracy": 0.6222071722149849, "num_tokens": 64707451.0, "step": 342 }, { "epoch": 0.472289156626506, "grad_norm": 1.1343505382537842, "learning_rate": 3.8326142781408826e-05, "loss": 1.8212, "mean_token_accuracy": 0.6449543908238411, "num_tokens": 64847106.0, "step": 343 }, { "epoch": 0.4736660929432014, "grad_norm": 1.4852514266967773, "learning_rate": 3.8314208370594006e-05, "loss": 2.1245, "mean_token_accuracy": 0.6291059926152229, "num_tokens": 65054279.0, "step": 344 }, { "epoch": 0.4750430292598967, "grad_norm": 1.2754966020584106, "learning_rate": 3.830223343692857e-05, "loss": 2.6386, "mean_token_accuracy": 0.5083741173148155, "num_tokens": 65319922.0, "step": 345 }, { "epoch": 0.4764199655765921, "grad_norm": 1.202256679534912, "learning_rate": 3.829021800690879e-05, "loss": 2.5488, "mean_token_accuracy": 0.5239560641348362, "num_tokens": 65519624.0, "step": 346 }, { "epoch": 0.47779690189328744, "grad_norm": 1.2088032960891724, "learning_rate": 3.827816210712056e-05, "loss": 2.2767, "mean_token_accuracy": 0.5680591911077499, "num_tokens": 65691848.0, "step": 347 }, { "epoch": 0.47917383820998277, "grad_norm": 1.230127215385437, "learning_rate": 3.826606576423931e-05, "loss": 2.0272, "mean_token_accuracy": 0.610013946890831, "num_tokens": 65847153.0, "step": 348 }, { "epoch": 0.48055077452667816, "grad_norm": 1.3079272508621216, "learning_rate": 3.8253929005029955e-05, "loss": 1.8615, "mean_token_accuracy": 0.6363618820905685, "num_tokens": 65989998.0, "step": 349 }, { "epoch": 0.4819277108433735, "grad_norm": 1.5122532844543457, "learning_rate": 3.824175185634685e-05, "loss": 1.8956, "mean_token_accuracy": 0.6396601721644402, "num_tokens": 66114487.0, "step": 350 }, { "epoch": 0.4833046471600688, "grad_norm": 1.355708122253418, "learning_rate": 3.8229534345133695e-05, "loss": 2.4051, "mean_token_accuracy": 0.5355495847761631, "num_tokens": 66447088.0, "step": 351 }, { "epoch": 0.4846815834767642, "grad_norm": 1.1190855503082275, "learning_rate": 3.821727649842352e-05, "loss": 2.5926, "mean_token_accuracy": 0.5166705474257469, "num_tokens": 66657750.0, "step": 352 }, { "epoch": 0.48605851979345954, "grad_norm": 1.1835553646087646, "learning_rate": 3.8204978343338596e-05, "loss": 2.3463, "mean_token_accuracy": 0.5554249733686447, "num_tokens": 66835155.0, "step": 353 }, { "epoch": 0.48743545611015493, "grad_norm": 1.2817888259887695, "learning_rate": 3.819263990709037e-05, "loss": 2.0688, "mean_token_accuracy": 0.5984159037470818, "num_tokens": 66993260.0, "step": 354 }, { "epoch": 0.48881239242685026, "grad_norm": 1.2164028882980347, "learning_rate": 3.818026121697944e-05, "loss": 1.875, "mean_token_accuracy": 0.6318419501185417, "num_tokens": 67138339.0, "step": 355 }, { "epoch": 0.4901893287435456, "grad_norm": 1.2706493139266968, "learning_rate": 3.816784230039545e-05, "loss": 1.7807, "mean_token_accuracy": 0.6553415209054947, "num_tokens": 67268407.0, "step": 356 }, { "epoch": 0.491566265060241, "grad_norm": 1.5827443599700928, "learning_rate": 3.8155383184817064e-05, "loss": 2.4285, "mean_token_accuracy": 0.5514861196279526, "num_tokens": 67557882.0, "step": 357 }, { "epoch": 0.4929432013769363, "grad_norm": 1.2734037637710571, "learning_rate": 3.814288389781189e-05, "loss": 2.607, "mean_token_accuracy": 0.5158842727541924, "num_tokens": 67783015.0, "step": 358 }, { "epoch": 0.49432013769363164, "grad_norm": 1.202394723892212, "learning_rate": 3.813034446703641e-05, "loss": 2.4148, "mean_token_accuracy": 0.5459394603967667, "num_tokens": 67967538.0, "step": 359 }, { "epoch": 0.49569707401032703, "grad_norm": 1.2335023880004883, "learning_rate": 3.8117764920235945e-05, "loss": 2.1611, "mean_token_accuracy": 0.5895419120788574, "num_tokens": 68130470.0, "step": 360 }, { "epoch": 0.49707401032702236, "grad_norm": 1.2289198637008667, "learning_rate": 3.810514528524458e-05, "loss": 1.927, "mean_token_accuracy": 0.6273753494024277, "num_tokens": 68279811.0, "step": 361 }, { "epoch": 0.49845094664371775, "grad_norm": 1.2164381742477417, "learning_rate": 3.809248558998508e-05, "loss": 1.7647, "mean_token_accuracy": 0.6544591262936592, "num_tokens": 68416120.0, "step": 362 }, { "epoch": 0.4998278829604131, "grad_norm": 1.4626868963241577, "learning_rate": 3.807978586246887e-05, "loss": 2.3143, "mean_token_accuracy": 0.5882294699549675, "num_tokens": 68671551.0, "step": 363 }, { "epoch": 0.5012048192771085, "grad_norm": 1.269656777381897, "learning_rate": 3.806704613079595e-05, "loss": 2.637, "mean_token_accuracy": 0.5106620527803898, "num_tokens": 68912101.0, "step": 364 }, { "epoch": 0.5025817555938038, "grad_norm": 1.1669974327087402, "learning_rate": 3.805426642315481e-05, "loss": 2.476, "mean_token_accuracy": 0.5312730297446251, "num_tokens": 69102429.0, "step": 365 }, { "epoch": 0.5039586919104991, "grad_norm": 1.1632492542266846, "learning_rate": 3.804144676782243e-05, "loss": 2.1837, "mean_token_accuracy": 0.5860177800059319, "num_tokens": 69269441.0, "step": 366 }, { "epoch": 0.5053356282271945, "grad_norm": 1.1919527053833008, "learning_rate": 3.8028587193164164e-05, "loss": 1.9591, "mean_token_accuracy": 0.6149293929338455, "num_tokens": 69421534.0, "step": 367 }, { "epoch": 0.5067125645438898, "grad_norm": 1.2426389455795288, "learning_rate": 3.8015687727633686e-05, "loss": 1.7984, "mean_token_accuracy": 0.6475590094923973, "num_tokens": 69560848.0, "step": 368 }, { "epoch": 0.5080895008605852, "grad_norm": 1.1751524209976196, "learning_rate": 3.800274839977293e-05, "loss": 2.0852, "mean_token_accuracy": 0.633161723613739, "num_tokens": 69766110.0, "step": 369 }, { "epoch": 0.5094664371772806, "grad_norm": 0.9786078929901123, "learning_rate": 3.798976923821207e-05, "loss": 2.5538, "mean_token_accuracy": 0.5209829434752464, "num_tokens": 70034136.0, "step": 370 }, { "epoch": 0.5108433734939759, "grad_norm": 1.051689863204956, "learning_rate": 3.7976750271669353e-05, "loss": 2.4869, "mean_token_accuracy": 0.5315357744693756, "num_tokens": 70234874.0, "step": 371 }, { "epoch": 0.5122203098106712, "grad_norm": 1.0836272239685059, "learning_rate": 3.796369152895117e-05, "loss": 2.2561, "mean_token_accuracy": 0.5750308036804199, "num_tokens": 70406911.0, "step": 372 }, { "epoch": 0.5135972461273666, "grad_norm": 1.0925536155700684, "learning_rate": 3.795059303895187e-05, "loss": 1.9674, "mean_token_accuracy": 0.6178411841392517, "num_tokens": 70562268.0, "step": 373 }, { "epoch": 0.514974182444062, "grad_norm": 1.0876672267913818, "learning_rate": 3.793745483065377e-05, "loss": 1.7457, "mean_token_accuracy": 0.6547938734292984, "num_tokens": 70704691.0, "step": 374 }, { "epoch": 0.5163511187607573, "grad_norm": 1.3272855281829834, "learning_rate": 3.792427693312707e-05, "loss": 1.7869, "mean_token_accuracy": 0.6502839550375938, "num_tokens": 70829296.0, "step": 375 }, { "epoch": 0.5177280550774527, "grad_norm": 1.3884214162826538, "learning_rate": 3.7911059375529785e-05, "loss": 2.387, "mean_token_accuracy": 0.539868026971817, "num_tokens": 71157746.0, "step": 376 }, { "epoch": 0.519104991394148, "grad_norm": 1.0792295932769775, "learning_rate": 3.789780218710768e-05, "loss": 2.5444, "mean_token_accuracy": 0.5255590602755547, "num_tokens": 71368761.0, "step": 377 }, { "epoch": 0.5204819277108433, "grad_norm": 1.0573134422302246, "learning_rate": 3.788450539719423e-05, "loss": 2.2752, "mean_token_accuracy": 0.5667640268802643, "num_tokens": 71546070.0, "step": 378 }, { "epoch": 0.5218588640275387, "grad_norm": 1.1654014587402344, "learning_rate": 3.7871169035210495e-05, "loss": 2.0153, "mean_token_accuracy": 0.6093085259199142, "num_tokens": 71704581.0, "step": 379 }, { "epoch": 0.5232358003442341, "grad_norm": 1.1366355419158936, "learning_rate": 3.7857793130665135e-05, "loss": 1.8161, "mean_token_accuracy": 0.6431608349084854, "num_tokens": 71850187.0, "step": 380 }, { "epoch": 0.5246127366609294, "grad_norm": 1.1983455419540405, "learning_rate": 3.7844377713154264e-05, "loss": 1.7322, "mean_token_accuracy": 0.6639300957322121, "num_tokens": 71980912.0, "step": 381 }, { "epoch": 0.5259896729776248, "grad_norm": 1.3232380151748657, "learning_rate": 3.783092281236147e-05, "loss": 2.3287, "mean_token_accuracy": 0.5650414451956749, "num_tokens": 72275589.0, "step": 382 }, { "epoch": 0.5273666092943201, "grad_norm": 0.9793266654014587, "learning_rate": 3.7817428458057655e-05, "loss": 2.5536, "mean_token_accuracy": 0.5256604924798012, "num_tokens": 72499712.0, "step": 383 }, { "epoch": 0.5287435456110154, "grad_norm": 1.040682077407837, "learning_rate": 3.780389468010106e-05, "loss": 2.3567, "mean_token_accuracy": 0.5591141432523727, "num_tokens": 72683575.0, "step": 384 }, { "epoch": 0.5301204819277109, "grad_norm": 1.1221572160720825, "learning_rate": 3.7790321508437124e-05, "loss": 2.0654, "mean_token_accuracy": 0.6014585271477699, "num_tokens": 72845980.0, "step": 385 }, { "epoch": 0.5314974182444062, "grad_norm": 1.1273812055587769, "learning_rate": 3.7776708973098476e-05, "loss": 1.8353, "mean_token_accuracy": 0.6380122005939484, "num_tokens": 72994914.0, "step": 386 }, { "epoch": 0.5328743545611015, "grad_norm": 1.159727692604065, "learning_rate": 3.776305710420482e-05, "loss": 1.7334, "mean_token_accuracy": 0.6623869240283966, "num_tokens": 73130673.0, "step": 387 }, { "epoch": 0.5342512908777969, "grad_norm": 1.2239432334899902, "learning_rate": 3.774936593196291e-05, "loss": 2.1866, "mean_token_accuracy": 0.6117709055542946, "num_tokens": 73388655.0, "step": 388 }, { "epoch": 0.5356282271944922, "grad_norm": 1.0188941955566406, "learning_rate": 3.773563548666645e-05, "loss": 2.5355, "mean_token_accuracy": 0.5253261998295784, "num_tokens": 73631053.0, "step": 389 }, { "epoch": 0.5370051635111877, "grad_norm": 1.0915037393569946, "learning_rate": 3.7721865798696056e-05, "loss": 2.3944, "mean_token_accuracy": 0.5448566824197769, "num_tokens": 73822519.0, "step": 390 }, { "epoch": 0.538382099827883, "grad_norm": 1.0830886363983154, "learning_rate": 3.770805689851916e-05, "loss": 2.1569, "mean_token_accuracy": 0.5870517790317535, "num_tokens": 73989639.0, "step": 391 }, { "epoch": 0.5397590361445783, "grad_norm": 1.116917371749878, "learning_rate": 3.769420881668996e-05, "loss": 1.9104, "mean_token_accuracy": 0.6254802271723747, "num_tokens": 74141907.0, "step": 392 }, { "epoch": 0.5411359724612737, "grad_norm": 1.172143578529358, "learning_rate": 3.7680321583849365e-05, "loss": 1.7663, "mean_token_accuracy": 0.6534927934408188, "num_tokens": 74281445.0, "step": 393 }, { "epoch": 0.542512908777969, "grad_norm": 1.2859337329864502, "learning_rate": 3.7666395230724885e-05, "loss": 2.0109, "mean_token_accuracy": 0.6392742097377777, "num_tokens": 74490499.0, "step": 394 }, { "epoch": 0.5438898450946644, "grad_norm": 1.0432875156402588, "learning_rate": 3.7652429788130606e-05, "loss": 2.5656, "mean_token_accuracy": 0.521564669907093, "num_tokens": 74759334.0, "step": 395 }, { "epoch": 0.5452667814113598, "grad_norm": 1.0338257551193237, "learning_rate": 3.76384252869671e-05, "loss": 2.4785, "mean_token_accuracy": 0.5345182493329048, "num_tokens": 74959398.0, "step": 396 }, { "epoch": 0.5466437177280551, "grad_norm": 1.0591204166412354, "learning_rate": 3.762438175822137e-05, "loss": 2.2065, "mean_token_accuracy": 0.579163633286953, "num_tokens": 75130381.0, "step": 397 }, { "epoch": 0.5480206540447504, "grad_norm": 1.1158242225646973, "learning_rate": 3.761029923296677e-05, "loss": 1.9362, "mean_token_accuracy": 0.622274823486805, "num_tokens": 75285253.0, "step": 398 }, { "epoch": 0.5493975903614458, "grad_norm": 1.1422702074050903, "learning_rate": 3.759617774236292e-05, "loss": 1.7818, "mean_token_accuracy": 0.6537824422121048, "num_tokens": 75427875.0, "step": 399 }, { "epoch": 0.5507745266781411, "grad_norm": 1.3791770935058594, "learning_rate": 3.758201731765568e-05, "loss": 1.7558, "mean_token_accuracy": 0.6604361683130264, "num_tokens": 75552420.0, "step": 400 }, { "epoch": 0.5521514629948365, "grad_norm": 1.3844491243362427, "learning_rate": 3.756781799017705e-05, "loss": 2.3958, "mean_token_accuracy": 0.5384999141097069, "num_tokens": 75881000.0, "step": 401 }, { "epoch": 0.5535283993115319, "grad_norm": 0.9900819659233093, "learning_rate": 3.755357979134511e-05, "loss": 2.5177, "mean_token_accuracy": 0.5334893316030502, "num_tokens": 76091732.0, "step": 402 }, { "epoch": 0.5549053356282272, "grad_norm": 1.0822157859802246, "learning_rate": 3.753930275266395e-05, "loss": 2.2696, "mean_token_accuracy": 0.571400560438633, "num_tokens": 76269381.0, "step": 403 }, { "epoch": 0.5562822719449225, "grad_norm": 1.1467323303222656, "learning_rate": 3.7524986905723595e-05, "loss": 2.0004, "mean_token_accuracy": 0.6144689172506332, "num_tokens": 76427922.0, "step": 404 }, { "epoch": 0.5576592082616179, "grad_norm": 1.127500057220459, "learning_rate": 3.751063228219993e-05, "loss": 1.8066, "mean_token_accuracy": 0.6450710445642471, "num_tokens": 76573653.0, "step": 405 }, { "epoch": 0.5590361445783133, "grad_norm": 1.2141149044036865, "learning_rate": 3.749623891385465e-05, "loss": 1.7057, "mean_token_accuracy": 0.6636545956134796, "num_tokens": 76704542.0, "step": 406 }, { "epoch": 0.5604130808950086, "grad_norm": 1.2224293947219849, "learning_rate": 3.748180683253518e-05, "loss": 2.277, "mean_token_accuracy": 0.5706998407840729, "num_tokens": 76998657.0, "step": 407 }, { "epoch": 0.561790017211704, "grad_norm": 1.1562210321426392, "learning_rate": 3.7467336070174604e-05, "loss": 2.5453, "mean_token_accuracy": 0.5291676223278046, "num_tokens": 77222548.0, "step": 408 }, { "epoch": 0.5631669535283993, "grad_norm": 1.0416022539138794, "learning_rate": 3.745282665879158e-05, "loss": 2.3151, "mean_token_accuracy": 0.5649291053414345, "num_tokens": 77405387.0, "step": 409 }, { "epoch": 0.5645438898450946, "grad_norm": 1.0621392726898193, "learning_rate": 3.743827863049029e-05, "loss": 2.0041, "mean_token_accuracy": 0.6135663986206055, "num_tokens": 77567239.0, "step": 410 }, { "epoch": 0.5659208261617901, "grad_norm": 1.0950725078582764, "learning_rate": 3.742369201746038e-05, "loss": 1.7772, "mean_token_accuracy": 0.6507838368415833, "num_tokens": 77715661.0, "step": 411 }, { "epoch": 0.5672977624784854, "grad_norm": 1.1909273862838745, "learning_rate": 3.740906685197684e-05, "loss": 1.6975, "mean_token_accuracy": 0.6659714430570602, "num_tokens": 77851131.0, "step": 412 }, { "epoch": 0.5686746987951807, "grad_norm": 1.350914716720581, "learning_rate": 3.7394403166399986e-05, "loss": 2.2219, "mean_token_accuracy": 0.6066325157880783, "num_tokens": 78111863.0, "step": 413 }, { "epoch": 0.5700516351118761, "grad_norm": 1.0218485593795776, "learning_rate": 3.737970099317535e-05, "loss": 2.5382, "mean_token_accuracy": 0.5277980640530586, "num_tokens": 78358030.0, "step": 414 }, { "epoch": 0.5714285714285714, "grad_norm": 1.0065844058990479, "learning_rate": 3.7364960364833614e-05, "loss": 2.4038, "mean_token_accuracy": 0.5496814921498299, "num_tokens": 78551234.0, "step": 415 }, { "epoch": 0.5728055077452667, "grad_norm": 1.1074278354644775, "learning_rate": 3.7350181313990595e-05, "loss": 2.1103, "mean_token_accuracy": 0.5954185798764229, "num_tokens": 78718972.0, "step": 416 }, { "epoch": 0.5741824440619622, "grad_norm": 1.2066971063613892, "learning_rate": 3.7335363873347056e-05, "loss": 1.8512, "mean_token_accuracy": 0.6406131908297539, "num_tokens": 78871088.0, "step": 417 }, { "epoch": 0.5755593803786575, "grad_norm": 1.1858893632888794, "learning_rate": 3.732050807568878e-05, "loss": 1.7536, "mean_token_accuracy": 0.6553611159324646, "num_tokens": 79010290.0, "step": 418 }, { "epoch": 0.5769363166953528, "grad_norm": 1.3192733526229858, "learning_rate": 3.730561395388635e-05, "loss": 1.9706, "mean_token_accuracy": 0.6437678560614586, "num_tokens": 79218011.0, "step": 419 }, { "epoch": 0.5783132530120482, "grad_norm": 1.0126075744628906, "learning_rate": 3.729068154089519e-05, "loss": 2.4658, "mean_token_accuracy": 0.5402820035815239, "num_tokens": 79484510.0, "step": 420 }, { "epoch": 0.5796901893287435, "grad_norm": 0.9996421337127686, "learning_rate": 3.727571086975544e-05, "loss": 2.4306, "mean_token_accuracy": 0.5431196466088295, "num_tokens": 79684274.0, "step": 421 }, { "epoch": 0.5810671256454389, "grad_norm": 1.115960955619812, "learning_rate": 3.726070197359187e-05, "loss": 2.178, "mean_token_accuracy": 0.5863956809043884, "num_tokens": 79855924.0, "step": 422 }, { "epoch": 0.5824440619621343, "grad_norm": 1.155556321144104, "learning_rate": 3.724565488561387e-05, "loss": 1.8533, "mean_token_accuracy": 0.6394290551543236, "num_tokens": 80010954.0, "step": 423 }, { "epoch": 0.5838209982788296, "grad_norm": 1.166739821434021, "learning_rate": 3.72305696391153e-05, "loss": 1.7464, "mean_token_accuracy": 0.6556121706962585, "num_tokens": 80153483.0, "step": 424 }, { "epoch": 0.5851979345955249, "grad_norm": 1.3337702751159668, "learning_rate": 3.721544626747446e-05, "loss": 1.7628, "mean_token_accuracy": 0.6624541580677032, "num_tokens": 80277692.0, "step": 425 }, { "epoch": 0.5865748709122203, "grad_norm": 1.319545865058899, "learning_rate": 3.720028480415401e-05, "loss": 2.3162, "mean_token_accuracy": 0.5507803335785866, "num_tokens": 80611995.0, "step": 426 }, { "epoch": 0.5879518072289157, "grad_norm": 1.2316042184829712, "learning_rate": 3.718508528270091e-05, "loss": 2.5182, "mean_token_accuracy": 0.5287582501769066, "num_tokens": 80823683.0, "step": 427 }, { "epoch": 0.589328743545611, "grad_norm": 1.1640440225601196, "learning_rate": 3.716984773674629e-05, "loss": 2.2591, "mean_token_accuracy": 0.5720143094658852, "num_tokens": 81001463.0, "step": 428 }, { "epoch": 0.5907056798623064, "grad_norm": 1.1831591129302979, "learning_rate": 3.7154572200005446e-05, "loss": 2.0028, "mean_token_accuracy": 0.6155145838856697, "num_tokens": 81160068.0, "step": 429 }, { "epoch": 0.5920826161790017, "grad_norm": 1.180765986442566, "learning_rate": 3.7139258706277725e-05, "loss": 1.7662, "mean_token_accuracy": 0.6555434614419937, "num_tokens": 81305931.0, "step": 430 }, { "epoch": 0.593459552495697, "grad_norm": 1.3636201620101929, "learning_rate": 3.712390728944647e-05, "loss": 1.6643, "mean_token_accuracy": 0.6755910143256187, "num_tokens": 81437121.0, "step": 431 }, { "epoch": 0.5948364888123924, "grad_norm": 1.308143138885498, "learning_rate": 3.710851798347891e-05, "loss": 2.2575, "mean_token_accuracy": 0.5792365446686745, "num_tokens": 81727156.0, "step": 432 }, { "epoch": 0.5962134251290878, "grad_norm": 1.0765011310577393, "learning_rate": 3.709309082242613e-05, "loss": 2.4826, "mean_token_accuracy": 0.5386991947889328, "num_tokens": 81949708.0, "step": 433 }, { "epoch": 0.5975903614457831, "grad_norm": 1.028989315032959, "learning_rate": 3.707762584042297e-05, "loss": 2.2967, "mean_token_accuracy": 0.5677397027611732, "num_tokens": 82133496.0, "step": 434 }, { "epoch": 0.5989672977624785, "grad_norm": 1.1149604320526123, "learning_rate": 3.7062123071687944e-05, "loss": 2.0308, "mean_token_accuracy": 0.6089385002851486, "num_tokens": 82296466.0, "step": 435 }, { "epoch": 0.6003442340791738, "grad_norm": 1.1276519298553467, "learning_rate": 3.704658255052319e-05, "loss": 1.7807, "mean_token_accuracy": 0.6483778953552246, "num_tokens": 82445442.0, "step": 436 }, { "epoch": 0.6017211703958691, "grad_norm": 1.182455062866211, "learning_rate": 3.703100431131438e-05, "loss": 1.686, "mean_token_accuracy": 0.6659825369715691, "num_tokens": 82581132.0, "step": 437 }, { "epoch": 0.6030981067125646, "grad_norm": 1.4108080863952637, "learning_rate": 3.701538838853062e-05, "loss": 2.2037, "mean_token_accuracy": 0.6076828017830849, "num_tokens": 82836137.0, "step": 438 }, { "epoch": 0.6044750430292599, "grad_norm": 1.093623161315918, "learning_rate": 3.699973481672442e-05, "loss": 2.4973, "mean_token_accuracy": 0.5338713526725769, "num_tokens": 83080335.0, "step": 439 }, { "epoch": 0.6058519793459552, "grad_norm": 1.0736603736877441, "learning_rate": 3.698404363053158e-05, "loss": 2.3644, "mean_token_accuracy": 0.5539751797914505, "num_tokens": 83271862.0, "step": 440 }, { "epoch": 0.6072289156626506, "grad_norm": 1.1071313619613647, "learning_rate": 3.696831486467114e-05, "loss": 2.0986, "mean_token_accuracy": 0.5987475737929344, "num_tokens": 83439106.0, "step": 441 }, { "epoch": 0.6086058519793459, "grad_norm": 1.160266399383545, "learning_rate": 3.695254855394527e-05, "loss": 1.8515, "mean_token_accuracy": 0.6391797661781311, "num_tokens": 83591196.0, "step": 442 }, { "epoch": 0.6099827882960414, "grad_norm": 1.1559243202209473, "learning_rate": 3.693674473323924e-05, "loss": 1.7093, "mean_token_accuracy": 0.6644431203603745, "num_tokens": 83730610.0, "step": 443 }, { "epoch": 0.6113597246127367, "grad_norm": 1.4806026220321655, "learning_rate": 3.6920903437521305e-05, "loss": 2.0282, "mean_token_accuracy": 0.6394726037979126, "num_tokens": 83936181.0, "step": 444 }, { "epoch": 0.612736660929432, "grad_norm": 1.2222179174423218, "learning_rate": 3.690502470184262e-05, "loss": 2.4701, "mean_token_accuracy": 0.5345804467797279, "num_tokens": 84205106.0, "step": 445 }, { "epoch": 0.6141135972461274, "grad_norm": 1.0871068239212036, "learning_rate": 3.6889108561337205e-05, "loss": 2.3824, "mean_token_accuracy": 0.5503176525235176, "num_tokens": 84405623.0, "step": 446 }, { "epoch": 0.6154905335628227, "grad_norm": 1.1063244342803955, "learning_rate": 3.6873155051221846e-05, "loss": 2.1876, "mean_token_accuracy": 0.5814000964164734, "num_tokens": 84577569.0, "step": 447 }, { "epoch": 0.6168674698795181, "grad_norm": 1.1225630044937134, "learning_rate": 3.685716420679599e-05, "loss": 1.8723, "mean_token_accuracy": 0.6332834810018539, "num_tokens": 84733031.0, "step": 448 }, { "epoch": 0.6182444061962135, "grad_norm": 1.2177627086639404, "learning_rate": 3.684113606344172e-05, "loss": 1.7113, "mean_token_accuracy": 0.6616174280643463, "num_tokens": 84876300.0, "step": 449 }, { "epoch": 0.6196213425129088, "grad_norm": 1.468108057975769, "learning_rate": 3.6825070656623626e-05, "loss": 1.7131, "mean_token_accuracy": 0.6647237986326218, "num_tokens": 85001247.0, "step": 450 }, { "epoch": 0.6209982788296041, "grad_norm": 1.2129855155944824, "learning_rate": 3.680896802188876e-05, "loss": 2.2804, "mean_token_accuracy": 0.5559933036565781, "num_tokens": 85329356.0, "step": 451 }, { "epoch": 0.6223752151462995, "grad_norm": 1.1929696798324585, "learning_rate": 3.679282819486656e-05, "loss": 2.4667, "mean_token_accuracy": 0.5363494455814362, "num_tokens": 85539011.0, "step": 452 }, { "epoch": 0.6237521514629948, "grad_norm": 1.1579437255859375, "learning_rate": 3.677665121126871e-05, "loss": 2.2061, "mean_token_accuracy": 0.5794221386313438, "num_tokens": 85715805.0, "step": 453 }, { "epoch": 0.6251290877796902, "grad_norm": 1.1737624406814575, "learning_rate": 3.676043710688916e-05, "loss": 1.95, "mean_token_accuracy": 0.6177288889884949, "num_tokens": 85874128.0, "step": 454 }, { "epoch": 0.6265060240963856, "grad_norm": 1.1771284341812134, "learning_rate": 3.674418591760398e-05, "loss": 1.74, "mean_token_accuracy": 0.6584135517477989, "num_tokens": 86019729.0, "step": 455 }, { "epoch": 0.6278829604130809, "grad_norm": 1.305204153060913, "learning_rate": 3.6727897679371276e-05, "loss": 1.6754, "mean_token_accuracy": 0.6747702434659004, "num_tokens": 86151008.0, "step": 456 }, { "epoch": 0.6292598967297762, "grad_norm": 1.5005815029144287, "learning_rate": 3.671157242823116e-05, "loss": 2.3042, "mean_token_accuracy": 0.5708301886916161, "num_tokens": 86439689.0, "step": 457 }, { "epoch": 0.6306368330464716, "grad_norm": 1.2939491271972656, "learning_rate": 3.669521020030561e-05, "loss": 2.518, "mean_token_accuracy": 0.526306688785553, "num_tokens": 86662038.0, "step": 458 }, { "epoch": 0.632013769363167, "grad_norm": 1.1339584589004517, "learning_rate": 3.667881103179844e-05, "loss": 2.2499, "mean_token_accuracy": 0.5713271573185921, "num_tokens": 86845073.0, "step": 459 }, { "epoch": 0.6333907056798623, "grad_norm": 1.092398762702942, "learning_rate": 3.6662374958995204e-05, "loss": 1.9807, "mean_token_accuracy": 0.6155904233455658, "num_tokens": 87007364.0, "step": 460 }, { "epoch": 0.6347676419965577, "grad_norm": 1.1532047986984253, "learning_rate": 3.66459020182631e-05, "loss": 1.7481, "mean_token_accuracy": 0.6569642052054405, "num_tokens": 87156191.0, "step": 461 }, { "epoch": 0.636144578313253, "grad_norm": 1.2414846420288086, "learning_rate": 3.662939224605091e-05, "loss": 1.6674, "mean_token_accuracy": 0.669509693980217, "num_tokens": 87292049.0, "step": 462 }, { "epoch": 0.6375215146299483, "grad_norm": 1.479623556137085, "learning_rate": 3.66128456788889e-05, "loss": 2.1206, "mean_token_accuracy": 0.6143007650971413, "num_tokens": 87552339.0, "step": 463 }, { "epoch": 0.6388984509466438, "grad_norm": 1.2324708700180054, "learning_rate": 3.6596262353388756e-05, "loss": 2.5173, "mean_token_accuracy": 0.530449628829956, "num_tokens": 87796425.0, "step": 464 }, { "epoch": 0.6402753872633391, "grad_norm": 1.123120665550232, "learning_rate": 3.657964230624351e-05, "loss": 2.3198, "mean_token_accuracy": 0.5637719556689262, "num_tokens": 87988065.0, "step": 465 }, { "epoch": 0.6416523235800344, "grad_norm": 1.1310780048370361, "learning_rate": 3.6562985574227414e-05, "loss": 2.1063, "mean_token_accuracy": 0.5970068126916885, "num_tokens": 88155822.0, "step": 466 }, { "epoch": 0.6430292598967298, "grad_norm": 1.2435749769210815, "learning_rate": 3.654629219419594e-05, "loss": 1.7974, "mean_token_accuracy": 0.645952045917511, "num_tokens": 88308621.0, "step": 467 }, { "epoch": 0.6444061962134251, "grad_norm": 1.3368356227874756, "learning_rate": 3.6529562203085595e-05, "loss": 1.7062, "mean_token_accuracy": 0.6630885079503059, "num_tokens": 88448645.0, "step": 468 }, { "epoch": 0.6457831325301204, "grad_norm": 1.2511978149414062, "learning_rate": 3.651279563791393e-05, "loss": 1.9383, "mean_token_accuracy": 0.644863449037075, "num_tokens": 88657699.0, "step": 469 }, { "epoch": 0.6471600688468159, "grad_norm": 1.0245795249938965, "learning_rate": 3.649599253577942e-05, "loss": 2.482, "mean_token_accuracy": 0.5356618538498878, "num_tokens": 88926727.0, "step": 470 }, { "epoch": 0.6485370051635112, "grad_norm": 1.0277265310287476, "learning_rate": 3.6479152933861336e-05, "loss": 2.3795, "mean_token_accuracy": 0.5521609336137772, "num_tokens": 89127239.0, "step": 471 }, { "epoch": 0.6499139414802065, "grad_norm": 1.0928269624710083, "learning_rate": 3.646227686941979e-05, "loss": 2.1414, "mean_token_accuracy": 0.5898396968841553, "num_tokens": 89300075.0, "step": 472 }, { "epoch": 0.6512908777969019, "grad_norm": 1.10128653049469, "learning_rate": 3.64453643797955e-05, "loss": 1.8804, "mean_token_accuracy": 0.6330942437052727, "num_tokens": 89455503.0, "step": 473 }, { "epoch": 0.6526678141135972, "grad_norm": 1.0827891826629639, "learning_rate": 3.642841550240983e-05, "loss": 1.6935, "mean_token_accuracy": 0.6657865792512894, "num_tokens": 89598111.0, "step": 474 }, { "epoch": 0.6540447504302926, "grad_norm": 1.3395838737487793, "learning_rate": 3.641143027476463e-05, "loss": 1.6591, "mean_token_accuracy": 0.675447590649128, "num_tokens": 89722752.0, "step": 475 }, { "epoch": 0.655421686746988, "grad_norm": 1.4005639553070068, "learning_rate": 3.639440873444219e-05, "loss": 2.2972, "mean_token_accuracy": 0.5541393160820007, "num_tokens": 90050494.0, "step": 476 }, { "epoch": 0.6567986230636833, "grad_norm": 1.0419015884399414, "learning_rate": 3.6377350919105136e-05, "loss": 2.4193, "mean_token_accuracy": 0.5469594150781631, "num_tokens": 90261023.0, "step": 477 }, { "epoch": 0.6581755593803786, "grad_norm": 1.0622024536132812, "learning_rate": 3.636025686649637e-05, "loss": 2.1881, "mean_token_accuracy": 0.5827136784791946, "num_tokens": 90438521.0, "step": 478 }, { "epoch": 0.659552495697074, "grad_norm": 1.1319241523742676, "learning_rate": 3.6343126614438966e-05, "loss": 1.9261, "mean_token_accuracy": 0.6253581717610359, "num_tokens": 90597440.0, "step": 479 }, { "epoch": 0.6609294320137694, "grad_norm": 1.1031620502471924, "learning_rate": 3.632596020083612e-05, "loss": 1.7223, "mean_token_accuracy": 0.6593403667211533, "num_tokens": 90743382.0, "step": 480 }, { "epoch": 0.6623063683304647, "grad_norm": 1.2105294466018677, "learning_rate": 3.6308757663671e-05, "loss": 1.6194, "mean_token_accuracy": 0.6801304072141647, "num_tokens": 90874536.0, "step": 481 }, { "epoch": 0.6636833046471601, "grad_norm": 1.201481580734253, "learning_rate": 3.629151904100672e-05, "loss": 2.2268, "mean_token_accuracy": 0.5772555992007256, "num_tokens": 91169849.0, "step": 482 }, { "epoch": 0.6650602409638554, "grad_norm": 0.9734994769096375, "learning_rate": 3.627424437098625e-05, "loss": 2.4393, "mean_token_accuracy": 0.5421425625681877, "num_tokens": 91394652.0, "step": 483 }, { "epoch": 0.6664371772805507, "grad_norm": 1.0713133811950684, "learning_rate": 3.625693369183231e-05, "loss": 2.2417, "mean_token_accuracy": 0.5718699470162392, "num_tokens": 91578396.0, "step": 484 }, { "epoch": 0.6678141135972461, "grad_norm": 1.0906718969345093, "learning_rate": 3.623958704184729e-05, "loss": 1.9739, "mean_token_accuracy": 0.6149997934699059, "num_tokens": 91741093.0, "step": 485 }, { "epoch": 0.6691910499139415, "grad_norm": 1.0946438312530518, "learning_rate": 3.6222204459413186e-05, "loss": 1.7338, "mean_token_accuracy": 0.6562578082084656, "num_tokens": 91889825.0, "step": 486 }, { "epoch": 0.6705679862306368, "grad_norm": 1.1417019367218018, "learning_rate": 3.620478598299149e-05, "loss": 1.627, "mean_token_accuracy": 0.6770779266953468, "num_tokens": 92024849.0, "step": 487 }, { "epoch": 0.6719449225473322, "grad_norm": 1.1169488430023193, "learning_rate": 3.6187331651123107e-05, "loss": 2.116, "mean_token_accuracy": 0.617957316339016, "num_tokens": 92281256.0, "step": 488 }, { "epoch": 0.6733218588640275, "grad_norm": 0.9541012644767761, "learning_rate": 3.6169841502428285e-05, "loss": 2.457, "mean_token_accuracy": 0.5424294471740723, "num_tokens": 92521883.0, "step": 489 }, { "epoch": 0.6746987951807228, "grad_norm": 1.0350828170776367, "learning_rate": 3.6152315575606535e-05, "loss": 2.2974, "mean_token_accuracy": 0.5648578107357025, "num_tokens": 92712513.0, "step": 490 }, { "epoch": 0.6760757314974183, "grad_norm": 1.0942111015319824, "learning_rate": 3.613475390943651e-05, "loss": 1.987, "mean_token_accuracy": 0.6145618185400963, "num_tokens": 92879133.0, "step": 491 }, { "epoch": 0.6774526678141136, "grad_norm": 1.0778694152832031, "learning_rate": 3.611715654277596e-05, "loss": 1.7872, "mean_token_accuracy": 0.6495696157217026, "num_tokens": 93030676.0, "step": 492 }, { "epoch": 0.678829604130809, "grad_norm": 1.1471754312515259, "learning_rate": 3.609952351456161e-05, "loss": 1.6459, "mean_token_accuracy": 0.6755939722061157, "num_tokens": 93169780.0, "step": 493 }, { "epoch": 0.6802065404475043, "grad_norm": 1.5491341352462769, "learning_rate": 3.6081854863809104e-05, "loss": 1.9764, "mean_token_accuracy": 0.6522039249539375, "num_tokens": 93376824.0, "step": 494 }, { "epoch": 0.6815834767641996, "grad_norm": 1.078697919845581, "learning_rate": 3.60641506296129e-05, "loss": 2.398, "mean_token_accuracy": 0.5517597272992134, "num_tokens": 93642773.0, "step": 495 }, { "epoch": 0.6829604130808951, "grad_norm": 0.9739767909049988, "learning_rate": 3.6046410851146195e-05, "loss": 2.3287, "mean_token_accuracy": 0.5606530159711838, "num_tokens": 93841824.0, "step": 496 }, { "epoch": 0.6843373493975904, "grad_norm": 1.1142263412475586, "learning_rate": 3.602863556766083e-05, "loss": 2.0752, "mean_token_accuracy": 0.5990628451108932, "num_tokens": 94013477.0, "step": 497 }, { "epoch": 0.6857142857142857, "grad_norm": 1.116082787513733, "learning_rate": 3.601082481848721e-05, "loss": 1.8234, "mean_token_accuracy": 0.6377085968852043, "num_tokens": 94168519.0, "step": 498 }, { "epoch": 0.687091222030981, "grad_norm": 1.1260154247283936, "learning_rate": 3.59929786430342e-05, "loss": 1.6933, "mean_token_accuracy": 0.6669258624315262, "num_tokens": 94310910.0, "step": 499 }, { "epoch": 0.6884681583476764, "grad_norm": 1.3487789630889893, "learning_rate": 3.5975097080789066e-05, "loss": 1.6795, "mean_token_accuracy": 0.6731116250157356, "num_tokens": 94435256.0, "step": 500 }, { "epoch": 0.6898450946643718, "grad_norm": 1.2023367881774902, "learning_rate": 3.595718017131736e-05, "loss": 2.1988, "mean_token_accuracy": 0.5713008418679237, "num_tokens": 94769571.0, "step": 501 }, { "epoch": 0.6912220309810672, "grad_norm": 1.0112322568893433, "learning_rate": 3.593922795426286e-05, "loss": 2.4513, "mean_token_accuracy": 0.5425758436322212, "num_tokens": 94981403.0, "step": 502 }, { "epoch": 0.6925989672977625, "grad_norm": 1.011547565460205, "learning_rate": 3.5921240469347455e-05, "loss": 2.147, "mean_token_accuracy": 0.5906036868691444, "num_tokens": 95159263.0, "step": 503 }, { "epoch": 0.6939759036144578, "grad_norm": 1.1036226749420166, "learning_rate": 3.5903217756371066e-05, "loss": 1.848, "mean_token_accuracy": 0.640740342438221, "num_tokens": 95318084.0, "step": 504 }, { "epoch": 0.6953528399311532, "grad_norm": 1.107596516609192, "learning_rate": 3.588515985521157e-05, "loss": 1.6894, "mean_token_accuracy": 0.6650878712534904, "num_tokens": 95464024.0, "step": 505 }, { "epoch": 0.6967297762478485, "grad_norm": 1.2270642518997192, "learning_rate": 3.586706680582471e-05, "loss": 1.6211, "mean_token_accuracy": 0.682866781949997, "num_tokens": 95594659.0, "step": 506 }, { "epoch": 0.6981067125645439, "grad_norm": 1.2096253633499146, "learning_rate": 3.5848938648243976e-05, "loss": 2.1947, "mean_token_accuracy": 0.5825207903981209, "num_tokens": 95888011.0, "step": 507 }, { "epoch": 0.6994836488812393, "grad_norm": 0.9714229106903076, "learning_rate": 3.5830775422580564e-05, "loss": 2.415, "mean_token_accuracy": 0.5432767942547798, "num_tokens": 96112525.0, "step": 508 }, { "epoch": 0.7008605851979346, "grad_norm": 1.037974238395691, "learning_rate": 3.581257716902325e-05, "loss": 2.2123, "mean_token_accuracy": 0.5802514851093292, "num_tokens": 96297058.0, "step": 509 }, { "epoch": 0.7022375215146299, "grad_norm": 1.0934189558029175, "learning_rate": 3.579434392783832e-05, "loss": 1.9281, "mean_token_accuracy": 0.6215637549757957, "num_tokens": 96460299.0, "step": 510 }, { "epoch": 0.7036144578313253, "grad_norm": 1.090904951095581, "learning_rate": 3.577607573936947e-05, "loss": 1.728, "mean_token_accuracy": 0.6599818170070648, "num_tokens": 96609563.0, "step": 511 }, { "epoch": 0.7049913941480207, "grad_norm": 1.1365042924880981, "learning_rate": 3.575777264403772e-05, "loss": 1.5914, "mean_token_accuracy": 0.6810532063245773, "num_tokens": 96745441.0, "step": 512 }, { "epoch": 0.706368330464716, "grad_norm": 1.268089771270752, "learning_rate": 3.5739434682341355e-05, "loss": 2.1041, "mean_token_accuracy": 0.6240737736225128, "num_tokens": 96999769.0, "step": 513 }, { "epoch": 0.7077452667814114, "grad_norm": 1.0488606691360474, "learning_rate": 3.5721061894855756e-05, "loss": 2.4012, "mean_token_accuracy": 0.5495657026767731, "num_tokens": 97240907.0, "step": 514 }, { "epoch": 0.7091222030981067, "grad_norm": 1.0898374319076538, "learning_rate": 3.570265432223339e-05, "loss": 2.2587, "mean_token_accuracy": 0.5728898867964745, "num_tokens": 97431329.0, "step": 515 }, { "epoch": 0.710499139414802, "grad_norm": 1.1098390817642212, "learning_rate": 3.568421200520371e-05, "loss": 1.9685, "mean_token_accuracy": 0.6192406788468361, "num_tokens": 97597796.0, "step": 516 }, { "epoch": 0.7118760757314975, "grad_norm": 1.0867151021957397, "learning_rate": 3.566573498457301e-05, "loss": 1.7351, "mean_token_accuracy": 0.6555932313203812, "num_tokens": 97749499.0, "step": 517 }, { "epoch": 0.7132530120481928, "grad_norm": 1.1321520805358887, "learning_rate": 3.564722330122439e-05, "loss": 1.6291, "mean_token_accuracy": 0.6776033267378807, "num_tokens": 97888692.0, "step": 518 }, { "epoch": 0.7146299483648881, "grad_norm": 1.4763811826705933, "learning_rate": 3.562867699611764e-05, "loss": 1.8962, "mean_token_accuracy": 0.6636233776807785, "num_tokens": 98097936.0, "step": 519 }, { "epoch": 0.7160068846815835, "grad_norm": 1.1678882837295532, "learning_rate": 3.561009611028917e-05, "loss": 2.3561, "mean_token_accuracy": 0.5515822619199753, "num_tokens": 98369206.0, "step": 520 }, { "epoch": 0.7173838209982788, "grad_norm": 1.0545456409454346, "learning_rate": 3.559148068485188e-05, "loss": 2.3273, "mean_token_accuracy": 0.5596980229020119, "num_tokens": 98569759.0, "step": 521 }, { "epoch": 0.7187607573149741, "grad_norm": 1.1458402872085571, "learning_rate": 3.55728307609951e-05, "loss": 2.0731, "mean_token_accuracy": 0.6025697961449623, "num_tokens": 98741585.0, "step": 522 }, { "epoch": 0.7201376936316696, "grad_norm": 1.1328879594802856, "learning_rate": 3.555414637998453e-05, "loss": 1.7923, "mean_token_accuracy": 0.6433187499642372, "num_tokens": 98896511.0, "step": 523 }, { "epoch": 0.7215146299483649, "grad_norm": 1.0831434726715088, "learning_rate": 3.553542758316205e-05, "loss": 1.6162, "mean_token_accuracy": 0.6774062365293503, "num_tokens": 99038627.0, "step": 524 }, { "epoch": 0.7228915662650602, "grad_norm": 1.337433934211731, "learning_rate": 3.5516674411945747e-05, "loss": 1.659, "mean_token_accuracy": 0.6759642064571381, "num_tokens": 99162638.0, "step": 525 }, { "epoch": 0.7242685025817556, "grad_norm": 1.6398431062698364, "learning_rate": 3.549788690782972e-05, "loss": 2.2655, "mean_token_accuracy": 0.5585584118962288, "num_tokens": 99490429.0, "step": 526 }, { "epoch": 0.7256454388984509, "grad_norm": 1.2572869062423706, "learning_rate": 3.547906511238407e-05, "loss": 2.3918, "mean_token_accuracy": 0.5500163808465004, "num_tokens": 99700867.0, "step": 527 }, { "epoch": 0.7270223752151463, "grad_norm": 1.117976427078247, "learning_rate": 3.546020906725474e-05, "loss": 2.1662, "mean_token_accuracy": 0.5863359645009041, "num_tokens": 99878588.0, "step": 528 }, { "epoch": 0.7283993115318417, "grad_norm": 1.1656783819198608, "learning_rate": 3.544131881416349e-05, "loss": 1.8647, "mean_token_accuracy": 0.6337170153856277, "num_tokens": 100037842.0, "step": 529 }, { "epoch": 0.729776247848537, "grad_norm": 1.3075273036956787, "learning_rate": 3.542239439490776e-05, "loss": 1.6957, "mean_token_accuracy": 0.6642152145504951, "num_tokens": 100183823.0, "step": 530 }, { "epoch": 0.7311531841652323, "grad_norm": 1.3479173183441162, "learning_rate": 3.540343585136056e-05, "loss": 1.5695, "mean_token_accuracy": 0.6899547204375267, "num_tokens": 100314789.0, "step": 531 }, { "epoch": 0.7325301204819277, "grad_norm": 1.298418402671814, "learning_rate": 3.538444322547043e-05, "loss": 2.1896, "mean_token_accuracy": 0.5893116593360901, "num_tokens": 100612426.0, "step": 532 }, { "epoch": 0.7339070567986231, "grad_norm": 1.0377908945083618, "learning_rate": 3.536541655926133e-05, "loss": 2.3919, "mean_token_accuracy": 0.5514190942049026, "num_tokens": 100837257.0, "step": 533 }, { "epoch": 0.7352839931153184, "grad_norm": 1.0502586364746094, "learning_rate": 3.5346355894832515e-05, "loss": 2.2066, "mean_token_accuracy": 0.5771966874599457, "num_tokens": 101021040.0, "step": 534 }, { "epoch": 0.7366609294320138, "grad_norm": 1.156272292137146, "learning_rate": 3.532726127435849e-05, "loss": 1.9427, "mean_token_accuracy": 0.6236305385828018, "num_tokens": 101183552.0, "step": 535 }, { "epoch": 0.7380378657487091, "grad_norm": 1.1766732931137085, "learning_rate": 3.5308132740088874e-05, "loss": 1.6802, "mean_token_accuracy": 0.6645932123064995, "num_tokens": 101332121.0, "step": 536 }, { "epoch": 0.7394148020654044, "grad_norm": 1.129068374633789, "learning_rate": 3.5288970334348324e-05, "loss": 1.5578, "mean_token_accuracy": 0.6916149780154228, "num_tokens": 101467586.0, "step": 537 }, { "epoch": 0.7407917383820998, "grad_norm": 1.2099443674087524, "learning_rate": 3.5269774099536476e-05, "loss": 2.0742, "mean_token_accuracy": 0.6233073621988297, "num_tokens": 101724403.0, "step": 538 }, { "epoch": 0.7421686746987952, "grad_norm": 1.0454514026641846, "learning_rate": 3.525054407812777e-05, "loss": 2.4266, "mean_token_accuracy": 0.5437144115567207, "num_tokens": 101967076.0, "step": 539 }, { "epoch": 0.7435456110154905, "grad_norm": 1.0569863319396973, "learning_rate": 3.5231280312671426e-05, "loss": 2.2425, "mean_token_accuracy": 0.5728616416454315, "num_tokens": 102158655.0, "step": 540 }, { "epoch": 0.7449225473321859, "grad_norm": 1.1106574535369873, "learning_rate": 3.521198284579134e-05, "loss": 1.9569, "mean_token_accuracy": 0.6198526695370674, "num_tokens": 102326063.0, "step": 541 }, { "epoch": 0.7462994836488812, "grad_norm": 1.0791133642196655, "learning_rate": 3.5192651720185954e-05, "loss": 1.7398, "mean_token_accuracy": 0.6557190716266632, "num_tokens": 102478524.0, "step": 542 }, { "epoch": 0.7476764199655765, "grad_norm": 1.092035174369812, "learning_rate": 3.51732869786282e-05, "loss": 1.5746, "mean_token_accuracy": 0.6842299252748489, "num_tokens": 102618346.0, "step": 543 }, { "epoch": 0.749053356282272, "grad_norm": 1.4144999980926514, "learning_rate": 3.515388866396539e-05, "loss": 1.8621, "mean_token_accuracy": 0.6651639938354492, "num_tokens": 102828408.0, "step": 544 }, { "epoch": 0.7504302925989673, "grad_norm": 1.2091951370239258, "learning_rate": 3.513445681911912e-05, "loss": 2.4078, "mean_token_accuracy": 0.5489278435707092, "num_tokens": 103096839.0, "step": 545 }, { "epoch": 0.7518072289156627, "grad_norm": 1.067907691001892, "learning_rate": 3.511499148708517e-05, "loss": 2.3111, "mean_token_accuracy": 0.5629437193274498, "num_tokens": 103295980.0, "step": 546 }, { "epoch": 0.753184165232358, "grad_norm": 1.0365556478500366, "learning_rate": 3.509549271093341e-05, "loss": 2.0579, "mean_token_accuracy": 0.6039187237620354, "num_tokens": 103468012.0, "step": 547 }, { "epoch": 0.7545611015490533, "grad_norm": 1.0881068706512451, "learning_rate": 3.507596053380773e-05, "loss": 1.7717, "mean_token_accuracy": 0.6543176472187042, "num_tokens": 103623622.0, "step": 548 }, { "epoch": 0.7559380378657488, "grad_norm": 1.2013791799545288, "learning_rate": 3.505639499892591e-05, "loss": 1.6094, "mean_token_accuracy": 0.6782037988305092, "num_tokens": 103766630.0, "step": 549 }, { "epoch": 0.7573149741824441, "grad_norm": 1.4139701128005981, "learning_rate": 3.503679614957955e-05, "loss": 1.5901, "mean_token_accuracy": 0.685788057744503, "num_tokens": 103891420.0, "step": 550 }, { "epoch": 0.7586919104991394, "grad_norm": 1.2434674501419067, "learning_rate": 3.5017164029133944e-05, "loss": 2.2066, "mean_token_accuracy": 0.5683296471834183, "num_tokens": 104215815.0, "step": 551 }, { "epoch": 0.7600688468158348, "grad_norm": 1.061328411102295, "learning_rate": 3.499749868102802e-05, "loss": 2.3101, "mean_token_accuracy": 0.5629050880670547, "num_tokens": 104425222.0, "step": 552 }, { "epoch": 0.7614457831325301, "grad_norm": 1.0423334836959839, "learning_rate": 3.497780014877423e-05, "loss": 2.0948, "mean_token_accuracy": 0.5962710231542587, "num_tokens": 104601670.0, "step": 553 }, { "epoch": 0.7628227194492255, "grad_norm": 1.135775089263916, "learning_rate": 3.4958068475958424e-05, "loss": 1.8305, "mean_token_accuracy": 0.6407052502036095, "num_tokens": 104760192.0, "step": 554 }, { "epoch": 0.7641996557659209, "grad_norm": 1.145871877670288, "learning_rate": 3.4938303706239814e-05, "loss": 1.6579, "mean_token_accuracy": 0.6707384660840034, "num_tokens": 104905952.0, "step": 555 }, { "epoch": 0.7655765920826162, "grad_norm": 1.2127230167388916, "learning_rate": 3.4918505883350815e-05, "loss": 1.5657, "mean_token_accuracy": 0.691277913749218, "num_tokens": 105037158.0, "step": 556 }, { "epoch": 0.7669535283993115, "grad_norm": 1.3029348850250244, "learning_rate": 3.4898675051097015e-05, "loss": 2.1319, "mean_token_accuracy": 0.5915309339761734, "num_tokens": 105329399.0, "step": 557 }, { "epoch": 0.7683304647160069, "grad_norm": 1.0642842054367065, "learning_rate": 3.487881125335699e-05, "loss": 2.3876, "mean_token_accuracy": 0.5508836731314659, "num_tokens": 105553045.0, "step": 558 }, { "epoch": 0.7697074010327022, "grad_norm": 0.999578595161438, "learning_rate": 3.48589145340823e-05, "loss": 2.1331, "mean_token_accuracy": 0.5915313959121704, "num_tokens": 105735740.0, "step": 559 }, { "epoch": 0.7710843373493976, "grad_norm": 1.0432978868484497, "learning_rate": 3.483898493729732e-05, "loss": 1.8732, "mean_token_accuracy": 0.6328941807150841, "num_tokens": 105897732.0, "step": 560 }, { "epoch": 0.772461273666093, "grad_norm": 1.0806154012680054, "learning_rate": 3.4819022507099184e-05, "loss": 1.6602, "mean_token_accuracy": 0.6677856966853142, "num_tokens": 106046400.0, "step": 561 }, { "epoch": 0.7738382099827883, "grad_norm": 1.1432400941848755, "learning_rate": 3.479902728765768e-05, "loss": 1.5653, "mean_token_accuracy": 0.6848981603980064, "num_tokens": 106181938.0, "step": 562 }, { "epoch": 0.7752151462994836, "grad_norm": 1.3740053176879883, "learning_rate": 3.477899932321513e-05, "loss": 2.0412, "mean_token_accuracy": 0.6243645623326302, "num_tokens": 106437915.0, "step": 563 }, { "epoch": 0.776592082616179, "grad_norm": 1.0929107666015625, "learning_rate": 3.475893865808633e-05, "loss": 2.3973, "mean_token_accuracy": 0.5507577806711197, "num_tokens": 106680341.0, "step": 564 }, { "epoch": 0.7779690189328744, "grad_norm": 1.0211658477783203, "learning_rate": 3.4738845336658425e-05, "loss": 2.2138, "mean_token_accuracy": 0.5771443918347359, "num_tokens": 106870866.0, "step": 565 }, { "epoch": 0.7793459552495697, "grad_norm": 1.0813350677490234, "learning_rate": 3.471871940339079e-05, "loss": 1.9544, "mean_token_accuracy": 0.6215356662869453, "num_tokens": 107037776.0, "step": 566 }, { "epoch": 0.7807228915662651, "grad_norm": 1.1389501094818115, "learning_rate": 3.4698560902815e-05, "loss": 1.6657, "mean_token_accuracy": 0.6663038060069084, "num_tokens": 107189988.0, "step": 567 }, { "epoch": 0.7820998278829604, "grad_norm": 1.174644947052002, "learning_rate": 3.4678369879534664e-05, "loss": 1.5791, "mean_token_accuracy": 0.6842780783772469, "num_tokens": 107329567.0, "step": 568 }, { "epoch": 0.7834767641996557, "grad_norm": 1.1922379732131958, "learning_rate": 3.465814637822535e-05, "loss": 1.8465, "mean_token_accuracy": 0.6668727174401283, "num_tokens": 107537370.0, "step": 569 }, { "epoch": 0.7848537005163512, "grad_norm": 0.9684680700302124, "learning_rate": 3.463789044363451e-05, "loss": 2.3962, "mean_token_accuracy": 0.5488917753100395, "num_tokens": 107806542.0, "step": 570 }, { "epoch": 0.7862306368330465, "grad_norm": 1.0134209394454956, "learning_rate": 3.4617602120581345e-05, "loss": 2.2636, "mean_token_accuracy": 0.5689530521631241, "num_tokens": 108007282.0, "step": 571 }, { "epoch": 0.7876075731497418, "grad_norm": 1.05887770652771, "learning_rate": 3.459728145395671e-05, "loss": 2.0195, "mean_token_accuracy": 0.6116527318954468, "num_tokens": 108179868.0, "step": 572 }, { "epoch": 0.7889845094664372, "grad_norm": 1.0835144519805908, "learning_rate": 3.4576928488723056e-05, "loss": 1.7495, "mean_token_accuracy": 0.6506428942084312, "num_tokens": 108335922.0, "step": 573 }, { "epoch": 0.7903614457831325, "grad_norm": 1.054327130317688, "learning_rate": 3.455654326991426e-05, "loss": 1.5737, "mean_token_accuracy": 0.6846219599246979, "num_tokens": 108479143.0, "step": 574 }, { "epoch": 0.7917383820998278, "grad_norm": 1.303267478942871, "learning_rate": 3.4536125842635604e-05, "loss": 1.6364, "mean_token_accuracy": 0.6769674345850945, "num_tokens": 108603924.0, "step": 575 }, { "epoch": 0.7931153184165233, "grad_norm": 1.321031093597412, "learning_rate": 3.4515676252063595e-05, "loss": 2.2049, "mean_token_accuracy": 0.5691119879484177, "num_tokens": 108930048.0, "step": 576 }, { "epoch": 0.7944922547332186, "grad_norm": 1.091975212097168, "learning_rate": 3.449519454344595e-05, "loss": 2.3169, "mean_token_accuracy": 0.562630869448185, "num_tokens": 109140808.0, "step": 577 }, { "epoch": 0.7958691910499139, "grad_norm": 1.0463027954101562, "learning_rate": 3.44746807621014e-05, "loss": 2.0675, "mean_token_accuracy": 0.6001132130622864, "num_tokens": 109317220.0, "step": 578 }, { "epoch": 0.7972461273666093, "grad_norm": 1.065639853477478, "learning_rate": 3.445413495341971e-05, "loss": 1.7878, "mean_token_accuracy": 0.6497830450534821, "num_tokens": 109474903.0, "step": 579 }, { "epoch": 0.7986230636833046, "grad_norm": 1.1145284175872803, "learning_rate": 3.443355716286143e-05, "loss": 1.5962, "mean_token_accuracy": 0.6768307387828827, "num_tokens": 109620062.0, "step": 580 }, { "epoch": 0.8, "grad_norm": 1.2085987329483032, "learning_rate": 3.441294743595794e-05, "loss": 1.5191, "mean_token_accuracy": 0.7027463689446449, "num_tokens": 109750379.0, "step": 581 }, { "epoch": 0.8013769363166954, "grad_norm": 1.3442081212997437, "learning_rate": 3.439230581831126e-05, "loss": 2.1622, "mean_token_accuracy": 0.5897501334547997, "num_tokens": 110050478.0, "step": 582 }, { "epoch": 0.8027538726333907, "grad_norm": 0.9389193058013916, "learning_rate": 3.437163235559396e-05, "loss": 2.3353, "mean_token_accuracy": 0.557612881064415, "num_tokens": 110274636.0, "step": 583 }, { "epoch": 0.804130808950086, "grad_norm": 0.9806140065193176, "learning_rate": 3.43509270935491e-05, "loss": 2.152, "mean_token_accuracy": 0.5873523727059364, "num_tokens": 110457849.0, "step": 584 }, { "epoch": 0.8055077452667814, "grad_norm": 1.0551691055297852, "learning_rate": 3.433019007799007e-05, "loss": 1.8497, "mean_token_accuracy": 0.6364917680621147, "num_tokens": 110620497.0, "step": 585 }, { "epoch": 0.8068846815834768, "grad_norm": 1.0566250085830688, "learning_rate": 3.430942135480053e-05, "loss": 1.6382, "mean_token_accuracy": 0.6755115240812302, "num_tokens": 110769395.0, "step": 586 }, { "epoch": 0.8082616179001721, "grad_norm": 1.1175761222839355, "learning_rate": 3.428862096993433e-05, "loss": 1.5239, "mean_token_accuracy": 0.6964422985911369, "num_tokens": 110904567.0, "step": 587 }, { "epoch": 0.8096385542168675, "grad_norm": 1.2938376665115356, "learning_rate": 3.4267788969415315e-05, "loss": 2.0687, "mean_token_accuracy": 0.6222933456301689, "num_tokens": 111162374.0, "step": 588 }, { "epoch": 0.8110154905335628, "grad_norm": 1.072190761566162, "learning_rate": 3.4246925399337336e-05, "loss": 2.3555, "mean_token_accuracy": 0.5540040656924248, "num_tokens": 111404944.0, "step": 589 }, { "epoch": 0.8123924268502581, "grad_norm": 1.030254602432251, "learning_rate": 3.422603030586407e-05, "loss": 2.2244, "mean_token_accuracy": 0.5754862651228905, "num_tokens": 111596393.0, "step": 590 }, { "epoch": 0.8137693631669535, "grad_norm": 1.0412719249725342, "learning_rate": 3.420510373522896e-05, "loss": 1.9356, "mean_token_accuracy": 0.620713584125042, "num_tokens": 111763665.0, "step": 591 }, { "epoch": 0.8151462994836489, "grad_norm": 1.123029112815857, "learning_rate": 3.418414573373507e-05, "loss": 1.7118, "mean_token_accuracy": 0.663055419921875, "num_tokens": 111915727.0, "step": 592 }, { "epoch": 0.8165232358003442, "grad_norm": 1.173632025718689, "learning_rate": 3.4163156347755046e-05, "loss": 1.5748, "mean_token_accuracy": 0.6850118339061737, "num_tokens": 112055321.0, "step": 593 }, { "epoch": 0.8179001721170396, "grad_norm": 1.2265740633010864, "learning_rate": 3.4142135623730954e-05, "loss": 1.8196, "mean_token_accuracy": 0.6663366332650185, "num_tokens": 112264269.0, "step": 594 }, { "epoch": 0.8192771084337349, "grad_norm": 0.9991793036460876, "learning_rate": 3.41210836081742e-05, "loss": 2.3045, "mean_token_accuracy": 0.5633808821439743, "num_tokens": 112536718.0, "step": 595 }, { "epoch": 0.8206540447504302, "grad_norm": 0.9339414834976196, "learning_rate": 3.410000034766543e-05, "loss": 2.2738, "mean_token_accuracy": 0.5698176547884941, "num_tokens": 112738162.0, "step": 596 }, { "epoch": 0.8220309810671257, "grad_norm": 1.0586694478988647, "learning_rate": 3.4078885888854436e-05, "loss": 2.0137, "mean_token_accuracy": 0.6099574565887451, "num_tokens": 112909949.0, "step": 597 }, { "epoch": 0.823407917383821, "grad_norm": 1.139028787612915, "learning_rate": 3.405774027846002e-05, "loss": 1.7362, "mean_token_accuracy": 0.6547513604164124, "num_tokens": 113065321.0, "step": 598 }, { "epoch": 0.8247848537005164, "grad_norm": 1.1029468774795532, "learning_rate": 3.403656356326993e-05, "loss": 1.6089, "mean_token_accuracy": 0.6767487972974777, "num_tokens": 113208614.0, "step": 599 }, { "epoch": 0.8261617900172117, "grad_norm": 1.2843818664550781, "learning_rate": 3.4015355790140715e-05, "loss": 1.5819, "mean_token_accuracy": 0.6826293542981148, "num_tokens": 113333503.0, "step": 600 }, { "epoch": 0.827538726333907, "grad_norm": 1.2275323867797852, "learning_rate": 3.399411700599768e-05, "loss": 2.1509, "mean_token_accuracy": 0.5758810266852379, "num_tokens": 113664120.0, "step": 601 }, { "epoch": 0.8289156626506025, "grad_norm": 1.0087378025054932, "learning_rate": 3.39728472578347e-05, "loss": 2.2844, "mean_token_accuracy": 0.5680061355233192, "num_tokens": 113875181.0, "step": 602 }, { "epoch": 0.8302925989672978, "grad_norm": 1.0714365243911743, "learning_rate": 3.39515465927142e-05, "loss": 2.0702, "mean_token_accuracy": 0.599168986082077, "num_tokens": 114052507.0, "step": 603 }, { "epoch": 0.8316695352839931, "grad_norm": 1.106357455253601, "learning_rate": 3.393021505776701e-05, "loss": 1.7899, "mean_token_accuracy": 0.6437432989478111, "num_tokens": 114211171.0, "step": 604 }, { "epoch": 0.8330464716006885, "grad_norm": 1.1002683639526367, "learning_rate": 3.3908852700192236e-05, "loss": 1.5792, "mean_token_accuracy": 0.6850417852401733, "num_tokens": 114356640.0, "step": 605 }, { "epoch": 0.8344234079173838, "grad_norm": 1.235366940498352, "learning_rate": 3.388745956725722e-05, "loss": 1.4944, "mean_token_accuracy": 0.7040051892399788, "num_tokens": 114487630.0, "step": 606 }, { "epoch": 0.8358003442340792, "grad_norm": 1.3149973154067993, "learning_rate": 3.3866035706297366e-05, "loss": 2.0883, "mean_token_accuracy": 0.5997959151864052, "num_tokens": 114782746.0, "step": 607 }, { "epoch": 0.8371772805507746, "grad_norm": 1.0312632322311401, "learning_rate": 3.384458116471609e-05, "loss": 2.3082, "mean_token_accuracy": 0.5614949762821198, "num_tokens": 115007911.0, "step": 608 }, { "epoch": 0.8385542168674699, "grad_norm": 1.0186859369277954, "learning_rate": 3.3823095989984697e-05, "loss": 2.0768, "mean_token_accuracy": 0.6000976040959358, "num_tokens": 115192121.0, "step": 609 }, { "epoch": 0.8399311531841652, "grad_norm": 1.1025596857070923, "learning_rate": 3.3801580229642243e-05, "loss": 1.8424, "mean_token_accuracy": 0.638316310942173, "num_tokens": 115354749.0, "step": 610 }, { "epoch": 0.8413080895008606, "grad_norm": 1.1186890602111816, "learning_rate": 3.3780033931295496e-05, "loss": 1.6002, "mean_token_accuracy": 0.6785764172673225, "num_tokens": 115503261.0, "step": 611 }, { "epoch": 0.8426850258175559, "grad_norm": 1.159922480583191, "learning_rate": 3.3758457142618754e-05, "loss": 1.5167, "mean_token_accuracy": 0.6940157264471054, "num_tokens": 115638759.0, "step": 612 }, { "epoch": 0.8440619621342513, "grad_norm": 1.261696219444275, "learning_rate": 3.373684991135382e-05, "loss": 1.9873, "mean_token_accuracy": 0.6375042274594307, "num_tokens": 115895168.0, "step": 613 }, { "epoch": 0.8454388984509467, "grad_norm": 1.0701900720596313, "learning_rate": 3.371521228530984e-05, "loss": 2.3724, "mean_token_accuracy": 0.5537968054413795, "num_tokens": 116137013.0, "step": 614 }, { "epoch": 0.846815834767642, "grad_norm": 0.9965835809707642, "learning_rate": 3.369354431236319e-05, "loss": 2.1618, "mean_token_accuracy": 0.5835718959569931, "num_tokens": 116327788.0, "step": 615 }, { "epoch": 0.8481927710843373, "grad_norm": 1.1192504167556763, "learning_rate": 3.367184604045743e-05, "loss": 1.91, "mean_token_accuracy": 0.6239570379257202, "num_tokens": 116494766.0, "step": 616 }, { "epoch": 0.8495697074010327, "grad_norm": 1.124379277229309, "learning_rate": 3.3650117517603136e-05, "loss": 1.6198, "mean_token_accuracy": 0.674982562661171, "num_tokens": 116647007.0, "step": 617 }, { "epoch": 0.8509466437177281, "grad_norm": 1.1092290878295898, "learning_rate": 3.362835879187783e-05, "loss": 1.5, "mean_token_accuracy": 0.6952915340662003, "num_tokens": 116786468.0, "step": 618 }, { "epoch": 0.8523235800344234, "grad_norm": 1.4181177616119385, "learning_rate": 3.360656991142585e-05, "loss": 1.8082, "mean_token_accuracy": 0.6721536070108414, "num_tokens": 116996031.0, "step": 619 }, { "epoch": 0.8537005163511188, "grad_norm": 1.0641065835952759, "learning_rate": 3.3584750924458264e-05, "loss": 2.25, "mean_token_accuracy": 0.569654144346714, "num_tokens": 117266472.0, "step": 620 }, { "epoch": 0.8550774526678141, "grad_norm": 1.0215167999267578, "learning_rate": 3.356290187925278e-05, "loss": 2.2122, "mean_token_accuracy": 0.5773422047495842, "num_tokens": 117467184.0, "step": 621 }, { "epoch": 0.8564543889845094, "grad_norm": 1.0803672075271606, "learning_rate": 3.354102282415356e-05, "loss": 1.995, "mean_token_accuracy": 0.6133275032043457, "num_tokens": 117639751.0, "step": 622 }, { "epoch": 0.8578313253012049, "grad_norm": 1.1073962450027466, "learning_rate": 3.3519113807571205e-05, "loss": 1.7021, "mean_token_accuracy": 0.6613180115818977, "num_tokens": 117795129.0, "step": 623 }, { "epoch": 0.8592082616179002, "grad_norm": 1.1406376361846924, "learning_rate": 3.349717487798261e-05, "loss": 1.5552, "mean_token_accuracy": 0.6842564195394516, "num_tokens": 117937798.0, "step": 624 }, { "epoch": 0.8605851979345955, "grad_norm": 1.2936712503433228, "learning_rate": 3.347520608393084e-05, "loss": 1.5438, "mean_token_accuracy": 0.6883570179343224, "num_tokens": 118062455.0, "step": 625 }, { "epoch": 0.8619621342512909, "grad_norm": 1.3202587366104126, "learning_rate": 3.3453207474025054e-05, "loss": 2.13, "mean_token_accuracy": 0.580175556242466, "num_tokens": 118391465.0, "step": 626 }, { "epoch": 0.8633390705679862, "grad_norm": 1.079347014427185, "learning_rate": 3.3431179096940375e-05, "loss": 2.2654, "mean_token_accuracy": 0.5705502480268478, "num_tokens": 118602802.0, "step": 627 }, { "epoch": 0.8647160068846815, "grad_norm": 1.0786317586898804, "learning_rate": 3.340912100141778e-05, "loss": 2.0462, "mean_token_accuracy": 0.6021969392895699, "num_tokens": 118780518.0, "step": 628 }, { "epoch": 0.866092943201377, "grad_norm": 1.137144923210144, "learning_rate": 3.338703323626404e-05, "loss": 1.7631, "mean_token_accuracy": 0.6516233757138252, "num_tokens": 118939602.0, "step": 629 }, { "epoch": 0.8674698795180723, "grad_norm": 1.1716362237930298, "learning_rate": 3.3364915850351525e-05, "loss": 1.5714, "mean_token_accuracy": 0.6844504475593567, "num_tokens": 119085947.0, "step": 630 }, { "epoch": 0.8688468158347676, "grad_norm": 1.2002677917480469, "learning_rate": 3.334276889261819e-05, "loss": 1.4624, "mean_token_accuracy": 0.7057783156633377, "num_tokens": 119217758.0, "step": 631 }, { "epoch": 0.870223752151463, "grad_norm": 1.1610311269760132, "learning_rate": 3.3320592412067386e-05, "loss": 2.1094, "mean_token_accuracy": 0.597074843943119, "num_tokens": 119511903.0, "step": 632 }, { "epoch": 0.8716006884681583, "grad_norm": 1.020621418952942, "learning_rate": 3.3298386457767804e-05, "loss": 2.2978, "mean_token_accuracy": 0.5634893625974655, "num_tokens": 119736016.0, "step": 633 }, { "epoch": 0.8729776247848537, "grad_norm": 1.0546320676803589, "learning_rate": 3.327615107885335e-05, "loss": 2.106, "mean_token_accuracy": 0.5960047543048859, "num_tokens": 119920278.0, "step": 634 }, { "epoch": 0.8743545611015491, "grad_norm": 1.0415631532669067, "learning_rate": 3.325388632452304e-05, "loss": 1.8284, "mean_token_accuracy": 0.6394687369465828, "num_tokens": 120082980.0, "step": 635 }, { "epoch": 0.8757314974182444, "grad_norm": 1.0907504558563232, "learning_rate": 3.3231592244040885e-05, "loss": 1.6174, "mean_token_accuracy": 0.6738846153020859, "num_tokens": 120232119.0, "step": 636 }, { "epoch": 0.8771084337349397, "grad_norm": 1.159412145614624, "learning_rate": 3.3209268886735767e-05, "loss": 1.5195, "mean_token_accuracy": 0.6927722990512848, "num_tokens": 120367805.0, "step": 637 }, { "epoch": 0.8784853700516351, "grad_norm": 1.170999526977539, "learning_rate": 3.318691630200138e-05, "loss": 1.9347, "mean_token_accuracy": 0.6497038900852203, "num_tokens": 120629048.0, "step": 638 }, { "epoch": 0.8798623063683305, "grad_norm": 0.9520416855812073, "learning_rate": 3.3164534539296056e-05, "loss": 2.3035, "mean_token_accuracy": 0.5630774274468422, "num_tokens": 120871314.0, "step": 639 }, { "epoch": 0.8812392426850258, "grad_norm": 0.9776389598846436, "learning_rate": 3.314212364814271e-05, "loss": 2.1219, "mean_token_accuracy": 0.590424582362175, "num_tokens": 121062562.0, "step": 640 }, { "epoch": 0.8826161790017212, "grad_norm": 1.046494722366333, "learning_rate": 3.311968367812869e-05, "loss": 1.9018, "mean_token_accuracy": 0.6294309049844742, "num_tokens": 121229861.0, "step": 641 }, { "epoch": 0.8839931153184165, "grad_norm": 1.1139674186706543, "learning_rate": 3.309721467890571e-05, "loss": 1.6534, "mean_token_accuracy": 0.6708964183926582, "num_tokens": 121382325.0, "step": 642 }, { "epoch": 0.8853700516351118, "grad_norm": 1.1200459003448486, "learning_rate": 3.3074716700189676e-05, "loss": 1.5178, "mean_token_accuracy": 0.6928727477788925, "num_tokens": 121521905.0, "step": 643 }, { "epoch": 0.8867469879518072, "grad_norm": 1.1589993238449097, "learning_rate": 3.3052189791760664e-05, "loss": 1.7771, "mean_token_accuracy": 0.6749222204089165, "num_tokens": 121731828.0, "step": 644 }, { "epoch": 0.8881239242685026, "grad_norm": 0.9583616852760315, "learning_rate": 3.302963400346272e-05, "loss": 2.2869, "mean_token_accuracy": 0.5648324713110924, "num_tokens": 121998661.0, "step": 645 }, { "epoch": 0.889500860585198, "grad_norm": 0.9762979745864868, "learning_rate": 3.3007049385203816e-05, "loss": 2.2218, "mean_token_accuracy": 0.5778237879276276, "num_tokens": 122197490.0, "step": 646 }, { "epoch": 0.8908777969018933, "grad_norm": 1.0428951978683472, "learning_rate": 3.29844359869557e-05, "loss": 1.944, "mean_token_accuracy": 0.6226832494139671, "num_tokens": 122368564.0, "step": 647 }, { "epoch": 0.8922547332185886, "grad_norm": 1.1008118391036987, "learning_rate": 3.296179385875381e-05, "loss": 1.6937, "mean_token_accuracy": 0.662883460521698, "num_tokens": 122523541.0, "step": 648 }, { "epoch": 0.893631669535284, "grad_norm": 1.1035977602005005, "learning_rate": 3.293912305069715e-05, "loss": 1.5179, "mean_token_accuracy": 0.6899018362164497, "num_tokens": 122666008.0, "step": 649 }, { "epoch": 0.8950086058519794, "grad_norm": 1.3296838998794556, "learning_rate": 3.291642361294818e-05, "loss": 1.5692, "mean_token_accuracy": 0.6906843781471252, "num_tokens": 122790655.0, "step": 650 }, { "epoch": 0.8963855421686747, "grad_norm": 1.3735569715499878, "learning_rate": 3.2893695595732705e-05, "loss": 2.1272, "mean_token_accuracy": 0.5775994136929512, "num_tokens": 123118620.0, "step": 651 }, { "epoch": 0.89776247848537, "grad_norm": 1.0511846542358398, "learning_rate": 3.287093904933977e-05, "loss": 2.2639, "mean_token_accuracy": 0.5708245933055878, "num_tokens": 123329578.0, "step": 652 }, { "epoch": 0.8991394148020654, "grad_norm": 1.0521135330200195, "learning_rate": 3.284815402412154e-05, "loss": 1.9742, "mean_token_accuracy": 0.6159580796957016, "num_tokens": 123506722.0, "step": 653 }, { "epoch": 0.9005163511187607, "grad_norm": 1.097086787223816, "learning_rate": 3.282534057049322e-05, "loss": 1.7341, "mean_token_accuracy": 0.656589575111866, "num_tokens": 123665214.0, "step": 654 }, { "epoch": 0.9018932874354562, "grad_norm": 1.1034553050994873, "learning_rate": 3.280249873893288e-05, "loss": 1.5589, "mean_token_accuracy": 0.6865019872784615, "num_tokens": 123810913.0, "step": 655 }, { "epoch": 0.9032702237521515, "grad_norm": 1.1988232135772705, "learning_rate": 3.277962857998139e-05, "loss": 1.4711, "mean_token_accuracy": 0.7048565149307251, "num_tokens": 123942019.0, "step": 656 }, { "epoch": 0.9046471600688468, "grad_norm": 1.1118179559707642, "learning_rate": 3.275673014424231e-05, "loss": 2.0644, "mean_token_accuracy": 0.6025776565074921, "num_tokens": 124237737.0, "step": 657 }, { "epoch": 0.9060240963855422, "grad_norm": 1.0285691022872925, "learning_rate": 3.273380348238177e-05, "loss": 2.288, "mean_token_accuracy": 0.5647922828793526, "num_tokens": 124463102.0, "step": 658 }, { "epoch": 0.9074010327022375, "grad_norm": 0.987539529800415, "learning_rate": 3.271084864512833e-05, "loss": 2.0693, "mean_token_accuracy": 0.6018091514706612, "num_tokens": 124647371.0, "step": 659 }, { "epoch": 0.9087779690189329, "grad_norm": 1.0449810028076172, "learning_rate": 3.268786568327291e-05, "loss": 1.8031, "mean_token_accuracy": 0.6435295715928078, "num_tokens": 124810141.0, "step": 660 }, { "epoch": 0.9101549053356283, "grad_norm": 1.0954370498657227, "learning_rate": 3.2664854647668666e-05, "loss": 1.5771, "mean_token_accuracy": 0.6802947446703911, "num_tokens": 124958857.0, "step": 661 }, { "epoch": 0.9115318416523236, "grad_norm": 1.1525367498397827, "learning_rate": 3.264181558923086e-05, "loss": 1.4684, "mean_token_accuracy": 0.7033644914627075, "num_tokens": 125094296.0, "step": 662 }, { "epoch": 0.9129087779690189, "grad_norm": 1.369832992553711, "learning_rate": 3.261874855893675e-05, "loss": 1.9505, "mean_token_accuracy": 0.6436514407396317, "num_tokens": 125348636.0, "step": 663 }, { "epoch": 0.9142857142857143, "grad_norm": 1.0099798440933228, "learning_rate": 3.259565360782551e-05, "loss": 2.2893, "mean_token_accuracy": 0.5668921992182732, "num_tokens": 125591985.0, "step": 664 }, { "epoch": 0.9156626506024096, "grad_norm": 1.0739353895187378, "learning_rate": 3.257253078699807e-05, "loss": 2.1276, "mean_token_accuracy": 0.5916158780455589, "num_tokens": 125783753.0, "step": 665 }, { "epoch": 0.917039586919105, "grad_norm": 1.0985307693481445, "learning_rate": 3.254938014761704e-05, "loss": 1.842, "mean_token_accuracy": 0.637856163084507, "num_tokens": 125950878.0, "step": 666 }, { "epoch": 0.9184165232358004, "grad_norm": 1.1707426309585571, "learning_rate": 3.252620174090658e-05, "loss": 1.6453, "mean_token_accuracy": 0.6718071028590202, "num_tokens": 126103262.0, "step": 667 }, { "epoch": 0.9197934595524957, "grad_norm": 1.1297056674957275, "learning_rate": 3.250299561815228e-05, "loss": 1.5031, "mean_token_accuracy": 0.6935581564903259, "num_tokens": 126242591.0, "step": 668 }, { "epoch": 0.921170395869191, "grad_norm": 1.1309452056884766, "learning_rate": 3.2479761830701075e-05, "loss": 1.735, "mean_token_accuracy": 0.6862308159470558, "num_tokens": 126452748.0, "step": 669 }, { "epoch": 0.9225473321858864, "grad_norm": 1.0285677909851074, "learning_rate": 3.245650042996108e-05, "loss": 2.2618, "mean_token_accuracy": 0.5678475350141525, "num_tokens": 126720324.0, "step": 670 }, { "epoch": 0.9239242685025818, "grad_norm": 1.0162473917007446, "learning_rate": 3.243321146740155e-05, "loss": 2.1726, "mean_token_accuracy": 0.582960918545723, "num_tokens": 126920739.0, "step": 671 }, { "epoch": 0.9253012048192771, "grad_norm": 1.0290892124176025, "learning_rate": 3.240989499455269e-05, "loss": 1.9577, "mean_token_accuracy": 0.6180352866649628, "num_tokens": 127093097.0, "step": 672 }, { "epoch": 0.9266781411359725, "grad_norm": 1.0667035579681396, "learning_rate": 3.2386551063005596e-05, "loss": 1.6857, "mean_token_accuracy": 0.6646326705813408, "num_tokens": 127248748.0, "step": 673 }, { "epoch": 0.9280550774526678, "grad_norm": 1.1049374341964722, "learning_rate": 3.2363179724412105e-05, "loss": 1.5101, "mean_token_accuracy": 0.6960373744368553, "num_tokens": 127391899.0, "step": 674 }, { "epoch": 0.9294320137693631, "grad_norm": 1.3691071271896362, "learning_rate": 3.2339781030484715e-05, "loss": 1.5586, "mean_token_accuracy": 0.6865684390068054, "num_tokens": 127516590.0, "step": 675 }, { "epoch": 0.9308089500860586, "grad_norm": 1.4408310651779175, "learning_rate": 3.231635503299643e-05, "loss": 2.0979, "mean_token_accuracy": 0.586410865187645, "num_tokens": 127845379.0, "step": 676 }, { "epoch": 0.9321858864027539, "grad_norm": 1.1112864017486572, "learning_rate": 3.229290178378068e-05, "loss": 2.2548, "mean_token_accuracy": 0.5710437297821045, "num_tokens": 128055505.0, "step": 677 }, { "epoch": 0.9335628227194492, "grad_norm": 1.0591106414794922, "learning_rate": 3.2269421334731196e-05, "loss": 1.9656, "mean_token_accuracy": 0.6157826632261276, "num_tokens": 128232685.0, "step": 678 }, { "epoch": 0.9349397590361446, "grad_norm": 1.1297646760940552, "learning_rate": 3.224591373780188e-05, "loss": 1.7476, "mean_token_accuracy": 0.6573057919740677, "num_tokens": 128391250.0, "step": 679 }, { "epoch": 0.9363166953528399, "grad_norm": 1.143611192703247, "learning_rate": 3.222237904500672e-05, "loss": 1.5651, "mean_token_accuracy": 0.6833081617951393, "num_tokens": 128537053.0, "step": 680 }, { "epoch": 0.9376936316695352, "grad_norm": 1.180500864982605, "learning_rate": 3.219881730841964e-05, "loss": 1.4539, "mean_token_accuracy": 0.710208348929882, "num_tokens": 128668083.0, "step": 681 }, { "epoch": 0.9390705679862307, "grad_norm": 1.0734176635742188, "learning_rate": 3.217522858017442e-05, "loss": 2.0749, "mean_token_accuracy": 0.6015241518616676, "num_tokens": 128961229.0, "step": 682 }, { "epoch": 0.940447504302926, "grad_norm": 1.0053563117980957, "learning_rate": 3.215161291246454e-05, "loss": 2.2744, "mean_token_accuracy": 0.5670230314135551, "num_tokens": 129185778.0, "step": 683 }, { "epoch": 0.9418244406196213, "grad_norm": 0.9922622442245483, "learning_rate": 3.212797035754311e-05, "loss": 2.0691, "mean_token_accuracy": 0.6004709377884865, "num_tokens": 129370430.0, "step": 684 }, { "epoch": 0.9432013769363167, "grad_norm": 1.0449470281600952, "learning_rate": 3.210430096772272e-05, "loss": 1.7932, "mean_token_accuracy": 0.6461532339453697, "num_tokens": 129533830.0, "step": 685 }, { "epoch": 0.944578313253012, "grad_norm": 1.0867208242416382, "learning_rate": 3.208060479537533e-05, "loss": 1.579, "mean_token_accuracy": 0.683335468173027, "num_tokens": 129683348.0, "step": 686 }, { "epoch": 0.9459552495697074, "grad_norm": 1.1272823810577393, "learning_rate": 3.205688189293219e-05, "loss": 1.4804, "mean_token_accuracy": 0.698005273938179, "num_tokens": 129819267.0, "step": 687 }, { "epoch": 0.9473321858864028, "grad_norm": 1.4839318990707397, "learning_rate": 3.2033132312883655e-05, "loss": 1.9833, "mean_token_accuracy": 0.6416849717497826, "num_tokens": 130074738.0, "step": 688 }, { "epoch": 0.9487091222030981, "grad_norm": 1.1169933080673218, "learning_rate": 3.2009356107779134e-05, "loss": 2.2476, "mean_token_accuracy": 0.5735103040933609, "num_tokens": 130315439.0, "step": 689 }, { "epoch": 0.9500860585197934, "grad_norm": 0.9956347942352295, "learning_rate": 3.198555333022694e-05, "loss": 2.0952, "mean_token_accuracy": 0.5967729762196541, "num_tokens": 130506385.0, "step": 690 }, { "epoch": 0.9514629948364888, "grad_norm": 1.0846455097198486, "learning_rate": 3.196172403289418e-05, "loss": 1.8891, "mean_token_accuracy": 0.628291554749012, "num_tokens": 130673449.0, "step": 691 }, { "epoch": 0.9528399311531842, "grad_norm": 1.140215277671814, "learning_rate": 3.193786826850664e-05, "loss": 1.6156, "mean_token_accuracy": 0.6777254566550255, "num_tokens": 130825332.0, "step": 692 }, { "epoch": 0.9542168674698795, "grad_norm": 1.1480087041854858, "learning_rate": 3.191398608984867e-05, "loss": 1.4657, "mean_token_accuracy": 0.7011442258954048, "num_tokens": 130964791.0, "step": 693 }, { "epoch": 0.9555938037865749, "grad_norm": 1.144534707069397, "learning_rate": 3.189007754976307e-05, "loss": 1.7093, "mean_token_accuracy": 0.6890531480312347, "num_tokens": 131171672.0, "step": 694 }, { "epoch": 0.9569707401032702, "grad_norm": 0.9313427209854126, "learning_rate": 3.186614270115095e-05, "loss": 2.2304, "mean_token_accuracy": 0.5703701004385948, "num_tokens": 131443725.0, "step": 695 }, { "epoch": 0.9583476764199655, "grad_norm": 0.9735227823257446, "learning_rate": 3.184218159697166e-05, "loss": 2.2024, "mean_token_accuracy": 0.5768650621175766, "num_tokens": 131645615.0, "step": 696 }, { "epoch": 0.9597246127366609, "grad_norm": 1.0554592609405518, "learning_rate": 3.1818194290242626e-05, "loss": 1.953, "mean_token_accuracy": 0.6241937801241875, "num_tokens": 131818596.0, "step": 697 }, { "epoch": 0.9611015490533563, "grad_norm": 1.0843249559402466, "learning_rate": 3.1794180834039245e-05, "loss": 1.6896, "mean_token_accuracy": 0.6650799214839935, "num_tokens": 131974459.0, "step": 698 }, { "epoch": 0.9624784853700517, "grad_norm": 1.0485241413116455, "learning_rate": 3.177014128149479e-05, "loss": 1.4927, "mean_token_accuracy": 0.700206384062767, "num_tokens": 132117689.0, "step": 699 }, { "epoch": 0.963855421686747, "grad_norm": 1.2976409196853638, "learning_rate": 3.1746075685800276e-05, "loss": 1.5447, "mean_token_accuracy": 0.6968283355236053, "num_tokens": 132242477.0, "step": 700 }, { "epoch": 0.9652323580034423, "grad_norm": 1.3425211906433105, "learning_rate": 3.172198410020432e-05, "loss": 2.0947, "mean_token_accuracy": 0.5916655585169792, "num_tokens": 132568685.0, "step": 701 }, { "epoch": 0.9666092943201376, "grad_norm": 1.0463674068450928, "learning_rate": 3.169786657801306e-05, "loss": 2.2582, "mean_token_accuracy": 0.5740041211247444, "num_tokens": 132780543.0, "step": 702 }, { "epoch": 0.9679862306368331, "grad_norm": 1.0053788423538208, "learning_rate": 3.167372317259004e-05, "loss": 2.0033, "mean_token_accuracy": 0.6103670001029968, "num_tokens": 132958144.0, "step": 703 }, { "epoch": 0.9693631669535284, "grad_norm": 1.0473383665084839, "learning_rate": 3.164955393735605e-05, "loss": 1.6831, "mean_token_accuracy": 0.6643523722887039, "num_tokens": 133117013.0, "step": 704 }, { "epoch": 0.9707401032702238, "grad_norm": 1.0759003162384033, "learning_rate": 3.162535892578903e-05, "loss": 1.5213, "mean_token_accuracy": 0.6896376088261604, "num_tokens": 133262548.0, "step": 705 }, { "epoch": 0.9721170395869191, "grad_norm": 1.1752833127975464, "learning_rate": 3.1601138191423966e-05, "loss": 1.4007, "mean_token_accuracy": 0.7208004593849182, "num_tokens": 133393145.0, "step": 706 }, { "epoch": 0.9734939759036144, "grad_norm": 1.1671373844146729, "learning_rate": 3.157689178785276e-05, "loss": 2.0361, "mean_token_accuracy": 0.6089128032326698, "num_tokens": 133690475.0, "step": 707 }, { "epoch": 0.9748709122203099, "grad_norm": 0.9790600538253784, "learning_rate": 3.155261976872412e-05, "loss": 2.2855, "mean_token_accuracy": 0.5652761831879616, "num_tokens": 133915234.0, "step": 708 }, { "epoch": 0.9762478485370052, "grad_norm": 0.9710173010826111, "learning_rate": 3.15283221877434e-05, "loss": 2.0118, "mean_token_accuracy": 0.6117326393723488, "num_tokens": 134099506.0, "step": 709 }, { "epoch": 0.9776247848537005, "grad_norm": 1.0744870901107788, "learning_rate": 3.150399909867254e-05, "loss": 1.7701, "mean_token_accuracy": 0.6513009145855904, "num_tokens": 134262500.0, "step": 710 }, { "epoch": 0.9790017211703959, "grad_norm": 1.0981500148773193, "learning_rate": 3.147965055532991e-05, "loss": 1.5294, "mean_token_accuracy": 0.6915587186813354, "num_tokens": 134411575.0, "step": 711 }, { "epoch": 0.9803786574870912, "grad_norm": 1.1362576484680176, "learning_rate": 3.1455276611590224e-05, "loss": 1.4631, "mean_token_accuracy": 0.7067149803042412, "num_tokens": 134547445.0, "step": 712 }, { "epoch": 0.9817555938037866, "grad_norm": 0.99553382396698, "learning_rate": 3.143087732138435e-05, "loss": 1.8773, "mean_token_accuracy": 0.6479083150625229, "num_tokens": 134807175.0, "step": 713 }, { "epoch": 0.983132530120482, "grad_norm": 0.8891575932502747, "learning_rate": 3.1406452738699284e-05, "loss": 2.2457, "mean_token_accuracy": 0.5742739886045456, "num_tokens": 135048544.0, "step": 714 }, { "epoch": 0.9845094664371773, "grad_norm": 0.9619469046592712, "learning_rate": 3.138200291757797e-05, "loss": 2.088, "mean_token_accuracy": 0.5981254503130913, "num_tokens": 135238364.0, "step": 715 }, { "epoch": 0.9858864027538726, "grad_norm": 1.089682698249817, "learning_rate": 3.135752791211918e-05, "loss": 1.8142, "mean_token_accuracy": 0.6428392976522446, "num_tokens": 135404706.0, "step": 716 }, { "epoch": 0.987263339070568, "grad_norm": 1.1109813451766968, "learning_rate": 3.1333027776477454e-05, "loss": 1.581, "mean_token_accuracy": 0.6791630834341049, "num_tokens": 135556355.0, "step": 717 }, { "epoch": 0.9886402753872633, "grad_norm": 1.0903289318084717, "learning_rate": 3.130850256486287e-05, "loss": 1.463, "mean_token_accuracy": 0.7010476663708687, "num_tokens": 135695464.0, "step": 718 }, { "epoch": 0.9900172117039587, "grad_norm": 1.1477959156036377, "learning_rate": 3.128395233154105e-05, "loss": 1.6774, "mean_token_accuracy": 0.6846660301089287, "num_tokens": 135900731.0, "step": 719 }, { "epoch": 0.9913941480206541, "grad_norm": 0.9270409941673279, "learning_rate": 3.125937713083296e-05, "loss": 2.2522, "mean_token_accuracy": 0.5687893331050873, "num_tokens": 136166698.0, "step": 720 }, { "epoch": 0.9927710843373494, "grad_norm": 0.9983441829681396, "learning_rate": 3.123477701711479e-05, "loss": 2.1325, "mean_token_accuracy": 0.5904159918427467, "num_tokens": 136366387.0, "step": 721 }, { "epoch": 0.9941480206540447, "grad_norm": 1.121090292930603, "learning_rate": 3.121015204481788e-05, "loss": 1.9258, "mean_token_accuracy": 0.6243902668356895, "num_tokens": 136538684.0, "step": 722 }, { "epoch": 0.9955249569707401, "grad_norm": 1.0914192199707031, "learning_rate": 3.118550226842857e-05, "loss": 1.6346, "mean_token_accuracy": 0.6718620806932449, "num_tokens": 136694466.0, "step": 723 }, { "epoch": 0.9969018932874355, "grad_norm": 1.0405100584030151, "learning_rate": 3.116082774248807e-05, "loss": 1.4748, "mean_token_accuracy": 0.6990596503019333, "num_tokens": 136837742.0, "step": 724 }, { "epoch": 0.9982788296041308, "grad_norm": 1.296562671661377, "learning_rate": 3.113612852159235e-05, "loss": 1.5234, "mean_token_accuracy": 0.6917864084243774, "num_tokens": 136962614.0, "step": 725 }, { "epoch": 0.9996557659208262, "grad_norm": 1.1923784017562866, "learning_rate": 3.111140466039205e-05, "loss": 2.0235, "mean_token_accuracy": 0.6320527717471123, "num_tokens": 137163524.0, "step": 726 }, { "epoch": 1.0, "grad_norm": 1.1923784017562866, "learning_rate": 3.111140466039205e-05, "loss": 1.6217, "mean_token_accuracy": 0.6893291473388672, "num_tokens": 137235049.0, "step": 727 }, { "epoch": 1.0013769363166953, "grad_norm": 1.0029478073120117, "learning_rate": 3.1086656213592295e-05, "loss": 1.9726, "mean_token_accuracy": 0.6016779094934464, "num_tokens": 137562156.0, "step": 728 }, { "epoch": 1.0027538726333907, "grad_norm": 0.9400465488433838, "learning_rate": 3.106188323595262e-05, "loss": 2.0572, "mean_token_accuracy": 0.6042135208845139, "num_tokens": 137771020.0, "step": 729 }, { "epoch": 1.004130808950086, "grad_norm": 0.9249603152275085, "learning_rate": 3.103708578228686e-05, "loss": 1.7822, "mean_token_accuracy": 0.6485705301165581, "num_tokens": 137947518.0, "step": 730 }, { "epoch": 1.0055077452667813, "grad_norm": 0.9482125043869019, "learning_rate": 3.101226390746298e-05, "loss": 1.5426, "mean_token_accuracy": 0.6902381777763367, "num_tokens": 138106061.0, "step": 731 }, { "epoch": 1.0068846815834767, "grad_norm": 0.9615915417671204, "learning_rate": 3.098741766640299e-05, "loss": 1.3314, "mean_token_accuracy": 0.726908378303051, "num_tokens": 138251774.0, "step": 732 }, { "epoch": 1.0082616179001722, "grad_norm": 1.148407220840454, "learning_rate": 3.0962547114082804e-05, "loss": 1.2424, "mean_token_accuracy": 0.7494438886642456, "num_tokens": 138382807.0, "step": 733 }, { "epoch": 1.0096385542168675, "grad_norm": 1.3328787088394165, "learning_rate": 3.093765230553215e-05, "loss": 1.9235, "mean_token_accuracy": 0.6315062493085861, "num_tokens": 138681884.0, "step": 734 }, { "epoch": 1.0110154905335629, "grad_norm": 1.0105888843536377, "learning_rate": 3.091273329583441e-05, "loss": 2.1232, "mean_token_accuracy": 0.5918585136532784, "num_tokens": 138905523.0, "step": 735 }, { "epoch": 1.0123924268502582, "grad_norm": 0.8836407661437988, "learning_rate": 3.088779014012652e-05, "loss": 1.848, "mean_token_accuracy": 0.6385939419269562, "num_tokens": 139089290.0, "step": 736 }, { "epoch": 1.0137693631669535, "grad_norm": 0.9119402766227722, "learning_rate": 3.086282289359884e-05, "loss": 1.5871, "mean_token_accuracy": 0.6821451634168625, "num_tokens": 139252084.0, "step": 737 }, { "epoch": 1.0151462994836489, "grad_norm": 0.932428240776062, "learning_rate": 3.0837831611495036e-05, "loss": 1.3835, "mean_token_accuracy": 0.7179422751069069, "num_tokens": 139401174.0, "step": 738 }, { "epoch": 1.0165232358003442, "grad_norm": 0.9737330079078674, "learning_rate": 3.0812816349111956e-05, "loss": 1.252, "mean_token_accuracy": 0.7432859092950821, "num_tokens": 139536728.0, "step": 739 }, { "epoch": 1.0179001721170395, "grad_norm": 0.838245689868927, "learning_rate": 3.07877771617995e-05, "loss": 1.8125, "mean_token_accuracy": 0.6718945130705833, "num_tokens": 139790699.0, "step": 740 }, { "epoch": 1.0192771084337349, "grad_norm": 0.8993909358978271, "learning_rate": 3.076271410496052e-05, "loss": 2.1374, "mean_token_accuracy": 0.5915275812149048, "num_tokens": 140033164.0, "step": 741 }, { "epoch": 1.0206540447504302, "grad_norm": 0.8663306832313538, "learning_rate": 3.073762723405069e-05, "loss": 1.94, "mean_token_accuracy": 0.6232911869883537, "num_tokens": 140224877.0, "step": 742 }, { "epoch": 1.0220309810671258, "grad_norm": 0.8364236354827881, "learning_rate": 3.071251660457833e-05, "loss": 1.6436, "mean_token_accuracy": 0.6715668812394142, "num_tokens": 140392542.0, "step": 743 }, { "epoch": 1.023407917383821, "grad_norm": 0.8664137125015259, "learning_rate": 3.0687382272104385e-05, "loss": 1.38, "mean_token_accuracy": 0.7202821224927902, "num_tokens": 140544751.0, "step": 744 }, { "epoch": 1.0247848537005164, "grad_norm": 0.9057113528251648, "learning_rate": 3.066222429224221e-05, "loss": 1.2468, "mean_token_accuracy": 0.7420770302414894, "num_tokens": 140684278.0, "step": 745 }, { "epoch": 1.0261617900172118, "grad_norm": 0.305528461933136, "learning_rate": 3.063704272065751e-05, "loss": 1.6139, "mean_token_accuracy": 0.7189796417951584, "num_tokens": 140891829.0, "step": 746 }, { "epoch": 1.027538726333907, "grad_norm": 0.9910538792610168, "learning_rate": 3.061183761306816e-05, "loss": 2.1506, "mean_token_accuracy": 0.5871138572692871, "num_tokens": 141158195.0, "step": 747 }, { "epoch": 1.0289156626506024, "grad_norm": 0.8586408495903015, "learning_rate": 3.0586609025244144e-05, "loss": 1.9761, "mean_token_accuracy": 0.6169561222195625, "num_tokens": 141357077.0, "step": 748 }, { "epoch": 1.0302925989672977, "grad_norm": 0.843573808670044, "learning_rate": 3.056135701300736e-05, "loss": 1.6923, "mean_token_accuracy": 0.6645487323403358, "num_tokens": 141528392.0, "step": 749 }, { "epoch": 1.031669535283993, "grad_norm": 0.8673936724662781, "learning_rate": 3.053608163223159e-05, "loss": 1.4295, "mean_token_accuracy": 0.7111827060580254, "num_tokens": 141683691.0, "step": 750 }, { "epoch": 1.0330464716006884, "grad_norm": 0.9074149131774902, "learning_rate": 3.051078293884226e-05, "loss": 1.2733, "mean_token_accuracy": 0.7376473397016525, "num_tokens": 141826427.0, "step": 751 }, { "epoch": 1.0344234079173837, "grad_norm": 1.0874241590499878, "learning_rate": 3.0485460988816432e-05, "loss": 1.2872, "mean_token_accuracy": 0.7332884073257446, "num_tokens": 141951079.0, "step": 752 }, { "epoch": 1.035800344234079, "grad_norm": 0.7889423966407776, "learning_rate": 3.04601158381826e-05, "loss": 1.9195, "mean_token_accuracy": 0.6142819672822952, "num_tokens": 142276504.0, "step": 753 }, { "epoch": 1.0371772805507746, "grad_norm": 0.8049442768096924, "learning_rate": 3.0434747543020585e-05, "loss": 2.0211, "mean_token_accuracy": 0.6101675406098366, "num_tokens": 142486554.0, "step": 754 }, { "epoch": 1.03855421686747, "grad_norm": 0.8561278581619263, "learning_rate": 3.0409356159461447e-05, "loss": 1.8017, "mean_token_accuracy": 0.646651916205883, "num_tokens": 142663758.0, "step": 755 }, { "epoch": 1.0399311531841653, "grad_norm": 0.8753980398178101, "learning_rate": 3.0383941743687312e-05, "loss": 1.4765, "mean_token_accuracy": 0.6968033239245415, "num_tokens": 142822383.0, "step": 756 }, { "epoch": 1.0413080895008606, "grad_norm": 0.8627078533172607, "learning_rate": 3.0358504351931265e-05, "loss": 1.2789, "mean_token_accuracy": 0.7351161986589432, "num_tokens": 142968078.0, "step": 757 }, { "epoch": 1.042685025817556, "grad_norm": 0.9772382974624634, "learning_rate": 3.0333044040477248e-05, "loss": 1.2124, "mean_token_accuracy": 0.7500206530094147, "num_tokens": 143098751.0, "step": 758 }, { "epoch": 1.0440619621342513, "grad_norm": 0.807555615901947, "learning_rate": 3.030756086565989e-05, "loss": 1.8987, "mean_token_accuracy": 0.6327208057045937, "num_tokens": 143394527.0, "step": 759 }, { "epoch": 1.0454388984509466, "grad_norm": 0.7802316546440125, "learning_rate": 3.0282054883864434e-05, "loss": 2.0651, "mean_token_accuracy": 0.6049460098147392, "num_tokens": 143617074.0, "step": 760 }, { "epoch": 1.046815834767642, "grad_norm": 0.8540670871734619, "learning_rate": 3.025652615152658e-05, "loss": 1.8367, "mean_token_accuracy": 0.6399110704660416, "num_tokens": 143800238.0, "step": 761 }, { "epoch": 1.0481927710843373, "grad_norm": 0.871380627155304, "learning_rate": 3.0230974725132348e-05, "loss": 1.5488, "mean_token_accuracy": 0.6881750598549843, "num_tokens": 143962736.0, "step": 762 }, { "epoch": 1.0495697074010326, "grad_norm": 0.8554650545120239, "learning_rate": 3.0205400661218e-05, "loss": 1.2975, "mean_token_accuracy": 0.7284209579229355, "num_tokens": 144111488.0, "step": 763 }, { "epoch": 1.0509466437177282, "grad_norm": 0.9119603633880615, "learning_rate": 3.0179804016369857e-05, "loss": 1.2104, "mean_token_accuracy": 0.7527954652905464, "num_tokens": 144246700.0, "step": 764 }, { "epoch": 1.0523235800344235, "grad_norm": 0.721967875957489, "learning_rate": 3.0154184847224237e-05, "loss": 1.7501, "mean_token_accuracy": 0.680427297949791, "num_tokens": 144502546.0, "step": 765 }, { "epoch": 1.0537005163511188, "grad_norm": 0.7939363121986389, "learning_rate": 3.0128543210467273e-05, "loss": 2.0907, "mean_token_accuracy": 0.5991318300366402, "num_tokens": 144745284.0, "step": 766 }, { "epoch": 1.0550774526678142, "grad_norm": 0.8193758726119995, "learning_rate": 3.0102879162834812e-05, "loss": 1.9034, "mean_token_accuracy": 0.6304706037044525, "num_tokens": 144936220.0, "step": 767 }, { "epoch": 1.0564543889845095, "grad_norm": 0.8464405536651611, "learning_rate": 3.00771927611123e-05, "loss": 1.6202, "mean_token_accuracy": 0.6756428703665733, "num_tokens": 145103039.0, "step": 768 }, { "epoch": 1.0578313253012048, "grad_norm": 0.8616576790809631, "learning_rate": 3.0051484062134632e-05, "loss": 1.3577, "mean_token_accuracy": 0.7231063768267632, "num_tokens": 145254862.0, "step": 769 }, { "epoch": 1.0592082616179002, "grad_norm": 0.8938390016555786, "learning_rate": 3.0025753122786053e-05, "loss": 1.1964, "mean_token_accuracy": 0.7555558383464813, "num_tokens": 145394128.0, "step": 770 }, { "epoch": 1.0605851979345955, "grad_norm": 0.3228488862514496, "learning_rate": 3.0000000000000004e-05, "loss": 1.5964, "mean_token_accuracy": 0.7049909383058548, "num_tokens": 145600970.0, "step": 771 }, { "epoch": 1.0619621342512908, "grad_norm": 0.7962831854820251, "learning_rate": 2.9974224750759017e-05, "loss": 2.0676, "mean_token_accuracy": 0.6018437817692757, "num_tokens": 145868734.0, "step": 772 }, { "epoch": 1.0633390705679862, "grad_norm": 0.8041864037513733, "learning_rate": 2.994842743209458e-05, "loss": 1.9486, "mean_token_accuracy": 0.620536133646965, "num_tokens": 146067468.0, "step": 773 }, { "epoch": 1.0647160068846815, "grad_norm": 0.8255756497383118, "learning_rate": 2.9922608101087015e-05, "loss": 1.6755, "mean_token_accuracy": 0.6650430411100388, "num_tokens": 146239287.0, "step": 774 }, { "epoch": 1.0660929432013768, "grad_norm": 0.8502176403999329, "learning_rate": 2.9896766814865355e-05, "loss": 1.4364, "mean_token_accuracy": 0.7050567790865898, "num_tokens": 146394927.0, "step": 775 }, { "epoch": 1.0674698795180724, "grad_norm": 0.8699968457221985, "learning_rate": 2.9870903630607197e-05, "loss": 1.2611, "mean_token_accuracy": 0.7408623620867729, "num_tokens": 146538042.0, "step": 776 }, { "epoch": 1.0688468158347677, "grad_norm": 1.105823278427124, "learning_rate": 2.9845018605538598e-05, "loss": 1.3224, "mean_token_accuracy": 0.7288088127970695, "num_tokens": 146662203.0, "step": 777 }, { "epoch": 1.070223752151463, "grad_norm": 0.799994170665741, "learning_rate": 2.9819111796933948e-05, "loss": 1.9202, "mean_token_accuracy": 0.6134138330817223, "num_tokens": 146994262.0, "step": 778 }, { "epoch": 1.0716006884681584, "grad_norm": 0.7937746047973633, "learning_rate": 2.9793183262115824e-05, "loss": 2.0133, "mean_token_accuracy": 0.6118579283356667, "num_tokens": 147206288.0, "step": 779 }, { "epoch": 1.0729776247848537, "grad_norm": 0.816877543926239, "learning_rate": 2.9767233058454894e-05, "loss": 1.7441, "mean_token_accuracy": 0.6579990833997726, "num_tokens": 147384127.0, "step": 780 }, { "epoch": 1.074354561101549, "grad_norm": 0.846047580242157, "learning_rate": 2.9741261243369746e-05, "loss": 1.4997, "mean_token_accuracy": 0.6954512000083923, "num_tokens": 147543391.0, "step": 781 }, { "epoch": 1.0757314974182444, "grad_norm": 0.874823272228241, "learning_rate": 2.9715267874326805e-05, "loss": 1.3205, "mean_token_accuracy": 0.7297752350568771, "num_tokens": 147689937.0, "step": 782 }, { "epoch": 1.0771084337349397, "grad_norm": 1.0063176155090332, "learning_rate": 2.968925300884018e-05, "loss": 1.2044, "mean_token_accuracy": 0.7532805427908897, "num_tokens": 147821091.0, "step": 783 }, { "epoch": 1.078485370051635, "grad_norm": 0.8118379712104797, "learning_rate": 2.9663216704471547e-05, "loss": 1.8932, "mean_token_accuracy": 0.6369881555438042, "num_tokens": 148113904.0, "step": 784 }, { "epoch": 1.0798623063683306, "grad_norm": 0.7611131072044373, "learning_rate": 2.963715901883002e-05, "loss": 2.0566, "mean_token_accuracy": 0.6029491573572159, "num_tokens": 148337649.0, "step": 785 }, { "epoch": 1.081239242685026, "grad_norm": 0.8350545167922974, "learning_rate": 2.9611080009572015e-05, "loss": 1.8292, "mean_token_accuracy": 0.6395713910460472, "num_tokens": 148521458.0, "step": 786 }, { "epoch": 1.0826161790017212, "grad_norm": 0.8578276634216309, "learning_rate": 2.958497973440114e-05, "loss": 1.5475, "mean_token_accuracy": 0.6872339025139809, "num_tokens": 148684112.0, "step": 787 }, { "epoch": 1.0839931153184166, "grad_norm": 0.8608540296554565, "learning_rate": 2.9558858251068044e-05, "loss": 1.3263, "mean_token_accuracy": 0.7260417640209198, "num_tokens": 148832953.0, "step": 788 }, { "epoch": 1.085370051635112, "grad_norm": 0.9216232299804688, "learning_rate": 2.9532715617370318e-05, "loss": 1.2147, "mean_token_accuracy": 0.7479198649525642, "num_tokens": 148968564.0, "step": 789 }, { "epoch": 1.0867469879518072, "grad_norm": 0.6737606525421143, "learning_rate": 2.9506551891152334e-05, "loss": 1.7634, "mean_token_accuracy": 0.6802917718887329, "num_tokens": 149225837.0, "step": 790 }, { "epoch": 1.0881239242685026, "grad_norm": 0.7440963387489319, "learning_rate": 2.948036713030515e-05, "loss": 2.0953, "mean_token_accuracy": 0.5962796360254288, "num_tokens": 149467887.0, "step": 791 }, { "epoch": 1.089500860585198, "grad_norm": 0.8145778179168701, "learning_rate": 2.9454161392766355e-05, "loss": 1.8579, "mean_token_accuracy": 0.6352601423859596, "num_tokens": 149658169.0, "step": 792 }, { "epoch": 1.0908777969018932, "grad_norm": 0.8404384255409241, "learning_rate": 2.9427934736519962e-05, "loss": 1.5842, "mean_token_accuracy": 0.6831002607941628, "num_tokens": 149824286.0, "step": 793 }, { "epoch": 1.0922547332185886, "grad_norm": 0.8569501042366028, "learning_rate": 2.9401687219596247e-05, "loss": 1.3637, "mean_token_accuracy": 0.7190526500344276, "num_tokens": 149976057.0, "step": 794 }, { "epoch": 1.093631669535284, "grad_norm": 0.8966445326805115, "learning_rate": 2.9375418900071676e-05, "loss": 1.2335, "mean_token_accuracy": 0.7437724694609642, "num_tokens": 150115654.0, "step": 795 }, { "epoch": 1.0950086058519792, "grad_norm": 0.3099598288536072, "learning_rate": 2.9349129836068732e-05, "loss": 1.5631, "mean_token_accuracy": 0.7207692787051201, "num_tokens": 150323412.0, "step": 796 }, { "epoch": 1.0963855421686748, "grad_norm": 0.8416670560836792, "learning_rate": 2.9322820085755775e-05, "loss": 2.1022, "mean_token_accuracy": 0.5957780256867409, "num_tokens": 150592816.0, "step": 797 }, { "epoch": 1.0977624784853701, "grad_norm": 0.8337522745132446, "learning_rate": 2.9296489707346975e-05, "loss": 1.9462, "mean_token_accuracy": 0.6217591241002083, "num_tokens": 150792630.0, "step": 798 }, { "epoch": 1.0991394148020655, "grad_norm": 0.8509222269058228, "learning_rate": 2.9270138759102108e-05, "loss": 1.6754, "mean_token_accuracy": 0.6660570725798607, "num_tokens": 150964295.0, "step": 799 }, { "epoch": 1.1005163511187608, "grad_norm": 0.8529207706451416, "learning_rate": 2.9243767299326498e-05, "loss": 1.4333, "mean_token_accuracy": 0.7084468752145767, "num_tokens": 151119884.0, "step": 800 }, { "epoch": 1.1018932874354561, "grad_norm": 0.9028929471969604, "learning_rate": 2.9217375386370827e-05, "loss": 1.2462, "mean_token_accuracy": 0.7416334673762321, "num_tokens": 151262950.0, "step": 801 }, { "epoch": 1.1032702237521514, "grad_norm": 1.1669800281524658, "learning_rate": 2.919096307863104e-05, "loss": 1.3065, "mean_token_accuracy": 0.7330504208803177, "num_tokens": 151387816.0, "step": 802 }, { "epoch": 1.1046471600688468, "grad_norm": 0.7720150947570801, "learning_rate": 2.916453043454821e-05, "loss": 1.9054, "mean_token_accuracy": 0.6191992536187172, "num_tokens": 151715480.0, "step": 803 }, { "epoch": 1.106024096385542, "grad_norm": 0.778939425945282, "learning_rate": 2.9138077512608417e-05, "loss": 2.0065, "mean_token_accuracy": 0.6126930862665176, "num_tokens": 151926262.0, "step": 804 }, { "epoch": 1.1074010327022374, "grad_norm": 0.8334149718284607, "learning_rate": 2.9111604371342593e-05, "loss": 1.7268, "mean_token_accuracy": 0.6564971879124641, "num_tokens": 152104501.0, "step": 805 }, { "epoch": 1.1087779690189328, "grad_norm": 0.8771833181381226, "learning_rate": 2.9085111069326415e-05, "loss": 1.4817, "mean_token_accuracy": 0.6990488767623901, "num_tokens": 152263893.0, "step": 806 }, { "epoch": 1.1101549053356283, "grad_norm": 0.8995224833488464, "learning_rate": 2.905859766518017e-05, "loss": 1.2892, "mean_token_accuracy": 0.7347157672047615, "num_tokens": 152410054.0, "step": 807 }, { "epoch": 1.1115318416523237, "grad_norm": 0.9743083715438843, "learning_rate": 2.903206421756862e-05, "loss": 1.2013, "mean_token_accuracy": 0.7512853220105171, "num_tokens": 152541044.0, "step": 808 }, { "epoch": 1.112908777969019, "grad_norm": 0.7063298225402832, "learning_rate": 2.9005510785200887e-05, "loss": 1.8107, "mean_token_accuracy": 0.6448180750012398, "num_tokens": 152836339.0, "step": 809 }, { "epoch": 1.1142857142857143, "grad_norm": 0.7613732814788818, "learning_rate": 2.8978937426830286e-05, "loss": 2.0252, "mean_token_accuracy": 0.6103781387209892, "num_tokens": 153062184.0, "step": 810 }, { "epoch": 1.1156626506024097, "grad_norm": 0.8352683186531067, "learning_rate": 2.8952344201254253e-05, "loss": 1.7892, "mean_token_accuracy": 0.6467345356941223, "num_tokens": 153247034.0, "step": 811 }, { "epoch": 1.117039586919105, "grad_norm": 0.8839289546012878, "learning_rate": 2.892573116731417e-05, "loss": 1.5608, "mean_token_accuracy": 0.686791442334652, "num_tokens": 153410619.0, "step": 812 }, { "epoch": 1.1184165232358003, "grad_norm": 0.8540650606155396, "learning_rate": 2.889909838389523e-05, "loss": 1.3292, "mean_token_accuracy": 0.7261709347367287, "num_tokens": 153560305.0, "step": 813 }, { "epoch": 1.1197934595524957, "grad_norm": 0.9269915223121643, "learning_rate": 2.8872445909926358e-05, "loss": 1.2294, "mean_token_accuracy": 0.747901663184166, "num_tokens": 153696557.0, "step": 814 }, { "epoch": 1.121170395869191, "grad_norm": 0.7020638585090637, "learning_rate": 2.8845773804380028e-05, "loss": 1.6933, "mean_token_accuracy": 0.6874417141079903, "num_tokens": 153950491.0, "step": 815 }, { "epoch": 1.1225473321858863, "grad_norm": 0.8261013031005859, "learning_rate": 2.8819082126272152e-05, "loss": 2.0975, "mean_token_accuracy": 0.5970967561006546, "num_tokens": 154190981.0, "step": 816 }, { "epoch": 1.1239242685025816, "grad_norm": 0.8408902287483215, "learning_rate": 2.8792370934661948e-05, "loss": 1.8661, "mean_token_accuracy": 0.6326352804899216, "num_tokens": 154381897.0, "step": 817 }, { "epoch": 1.1253012048192772, "grad_norm": 0.8611677885055542, "learning_rate": 2.8765640288651807e-05, "loss": 1.6039, "mean_token_accuracy": 0.6775162145495415, "num_tokens": 154548641.0, "step": 818 }, { "epoch": 1.1266781411359725, "grad_norm": 0.8767852783203125, "learning_rate": 2.873889024738719e-05, "loss": 1.3578, "mean_token_accuracy": 0.7211553007364273, "num_tokens": 154700483.0, "step": 819 }, { "epoch": 1.1280550774526679, "grad_norm": 0.937568724155426, "learning_rate": 2.8712120870056455e-05, "loss": 1.2261, "mean_token_accuracy": 0.7466598376631737, "num_tokens": 154839931.0, "step": 820 }, { "epoch": 1.1294320137693632, "grad_norm": 0.31266045570373535, "learning_rate": 2.8685332215890723e-05, "loss": 1.5433, "mean_token_accuracy": 0.7212316989898682, "num_tokens": 155048495.0, "step": 821 }, { "epoch": 1.1308089500860585, "grad_norm": 0.8887499570846558, "learning_rate": 2.8658524344163807e-05, "loss": 2.0616, "mean_token_accuracy": 0.6020722538232803, "num_tokens": 155317279.0, "step": 822 }, { "epoch": 1.1321858864027539, "grad_norm": 0.831369936466217, "learning_rate": 2.8631697314192012e-05, "loss": 1.9446, "mean_token_accuracy": 0.6234478428959846, "num_tokens": 155518377.0, "step": 823 }, { "epoch": 1.1335628227194492, "grad_norm": 0.8362749814987183, "learning_rate": 2.8604851185334062e-05, "loss": 1.6876, "mean_token_accuracy": 0.6669899299740791, "num_tokens": 155691066.0, "step": 824 }, { "epoch": 1.1349397590361445, "grad_norm": 0.8689372539520264, "learning_rate": 2.8577986016990906e-05, "loss": 1.424, "mean_token_accuracy": 0.7098526060581207, "num_tokens": 155847066.0, "step": 825 }, { "epoch": 1.1363166953528399, "grad_norm": 0.8942853808403015, "learning_rate": 2.8551101868605644e-05, "loss": 1.2199, "mean_token_accuracy": 0.7498811632394791, "num_tokens": 155990392.0, "step": 826 }, { "epoch": 1.1376936316695352, "grad_norm": 1.1074973344802856, "learning_rate": 2.8524198799663367e-05, "loss": 1.2822, "mean_token_accuracy": 0.7377926632761955, "num_tokens": 156115495.0, "step": 827 }, { "epoch": 1.1390705679862307, "grad_norm": 0.7945520877838135, "learning_rate": 2.8497276869691028e-05, "loss": 1.8719, "mean_token_accuracy": 0.6227510273456573, "num_tokens": 156443577.0, "step": 828 }, { "epoch": 1.140447504302926, "grad_norm": 0.8320983052253723, "learning_rate": 2.8470336138257315e-05, "loss": 2.018, "mean_token_accuracy": 0.6127274185419083, "num_tokens": 156654378.0, "step": 829 }, { "epoch": 1.1418244406196214, "grad_norm": 0.8422808647155762, "learning_rate": 2.8443376664972516e-05, "loss": 1.7594, "mean_token_accuracy": 0.650455504655838, "num_tokens": 156832771.0, "step": 830 }, { "epoch": 1.1432013769363167, "grad_norm": 0.8811792731285095, "learning_rate": 2.8416398509488386e-05, "loss": 1.5172, "mean_token_accuracy": 0.6938158497214317, "num_tokens": 156991651.0, "step": 831 }, { "epoch": 1.144578313253012, "grad_norm": 0.8821980357170105, "learning_rate": 2.8389401731498018e-05, "loss": 1.2542, "mean_token_accuracy": 0.7413734942674637, "num_tokens": 157137650.0, "step": 832 }, { "epoch": 1.1459552495697074, "grad_norm": 1.0082980394363403, "learning_rate": 2.836238639073572e-05, "loss": 1.181, "mean_token_accuracy": 0.7566413581371307, "num_tokens": 157268965.0, "step": 833 }, { "epoch": 1.1473321858864027, "grad_norm": 0.8754553198814392, "learning_rate": 2.833535254697685e-05, "loss": 1.8214, "mean_token_accuracy": 0.6474481001496315, "num_tokens": 157559296.0, "step": 834 }, { "epoch": 1.148709122203098, "grad_norm": 0.7932919263839722, "learning_rate": 2.8308300260037734e-05, "loss": 2.0495, "mean_token_accuracy": 0.6053192466497421, "num_tokens": 157784178.0, "step": 835 }, { "epoch": 1.1500860585197934, "grad_norm": 0.8215339183807373, "learning_rate": 2.8281229589775484e-05, "loss": 1.8199, "mean_token_accuracy": 0.6439216360449791, "num_tokens": 157968405.0, "step": 836 }, { "epoch": 1.1514629948364887, "grad_norm": 0.8696882724761963, "learning_rate": 2.8254140596087897e-05, "loss": 1.5419, "mean_token_accuracy": 0.6860733330249786, "num_tokens": 158130994.0, "step": 837 }, { "epoch": 1.152839931153184, "grad_norm": 0.8903083205223083, "learning_rate": 2.8227033338913318e-05, "loss": 1.3325, "mean_token_accuracy": 0.7250578179955482, "num_tokens": 158279770.0, "step": 838 }, { "epoch": 1.1542168674698796, "grad_norm": 0.9364290237426758, "learning_rate": 2.8199907878230495e-05, "loss": 1.2039, "mean_token_accuracy": 0.7530009672045708, "num_tokens": 158415301.0, "step": 839 }, { "epoch": 1.155593803786575, "grad_norm": 0.6996081471443176, "learning_rate": 2.8172764274058456e-05, "loss": 1.7328, "mean_token_accuracy": 0.6794904991984367, "num_tokens": 158671826.0, "step": 840 }, { "epoch": 1.1569707401032703, "grad_norm": 0.7778701186180115, "learning_rate": 2.814560258645638e-05, "loss": 2.0586, "mean_token_accuracy": 0.6020767390727997, "num_tokens": 158912583.0, "step": 841 }, { "epoch": 1.1583476764199656, "grad_norm": 0.811480700969696, "learning_rate": 2.8118422875523434e-05, "loss": 1.8451, "mean_token_accuracy": 0.6390801295638084, "num_tokens": 159102679.0, "step": 842 }, { "epoch": 1.159724612736661, "grad_norm": 0.8552351593971252, "learning_rate": 2.8091225201398703e-05, "loss": 1.5456, "mean_token_accuracy": 0.6886724680662155, "num_tokens": 159268784.0, "step": 843 }, { "epoch": 1.1611015490533563, "grad_norm": 0.8535781502723694, "learning_rate": 2.8064009624260994e-05, "loss": 1.3192, "mean_token_accuracy": 0.7293161228299141, "num_tokens": 159420542.0, "step": 844 }, { "epoch": 1.1624784853700516, "grad_norm": 0.9110128879547119, "learning_rate": 2.8036776204328728e-05, "loss": 1.209, "mean_token_accuracy": 0.7517788484692574, "num_tokens": 159559817.0, "step": 845 }, { "epoch": 1.163855421686747, "grad_norm": 0.3102878928184509, "learning_rate": 2.800952500185981e-05, "loss": 1.5792, "mean_token_accuracy": 0.7168994843959808, "num_tokens": 159769189.0, "step": 846 }, { "epoch": 1.1652323580034423, "grad_norm": 0.8316339254379272, "learning_rate": 2.7982256077151482e-05, "loss": 2.0746, "mean_token_accuracy": 0.6017453819513321, "num_tokens": 160038320.0, "step": 847 }, { "epoch": 1.1666092943201376, "grad_norm": 0.8364328742027283, "learning_rate": 2.7954969490540223e-05, "loss": 1.9094, "mean_token_accuracy": 0.6263487339019775, "num_tokens": 160238702.0, "step": 848 }, { "epoch": 1.1679862306368332, "grad_norm": 0.8459382057189941, "learning_rate": 2.7927665302401568e-05, "loss": 1.6463, "mean_token_accuracy": 0.6697124242782593, "num_tokens": 160410466.0, "step": 849 }, { "epoch": 1.1693631669535285, "grad_norm": 0.8562229871749878, "learning_rate": 2.7900343573150003e-05, "loss": 1.3943, "mean_token_accuracy": 0.7150789275765419, "num_tokens": 160565468.0, "step": 850 }, { "epoch": 1.1707401032702238, "grad_norm": 0.9041212797164917, "learning_rate": 2.787300436323883e-05, "loss": 1.2304, "mean_token_accuracy": 0.7453877478837967, "num_tokens": 160707866.0, "step": 851 }, { "epoch": 1.1721170395869192, "grad_norm": 1.1399047374725342, "learning_rate": 2.7845647733160032e-05, "loss": 1.2809, "mean_token_accuracy": 0.736150287091732, "num_tokens": 160832391.0, "step": 852 }, { "epoch": 1.1734939759036145, "grad_norm": 0.7767520546913147, "learning_rate": 2.7818273743444132e-05, "loss": 1.857, "mean_token_accuracy": 0.6244627833366394, "num_tokens": 161158357.0, "step": 853 }, { "epoch": 1.1748709122203098, "grad_norm": 0.7661546468734741, "learning_rate": 2.7790882454660065e-05, "loss": 1.9957, "mean_token_accuracy": 0.6141610741615295, "num_tokens": 161369803.0, "step": 854 }, { "epoch": 1.1762478485370051, "grad_norm": 0.8285494446754456, "learning_rate": 2.7763473927415038e-05, "loss": 1.7297, "mean_token_accuracy": 0.6580388695001602, "num_tokens": 161546935.0, "step": 855 }, { "epoch": 1.1776247848537005, "grad_norm": 0.8922840356826782, "learning_rate": 2.7736048222354414e-05, "loss": 1.4685, "mean_token_accuracy": 0.7008523121476173, "num_tokens": 161705018.0, "step": 856 }, { "epoch": 1.1790017211703958, "grad_norm": 0.8974297642707825, "learning_rate": 2.7708605400161554e-05, "loss": 1.2672, "mean_token_accuracy": 0.7379241362214088, "num_tokens": 161850551.0, "step": 857 }, { "epoch": 1.1803786574870911, "grad_norm": 0.9765338897705078, "learning_rate": 2.768114552155769e-05, "loss": 1.1767, "mean_token_accuracy": 0.7606470286846161, "num_tokens": 161981308.0, "step": 858 }, { "epoch": 1.1817555938037865, "grad_norm": 0.7747376561164856, "learning_rate": 2.7653668647301797e-05, "loss": 1.7906, "mean_token_accuracy": 0.6494040042161942, "num_tokens": 162271740.0, "step": 859 }, { "epoch": 1.1831325301204818, "grad_norm": 0.7852546572685242, "learning_rate": 2.7626174838190464e-05, "loss": 2.0337, "mean_token_accuracy": 0.6081027463078499, "num_tokens": 162495629.0, "step": 860 }, { "epoch": 1.1845094664371774, "grad_norm": 0.8383556604385376, "learning_rate": 2.759866415505774e-05, "loss": 1.7843, "mean_token_accuracy": 0.6475213840603828, "num_tokens": 162679281.0, "step": 861 }, { "epoch": 1.1858864027538727, "grad_norm": 0.8743767142295837, "learning_rate": 2.757113665877502e-05, "loss": 1.5555, "mean_token_accuracy": 0.6866861283779144, "num_tokens": 162841773.0, "step": 862 }, { "epoch": 1.187263339070568, "grad_norm": 0.8875442743301392, "learning_rate": 2.754359241025088e-05, "loss": 1.315, "mean_token_accuracy": 0.730181910097599, "num_tokens": 162990800.0, "step": 863 }, { "epoch": 1.1886402753872634, "grad_norm": 0.9263463020324707, "learning_rate": 2.7516031470430996e-05, "loss": 1.1817, "mean_token_accuracy": 0.7571709603071213, "num_tokens": 163126582.0, "step": 864 }, { "epoch": 1.1900172117039587, "grad_norm": 0.7206065058708191, "learning_rate": 2.748845390029794e-05, "loss": 1.7337, "mean_token_accuracy": 0.6856749281287193, "num_tokens": 163382534.0, "step": 865 }, { "epoch": 1.191394148020654, "grad_norm": 0.7561405897140503, "learning_rate": 2.7460859760871117e-05, "loss": 2.0284, "mean_token_accuracy": 0.6086657643318176, "num_tokens": 163625297.0, "step": 866 }, { "epoch": 1.1927710843373494, "grad_norm": 0.8265641331672668, "learning_rate": 2.743324911320655e-05, "loss": 1.8123, "mean_token_accuracy": 0.6436498686671257, "num_tokens": 163815218.0, "step": 867 }, { "epoch": 1.1941480206540447, "grad_norm": 0.8555657267570496, "learning_rate": 2.740562201839684e-05, "loss": 1.5461, "mean_token_accuracy": 0.6880714446306229, "num_tokens": 163981155.0, "step": 868 }, { "epoch": 1.19552495697074, "grad_norm": 0.8658325672149658, "learning_rate": 2.7377978537570938e-05, "loss": 1.3173, "mean_token_accuracy": 0.729948379099369, "num_tokens": 164132579.0, "step": 869 }, { "epoch": 1.1969018932874356, "grad_norm": 0.9308359622955322, "learning_rate": 2.7350318731894075e-05, "loss": 1.2161, "mean_token_accuracy": 0.7445507496595383, "num_tokens": 164272080.0, "step": 870 }, { "epoch": 1.198278829604131, "grad_norm": 0.2961113452911377, "learning_rate": 2.7322642662567592e-05, "loss": 1.5178, "mean_token_accuracy": 0.7292258590459824, "num_tokens": 164484446.0, "step": 871 }, { "epoch": 1.1996557659208262, "grad_norm": 0.8503017425537109, "learning_rate": 2.729495039082881e-05, "loss": 2.0701, "mean_token_accuracy": 0.6018701419234276, "num_tokens": 164754962.0, "step": 872 }, { "epoch": 1.2010327022375216, "grad_norm": 0.8252357840538025, "learning_rate": 2.726724197795093e-05, "loss": 1.9275, "mean_token_accuracy": 0.623389258980751, "num_tokens": 164955623.0, "step": 873 }, { "epoch": 1.202409638554217, "grad_norm": 0.8368914127349854, "learning_rate": 2.7239517485242836e-05, "loss": 1.648, "mean_token_accuracy": 0.6704286932945251, "num_tokens": 165127912.0, "step": 874 }, { "epoch": 1.2037865748709122, "grad_norm": 0.86594557762146, "learning_rate": 2.7211776974048997e-05, "loss": 1.407, "mean_token_accuracy": 0.7131162211298943, "num_tokens": 165283664.0, "step": 875 }, { "epoch": 1.2051635111876076, "grad_norm": 0.9028728604316711, "learning_rate": 2.7184020505749336e-05, "loss": 1.2362, "mean_token_accuracy": 0.7450968772172928, "num_tokens": 165426700.0, "step": 876 }, { "epoch": 1.206540447504303, "grad_norm": 1.1287189722061157, "learning_rate": 2.715624814175907e-05, "loss": 1.2501, "mean_token_accuracy": 0.740516223013401, "num_tokens": 165551263.0, "step": 877 }, { "epoch": 1.2079173838209982, "grad_norm": 0.7769281268119812, "learning_rate": 2.7128459943528608e-05, "loss": 1.8489, "mean_token_accuracy": 0.6237825229763985, "num_tokens": 165876286.0, "step": 878 }, { "epoch": 1.2092943201376936, "grad_norm": 0.7675428986549377, "learning_rate": 2.7100655972543372e-05, "loss": 1.9595, "mean_token_accuracy": 0.6208030134439468, "num_tokens": 166086777.0, "step": 879 }, { "epoch": 1.2106712564543889, "grad_norm": 0.8268479108810425, "learning_rate": 2.7072836290323698e-05, "loss": 1.7142, "mean_token_accuracy": 0.6620016843080521, "num_tokens": 166264190.0, "step": 880 }, { "epoch": 1.2120481927710842, "grad_norm": 0.8755585551261902, "learning_rate": 2.7045000958424674e-05, "loss": 1.4447, "mean_token_accuracy": 0.7046144381165504, "num_tokens": 166422836.0, "step": 881 }, { "epoch": 1.2134251290877798, "grad_norm": 0.8986185193061829, "learning_rate": 2.7017150038436027e-05, "loss": 1.2757, "mean_token_accuracy": 0.7372832223773003, "num_tokens": 166568666.0, "step": 882 }, { "epoch": 1.214802065404475, "grad_norm": 1.0156749486923218, "learning_rate": 2.698928359198197e-05, "loss": 1.2033, "mean_token_accuracy": 0.750631719827652, "num_tokens": 166699685.0, "step": 883 }, { "epoch": 1.2161790017211704, "grad_norm": 0.7821558713912964, "learning_rate": 2.696140168072107e-05, "loss": 1.8007, "mean_token_accuracy": 0.649416908621788, "num_tokens": 166996050.0, "step": 884 }, { "epoch": 1.2175559380378658, "grad_norm": 0.7794591784477234, "learning_rate": 2.6933504366346107e-05, "loss": 2.0039, "mean_token_accuracy": 0.6135642528533936, "num_tokens": 167220496.0, "step": 885 }, { "epoch": 1.218932874354561, "grad_norm": 0.8278117179870605, "learning_rate": 2.6905591710583957e-05, "loss": 1.788, "mean_token_accuracy": 0.6475827991962433, "num_tokens": 167404103.0, "step": 886 }, { "epoch": 1.2203098106712564, "grad_norm": 0.8658751845359802, "learning_rate": 2.687766377519542e-05, "loss": 1.5038, "mean_token_accuracy": 0.6964550912380219, "num_tokens": 167566570.0, "step": 887 }, { "epoch": 1.2216867469879518, "grad_norm": 0.8914316892623901, "learning_rate": 2.6849720621975127e-05, "loss": 1.315, "mean_token_accuracy": 0.7271912842988968, "num_tokens": 167715518.0, "step": 888 }, { "epoch": 1.223063683304647, "grad_norm": 0.9134377241134644, "learning_rate": 2.6821762312751368e-05, "loss": 1.1723, "mean_token_accuracy": 0.7539044842123985, "num_tokens": 167851476.0, "step": 889 }, { "epoch": 1.2244406196213424, "grad_norm": 0.6995800733566284, "learning_rate": 2.679378890938597e-05, "loss": 1.6968, "mean_token_accuracy": 0.6844705939292908, "num_tokens": 168108208.0, "step": 890 }, { "epoch": 1.225817555938038, "grad_norm": 0.7791701555252075, "learning_rate": 2.676580047377415e-05, "loss": 2.0867, "mean_token_accuracy": 0.5997078120708466, "num_tokens": 168351110.0, "step": 891 }, { "epoch": 1.2271944922547333, "grad_norm": 0.8148375153541565, "learning_rate": 2.6737797067844403e-05, "loss": 1.8595, "mean_token_accuracy": 0.6367138996720314, "num_tokens": 168543112.0, "step": 892 }, { "epoch": 1.2285714285714286, "grad_norm": 0.8645954132080078, "learning_rate": 2.6709778753558334e-05, "loss": 1.6019, "mean_token_accuracy": 0.6776773482561111, "num_tokens": 168710649.0, "step": 893 }, { "epoch": 1.229948364888124, "grad_norm": 0.875282883644104, "learning_rate": 2.6681745592910546e-05, "loss": 1.3612, "mean_token_accuracy": 0.7222695797681808, "num_tokens": 168863026.0, "step": 894 }, { "epoch": 1.2313253012048193, "grad_norm": 0.9204263687133789, "learning_rate": 2.6653697647928485e-05, "loss": 1.1971, "mean_token_accuracy": 0.7523454204201698, "num_tokens": 169002831.0, "step": 895 }, { "epoch": 1.2327022375215146, "grad_norm": 0.32034972310066223, "learning_rate": 2.6625634980672294e-05, "loss": 1.5317, "mean_token_accuracy": 0.7190104126930237, "num_tokens": 169212017.0, "step": 896 }, { "epoch": 1.23407917383821, "grad_norm": 0.7613781094551086, "learning_rate": 2.659755765323473e-05, "loss": 2.0706, "mean_token_accuracy": 0.6033573150634766, "num_tokens": 169485734.0, "step": 897 }, { "epoch": 1.2354561101549053, "grad_norm": 0.8060704469680786, "learning_rate": 2.656946572774095e-05, "loss": 1.9255, "mean_token_accuracy": 0.6256428807973862, "num_tokens": 169688356.0, "step": 898 }, { "epoch": 1.2368330464716006, "grad_norm": 0.836452305316925, "learning_rate": 2.6541359266348437e-05, "loss": 1.6634, "mean_token_accuracy": 0.6659364551305771, "num_tokens": 169860878.0, "step": 899 }, { "epoch": 1.238209982788296, "grad_norm": 0.8585756421089172, "learning_rate": 2.6513238331246816e-05, "loss": 1.3841, "mean_token_accuracy": 0.7168543934822083, "num_tokens": 170016692.0, "step": 900 }, { "epoch": 1.2395869191049913, "grad_norm": 0.8945450186729431, "learning_rate": 2.648510298465775e-05, "loss": 1.229, "mean_token_accuracy": 0.7438998967409134, "num_tokens": 170159905.0, "step": 901 }, { "epoch": 1.2409638554216866, "grad_norm": 1.1221544742584229, "learning_rate": 2.645695328883479e-05, "loss": 1.2594, "mean_token_accuracy": 0.7377005442976952, "num_tokens": 170284838.0, "step": 902 }, { "epoch": 1.2423407917383822, "grad_norm": 0.7463877201080322, "learning_rate": 2.6428789306063233e-05, "loss": 1.8849, "mean_token_accuracy": 0.6207120940089226, "num_tokens": 170610147.0, "step": 903 }, { "epoch": 1.2437177280550775, "grad_norm": 0.809718668460846, "learning_rate": 2.6400611098659988e-05, "loss": 1.9745, "mean_token_accuracy": 0.6152700483798981, "num_tokens": 170819929.0, "step": 904 }, { "epoch": 1.2450946643717729, "grad_norm": 0.8378859162330627, "learning_rate": 2.637241872897344e-05, "loss": 1.703, "mean_token_accuracy": 0.658865436911583, "num_tokens": 170997252.0, "step": 905 }, { "epoch": 1.2464716006884682, "grad_norm": 0.8751640915870667, "learning_rate": 2.63442122593833e-05, "loss": 1.4383, "mean_token_accuracy": 0.7072534188628197, "num_tokens": 171156033.0, "step": 906 }, { "epoch": 1.2478485370051635, "grad_norm": 0.8898435235023499, "learning_rate": 2.6315991752300503e-05, "loss": 1.2686, "mean_token_accuracy": 0.734234631061554, "num_tokens": 171301943.0, "step": 907 }, { "epoch": 1.2492254733218588, "grad_norm": 0.97126305103302, "learning_rate": 2.6287757270167008e-05, "loss": 1.1724, "mean_token_accuracy": 0.7584729567170143, "num_tokens": 171433069.0, "step": 908 }, { "epoch": 1.2506024096385542, "grad_norm": 0.7812871336936951, "learning_rate": 2.6259508875455727e-05, "loss": 1.8113, "mean_token_accuracy": 0.649210013449192, "num_tokens": 171729736.0, "step": 909 }, { "epoch": 1.2519793459552495, "grad_norm": 0.7761862277984619, "learning_rate": 2.623124663067034e-05, "loss": 2.0142, "mean_token_accuracy": 0.6107062324881554, "num_tokens": 171956166.0, "step": 910 }, { "epoch": 1.2533562822719448, "grad_norm": 0.8129152655601501, "learning_rate": 2.6202970598345173e-05, "loss": 1.7615, "mean_token_accuracy": 0.6542713567614555, "num_tokens": 172140696.0, "step": 911 }, { "epoch": 1.2547332185886404, "grad_norm": 0.8629953861236572, "learning_rate": 2.617468084104507e-05, "loss": 1.4954, "mean_token_accuracy": 0.6950381174683571, "num_tokens": 172303352.0, "step": 912 }, { "epoch": 1.2561101549053357, "grad_norm": 0.8831518888473511, "learning_rate": 2.6146377421365225e-05, "loss": 1.2754, "mean_token_accuracy": 0.732158251106739, "num_tokens": 172451809.0, "step": 913 }, { "epoch": 1.257487091222031, "grad_norm": 0.9591671824455261, "learning_rate": 2.6118060401931073e-05, "loss": 1.1649, "mean_token_accuracy": 0.7583649232983589, "num_tokens": 172587091.0, "step": 914 }, { "epoch": 1.2588640275387264, "grad_norm": 0.6585769057273865, "learning_rate": 2.6089729845398144e-05, "loss": 1.6905, "mean_token_accuracy": 0.6887401342391968, "num_tokens": 172845254.0, "step": 915 }, { "epoch": 1.2602409638554217, "grad_norm": 0.7398154139518738, "learning_rate": 2.6061385814451913e-05, "loss": 2.0165, "mean_token_accuracy": 0.6131977438926697, "num_tokens": 173087410.0, "step": 916 }, { "epoch": 1.261617900172117, "grad_norm": 0.8096103072166443, "learning_rate": 2.6033028371807677e-05, "loss": 1.8667, "mean_token_accuracy": 0.634074829518795, "num_tokens": 173279380.0, "step": 917 }, { "epoch": 1.2629948364888124, "grad_norm": 0.8578912615776062, "learning_rate": 2.6004657580210397e-05, "loss": 1.5664, "mean_token_accuracy": 0.6858732774853706, "num_tokens": 173446952.0, "step": 918 }, { "epoch": 1.2643717728055077, "grad_norm": 0.8783684372901917, "learning_rate": 2.5976273502434584e-05, "loss": 1.3418, "mean_token_accuracy": 0.7226818278431892, "num_tokens": 173599199.0, "step": 919 }, { "epoch": 1.265748709122203, "grad_norm": 0.8982758522033691, "learning_rate": 2.5947876201284136e-05, "loss": 1.1907, "mean_token_accuracy": 0.7532911598682404, "num_tokens": 173738634.0, "step": 920 }, { "epoch": 1.2671256454388984, "grad_norm": 0.3083597719669342, "learning_rate": 2.5919465739592207e-05, "loss": 1.4799, "mean_token_accuracy": 0.7261609807610512, "num_tokens": 173949545.0, "step": 921 }, { "epoch": 1.2685025817555937, "grad_norm": 0.774091899394989, "learning_rate": 2.5891042180221094e-05, "loss": 2.005, "mean_token_accuracy": 0.6122879758477211, "num_tokens": 174219596.0, "step": 922 }, { "epoch": 1.269879518072289, "grad_norm": 0.8105296492576599, "learning_rate": 2.5862605586062044e-05, "loss": 1.9359, "mean_token_accuracy": 0.6251882910728455, "num_tokens": 174421253.0, "step": 923 }, { "epoch": 1.2712564543889844, "grad_norm": 0.8358834385871887, "learning_rate": 2.5834156020035162e-05, "loss": 1.6567, "mean_token_accuracy": 0.6707037016749382, "num_tokens": 174594026.0, "step": 924 }, { "epoch": 1.27263339070568, "grad_norm": 0.8727187514305115, "learning_rate": 2.580569354508925e-05, "loss": 1.3858, "mean_token_accuracy": 0.7177674025297165, "num_tokens": 174750041.0, "step": 925 }, { "epoch": 1.2740103270223753, "grad_norm": 0.8808802962303162, "learning_rate": 2.5777218224201676e-05, "loss": 1.1973, "mean_token_accuracy": 0.7479586154222488, "num_tokens": 174893139.0, "step": 926 }, { "epoch": 1.2753872633390706, "grad_norm": 1.1372605562210083, "learning_rate": 2.5748730120378237e-05, "loss": 1.2647, "mean_token_accuracy": 0.7404135465621948, "num_tokens": 175017894.0, "step": 927 }, { "epoch": 1.276764199655766, "grad_norm": 0.7611331939697266, "learning_rate": 2.5720229296653006e-05, "loss": 1.8409, "mean_token_accuracy": 0.6223496869206429, "num_tokens": 175346870.0, "step": 928 }, { "epoch": 1.2781411359724613, "grad_norm": 0.8060401082038879, "learning_rate": 2.569171581608819e-05, "loss": 1.9729, "mean_token_accuracy": 0.6162276715040207, "num_tokens": 175558835.0, "step": 929 }, { "epoch": 1.2795180722891566, "grad_norm": 0.8358564376831055, "learning_rate": 2.5663189741774028e-05, "loss": 1.6956, "mean_token_accuracy": 0.6617697775363922, "num_tokens": 175737162.0, "step": 930 }, { "epoch": 1.280895008605852, "grad_norm": 0.8489002585411072, "learning_rate": 2.5634651136828597e-05, "loss": 1.4259, "mean_token_accuracy": 0.7082144320011139, "num_tokens": 175896495.0, "step": 931 }, { "epoch": 1.2822719449225473, "grad_norm": 0.8715285658836365, "learning_rate": 2.5606100064397725e-05, "loss": 1.2726, "mean_token_accuracy": 0.7353676930069923, "num_tokens": 176042702.0, "step": 932 }, { "epoch": 1.2836488812392428, "grad_norm": 0.985191285610199, "learning_rate": 2.5577536587654805e-05, "loss": 1.178, "mean_token_accuracy": 0.7569754049181938, "num_tokens": 176173930.0, "step": 933 }, { "epoch": 1.2850258175559381, "grad_norm": 0.7751879096031189, "learning_rate": 2.554896076980069e-05, "loss": 1.7708, "mean_token_accuracy": 0.6496579647064209, "num_tokens": 176472038.0, "step": 934 }, { "epoch": 1.2864027538726335, "grad_norm": 0.7827499508857727, "learning_rate": 2.5520372674063528e-05, "loss": 2.0021, "mean_token_accuracy": 0.6135028004646301, "num_tokens": 176695849.0, "step": 935 }, { "epoch": 1.2877796901893288, "grad_norm": 0.8259003162384033, "learning_rate": 2.549177236369865e-05, "loss": 1.7519, "mean_token_accuracy": 0.6526268571615219, "num_tokens": 176879149.0, "step": 936 }, { "epoch": 1.2891566265060241, "grad_norm": 0.8680696487426758, "learning_rate": 2.54631599019884e-05, "loss": 1.5258, "mean_token_accuracy": 0.6917814165353775, "num_tokens": 177041707.0, "step": 937 }, { "epoch": 1.2905335628227195, "grad_norm": 0.8847443461418152, "learning_rate": 2.5434535352242006e-05, "loss": 1.2862, "mean_token_accuracy": 0.7326818481087685, "num_tokens": 177190535.0, "step": 938 }, { "epoch": 1.2919104991394148, "grad_norm": 0.9351314306259155, "learning_rate": 2.540589877779546e-05, "loss": 1.1803, "mean_token_accuracy": 0.7544071227312088, "num_tokens": 177325851.0, "step": 939 }, { "epoch": 1.2932874354561101, "grad_norm": 0.7416567206382751, "learning_rate": 2.5377250242011338e-05, "loss": 1.6923, "mean_token_accuracy": 0.6847232431173325, "num_tokens": 177587136.0, "step": 940 }, { "epoch": 1.2946643717728055, "grad_norm": 0.7803258299827576, "learning_rate": 2.53485898082787e-05, "loss": 2.0506, "mean_token_accuracy": 0.6053315475583076, "num_tokens": 177828796.0, "step": 941 }, { "epoch": 1.2960413080895008, "grad_norm": 0.8058802485466003, "learning_rate": 2.5319917540012928e-05, "loss": 1.8295, "mean_token_accuracy": 0.6382907405495644, "num_tokens": 178018963.0, "step": 942 }, { "epoch": 1.2974182444061961, "grad_norm": 0.8608372807502747, "learning_rate": 2.5291233500655584e-05, "loss": 1.5648, "mean_token_accuracy": 0.6836274340748787, "num_tokens": 178185828.0, "step": 943 }, { "epoch": 1.2987951807228915, "grad_norm": 0.8871830701828003, "learning_rate": 2.526253775367428e-05, "loss": 1.3222, "mean_token_accuracy": 0.7286455258727074, "num_tokens": 178338247.0, "step": 944 }, { "epoch": 1.3001721170395868, "grad_norm": 0.9162492752075195, "learning_rate": 2.523383036256252e-05, "loss": 1.1965, "mean_token_accuracy": 0.7522569000720978, "num_tokens": 178478133.0, "step": 945 }, { "epoch": 1.3015490533562823, "grad_norm": 0.3231734335422516, "learning_rate": 2.52051113908396e-05, "loss": 1.5126, "mean_token_accuracy": 0.7270072996616364, "num_tokens": 178683622.0, "step": 946 }, { "epoch": 1.3029259896729777, "grad_norm": 0.7814964056015015, "learning_rate": 2.5176380902050418e-05, "loss": 2.0216, "mean_token_accuracy": 0.6078179702162743, "num_tokens": 178951698.0, "step": 947 }, { "epoch": 1.304302925989673, "grad_norm": 0.8092551827430725, "learning_rate": 2.5147638959765362e-05, "loss": 1.9031, "mean_token_accuracy": 0.630603663623333, "num_tokens": 179151347.0, "step": 948 }, { "epoch": 1.3056798623063683, "grad_norm": 0.8497322201728821, "learning_rate": 2.5118885627580155e-05, "loss": 1.6507, "mean_token_accuracy": 0.6691940277814865, "num_tokens": 179323593.0, "step": 949 }, { "epoch": 1.3070567986230637, "grad_norm": 0.869314968585968, "learning_rate": 2.5090120969115725e-05, "loss": 1.3863, "mean_token_accuracy": 0.7170169204473495, "num_tokens": 179479090.0, "step": 950 }, { "epoch": 1.308433734939759, "grad_norm": 0.8826742768287659, "learning_rate": 2.506134504801807e-05, "loss": 1.1991, "mean_token_accuracy": 0.7480077594518661, "num_tokens": 179621590.0, "step": 951 }, { "epoch": 1.3098106712564543, "grad_norm": 1.1172997951507568, "learning_rate": 2.5032557927958116e-05, "loss": 1.2293, "mean_token_accuracy": 0.7444333210587502, "num_tokens": 179745702.0, "step": 952 }, { "epoch": 1.3111876075731497, "grad_norm": 0.6922846436500549, "learning_rate": 2.500375967263153e-05, "loss": 1.8037, "mean_token_accuracy": 0.6363579407334328, "num_tokens": 180074686.0, "step": 953 }, { "epoch": 1.3125645438898452, "grad_norm": 0.8014053702354431, "learning_rate": 2.4974950345758653e-05, "loss": 1.9685, "mean_token_accuracy": 0.6181764602661133, "num_tokens": 180285464.0, "step": 954 }, { "epoch": 1.3139414802065406, "grad_norm": 0.844632089138031, "learning_rate": 2.494613001108431e-05, "loss": 1.6947, "mean_token_accuracy": 0.6617102473974228, "num_tokens": 180463013.0, "step": 955 }, { "epoch": 1.315318416523236, "grad_norm": 0.8726085424423218, "learning_rate": 2.4917298732377694e-05, "loss": 1.4117, "mean_token_accuracy": 0.7089902684092522, "num_tokens": 180621704.0, "step": 956 }, { "epoch": 1.3166953528399312, "grad_norm": 0.8781647086143494, "learning_rate": 2.488845657343219e-05, "loss": 1.2584, "mean_token_accuracy": 0.7364067286252975, "num_tokens": 180767417.0, "step": 957 }, { "epoch": 1.3180722891566266, "grad_norm": 1.0091910362243652, "learning_rate": 2.485960359806528e-05, "loss": 1.1881, "mean_token_accuracy": 0.7561876475811005, "num_tokens": 180897750.0, "step": 958 }, { "epoch": 1.3194492254733219, "grad_norm": 0.766934335231781, "learning_rate": 2.483073987011837e-05, "loss": 1.8039, "mean_token_accuracy": 0.6482538059353828, "num_tokens": 181192671.0, "step": 959 }, { "epoch": 1.3208261617900172, "grad_norm": 0.74049311876297, "learning_rate": 2.4801865453456647e-05, "loss": 1.9954, "mean_token_accuracy": 0.6151130050420761, "num_tokens": 181417146.0, "step": 960 }, { "epoch": 1.3222030981067125, "grad_norm": 0.8175472617149353, "learning_rate": 2.4772980411968975e-05, "loss": 1.7578, "mean_token_accuracy": 0.65184485912323, "num_tokens": 181600613.0, "step": 961 }, { "epoch": 1.3235800344234079, "grad_norm": 0.8665565252304077, "learning_rate": 2.47440848095677e-05, "loss": 1.4942, "mean_token_accuracy": 0.6947615742683411, "num_tokens": 181763528.0, "step": 962 }, { "epoch": 1.3249569707401032, "grad_norm": 0.8957054615020752, "learning_rate": 2.471517871018855e-05, "loss": 1.2887, "mean_token_accuracy": 0.7323865294456482, "num_tokens": 181912536.0, "step": 963 }, { "epoch": 1.3263339070567985, "grad_norm": 0.9280022978782654, "learning_rate": 2.468626217779047e-05, "loss": 1.1585, "mean_token_accuracy": 0.7608812600374222, "num_tokens": 182048170.0, "step": 964 }, { "epoch": 1.3277108433734939, "grad_norm": 0.6526057124137878, "learning_rate": 2.46573352763555e-05, "loss": 1.6827, "mean_token_accuracy": 0.6931243240833282, "num_tokens": 182300911.0, "step": 965 }, { "epoch": 1.3290877796901892, "grad_norm": 0.7506466507911682, "learning_rate": 2.4628398069888625e-05, "loss": 2.0272, "mean_token_accuracy": 0.6105439215898514, "num_tokens": 182541532.0, "step": 966 }, { "epoch": 1.3304647160068848, "grad_norm": 0.8224135041236877, "learning_rate": 2.4599450622417615e-05, "loss": 1.8568, "mean_token_accuracy": 0.6367796957492828, "num_tokens": 182732054.0, "step": 967 }, { "epoch": 1.33184165232358, "grad_norm": 0.8707013130187988, "learning_rate": 2.457049299799291e-05, "loss": 1.5749, "mean_token_accuracy": 0.68394885212183, "num_tokens": 182898955.0, "step": 968 }, { "epoch": 1.3332185886402754, "grad_norm": 0.8659912943840027, "learning_rate": 2.4541525260687468e-05, "loss": 1.3213, "mean_token_accuracy": 0.7269002050161362, "num_tokens": 183051035.0, "step": 969 }, { "epoch": 1.3345955249569708, "grad_norm": 0.9058789014816284, "learning_rate": 2.4512547474596624e-05, "loss": 1.1818, "mean_token_accuracy": 0.7513315975666046, "num_tokens": 183190622.0, "step": 970 }, { "epoch": 1.335972461273666, "grad_norm": 0.3078063130378723, "learning_rate": 2.4483559703837943e-05, "loss": 1.521, "mean_token_accuracy": 0.7238981798291206, "num_tokens": 183401884.0, "step": 971 }, { "epoch": 1.3373493975903614, "grad_norm": 0.7920054197311401, "learning_rate": 2.4454562012551088e-05, "loss": 1.9777, "mean_token_accuracy": 0.6156459152698517, "num_tokens": 183671654.0, "step": 972 }, { "epoch": 1.3387263339070568, "grad_norm": 0.8109396696090698, "learning_rate": 2.4425554464897675e-05, "loss": 1.8982, "mean_token_accuracy": 0.6282585486769676, "num_tokens": 183871843.0, "step": 973 }, { "epoch": 1.340103270223752, "grad_norm": 0.8370656371116638, "learning_rate": 2.4396537125061112e-05, "loss": 1.6302, "mean_token_accuracy": 0.6729145273566246, "num_tokens": 184043820.0, "step": 974 }, { "epoch": 1.3414802065404476, "grad_norm": 0.8438716530799866, "learning_rate": 2.4367510057246492e-05, "loss": 1.3667, "mean_token_accuracy": 0.7191689237952232, "num_tokens": 184199361.0, "step": 975 }, { "epoch": 1.342857142857143, "grad_norm": 0.8825638294219971, "learning_rate": 2.433847332568042e-05, "loss": 1.1934, "mean_token_accuracy": 0.7488013207912445, "num_tokens": 184342413.0, "step": 976 }, { "epoch": 1.3442340791738383, "grad_norm": 1.1351597309112549, "learning_rate": 2.430942699461091e-05, "loss": 1.2464, "mean_token_accuracy": 0.7440810203552246, "num_tokens": 184467094.0, "step": 977 }, { "epoch": 1.3456110154905336, "grad_norm": 0.7818828225135803, "learning_rate": 2.4280371128307168e-05, "loss": 1.8094, "mean_token_accuracy": 0.6330305486917496, "num_tokens": 184798262.0, "step": 978 }, { "epoch": 1.346987951807229, "grad_norm": 0.8132795691490173, "learning_rate": 2.4251305791059533e-05, "loss": 1.9811, "mean_token_accuracy": 0.6168111115694046, "num_tokens": 185009682.0, "step": 979 }, { "epoch": 1.3483648881239243, "grad_norm": 0.8436702489852905, "learning_rate": 2.4222231047179303e-05, "loss": 1.7008, "mean_token_accuracy": 0.6620882153511047, "num_tokens": 185187887.0, "step": 980 }, { "epoch": 1.3497418244406196, "grad_norm": 0.8626536726951599, "learning_rate": 2.419314696099858e-05, "loss": 1.4158, "mean_token_accuracy": 0.7108184024691582, "num_tokens": 185347047.0, "step": 981 }, { "epoch": 1.351118760757315, "grad_norm": 0.8714147806167603, "learning_rate": 2.416405359687012e-05, "loss": 1.2298, "mean_token_accuracy": 0.7460647374391556, "num_tokens": 185493070.0, "step": 982 }, { "epoch": 1.3524956970740103, "grad_norm": 0.9871793985366821, "learning_rate": 2.4134951019167235e-05, "loss": 1.1698, "mean_token_accuracy": 0.7529937028884888, "num_tokens": 185624283.0, "step": 983 }, { "epoch": 1.3538726333907056, "grad_norm": 0.8183675408363342, "learning_rate": 2.4105839292283605e-05, "loss": 1.7646, "mean_token_accuracy": 0.6526894122362137, "num_tokens": 185917612.0, "step": 984 }, { "epoch": 1.355249569707401, "grad_norm": 0.7811689972877502, "learning_rate": 2.4076718480633178e-05, "loss": 1.9603, "mean_token_accuracy": 0.6200869083404541, "num_tokens": 186142721.0, "step": 985 }, { "epoch": 1.3566265060240963, "grad_norm": 0.8290724754333496, "learning_rate": 2.4047588648649968e-05, "loss": 1.7652, "mean_token_accuracy": 0.6529950425028801, "num_tokens": 186327089.0, "step": 986 }, { "epoch": 1.3580034423407916, "grad_norm": 0.8798555731773376, "learning_rate": 2.4018449860787977e-05, "loss": 1.5283, "mean_token_accuracy": 0.6928371116518974, "num_tokens": 186490468.0, "step": 987 }, { "epoch": 1.359380378657487, "grad_norm": 0.8853054046630859, "learning_rate": 2.398930218152101e-05, "loss": 1.2727, "mean_token_accuracy": 0.7347674891352654, "num_tokens": 186639639.0, "step": 988 }, { "epoch": 1.3607573149741825, "grad_norm": 0.9417387247085571, "learning_rate": 2.3960145675342544e-05, "loss": 1.1788, "mean_token_accuracy": 0.7542718946933746, "num_tokens": 186775721.0, "step": 989 }, { "epoch": 1.3621342512908778, "grad_norm": 0.711798906326294, "learning_rate": 2.3930980406765598e-05, "loss": 1.6265, "mean_token_accuracy": 0.6951203420758247, "num_tokens": 187033998.0, "step": 990 }, { "epoch": 1.3635111876075732, "grad_norm": 0.8221349120140076, "learning_rate": 2.390180644032257e-05, "loss": 2.0355, "mean_token_accuracy": 0.608329564332962, "num_tokens": 187278763.0, "step": 991 }, { "epoch": 1.3648881239242685, "grad_norm": 0.8340291976928711, "learning_rate": 2.38726238405651e-05, "loss": 1.8298, "mean_token_accuracy": 0.6403230354189873, "num_tokens": 187471917.0, "step": 992 }, { "epoch": 1.3662650602409638, "grad_norm": 0.8547493815422058, "learning_rate": 2.3843432672063944e-05, "loss": 1.5617, "mean_token_accuracy": 0.6828739121556282, "num_tokens": 187638993.0, "step": 993 }, { "epoch": 1.3676419965576592, "grad_norm": 0.8788800239562988, "learning_rate": 2.38142329994088e-05, "loss": 1.3406, "mean_token_accuracy": 0.7248962223529816, "num_tokens": 187791054.0, "step": 994 }, { "epoch": 1.3690189328743545, "grad_norm": 0.9118287563323975, "learning_rate": 2.3785024887208207e-05, "loss": 1.1561, "mean_token_accuracy": 0.7573669701814651, "num_tokens": 187930668.0, "step": 995 }, { "epoch": 1.3703958691910498, "grad_norm": 0.3151678144931793, "learning_rate": 2.3755808400089347e-05, "loss": 1.4347, "mean_token_accuracy": 0.7338683530688286, "num_tokens": 188140529.0, "step": 996 }, { "epoch": 1.3717728055077454, "grad_norm": 0.8552554249763489, "learning_rate": 2.372658360269796e-05, "loss": 2.0108, "mean_token_accuracy": 0.6119868010282516, "num_tokens": 188408266.0, "step": 997 }, { "epoch": 1.3731497418244407, "grad_norm": 0.8353647589683533, "learning_rate": 2.3697350559698156e-05, "loss": 1.8835, "mean_token_accuracy": 0.6295699998736382, "num_tokens": 188606981.0, "step": 998 }, { "epoch": 1.374526678141136, "grad_norm": 0.8378680944442749, "learning_rate": 2.3668109335772293e-05, "loss": 1.6301, "mean_token_accuracy": 0.6736423820257187, "num_tokens": 188778208.0, "step": 999 }, { "epoch": 1.3759036144578314, "grad_norm": 0.8767390251159668, "learning_rate": 2.363885999562084e-05, "loss": 1.3654, "mean_token_accuracy": 0.7196676284074783, "num_tokens": 188933661.0, "step": 1000 }, { "epoch": 1.3772805507745267, "grad_norm": 0.8790720701217651, "learning_rate": 2.3609602603962217e-05, "loss": 1.1855, "mean_token_accuracy": 0.7516512498259544, "num_tokens": 189076424.0, "step": 1001 }, { "epoch": 1.378657487091222, "grad_norm": 1.1123323440551758, "learning_rate": 2.3580337225532663e-05, "loss": 1.242, "mean_token_accuracy": 0.7398405820131302, "num_tokens": 189200818.0, "step": 1002 }, { "epoch": 1.3800344234079174, "grad_norm": 0.7432454228401184, "learning_rate": 2.3551063925086072e-05, "loss": 1.8003, "mean_token_accuracy": 0.6367504745721817, "num_tokens": 189529458.0, "step": 1003 }, { "epoch": 1.3814113597246127, "grad_norm": 0.8134406805038452, "learning_rate": 2.3521782767393883e-05, "loss": 1.93, "mean_token_accuracy": 0.626864604651928, "num_tokens": 189740543.0, "step": 1004 }, { "epoch": 1.382788296041308, "grad_norm": 0.8540675044059753, "learning_rate": 2.3492493817244933e-05, "loss": 1.6774, "mean_token_accuracy": 0.6661203280091286, "num_tokens": 189917694.0, "step": 1005 }, { "epoch": 1.3841652323580034, "grad_norm": 0.8631286025047302, "learning_rate": 2.3463197139445284e-05, "loss": 1.3972, "mean_token_accuracy": 0.7148472517728806, "num_tokens": 190076558.0, "step": 1006 }, { "epoch": 1.3855421686746987, "grad_norm": 0.8984408378601074, "learning_rate": 2.3433892798818078e-05, "loss": 1.2434, "mean_token_accuracy": 0.7414769530296326, "num_tokens": 190222551.0, "step": 1007 }, { "epoch": 1.386919104991394, "grad_norm": 0.9857003092765808, "learning_rate": 2.340458086020345e-05, "loss": 1.1408, "mean_token_accuracy": 0.7659621983766556, "num_tokens": 190353618.0, "step": 1008 }, { "epoch": 1.3882960413080894, "grad_norm": 0.7427212595939636, "learning_rate": 2.3375261388458318e-05, "loss": 1.7533, "mean_token_accuracy": 0.6593968644738197, "num_tokens": 190646480.0, "step": 1009 }, { "epoch": 1.389672977624785, "grad_norm": 0.7846319675445557, "learning_rate": 2.3345934448456297e-05, "loss": 1.9699, "mean_token_accuracy": 0.6189417913556099, "num_tokens": 190869428.0, "step": 1010 }, { "epoch": 1.3910499139414803, "grad_norm": 0.8518584966659546, "learning_rate": 2.3316600105087484e-05, "loss": 1.7486, "mean_token_accuracy": 0.6549213454127312, "num_tokens": 191053441.0, "step": 1011 }, { "epoch": 1.3924268502581756, "grad_norm": 0.8752524852752686, "learning_rate": 2.3287258423258405e-05, "loss": 1.5006, "mean_token_accuracy": 0.6921668872237206, "num_tokens": 191216412.0, "step": 1012 }, { "epoch": 1.393803786574871, "grad_norm": 0.8808018565177917, "learning_rate": 2.325790946789178e-05, "loss": 1.255, "mean_token_accuracy": 0.7382014319300652, "num_tokens": 191365554.0, "step": 1013 }, { "epoch": 1.3951807228915662, "grad_norm": 0.9355611205101013, "learning_rate": 2.322855330392645e-05, "loss": 1.1288, "mean_token_accuracy": 0.7636255249381065, "num_tokens": 191501613.0, "step": 1014 }, { "epoch": 1.3965576592082616, "grad_norm": 0.6678423881530762, "learning_rate": 2.3199189996317205e-05, "loss": 1.6539, "mean_token_accuracy": 0.6909663006663322, "num_tokens": 191756933.0, "step": 1015 }, { "epoch": 1.397934595524957, "grad_norm": 0.7983126640319824, "learning_rate": 2.3169819610034635e-05, "loss": 1.9902, "mean_token_accuracy": 0.6143665388226509, "num_tokens": 192000100.0, "step": 1016 }, { "epoch": 1.3993115318416522, "grad_norm": 0.825959324836731, "learning_rate": 2.3140442210064982e-05, "loss": 1.7953, "mean_token_accuracy": 0.648130513727665, "num_tokens": 192190941.0, "step": 1017 }, { "epoch": 1.4006884681583478, "grad_norm": 0.8509828448295593, "learning_rate": 2.3111057861410026e-05, "loss": 1.5264, "mean_token_accuracy": 0.6879749670624733, "num_tokens": 192358094.0, "step": 1018 }, { "epoch": 1.4020654044750431, "grad_norm": 0.882448673248291, "learning_rate": 2.3081666629086918e-05, "loss": 1.308, "mean_token_accuracy": 0.7282876148819923, "num_tokens": 192510491.0, "step": 1019 }, { "epoch": 1.4034423407917385, "grad_norm": 0.923114001750946, "learning_rate": 2.3052268578128025e-05, "loss": 1.1875, "mean_token_accuracy": 0.7531473934650421, "num_tokens": 192650318.0, "step": 1020 }, { "epoch": 1.4048192771084338, "grad_norm": 0.3180157244205475, "learning_rate": 2.3022863773580813e-05, "loss": 1.4722, "mean_token_accuracy": 0.7323845848441124, "num_tokens": 192857220.0, "step": 1021 }, { "epoch": 1.4061962134251291, "grad_norm": 0.8509544134140015, "learning_rate": 2.2993452280507708e-05, "loss": 1.9587, "mean_token_accuracy": 0.6176796555519104, "num_tokens": 193125754.0, "step": 1022 }, { "epoch": 1.4075731497418245, "grad_norm": 0.8282464742660522, "learning_rate": 2.296403416398589e-05, "loss": 1.852, "mean_token_accuracy": 0.6399711966514587, "num_tokens": 193326241.0, "step": 1023 }, { "epoch": 1.4089500860585198, "grad_norm": 0.8569718599319458, "learning_rate": 2.2934609489107236e-05, "loss": 1.6164, "mean_token_accuracy": 0.6732972785830498, "num_tokens": 193498605.0, "step": 1024 }, { "epoch": 1.4103270223752151, "grad_norm": 0.8601506352424622, "learning_rate": 2.2905178320978126e-05, "loss": 1.3521, "mean_token_accuracy": 0.7227311283349991, "num_tokens": 193654116.0, "step": 1025 }, { "epoch": 1.4117039586919105, "grad_norm": 0.9177577495574951, "learning_rate": 2.2875740724719294e-05, "loss": 1.2126, "mean_token_accuracy": 0.7441741898655891, "num_tokens": 193796821.0, "step": 1026 }, { "epoch": 1.4130808950086058, "grad_norm": 1.112290620803833, "learning_rate": 2.2846296765465708e-05, "loss": 1.2167, "mean_token_accuracy": 0.7471260726451874, "num_tokens": 193921393.0, "step": 1027 }, { "epoch": 1.4144578313253011, "grad_norm": 0.7228164672851562, "learning_rate": 2.2816846508366407e-05, "loss": 1.8012, "mean_token_accuracy": 0.6331262364983559, "num_tokens": 194251515.0, "step": 1028 }, { "epoch": 1.4158347676419965, "grad_norm": 0.7793260812759399, "learning_rate": 2.278739001858437e-05, "loss": 1.9363, "mean_token_accuracy": 0.6228564158082008, "num_tokens": 194463020.0, "step": 1029 }, { "epoch": 1.4172117039586918, "grad_norm": 0.8345648050308228, "learning_rate": 2.2757927361296376e-05, "loss": 1.7105, "mean_token_accuracy": 0.6602297574281693, "num_tokens": 194640587.0, "step": 1030 }, { "epoch": 1.4185886402753873, "grad_norm": 0.8697944283485413, "learning_rate": 2.272845860169283e-05, "loss": 1.413, "mean_token_accuracy": 0.7103829905390739, "num_tokens": 194799135.0, "step": 1031 }, { "epoch": 1.4199655765920827, "grad_norm": 0.8911084532737732, "learning_rate": 2.2698983804977654e-05, "loss": 1.2181, "mean_token_accuracy": 0.7462772205471992, "num_tokens": 194944703.0, "step": 1032 }, { "epoch": 1.421342512908778, "grad_norm": 1.0021148920059204, "learning_rate": 2.2669503036368124e-05, "loss": 1.1128, "mean_token_accuracy": 0.7658976316452026, "num_tokens": 195075662.0, "step": 1033 }, { "epoch": 1.4227194492254733, "grad_norm": 0.7517693042755127, "learning_rate": 2.2640016361094733e-05, "loss": 1.7353, "mean_token_accuracy": 0.6601576432585716, "num_tokens": 195367482.0, "step": 1034 }, { "epoch": 1.4240963855421687, "grad_norm": 0.7775019407272339, "learning_rate": 2.261052384440104e-05, "loss": 1.9702, "mean_token_accuracy": 0.6190107613801956, "num_tokens": 195591483.0, "step": 1035 }, { "epoch": 1.425473321858864, "grad_norm": 0.8401229381561279, "learning_rate": 2.2581025551543516e-05, "loss": 1.7451, "mean_token_accuracy": 0.6532518938183784, "num_tokens": 195775799.0, "step": 1036 }, { "epoch": 1.4268502581755593, "grad_norm": 0.8811124563217163, "learning_rate": 2.2551521547791443e-05, "loss": 1.4883, "mean_token_accuracy": 0.6995291635394096, "num_tokens": 195938615.0, "step": 1037 }, { "epoch": 1.4282271944922547, "grad_norm": 0.8872791528701782, "learning_rate": 2.252201189842671e-05, "loss": 1.266, "mean_token_accuracy": 0.7347011864185333, "num_tokens": 196087453.0, "step": 1038 }, { "epoch": 1.4296041308089502, "grad_norm": 0.9557095766067505, "learning_rate": 2.249249666874372e-05, "loss": 1.1635, "mean_token_accuracy": 0.7564141824841499, "num_tokens": 196223189.0, "step": 1039 }, { "epoch": 1.4309810671256455, "grad_norm": 0.6854400038719177, "learning_rate": 2.246297592404921e-05, "loss": 1.6423, "mean_token_accuracy": 0.6934361383318901, "num_tokens": 196479058.0, "step": 1040 }, { "epoch": 1.4323580034423409, "grad_norm": 0.805069625377655, "learning_rate": 2.2433449729662114e-05, "loss": 2.0265, "mean_token_accuracy": 0.6100568324327469, "num_tokens": 196722845.0, "step": 1041 }, { "epoch": 1.4337349397590362, "grad_norm": 0.8354216814041138, "learning_rate": 2.240391815091344e-05, "loss": 1.8255, "mean_token_accuracy": 0.6408760845661163, "num_tokens": 196915422.0, "step": 1042 }, { "epoch": 1.4351118760757315, "grad_norm": 0.8761371374130249, "learning_rate": 2.2374381253146105e-05, "loss": 1.5416, "mean_token_accuracy": 0.68685432523489, "num_tokens": 197083030.0, "step": 1043 }, { "epoch": 1.4364888123924269, "grad_norm": 0.853762686252594, "learning_rate": 2.2344839101714793e-05, "loss": 1.2716, "mean_token_accuracy": 0.7363938614726067, "num_tokens": 197235281.0, "step": 1044 }, { "epoch": 1.4378657487091222, "grad_norm": 0.9135344624519348, "learning_rate": 2.2315291761985803e-05, "loss": 1.1675, "mean_token_accuracy": 0.754224069416523, "num_tokens": 197374896.0, "step": 1045 }, { "epoch": 1.4392426850258175, "grad_norm": 0.31827613711357117, "learning_rate": 2.2285739299336933e-05, "loss": 1.4775, "mean_token_accuracy": 0.7233272269368172, "num_tokens": 197584901.0, "step": 1046 }, { "epoch": 1.4406196213425129, "grad_norm": 0.830418050289154, "learning_rate": 2.2256181779157297e-05, "loss": 1.9454, "mean_token_accuracy": 0.6219183430075645, "num_tokens": 197856195.0, "step": 1047 }, { "epoch": 1.4419965576592082, "grad_norm": 0.8262435793876648, "learning_rate": 2.222661926684722e-05, "loss": 1.836, "mean_token_accuracy": 0.6403505131602287, "num_tokens": 198056223.0, "step": 1048 }, { "epoch": 1.4433734939759035, "grad_norm": 0.8405689597129822, "learning_rate": 2.2197051827818053e-05, "loss": 1.601, "mean_token_accuracy": 0.6767699420452118, "num_tokens": 198228741.0, "step": 1049 }, { "epoch": 1.4447504302925989, "grad_norm": 0.8873103857040405, "learning_rate": 2.2167479527492058e-05, "loss": 1.3722, "mean_token_accuracy": 0.716407023370266, "num_tokens": 198384448.0, "step": 1050 }, { "epoch": 1.4461273666092942, "grad_norm": 0.9040471911430359, "learning_rate": 2.2137902431302264e-05, "loss": 1.1957, "mean_token_accuracy": 0.752019964158535, "num_tokens": 198527433.0, "step": 1051 }, { "epoch": 1.4475043029259897, "grad_norm": 1.110107421875, "learning_rate": 2.2108320604692275e-05, "loss": 1.237, "mean_token_accuracy": 0.7402260005474091, "num_tokens": 198652373.0, "step": 1052 }, { "epoch": 1.448881239242685, "grad_norm": 0.7106909155845642, "learning_rate": 2.2078734113116207e-05, "loss": 1.7549, "mean_token_accuracy": 0.6435808688402176, "num_tokens": 198989311.0, "step": 1053 }, { "epoch": 1.4502581755593804, "grad_norm": 0.8192483186721802, "learning_rate": 2.2049143022038472e-05, "loss": 1.9168, "mean_token_accuracy": 0.6223702803254128, "num_tokens": 199200950.0, "step": 1054 }, { "epoch": 1.4516351118760757, "grad_norm": 0.8310084342956543, "learning_rate": 2.2019547396933666e-05, "loss": 1.6539, "mean_token_accuracy": 0.6676526740193367, "num_tokens": 199378981.0, "step": 1055 }, { "epoch": 1.453012048192771, "grad_norm": 0.8648124933242798, "learning_rate": 2.1989947303286408e-05, "loss": 1.3942, "mean_token_accuracy": 0.7140700444579124, "num_tokens": 199537708.0, "step": 1056 }, { "epoch": 1.4543889845094664, "grad_norm": 0.9147064089775085, "learning_rate": 2.196034280659122e-05, "loss": 1.2172, "mean_token_accuracy": 0.7447221055626869, "num_tokens": 199683493.0, "step": 1057 }, { "epoch": 1.4557659208261617, "grad_norm": 1.0322288274765015, "learning_rate": 2.1930733972352343e-05, "loss": 1.1694, "mean_token_accuracy": 0.7564518228173256, "num_tokens": 199813938.0, "step": 1058 }, { "epoch": 1.457142857142857, "grad_norm": 0.7404462099075317, "learning_rate": 2.190112086608365e-05, "loss": 1.7831, "mean_token_accuracy": 0.6507120877504349, "num_tokens": 200107476.0, "step": 1059 }, { "epoch": 1.4585197934595526, "grad_norm": 0.7646079659461975, "learning_rate": 2.1871503553308447e-05, "loss": 1.9878, "mean_token_accuracy": 0.6141515523195267, "num_tokens": 200331877.0, "step": 1060 }, { "epoch": 1.459896729776248, "grad_norm": 0.8174039721488953, "learning_rate": 2.1841882099559334e-05, "loss": 1.7176, "mean_token_accuracy": 0.6611126363277435, "num_tokens": 200515301.0, "step": 1061 }, { "epoch": 1.4612736660929433, "grad_norm": 0.8613215088844299, "learning_rate": 2.1812256570378096e-05, "loss": 1.4389, "mean_token_accuracy": 0.704386442899704, "num_tokens": 200677940.0, "step": 1062 }, { "epoch": 1.4626506024096386, "grad_norm": 0.8652694225311279, "learning_rate": 2.178262703131552e-05, "loss": 1.2452, "mean_token_accuracy": 0.7408580183982849, "num_tokens": 200826484.0, "step": 1063 }, { "epoch": 1.464027538726334, "grad_norm": 0.9338531494140625, "learning_rate": 2.1752993547931283e-05, "loss": 1.1226, "mean_token_accuracy": 0.7615847438573837, "num_tokens": 200961824.0, "step": 1064 }, { "epoch": 1.4654044750430293, "grad_norm": 0.7004619836807251, "learning_rate": 2.1723356185793762e-05, "loss": 1.6295, "mean_token_accuracy": 0.6955777332186699, "num_tokens": 201217574.0, "step": 1065 }, { "epoch": 1.4667814113597246, "grad_norm": 0.7759816646575928, "learning_rate": 2.169371501047995e-05, "loss": 2.0059, "mean_token_accuracy": 0.6135530546307564, "num_tokens": 201459386.0, "step": 1066 }, { "epoch": 1.46815834767642, "grad_norm": 0.8169485926628113, "learning_rate": 2.166407008757525e-05, "loss": 1.7847, "mean_token_accuracy": 0.6457942128181458, "num_tokens": 201649681.0, "step": 1067 }, { "epoch": 1.4695352839931153, "grad_norm": 0.8518682718276978, "learning_rate": 2.1634421482673368e-05, "loss": 1.5505, "mean_token_accuracy": 0.6893503740429878, "num_tokens": 201816398.0, "step": 1068 }, { "epoch": 1.4709122203098106, "grad_norm": 0.8751932382583618, "learning_rate": 2.160476926137616e-05, "loss": 1.2666, "mean_token_accuracy": 0.7346247658133507, "num_tokens": 201968106.0, "step": 1069 }, { "epoch": 1.472289156626506, "grad_norm": 0.9084135293960571, "learning_rate": 2.1575113489293473e-05, "loss": 1.1539, "mean_token_accuracy": 0.7573561295866966, "num_tokens": 202107483.0, "step": 1070 }, { "epoch": 1.4736660929432013, "grad_norm": 0.31062647700309753, "learning_rate": 2.1545454232043026e-05, "loss": 1.4604, "mean_token_accuracy": 0.7338655665516853, "num_tokens": 202318963.0, "step": 1071 }, { "epoch": 1.4750430292598966, "grad_norm": 0.7858709096908569, "learning_rate": 2.1515791555250236e-05, "loss": 1.9621, "mean_token_accuracy": 0.6192703768610954, "num_tokens": 202588049.0, "step": 1072 }, { "epoch": 1.4764199655765922, "grad_norm": 0.8299309015274048, "learning_rate": 2.1486125524548093e-05, "loss": 1.8726, "mean_token_accuracy": 0.6334018930792809, "num_tokens": 202788856.0, "step": 1073 }, { "epoch": 1.4777969018932875, "grad_norm": 0.8427513837814331, "learning_rate": 2.1456456205577e-05, "loss": 1.5654, "mean_token_accuracy": 0.6857735514640808, "num_tokens": 202961125.0, "step": 1074 }, { "epoch": 1.4791738382099828, "grad_norm": 0.8715685606002808, "learning_rate": 2.1426783663984648e-05, "loss": 1.3597, "mean_token_accuracy": 0.722464993596077, "num_tokens": 203116729.0, "step": 1075 }, { "epoch": 1.4805507745266782, "grad_norm": 0.8905746340751648, "learning_rate": 2.1397107965425857e-05, "loss": 1.1495, "mean_token_accuracy": 0.7583421692252159, "num_tokens": 203259525.0, "step": 1076 }, { "epoch": 1.4819277108433735, "grad_norm": 1.1445410251617432, "learning_rate": 2.136742917556242e-05, "loss": 1.2477, "mean_token_accuracy": 0.7398752123117447, "num_tokens": 203383801.0, "step": 1077 }, { "epoch": 1.4833046471600688, "grad_norm": 0.6792526841163635, "learning_rate": 2.133774736006297e-05, "loss": 1.7457, "mean_token_accuracy": 0.6451440006494522, "num_tokens": 203716319.0, "step": 1078 }, { "epoch": 1.4846815834767642, "grad_norm": 0.7797232866287231, "learning_rate": 2.1308062584602865e-05, "loss": 1.9517, "mean_token_accuracy": 0.6205267682671547, "num_tokens": 203928807.0, "step": 1079 }, { "epoch": 1.4860585197934595, "grad_norm": 0.8275617361068726, "learning_rate": 2.1278374914863974e-05, "loss": 1.6576, "mean_token_accuracy": 0.666592001914978, "num_tokens": 204106562.0, "step": 1080 }, { "epoch": 1.487435456110155, "grad_norm": 0.870800793170929, "learning_rate": 2.1248684416534586e-05, "loss": 1.4162, "mean_token_accuracy": 0.71072768419981, "num_tokens": 204265323.0, "step": 1081 }, { "epoch": 1.4888123924268504, "grad_norm": 0.8660578727722168, "learning_rate": 2.1218991155309244e-05, "loss": 1.2192, "mean_token_accuracy": 0.7452527582645416, "num_tokens": 204411136.0, "step": 1082 }, { "epoch": 1.4901893287435457, "grad_norm": 0.9886125922203064, "learning_rate": 2.1189295196888624e-05, "loss": 1.138, "mean_token_accuracy": 0.7623933330178261, "num_tokens": 204541958.0, "step": 1083 }, { "epoch": 1.491566265060241, "grad_norm": 0.7864317297935486, "learning_rate": 2.115959660697935e-05, "loss": 1.7712, "mean_token_accuracy": 0.6528060510754585, "num_tokens": 204834644.0, "step": 1084 }, { "epoch": 1.4929432013769364, "grad_norm": 0.7883524298667908, "learning_rate": 2.112989545129386e-05, "loss": 1.9989, "mean_token_accuracy": 0.6138142123818398, "num_tokens": 205060739.0, "step": 1085 }, { "epoch": 1.4943201376936317, "grad_norm": 0.825107991695404, "learning_rate": 2.1100191795550292e-05, "loss": 1.7594, "mean_token_accuracy": 0.651285320520401, "num_tokens": 205246160.0, "step": 1086 }, { "epoch": 1.495697074010327, "grad_norm": 0.8643438816070557, "learning_rate": 2.1070485705472305e-05, "loss": 1.462, "mean_token_accuracy": 0.7049982324242592, "num_tokens": 205409610.0, "step": 1087 }, { "epoch": 1.4970740103270224, "grad_norm": 0.890242874622345, "learning_rate": 2.1040777246788952e-05, "loss": 1.2761, "mean_token_accuracy": 0.7313218414783478, "num_tokens": 205559073.0, "step": 1088 }, { "epoch": 1.4984509466437177, "grad_norm": 0.9313849210739136, "learning_rate": 2.101106648523451e-05, "loss": 1.1217, "mean_token_accuracy": 0.7658671662211418, "num_tokens": 205694737.0, "step": 1089 }, { "epoch": 1.499827882960413, "grad_norm": 0.7431533336639404, "learning_rate": 2.0981353486548363e-05, "loss": 1.6383, "mean_token_accuracy": 0.6938622519373894, "num_tokens": 205954727.0, "step": 1090 }, { "epoch": 1.5012048192771084, "grad_norm": 0.7653607130050659, "learning_rate": 2.095163831647485e-05, "loss": 1.9553, "mean_token_accuracy": 0.6201811283826828, "num_tokens": 206198595.0, "step": 1091 }, { "epoch": 1.5025817555938037, "grad_norm": 0.8297497034072876, "learning_rate": 2.09219210407631e-05, "loss": 1.8351, "mean_token_accuracy": 0.639694832265377, "num_tokens": 206389926.0, "step": 1092 }, { "epoch": 1.503958691910499, "grad_norm": 0.8594647645950317, "learning_rate": 2.0892201725166918e-05, "loss": 1.5144, "mean_token_accuracy": 0.6905359774827957, "num_tokens": 206556624.0, "step": 1093 }, { "epoch": 1.5053356282271944, "grad_norm": 0.89158034324646, "learning_rate": 2.086248043544461e-05, "loss": 1.2732, "mean_token_accuracy": 0.7353991866111755, "num_tokens": 206708391.0, "step": 1094 }, { "epoch": 1.5067125645438897, "grad_norm": 0.915770947933197, "learning_rate": 2.0832757237358853e-05, "loss": 1.1568, "mean_token_accuracy": 0.7563420012593269, "num_tokens": 206847641.0, "step": 1095 }, { "epoch": 1.5080895008605852, "grad_norm": 0.3224357068538666, "learning_rate": 2.0803032196676542e-05, "loss": 1.4149, "mean_token_accuracy": 0.7317471280694008, "num_tokens": 207054755.0, "step": 1096 }, { "epoch": 1.5094664371772806, "grad_norm": 0.7832369208335876, "learning_rate": 2.077330537916866e-05, "loss": 1.9328, "mean_token_accuracy": 0.6233831569552422, "num_tokens": 207325005.0, "step": 1097 }, { "epoch": 1.510843373493976, "grad_norm": 0.8304225206375122, "learning_rate": 2.074357685061012e-05, "loss": 1.8603, "mean_token_accuracy": 0.6361274868249893, "num_tokens": 207524402.0, "step": 1098 }, { "epoch": 1.5122203098106712, "grad_norm": 0.8616681694984436, "learning_rate": 2.0713846676779613e-05, "loss": 1.5725, "mean_token_accuracy": 0.679482065141201, "num_tokens": 207696039.0, "step": 1099 }, { "epoch": 1.5135972461273666, "grad_norm": 0.8834628462791443, "learning_rate": 2.0684114923459472e-05, "loss": 1.3473, "mean_token_accuracy": 0.7187673598527908, "num_tokens": 207851186.0, "step": 1100 }, { "epoch": 1.5149741824440621, "grad_norm": 0.8837085962295532, "learning_rate": 2.0654381656435526e-05, "loss": 1.1609, "mean_token_accuracy": 0.7569831684231758, "num_tokens": 207993828.0, "step": 1101 }, { "epoch": 1.5163511187607575, "grad_norm": 1.135230541229248, "learning_rate": 2.0624646941496957e-05, "loss": 1.1938, "mean_token_accuracy": 0.7481365650892258, "num_tokens": 208118433.0, "step": 1102 }, { "epoch": 1.5177280550774528, "grad_norm": 0.7040228247642517, "learning_rate": 2.059491084443615e-05, "loss": 1.741, "mean_token_accuracy": 0.6438729539513588, "num_tokens": 208447772.0, "step": 1103 }, { "epoch": 1.5191049913941481, "grad_norm": 0.7872399091720581, "learning_rate": 2.0565173431048538e-05, "loss": 1.9352, "mean_token_accuracy": 0.6246440187096596, "num_tokens": 208660330.0, "step": 1104 }, { "epoch": 1.5204819277108435, "grad_norm": 0.8301945328712463, "learning_rate": 2.0535434767132495e-05, "loss": 1.6775, "mean_token_accuracy": 0.6662969887256622, "num_tokens": 208838282.0, "step": 1105 }, { "epoch": 1.5218588640275388, "grad_norm": 0.8524273037910461, "learning_rate": 2.050569491848911e-05, "loss": 1.4081, "mean_token_accuracy": 0.7148371562361717, "num_tokens": 208997714.0, "step": 1106 }, { "epoch": 1.5232358003442341, "grad_norm": 0.8902197480201721, "learning_rate": 2.0475953950922148e-05, "loss": 1.2191, "mean_token_accuracy": 0.7436284944415092, "num_tokens": 209144104.0, "step": 1107 }, { "epoch": 1.5246127366609294, "grad_norm": 0.9735713005065918, "learning_rate": 2.0446211930237828e-05, "loss": 1.1255, "mean_token_accuracy": 0.7638668268918991, "num_tokens": 209275374.0, "step": 1108 }, { "epoch": 1.5259896729776248, "grad_norm": 0.7158445715904236, "learning_rate": 2.0416468922244688e-05, "loss": 1.7198, "mean_token_accuracy": 0.6610725000500679, "num_tokens": 209569549.0, "step": 1109 }, { "epoch": 1.52736660929432, "grad_norm": 0.7722089886665344, "learning_rate": 2.0386724992753465e-05, "loss": 1.9552, "mean_token_accuracy": 0.6209739074110985, "num_tokens": 209793407.0, "step": 1110 }, { "epoch": 1.5287435456110154, "grad_norm": 0.8398303389549255, "learning_rate": 2.0356980207576923e-05, "loss": 1.7245, "mean_token_accuracy": 0.6614594832062721, "num_tokens": 209976939.0, "step": 1111 }, { "epoch": 1.5301204819277108, "grad_norm": 0.8689599633216858, "learning_rate": 2.0327234632529738e-05, "loss": 1.4482, "mean_token_accuracy": 0.7050692215561867, "num_tokens": 210139686.0, "step": 1112 }, { "epoch": 1.531497418244406, "grad_norm": 0.8845296502113342, "learning_rate": 2.029748833342832e-05, "loss": 1.2418, "mean_token_accuracy": 0.7421463653445244, "num_tokens": 210288727.0, "step": 1113 }, { "epoch": 1.5328743545611014, "grad_norm": 0.9201565384864807, "learning_rate": 2.026774137609068e-05, "loss": 1.1195, "mean_token_accuracy": 0.767252154648304, "num_tokens": 210424502.0, "step": 1114 }, { "epoch": 1.5342512908777968, "grad_norm": 0.668174147605896, "learning_rate": 2.023799382633629e-05, "loss": 1.6223, "mean_token_accuracy": 0.6964156553149223, "num_tokens": 210681603.0, "step": 1115 }, { "epoch": 1.535628227194492, "grad_norm": 0.7670809626579285, "learning_rate": 2.0208245749985927e-05, "loss": 1.9963, "mean_token_accuracy": 0.6138638481497765, "num_tokens": 210923294.0, "step": 1116 }, { "epoch": 1.5370051635111877, "grad_norm": 0.8148202896118164, "learning_rate": 2.017849721286155e-05, "loss": 1.7824, "mean_token_accuracy": 0.6462766230106354, "num_tokens": 211114335.0, "step": 1117 }, { "epoch": 1.538382099827883, "grad_norm": 0.8645744323730469, "learning_rate": 2.014874828078612e-05, "loss": 1.5538, "mean_token_accuracy": 0.6869703233242035, "num_tokens": 211281333.0, "step": 1118 }, { "epoch": 1.5397590361445783, "grad_norm": 0.8560033440589905, "learning_rate": 2.0118999019583475e-05, "loss": 1.2305, "mean_token_accuracy": 0.7390608191490173, "num_tokens": 211433144.0, "step": 1119 }, { "epoch": 1.5411359724612737, "grad_norm": 0.9119908809661865, "learning_rate": 2.0089249495078186e-05, "loss": 1.1715, "mean_token_accuracy": 0.7561827972531319, "num_tokens": 211572633.0, "step": 1120 }, { "epoch": 1.542512908777969, "grad_norm": 0.3245386779308319, "learning_rate": 2.0059499773095405e-05, "loss": 1.3976, "mean_token_accuracy": 0.735389456152916, "num_tokens": 211782618.0, "step": 1121 }, { "epoch": 1.5438898450946645, "grad_norm": 0.8076409697532654, "learning_rate": 2.002974991946072e-05, "loss": 1.9085, "mean_token_accuracy": 0.6286847367882729, "num_tokens": 212050598.0, "step": 1122 }, { "epoch": 1.5452667814113599, "grad_norm": 0.8373785018920898, "learning_rate": 2e-05, "loss": 1.873, "mean_token_accuracy": 0.6322076618671417, "num_tokens": 212249511.0, "step": 1123 }, { "epoch": 1.5466437177280552, "grad_norm": 0.8385191559791565, "learning_rate": 1.9970250080539292e-05, "loss": 1.5632, "mean_token_accuracy": 0.6836376041173935, "num_tokens": 212420931.0, "step": 1124 }, { "epoch": 1.5480206540447505, "grad_norm": 0.8631343245506287, "learning_rate": 1.9940500226904602e-05, "loss": 1.2973, "mean_token_accuracy": 0.7300891429185867, "num_tokens": 212576041.0, "step": 1125 }, { "epoch": 1.5493975903614459, "grad_norm": 0.8886727094650269, "learning_rate": 1.991075050492182e-05, "loss": 1.1533, "mean_token_accuracy": 0.7585268095135689, "num_tokens": 212718458.0, "step": 1126 }, { "epoch": 1.5507745266781412, "grad_norm": 1.1461021900177002, "learning_rate": 1.9881000980416528e-05, "loss": 1.202, "mean_token_accuracy": 0.7486373260617256, "num_tokens": 212842892.0, "step": 1127 }, { "epoch": 1.5521514629948365, "grad_norm": 0.7357667684555054, "learning_rate": 1.985125171921389e-05, "loss": 1.7452, "mean_token_accuracy": 0.6404742449522018, "num_tokens": 213175201.0, "step": 1128 }, { "epoch": 1.5535283993115319, "grad_norm": 0.8244959115982056, "learning_rate": 1.9821502787138457e-05, "loss": 1.9271, "mean_token_accuracy": 0.6239080429077148, "num_tokens": 213385159.0, "step": 1129 }, { "epoch": 1.5549053356282272, "grad_norm": 0.8483816385269165, "learning_rate": 1.979175425001408e-05, "loss": 1.6698, "mean_token_accuracy": 0.6682136207818985, "num_tokens": 213562944.0, "step": 1130 }, { "epoch": 1.5562822719449225, "grad_norm": 0.8606539368629456, "learning_rate": 1.9762006173663717e-05, "loss": 1.3536, "mean_token_accuracy": 0.7210585996508598, "num_tokens": 213721784.0, "step": 1131 }, { "epoch": 1.5576592082616179, "grad_norm": 0.9110720157623291, "learning_rate": 1.973225862390933e-05, "loss": 1.1713, "mean_token_accuracy": 0.7537117078900337, "num_tokens": 213867511.0, "step": 1132 }, { "epoch": 1.5590361445783132, "grad_norm": 1.027510166168213, "learning_rate": 1.9702511666571687e-05, "loss": 1.1532, "mean_token_accuracy": 0.762851893901825, "num_tokens": 213998185.0, "step": 1133 }, { "epoch": 1.5604130808950085, "grad_norm": 0.7259673476219177, "learning_rate": 1.9672765367470265e-05, "loss": 1.7404, "mean_token_accuracy": 0.655971959233284, "num_tokens": 214289191.0, "step": 1134 }, { "epoch": 1.5617900172117039, "grad_norm": 0.7747064232826233, "learning_rate": 1.964301979242308e-05, "loss": 1.9297, "mean_token_accuracy": 0.626616396009922, "num_tokens": 214512697.0, "step": 1135 }, { "epoch": 1.5631669535283992, "grad_norm": 0.839672863483429, "learning_rate": 1.9613275007246545e-05, "loss": 1.7338, "mean_token_accuracy": 0.6558791473507881, "num_tokens": 214696860.0, "step": 1136 }, { "epoch": 1.5645438898450945, "grad_norm": 0.8707978129386902, "learning_rate": 1.958353107775532e-05, "loss": 1.4641, "mean_token_accuracy": 0.7004250511527061, "num_tokens": 214859657.0, "step": 1137 }, { "epoch": 1.56592082616179, "grad_norm": 0.8945202827453613, "learning_rate": 1.955378806976218e-05, "loss": 1.2226, "mean_token_accuracy": 0.741669051349163, "num_tokens": 215008616.0, "step": 1138 }, { "epoch": 1.5672977624784854, "grad_norm": 0.9491830468177795, "learning_rate": 1.9524046049077855e-05, "loss": 1.1265, "mean_token_accuracy": 0.7636683210730553, "num_tokens": 215144551.0, "step": 1139 }, { "epoch": 1.5686746987951807, "grad_norm": 0.6579519510269165, "learning_rate": 1.9494305081510893e-05, "loss": 1.6028, "mean_token_accuracy": 0.7018257901072502, "num_tokens": 215403866.0, "step": 1140 }, { "epoch": 1.570051635111876, "grad_norm": 0.7547858357429504, "learning_rate": 1.9464565232867512e-05, "loss": 1.962, "mean_token_accuracy": 0.6204793974757195, "num_tokens": 215644370.0, "step": 1141 }, { "epoch": 1.5714285714285714, "grad_norm": 0.8160419464111328, "learning_rate": 1.9434826568951458e-05, "loss": 1.7569, "mean_token_accuracy": 0.6508408859372139, "num_tokens": 215835038.0, "step": 1142 }, { "epoch": 1.5728055077452667, "grad_norm": 0.8504343032836914, "learning_rate": 1.9405089155563853e-05, "loss": 1.496, "mean_token_accuracy": 0.6959516480565071, "num_tokens": 216001821.0, "step": 1143 }, { "epoch": 1.5741824440619623, "grad_norm": 0.865224301815033, "learning_rate": 1.9375353058503054e-05, "loss": 1.2613, "mean_token_accuracy": 0.7336238324642181, "num_tokens": 216153826.0, "step": 1144 }, { "epoch": 1.5755593803786576, "grad_norm": 0.9144826531410217, "learning_rate": 1.934561834356448e-05, "loss": 1.1274, "mean_token_accuracy": 0.7643484175205231, "num_tokens": 216293342.0, "step": 1145 }, { "epoch": 1.576936316695353, "grad_norm": 0.3124276101589203, "learning_rate": 1.9315885076540538e-05, "loss": 1.4219, "mean_token_accuracy": 0.7395726665854454, "num_tokens": 216500734.0, "step": 1146 }, { "epoch": 1.5783132530120483, "grad_norm": 0.7738456726074219, "learning_rate": 1.9286153323220393e-05, "loss": 1.9181, "mean_token_accuracy": 0.6256974190473557, "num_tokens": 216769786.0, "step": 1147 }, { "epoch": 1.5796901893287436, "grad_norm": 0.8179636001586914, "learning_rate": 1.9256423149389883e-05, "loss": 1.8661, "mean_token_accuracy": 0.6331812366843224, "num_tokens": 216970655.0, "step": 1148 }, { "epoch": 1.581067125645439, "grad_norm": 0.8426827788352966, "learning_rate": 1.9226694620831342e-05, "loss": 1.5715, "mean_token_accuracy": 0.6829972043633461, "num_tokens": 217142770.0, "step": 1149 }, { "epoch": 1.5824440619621343, "grad_norm": 0.8648706078529358, "learning_rate": 1.9196967803323464e-05, "loss": 1.3181, "mean_token_accuracy": 0.7270866855978966, "num_tokens": 217298290.0, "step": 1150 }, { "epoch": 1.5838209982788296, "grad_norm": 0.8983505368232727, "learning_rate": 1.916724276264115e-05, "loss": 1.175, "mean_token_accuracy": 0.7520844116806984, "num_tokens": 217441544.0, "step": 1151 }, { "epoch": 1.585197934595525, "grad_norm": 1.1527864933013916, "learning_rate": 1.91375195645554e-05, "loss": 1.2137, "mean_token_accuracy": 0.7456628009676933, "num_tokens": 217566424.0, "step": 1152 }, { "epoch": 1.5865748709122203, "grad_norm": 0.6858416199684143, "learning_rate": 1.9107798274833092e-05, "loss": 1.7644, "mean_token_accuracy": 0.6408993005752563, "num_tokens": 217894287.0, "step": 1153 }, { "epoch": 1.5879518072289156, "grad_norm": 0.7959484457969666, "learning_rate": 1.9078078959236907e-05, "loss": 1.9103, "mean_token_accuracy": 0.6289984583854675, "num_tokens": 218105701.0, "step": 1154 }, { "epoch": 1.589328743545611, "grad_norm": 0.8340932130813599, "learning_rate": 1.9048361683525155e-05, "loss": 1.6524, "mean_token_accuracy": 0.6698451638221741, "num_tokens": 218284149.0, "step": 1155 }, { "epoch": 1.5907056798623063, "grad_norm": 0.8532328605651855, "learning_rate": 1.901864651345164e-05, "loss": 1.3922, "mean_token_accuracy": 0.7138284221291542, "num_tokens": 218443593.0, "step": 1156 }, { "epoch": 1.5920826161790016, "grad_norm": 0.8798726797103882, "learning_rate": 1.8988933514765497e-05, "loss": 1.1987, "mean_token_accuracy": 0.7491217628121376, "num_tokens": 218589861.0, "step": 1157 }, { "epoch": 1.593459552495697, "grad_norm": 0.9848282933235168, "learning_rate": 1.8959222753211055e-05, "loss": 1.1445, "mean_token_accuracy": 0.7636303082108498, "num_tokens": 218720919.0, "step": 1158 }, { "epoch": 1.5948364888123923, "grad_norm": 0.7004112005233765, "learning_rate": 1.8929514294527698e-05, "loss": 1.7572, "mean_token_accuracy": 0.661091223359108, "num_tokens": 219011209.0, "step": 1159 }, { "epoch": 1.5962134251290878, "grad_norm": 0.768261730670929, "learning_rate": 1.889980820444971e-05, "loss": 1.9176, "mean_token_accuracy": 0.6270869821310043, "num_tokens": 219234313.0, "step": 1160 }, { "epoch": 1.5975903614457831, "grad_norm": 0.8193297386169434, "learning_rate": 1.887010454870615e-05, "loss": 1.6867, "mean_token_accuracy": 0.6655331701040268, "num_tokens": 219417423.0, "step": 1161 }, { "epoch": 1.5989672977624785, "grad_norm": 0.841273307800293, "learning_rate": 1.8840403393020663e-05, "loss": 1.4044, "mean_token_accuracy": 0.7113868147134781, "num_tokens": 219579972.0, "step": 1162 }, { "epoch": 1.6003442340791738, "grad_norm": 0.875188410282135, "learning_rate": 1.8810704803111382e-05, "loss": 1.2107, "mean_token_accuracy": 0.7447655647993088, "num_tokens": 219728963.0, "step": 1163 }, { "epoch": 1.6017211703958691, "grad_norm": 0.9449931383132935, "learning_rate": 1.878100884469076e-05, "loss": 1.1014, "mean_token_accuracy": 0.7672284096479416, "num_tokens": 219864579.0, "step": 1164 }, { "epoch": 1.6030981067125647, "grad_norm": 0.7049762606620789, "learning_rate": 1.875131558346542e-05, "loss": 1.6534, "mean_token_accuracy": 0.6969328969717026, "num_tokens": 220125246.0, "step": 1165 }, { "epoch": 1.60447504302926, "grad_norm": 0.7862063050270081, "learning_rate": 1.8721625085136033e-05, "loss": 1.9387, "mean_token_accuracy": 0.6225867494940758, "num_tokens": 220367804.0, "step": 1166 }, { "epoch": 1.6058519793459554, "grad_norm": 0.8137915730476379, "learning_rate": 1.869193741539714e-05, "loss": 1.7525, "mean_token_accuracy": 0.6512618511915207, "num_tokens": 220559171.0, "step": 1167 }, { "epoch": 1.6072289156626507, "grad_norm": 0.8570161461830139, "learning_rate": 1.866225263993703e-05, "loss": 1.4828, "mean_token_accuracy": 0.697824016213417, "num_tokens": 220725806.0, "step": 1168 }, { "epoch": 1.608605851979346, "grad_norm": 0.8648846745491028, "learning_rate": 1.863257082443759e-05, "loss": 1.2475, "mean_token_accuracy": 0.7376796007156372, "num_tokens": 220877500.0, "step": 1169 }, { "epoch": 1.6099827882960414, "grad_norm": 0.8977555632591248, "learning_rate": 1.8602892034574153e-05, "loss": 1.0937, "mean_token_accuracy": 0.7691123113036156, "num_tokens": 221016419.0, "step": 1170 }, { "epoch": 1.6113597246127367, "grad_norm": 0.31957316398620605, "learning_rate": 1.8573216336015355e-05, "loss": 1.4339, "mean_token_accuracy": 0.7357581928372383, "num_tokens": 221224056.0, "step": 1171 }, { "epoch": 1.612736660929432, "grad_norm": 0.789380669593811, "learning_rate": 1.8543543794423006e-05, "loss": 1.9385, "mean_token_accuracy": 0.6241256594657898, "num_tokens": 221491031.0, "step": 1172 }, { "epoch": 1.6141135972461274, "grad_norm": 0.8279462456703186, "learning_rate": 1.8513874475451917e-05, "loss": 1.869, "mean_token_accuracy": 0.6335198804736137, "num_tokens": 221690932.0, "step": 1173 }, { "epoch": 1.6154905335628227, "grad_norm": 0.845177412033081, "learning_rate": 1.848420844474977e-05, "loss": 1.5771, "mean_token_accuracy": 0.6837534755468369, "num_tokens": 221863363.0, "step": 1174 }, { "epoch": 1.616867469879518, "grad_norm": 0.8575026988983154, "learning_rate": 1.8454545767956978e-05, "loss": 1.2745, "mean_token_accuracy": 0.7347875982522964, "num_tokens": 222018744.0, "step": 1175 }, { "epoch": 1.6182444061962133, "grad_norm": 0.9120182991027832, "learning_rate": 1.8424886510706527e-05, "loss": 1.1734, "mean_token_accuracy": 0.7570618316531181, "num_tokens": 222161112.0, "step": 1176 }, { "epoch": 1.6196213425129087, "grad_norm": 1.1853965520858765, "learning_rate": 1.839523073862385e-05, "loss": 1.1949, "mean_token_accuracy": 0.7529495134949684, "num_tokens": 222284979.0, "step": 1177 }, { "epoch": 1.620998278829604, "grad_norm": 0.6793304681777954, "learning_rate": 1.8365578517326642e-05, "loss": 1.7212, "mean_token_accuracy": 0.6466706097126007, "num_tokens": 222612461.0, "step": 1178 }, { "epoch": 1.6223752151462993, "grad_norm": 0.8365301489830017, "learning_rate": 1.8335929912424756e-05, "loss": 1.8977, "mean_token_accuracy": 0.629869319498539, "num_tokens": 222822386.0, "step": 1179 }, { "epoch": 1.6237521514629947, "grad_norm": 0.8551831841468811, "learning_rate": 1.8306284989520055e-05, "loss": 1.6187, "mean_token_accuracy": 0.6739480048418045, "num_tokens": 222999945.0, "step": 1180 }, { "epoch": 1.6251290877796902, "grad_norm": 0.8729605674743652, "learning_rate": 1.827664381420624e-05, "loss": 1.3852, "mean_token_accuracy": 0.7159881666302681, "num_tokens": 223158542.0, "step": 1181 }, { "epoch": 1.6265060240963856, "grad_norm": 0.8854681849479675, "learning_rate": 1.8247006452068724e-05, "loss": 1.2006, "mean_token_accuracy": 0.7493217065930367, "num_tokens": 223304417.0, "step": 1182 }, { "epoch": 1.627882960413081, "grad_norm": 0.9903656840324402, "learning_rate": 1.8217372968684483e-05, "loss": 1.1036, "mean_token_accuracy": 0.7682375684380531, "num_tokens": 223436005.0, "step": 1183 }, { "epoch": 1.6292598967297762, "grad_norm": 0.7649958729743958, "learning_rate": 1.818774342962191e-05, "loss": 1.6705, "mean_token_accuracy": 0.6676303073763847, "num_tokens": 223728411.0, "step": 1184 }, { "epoch": 1.6306368330464716, "grad_norm": 0.7916508316993713, "learning_rate": 1.8158117900440673e-05, "loss": 1.9265, "mean_token_accuracy": 0.6279838532209396, "num_tokens": 223952062.0, "step": 1185 }, { "epoch": 1.632013769363167, "grad_norm": 0.825942873954773, "learning_rate": 1.8128496446691563e-05, "loss": 1.7027, "mean_token_accuracy": 0.6600113734602928, "num_tokens": 224135931.0, "step": 1186 }, { "epoch": 1.6333907056798624, "grad_norm": 0.8738645911216736, "learning_rate": 1.8098879133916352e-05, "loss": 1.4328, "mean_token_accuracy": 0.7080342397093773, "num_tokens": 224298646.0, "step": 1187 }, { "epoch": 1.6347676419965578, "grad_norm": 0.8783003687858582, "learning_rate": 1.806926602764766e-05, "loss": 1.2148, "mean_token_accuracy": 0.7446788847446442, "num_tokens": 224447803.0, "step": 1188 }, { "epoch": 1.636144578313253, "grad_norm": 0.9401087164878845, "learning_rate": 1.8039657193408788e-05, "loss": 1.1473, "mean_token_accuracy": 0.7639375403523445, "num_tokens": 224583703.0, "step": 1189 }, { "epoch": 1.6375215146299484, "grad_norm": 0.6994162201881409, "learning_rate": 1.80100526967136e-05, "loss": 1.6076, "mean_token_accuracy": 0.6948953717947006, "num_tokens": 224838703.0, "step": 1190 }, { "epoch": 1.6388984509466438, "grad_norm": 0.8140792846679688, "learning_rate": 1.798045260306634e-05, "loss": 1.9775, "mean_token_accuracy": 0.6181609258055687, "num_tokens": 225080445.0, "step": 1191 }, { "epoch": 1.640275387263339, "grad_norm": 0.8414948582649231, "learning_rate": 1.795085697796153e-05, "loss": 1.7455, "mean_token_accuracy": 0.6535793915390968, "num_tokens": 225271820.0, "step": 1192 }, { "epoch": 1.6416523235800344, "grad_norm": 0.8683633804321289, "learning_rate": 1.7921265886883792e-05, "loss": 1.5216, "mean_token_accuracy": 0.6928914561867714, "num_tokens": 225438959.0, "step": 1193 }, { "epoch": 1.6430292598967298, "grad_norm": 0.8620215654373169, "learning_rate": 1.789167939530773e-05, "loss": 1.2454, "mean_token_accuracy": 0.7394632697105408, "num_tokens": 225591190.0, "step": 1194 }, { "epoch": 1.644406196213425, "grad_norm": 0.9493000507354736, "learning_rate": 1.786209756869775e-05, "loss": 1.1599, "mean_token_accuracy": 0.7587597519159317, "num_tokens": 225731012.0, "step": 1195 }, { "epoch": 1.6457831325301204, "grad_norm": 0.3236679136753082, "learning_rate": 1.7832520472507945e-05, "loss": 1.4231, "mean_token_accuracy": 0.7355068400502205, "num_tokens": 225937322.0, "step": 1196 }, { "epoch": 1.6471600688468158, "grad_norm": 0.7611335515975952, "learning_rate": 1.7802948172181954e-05, "loss": 1.9196, "mean_token_accuracy": 0.6258838698267937, "num_tokens": 226204757.0, "step": 1197 }, { "epoch": 1.648537005163511, "grad_norm": 0.8371473550796509, "learning_rate": 1.7773380733152786e-05, "loss": 1.8388, "mean_token_accuracy": 0.6386329308152199, "num_tokens": 226404087.0, "step": 1198 }, { "epoch": 1.6499139414802064, "grad_norm": 0.8603143692016602, "learning_rate": 1.774381822084271e-05, "loss": 1.5662, "mean_token_accuracy": 0.6847781166434288, "num_tokens": 226575207.0, "step": 1199 }, { "epoch": 1.6512908777969018, "grad_norm": 0.8851235508918762, "learning_rate": 1.771426070066307e-05, "loss": 1.3079, "mean_token_accuracy": 0.7297224849462509, "num_tokens": 226730131.0, "step": 1200 }, { "epoch": 1.652667814113597, "grad_norm": 0.8969115614891052, "learning_rate": 1.76847082380142e-05, "loss": 1.1699, "mean_token_accuracy": 0.7550645023584366, "num_tokens": 226872548.0, "step": 1201 }, { "epoch": 1.6540447504302926, "grad_norm": 1.124322533607483, "learning_rate": 1.765516089828522e-05, "loss": 1.2023, "mean_token_accuracy": 0.7512956410646439, "num_tokens": 226996596.0, "step": 1202 }, { "epoch": 1.655421686746988, "grad_norm": 0.7262313961982727, "learning_rate": 1.7625618746853902e-05, "loss": 1.7398, "mean_token_accuracy": 0.6448372900485992, "num_tokens": 227327373.0, "step": 1203 }, { "epoch": 1.6567986230636833, "grad_norm": 0.8436802625656128, "learning_rate": 1.7596081849086562e-05, "loss": 1.9084, "mean_token_accuracy": 0.6290230751037598, "num_tokens": 227538340.0, "step": 1204 }, { "epoch": 1.6581755593803786, "grad_norm": 0.8699339032173157, "learning_rate": 1.756655027033789e-05, "loss": 1.6867, "mean_token_accuracy": 0.6609821170568466, "num_tokens": 227716183.0, "step": 1205 }, { "epoch": 1.659552495697074, "grad_norm": 0.8603414297103882, "learning_rate": 1.7537024075950795e-05, "loss": 1.3625, "mean_token_accuracy": 0.7178564891219139, "num_tokens": 227875665.0, "step": 1206 }, { "epoch": 1.6609294320137695, "grad_norm": 0.8917186260223389, "learning_rate": 1.7507503331256283e-05, "loss": 1.1833, "mean_token_accuracy": 0.751776933670044, "num_tokens": 228021899.0, "step": 1207 }, { "epoch": 1.6623063683304649, "grad_norm": 1.0143816471099854, "learning_rate": 1.7477988101573292e-05, "loss": 1.0925, "mean_token_accuracy": 0.7706505805253983, "num_tokens": 228153443.0, "step": 1208 }, { "epoch": 1.6636833046471602, "grad_norm": 0.7585045695304871, "learning_rate": 1.744847845220856e-05, "loss": 1.7173, "mean_token_accuracy": 0.6682528331875801, "num_tokens": 228448589.0, "step": 1209 }, { "epoch": 1.6650602409638555, "grad_norm": 0.7824603319168091, "learning_rate": 1.741897444845649e-05, "loss": 1.9477, "mean_token_accuracy": 0.622168131172657, "num_tokens": 228674296.0, "step": 1210 }, { "epoch": 1.6664371772805509, "grad_norm": 0.8258208632469177, "learning_rate": 1.7389476155598974e-05, "loss": 1.6909, "mean_token_accuracy": 0.6632427126169205, "num_tokens": 228859112.0, "step": 1211 }, { "epoch": 1.6678141135972462, "grad_norm": 0.8659867644309998, "learning_rate": 1.7359983638905277e-05, "loss": 1.4271, "mean_token_accuracy": 0.7051323354244232, "num_tokens": 229022060.0, "step": 1212 }, { "epoch": 1.6691910499139415, "grad_norm": 0.8957744836807251, "learning_rate": 1.7330496963631883e-05, "loss": 1.2225, "mean_token_accuracy": 0.7435164451599121, "num_tokens": 229171037.0, "step": 1213 }, { "epoch": 1.6705679862306368, "grad_norm": 0.939918041229248, "learning_rate": 1.730101619502235e-05, "loss": 1.0947, "mean_token_accuracy": 0.772207647562027, "num_tokens": 229306597.0, "step": 1214 }, { "epoch": 1.6719449225473322, "grad_norm": 0.6582490801811218, "learning_rate": 1.7271541398307175e-05, "loss": 1.5627, "mean_token_accuracy": 0.704075999557972, "num_tokens": 229560459.0, "step": 1215 }, { "epoch": 1.6733218588640275, "grad_norm": 0.7934812903404236, "learning_rate": 1.7242072638703627e-05, "loss": 1.9604, "mean_token_accuracy": 0.6196297481656075, "num_tokens": 229802968.0, "step": 1216 }, { "epoch": 1.6746987951807228, "grad_norm": 0.8231950402259827, "learning_rate": 1.7212609981415632e-05, "loss": 1.7464, "mean_token_accuracy": 0.6531975567340851, "num_tokens": 229994298.0, "step": 1217 }, { "epoch": 1.6760757314974182, "grad_norm": 0.8533493280410767, "learning_rate": 1.7183153491633603e-05, "loss": 1.49, "mean_token_accuracy": 0.695885181427002, "num_tokens": 230161586.0, "step": 1218 }, { "epoch": 1.6774526678141135, "grad_norm": 0.8680657148361206, "learning_rate": 1.7153703234534302e-05, "loss": 1.2416, "mean_token_accuracy": 0.7420571893453598, "num_tokens": 230313715.0, "step": 1219 }, { "epoch": 1.6788296041308088, "grad_norm": 0.9151865243911743, "learning_rate": 1.7124259275280716e-05, "loss": 1.1126, "mean_token_accuracy": 0.7675537839531898, "num_tokens": 230453042.0, "step": 1220 }, { "epoch": 1.6802065404475042, "grad_norm": 0.32754525542259216, "learning_rate": 1.709482167902188e-05, "loss": 1.404, "mean_token_accuracy": 0.7326289415359497, "num_tokens": 230663366.0, "step": 1221 }, { "epoch": 1.6815834767641995, "grad_norm": 0.7530432939529419, "learning_rate": 1.7065390510892767e-05, "loss": 1.9156, "mean_token_accuracy": 0.6268618479371071, "num_tokens": 230931721.0, "step": 1222 }, { "epoch": 1.682960413080895, "grad_norm": 0.8375855088233948, "learning_rate": 1.7035965836014118e-05, "loss": 1.8293, "mean_token_accuracy": 0.6406033635139465, "num_tokens": 231131921.0, "step": 1223 }, { "epoch": 1.6843373493975904, "grad_norm": 0.8437657356262207, "learning_rate": 1.7006547719492302e-05, "loss": 1.58, "mean_token_accuracy": 0.6800993010401726, "num_tokens": 231304295.0, "step": 1224 }, { "epoch": 1.6857142857142857, "grad_norm": 0.8703485131263733, "learning_rate": 1.6977136226419187e-05, "loss": 1.32, "mean_token_accuracy": 0.727257177233696, "num_tokens": 231460011.0, "step": 1225 }, { "epoch": 1.687091222030981, "grad_norm": 0.9148727655410767, "learning_rate": 1.6947731421871978e-05, "loss": 1.1728, "mean_token_accuracy": 0.7518595606088638, "num_tokens": 231603182.0, "step": 1226 }, { "epoch": 1.6884681583476764, "grad_norm": 1.1315114498138428, "learning_rate": 1.6918333370913092e-05, "loss": 1.1917, "mean_token_accuracy": 0.7488835081458092, "num_tokens": 231728323.0, "step": 1227 }, { "epoch": 1.689845094664372, "grad_norm": 0.6420438289642334, "learning_rate": 1.6888942138589977e-05, "loss": 1.6951, "mean_token_accuracy": 0.6509273573756218, "num_tokens": 232055162.0, "step": 1228 }, { "epoch": 1.6912220309810673, "grad_norm": 0.7858041524887085, "learning_rate": 1.685955778993502e-05, "loss": 1.8594, "mean_token_accuracy": 0.6369646862149239, "num_tokens": 232264968.0, "step": 1229 }, { "epoch": 1.6925989672977626, "grad_norm": 0.8352747559547424, "learning_rate": 1.6830180389965372e-05, "loss": 1.6009, "mean_token_accuracy": 0.6786380559206009, "num_tokens": 232441532.0, "step": 1230 }, { "epoch": 1.693975903614458, "grad_norm": 0.8704172372817993, "learning_rate": 1.68008100036828e-05, "loss": 1.3685, "mean_token_accuracy": 0.7170184850692749, "num_tokens": 232600090.0, "step": 1231 }, { "epoch": 1.6953528399311533, "grad_norm": 0.8690811991691589, "learning_rate": 1.6771446696073552e-05, "loss": 1.1743, "mean_token_accuracy": 0.75677889585495, "num_tokens": 232745981.0, "step": 1232 }, { "epoch": 1.6967297762478486, "grad_norm": 0.9915464520454407, "learning_rate": 1.6742090532108228e-05, "loss": 1.1107, "mean_token_accuracy": 0.7700666636228561, "num_tokens": 232877461.0, "step": 1233 }, { "epoch": 1.698106712564544, "grad_norm": 0.6863532066345215, "learning_rate": 1.67127415767416e-05, "loss": 1.7264, "mean_token_accuracy": 0.6647436618804932, "num_tokens": 233167389.0, "step": 1234 }, { "epoch": 1.6994836488812393, "grad_norm": 0.7636541724205017, "learning_rate": 1.6683399894912522e-05, "loss": 1.9091, "mean_token_accuracy": 0.6286723837256432, "num_tokens": 233389298.0, "step": 1235 }, { "epoch": 1.7008605851979346, "grad_norm": 0.8364078402519226, "learning_rate": 1.6654065551543716e-05, "loss": 1.6676, "mean_token_accuracy": 0.667531318962574, "num_tokens": 233571887.0, "step": 1236 }, { "epoch": 1.70223752151463, "grad_norm": 0.8555752635002136, "learning_rate": 1.6624738611541685e-05, "loss": 1.4072, "mean_token_accuracy": 0.7124991565942764, "num_tokens": 233734100.0, "step": 1237 }, { "epoch": 1.7036144578313253, "grad_norm": 0.8578059077262878, "learning_rate": 1.6595419139796553e-05, "loss": 1.2322, "mean_token_accuracy": 0.7453624084591866, "num_tokens": 233883150.0, "step": 1238 }, { "epoch": 1.7049913941480206, "grad_norm": 0.9424326419830322, "learning_rate": 1.6566107201181926e-05, "loss": 1.0919, "mean_token_accuracy": 0.7702762484550476, "num_tokens": 234018775.0, "step": 1239 }, { "epoch": 1.706368330464716, "grad_norm": 0.6340410113334656, "learning_rate": 1.6536802860554723e-05, "loss": 1.6069, "mean_token_accuracy": 0.7008150070905685, "num_tokens": 234274346.0, "step": 1240 }, { "epoch": 1.7077452667814113, "grad_norm": 0.7397426962852478, "learning_rate": 1.6507506182755067e-05, "loss": 1.9467, "mean_token_accuracy": 0.61911241710186, "num_tokens": 234516285.0, "step": 1241 }, { "epoch": 1.7091222030981066, "grad_norm": 0.8246981501579285, "learning_rate": 1.6478217232606114e-05, "loss": 1.7902, "mean_token_accuracy": 0.6467876881361008, "num_tokens": 234706660.0, "step": 1242 }, { "epoch": 1.710499139414802, "grad_norm": 0.8507930636405945, "learning_rate": 1.6448936074913938e-05, "loss": 1.4218, "mean_token_accuracy": 0.7112103775143623, "num_tokens": 234872846.0, "step": 1243 }, { "epoch": 1.7118760757314975, "grad_norm": 0.8798739910125732, "learning_rate": 1.641966277446735e-05, "loss": 1.2172, "mean_token_accuracy": 0.7421343699097633, "num_tokens": 235024148.0, "step": 1244 }, { "epoch": 1.7132530120481928, "grad_norm": 0.9245933294296265, "learning_rate": 1.6390397396037793e-05, "loss": 1.1174, "mean_token_accuracy": 0.7648959308862686, "num_tokens": 235162862.0, "step": 1245 }, { "epoch": 1.7146299483648881, "grad_norm": 0.30774232745170593, "learning_rate": 1.6361140004379165e-05, "loss": 1.4006, "mean_token_accuracy": 0.7362763583660126, "num_tokens": 235374312.0, "step": 1246 }, { "epoch": 1.7160068846815835, "grad_norm": 0.7292870879173279, "learning_rate": 1.6331890664227714e-05, "loss": 1.9416, "mean_token_accuracy": 0.6237359941005707, "num_tokens": 235645521.0, "step": 1247 }, { "epoch": 1.7173838209982788, "grad_norm": 0.8019707798957825, "learning_rate": 1.6302649440301847e-05, "loss": 1.8007, "mean_token_accuracy": 0.6491107642650604, "num_tokens": 235844747.0, "step": 1248 }, { "epoch": 1.7187607573149741, "grad_norm": 0.8370382785797119, "learning_rate": 1.6273416397302043e-05, "loss": 1.5531, "mean_token_accuracy": 0.6860848069190979, "num_tokens": 236015654.0, "step": 1249 }, { "epoch": 1.7201376936316697, "grad_norm": 0.85453200340271, "learning_rate": 1.6244191599910653e-05, "loss": 1.2864, "mean_token_accuracy": 0.7334303930401802, "num_tokens": 236170660.0, "step": 1250 }, { "epoch": 1.721514629948365, "grad_norm": 0.9009243249893188, "learning_rate": 1.6214975112791803e-05, "loss": 1.162, "mean_token_accuracy": 0.754755049943924, "num_tokens": 236313094.0, "step": 1251 }, { "epoch": 1.7228915662650603, "grad_norm": 1.1193197965621948, "learning_rate": 1.6185767000591202e-05, "loss": 1.1711, "mean_token_accuracy": 0.7519168257713318, "num_tokens": 236437440.0, "step": 1252 }, { "epoch": 1.7242685025817557, "grad_norm": 0.6400884389877319, "learning_rate": 1.615656732793606e-05, "loss": 1.6979, "mean_token_accuracy": 0.6510279476642609, "num_tokens": 236764303.0, "step": 1253 }, { "epoch": 1.725645438898451, "grad_norm": 0.7861027121543884, "learning_rate": 1.6127376159434903e-05, "loss": 1.8643, "mean_token_accuracy": 0.6333560794591904, "num_tokens": 236973986.0, "step": 1254 }, { "epoch": 1.7270223752151463, "grad_norm": 0.8506038188934326, "learning_rate": 1.609819355967744e-05, "loss": 1.6431, "mean_token_accuracy": 0.6708569601178169, "num_tokens": 237150908.0, "step": 1255 }, { "epoch": 1.7283993115318417, "grad_norm": 0.8661695718765259, "learning_rate": 1.606901959323441e-05, "loss": 1.3499, "mean_token_accuracy": 0.720679983496666, "num_tokens": 237309457.0, "step": 1256 }, { "epoch": 1.729776247848537, "grad_norm": 0.8978309631347656, "learning_rate": 1.603985432465746e-05, "loss": 1.1625, "mean_token_accuracy": 0.7561330795288086, "num_tokens": 237455366.0, "step": 1257 }, { "epoch": 1.7311531841652323, "grad_norm": 0.9882899522781372, "learning_rate": 1.6010697818478996e-05, "loss": 1.092, "mean_token_accuracy": 0.7724716812372208, "num_tokens": 237586506.0, "step": 1258 }, { "epoch": 1.7325301204819277, "grad_norm": 0.660470187664032, "learning_rate": 1.5981550139212023e-05, "loss": 1.6668, "mean_token_accuracy": 0.6711469814181328, "num_tokens": 237882901.0, "step": 1259 }, { "epoch": 1.733907056798623, "grad_norm": 0.7519417405128479, "learning_rate": 1.5952411351350042e-05, "loss": 1.9129, "mean_token_accuracy": 0.6266067177057266, "num_tokens": 238106667.0, "step": 1260 }, { "epoch": 1.7352839931153183, "grad_norm": 0.8170222043991089, "learning_rate": 1.5923281519366832e-05, "loss": 1.6646, "mean_token_accuracy": 0.6668546125292778, "num_tokens": 238290126.0, "step": 1261 }, { "epoch": 1.7366609294320137, "grad_norm": 0.862545371055603, "learning_rate": 1.58941607077164e-05, "loss": 1.4163, "mean_token_accuracy": 0.7120806202292442, "num_tokens": 238452567.0, "step": 1262 }, { "epoch": 1.738037865748709, "grad_norm": 0.8794005513191223, "learning_rate": 1.586504898083277e-05, "loss": 1.2104, "mean_token_accuracy": 0.7481513172388077, "num_tokens": 238601386.0, "step": 1263 }, { "epoch": 1.7394148020654043, "grad_norm": 0.9453165531158447, "learning_rate": 1.5835946403129886e-05, "loss": 1.111, "mean_token_accuracy": 0.7686129435896873, "num_tokens": 238737087.0, "step": 1264 }, { "epoch": 1.7407917383820997, "grad_norm": 0.6501623392105103, "learning_rate": 1.580685303900143e-05, "loss": 1.5922, "mean_token_accuracy": 0.6997333914041519, "num_tokens": 238989818.0, "step": 1265 }, { "epoch": 1.7421686746987952, "grad_norm": 0.7419114112854004, "learning_rate": 1.5777768952820697e-05, "loss": 1.9431, "mean_token_accuracy": 0.623982310295105, "num_tokens": 239229486.0, "step": 1266 }, { "epoch": 1.7435456110154905, "grad_norm": 0.8376520276069641, "learning_rate": 1.5748694208940467e-05, "loss": 1.756, "mean_token_accuracy": 0.6514360532164574, "num_tokens": 239419494.0, "step": 1267 }, { "epoch": 1.7449225473321859, "grad_norm": 0.8747854232788086, "learning_rate": 1.5719628871692842e-05, "loss": 1.4931, "mean_token_accuracy": 0.698030523955822, "num_tokens": 239586450.0, "step": 1268 }, { "epoch": 1.7462994836488812, "grad_norm": 0.8704959154129028, "learning_rate": 1.5690573005389103e-05, "loss": 1.2347, "mean_token_accuracy": 0.7386101484298706, "num_tokens": 239738428.0, "step": 1269 }, { "epoch": 1.7476764199655765, "grad_norm": 0.9165136814117432, "learning_rate": 1.5661526674319582e-05, "loss": 1.1169, "mean_token_accuracy": 0.763124942779541, "num_tokens": 239877657.0, "step": 1270 }, { "epoch": 1.749053356282272, "grad_norm": 0.32749027013778687, "learning_rate": 1.5632489942753515e-05, "loss": 1.395, "mean_token_accuracy": 0.7387227416038513, "num_tokens": 240085659.0, "step": 1271 }, { "epoch": 1.7504302925989674, "grad_norm": 0.7325544953346252, "learning_rate": 1.5603462874938895e-05, "loss": 1.9275, "mean_token_accuracy": 0.6231383979320526, "num_tokens": 240352674.0, "step": 1272 }, { "epoch": 1.7518072289156628, "grad_norm": 0.836333155632019, "learning_rate": 1.557444553510233e-05, "loss": 1.791, "mean_token_accuracy": 0.646561436355114, "num_tokens": 240551861.0, "step": 1273 }, { "epoch": 1.753184165232358, "grad_norm": 0.8493391871452332, "learning_rate": 1.5545437987448912e-05, "loss": 1.516, "mean_token_accuracy": 0.6906410157680511, "num_tokens": 240723272.0, "step": 1274 }, { "epoch": 1.7545611015490534, "grad_norm": 0.8551358580589294, "learning_rate": 1.551644029616206e-05, "loss": 1.2773, "mean_token_accuracy": 0.7359446063637733, "num_tokens": 240878295.0, "step": 1275 }, { "epoch": 1.7559380378657488, "grad_norm": 0.8808310627937317, "learning_rate": 1.548745252540339e-05, "loss": 1.125, "mean_token_accuracy": 0.7647600769996643, "num_tokens": 241021071.0, "step": 1276 }, { "epoch": 1.757314974182444, "grad_norm": 1.1303379535675049, "learning_rate": 1.545847473931254e-05, "loss": 1.2059, "mean_token_accuracy": 0.7511379793286324, "num_tokens": 241146088.0, "step": 1277 }, { "epoch": 1.7586919104991394, "grad_norm": 0.6889764666557312, "learning_rate": 1.5429507002007096e-05, "loss": 1.7435, "mean_token_accuracy": 0.6458823308348656, "num_tokens": 241467960.0, "step": 1278 }, { "epoch": 1.7600688468158348, "grad_norm": 0.8078868389129639, "learning_rate": 1.5400549377582392e-05, "loss": 1.8862, "mean_token_accuracy": 0.632098838686943, "num_tokens": 241677384.0, "step": 1279 }, { "epoch": 1.76144578313253, "grad_norm": 0.842052161693573, "learning_rate": 1.5371601930111382e-05, "loss": 1.6121, "mean_token_accuracy": 0.6775482296943665, "num_tokens": 241854950.0, "step": 1280 }, { "epoch": 1.7628227194492254, "grad_norm": 0.8564832210540771, "learning_rate": 1.5342664723644502e-05, "loss": 1.3578, "mean_token_accuracy": 0.7196655422449112, "num_tokens": 242013645.0, "step": 1281 }, { "epoch": 1.7641996557659207, "grad_norm": 0.8789716958999634, "learning_rate": 1.5313737822209532e-05, "loss": 1.1478, "mean_token_accuracy": 0.758471317589283, "num_tokens": 242159529.0, "step": 1282 }, { "epoch": 1.765576592082616, "grad_norm": 0.9754416942596436, "learning_rate": 1.5284821289811453e-05, "loss": 1.0648, "mean_token_accuracy": 0.776745967566967, "num_tokens": 242290680.0, "step": 1283 }, { "epoch": 1.7669535283993114, "grad_norm": 0.6603948473930359, "learning_rate": 1.525591519043231e-05, "loss": 1.6158, "mean_token_accuracy": 0.6777462288737297, "num_tokens": 242586842.0, "step": 1284 }, { "epoch": 1.7683304647160067, "grad_norm": 0.7511172890663147, "learning_rate": 1.5227019588031035e-05, "loss": 1.9058, "mean_token_accuracy": 0.6282012090086937, "num_tokens": 242811086.0, "step": 1285 }, { "epoch": 1.769707401032702, "grad_norm": 0.8251307606697083, "learning_rate": 1.519813454654336e-05, "loss": 1.6557, "mean_token_accuracy": 0.6673218235373497, "num_tokens": 242994211.0, "step": 1286 }, { "epoch": 1.7710843373493976, "grad_norm": 0.8694981336593628, "learning_rate": 1.5169260129881638e-05, "loss": 1.4476, "mean_token_accuracy": 0.703913114964962, "num_tokens": 243156671.0, "step": 1287 }, { "epoch": 1.772461273666093, "grad_norm": 0.879697322845459, "learning_rate": 1.5140396401934725e-05, "loss": 1.1885, "mean_token_accuracy": 0.7493556439876556, "num_tokens": 243305637.0, "step": 1288 }, { "epoch": 1.7738382099827883, "grad_norm": 0.9404916167259216, "learning_rate": 1.5111543426567813e-05, "loss": 1.0628, "mean_token_accuracy": 0.7761910632252693, "num_tokens": 243440988.0, "step": 1289 }, { "epoch": 1.7752151462994836, "grad_norm": 0.6362167596817017, "learning_rate": 1.5082701267622311e-05, "loss": 1.6063, "mean_token_accuracy": 0.7022790089249611, "num_tokens": 243696062.0, "step": 1290 }, { "epoch": 1.776592082616179, "grad_norm": 0.7374881505966187, "learning_rate": 1.5053869988915691e-05, "loss": 1.9388, "mean_token_accuracy": 0.6248039975762367, "num_tokens": 243936542.0, "step": 1291 }, { "epoch": 1.7779690189328745, "grad_norm": 0.811362624168396, "learning_rate": 1.502504965424135e-05, "loss": 1.7284, "mean_token_accuracy": 0.6563708782196045, "num_tokens": 244127887.0, "step": 1292 }, { "epoch": 1.7793459552495698, "grad_norm": 0.8551064133644104, "learning_rate": 1.4996240327368478e-05, "loss": 1.4571, "mean_token_accuracy": 0.7014239430427551, "num_tokens": 244294866.0, "step": 1293 }, { "epoch": 1.7807228915662652, "grad_norm": 0.884354293346405, "learning_rate": 1.4967442072041895e-05, "loss": 1.2532, "mean_token_accuracy": 0.7396837174892426, "num_tokens": 244447087.0, "step": 1294 }, { "epoch": 1.7820998278829605, "grad_norm": 0.9257299900054932, "learning_rate": 1.4938654951981933e-05, "loss": 1.104, "mean_token_accuracy": 0.7685519829392433, "num_tokens": 244586512.0, "step": 1295 }, { "epoch": 1.7834767641996558, "grad_norm": 0.3103896975517273, "learning_rate": 1.4909879030884282e-05, "loss": 1.4026, "mean_token_accuracy": 0.7393483072519302, "num_tokens": 244798371.0, "step": 1296 }, { "epoch": 1.7848537005163512, "grad_norm": 0.7191721200942993, "learning_rate": 1.4881114372419854e-05, "loss": 1.8692, "mean_token_accuracy": 0.6342304199934006, "num_tokens": 245065866.0, "step": 1297 }, { "epoch": 1.7862306368330465, "grad_norm": 0.8058621883392334, "learning_rate": 1.4852361040234646e-05, "loss": 1.7826, "mean_token_accuracy": 0.6471372172236443, "num_tokens": 245264976.0, "step": 1298 }, { "epoch": 1.7876075731497418, "grad_norm": 0.8349825143814087, "learning_rate": 1.4823619097949584e-05, "loss": 1.5107, "mean_token_accuracy": 0.6959997788071632, "num_tokens": 245436266.0, "step": 1299 }, { "epoch": 1.7889845094664372, "grad_norm": 0.8561532497406006, "learning_rate": 1.47948886091604e-05, "loss": 1.2631, "mean_token_accuracy": 0.7373026087880135, "num_tokens": 245590599.0, "step": 1300 }, { "epoch": 1.7903614457831325, "grad_norm": 0.9090086221694946, "learning_rate": 1.4766169637437485e-05, "loss": 1.119, "mean_token_accuracy": 0.7610385119915009, "num_tokens": 245732785.0, "step": 1301 }, { "epoch": 1.7917383820998278, "grad_norm": 1.1533457040786743, "learning_rate": 1.4737462246325731e-05, "loss": 1.1711, "mean_token_accuracy": 0.7551911696791649, "num_tokens": 245857154.0, "step": 1302 }, { "epoch": 1.7931153184165232, "grad_norm": 0.6462624669075012, "learning_rate": 1.4708766499344424e-05, "loss": 1.6989, "mean_token_accuracy": 0.6538957431912422, "num_tokens": 246184019.0, "step": 1303 }, { "epoch": 1.7944922547332185, "grad_norm": 0.7905831933021545, "learning_rate": 1.4680082459987079e-05, "loss": 1.8578, "mean_token_accuracy": 0.6375768184661865, "num_tokens": 246396660.0, "step": 1304 }, { "epoch": 1.7958691910499138, "grad_norm": 0.843809187412262, "learning_rate": 1.4651410191721306e-05, "loss": 1.6408, "mean_token_accuracy": 0.6718823984265327, "num_tokens": 246575479.0, "step": 1305 }, { "epoch": 1.7972461273666092, "grad_norm": 0.8617410063743591, "learning_rate": 1.462274975798867e-05, "loss": 1.3496, "mean_token_accuracy": 0.7217898741364479, "num_tokens": 246735353.0, "step": 1306 }, { "epoch": 1.7986230636833045, "grad_norm": 0.8956752419471741, "learning_rate": 1.4594101222204544e-05, "loss": 1.1498, "mean_token_accuracy": 0.7571221962571144, "num_tokens": 246881610.0, "step": 1307 }, { "epoch": 1.8, "grad_norm": 1.0061700344085693, "learning_rate": 1.4565464647757997e-05, "loss": 1.1016, "mean_token_accuracy": 0.7667861506342888, "num_tokens": 247012963.0, "step": 1308 }, { "epoch": 1.8013769363166954, "grad_norm": 0.6535512208938599, "learning_rate": 1.4536840098011613e-05, "loss": 1.6868, "mean_token_accuracy": 0.6667754799127579, "num_tokens": 247306985.0, "step": 1309 }, { "epoch": 1.8027538726333907, "grad_norm": 0.7586234211921692, "learning_rate": 1.450822763630136e-05, "loss": 1.8998, "mean_token_accuracy": 0.6290462613105774, "num_tokens": 247531615.0, "step": 1310 }, { "epoch": 1.804130808950086, "grad_norm": 0.8287749886512756, "learning_rate": 1.4479627325936476e-05, "loss": 1.6755, "mean_token_accuracy": 0.6664717867970467, "num_tokens": 247715505.0, "step": 1311 }, { "epoch": 1.8055077452667814, "grad_norm": 0.870663583278656, "learning_rate": 1.4451039230199317e-05, "loss": 1.4187, "mean_token_accuracy": 0.7092928290367126, "num_tokens": 247878132.0, "step": 1312 }, { "epoch": 1.806884681583477, "grad_norm": 0.8821465373039246, "learning_rate": 1.4422463412345202e-05, "loss": 1.1801, "mean_token_accuracy": 0.7516836747527122, "num_tokens": 248026867.0, "step": 1313 }, { "epoch": 1.8082616179001723, "grad_norm": 0.9614560604095459, "learning_rate": 1.4393899935602282e-05, "loss": 1.0963, "mean_token_accuracy": 0.7707404494285583, "num_tokens": 248162390.0, "step": 1314 }, { "epoch": 1.8096385542168676, "grad_norm": 0.621468186378479, "learning_rate": 1.4365348863171406e-05, "loss": 1.5259, "mean_token_accuracy": 0.704691007733345, "num_tokens": 248420458.0, "step": 1315 }, { "epoch": 1.811015490533563, "grad_norm": 0.7369716167449951, "learning_rate": 1.433681025822598e-05, "loss": 1.9185, "mean_token_accuracy": 0.6262470260262489, "num_tokens": 248664305.0, "step": 1316 }, { "epoch": 1.8123924268502583, "grad_norm": 0.8140324950218201, "learning_rate": 1.4308284183911818e-05, "loss": 1.7438, "mean_token_accuracy": 0.6548831313848495, "num_tokens": 248857123.0, "step": 1317 }, { "epoch": 1.8137693631669536, "grad_norm": 0.8646644353866577, "learning_rate": 1.4279770703347008e-05, "loss": 1.4798, "mean_token_accuracy": 0.6994293704628944, "num_tokens": 249024720.0, "step": 1318 }, { "epoch": 1.815146299483649, "grad_norm": 0.8854145407676697, "learning_rate": 1.425126987962177e-05, "loss": 1.2322, "mean_token_accuracy": 0.741290807723999, "num_tokens": 249176941.0, "step": 1319 }, { "epoch": 1.8165232358003442, "grad_norm": 0.9297785758972168, "learning_rate": 1.4222781775798327e-05, "loss": 1.1189, "mean_token_accuracy": 0.7670661583542824, "num_tokens": 249316438.0, "step": 1320 }, { "epoch": 1.8179001721170396, "grad_norm": 0.32708412408828735, "learning_rate": 1.4194306454910757e-05, "loss": 1.3856, "mean_token_accuracy": 0.7358467876911163, "num_tokens": 249523376.0, "step": 1321 }, { "epoch": 1.819277108433735, "grad_norm": 0.714974045753479, "learning_rate": 1.4165843979964841e-05, "loss": 1.8723, "mean_token_accuracy": 0.6344536542892456, "num_tokens": 249793300.0, "step": 1322 }, { "epoch": 1.8206540447504302, "grad_norm": 0.8129270076751709, "learning_rate": 1.4137394413937959e-05, "loss": 1.7956, "mean_token_accuracy": 0.644280806183815, "num_tokens": 249994908.0, "step": 1323 }, { "epoch": 1.8220309810671256, "grad_norm": 0.8463249802589417, "learning_rate": 1.410895781977891e-05, "loss": 1.5508, "mean_token_accuracy": 0.6853475645184517, "num_tokens": 250167948.0, "step": 1324 }, { "epoch": 1.823407917383821, "grad_norm": 0.8694126009941101, "learning_rate": 1.4080534260407792e-05, "loss": 1.2964, "mean_token_accuracy": 0.7304019778966904, "num_tokens": 250323942.0, "step": 1325 }, { "epoch": 1.8247848537005162, "grad_norm": 0.8997028470039368, "learning_rate": 1.4052123798715874e-05, "loss": 1.1461, "mean_token_accuracy": 0.7565995380282402, "num_tokens": 250467084.0, "step": 1326 }, { "epoch": 1.8261617900172116, "grad_norm": 1.1254774332046509, "learning_rate": 1.4023726497565422e-05, "loss": 1.1325, "mean_token_accuracy": 0.7612581923604012, "num_tokens": 250592011.0, "step": 1327 }, { "epoch": 1.827538726333907, "grad_norm": 0.6326587200164795, "learning_rate": 1.3995342419789608e-05, "loss": 1.6957, "mean_token_accuracy": 0.6511140465736389, "num_tokens": 250921126.0, "step": 1328 }, { "epoch": 1.8289156626506025, "grad_norm": 0.7798895835876465, "learning_rate": 1.396697162819233e-05, "loss": 1.8215, "mean_token_accuracy": 0.6423550024628639, "num_tokens": 251131189.0, "step": 1329 }, { "epoch": 1.8302925989672978, "grad_norm": 0.831348180770874, "learning_rate": 1.3938614185548094e-05, "loss": 1.5703, "mean_token_accuracy": 0.6831551417708397, "num_tokens": 251307947.0, "step": 1330 }, { "epoch": 1.8316695352839931, "grad_norm": 0.8575800061225891, "learning_rate": 1.3910270154601864e-05, "loss": 1.3175, "mean_token_accuracy": 0.7264093831181526, "num_tokens": 251466166.0, "step": 1331 }, { "epoch": 1.8330464716006885, "grad_norm": 0.9102597236633301, "learning_rate": 1.388193959806893e-05, "loss": 1.1819, "mean_token_accuracy": 0.7475340589880943, "num_tokens": 251611729.0, "step": 1332 }, { "epoch": 1.8344234079173838, "grad_norm": 0.9966944456100464, "learning_rate": 1.385362257863478e-05, "loss": 1.0812, "mean_token_accuracy": 0.7753980755805969, "num_tokens": 251742499.0, "step": 1333 }, { "epoch": 1.8358003442340793, "grad_norm": 0.6318532228469849, "learning_rate": 1.3825319158954941e-05, "loss": 1.6857, "mean_token_accuracy": 0.6680663004517555, "num_tokens": 252037422.0, "step": 1334 }, { "epoch": 1.8371772805507747, "grad_norm": 0.7561232447624207, "learning_rate": 1.3797029401654834e-05, "loss": 1.8816, "mean_token_accuracy": 0.632215678691864, "num_tokens": 252259878.0, "step": 1335 }, { "epoch": 1.83855421686747, "grad_norm": 0.8427339792251587, "learning_rate": 1.3768753369329664e-05, "loss": 1.6483, "mean_token_accuracy": 0.6709583550691605, "num_tokens": 252442662.0, "step": 1336 }, { "epoch": 1.8399311531841653, "grad_norm": 0.8654614090919495, "learning_rate": 1.3740491124544276e-05, "loss": 1.3724, "mean_token_accuracy": 0.7191244512796402, "num_tokens": 252604886.0, "step": 1337 }, { "epoch": 1.8413080895008607, "grad_norm": 0.8719386458396912, "learning_rate": 1.3712242729832993e-05, "loss": 1.1999, "mean_token_accuracy": 0.7476626187562943, "num_tokens": 252753882.0, "step": 1338 }, { "epoch": 1.842685025817556, "grad_norm": 0.9376453757286072, "learning_rate": 1.3684008247699505e-05, "loss": 1.0809, "mean_token_accuracy": 0.7719202935695648, "num_tokens": 252889803.0, "step": 1339 }, { "epoch": 1.8440619621342513, "grad_norm": 0.6114423871040344, "learning_rate": 1.3655787740616702e-05, "loss": 1.5696, "mean_token_accuracy": 0.7059722244739532, "num_tokens": 253148972.0, "step": 1340 }, { "epoch": 1.8454388984509467, "grad_norm": 0.7201230525970459, "learning_rate": 1.3627581271026565e-05, "loss": 1.9373, "mean_token_accuracy": 0.6213406324386597, "num_tokens": 253392879.0, "step": 1341 }, { "epoch": 1.846815834767642, "grad_norm": 0.8226843476295471, "learning_rate": 1.3599388901340019e-05, "loss": 1.7478, "mean_token_accuracy": 0.6555330231785774, "num_tokens": 253585104.0, "step": 1342 }, { "epoch": 1.8481927710843373, "grad_norm": 0.8581429123878479, "learning_rate": 1.3571210693936774e-05, "loss": 1.4648, "mean_token_accuracy": 0.7003771513700485, "num_tokens": 253752385.0, "step": 1343 }, { "epoch": 1.8495697074010327, "grad_norm": 0.8811759948730469, "learning_rate": 1.3543046711165215e-05, "loss": 1.2587, "mean_token_accuracy": 0.7371983155608177, "num_tokens": 253904275.0, "step": 1344 }, { "epoch": 1.850946643717728, "grad_norm": 0.9092879295349121, "learning_rate": 1.3514897015342257e-05, "loss": 1.0861, "mean_token_accuracy": 0.7705585956573486, "num_tokens": 254043813.0, "step": 1345 }, { "epoch": 1.8523235800344233, "grad_norm": 0.33225715160369873, "learning_rate": 1.3486761668753187e-05, "loss": 1.4459, "mean_token_accuracy": 0.7388823553919792, "num_tokens": 254249588.0, "step": 1346 }, { "epoch": 1.8537005163511187, "grad_norm": 0.7166661620140076, "learning_rate": 1.345864073365157e-05, "loss": 1.9038, "mean_token_accuracy": 0.6285142078995705, "num_tokens": 254514684.0, "step": 1347 }, { "epoch": 1.855077452667814, "grad_norm": 0.7955671548843384, "learning_rate": 1.343053427225905e-05, "loss": 1.78, "mean_token_accuracy": 0.6500406041741371, "num_tokens": 254714231.0, "step": 1348 }, { "epoch": 1.8564543889845093, "grad_norm": 0.8379384875297546, "learning_rate": 1.3402442346765272e-05, "loss": 1.5395, "mean_token_accuracy": 0.690035991370678, "num_tokens": 254886686.0, "step": 1349 }, { "epoch": 1.8578313253012049, "grad_norm": 0.874836266040802, "learning_rate": 1.3374365019327709e-05, "loss": 1.2876, "mean_token_accuracy": 0.732218012213707, "num_tokens": 255042178.0, "step": 1350 }, { "epoch": 1.8592082616179002, "grad_norm": 0.8982986807823181, "learning_rate": 1.3346302352071525e-05, "loss": 1.144, "mean_token_accuracy": 0.7602247893810272, "num_tokens": 255184747.0, "step": 1351 }, { "epoch": 1.8605851979345955, "grad_norm": 1.1676193475723267, "learning_rate": 1.3318254407089459e-05, "loss": 1.2186, "mean_token_accuracy": 0.7483651414513588, "num_tokens": 255309021.0, "step": 1352 }, { "epoch": 1.8619621342512909, "grad_norm": 0.617272138595581, "learning_rate": 1.3290221246441669e-05, "loss": 1.658, "mean_token_accuracy": 0.6601421609520912, "num_tokens": 255639952.0, "step": 1353 }, { "epoch": 1.8633390705679862, "grad_norm": 0.7790567278862, "learning_rate": 1.3262202932155602e-05, "loss": 1.8684, "mean_token_accuracy": 0.6340247616171837, "num_tokens": 255852088.0, "step": 1354 }, { "epoch": 1.8647160068846815, "grad_norm": 0.8440123200416565, "learning_rate": 1.3234199526225858e-05, "loss": 1.6334, "mean_token_accuracy": 0.6711716502904892, "num_tokens": 256029891.0, "step": 1355 }, { "epoch": 1.866092943201377, "grad_norm": 0.8837971687316895, "learning_rate": 1.3206211090614035e-05, "loss": 1.3464, "mean_token_accuracy": 0.7213215157389641, "num_tokens": 256188967.0, "step": 1356 }, { "epoch": 1.8674698795180724, "grad_norm": 0.9012327194213867, "learning_rate": 1.3178237687248632e-05, "loss": 1.1468, "mean_token_accuracy": 0.7599586918950081, "num_tokens": 256334910.0, "step": 1357 }, { "epoch": 1.8688468158347677, "grad_norm": 1.0042059421539307, "learning_rate": 1.3150279378024873e-05, "loss": 1.0656, "mean_token_accuracy": 0.7780537381768227, "num_tokens": 256466139.0, "step": 1358 }, { "epoch": 1.870223752151463, "grad_norm": 0.6521654725074768, "learning_rate": 1.3122336224804589e-05, "loss": 1.6574, "mean_token_accuracy": 0.6711662411689758, "num_tokens": 256759382.0, "step": 1359 }, { "epoch": 1.8716006884681584, "grad_norm": 0.7461091876029968, "learning_rate": 1.3094408289416052e-05, "loss": 1.8649, "mean_token_accuracy": 0.6340623646974564, "num_tokens": 256984085.0, "step": 1360 }, { "epoch": 1.8729776247848537, "grad_norm": 0.8127901554107666, "learning_rate": 1.3066495633653897e-05, "loss": 1.6204, "mean_token_accuracy": 0.675668515264988, "num_tokens": 257168026.0, "step": 1361 }, { "epoch": 1.874354561101549, "grad_norm": 0.8580586314201355, "learning_rate": 1.303859831927894e-05, "loss": 1.3609, "mean_token_accuracy": 0.7214544489979744, "num_tokens": 257330349.0, "step": 1362 }, { "epoch": 1.8757314974182444, "grad_norm": 0.8856133818626404, "learning_rate": 1.3010716408018037e-05, "loss": 1.1911, "mean_token_accuracy": 0.7512433081865311, "num_tokens": 257479092.0, "step": 1363 }, { "epoch": 1.8771084337349397, "grad_norm": 0.939976692199707, "learning_rate": 1.2982849961563976e-05, "loss": 1.059, "mean_token_accuracy": 0.7757995575666428, "num_tokens": 257614461.0, "step": 1364 }, { "epoch": 1.878485370051635, "grad_norm": 0.6272172927856445, "learning_rate": 1.2954999041575331e-05, "loss": 1.5831, "mean_token_accuracy": 0.7035972103476524, "num_tokens": 257871309.0, "step": 1365 }, { "epoch": 1.8798623063683304, "grad_norm": 0.7191384434700012, "learning_rate": 1.2927163709676305e-05, "loss": 1.8735, "mean_token_accuracy": 0.6344793662428856, "num_tokens": 258112765.0, "step": 1366 }, { "epoch": 1.8812392426850257, "grad_norm": 0.8407889008522034, "learning_rate": 1.2899344027456638e-05, "loss": 1.7176, "mean_token_accuracy": 0.6588469073176384, "num_tokens": 258302897.0, "step": 1367 }, { "epoch": 1.882616179001721, "grad_norm": 0.8683682084083557, "learning_rate": 1.2871540056471403e-05, "loss": 1.4684, "mean_token_accuracy": 0.7004240900278091, "num_tokens": 258468980.0, "step": 1368 }, { "epoch": 1.8839931153184164, "grad_norm": 0.8766279816627502, "learning_rate": 1.2843751858240938e-05, "loss": 1.2345, "mean_token_accuracy": 0.7398325428366661, "num_tokens": 258620697.0, "step": 1369 }, { "epoch": 1.8853700516351117, "grad_norm": 0.90859055519104, "learning_rate": 1.2815979494250672e-05, "loss": 1.0786, "mean_token_accuracy": 0.7722681686282158, "num_tokens": 258760316.0, "step": 1370 }, { "epoch": 1.886746987951807, "grad_norm": 0.311278373003006, "learning_rate": 1.278822302595101e-05, "loss": 1.3326, "mean_token_accuracy": 0.7429277077317238, "num_tokens": 258971252.0, "step": 1371 }, { "epoch": 1.8881239242685026, "grad_norm": 0.7106454372406006, "learning_rate": 1.276048251475717e-05, "loss": 1.8427, "mean_token_accuracy": 0.6397242471575737, "num_tokens": 259241789.0, "step": 1372 }, { "epoch": 1.889500860585198, "grad_norm": 0.808847963809967, "learning_rate": 1.2732758022049072e-05, "loss": 1.7698, "mean_token_accuracy": 0.6507874727249146, "num_tokens": 259442257.0, "step": 1373 }, { "epoch": 1.8908777969018933, "grad_norm": 0.8578771352767944, "learning_rate": 1.2705049609171186e-05, "loss": 1.5298, "mean_token_accuracy": 0.6919310986995697, "num_tokens": 259614445.0, "step": 1374 }, { "epoch": 1.8922547332185886, "grad_norm": 0.8807763457298279, "learning_rate": 1.267735733743242e-05, "loss": 1.2653, "mean_token_accuracy": 0.7374033331871033, "num_tokens": 259769806.0, "step": 1375 }, { "epoch": 1.893631669535284, "grad_norm": 0.8843784928321838, "learning_rate": 1.2649681268105933e-05, "loss": 1.099, "mean_token_accuracy": 0.7669659703969955, "num_tokens": 259912617.0, "step": 1376 }, { "epoch": 1.8950086058519795, "grad_norm": 1.121716856956482, "learning_rate": 1.2622021462429069e-05, "loss": 1.1637, "mean_token_accuracy": 0.7569302842020988, "num_tokens": 260037181.0, "step": 1377 }, { "epoch": 1.8963855421686748, "grad_norm": 0.6206921339035034, "learning_rate": 1.2594377981603167e-05, "loss": 1.6935, "mean_token_accuracy": 0.6538518890738487, "num_tokens": 260373890.0, "step": 1378 }, { "epoch": 1.8977624784853702, "grad_norm": 0.7759021520614624, "learning_rate": 1.2566750886793453e-05, "loss": 1.8693, "mean_token_accuracy": 0.6360936239361763, "num_tokens": 260586397.0, "step": 1379 }, { "epoch": 1.8991394148020655, "grad_norm": 0.8486682176589966, "learning_rate": 1.2539140239128891e-05, "loss": 1.6203, "mean_token_accuracy": 0.6759685575962067, "num_tokens": 260764936.0, "step": 1380 }, { "epoch": 1.9005163511187608, "grad_norm": 0.8492434620857239, "learning_rate": 1.251154609970206e-05, "loss": 1.3218, "mean_token_accuracy": 0.7263266518712044, "num_tokens": 260924353.0, "step": 1381 }, { "epoch": 1.9018932874354562, "grad_norm": 0.8954276442527771, "learning_rate": 1.2483968529569009e-05, "loss": 1.1933, "mean_token_accuracy": 0.7474881187081337, "num_tokens": 261070455.0, "step": 1382 }, { "epoch": 1.9032702237521515, "grad_norm": 0.9996843338012695, "learning_rate": 1.2456407589749126e-05, "loss": 1.0674, "mean_token_accuracy": 0.7782631814479828, "num_tokens": 261201431.0, "step": 1383 }, { "epoch": 1.9046471600688468, "grad_norm": 0.630810558795929, "learning_rate": 1.2428863341224988e-05, "loss": 1.5928, "mean_token_accuracy": 0.6787693053483963, "num_tokens": 261497581.0, "step": 1384 }, { "epoch": 1.9060240963855422, "grad_norm": 0.7666919231414795, "learning_rate": 1.2401335844942263e-05, "loss": 1.9024, "mean_token_accuracy": 0.6306620389223099, "num_tokens": 261720600.0, "step": 1385 }, { "epoch": 1.9074010327022375, "grad_norm": 0.8369731903076172, "learning_rate": 1.2373825161809541e-05, "loss": 1.6488, "mean_token_accuracy": 0.6671880558133125, "num_tokens": 261903551.0, "step": 1386 }, { "epoch": 1.9087779690189328, "grad_norm": 0.8711162209510803, "learning_rate": 1.2346331352698206e-05, "loss": 1.3855, "mean_token_accuracy": 0.7164968103170395, "num_tokens": 262065607.0, "step": 1387 }, { "epoch": 1.9101549053356282, "grad_norm": 0.8847206830978394, "learning_rate": 1.2318854478442317e-05, "loss": 1.1991, "mean_token_accuracy": 0.7501604855060577, "num_tokens": 262214130.0, "step": 1388 }, { "epoch": 1.9115318416523235, "grad_norm": 0.9412459135055542, "learning_rate": 1.2291394599838453e-05, "loss": 1.0746, "mean_token_accuracy": 0.773893415927887, "num_tokens": 262349526.0, "step": 1389 }, { "epoch": 1.9129087779690188, "grad_norm": 0.618564784526825, "learning_rate": 1.2263951777645588e-05, "loss": 1.519, "mean_token_accuracy": 0.7115224450826645, "num_tokens": 262604319.0, "step": 1390 }, { "epoch": 1.9142857142857141, "grad_norm": 0.720917284488678, "learning_rate": 1.223652607258496e-05, "loss": 1.8997, "mean_token_accuracy": 0.6330820620059967, "num_tokens": 262845161.0, "step": 1391 }, { "epoch": 1.9156626506024095, "grad_norm": 0.8232987523078918, "learning_rate": 1.2209117545339945e-05, "loss": 1.7089, "mean_token_accuracy": 0.6587498486042023, "num_tokens": 263035428.0, "step": 1392 }, { "epoch": 1.917039586919105, "grad_norm": 0.8537179231643677, "learning_rate": 1.2181726256555877e-05, "loss": 1.4136, "mean_token_accuracy": 0.7102102488279343, "num_tokens": 263201334.0, "step": 1393 }, { "epoch": 1.9184165232358004, "grad_norm": 0.8880217671394348, "learning_rate": 1.2154352266839977e-05, "loss": 1.2325, "mean_token_accuracy": 0.7410198375582695, "num_tokens": 263352817.0, "step": 1394 }, { "epoch": 1.9197934595524957, "grad_norm": 0.9116145968437195, "learning_rate": 1.2126995636761174e-05, "loss": 1.07, "mean_token_accuracy": 0.7738258540630341, "num_tokens": 263492016.0, "step": 1395 }, { "epoch": 1.921170395869191, "grad_norm": 0.30924832820892334, "learning_rate": 1.2099656426850004e-05, "loss": 1.3582, "mean_token_accuracy": 0.7404803484678268, "num_tokens": 263704303.0, "step": 1396 }, { "epoch": 1.9225473321858864, "grad_norm": 0.7105490565299988, "learning_rate": 1.2072334697598439e-05, "loss": 1.9062, "mean_token_accuracy": 0.6273626461625099, "num_tokens": 263973352.0, "step": 1397 }, { "epoch": 1.923924268502582, "grad_norm": 0.7977651953697205, "learning_rate": 1.204503050945978e-05, "loss": 1.7478, "mean_token_accuracy": 0.6540260538458824, "num_tokens": 264173762.0, "step": 1398 }, { "epoch": 1.9253012048192772, "grad_norm": 0.8480536341667175, "learning_rate": 1.2017743922848518e-05, "loss": 1.526, "mean_token_accuracy": 0.6902996376156807, "num_tokens": 264346402.0, "step": 1399 }, { "epoch": 1.9266781411359726, "grad_norm": 0.8731435537338257, "learning_rate": 1.1990474998140198e-05, "loss": 1.2805, "mean_token_accuracy": 0.7339234873652458, "num_tokens": 264502154.0, "step": 1400 }, { "epoch": 1.928055077452668, "grad_norm": 0.8941655158996582, "learning_rate": 1.1963223795671279e-05, "loss": 1.0945, "mean_token_accuracy": 0.7686266973614693, "num_tokens": 264645013.0, "step": 1401 }, { "epoch": 1.9294320137693632, "grad_norm": 1.1239748001098633, "learning_rate": 1.1935990375739011e-05, "loss": 1.1351, "mean_token_accuracy": 0.7614062875509262, "num_tokens": 264769898.0, "step": 1402 }, { "epoch": 1.9308089500860586, "grad_norm": 0.6295753717422485, "learning_rate": 1.19087747986013e-05, "loss": 1.7058, "mean_token_accuracy": 0.653442993760109, "num_tokens": 265095775.0, "step": 1403 }, { "epoch": 1.932185886402754, "grad_norm": 0.7719454169273376, "learning_rate": 1.1881577124476569e-05, "loss": 1.8436, "mean_token_accuracy": 0.6376594305038452, "num_tokens": 265306128.0, "step": 1404 }, { "epoch": 1.9335628227194492, "grad_norm": 0.8550253510475159, "learning_rate": 1.1854397413543626e-05, "loss": 1.5957, "mean_token_accuracy": 0.6781428456306458, "num_tokens": 265484152.0, "step": 1405 }, { "epoch": 1.9349397590361446, "grad_norm": 0.8659741282463074, "learning_rate": 1.1827235725941546e-05, "loss": 1.3435, "mean_token_accuracy": 0.7275960147380829, "num_tokens": 265643250.0, "step": 1406 }, { "epoch": 1.93631669535284, "grad_norm": 0.876444399356842, "learning_rate": 1.1800092121769506e-05, "loss": 1.1269, "mean_token_accuracy": 0.7614795416593552, "num_tokens": 265789326.0, "step": 1407 }, { "epoch": 1.9376936316695352, "grad_norm": 0.9669113159179688, "learning_rate": 1.177296666108669e-05, "loss": 1.0553, "mean_token_accuracy": 0.77923933416605, "num_tokens": 265920660.0, "step": 1408 }, { "epoch": 1.9390705679862306, "grad_norm": 0.6321369409561157, "learning_rate": 1.1745859403912108e-05, "loss": 1.6583, "mean_token_accuracy": 0.6688172966241837, "num_tokens": 266214843.0, "step": 1409 }, { "epoch": 1.940447504302926, "grad_norm": 0.7659294009208679, "learning_rate": 1.1718770410224524e-05, "loss": 1.8699, "mean_token_accuracy": 0.6382747516036034, "num_tokens": 266438061.0, "step": 1410 }, { "epoch": 1.9418244406196212, "grad_norm": 0.8295349478721619, "learning_rate": 1.1691699739962275e-05, "loss": 1.6534, "mean_token_accuracy": 0.6697342917323112, "num_tokens": 266621432.0, "step": 1411 }, { "epoch": 1.9432013769363166, "grad_norm": 0.8792596459388733, "learning_rate": 1.166464745302315e-05, "loss": 1.4187, "mean_token_accuracy": 0.7092704698443413, "num_tokens": 266784321.0, "step": 1412 }, { "epoch": 1.944578313253012, "grad_norm": 0.8690779209136963, "learning_rate": 1.1637613609264284e-05, "loss": 1.1537, "mean_token_accuracy": 0.7554378360509872, "num_tokens": 266933379.0, "step": 1413 }, { "epoch": 1.9459552495697074, "grad_norm": 0.9609450101852417, "learning_rate": 1.1610598268501982e-05, "loss": 1.0888, "mean_token_accuracy": 0.7704839035868645, "num_tokens": 267069152.0, "step": 1414 }, { "epoch": 1.9473321858864028, "grad_norm": 0.6223478317260742, "learning_rate": 1.1583601490511618e-05, "loss": 1.5443, "mean_token_accuracy": 0.7068136632442474, "num_tokens": 267324415.0, "step": 1415 }, { "epoch": 1.948709122203098, "grad_norm": 0.7259305119514465, "learning_rate": 1.1556623335027496e-05, "loss": 1.8685, "mean_token_accuracy": 0.6363760083913803, "num_tokens": 267569128.0, "step": 1416 }, { "epoch": 1.9500860585197934, "grad_norm": 0.808188796043396, "learning_rate": 1.1529663861742692e-05, "loss": 1.6875, "mean_token_accuracy": 0.6649265289306641, "num_tokens": 267761679.0, "step": 1417 }, { "epoch": 1.9514629948364888, "grad_norm": 0.8700260519981384, "learning_rate": 1.1502723130308979e-05, "loss": 1.4902, "mean_token_accuracy": 0.6955113932490349, "num_tokens": 267929539.0, "step": 1418 }, { "epoch": 1.9528399311531843, "grad_norm": 0.8664353489875793, "learning_rate": 1.147580120033664e-05, "loss": 1.1986, "mean_token_accuracy": 0.750079795718193, "num_tokens": 268081987.0, "step": 1419 }, { "epoch": 1.9542168674698797, "grad_norm": 0.920962393283844, "learning_rate": 1.1448898131394364e-05, "loss": 1.1114, "mean_token_accuracy": 0.7658612877130508, "num_tokens": 268221298.0, "step": 1420 }, { "epoch": 1.955593803786575, "grad_norm": 0.3439907729625702, "learning_rate": 1.1422013983009102e-05, "loss": 1.4081, "mean_token_accuracy": 0.7422041893005371, "num_tokens": 268423768.0, "step": 1421 }, { "epoch": 1.9569707401032703, "grad_norm": 0.7157606482505798, "learning_rate": 1.139514881466594e-05, "loss": 1.9276, "mean_token_accuracy": 0.623312696814537, "num_tokens": 268689196.0, "step": 1422 }, { "epoch": 1.9583476764199657, "grad_norm": 0.8100824356079102, "learning_rate": 1.1368302685807984e-05, "loss": 1.7628, "mean_token_accuracy": 0.6514342129230499, "num_tokens": 268888220.0, "step": 1423 }, { "epoch": 1.959724612736661, "grad_norm": 0.8664728999137878, "learning_rate": 1.1341475655836196e-05, "loss": 1.5155, "mean_token_accuracy": 0.6947689577937126, "num_tokens": 269059640.0, "step": 1424 }, { "epoch": 1.9611015490533563, "grad_norm": 0.8536049127578735, "learning_rate": 1.1314667784109285e-05, "loss": 1.2447, "mean_token_accuracy": 0.7373368144035339, "num_tokens": 269214584.0, "step": 1425 }, { "epoch": 1.9624784853700517, "grad_norm": 0.9018675088882446, "learning_rate": 1.1287879129943558e-05, "loss": 1.1243, "mean_token_accuracy": 0.7611035704612732, "num_tokens": 269357451.0, "step": 1426 }, { "epoch": 1.963855421686747, "grad_norm": 1.1527801752090454, "learning_rate": 1.1261109752612813e-05, "loss": 1.1708, "mean_token_accuracy": 0.7560771927237511, "num_tokens": 269482038.0, "step": 1427 }, { "epoch": 1.9652323580034423, "grad_norm": 0.6197457909584045, "learning_rate": 1.1234359711348195e-05, "loss": 1.681, "mean_token_accuracy": 0.653467670083046, "num_tokens": 269813179.0, "step": 1428 }, { "epoch": 1.9666092943201376, "grad_norm": 0.780720055103302, "learning_rate": 1.1207629065338063e-05, "loss": 1.8204, "mean_token_accuracy": 0.6423346400260925, "num_tokens": 270024992.0, "step": 1429 }, { "epoch": 1.967986230636833, "grad_norm": 0.842248797416687, "learning_rate": 1.118091787372786e-05, "loss": 1.5714, "mean_token_accuracy": 0.6846251338720322, "num_tokens": 270202255.0, "step": 1430 }, { "epoch": 1.9693631669535283, "grad_norm": 0.8757875561714172, "learning_rate": 1.1154226195619979e-05, "loss": 1.2912, "mean_token_accuracy": 0.728495679795742, "num_tokens": 270360520.0, "step": 1431 }, { "epoch": 1.9707401032702236, "grad_norm": 0.885865330696106, "learning_rate": 1.1127554090073639e-05, "loss": 1.1223, "mean_token_accuracy": 0.7638876736164093, "num_tokens": 270506139.0, "step": 1432 }, { "epoch": 1.972117039586919, "grad_norm": 0.9924752116203308, "learning_rate": 1.1100901616104776e-05, "loss": 1.0703, "mean_token_accuracy": 0.7769577279686928, "num_tokens": 270637382.0, "step": 1433 }, { "epoch": 1.9734939759036143, "grad_norm": 0.6685663461685181, "learning_rate": 1.107426883268584e-05, "loss": 1.6242, "mean_token_accuracy": 0.6768189966678619, "num_tokens": 270928432.0, "step": 1434 }, { "epoch": 1.9748709122203099, "grad_norm": 0.7688246965408325, "learning_rate": 1.1047655798745752e-05, "loss": 1.894, "mean_token_accuracy": 0.6318825706839561, "num_tokens": 271151442.0, "step": 1435 }, { "epoch": 1.9762478485370052, "grad_norm": 0.8250108957290649, "learning_rate": 1.1021062573169719e-05, "loss": 1.6113, "mean_token_accuracy": 0.6763308495283127, "num_tokens": 271335380.0, "step": 1436 }, { "epoch": 1.9776247848537005, "grad_norm": 0.8665726184844971, "learning_rate": 1.0994489214799122e-05, "loss": 1.3798, "mean_token_accuracy": 0.7133776620030403, "num_tokens": 271498451.0, "step": 1437 }, { "epoch": 1.9790017211703959, "grad_norm": 0.8847171068191528, "learning_rate": 1.0967935782431382e-05, "loss": 1.1576, "mean_token_accuracy": 0.7544376477599144, "num_tokens": 271647507.0, "step": 1438 }, { "epoch": 1.9803786574870912, "grad_norm": 0.9474474191665649, "learning_rate": 1.0941402334819836e-05, "loss": 1.0526, "mean_token_accuracy": 0.7797155231237411, "num_tokens": 271783194.0, "step": 1439 }, { "epoch": 1.9817555938037867, "grad_norm": 0.6000257134437561, "learning_rate": 1.091488893067359e-05, "loss": 1.519, "mean_token_accuracy": 0.7134736180305481, "num_tokens": 272041103.0, "step": 1440 }, { "epoch": 1.983132530120482, "grad_norm": 0.7204787731170654, "learning_rate": 1.0888395628657413e-05, "loss": 1.8889, "mean_token_accuracy": 0.6332732141017914, "num_tokens": 272283143.0, "step": 1441 }, { "epoch": 1.9845094664371774, "grad_norm": 0.8230807781219482, "learning_rate": 1.0861922487391588e-05, "loss": 1.714, "mean_token_accuracy": 0.6588010638952255, "num_tokens": 272474049.0, "step": 1442 }, { "epoch": 1.9858864027538727, "grad_norm": 0.8528974652290344, "learning_rate": 1.0835469565451792e-05, "loss": 1.4583, "mean_token_accuracy": 0.7036792635917664, "num_tokens": 272641011.0, "step": 1443 }, { "epoch": 1.987263339070568, "grad_norm": 0.8770927786827087, "learning_rate": 1.0809036921368966e-05, "loss": 1.2171, "mean_token_accuracy": 0.7425957173109055, "num_tokens": 272793012.0, "step": 1444 }, { "epoch": 1.9886402753872634, "grad_norm": 0.9051824808120728, "learning_rate": 1.078262461362918e-05, "loss": 1.0775, "mean_token_accuracy": 0.7729323580861092, "num_tokens": 272932596.0, "step": 1445 }, { "epoch": 1.9900172117039587, "grad_norm": 0.3294163644313812, "learning_rate": 1.0756232700673506e-05, "loss": 1.4032, "mean_token_accuracy": 0.7319297194480896, "num_tokens": 273139436.0, "step": 1446 }, { "epoch": 1.991394148020654, "grad_norm": 0.706210196018219, "learning_rate": 1.0729861240897892e-05, "loss": 1.8678, "mean_token_accuracy": 0.6353985369205475, "num_tokens": 273409152.0, "step": 1447 }, { "epoch": 1.9927710843373494, "grad_norm": 0.8196651339530945, "learning_rate": 1.070351029265303e-05, "loss": 1.7986, "mean_token_accuracy": 0.6477495655417442, "num_tokens": 273609183.0, "step": 1448 }, { "epoch": 1.9941480206540447, "grad_norm": 0.8589168190956116, "learning_rate": 1.0677179914244235e-05, "loss": 1.5171, "mean_token_accuracy": 0.6943169012665749, "num_tokens": 273781138.0, "step": 1449 }, { "epoch": 1.99552495697074, "grad_norm": 0.8774291276931763, "learning_rate": 1.0650870163931275e-05, "loss": 1.2367, "mean_token_accuracy": 0.7414907515048981, "num_tokens": 273936280.0, "step": 1450 }, { "epoch": 1.9969018932874354, "grad_norm": 0.9021503925323486, "learning_rate": 1.0624581099928324e-05, "loss": 1.099, "mean_token_accuracy": 0.7669189497828484, "num_tokens": 274078648.0, "step": 1451 }, { "epoch": 1.9982788296041307, "grad_norm": 1.1246883869171143, "learning_rate": 1.0598312780403756e-05, "loss": 1.1406, "mean_token_accuracy": 0.7656938433647156, "num_tokens": 274203222.0, "step": 1452 }, { "epoch": 1.999655765920826, "grad_norm": 0.862447202205658, "learning_rate": 1.0572065263480046e-05, "loss": 1.5707, "mean_token_accuracy": 0.7007786855101585, "num_tokens": 274398743.0, "step": 1453 }, { "epoch": 2.0, "grad_norm": 0.862447202205658, "learning_rate": 1.0572065263480046e-05, "loss": 1.1947, "mean_token_accuracy": 0.7604585886001587, "num_tokens": 274471301.0, "step": 1454 }, { "epoch": 2.0013769363166953, "grad_norm": 0.5245041251182556, "learning_rate": 1.054583860723365e-05, "loss": 1.6272, "mean_token_accuracy": 0.6637683361768723, "num_tokens": 274802773.0, "step": 1455 }, { "epoch": 2.0027538726333907, "grad_norm": 0.6168164610862732, "learning_rate": 1.0519632869694854e-05, "loss": 1.678, "mean_token_accuracy": 0.6699230596423149, "num_tokens": 275012987.0, "step": 1456 }, { "epoch": 2.004130808950086, "grad_norm": 0.6599552035331726, "learning_rate": 1.0493448108847669e-05, "loss": 1.4208, "mean_token_accuracy": 0.711552120745182, "num_tokens": 275190121.0, "step": 1457 }, { "epoch": 2.0055077452667813, "grad_norm": 0.6674471497535706, "learning_rate": 1.0467284382629685e-05, "loss": 1.1718, "mean_token_accuracy": 0.7587836384773254, "num_tokens": 275349182.0, "step": 1458 }, { "epoch": 2.0068846815834767, "grad_norm": 0.6672955751419067, "learning_rate": 1.0441141748931964e-05, "loss": 0.9691, "mean_token_accuracy": 0.7957204431295395, "num_tokens": 275495145.0, "step": 1459 }, { "epoch": 2.008261617900172, "grad_norm": 0.7448542714118958, "learning_rate": 1.0415020265598872e-05, "loss": 0.8875, "mean_token_accuracy": 0.8128118962049484, "num_tokens": 275626373.0, "step": 1460 }, { "epoch": 2.0096385542168673, "grad_norm": 0.4872192144393921, "learning_rate": 1.0388919990427992e-05, "loss": 1.5542, "mean_token_accuracy": 0.6897685751318932, "num_tokens": 275917261.0, "step": 1461 }, { "epoch": 2.0110154905335627, "grad_norm": 0.6192371249198914, "learning_rate": 1.0362840981169982e-05, "loss": 1.7514, "mean_token_accuracy": 0.6570955812931061, "num_tokens": 276141506.0, "step": 1462 }, { "epoch": 2.012392426850258, "grad_norm": 0.6606364846229553, "learning_rate": 1.0336783295528454e-05, "loss": 1.4777, "mean_token_accuracy": 0.7032622024416924, "num_tokens": 276325339.0, "step": 1463 }, { "epoch": 2.0137693631669533, "grad_norm": 0.6770911812782288, "learning_rate": 1.0310746991159822e-05, "loss": 1.2285, "mean_token_accuracy": 0.7483886703848839, "num_tokens": 276487572.0, "step": 1464 }, { "epoch": 2.015146299483649, "grad_norm": 0.6855038404464722, "learning_rate": 1.0284732125673198e-05, "loss": 1.01, "mean_token_accuracy": 0.7861585319042206, "num_tokens": 276636188.0, "step": 1465 }, { "epoch": 2.0165232358003444, "grad_norm": 0.7136525511741638, "learning_rate": 1.0258738756630255e-05, "loss": 0.8904, "mean_token_accuracy": 0.8137165829539299, "num_tokens": 276771291.0, "step": 1466 }, { "epoch": 2.0179001721170398, "grad_norm": 0.17994464933872223, "learning_rate": 1.0232766941545116e-05, "loss": 1.4497, "mean_token_accuracy": 0.7275280058383942, "num_tokens": 277026029.0, "step": 1467 }, { "epoch": 2.019277108433735, "grad_norm": 0.6066409945487976, "learning_rate": 1.0206816737884182e-05, "loss": 1.804, "mean_token_accuracy": 0.6491142809391022, "num_tokens": 277267490.0, "step": 1468 }, { "epoch": 2.0206540447504304, "grad_norm": 0.6626295447349548, "learning_rate": 1.0180888203066059e-05, "loss": 1.5657, "mean_token_accuracy": 0.6901882067322731, "num_tokens": 277458091.0, "step": 1469 }, { "epoch": 2.0220309810671258, "grad_norm": 0.6843374967575073, "learning_rate": 1.0154981394461409e-05, "loss": 1.2673, "mean_token_accuracy": 0.7406948506832123, "num_tokens": 277624958.0, "step": 1470 }, { "epoch": 2.023407917383821, "grad_norm": 0.6729955673217773, "learning_rate": 1.0129096369392815e-05, "loss": 1.0309, "mean_token_accuracy": 0.7859680280089378, "num_tokens": 277776917.0, "step": 1471 }, { "epoch": 2.0247848537005164, "grad_norm": 0.720774233341217, "learning_rate": 1.0103233185134647e-05, "loss": 0.9162, "mean_token_accuracy": 0.8090636879205704, "num_tokens": 277916285.0, "step": 1472 }, { "epoch": 2.0261617900172118, "grad_norm": 0.2538791000843048, "learning_rate": 1.0077391898912983e-05, "loss": 1.2554, "mean_token_accuracy": 0.7660908475518227, "num_tokens": 278122606.0, "step": 1473 }, { "epoch": 2.027538726333907, "grad_norm": 0.5798207521438599, "learning_rate": 1.0051572567905419e-05, "loss": 1.7568, "mean_token_accuracy": 0.6557189747691154, "num_tokens": 278390906.0, "step": 1474 }, { "epoch": 2.0289156626506024, "grad_norm": 0.6549938917160034, "learning_rate": 1.0025775249240993e-05, "loss": 1.6024, "mean_token_accuracy": 0.6799617409706116, "num_tokens": 278589706.0, "step": 1475 }, { "epoch": 2.0302925989672977, "grad_norm": 0.7017835974693298, "learning_rate": 1.0000000000000006e-05, "loss": 1.3374, "mean_token_accuracy": 0.7262888625264168, "num_tokens": 278760828.0, "step": 1476 }, { "epoch": 2.031669535283993, "grad_norm": 0.6785268187522888, "learning_rate": 9.974246877213955e-06, "loss": 1.081, "mean_token_accuracy": 0.7741596400737762, "num_tokens": 278915930.0, "step": 1477 }, { "epoch": 2.0330464716006884, "grad_norm": 0.6926475167274475, "learning_rate": 9.948515937865375e-06, "loss": 0.946, "mean_token_accuracy": 0.8030892834067345, "num_tokens": 279058880.0, "step": 1478 }, { "epoch": 2.0344234079173837, "grad_norm": 0.8619573712348938, "learning_rate": 9.922807238887708e-06, "loss": 0.9916, "mean_token_accuracy": 0.788903146982193, "num_tokens": 279184122.0, "step": 1479 }, { "epoch": 2.035800344234079, "grad_norm": 0.5203723907470703, "learning_rate": 9.897120837165197e-06, "loss": 1.5583, "mean_token_accuracy": 0.6745313107967377, "num_tokens": 279515124.0, "step": 1480 }, { "epoch": 2.0371772805507744, "grad_norm": 0.6505166888237, "learning_rate": 9.871456789532736e-06, "loss": 1.7181, "mean_token_accuracy": 0.6620335280895233, "num_tokens": 279726403.0, "step": 1481 }, { "epoch": 2.0385542168674697, "grad_norm": 0.6928951144218445, "learning_rate": 9.845815152775762e-06, "loss": 1.4476, "mean_token_accuracy": 0.707116924226284, "num_tokens": 279903830.0, "step": 1482 }, { "epoch": 2.039931153184165, "grad_norm": 0.6795191168785095, "learning_rate": 9.82019598363015e-06, "loss": 1.152, "mean_token_accuracy": 0.7619714960455894, "num_tokens": 280062707.0, "step": 1483 }, { "epoch": 2.0413080895008604, "grad_norm": 0.6890574097633362, "learning_rate": 9.794599338782011e-06, "loss": 0.9909, "mean_token_accuracy": 0.7909902110695839, "num_tokens": 280208436.0, "step": 1484 }, { "epoch": 2.0426850258175557, "grad_norm": 0.7527069449424744, "learning_rate": 9.769025274867659e-06, "loss": 0.8812, "mean_token_accuracy": 0.8175709322094917, "num_tokens": 280339505.0, "step": 1485 }, { "epoch": 2.0440619621342515, "grad_norm": 0.4757000803947449, "learning_rate": 9.743473848473429e-06, "loss": 1.5605, "mean_token_accuracy": 0.688486248254776, "num_tokens": 280639757.0, "step": 1486 }, { "epoch": 2.045438898450947, "grad_norm": 0.6278905272483826, "learning_rate": 9.717945116135568e-06, "loss": 1.7403, "mean_token_accuracy": 0.6578486189246178, "num_tokens": 280865298.0, "step": 1487 }, { "epoch": 2.046815834767642, "grad_norm": 0.6858152151107788, "learning_rate": 9.692439134340116e-06, "loss": 1.488, "mean_token_accuracy": 0.703212209045887, "num_tokens": 281048485.0, "step": 1488 }, { "epoch": 2.0481927710843375, "grad_norm": 0.6942347288131714, "learning_rate": 9.66695595952276e-06, "loss": 1.2272, "mean_token_accuracy": 0.747717097401619, "num_tokens": 281210753.0, "step": 1489 }, { "epoch": 2.049569707401033, "grad_norm": 0.68510502576828, "learning_rate": 9.641495648068739e-06, "loss": 0.9909, "mean_token_accuracy": 0.7892533913254738, "num_tokens": 281359523.0, "step": 1490 }, { "epoch": 2.050946643717728, "grad_norm": 0.7381236553192139, "learning_rate": 9.616058256312694e-06, "loss": 0.8885, "mean_token_accuracy": 0.8159934729337692, "num_tokens": 281495209.0, "step": 1491 }, { "epoch": 2.0523235800344235, "grad_norm": 0.17488650977611542, "learning_rate": 9.590643840538558e-06, "loss": 1.4697, "mean_token_accuracy": 0.7328026667237282, "num_tokens": 281748049.0, "step": 1492 }, { "epoch": 2.053700516351119, "grad_norm": 0.6142717003822327, "learning_rate": 9.56525245697942e-06, "loss": 1.7651, "mean_token_accuracy": 0.6531996205449104, "num_tokens": 281989526.0, "step": 1493 }, { "epoch": 2.055077452667814, "grad_norm": 0.6912600994110107, "learning_rate": 9.53988416181741e-06, "loss": 1.541, "mean_token_accuracy": 0.6931588649749756, "num_tokens": 282180414.0, "step": 1494 }, { "epoch": 2.0564543889845095, "grad_norm": 0.7013281583786011, "learning_rate": 9.514539011183573e-06, "loss": 1.2817, "mean_token_accuracy": 0.7365870550274849, "num_tokens": 282347425.0, "step": 1495 }, { "epoch": 2.057831325301205, "grad_norm": 0.6938005089759827, "learning_rate": 9.489217061157744e-06, "loss": 1.0627, "mean_token_accuracy": 0.7802534848451614, "num_tokens": 282499580.0, "step": 1496 }, { "epoch": 2.0592082616179, "grad_norm": 0.7185420989990234, "learning_rate": 9.463918367768421e-06, "loss": 0.9196, "mean_token_accuracy": 0.8099230527877808, "num_tokens": 282638952.0, "step": 1497 }, { "epoch": 2.0605851979345955, "grad_norm": 0.2506062090396881, "learning_rate": 9.438642986992641e-06, "loss": 1.2506, "mean_token_accuracy": 0.7708637565374374, "num_tokens": 282847905.0, "step": 1498 }, { "epoch": 2.061962134251291, "grad_norm": 0.5880323648452759, "learning_rate": 9.413390974755864e-06, "loss": 1.7507, "mean_token_accuracy": 0.6575633510947227, "num_tokens": 283116772.0, "step": 1499 }, { "epoch": 2.063339070567986, "grad_norm": 0.6459956765174866, "learning_rate": 9.388162386931842e-06, "loss": 1.6009, "mean_token_accuracy": 0.6825879886746407, "num_tokens": 283316250.0, "step": 1500 }, { "epoch": 2.0647160068846815, "grad_norm": 0.683403730392456, "learning_rate": 9.362957279342497e-06, "loss": 1.3389, "mean_token_accuracy": 0.7281539812684059, "num_tokens": 283488533.0, "step": 1501 }, { "epoch": 2.066092943201377, "grad_norm": 0.6890749931335449, "learning_rate": 9.337775707757792e-06, "loss": 1.0866, "mean_token_accuracy": 0.7723620012402534, "num_tokens": 283644233.0, "step": 1502 }, { "epoch": 2.067469879518072, "grad_norm": 0.6982219815254211, "learning_rate": 9.312617727895621e-06, "loss": 0.9182, "mean_token_accuracy": 0.8051978051662445, "num_tokens": 283787177.0, "step": 1503 }, { "epoch": 2.0688468158347675, "grad_norm": 0.8668891787528992, "learning_rate": 9.287483395421675e-06, "loss": 0.9685, "mean_token_accuracy": 0.7924002930521965, "num_tokens": 283911752.0, "step": 1504 }, { "epoch": 2.070223752151463, "grad_norm": 0.5198926329612732, "learning_rate": 9.262372765949319e-06, "loss": 1.5445, "mean_token_accuracy": 0.681204155087471, "num_tokens": 284243406.0, "step": 1505 }, { "epoch": 2.071600688468158, "grad_norm": 0.646537721157074, "learning_rate": 9.23728589503948e-06, "loss": 1.69, "mean_token_accuracy": 0.6681678667664528, "num_tokens": 284454869.0, "step": 1506 }, { "epoch": 2.072977624784854, "grad_norm": 0.6736452579498291, "learning_rate": 9.212222838200503e-06, "loss": 1.4329, "mean_token_accuracy": 0.7115899473428726, "num_tokens": 284633275.0, "step": 1507 }, { "epoch": 2.0743545611015493, "grad_norm": 0.7069110870361328, "learning_rate": 9.187183650888056e-06, "loss": 1.1811, "mean_token_accuracy": 0.7581687644124031, "num_tokens": 284792775.0, "step": 1508 }, { "epoch": 2.0757314974182446, "grad_norm": 0.6928911209106445, "learning_rate": 9.162168388504972e-06, "loss": 0.9642, "mean_token_accuracy": 0.7922114878892899, "num_tokens": 284938686.0, "step": 1509 }, { "epoch": 2.07710843373494, "grad_norm": 0.7813495397567749, "learning_rate": 9.13717710640116e-06, "loss": 0.9087, "mean_token_accuracy": 0.8117382675409317, "num_tokens": 285069351.0, "step": 1510 }, { "epoch": 2.0784853700516353, "grad_norm": 0.48035556077957153, "learning_rate": 9.112209859873479e-06, "loss": 1.5384, "mean_token_accuracy": 0.693968266248703, "num_tokens": 285363894.0, "step": 1511 }, { "epoch": 2.0798623063683306, "grad_norm": 0.6270480155944824, "learning_rate": 9.08726670416559e-06, "loss": 1.7327, "mean_token_accuracy": 0.6609182730317116, "num_tokens": 285586120.0, "step": 1512 }, { "epoch": 2.081239242685026, "grad_norm": 0.6709407567977905, "learning_rate": 9.06234769446785e-06, "loss": 1.4543, "mean_token_accuracy": 0.7072853893041611, "num_tokens": 285768869.0, "step": 1513 }, { "epoch": 2.0826161790017212, "grad_norm": 0.6872348189353943, "learning_rate": 9.037452885917197e-06, "loss": 1.2065, "mean_token_accuracy": 0.7526841014623642, "num_tokens": 285931232.0, "step": 1514 }, { "epoch": 2.0839931153184166, "grad_norm": 0.6794991493225098, "learning_rate": 9.012582333597016e-06, "loss": 1.0058, "mean_token_accuracy": 0.790569357573986, "num_tokens": 286080134.0, "step": 1515 }, { "epoch": 2.085370051635112, "grad_norm": 0.7353165745735168, "learning_rate": 8.987736092537029e-06, "loss": 0.8987, "mean_token_accuracy": 0.8125896900892258, "num_tokens": 286215751.0, "step": 1516 }, { "epoch": 2.0867469879518072, "grad_norm": 0.18044686317443848, "learning_rate": 8.962914217713148e-06, "loss": 1.4276, "mean_token_accuracy": 0.7308859825134277, "num_tokens": 286469224.0, "step": 1517 }, { "epoch": 2.0881239242685026, "grad_norm": 0.6110803484916687, "learning_rate": 8.938116764047387e-06, "loss": 1.7896, "mean_token_accuracy": 0.6511989906430244, "num_tokens": 286709533.0, "step": 1518 }, { "epoch": 2.089500860585198, "grad_norm": 0.6667545437812805, "learning_rate": 8.913343786407717e-06, "loss": 1.5462, "mean_token_accuracy": 0.69186582416296, "num_tokens": 286900801.0, "step": 1519 }, { "epoch": 2.0908777969018932, "grad_norm": 0.6947219371795654, "learning_rate": 8.888595339607961e-06, "loss": 1.2728, "mean_token_accuracy": 0.7415034621953964, "num_tokens": 287067657.0, "step": 1520 }, { "epoch": 2.0922547332185886, "grad_norm": 0.6835901141166687, "learning_rate": 8.863871478407648e-06, "loss": 1.0161, "mean_token_accuracy": 0.78593909740448, "num_tokens": 287219747.0, "step": 1521 }, { "epoch": 2.093631669535284, "grad_norm": 0.7160490155220032, "learning_rate": 8.839172257511934e-06, "loss": 0.8893, "mean_token_accuracy": 0.8152542263269424, "num_tokens": 287359389.0, "step": 1522 }, { "epoch": 2.0950086058519792, "grad_norm": 0.26382410526275635, "learning_rate": 8.814497731571432e-06, "loss": 1.3024, "mean_token_accuracy": 0.7682442963123322, "num_tokens": 287568854.0, "step": 1523 }, { "epoch": 2.0963855421686746, "grad_norm": 0.5817862153053284, "learning_rate": 8.789847955182118e-06, "loss": 1.7521, "mean_token_accuracy": 0.6553285866975784, "num_tokens": 287839618.0, "step": 1524 }, { "epoch": 2.09776247848537, "grad_norm": 0.6565458178520203, "learning_rate": 8.765222982885218e-06, "loss": 1.6359, "mean_token_accuracy": 0.6778272613883018, "num_tokens": 288041949.0, "step": 1525 }, { "epoch": 2.0991394148020652, "grad_norm": 0.6956175565719604, "learning_rate": 8.74062286916705e-06, "loss": 1.3409, "mean_token_accuracy": 0.7272205203771591, "num_tokens": 288214743.0, "step": 1526 }, { "epoch": 2.1005163511187606, "grad_norm": 0.7079113125801086, "learning_rate": 8.716047668458954e-06, "loss": 1.1048, "mean_token_accuracy": 0.767342247068882, "num_tokens": 288370530.0, "step": 1527 }, { "epoch": 2.1018932874354563, "grad_norm": 0.7006421685218811, "learning_rate": 8.691497435137135e-06, "loss": 0.9358, "mean_token_accuracy": 0.8045190051198006, "num_tokens": 288513952.0, "step": 1528 }, { "epoch": 2.1032702237521517, "grad_norm": 0.8556535840034485, "learning_rate": 8.666972223522559e-06, "loss": 0.9744, "mean_token_accuracy": 0.7961792573332787, "num_tokens": 288639298.0, "step": 1529 }, { "epoch": 2.104647160068847, "grad_norm": 0.523818850517273, "learning_rate": 8.642472087880823e-06, "loss": 1.5789, "mean_token_accuracy": 0.6779975593090057, "num_tokens": 288964985.0, "step": 1530 }, { "epoch": 2.1060240963855423, "grad_norm": 0.6485081911087036, "learning_rate": 8.617997082422031e-06, "loss": 1.6786, "mean_token_accuracy": 0.6694323793053627, "num_tokens": 289175265.0, "step": 1531 }, { "epoch": 2.1074010327022377, "grad_norm": 0.6912594437599182, "learning_rate": 8.593547261300716e-06, "loss": 1.4396, "mean_token_accuracy": 0.7091517522931099, "num_tokens": 289352959.0, "step": 1532 }, { "epoch": 2.108777969018933, "grad_norm": 0.679163932800293, "learning_rate": 8.569122678615658e-06, "loss": 1.1383, "mean_token_accuracy": 0.7633033692836761, "num_tokens": 289511782.0, "step": 1533 }, { "epoch": 2.1101549053356283, "grad_norm": 0.6919251084327698, "learning_rate": 8.544723388409788e-06, "loss": 0.9713, "mean_token_accuracy": 0.795423798263073, "num_tokens": 289657730.0, "step": 1534 }, { "epoch": 2.1115318416523237, "grad_norm": 0.7832975387573242, "learning_rate": 8.520349444670093e-06, "loss": 0.8998, "mean_token_accuracy": 0.8114837855100632, "num_tokens": 289788777.0, "step": 1535 }, { "epoch": 2.112908777969019, "grad_norm": 0.47778409719467163, "learning_rate": 8.496000901327467e-06, "loss": 1.5398, "mean_token_accuracy": 0.6903469935059547, "num_tokens": 290084755.0, "step": 1536 }, { "epoch": 2.1142857142857143, "grad_norm": 0.6218992471694946, "learning_rate": 8.47167781225661e-06, "loss": 1.7186, "mean_token_accuracy": 0.6634940207004547, "num_tokens": 290310001.0, "step": 1537 }, { "epoch": 2.1156626506024097, "grad_norm": 0.6773734092712402, "learning_rate": 8.447380231275889e-06, "loss": 1.4513, "mean_token_accuracy": 0.7102481201291084, "num_tokens": 290494126.0, "step": 1538 }, { "epoch": 2.117039586919105, "grad_norm": 0.7159666419029236, "learning_rate": 8.423108212147241e-06, "loss": 1.2194, "mean_token_accuracy": 0.7490066215395927, "num_tokens": 290656829.0, "step": 1539 }, { "epoch": 2.1184165232358003, "grad_norm": 0.6949093341827393, "learning_rate": 8.39886180857604e-06, "loss": 0.988, "mean_token_accuracy": 0.7933070585131645, "num_tokens": 290805842.0, "step": 1540 }, { "epoch": 2.1197934595524957, "grad_norm": 0.734028160572052, "learning_rate": 8.374641074210979e-06, "loss": 0.8896, "mean_token_accuracy": 0.8136364072561264, "num_tokens": 290941918.0, "step": 1541 }, { "epoch": 2.121170395869191, "grad_norm": 0.1727452576160431, "learning_rate": 8.35044606264396e-06, "loss": 1.4144, "mean_token_accuracy": 0.7334836572408676, "num_tokens": 291201598.0, "step": 1542 }, { "epoch": 2.1225473321858863, "grad_norm": 0.6117264032363892, "learning_rate": 8.326276827409963e-06, "loss": 1.792, "mean_token_accuracy": 0.6518953889608383, "num_tokens": 291445116.0, "step": 1543 }, { "epoch": 2.1239242685025816, "grad_norm": 0.6688777804374695, "learning_rate": 8.30213342198694e-06, "loss": 1.5479, "mean_token_accuracy": 0.6900360509753227, "num_tokens": 291637064.0, "step": 1544 }, { "epoch": 2.125301204819277, "grad_norm": 0.697697639465332, "learning_rate": 8.278015899795689e-06, "loss": 1.2689, "mean_token_accuracy": 0.7404518947005272, "num_tokens": 291804306.0, "step": 1545 }, { "epoch": 2.1266781411359723, "grad_norm": 0.7081233859062195, "learning_rate": 8.253924314199733e-06, "loss": 1.0383, "mean_token_accuracy": 0.7817588448524475, "num_tokens": 291956087.0, "step": 1546 }, { "epoch": 2.1280550774526676, "grad_norm": 0.712034285068512, "learning_rate": 8.229858718505212e-06, "loss": 0.8702, "mean_token_accuracy": 0.815075159072876, "num_tokens": 292095303.0, "step": 1547 }, { "epoch": 2.129432013769363, "grad_norm": 0.24870504438877106, "learning_rate": 8.20581916596076e-06, "loss": 1.27, "mean_token_accuracy": 0.7659237831830978, "num_tokens": 292306250.0, "step": 1548 }, { "epoch": 2.1308089500860588, "grad_norm": 0.5867757201194763, "learning_rate": 8.181805709757383e-06, "loss": 1.7294, "mean_token_accuracy": 0.6576812714338303, "num_tokens": 292575992.0, "step": 1549 }, { "epoch": 2.1321858864027536, "grad_norm": 0.6643468141555786, "learning_rate": 8.157818403028343e-06, "loss": 1.6429, "mean_token_accuracy": 0.6765299886465073, "num_tokens": 292775940.0, "step": 1550 }, { "epoch": 2.1335628227194494, "grad_norm": 0.6787965893745422, "learning_rate": 8.133857298849052e-06, "loss": 1.3477, "mean_token_accuracy": 0.7270012050867081, "num_tokens": 292947922.0, "step": 1551 }, { "epoch": 2.1349397590361447, "grad_norm": 0.712180495262146, "learning_rate": 8.109922450236938e-06, "loss": 1.1047, "mean_token_accuracy": 0.7707819789648056, "num_tokens": 293103170.0, "step": 1552 }, { "epoch": 2.13631669535284, "grad_norm": 0.7130451202392578, "learning_rate": 8.086013910151334e-06, "loss": 0.9557, "mean_token_accuracy": 0.7989757359027863, "num_tokens": 293245908.0, "step": 1553 }, { "epoch": 2.1376936316695354, "grad_norm": 0.8879528641700745, "learning_rate": 8.062131731493364e-06, "loss": 0.9881, "mean_token_accuracy": 0.7927462756633759, "num_tokens": 293370454.0, "step": 1554 }, { "epoch": 2.1390705679862307, "grad_norm": 0.5198155641555786, "learning_rate": 8.038275967105824e-06, "loss": 1.5718, "mean_token_accuracy": 0.6749449297785759, "num_tokens": 293695744.0, "step": 1555 }, { "epoch": 2.140447504302926, "grad_norm": 0.6458196640014648, "learning_rate": 8.014446669773061e-06, "loss": 1.7012, "mean_token_accuracy": 0.6668796017765999, "num_tokens": 293905501.0, "step": 1556 }, { "epoch": 2.1418244406196214, "grad_norm": 0.6960837841033936, "learning_rate": 7.990643892220866e-06, "loss": 1.4565, "mean_token_accuracy": 0.7068379148840904, "num_tokens": 294082799.0, "step": 1557 }, { "epoch": 2.1432013769363167, "grad_norm": 0.6973714232444763, "learning_rate": 7.966867687116354e-06, "loss": 1.1428, "mean_token_accuracy": 0.763836681842804, "num_tokens": 294241433.0, "step": 1558 }, { "epoch": 2.144578313253012, "grad_norm": 0.7130071520805359, "learning_rate": 7.943118107067813e-06, "loss": 0.9843, "mean_token_accuracy": 0.7949051260948181, "num_tokens": 294386973.0, "step": 1559 }, { "epoch": 2.1459552495697074, "grad_norm": 0.7809724807739258, "learning_rate": 7.91939520462467e-06, "loss": 0.9224, "mean_token_accuracy": 0.8070959150791168, "num_tokens": 294517899.0, "step": 1560 }, { "epoch": 2.1473321858864027, "grad_norm": 0.46587756276130676, "learning_rate": 7.895699032277287e-06, "loss": 1.4921, "mean_token_accuracy": 0.696626603603363, "num_tokens": 294815042.0, "step": 1561 }, { "epoch": 2.148709122203098, "grad_norm": 0.6321620941162109, "learning_rate": 7.872029642456895e-06, "loss": 1.7321, "mean_token_accuracy": 0.6614574119448662, "num_tokens": 295041626.0, "step": 1562 }, { "epoch": 2.1500860585197934, "grad_norm": 0.6820998787879944, "learning_rate": 7.848387087535465e-06, "loss": 1.5094, "mean_token_accuracy": 0.6996432542800903, "num_tokens": 295226518.0, "step": 1563 }, { "epoch": 2.1514629948364887, "grad_norm": 0.7156899571418762, "learning_rate": 7.824771419825588e-06, "loss": 1.2301, "mean_token_accuracy": 0.7483606860041618, "num_tokens": 295389628.0, "step": 1564 }, { "epoch": 2.152839931153184, "grad_norm": 0.712492048740387, "learning_rate": 7.801182691580362e-06, "loss": 1.0024, "mean_token_accuracy": 0.7895011678338051, "num_tokens": 295538931.0, "step": 1565 }, { "epoch": 2.1542168674698794, "grad_norm": 0.7548331618309021, "learning_rate": 7.77762095499329e-06, "loss": 0.8918, "mean_token_accuracy": 0.8110873252153397, "num_tokens": 295674863.0, "step": 1566 }, { "epoch": 2.1555938037865747, "grad_norm": 0.17988964915275574, "learning_rate": 7.754086262198128e-06, "loss": 1.4344, "mean_token_accuracy": 0.7285774201154709, "num_tokens": 295933541.0, "step": 1567 }, { "epoch": 2.15697074010327, "grad_norm": 0.6155644059181213, "learning_rate": 7.730578665268815e-06, "loss": 1.7811, "mean_token_accuracy": 0.6536228805780411, "num_tokens": 296175495.0, "step": 1568 }, { "epoch": 2.1583476764199654, "grad_norm": 0.6692784428596497, "learning_rate": 7.707098216219325e-06, "loss": 1.5173, "mean_token_accuracy": 0.6955201327800751, "num_tokens": 296366474.0, "step": 1569 }, { "epoch": 2.159724612736661, "grad_norm": 0.6982670426368713, "learning_rate": 7.683644967003574e-06, "loss": 1.3025, "mean_token_accuracy": 0.7348902076482773, "num_tokens": 296533865.0, "step": 1570 }, { "epoch": 2.161101549053356, "grad_norm": 0.7116630673408508, "learning_rate": 7.66021896951529e-06, "loss": 1.0651, "mean_token_accuracy": 0.7770791202783585, "num_tokens": 296686339.0, "step": 1571 }, { "epoch": 2.162478485370052, "grad_norm": 0.7188316583633423, "learning_rate": 7.636820275587894e-06, "loss": 0.914, "mean_token_accuracy": 0.8098565638065338, "num_tokens": 296826083.0, "step": 1572 }, { "epoch": 2.163855421686747, "grad_norm": 0.24134404957294464, "learning_rate": 7.613448936994405e-06, "loss": 1.2853, "mean_token_accuracy": 0.7702748477458954, "num_tokens": 297038209.0, "step": 1573 }, { "epoch": 2.1652323580034425, "grad_norm": 0.5813956260681152, "learning_rate": 7.590105005447317e-06, "loss": 1.7556, "mean_token_accuracy": 0.6542028039693832, "num_tokens": 297307621.0, "step": 1574 }, { "epoch": 2.166609294320138, "grad_norm": 0.6606854200363159, "learning_rate": 7.566788532598457e-06, "loss": 1.6128, "mean_token_accuracy": 0.6799138709902763, "num_tokens": 297508314.0, "step": 1575 }, { "epoch": 2.167986230636833, "grad_norm": 0.690636396408081, "learning_rate": 7.5434995700389235e-06, "loss": 1.339, "mean_token_accuracy": 0.7294749841094017, "num_tokens": 297681060.0, "step": 1576 }, { "epoch": 2.1693631669535285, "grad_norm": 0.7087718844413757, "learning_rate": 7.520238169298937e-06, "loss": 1.1169, "mean_token_accuracy": 0.7667310237884521, "num_tokens": 297837132.0, "step": 1577 }, { "epoch": 2.170740103270224, "grad_norm": 0.7029904723167419, "learning_rate": 7.497004381847726e-06, "loss": 0.9239, "mean_token_accuracy": 0.8045652583241463, "num_tokens": 297980407.0, "step": 1578 }, { "epoch": 2.172117039586919, "grad_norm": 0.8994806408882141, "learning_rate": 7.473798259093421e-06, "loss": 0.9978, "mean_token_accuracy": 0.7891354858875275, "num_tokens": 298105396.0, "step": 1579 }, { "epoch": 2.1734939759036145, "grad_norm": 0.5239232182502747, "learning_rate": 7.450619852382959e-06, "loss": 1.5751, "mean_token_accuracy": 0.6735339239239693, "num_tokens": 298433633.0, "step": 1580 }, { "epoch": 2.17487091222031, "grad_norm": 0.6510961651802063, "learning_rate": 7.42746921300193e-06, "loss": 1.692, "mean_token_accuracy": 0.667929545044899, "num_tokens": 298644134.0, "step": 1581 }, { "epoch": 2.176247848537005, "grad_norm": 0.6874836087226868, "learning_rate": 7.404346392174497e-06, "loss": 1.3826, "mean_token_accuracy": 0.7209445089101791, "num_tokens": 298821491.0, "step": 1582 }, { "epoch": 2.1776247848537005, "grad_norm": 0.7153891921043396, "learning_rate": 7.381251441063255e-06, "loss": 1.1502, "mean_token_accuracy": 0.7636496275663376, "num_tokens": 298980256.0, "step": 1583 }, { "epoch": 2.179001721170396, "grad_norm": 0.7004349231719971, "learning_rate": 7.358184410769149e-06, "loss": 0.9648, "mean_token_accuracy": 0.7989815399050713, "num_tokens": 299125887.0, "step": 1584 }, { "epoch": 2.180378657487091, "grad_norm": 0.7863442301750183, "learning_rate": 7.335145352331339e-06, "loss": 0.9107, "mean_token_accuracy": 0.8112569451332092, "num_tokens": 299256682.0, "step": 1585 }, { "epoch": 2.1817555938037865, "grad_norm": 0.4772692024707794, "learning_rate": 7.312134316727093e-06, "loss": 1.5421, "mean_token_accuracy": 0.7001556605100632, "num_tokens": 299553587.0, "step": 1586 }, { "epoch": 2.183132530120482, "grad_norm": 0.6340495944023132, "learning_rate": 7.289151354871677e-06, "loss": 1.7411, "mean_token_accuracy": 0.662436731159687, "num_tokens": 299778444.0, "step": 1587 }, { "epoch": 2.184509466437177, "grad_norm": 0.6865718960762024, "learning_rate": 7.266196517618238e-06, "loss": 1.487, "mean_token_accuracy": 0.7013279423117638, "num_tokens": 299962302.0, "step": 1588 }, { "epoch": 2.1858864027538725, "grad_norm": 0.692055881023407, "learning_rate": 7.243269855757693e-06, "loss": 1.2017, "mean_token_accuracy": 0.7542572170495987, "num_tokens": 300125451.0, "step": 1589 }, { "epoch": 2.187263339070568, "grad_norm": 0.7207212448120117, "learning_rate": 7.220371420018608e-06, "loss": 1.0176, "mean_token_accuracy": 0.7867571488022804, "num_tokens": 300274877.0, "step": 1590 }, { "epoch": 2.188640275387263, "grad_norm": 0.7567468881607056, "learning_rate": 7.197501261067128e-06, "loss": 0.9006, "mean_token_accuracy": 0.8134497627615929, "num_tokens": 300411062.0, "step": 1591 }, { "epoch": 2.1900172117039585, "grad_norm": 0.1792241781949997, "learning_rate": 7.1746594295067826e-06, "loss": 1.4519, "mean_token_accuracy": 0.7345615550875664, "num_tokens": 300666202.0, "step": 1592 }, { "epoch": 2.1913941480206542, "grad_norm": 0.6075887084007263, "learning_rate": 7.151845975878457e-06, "loss": 1.7505, "mean_token_accuracy": 0.6574583798646927, "num_tokens": 300906990.0, "step": 1593 }, { "epoch": 2.1927710843373496, "grad_norm": 0.6837210059165955, "learning_rate": 7.129060950660236e-06, "loss": 1.5422, "mean_token_accuracy": 0.6896303445100784, "num_tokens": 301097524.0, "step": 1594 }, { "epoch": 2.194148020654045, "grad_norm": 0.6981785893440247, "learning_rate": 7.106304404267304e-06, "loss": 1.2541, "mean_token_accuracy": 0.7397252470254898, "num_tokens": 301264357.0, "step": 1595 }, { "epoch": 2.1955249569707402, "grad_norm": 0.7120211720466614, "learning_rate": 7.083576387051827e-06, "loss": 1.0608, "mean_token_accuracy": 0.7794656157493591, "num_tokens": 301416575.0, "step": 1596 }, { "epoch": 2.1969018932874356, "grad_norm": 0.7311321496963501, "learning_rate": 7.060876949302855e-06, "loss": 0.9019, "mean_token_accuracy": 0.8103385642170906, "num_tokens": 301556132.0, "step": 1597 }, { "epoch": 2.198278829604131, "grad_norm": 0.25091299414634705, "learning_rate": 7.0382061412461935e-06, "loss": 1.2838, "mean_token_accuracy": 0.7668245881795883, "num_tokens": 301766523.0, "step": 1598 }, { "epoch": 2.1996557659208262, "grad_norm": 0.5793142914772034, "learning_rate": 7.015564013044302e-06, "loss": 1.7099, "mean_token_accuracy": 0.6632814928889275, "num_tokens": 302034615.0, "step": 1599 }, { "epoch": 2.2010327022375216, "grad_norm": 0.6738784313201904, "learning_rate": 6.99295061479619e-06, "loss": 1.6236, "mean_token_accuracy": 0.6769010424613953, "num_tokens": 302233970.0, "step": 1600 }, { "epoch": 2.202409638554217, "grad_norm": 0.7071889042854309, "learning_rate": 6.970365996537285e-06, "loss": 1.3868, "mean_token_accuracy": 0.7178677842020988, "num_tokens": 302406057.0, "step": 1601 }, { "epoch": 2.2037865748709122, "grad_norm": 0.6929791569709778, "learning_rate": 6.947810208239343e-06, "loss": 1.065, "mean_token_accuracy": 0.7779111638665199, "num_tokens": 302561530.0, "step": 1602 }, { "epoch": 2.2051635111876076, "grad_norm": 0.7156249284744263, "learning_rate": 6.925283299810328e-06, "loss": 0.9341, "mean_token_accuracy": 0.804315410554409, "num_tokens": 302704413.0, "step": 1603 }, { "epoch": 2.206540447504303, "grad_norm": 0.901728093624115, "learning_rate": 6.902785321094301e-06, "loss": 0.9888, "mean_token_accuracy": 0.7927461490035057, "num_tokens": 302828901.0, "step": 1604 }, { "epoch": 2.2079173838209982, "grad_norm": 0.5278829336166382, "learning_rate": 6.880316321871312e-06, "loss": 1.591, "mean_token_accuracy": 0.6715151369571686, "num_tokens": 303157002.0, "step": 1605 }, { "epoch": 2.2092943201376936, "grad_norm": 0.6510056853294373, "learning_rate": 6.857876351857296e-06, "loss": 1.6688, "mean_token_accuracy": 0.6700789406895638, "num_tokens": 303366795.0, "step": 1606 }, { "epoch": 2.210671256454389, "grad_norm": 0.6874690651893616, "learning_rate": 6.8354654607039535e-06, "loss": 1.4152, "mean_token_accuracy": 0.71517363935709, "num_tokens": 303544580.0, "step": 1607 }, { "epoch": 2.212048192771084, "grad_norm": 0.701011061668396, "learning_rate": 6.8130836979986236e-06, "loss": 1.1379, "mean_token_accuracy": 0.7648810371756554, "num_tokens": 303703386.0, "step": 1608 }, { "epoch": 2.2134251290877796, "grad_norm": 0.7050424814224243, "learning_rate": 6.7907311132642325e-06, "loss": 0.948, "mean_token_accuracy": 0.8001508116722107, "num_tokens": 303849137.0, "step": 1609 }, { "epoch": 2.214802065404475, "grad_norm": 0.7902639508247375, "learning_rate": 6.768407755959119e-06, "loss": 0.8905, "mean_token_accuracy": 0.8149047866463661, "num_tokens": 303980183.0, "step": 1610 }, { "epoch": 2.21617900172117, "grad_norm": 0.47658437490463257, "learning_rate": 6.746113675476959e-06, "loss": 1.5629, "mean_token_accuracy": 0.6867681816220284, "num_tokens": 304272108.0, "step": 1611 }, { "epoch": 2.2175559380378655, "grad_norm": 0.64127117395401, "learning_rate": 6.723848921146649e-06, "loss": 1.7326, "mean_token_accuracy": 0.657711997628212, "num_tokens": 304496063.0, "step": 1612 }, { "epoch": 2.218932874354561, "grad_norm": 0.6873314380645752, "learning_rate": 6.701613542232202e-06, "loss": 1.4474, "mean_token_accuracy": 0.7086688652634621, "num_tokens": 304679915.0, "step": 1613 }, { "epoch": 2.2203098106712567, "grad_norm": 0.7076175808906555, "learning_rate": 6.67940758793262e-06, "loss": 1.2146, "mean_token_accuracy": 0.7505630776286125, "num_tokens": 304842604.0, "step": 1614 }, { "epoch": 2.221686746987952, "grad_norm": 0.6955431699752808, "learning_rate": 6.657231107381821e-06, "loss": 0.9871, "mean_token_accuracy": 0.7939598187804222, "num_tokens": 304991759.0, "step": 1615 }, { "epoch": 2.2230636833046473, "grad_norm": 0.7522688508033752, "learning_rate": 6.635084149648481e-06, "loss": 0.8753, "mean_token_accuracy": 0.8161954879760742, "num_tokens": 305127662.0, "step": 1616 }, { "epoch": 2.2244406196213427, "grad_norm": 0.17932190001010895, "learning_rate": 6.612966763735971e-06, "loss": 1.4255, "mean_token_accuracy": 0.7366128712892532, "num_tokens": 305383459.0, "step": 1617 }, { "epoch": 2.225817555938038, "grad_norm": 0.6070579886436462, "learning_rate": 6.5908789985822175e-06, "loss": 1.7663, "mean_token_accuracy": 0.6541169732809067, "num_tokens": 305625653.0, "step": 1618 }, { "epoch": 2.2271944922547333, "grad_norm": 0.6756413578987122, "learning_rate": 6.568820903059632e-06, "loss": 1.54, "mean_token_accuracy": 0.6938938796520233, "num_tokens": 305816889.0, "step": 1619 }, { "epoch": 2.2285714285714286, "grad_norm": 0.7063832879066467, "learning_rate": 6.54679252597495e-06, "loss": 1.255, "mean_token_accuracy": 0.7439309135079384, "num_tokens": 305983516.0, "step": 1620 }, { "epoch": 2.229948364888124, "grad_norm": 0.6961568593978882, "learning_rate": 6.524793916069161e-06, "loss": 1.0623, "mean_token_accuracy": 0.7785451337695122, "num_tokens": 306135106.0, "step": 1621 }, { "epoch": 2.2313253012048193, "grad_norm": 0.7295913100242615, "learning_rate": 6.502825122017391e-06, "loss": 0.8982, "mean_token_accuracy": 0.8109885305166245, "num_tokens": 306274310.0, "step": 1622 }, { "epoch": 2.2327022375215146, "grad_norm": 0.24796590209007263, "learning_rate": 6.480886192428794e-06, "loss": 1.2328, "mean_token_accuracy": 0.775164470076561, "num_tokens": 306484024.0, "step": 1623 }, { "epoch": 2.23407917383821, "grad_norm": 0.5831223726272583, "learning_rate": 6.45897717584645e-06, "loss": 1.7293, "mean_token_accuracy": 0.6582137271761894, "num_tokens": 306753459.0, "step": 1624 }, { "epoch": 2.2354561101549053, "grad_norm": 0.6578834056854248, "learning_rate": 6.437098120747229e-06, "loss": 1.6075, "mean_token_accuracy": 0.6802227422595024, "num_tokens": 306954762.0, "step": 1625 }, { "epoch": 2.2368330464716006, "grad_norm": 0.7057643532752991, "learning_rate": 6.415249075541736e-06, "loss": 1.3515, "mean_token_accuracy": 0.7255446463823318, "num_tokens": 307127119.0, "step": 1626 }, { "epoch": 2.238209982788296, "grad_norm": 0.7012023329734802, "learning_rate": 6.39343008857416e-06, "loss": 1.0939, "mean_token_accuracy": 0.7705222442746162, "num_tokens": 307282782.0, "step": 1627 }, { "epoch": 2.2395869191049913, "grad_norm": 0.7288472056388855, "learning_rate": 6.3716412081221766e-06, "loss": 0.9414, "mean_token_accuracy": 0.8035621866583824, "num_tokens": 307425945.0, "step": 1628 }, { "epoch": 2.2409638554216866, "grad_norm": 0.916356086730957, "learning_rate": 6.349882482396868e-06, "loss": 0.9746, "mean_token_accuracy": 0.7955620959401131, "num_tokens": 307550480.0, "step": 1629 }, { "epoch": 2.242340791738382, "grad_norm": 0.5225470066070557, "learning_rate": 6.328153959542573e-06, "loss": 1.5801, "mean_token_accuracy": 0.6743348762392998, "num_tokens": 307881649.0, "step": 1630 }, { "epoch": 2.2437177280550773, "grad_norm": 0.6504858732223511, "learning_rate": 6.30645568763681e-06, "loss": 1.6806, "mean_token_accuracy": 0.6684362292289734, "num_tokens": 308093362.0, "step": 1631 }, { "epoch": 2.2450946643717726, "grad_norm": 0.6868367195129395, "learning_rate": 6.2847877146901706e-06, "loss": 1.4469, "mean_token_accuracy": 0.7122135236859322, "num_tokens": 308271720.0, "step": 1632 }, { "epoch": 2.246471600688468, "grad_norm": 0.6967132687568665, "learning_rate": 6.2631500886461835e-06, "loss": 1.138, "mean_token_accuracy": 0.7637504115700722, "num_tokens": 308431121.0, "step": 1633 }, { "epoch": 2.2478485370051633, "grad_norm": 0.710109293460846, "learning_rate": 6.241542857381251e-06, "loss": 0.9567, "mean_token_accuracy": 0.7982008531689644, "num_tokens": 308577244.0, "step": 1634 }, { "epoch": 2.249225473321859, "grad_norm": 0.798099160194397, "learning_rate": 6.219966068704517e-06, "loss": 0.8888, "mean_token_accuracy": 0.8131435662508011, "num_tokens": 308708158.0, "step": 1635 }, { "epoch": 2.2506024096385544, "grad_norm": 0.4762585163116455, "learning_rate": 6.198419770357764e-06, "loss": 1.5139, "mean_token_accuracy": 0.7006706520915031, "num_tokens": 308999458.0, "step": 1636 }, { "epoch": 2.2519793459552497, "grad_norm": 0.6419603228569031, "learning_rate": 6.176904010015312e-06, "loss": 1.7653, "mean_token_accuracy": 0.6574624553322792, "num_tokens": 309222184.0, "step": 1637 }, { "epoch": 2.253356282271945, "grad_norm": 0.6926255822181702, "learning_rate": 6.155418835283906e-06, "loss": 1.4686, "mean_token_accuracy": 0.7040091082453728, "num_tokens": 309405453.0, "step": 1638 }, { "epoch": 2.2547332185886404, "grad_norm": 0.719021737575531, "learning_rate": 6.133964293702634e-06, "loss": 1.1945, "mean_token_accuracy": 0.753422811627388, "num_tokens": 309568089.0, "step": 1639 }, { "epoch": 2.2561101549053357, "grad_norm": 0.7100511193275452, "learning_rate": 6.112540432742791e-06, "loss": 1.005, "mean_token_accuracy": 0.7900200113654137, "num_tokens": 309717242.0, "step": 1640 }, { "epoch": 2.257487091222031, "grad_norm": 0.7481628656387329, "learning_rate": 6.091147299807769e-06, "loss": 0.8821, "mean_token_accuracy": 0.8130477294325829, "num_tokens": 309853153.0, "step": 1641 }, { "epoch": 2.2588640275387264, "grad_norm": 0.1823204606771469, "learning_rate": 6.0697849422330015e-06, "loss": 1.4532, "mean_token_accuracy": 0.7288776859641075, "num_tokens": 310108836.0, "step": 1642 }, { "epoch": 2.2602409638554217, "grad_norm": 0.6173501014709473, "learning_rate": 6.048453407285806e-06, "loss": 1.7375, "mean_token_accuracy": 0.6603184640407562, "num_tokens": 310349936.0, "step": 1643 }, { "epoch": 2.261617900172117, "grad_norm": 0.6767826080322266, "learning_rate": 6.02715274216531e-06, "loss": 1.5393, "mean_token_accuracy": 0.6918373480439186, "num_tokens": 310541341.0, "step": 1644 }, { "epoch": 2.2629948364888124, "grad_norm": 0.7111715078353882, "learning_rate": 6.005882994002335e-06, "loss": 1.2487, "mean_token_accuracy": 0.7449442744255066, "num_tokens": 310708236.0, "step": 1645 }, { "epoch": 2.2643717728055077, "grad_norm": 0.6988787651062012, "learning_rate": 5.9846442098592895e-06, "loss": 1.0495, "mean_token_accuracy": 0.7800050377845764, "num_tokens": 310860448.0, "step": 1646 }, { "epoch": 2.265748709122203, "grad_norm": 0.7303226590156555, "learning_rate": 5.963436436730079e-06, "loss": 0.9036, "mean_token_accuracy": 0.8112505152821541, "num_tokens": 311000155.0, "step": 1647 }, { "epoch": 2.2671256454388984, "grad_norm": 0.24853697419166565, "learning_rate": 5.942259721539985e-06, "loss": 1.2276, "mean_token_accuracy": 0.7762962207198143, "num_tokens": 311210858.0, "step": 1648 }, { "epoch": 2.2685025817555937, "grad_norm": 0.5804238319396973, "learning_rate": 5.921114111145567e-06, "loss": 1.7658, "mean_token_accuracy": 0.653652548789978, "num_tokens": 311481524.0, "step": 1649 }, { "epoch": 2.269879518072289, "grad_norm": 0.6610504388809204, "learning_rate": 5.8999996523345715e-06, "loss": 1.6086, "mean_token_accuracy": 0.679690770804882, "num_tokens": 311681329.0, "step": 1650 }, { "epoch": 2.2712564543889844, "grad_norm": 0.7054418921470642, "learning_rate": 5.878916391825804e-06, "loss": 1.3611, "mean_token_accuracy": 0.7254486232995987, "num_tokens": 311853503.0, "step": 1651 }, { "epoch": 2.2726333907056797, "grad_norm": 0.6988577246665955, "learning_rate": 5.857864376269051e-06, "loss": 1.0928, "mean_token_accuracy": 0.7736073061823845, "num_tokens": 312009063.0, "step": 1652 }, { "epoch": 2.274010327022375, "grad_norm": 0.7207877039909363, "learning_rate": 5.836843652244957e-06, "loss": 0.9313, "mean_token_accuracy": 0.8009350374341011, "num_tokens": 312152246.0, "step": 1653 }, { "epoch": 2.2753872633390704, "grad_norm": 0.8804559707641602, "learning_rate": 5.815854266264933e-06, "loss": 0.9762, "mean_token_accuracy": 0.7935193255543709, "num_tokens": 312277198.0, "step": 1654 }, { "epoch": 2.2767641996557657, "grad_norm": 0.5207816362380981, "learning_rate": 5.794896264771051e-06, "loss": 1.5831, "mean_token_accuracy": 0.6711349934339523, "num_tokens": 312606189.0, "step": 1655 }, { "epoch": 2.2781411359724615, "grad_norm": 0.6564119458198547, "learning_rate": 5.773969694135937e-06, "loss": 1.6878, "mean_token_accuracy": 0.6674639508128166, "num_tokens": 312817475.0, "step": 1656 }, { "epoch": 2.279518072289157, "grad_norm": 0.6972959041595459, "learning_rate": 5.753074600662671e-06, "loss": 1.3991, "mean_token_accuracy": 0.7170222699642181, "num_tokens": 312995383.0, "step": 1657 }, { "epoch": 2.280895008605852, "grad_norm": 0.7062073349952698, "learning_rate": 5.732211030584691e-06, "loss": 1.1523, "mean_token_accuracy": 0.7632657214999199, "num_tokens": 313154407.0, "step": 1658 }, { "epoch": 2.2822719449225475, "grad_norm": 0.709684431552887, "learning_rate": 5.711379030065678e-06, "loss": 0.982, "mean_token_accuracy": 0.7944241687655449, "num_tokens": 313300300.0, "step": 1659 }, { "epoch": 2.283648881239243, "grad_norm": 0.781527042388916, "learning_rate": 5.690578645199469e-06, "loss": 0.903, "mean_token_accuracy": 0.8121207654476166, "num_tokens": 313431262.0, "step": 1660 }, { "epoch": 2.285025817555938, "grad_norm": 0.4764096140861511, "learning_rate": 5.669809922009937e-06, "loss": 1.5266, "mean_token_accuracy": 0.6996960043907166, "num_tokens": 313725044.0, "step": 1661 }, { "epoch": 2.2864027538726335, "grad_norm": 0.6308221817016602, "learning_rate": 5.649072906450906e-06, "loss": 1.7157, "mean_token_accuracy": 0.6632435545325279, "num_tokens": 313947524.0, "step": 1662 }, { "epoch": 2.287779690189329, "grad_norm": 0.6873571276664734, "learning_rate": 5.628367644406039e-06, "loss": 1.4692, "mean_token_accuracy": 0.7064629420638084, "num_tokens": 314130390.0, "step": 1663 }, { "epoch": 2.289156626506024, "grad_norm": 0.6987667083740234, "learning_rate": 5.607694181688743e-06, "loss": 1.1673, "mean_token_accuracy": 0.7564731538295746, "num_tokens": 314292644.0, "step": 1664 }, { "epoch": 2.2905335628227195, "grad_norm": 0.6968876123428345, "learning_rate": 5.587052564042066e-06, "loss": 0.9749, "mean_token_accuracy": 0.7957777976989746, "num_tokens": 314441459.0, "step": 1665 }, { "epoch": 2.291910499139415, "grad_norm": 0.7558847665786743, "learning_rate": 5.566442837138577e-06, "loss": 0.9032, "mean_token_accuracy": 0.8114924803376198, "num_tokens": 314577389.0, "step": 1666 }, { "epoch": 2.29328743545611, "grad_norm": 0.17702549695968628, "learning_rate": 5.545865046580299e-06, "loss": 1.4598, "mean_token_accuracy": 0.7284581288695335, "num_tokens": 314838312.0, "step": 1667 }, { "epoch": 2.2946643717728055, "grad_norm": 0.6087902784347534, "learning_rate": 5.5253192378985966e-06, "loss": 1.7662, "mean_token_accuracy": 0.6554195731878281, "num_tokens": 315082090.0, "step": 1668 }, { "epoch": 2.296041308089501, "grad_norm": 0.6738694906234741, "learning_rate": 5.504805456554057e-06, "loss": 1.5305, "mean_token_accuracy": 0.6954032406210899, "num_tokens": 315273586.0, "step": 1669 }, { "epoch": 2.297418244406196, "grad_norm": 0.7090211510658264, "learning_rate": 5.484323747936404e-06, "loss": 1.2976, "mean_token_accuracy": 0.73500906676054, "num_tokens": 315440984.0, "step": 1670 }, { "epoch": 2.2987951807228915, "grad_norm": 0.7054688930511475, "learning_rate": 5.463874157364399e-06, "loss": 1.0253, "mean_token_accuracy": 0.7830094024538994, "num_tokens": 315593262.0, "step": 1671 }, { "epoch": 2.300172117039587, "grad_norm": 0.737417995929718, "learning_rate": 5.443456730085737e-06, "loss": 0.905, "mean_token_accuracy": 0.8110709711909294, "num_tokens": 315732753.0, "step": 1672 }, { "epoch": 2.301549053356282, "grad_norm": 0.2584570050239563, "learning_rate": 5.423071511276951e-06, "loss": 1.2363, "mean_token_accuracy": 0.7749732956290245, "num_tokens": 315940561.0, "step": 1673 }, { "epoch": 2.3029259896729775, "grad_norm": 0.5865582823753357, "learning_rate": 5.402718546043293e-06, "loss": 1.7648, "mean_token_accuracy": 0.6573128551244736, "num_tokens": 316211252.0, "step": 1674 }, { "epoch": 2.304302925989673, "grad_norm": 0.6683372259140015, "learning_rate": 5.382397879418664e-06, "loss": 1.5946, "mean_token_accuracy": 0.6836843341588974, "num_tokens": 316411972.0, "step": 1675 }, { "epoch": 2.305679862306368, "grad_norm": 0.6920455694198608, "learning_rate": 5.362109556365496e-06, "loss": 1.322, "mean_token_accuracy": 0.7308057025074959, "num_tokens": 316583786.0, "step": 1676 }, { "epoch": 2.307056798623064, "grad_norm": 0.711327314376831, "learning_rate": 5.3418536217746504e-06, "loss": 1.0875, "mean_token_accuracy": 0.7729003727436066, "num_tokens": 316738772.0, "step": 1677 }, { "epoch": 2.3084337349397592, "grad_norm": 0.7288492918014526, "learning_rate": 5.321630120465342e-06, "loss": 0.9115, "mean_token_accuracy": 0.8068003281950951, "num_tokens": 316881271.0, "step": 1678 }, { "epoch": 2.3098106712564546, "grad_norm": 0.9139898419380188, "learning_rate": 5.3014390971850035e-06, "loss": 0.9795, "mean_token_accuracy": 0.7905402779579163, "num_tokens": 317005815.0, "step": 1679 }, { "epoch": 2.31118760757315, "grad_norm": 0.5320467948913574, "learning_rate": 5.281280596609211e-06, "loss": 1.5834, "mean_token_accuracy": 0.6723697036504745, "num_tokens": 317332088.0, "step": 1680 }, { "epoch": 2.3125645438898452, "grad_norm": 0.676445722579956, "learning_rate": 5.261154663341586e-06, "loss": 1.7467, "mean_token_accuracy": 0.6600613072514534, "num_tokens": 317542907.0, "step": 1681 }, { "epoch": 2.3139414802065406, "grad_norm": 0.6976999044418335, "learning_rate": 5.2410613419136715e-06, "loss": 1.4401, "mean_token_accuracy": 0.708362378180027, "num_tokens": 317721006.0, "step": 1682 }, { "epoch": 2.315318416523236, "grad_norm": 0.7008019685745239, "learning_rate": 5.221000676784873e-06, "loss": 1.1311, "mean_token_accuracy": 0.7613184601068497, "num_tokens": 317880617.0, "step": 1683 }, { "epoch": 2.316695352839931, "grad_norm": 0.7132427096366882, "learning_rate": 5.200972712342327e-06, "loss": 0.9605, "mean_token_accuracy": 0.7944475933909416, "num_tokens": 318027257.0, "step": 1684 }, { "epoch": 2.3180722891566266, "grad_norm": 0.7709197998046875, "learning_rate": 5.180977492900823e-06, "loss": 0.8818, "mean_token_accuracy": 0.8156022503972054, "num_tokens": 318158863.0, "step": 1685 }, { "epoch": 2.319449225473322, "grad_norm": 0.48575299978256226, "learning_rate": 5.16101506270269e-06, "loss": 1.5642, "mean_token_accuracy": 0.6869316324591637, "num_tokens": 318450469.0, "step": 1686 }, { "epoch": 2.320826161790017, "grad_norm": 0.6355774998664856, "learning_rate": 5.141085465917703e-06, "loss": 1.705, "mean_token_accuracy": 0.6640212759375572, "num_tokens": 318675454.0, "step": 1687 }, { "epoch": 2.3222030981067125, "grad_norm": 0.6925192475318909, "learning_rate": 5.121188746643009e-06, "loss": 1.4618, "mean_token_accuracy": 0.7074486017227173, "num_tokens": 318859383.0, "step": 1688 }, { "epoch": 2.323580034423408, "grad_norm": 0.7126806378364563, "learning_rate": 5.101324948902988e-06, "loss": 1.2199, "mean_token_accuracy": 0.747402161359787, "num_tokens": 319022326.0, "step": 1689 }, { "epoch": 2.324956970740103, "grad_norm": 0.7117187976837158, "learning_rate": 5.081494116649186e-06, "loss": 1.0029, "mean_token_accuracy": 0.7898950576782227, "num_tokens": 319171496.0, "step": 1690 }, { "epoch": 2.3263339070567985, "grad_norm": 0.7417982220649719, "learning_rate": 5.0616962937601945e-06, "loss": 0.8905, "mean_token_accuracy": 0.811457633972168, "num_tokens": 319307444.0, "step": 1691 }, { "epoch": 2.327710843373494, "grad_norm": 0.18283845484256744, "learning_rate": 5.041931524041584e-06, "loss": 1.3932, "mean_token_accuracy": 0.7354881390929222, "num_tokens": 319564671.0, "step": 1692 }, { "epoch": 2.329087779690189, "grad_norm": 0.5967641472816467, "learning_rate": 5.0221998512257795e-06, "loss": 1.7454, "mean_token_accuracy": 0.6580119282007217, "num_tokens": 319807089.0, "step": 1693 }, { "epoch": 2.3304647160068845, "grad_norm": 0.6695621609687805, "learning_rate": 5.002501318971984e-06, "loss": 1.5537, "mean_token_accuracy": 0.6916912794113159, "num_tokens": 319998117.0, "step": 1694 }, { "epoch": 2.33184165232358, "grad_norm": 0.7041584253311157, "learning_rate": 4.9828359708660605e-06, "loss": 1.2781, "mean_token_accuracy": 0.7387253046035767, "num_tokens": 320165396.0, "step": 1695 }, { "epoch": 2.333218588640275, "grad_norm": 0.7146373391151428, "learning_rate": 4.963203850420455e-06, "loss": 1.039, "mean_token_accuracy": 0.7829486504197121, "num_tokens": 320317828.0, "step": 1696 }, { "epoch": 2.3345955249569705, "grad_norm": 0.7420217990875244, "learning_rate": 4.9436050010740834e-06, "loss": 0.9023, "mean_token_accuracy": 0.8138061910867691, "num_tokens": 320457388.0, "step": 1697 }, { "epoch": 2.3359724612736663, "grad_norm": 0.2639135420322418, "learning_rate": 4.924039466192272e-06, "loss": 1.2362, "mean_token_accuracy": 0.7708728760480881, "num_tokens": 320663820.0, "step": 1698 }, { "epoch": 2.337349397590361, "grad_norm": 0.5798234343528748, "learning_rate": 4.904507289066594e-06, "loss": 1.7764, "mean_token_accuracy": 0.6533364504575729, "num_tokens": 320932084.0, "step": 1699 }, { "epoch": 2.338726333907057, "grad_norm": 0.6750074028968811, "learning_rate": 4.885008512914837e-06, "loss": 1.6207, "mean_token_accuracy": 0.6776204854249954, "num_tokens": 321132076.0, "step": 1700 }, { "epoch": 2.3401032702237523, "grad_norm": 0.7188519239425659, "learning_rate": 4.865543180880883e-06, "loss": 1.3716, "mean_token_accuracy": 0.7211605980992317, "num_tokens": 321304219.0, "step": 1701 }, { "epoch": 2.3414802065404476, "grad_norm": 0.7159742116928101, "learning_rate": 4.8461113360346095e-06, "loss": 1.0985, "mean_token_accuracy": 0.7713616043329239, "num_tokens": 321459634.0, "step": 1702 }, { "epoch": 2.342857142857143, "grad_norm": 0.7474786043167114, "learning_rate": 4.8267130213718005e-06, "loss": 0.9433, "mean_token_accuracy": 0.802973210811615, "num_tokens": 321602251.0, "step": 1703 }, { "epoch": 2.3442340791738383, "grad_norm": 0.8885399103164673, "learning_rate": 4.80734827981405e-06, "loss": 0.9881, "mean_token_accuracy": 0.7869381010532379, "num_tokens": 321726656.0, "step": 1704 }, { "epoch": 2.3456110154905336, "grad_norm": 0.5404081344604492, "learning_rate": 4.788017154208668e-06, "loss": 1.5863, "mean_token_accuracy": 0.6734351962804794, "num_tokens": 322054601.0, "step": 1705 }, { "epoch": 2.346987951807229, "grad_norm": 0.6510046124458313, "learning_rate": 4.76871968732858e-06, "loss": 1.665, "mean_token_accuracy": 0.670007161796093, "num_tokens": 322265007.0, "step": 1706 }, { "epoch": 2.3483648881239243, "grad_norm": 0.6978887915611267, "learning_rate": 4.7494559218722395e-06, "loss": 1.3861, "mean_token_accuracy": 0.7202155292034149, "num_tokens": 322442217.0, "step": 1707 }, { "epoch": 2.3497418244406196, "grad_norm": 0.7168549299240112, "learning_rate": 4.73022590046353e-06, "loss": 1.1463, "mean_token_accuracy": 0.76197799295187, "num_tokens": 322600489.0, "step": 1708 }, { "epoch": 2.351118760757315, "grad_norm": 0.7016539573669434, "learning_rate": 4.711029665651674e-06, "loss": 0.9486, "mean_token_accuracy": 0.8005194514989853, "num_tokens": 322745630.0, "step": 1709 }, { "epoch": 2.3524956970740103, "grad_norm": 0.7918953895568848, "learning_rate": 4.691867259911131e-06, "loss": 0.8918, "mean_token_accuracy": 0.8128003552556038, "num_tokens": 322876051.0, "step": 1710 }, { "epoch": 2.3538726333907056, "grad_norm": 0.47965678572654724, "learning_rate": 4.672738725641515e-06, "loss": 1.5253, "mean_token_accuracy": 0.6944354847073555, "num_tokens": 323170641.0, "step": 1711 }, { "epoch": 2.355249569707401, "grad_norm": 0.637756884098053, "learning_rate": 4.653644105167487e-06, "loss": 1.7244, "mean_token_accuracy": 0.6636629775166512, "num_tokens": 323394882.0, "step": 1712 }, { "epoch": 2.3566265060240963, "grad_norm": 0.6921768188476562, "learning_rate": 4.634583440738676e-06, "loss": 1.4473, "mean_token_accuracy": 0.7088181301951408, "num_tokens": 323578691.0, "step": 1713 }, { "epoch": 2.3580034423407916, "grad_norm": 0.7075146436691284, "learning_rate": 4.615556774529579e-06, "loss": 1.1883, "mean_token_accuracy": 0.7549368143081665, "num_tokens": 323741461.0, "step": 1714 }, { "epoch": 2.359380378657487, "grad_norm": 0.7125049233436584, "learning_rate": 4.596564148639448e-06, "loss": 0.9905, "mean_token_accuracy": 0.7934694066643715, "num_tokens": 323890294.0, "step": 1715 }, { "epoch": 2.3607573149741823, "grad_norm": 0.7201095819473267, "learning_rate": 4.577605605092248e-06, "loss": 0.8531, "mean_token_accuracy": 0.8183129951357841, "num_tokens": 324025824.0, "step": 1716 }, { "epoch": 2.3621342512908776, "grad_norm": 0.17793194949626923, "learning_rate": 4.5586811858365085e-06, "loss": 1.4008, "mean_token_accuracy": 0.7398326024413109, "num_tokens": 324281879.0, "step": 1717 }, { "epoch": 2.363511187607573, "grad_norm": 0.6003541946411133, "learning_rate": 4.53979093274526e-06, "loss": 1.7439, "mean_token_accuracy": 0.6577341184020042, "num_tokens": 324525652.0, "step": 1718 }, { "epoch": 2.3648881239242687, "grad_norm": 0.6812847852706909, "learning_rate": 4.520934887615937e-06, "loss": 1.5311, "mean_token_accuracy": 0.6972426474094391, "num_tokens": 324717601.0, "step": 1719 }, { "epoch": 2.3662650602409636, "grad_norm": 0.715694785118103, "learning_rate": 4.502113092170283e-06, "loss": 1.2718, "mean_token_accuracy": 0.7405311614274979, "num_tokens": 324884529.0, "step": 1720 }, { "epoch": 2.3676419965576594, "grad_norm": 0.7059047222137451, "learning_rate": 4.483325588054259e-06, "loss": 1.0432, "mean_token_accuracy": 0.7807513698935509, "num_tokens": 325036554.0, "step": 1721 }, { "epoch": 2.3690189328743547, "grad_norm": 0.7466967105865479, "learning_rate": 4.464572416837949e-06, "loss": 0.9041, "mean_token_accuracy": 0.8133066147565842, "num_tokens": 325176036.0, "step": 1722 }, { "epoch": 2.37039586919105, "grad_norm": 0.25727829337120056, "learning_rate": 4.445853620015479e-06, "loss": 1.182, "mean_token_accuracy": 0.7735105976462364, "num_tokens": 325384645.0, "step": 1723 }, { "epoch": 2.3717728055077454, "grad_norm": 0.5960420966148376, "learning_rate": 4.427169239004902e-06, "loss": 1.7304, "mean_token_accuracy": 0.6615153923630714, "num_tokens": 325652380.0, "step": 1724 }, { "epoch": 2.3731497418244407, "grad_norm": 0.68340665102005, "learning_rate": 4.408519315148132e-06, "loss": 1.6156, "mean_token_accuracy": 0.6808261647820473, "num_tokens": 325850660.0, "step": 1725 }, { "epoch": 2.374526678141136, "grad_norm": 0.7111878991127014, "learning_rate": 4.389903889710836e-06, "loss": 1.3544, "mean_token_accuracy": 0.7275245562195778, "num_tokens": 326021338.0, "step": 1726 }, { "epoch": 2.3759036144578314, "grad_norm": 0.7097486853599548, "learning_rate": 4.37132300388236e-06, "loss": 1.0778, "mean_token_accuracy": 0.7740315869450569, "num_tokens": 326175578.0, "step": 1727 }, { "epoch": 2.3772805507745267, "grad_norm": 0.7189635634422302, "learning_rate": 4.352776698775611e-06, "loss": 0.9335, "mean_token_accuracy": 0.8060808181762695, "num_tokens": 326317367.0, "step": 1728 }, { "epoch": 2.378657487091222, "grad_norm": 0.9099303483963013, "learning_rate": 4.334265015426993e-06, "loss": 1.0124, "mean_token_accuracy": 0.7884478867053986, "num_tokens": 326441168.0, "step": 1729 }, { "epoch": 2.3800344234079174, "grad_norm": 0.5250269174575806, "learning_rate": 4.315787994796292e-06, "loss": 1.5374, "mean_token_accuracy": 0.6810234263539314, "num_tokens": 326763374.0, "step": 1730 }, { "epoch": 2.3814113597246127, "grad_norm": 0.6562104821205139, "learning_rate": 4.297345677766613e-06, "loss": 1.6598, "mean_token_accuracy": 0.6703085079789162, "num_tokens": 326971168.0, "step": 1731 }, { "epoch": 2.382788296041308, "grad_norm": 0.7038097977638245, "learning_rate": 4.278938105144255e-06, "loss": 1.3785, "mean_token_accuracy": 0.7214647084474564, "num_tokens": 327147261.0, "step": 1732 }, { "epoch": 2.3841652323580034, "grad_norm": 0.7010499835014343, "learning_rate": 4.260565317658656e-06, "loss": 1.1162, "mean_token_accuracy": 0.7701524421572685, "num_tokens": 327305329.0, "step": 1733 }, { "epoch": 2.3855421686746987, "grad_norm": 0.7136934399604797, "learning_rate": 4.24222735596228e-06, "loss": 0.9748, "mean_token_accuracy": 0.7943526804447174, "num_tokens": 327450770.0, "step": 1734 }, { "epoch": 2.386919104991394, "grad_norm": 0.7958358526229858, "learning_rate": 4.223924260630536e-06, "loss": 0.8957, "mean_token_accuracy": 0.8141337633132935, "num_tokens": 327581720.0, "step": 1735 }, { "epoch": 2.3882960413080894, "grad_norm": 0.4784802198410034, "learning_rate": 4.205656072161681e-06, "loss": 1.5087, "mean_token_accuracy": 0.6994279995560646, "num_tokens": 327876910.0, "step": 1736 }, { "epoch": 2.3896729776247847, "grad_norm": 0.6351954936981201, "learning_rate": 4.187422830976751e-06, "loss": 1.7049, "mean_token_accuracy": 0.6644628793001175, "num_tokens": 328102962.0, "step": 1737 }, { "epoch": 2.39104991394148, "grad_norm": 0.7033203840255737, "learning_rate": 4.1692245774194375e-06, "loss": 1.4771, "mean_token_accuracy": 0.7026321589946747, "num_tokens": 328287314.0, "step": 1738 }, { "epoch": 2.3924268502581754, "grad_norm": 0.7017447352409363, "learning_rate": 4.151061351756032e-06, "loss": 1.1738, "mean_token_accuracy": 0.7574551627039909, "num_tokens": 328449830.0, "step": 1739 }, { "epoch": 2.393803786574871, "grad_norm": 0.7137829065322876, "learning_rate": 4.132933194175299e-06, "loss": 0.9895, "mean_token_accuracy": 0.7893725782632828, "num_tokens": 328598924.0, "step": 1740 }, { "epoch": 2.395180722891566, "grad_norm": 0.7523530721664429, "learning_rate": 4.114840144788437e-06, "loss": 0.8681, "mean_token_accuracy": 0.8178837597370148, "num_tokens": 328734770.0, "step": 1741 }, { "epoch": 2.396557659208262, "grad_norm": 0.18110291659832, "learning_rate": 4.096782243628943e-06, "loss": 1.3965, "mean_token_accuracy": 0.7355195060372353, "num_tokens": 328993859.0, "step": 1742 }, { "epoch": 2.397934595524957, "grad_norm": 0.6096231341362, "learning_rate": 4.078759530652554e-06, "loss": 1.7549, "mean_token_accuracy": 0.6550184786319733, "num_tokens": 329240756.0, "step": 1743 }, { "epoch": 2.3993115318416525, "grad_norm": 0.6809365153312683, "learning_rate": 4.060772045737144e-06, "loss": 1.5642, "mean_token_accuracy": 0.6903017535805702, "num_tokens": 329434137.0, "step": 1744 }, { "epoch": 2.400688468158348, "grad_norm": 0.7207406163215637, "learning_rate": 4.0428198286826425e-06, "loss": 1.3063, "mean_token_accuracy": 0.7340347468852997, "num_tokens": 329601564.0, "step": 1745 }, { "epoch": 2.402065404475043, "grad_norm": 0.7203844785690308, "learning_rate": 4.0249029192109335e-06, "loss": 1.0412, "mean_token_accuracy": 0.781635656952858, "num_tokens": 329753452.0, "step": 1746 }, { "epoch": 2.4034423407917385, "grad_norm": 0.7284123301506042, "learning_rate": 4.007021356965804e-06, "loss": 0.8774, "mean_token_accuracy": 0.8141548410058022, "num_tokens": 329892841.0, "step": 1747 }, { "epoch": 2.404819277108434, "grad_norm": 0.25973454117774963, "learning_rate": 3.989175181512794e-06, "loss": 1.3085, "mean_token_accuracy": 0.7669506371021271, "num_tokens": 330099020.0, "step": 1748 }, { "epoch": 2.406196213425129, "grad_norm": 0.5734166502952576, "learning_rate": 3.971364432339171e-06, "loss": 1.7113, "mean_token_accuracy": 0.6645490080118179, "num_tokens": 330368060.0, "step": 1749 }, { "epoch": 2.4075731497418245, "grad_norm": 0.6639246940612793, "learning_rate": 3.953589148853807e-06, "loss": 1.5651, "mean_token_accuracy": 0.6878276392817497, "num_tokens": 330568254.0, "step": 1750 }, { "epoch": 2.40895008605852, "grad_norm": 0.7019029855728149, "learning_rate": 3.935849370387104e-06, "loss": 1.3345, "mean_token_accuracy": 0.7322966232895851, "num_tokens": 330740083.0, "step": 1751 }, { "epoch": 2.410327022375215, "grad_norm": 0.7198159694671631, "learning_rate": 3.918145136190903e-06, "loss": 1.0913, "mean_token_accuracy": 0.7713266238570213, "num_tokens": 330895319.0, "step": 1752 }, { "epoch": 2.4117039586919105, "grad_norm": 0.7398266792297363, "learning_rate": 3.900476485438396e-06, "loss": 0.9176, "mean_token_accuracy": 0.8056728392839432, "num_tokens": 331037784.0, "step": 1753 }, { "epoch": 2.413080895008606, "grad_norm": 0.9339487552642822, "learning_rate": 3.882843457224046e-06, "loss": 0.9877, "mean_token_accuracy": 0.7942272126674652, "num_tokens": 331162080.0, "step": 1754 }, { "epoch": 2.414457831325301, "grad_norm": 0.5233159065246582, "learning_rate": 3.865246090563493e-06, "loss": 1.5884, "mean_token_accuracy": 0.672079399228096, "num_tokens": 331492703.0, "step": 1755 }, { "epoch": 2.4158347676419965, "grad_norm": 0.6508143544197083, "learning_rate": 3.8476844243934695e-06, "loss": 1.6524, "mean_token_accuracy": 0.6747035458683968, "num_tokens": 331703514.0, "step": 1756 }, { "epoch": 2.417211703958692, "grad_norm": 0.7054522037506104, "learning_rate": 3.830158497571717e-06, "loss": 1.4041, "mean_token_accuracy": 0.7156855389475822, "num_tokens": 331881447.0, "step": 1757 }, { "epoch": 2.418588640275387, "grad_norm": 0.7159806489944458, "learning_rate": 3.8126683488769e-06, "loss": 1.1583, "mean_token_accuracy": 0.7579380795359612, "num_tokens": 332040525.0, "step": 1758 }, { "epoch": 2.4199655765920824, "grad_norm": 0.7168437838554382, "learning_rate": 3.7952140170085174e-06, "loss": 0.9568, "mean_token_accuracy": 0.7968730926513672, "num_tokens": 332186536.0, "step": 1759 }, { "epoch": 2.4213425129087778, "grad_norm": 0.7758257389068604, "learning_rate": 3.777795540586817e-06, "loss": 0.8878, "mean_token_accuracy": 0.8153270781040192, "num_tokens": 332317736.0, "step": 1760 }, { "epoch": 2.4227194492254736, "grad_norm": 0.48313048481941223, "learning_rate": 3.7604129581527127e-06, "loss": 1.5217, "mean_token_accuracy": 0.6954300701618195, "num_tokens": 332614781.0, "step": 1761 }, { "epoch": 2.4240963855421684, "grad_norm": 0.6325194835662842, "learning_rate": 3.7430663081676977e-06, "loss": 1.714, "mean_token_accuracy": 0.6641905680298805, "num_tokens": 332840699.0, "step": 1762 }, { "epoch": 2.425473321858864, "grad_norm": 0.7000746130943298, "learning_rate": 3.7257556290137565e-06, "loss": 1.4793, "mean_token_accuracy": 0.7033317908644676, "num_tokens": 333024786.0, "step": 1763 }, { "epoch": 2.4268502581755595, "grad_norm": 0.7212076187133789, "learning_rate": 3.708480958993286e-06, "loss": 1.2285, "mean_token_accuracy": 0.7500164359807968, "num_tokens": 333187485.0, "step": 1764 }, { "epoch": 2.428227194492255, "grad_norm": 0.7268520593643188, "learning_rate": 3.6912423363290106e-06, "loss": 1.0003, "mean_token_accuracy": 0.791379302740097, "num_tokens": 333336470.0, "step": 1765 }, { "epoch": 2.42960413080895, "grad_norm": 0.7591962218284607, "learning_rate": 3.6740397991638864e-06, "loss": 0.8781, "mean_token_accuracy": 0.8176119327545166, "num_tokens": 333472073.0, "step": 1766 }, { "epoch": 2.4309810671256455, "grad_norm": 0.18491947650909424, "learning_rate": 3.6568733855610327e-06, "loss": 1.444, "mean_token_accuracy": 0.7290427833795547, "num_tokens": 333726965.0, "step": 1767 }, { "epoch": 2.432358003442341, "grad_norm": 0.609462559223175, "learning_rate": 3.639743133503635e-06, "loss": 1.8021, "mean_token_accuracy": 0.6489241868257523, "num_tokens": 333968450.0, "step": 1768 }, { "epoch": 2.433734939759036, "grad_norm": 0.6866064071655273, "learning_rate": 3.6226490808948713e-06, "loss": 1.5286, "mean_token_accuracy": 0.6960158422589302, "num_tokens": 334159795.0, "step": 1769 }, { "epoch": 2.4351118760757315, "grad_norm": 0.721662700176239, "learning_rate": 3.605591265557815e-06, "loss": 1.2908, "mean_token_accuracy": 0.7368486076593399, "num_tokens": 334327193.0, "step": 1770 }, { "epoch": 2.436488812392427, "grad_norm": 0.7069699168205261, "learning_rate": 3.5885697252353714e-06, "loss": 1.0269, "mean_token_accuracy": 0.7846559584140778, "num_tokens": 334479758.0, "step": 1771 }, { "epoch": 2.437865748709122, "grad_norm": 0.7240312695503235, "learning_rate": 3.5715844975901747e-06, "loss": 0.8709, "mean_token_accuracy": 0.8143385574221611, "num_tokens": 334619657.0, "step": 1772 }, { "epoch": 2.4392426850258175, "grad_norm": 0.25595933198928833, "learning_rate": 3.554635620204503e-06, "loss": 1.2657, "mean_token_accuracy": 0.769765816628933, "num_tokens": 334832568.0, "step": 1773 }, { "epoch": 2.440619621342513, "grad_norm": 0.5911667346954346, "learning_rate": 3.5377231305802153e-06, "loss": 1.7205, "mean_token_accuracy": 0.6605594828724861, "num_tokens": 335098539.0, "step": 1774 }, { "epoch": 2.441996557659208, "grad_norm": 0.6723808646202087, "learning_rate": 3.520847066138664e-06, "loss": 1.6012, "mean_token_accuracy": 0.6824354529380798, "num_tokens": 335297510.0, "step": 1775 }, { "epoch": 2.4433734939759035, "grad_norm": 0.7009894847869873, "learning_rate": 3.5040074642205934e-06, "loss": 1.3137, "mean_token_accuracy": 0.7311461716890335, "num_tokens": 335469014.0, "step": 1776 }, { "epoch": 2.444750430292599, "grad_norm": 0.719142496585846, "learning_rate": 3.4872043620860698e-06, "loss": 1.0699, "mean_token_accuracy": 0.7736940979957581, "num_tokens": 335624192.0, "step": 1777 }, { "epoch": 2.446127366609294, "grad_norm": 0.7118104100227356, "learning_rate": 3.4704377969144053e-06, "loss": 0.9068, "mean_token_accuracy": 0.8066819086670876, "num_tokens": 335766849.0, "step": 1778 }, { "epoch": 2.4475043029259895, "grad_norm": 0.9027979373931885, "learning_rate": 3.453707805804063e-06, "loss": 0.9676, "mean_token_accuracy": 0.7963006719946861, "num_tokens": 335891369.0, "step": 1779 }, { "epoch": 2.448881239242685, "grad_norm": 0.5272055268287659, "learning_rate": 3.437014425772587e-06, "loss": 1.5965, "mean_token_accuracy": 0.6715639531612396, "num_tokens": 336219561.0, "step": 1780 }, { "epoch": 2.45025817555938, "grad_norm": 0.6648910045623779, "learning_rate": 3.420357693756502e-06, "loss": 1.6965, "mean_token_accuracy": 0.6649904996156693, "num_tokens": 336428641.0, "step": 1781 }, { "epoch": 2.451635111876076, "grad_norm": 0.6951359510421753, "learning_rate": 3.4037376466112517e-06, "loss": 1.3812, "mean_token_accuracy": 0.7215996906161308, "num_tokens": 336605055.0, "step": 1782 }, { "epoch": 2.453012048192771, "grad_norm": 0.717552900314331, "learning_rate": 3.38715432111111e-06, "loss": 1.142, "mean_token_accuracy": 0.7633810937404633, "num_tokens": 336763176.0, "step": 1783 }, { "epoch": 2.4543889845094666, "grad_norm": 0.7249279022216797, "learning_rate": 3.3706077539490933e-06, "loss": 0.9699, "mean_token_accuracy": 0.7938893958926201, "num_tokens": 336908691.0, "step": 1784 }, { "epoch": 2.455765920826162, "grad_norm": 0.7902023196220398, "learning_rate": 3.354097981736899e-06, "loss": 0.8746, "mean_token_accuracy": 0.8178988397121429, "num_tokens": 337039197.0, "step": 1785 }, { "epoch": 2.4571428571428573, "grad_norm": 0.484509140253067, "learning_rate": 3.3376250410047973e-06, "loss": 1.5124, "mean_token_accuracy": 0.6956018954515457, "num_tokens": 337328467.0, "step": 1786 }, { "epoch": 2.4585197934595526, "grad_norm": 0.6379771828651428, "learning_rate": 3.3211889682015607e-06, "loss": 1.7159, "mean_token_accuracy": 0.6617947742342949, "num_tokens": 337551741.0, "step": 1787 }, { "epoch": 2.459896729776248, "grad_norm": 0.7065856456756592, "learning_rate": 3.3047897996943947e-06, "loss": 1.449, "mean_token_accuracy": 0.7101314291357994, "num_tokens": 337734625.0, "step": 1788 }, { "epoch": 2.4612736660929433, "grad_norm": 0.7084061503410339, "learning_rate": 3.288427571768851e-06, "loss": 1.1916, "mean_token_accuracy": 0.7546485066413879, "num_tokens": 337896776.0, "step": 1789 }, { "epoch": 2.4626506024096386, "grad_norm": 0.7147011160850525, "learning_rate": 3.2721023206287296e-06, "loss": 0.9688, "mean_token_accuracy": 0.798360787332058, "num_tokens": 338045435.0, "step": 1790 }, { "epoch": 2.464027538726334, "grad_norm": 0.7634836435317993, "learning_rate": 3.2558140823960293e-06, "loss": 0.8803, "mean_token_accuracy": 0.8153973966836929, "num_tokens": 338180865.0, "step": 1791 }, { "epoch": 2.4654044750430293, "grad_norm": 0.18025483191013336, "learning_rate": 3.2395628931108434e-06, "loss": 1.4494, "mean_token_accuracy": 0.7344219610095024, "num_tokens": 338439642.0, "step": 1792 }, { "epoch": 2.4667814113597246, "grad_norm": 0.6042923927307129, "learning_rate": 3.2233487887312974e-06, "loss": 1.738, "mean_token_accuracy": 0.6574280485510826, "num_tokens": 338684049.0, "step": 1793 }, { "epoch": 2.46815834767642, "grad_norm": 0.6756945848464966, "learning_rate": 3.207171805133453e-06, "loss": 1.5524, "mean_token_accuracy": 0.6915686875581741, "num_tokens": 338876500.0, "step": 1794 }, { "epoch": 2.4695352839931153, "grad_norm": 0.708626925945282, "learning_rate": 3.1910319781112364e-06, "loss": 1.2839, "mean_token_accuracy": 0.7383913397789001, "num_tokens": 339044354.0, "step": 1795 }, { "epoch": 2.4709122203098106, "grad_norm": 0.7090128064155579, "learning_rate": 3.174929343376374e-06, "loss": 1.0476, "mean_token_accuracy": 0.7813762575387955, "num_tokens": 339197107.0, "step": 1796 }, { "epoch": 2.472289156626506, "grad_norm": 0.726616621017456, "learning_rate": 3.1588639365582875e-06, "loss": 0.8692, "mean_token_accuracy": 0.8180599734187126, "num_tokens": 339336993.0, "step": 1797 }, { "epoch": 2.4736660929432013, "grad_norm": 0.25993111729621887, "learning_rate": 3.1428357932040133e-06, "loss": 1.2694, "mean_token_accuracy": 0.774927943944931, "num_tokens": 339542757.0, "step": 1798 }, { "epoch": 2.4750430292598966, "grad_norm": 0.5870371460914612, "learning_rate": 3.1268449487781606e-06, "loss": 1.7353, "mean_token_accuracy": 0.658186562359333, "num_tokens": 339811397.0, "step": 1799 }, { "epoch": 2.476419965576592, "grad_norm": 0.6763726472854614, "learning_rate": 3.1108914386627976e-06, "loss": 1.6141, "mean_token_accuracy": 0.68065045773983, "num_tokens": 340011394.0, "step": 1800 }, { "epoch": 2.4777969018932873, "grad_norm": 0.7111775279045105, "learning_rate": 3.0949752981573855e-06, "loss": 1.3356, "mean_token_accuracy": 0.7286431342363358, "num_tokens": 340182884.0, "step": 1801 }, { "epoch": 2.4791738382099826, "grad_norm": 0.7066949605941772, "learning_rate": 3.0790965624787027e-06, "loss": 1.0602, "mean_token_accuracy": 0.7788359820842743, "num_tokens": 340338005.0, "step": 1802 }, { "epoch": 2.4805507745266784, "grad_norm": 0.7209497690200806, "learning_rate": 3.0632552667607606e-06, "loss": 0.8976, "mean_token_accuracy": 0.8090033829212189, "num_tokens": 340480639.0, "step": 1803 }, { "epoch": 2.4819277108433733, "grad_norm": 0.8935545682907104, "learning_rate": 3.04745144605473e-06, "loss": 0.9541, "mean_token_accuracy": 0.7943127602338791, "num_tokens": 340605041.0, "step": 1804 }, { "epoch": 2.483304647160069, "grad_norm": 0.5217428207397461, "learning_rate": 3.0316851353288657e-06, "loss": 1.5601, "mean_token_accuracy": 0.6796040832996368, "num_tokens": 340931433.0, "step": 1805 }, { "epoch": 2.4846815834767644, "grad_norm": 0.6473129391670227, "learning_rate": 3.0159563694684245e-06, "loss": 1.6453, "mean_token_accuracy": 0.674877479672432, "num_tokens": 341142723.0, "step": 1806 }, { "epoch": 2.4860585197934597, "grad_norm": 0.6981895565986633, "learning_rate": 3.000265183275586e-06, "loss": 1.3944, "mean_token_accuracy": 0.7198233306407928, "num_tokens": 341320629.0, "step": 1807 }, { "epoch": 2.487435456110155, "grad_norm": 0.7059392929077148, "learning_rate": 2.984611611469386e-06, "loss": 1.1454, "mean_token_accuracy": 0.7631513476371765, "num_tokens": 341479822.0, "step": 1808 }, { "epoch": 2.4888123924268504, "grad_norm": 0.7153941988945007, "learning_rate": 2.9689956886856273e-06, "loss": 0.9508, "mean_token_accuracy": 0.8001632168889046, "num_tokens": 341625732.0, "step": 1809 }, { "epoch": 2.4901893287435457, "grad_norm": 0.783607542514801, "learning_rate": 2.953417449476812e-06, "loss": 0.8741, "mean_token_accuracy": 0.8172848150134087, "num_tokens": 341756724.0, "step": 1810 }, { "epoch": 2.491566265060241, "grad_norm": 0.4780285954475403, "learning_rate": 2.937876928312062e-06, "loss": 1.5344, "mean_token_accuracy": 0.6935426965355873, "num_tokens": 342053286.0, "step": 1811 }, { "epoch": 2.4929432013769364, "grad_norm": 0.6403234004974365, "learning_rate": 2.9223741595770392e-06, "loss": 1.7418, "mean_token_accuracy": 0.6603347733616829, "num_tokens": 342277770.0, "step": 1812 }, { "epoch": 2.4943201376936317, "grad_norm": 0.6929663419723511, "learning_rate": 2.9069091775738756e-06, "loss": 1.4565, "mean_token_accuracy": 0.7054594606161118, "num_tokens": 342460889.0, "step": 1813 }, { "epoch": 2.495697074010327, "grad_norm": 0.7154524326324463, "learning_rate": 2.8914820165210965e-06, "loss": 1.2054, "mean_token_accuracy": 0.7523983493447304, "num_tokens": 342623387.0, "step": 1814 }, { "epoch": 2.4970740103270224, "grad_norm": 0.6992544531822205, "learning_rate": 2.8760927105535376e-06, "loss": 0.9797, "mean_token_accuracy": 0.7923762127757072, "num_tokens": 342772080.0, "step": 1815 }, { "epoch": 2.4984509466437177, "grad_norm": 0.7469410300254822, "learning_rate": 2.860741293722278e-06, "loss": 0.8772, "mean_token_accuracy": 0.8135547637939453, "num_tokens": 342908106.0, "step": 1816 }, { "epoch": 2.499827882960413, "grad_norm": 0.18429484963417053, "learning_rate": 2.8454277999945603e-06, "loss": 1.428, "mean_token_accuracy": 0.7350741028785706, "num_tokens": 343161907.0, "step": 1817 }, { "epoch": 2.5012048192771084, "grad_norm": 0.6135638356208801, "learning_rate": 2.8301522632537183e-06, "loss": 1.7466, "mean_token_accuracy": 0.6605497151613235, "num_tokens": 343403525.0, "step": 1818 }, { "epoch": 2.5025817555938037, "grad_norm": 0.6938374638557434, "learning_rate": 2.8149147172990978e-06, "loss": 1.5272, "mean_token_accuracy": 0.6948526352643967, "num_tokens": 343594821.0, "step": 1819 }, { "epoch": 2.503958691910499, "grad_norm": 0.7154526710510254, "learning_rate": 2.7997151958459888e-06, "loss": 1.2744, "mean_token_accuracy": 0.740277923643589, "num_tokens": 343762264.0, "step": 1820 }, { "epoch": 2.5053356282271944, "grad_norm": 0.7085053324699402, "learning_rate": 2.7845537325255457e-06, "loss": 1.0464, "mean_token_accuracy": 0.7798978835344315, "num_tokens": 343914462.0, "step": 1821 }, { "epoch": 2.5067125645438897, "grad_norm": 0.7244232296943665, "learning_rate": 2.769430360884711e-06, "loss": 0.8606, "mean_token_accuracy": 0.821686677634716, "num_tokens": 344053657.0, "step": 1822 }, { "epoch": 2.508089500860585, "grad_norm": 0.258849173784256, "learning_rate": 2.7543451143861344e-06, "loss": 1.2319, "mean_token_accuracy": 0.7755017280578613, "num_tokens": 344260372.0, "step": 1823 }, { "epoch": 2.509466437177281, "grad_norm": 0.5854313969612122, "learning_rate": 2.7392980264081306e-06, "loss": 1.7571, "mean_token_accuracy": 0.6575272381305695, "num_tokens": 344527850.0, "step": 1824 }, { "epoch": 2.5108433734939757, "grad_norm": 0.6785693168640137, "learning_rate": 2.7242891302445686e-06, "loss": 1.6289, "mean_token_accuracy": 0.6777406334877014, "num_tokens": 344727866.0, "step": 1825 }, { "epoch": 2.5122203098106715, "grad_norm": 0.7080572843551636, "learning_rate": 2.709318459104815e-06, "loss": 1.3244, "mean_token_accuracy": 0.7308924868702888, "num_tokens": 344899470.0, "step": 1826 }, { "epoch": 2.5135972461273663, "grad_norm": 0.7199095487594604, "learning_rate": 2.694386046113655e-06, "loss": 1.0629, "mean_token_accuracy": 0.7779956459999084, "num_tokens": 345054417.0, "step": 1827 }, { "epoch": 2.514974182444062, "grad_norm": 0.7281377911567688, "learning_rate": 2.679491924311226e-06, "loss": 0.9185, "mean_token_accuracy": 0.8098790869116783, "num_tokens": 345197066.0, "step": 1828 }, { "epoch": 2.5163511187607575, "grad_norm": 0.9195863604545593, "learning_rate": 2.664636126652942e-06, "loss": 0.9815, "mean_token_accuracy": 0.7893208265304565, "num_tokens": 345321210.0, "step": 1829 }, { "epoch": 2.517728055077453, "grad_norm": 0.5108121633529663, "learning_rate": 2.6498186860094176e-06, "loss": 1.5619, "mean_token_accuracy": 0.6752858310937881, "num_tokens": 345651601.0, "step": 1830 }, { "epoch": 2.519104991394148, "grad_norm": 0.6448004245758057, "learning_rate": 2.63503963516639e-06, "loss": 1.6559, "mean_token_accuracy": 0.6741896569728851, "num_tokens": 345864008.0, "step": 1831 }, { "epoch": 2.5204819277108435, "grad_norm": 0.7041764259338379, "learning_rate": 2.6202990068246625e-06, "loss": 1.4338, "mean_token_accuracy": 0.7120680287480354, "num_tokens": 346042403.0, "step": 1832 }, { "epoch": 2.521858864027539, "grad_norm": 0.7159610986709595, "learning_rate": 2.605596833600017e-06, "loss": 1.1829, "mean_token_accuracy": 0.7552445232868195, "num_tokens": 346201415.0, "step": 1833 }, { "epoch": 2.523235800344234, "grad_norm": 0.7174326181411743, "learning_rate": 2.5909331480231558e-06, "loss": 0.9526, "mean_token_accuracy": 0.7997385486960411, "num_tokens": 346347481.0, "step": 1834 }, { "epoch": 2.5246127366609294, "grad_norm": 0.7922436594963074, "learning_rate": 2.5763079825396165e-06, "loss": 0.8709, "mean_token_accuracy": 0.8182816356420517, "num_tokens": 346478415.0, "step": 1835 }, { "epoch": 2.525989672977625, "grad_norm": 0.46641403436660767, "learning_rate": 2.5617213695097045e-06, "loss": 1.5162, "mean_token_accuracy": 0.696432463824749, "num_tokens": 346774533.0, "step": 1836 }, { "epoch": 2.52736660929432, "grad_norm": 0.6326403021812439, "learning_rate": 2.547173341208422e-06, "loss": 1.7452, "mean_token_accuracy": 0.6567660570144653, "num_tokens": 346999946.0, "step": 1837 }, { "epoch": 2.5287435456110154, "grad_norm": 0.6984650492668152, "learning_rate": 2.532663929825405e-06, "loss": 1.4778, "mean_token_accuracy": 0.7064724937081337, "num_tokens": 347185264.0, "step": 1838 }, { "epoch": 2.5301204819277108, "grad_norm": 0.7262609004974365, "learning_rate": 2.5181931674648265e-06, "loss": 1.2399, "mean_token_accuracy": 0.7472666800022125, "num_tokens": 347348467.0, "step": 1839 }, { "epoch": 2.531497418244406, "grad_norm": 0.7113873958587646, "learning_rate": 2.5037610861453578e-06, "loss": 0.9984, "mean_token_accuracy": 0.7895892709493637, "num_tokens": 347497728.0, "step": 1840 }, { "epoch": 2.5328743545611014, "grad_norm": 0.7424374222755432, "learning_rate": 2.4893677178000797e-06, "loss": 0.8714, "mean_token_accuracy": 0.8159217983484268, "num_tokens": 347633506.0, "step": 1841 }, { "epoch": 2.5342512908777968, "grad_norm": 0.18732605874538422, "learning_rate": 2.475013094276413e-06, "loss": 1.3748, "mean_token_accuracy": 0.7341266795992851, "num_tokens": 347895502.0, "step": 1842 }, { "epoch": 2.535628227194492, "grad_norm": 0.6101974248886108, "learning_rate": 2.460697247336048e-06, "loss": 1.7408, "mean_token_accuracy": 0.6583914905786514, "num_tokens": 348138478.0, "step": 1843 }, { "epoch": 2.5370051635111874, "grad_norm": 0.6800745129585266, "learning_rate": 2.4464202086548874e-06, "loss": 1.5271, "mean_token_accuracy": 0.69540224224329, "num_tokens": 348329384.0, "step": 1844 }, { "epoch": 2.538382099827883, "grad_norm": 0.7203128337860107, "learning_rate": 2.4321820098229476e-06, "loss": 1.2601, "mean_token_accuracy": 0.7396189644932747, "num_tokens": 348496688.0, "step": 1845 }, { "epoch": 2.539759036144578, "grad_norm": 0.713460385799408, "learning_rate": 2.4179826823443266e-06, "loss": 1.0297, "mean_token_accuracy": 0.7841876745223999, "num_tokens": 348648809.0, "step": 1846 }, { "epoch": 2.541135972461274, "grad_norm": 0.7171739935874939, "learning_rate": 2.4038222576370873e-06, "loss": 0.8619, "mean_token_accuracy": 0.8170942291617393, "num_tokens": 348787889.0, "step": 1847 }, { "epoch": 2.5425129087779688, "grad_norm": 0.26832571625709534, "learning_rate": 2.3897007670332385e-06, "loss": 1.2665, "mean_token_accuracy": 0.7687693536281586, "num_tokens": 348994122.0, "step": 1848 }, { "epoch": 2.5438898450946645, "grad_norm": 0.5764619708061218, "learning_rate": 2.3756182417786322e-06, "loss": 1.7223, "mean_token_accuracy": 0.6619002521038055, "num_tokens": 349264919.0, "step": 1849 }, { "epoch": 2.54526678141136, "grad_norm": 0.6667534708976746, "learning_rate": 2.3615747130329013e-06, "loss": 1.5802, "mean_token_accuracy": 0.6857097968459129, "num_tokens": 349466188.0, "step": 1850 }, { "epoch": 2.546643717728055, "grad_norm": 0.7030420303344727, "learning_rate": 2.3475702118693987e-06, "loss": 1.33, "mean_token_accuracy": 0.7310289219021797, "num_tokens": 349638853.0, "step": 1851 }, { "epoch": 2.5480206540447505, "grad_norm": 0.7106204032897949, "learning_rate": 2.3336047692751216e-06, "loss": 1.0755, "mean_token_accuracy": 0.7729623764753342, "num_tokens": 349794402.0, "step": 1852 }, { "epoch": 2.549397590361446, "grad_norm": 0.7266464829444885, "learning_rate": 2.3196784161506415e-06, "loss": 0.938, "mean_token_accuracy": 0.8033684119582176, "num_tokens": 349937397.0, "step": 1853 }, { "epoch": 2.550774526678141, "grad_norm": 0.9123243689537048, "learning_rate": 2.3057911833100377e-06, "loss": 0.9655, "mean_token_accuracy": 0.7947684526443481, "num_tokens": 350062282.0, "step": 1854 }, { "epoch": 2.5521514629948365, "grad_norm": 0.5233032703399658, "learning_rate": 2.291943101480847e-06, "loss": 1.572, "mean_token_accuracy": 0.6750996708869934, "num_tokens": 350385944.0, "step": 1855 }, { "epoch": 2.553528399311532, "grad_norm": 0.6640167832374573, "learning_rate": 2.278134201303952e-06, "loss": 1.6745, "mean_token_accuracy": 0.6709112375974655, "num_tokens": 350595618.0, "step": 1856 }, { "epoch": 2.554905335628227, "grad_norm": 0.6896263957023621, "learning_rate": 2.2643645133335546e-06, "loss": 1.4013, "mean_token_accuracy": 0.7183136865496635, "num_tokens": 350772752.0, "step": 1857 }, { "epoch": 2.5562822719449225, "grad_norm": 0.7109851241111755, "learning_rate": 2.250634068037099e-06, "loss": 1.1259, "mean_token_accuracy": 0.7662733271718025, "num_tokens": 350931545.0, "step": 1858 }, { "epoch": 2.557659208261618, "grad_norm": 0.7111606597900391, "learning_rate": 2.2369428957951865e-06, "loss": 0.9566, "mean_token_accuracy": 0.8020409420132637, "num_tokens": 351077403.0, "step": 1859 }, { "epoch": 2.559036144578313, "grad_norm": 0.76932293176651, "learning_rate": 2.223291026901533e-06, "loss": 0.8724, "mean_token_accuracy": 0.8158709704875946, "num_tokens": 351208614.0, "step": 1860 }, { "epoch": 2.5604130808950085, "grad_norm": 0.480739027261734, "learning_rate": 2.209678491562881e-06, "loss": 1.5227, "mean_token_accuracy": 0.6941847205162048, "num_tokens": 351502509.0, "step": 1861 }, { "epoch": 2.561790017211704, "grad_norm": 0.6342624425888062, "learning_rate": 2.1961053198989467e-06, "loss": 1.6873, "mean_token_accuracy": 0.6670400202274323, "num_tokens": 351725599.0, "step": 1862 }, { "epoch": 2.563166953528399, "grad_norm": 0.696419358253479, "learning_rate": 2.182571541942349e-06, "loss": 1.4575, "mean_token_accuracy": 0.7068997547030449, "num_tokens": 351907771.0, "step": 1863 }, { "epoch": 2.5645438898450945, "grad_norm": 0.7000638246536255, "learning_rate": 2.1690771876385375e-06, "loss": 1.1793, "mean_token_accuracy": 0.7564426064491272, "num_tokens": 352069320.0, "step": 1864 }, { "epoch": 2.56592082616179, "grad_norm": 0.6961361765861511, "learning_rate": 2.1556222868457377e-06, "loss": 0.9782, "mean_token_accuracy": 0.7942206859588623, "num_tokens": 352217735.0, "step": 1865 }, { "epoch": 2.5672977624784856, "grad_norm": 0.7389585971832275, "learning_rate": 2.142206869334873e-06, "loss": 0.871, "mean_token_accuracy": 0.8179055526852608, "num_tokens": 352353121.0, "step": 1866 }, { "epoch": 2.5686746987951805, "grad_norm": 0.17864498496055603, "learning_rate": 2.128830964789508e-06, "loss": 1.4554, "mean_token_accuracy": 0.7332155257463455, "num_tokens": 352611229.0, "step": 1867 }, { "epoch": 2.5700516351118763, "grad_norm": 0.609570324420929, "learning_rate": 2.1154946028057744e-06, "loss": 1.7783, "mean_token_accuracy": 0.6532195508480072, "num_tokens": 352855875.0, "step": 1868 }, { "epoch": 2.571428571428571, "grad_norm": 0.6770307421684265, "learning_rate": 2.102197812892317e-06, "loss": 1.522, "mean_token_accuracy": 0.6938225775957108, "num_tokens": 353047684.0, "step": 1869 }, { "epoch": 2.572805507745267, "grad_norm": 0.711121141910553, "learning_rate": 2.0889406244702193e-06, "loss": 1.2729, "mean_token_accuracy": 0.7386659160256386, "num_tokens": 353215115.0, "step": 1870 }, { "epoch": 2.5741824440619623, "grad_norm": 0.7181096076965332, "learning_rate": 2.075723066872939e-06, "loss": 1.0269, "mean_token_accuracy": 0.7845107391476631, "num_tokens": 353367472.0, "step": 1871 }, { "epoch": 2.5755593803786576, "grad_norm": 0.7448581457138062, "learning_rate": 2.062545169346235e-06, "loss": 0.9042, "mean_token_accuracy": 0.8105903267860413, "num_tokens": 353507280.0, "step": 1872 }, { "epoch": 2.576936316695353, "grad_norm": 0.2549469470977783, "learning_rate": 2.0494069610481347e-06, "loss": 1.2218, "mean_token_accuracy": 0.7749459370970726, "num_tokens": 353716682.0, "step": 1873 }, { "epoch": 2.5783132530120483, "grad_norm": 0.5823709964752197, "learning_rate": 2.0363084710488334e-06, "loss": 1.7096, "mean_token_accuracy": 0.664029449224472, "num_tokens": 353986417.0, "step": 1874 }, { "epoch": 2.5796901893287436, "grad_norm": 0.6664543151855469, "learning_rate": 2.023249728330645e-06, "loss": 1.6102, "mean_token_accuracy": 0.6831080764532089, "num_tokens": 354186269.0, "step": 1875 }, { "epoch": 2.581067125645439, "grad_norm": 0.710763156414032, "learning_rate": 2.0102307617879367e-06, "loss": 1.3455, "mean_token_accuracy": 0.7264646589756012, "num_tokens": 354358334.0, "step": 1876 }, { "epoch": 2.5824440619621343, "grad_norm": 0.710014283657074, "learning_rate": 1.997251600227068e-06, "loss": 1.0624, "mean_token_accuracy": 0.7774544879794121, "num_tokens": 354513587.0, "step": 1877 }, { "epoch": 2.5838209982788296, "grad_norm": 0.7412890791893005, "learning_rate": 1.98431227236632e-06, "loss": 0.9386, "mean_token_accuracy": 0.8037263751029968, "num_tokens": 354656452.0, "step": 1878 }, { "epoch": 2.585197934595525, "grad_norm": 0.9001291394233704, "learning_rate": 1.971412806835842e-06, "loss": 0.9684, "mean_token_accuracy": 0.7955762296915054, "num_tokens": 354781399.0, "step": 1879 }, { "epoch": 2.5865748709122203, "grad_norm": 0.5096123218536377, "learning_rate": 1.9585532321775736e-06, "loss": 1.5265, "mean_token_accuracy": 0.6836948692798615, "num_tokens": 355116124.0, "step": 1880 }, { "epoch": 2.5879518072289156, "grad_norm": 0.6588422656059265, "learning_rate": 1.9457335768451967e-06, "loss": 1.6588, "mean_token_accuracy": 0.672054760158062, "num_tokens": 355327120.0, "step": 1881 }, { "epoch": 2.589328743545611, "grad_norm": 0.6864471435546875, "learning_rate": 1.9329538692040594e-06, "loss": 1.3676, "mean_token_accuracy": 0.7239294722676277, "num_tokens": 355504335.0, "step": 1882 }, { "epoch": 2.5907056798623063, "grad_norm": 0.709483802318573, "learning_rate": 1.9202141375311335e-06, "loss": 1.1347, "mean_token_accuracy": 0.7651729062199593, "num_tokens": 355663048.0, "step": 1883 }, { "epoch": 2.5920826161790016, "grad_norm": 0.7142502665519714, "learning_rate": 1.9075144100149234e-06, "loss": 0.9403, "mean_token_accuracy": 0.801085963845253, "num_tokens": 355808922.0, "step": 1884 }, { "epoch": 2.593459552495697, "grad_norm": 0.814610481262207, "learning_rate": 1.8948547147554252e-06, "loss": 0.8933, "mean_token_accuracy": 0.8147625476121902, "num_tokens": 355939668.0, "step": 1885 }, { "epoch": 2.5948364888123923, "grad_norm": 0.4795001447200775, "learning_rate": 1.8822350797640543e-06, "loss": 1.5355, "mean_token_accuracy": 0.6919986084103584, "num_tokens": 356230796.0, "step": 1886 }, { "epoch": 2.596213425129088, "grad_norm": 0.6374743580818176, "learning_rate": 1.8696555329635924e-06, "loss": 1.7177, "mean_token_accuracy": 0.6617404073476791, "num_tokens": 356455188.0, "step": 1887 }, { "epoch": 2.597590361445783, "grad_norm": 0.6852765679359436, "learning_rate": 1.857116102188119e-06, "loss": 1.4376, "mean_token_accuracy": 0.7073842659592628, "num_tokens": 356639173.0, "step": 1888 }, { "epoch": 2.5989672977624787, "grad_norm": 0.7156987190246582, "learning_rate": 1.8446168151829424e-06, "loss": 1.1833, "mean_token_accuracy": 0.7542614787817001, "num_tokens": 356801880.0, "step": 1889 }, { "epoch": 2.6003442340791736, "grad_norm": 0.7090957760810852, "learning_rate": 1.8321576996045553e-06, "loss": 0.995, "mean_token_accuracy": 0.7923205643892288, "num_tokens": 356950834.0, "step": 1890 }, { "epoch": 2.6017211703958694, "grad_norm": 0.7545850276947021, "learning_rate": 1.8197387830205682e-06, "loss": 0.8903, "mean_token_accuracy": 0.8141032829880714, "num_tokens": 357086431.0, "step": 1891 }, { "epoch": 2.6030981067125647, "grad_norm": 0.17923860251903534, "learning_rate": 1.8073600929096314e-06, "loss": 1.4033, "mean_token_accuracy": 0.7385569959878922, "num_tokens": 357344204.0, "step": 1892 }, { "epoch": 2.60447504302926, "grad_norm": 0.6089861989021301, "learning_rate": 1.7950216566614086e-06, "loss": 1.7395, "mean_token_accuracy": 0.6591399908065796, "num_tokens": 357588632.0, "step": 1893 }, { "epoch": 2.6058519793459554, "grad_norm": 0.6749317646026611, "learning_rate": 1.782723501576482e-06, "loss": 1.5353, "mean_token_accuracy": 0.6932085826992989, "num_tokens": 357780309.0, "step": 1894 }, { "epoch": 2.6072289156626507, "grad_norm": 0.69772869348526, "learning_rate": 1.770465654866309e-06, "loss": 1.2506, "mean_token_accuracy": 0.7434923723340034, "num_tokens": 357947690.0, "step": 1895 }, { "epoch": 2.608605851979346, "grad_norm": 0.7112127542495728, "learning_rate": 1.7582481436531585e-06, "loss": 1.0485, "mean_token_accuracy": 0.7807156220078468, "num_tokens": 358100152.0, "step": 1896 }, { "epoch": 2.6099827882960414, "grad_norm": 0.7430719137191772, "learning_rate": 1.7460709949700504e-06, "loss": 0.8864, "mean_token_accuracy": 0.811816930770874, "num_tokens": 358239783.0, "step": 1897 }, { "epoch": 2.6113597246127367, "grad_norm": 0.26531580090522766, "learning_rate": 1.733934235760697e-06, "loss": 1.2514, "mean_token_accuracy": 0.7717476710677147, "num_tokens": 358448760.0, "step": 1898 }, { "epoch": 2.612736660929432, "grad_norm": 0.5840001702308655, "learning_rate": 1.7218378928794455e-06, "loss": 1.7597, "mean_token_accuracy": 0.6573799327015877, "num_tokens": 358715251.0, "step": 1899 }, { "epoch": 2.6141135972461274, "grad_norm": 0.6724269390106201, "learning_rate": 1.7097819930912129e-06, "loss": 1.617, "mean_token_accuracy": 0.6790646389126778, "num_tokens": 358914780.0, "step": 1900 }, { "epoch": 2.6154905335628227, "grad_norm": 0.7186145782470703, "learning_rate": 1.6977665630714345e-06, "loss": 1.3514, "mean_token_accuracy": 0.7245762720704079, "num_tokens": 359086655.0, "step": 1901 }, { "epoch": 2.616867469879518, "grad_norm": 0.7007203698158264, "learning_rate": 1.6857916294059929e-06, "loss": 1.0864, "mean_token_accuracy": 0.7722152322530746, "num_tokens": 359241661.0, "step": 1902 }, { "epoch": 2.6182444061962133, "grad_norm": 0.7311568856239319, "learning_rate": 1.673857218591175e-06, "loss": 0.9016, "mean_token_accuracy": 0.8097322881221771, "num_tokens": 359384381.0, "step": 1903 }, { "epoch": 2.6196213425129087, "grad_norm": 0.9065669775009155, "learning_rate": 1.661963357033607e-06, "loss": 0.9754, "mean_token_accuracy": 0.7942287400364876, "num_tokens": 359508719.0, "step": 1904 }, { "epoch": 2.620998278829604, "grad_norm": 0.5209767818450928, "learning_rate": 1.650110071050175e-06, "loss": 1.5581, "mean_token_accuracy": 0.6764244362711906, "num_tokens": 359839351.0, "step": 1905 }, { "epoch": 2.6223752151462993, "grad_norm": 0.6556630730628967, "learning_rate": 1.6382973868680062e-06, "loss": 1.6727, "mean_token_accuracy": 0.6725062802433968, "num_tokens": 360049071.0, "step": 1906 }, { "epoch": 2.6237521514629947, "grad_norm": 0.6963779330253601, "learning_rate": 1.6265253306243823e-06, "loss": 1.3879, "mean_token_accuracy": 0.7199321910738945, "num_tokens": 360226175.0, "step": 1907 }, { "epoch": 2.6251290877796905, "grad_norm": 0.720212996006012, "learning_rate": 1.6147939283666892e-06, "loss": 1.131, "mean_token_accuracy": 0.7639685645699501, "num_tokens": 360384951.0, "step": 1908 }, { "epoch": 2.6265060240963853, "grad_norm": 0.7146462798118591, "learning_rate": 1.6031032060523633e-06, "loss": 0.9569, "mean_token_accuracy": 0.7997109442949295, "num_tokens": 360531150.0, "step": 1909 }, { "epoch": 2.627882960413081, "grad_norm": 0.78847336769104, "learning_rate": 1.5914531895488262e-06, "loss": 0.8838, "mean_token_accuracy": 0.8178327903151512, "num_tokens": 360662648.0, "step": 1910 }, { "epoch": 2.629259896729776, "grad_norm": 0.461379736661911, "learning_rate": 1.5798439046334357e-06, "loss": 1.4828, "mean_token_accuracy": 0.6999989002943039, "num_tokens": 360957377.0, "step": 1911 }, { "epoch": 2.630636833046472, "grad_norm": 0.6264628171920776, "learning_rate": 1.5682753769934245e-06, "loss": 1.7221, "mean_token_accuracy": 0.6637033596634865, "num_tokens": 361181303.0, "step": 1912 }, { "epoch": 2.632013769363167, "grad_norm": 0.6989827752113342, "learning_rate": 1.5567476322258413e-06, "loss": 1.4595, "mean_token_accuracy": 0.7082502618432045, "num_tokens": 361365113.0, "step": 1913 }, { "epoch": 2.6333907056798624, "grad_norm": 0.7197508215904236, "learning_rate": 1.5452606958375005e-06, "loss": 1.191, "mean_token_accuracy": 0.7524938508868217, "num_tokens": 361527947.0, "step": 1914 }, { "epoch": 2.6347676419965578, "grad_norm": 0.70897376537323, "learning_rate": 1.53381459324492e-06, "loss": 1.0102, "mean_token_accuracy": 0.7887141853570938, "num_tokens": 361677598.0, "step": 1915 }, { "epoch": 2.636144578313253, "grad_norm": 0.7482555508613586, "learning_rate": 1.5224093497742654e-06, "loss": 0.8688, "mean_token_accuracy": 0.8184317052364349, "num_tokens": 361813691.0, "step": 1916 }, { "epoch": 2.6375215146299484, "grad_norm": 0.18295319378376007, "learning_rate": 1.511044990661299e-06, "loss": 1.39, "mean_token_accuracy": 0.7402258440852165, "num_tokens": 362072352.0, "step": 1917 }, { "epoch": 2.6388984509466438, "grad_norm": 0.6088801026344299, "learning_rate": 1.4997215410513178e-06, "loss": 1.7344, "mean_token_accuracy": 0.661285474896431, "num_tokens": 362316938.0, "step": 1918 }, { "epoch": 2.640275387263339, "grad_norm": 0.6796539425849915, "learning_rate": 1.4884390259991023e-06, "loss": 1.5338, "mean_token_accuracy": 0.6947265192866325, "num_tokens": 362508653.0, "step": 1919 }, { "epoch": 2.6416523235800344, "grad_norm": 0.6966733932495117, "learning_rate": 1.4771974704688564e-06, "loss": 1.2496, "mean_token_accuracy": 0.7448520585894585, "num_tokens": 362675595.0, "step": 1920 }, { "epoch": 2.6430292598967298, "grad_norm": 0.7095774412155151, "learning_rate": 1.4659968993341612e-06, "loss": 1.0557, "mean_token_accuracy": 0.7782033309340477, "num_tokens": 362827750.0, "step": 1921 }, { "epoch": 2.644406196213425, "grad_norm": 0.738463282585144, "learning_rate": 1.4548373373779078e-06, "loss": 0.8796, "mean_token_accuracy": 0.8131650686264038, "num_tokens": 362967115.0, "step": 1922 }, { "epoch": 2.6457831325301204, "grad_norm": 0.2534920573234558, "learning_rate": 1.4437188092922494e-06, "loss": 1.238, "mean_token_accuracy": 0.7720615863800049, "num_tokens": 363175373.0, "step": 1923 }, { "epoch": 2.6471600688468158, "grad_norm": 0.5814939737319946, "learning_rate": 1.4326413396785488e-06, "loss": 1.7296, "mean_token_accuracy": 0.6589570418000221, "num_tokens": 363445607.0, "step": 1924 }, { "epoch": 2.648537005163511, "grad_norm": 0.6676390171051025, "learning_rate": 1.4216049530473175e-06, "loss": 1.6029, "mean_token_accuracy": 0.683783546090126, "num_tokens": 363645919.0, "step": 1925 }, { "epoch": 2.6499139414802064, "grad_norm": 0.6943197846412659, "learning_rate": 1.4106096738181707e-06, "loss": 1.3018, "mean_token_accuracy": 0.7349057570099831, "num_tokens": 363818230.0, "step": 1926 }, { "epoch": 2.6512908777969018, "grad_norm": 0.7241226434707642, "learning_rate": 1.3996555263197587e-06, "loss": 1.0657, "mean_token_accuracy": 0.7750911712646484, "num_tokens": 363973580.0, "step": 1927 }, { "epoch": 2.652667814113597, "grad_norm": 0.7290360927581787, "learning_rate": 1.3887425347897287e-06, "loss": 0.9171, "mean_token_accuracy": 0.8063676208257675, "num_tokens": 364116313.0, "step": 1928 }, { "epoch": 2.654044750430293, "grad_norm": 0.9089776873588562, "learning_rate": 1.3778707233746657e-06, "loss": 0.9961, "mean_token_accuracy": 0.7932952716946602, "num_tokens": 364240612.0, "step": 1929 }, { "epoch": 2.6554216867469878, "grad_norm": 0.5158873796463013, "learning_rate": 1.367040116130025e-06, "loss": 1.5482, "mean_token_accuracy": 0.6784003749489784, "num_tokens": 364564352.0, "step": 1930 }, { "epoch": 2.6567986230636835, "grad_norm": 0.6704475283622742, "learning_rate": 1.3562507370201062e-06, "loss": 1.7181, "mean_token_accuracy": 0.6654514893889427, "num_tokens": 364774729.0, "step": 1931 }, { "epoch": 2.6581755593803784, "grad_norm": 0.7013496160507202, "learning_rate": 1.3455026099179835e-06, "loss": 1.4139, "mean_token_accuracy": 0.7163332030177116, "num_tokens": 364951929.0, "step": 1932 }, { "epoch": 2.659552495697074, "grad_norm": 0.7190361618995667, "learning_rate": 1.3347957586054494e-06, "loss": 1.112, "mean_token_accuracy": 0.7674020454287529, "num_tokens": 365110575.0, "step": 1933 }, { "epoch": 2.6609294320137695, "grad_norm": 0.7379345893859863, "learning_rate": 1.3241302067729689e-06, "loss": 0.9675, "mean_token_accuracy": 0.7962216213345528, "num_tokens": 365256394.0, "step": 1934 }, { "epoch": 2.662306368330465, "grad_norm": 0.8029128313064575, "learning_rate": 1.313505978019627e-06, "loss": 0.8745, "mean_token_accuracy": 0.8203368782997131, "num_tokens": 365387330.0, "step": 1935 }, { "epoch": 2.66368330464716, "grad_norm": 0.4753556549549103, "learning_rate": 1.3029230958530791e-06, "loss": 1.4967, "mean_token_accuracy": 0.6967713609337807, "num_tokens": 365676453.0, "step": 1936 }, { "epoch": 2.6650602409638555, "grad_norm": 0.6347079277038574, "learning_rate": 1.2923815836894926e-06, "loss": 1.7239, "mean_token_accuracy": 0.6630264148116112, "num_tokens": 365899645.0, "step": 1937 }, { "epoch": 2.666437177280551, "grad_norm": 0.681901752948761, "learning_rate": 1.2818814648534895e-06, "loss": 1.4789, "mean_token_accuracy": 0.7009682506322861, "num_tokens": 366084028.0, "step": 1938 }, { "epoch": 2.667814113597246, "grad_norm": 0.7119235396385193, "learning_rate": 1.271422762578114e-06, "loss": 1.1902, "mean_token_accuracy": 0.7558143064379692, "num_tokens": 366247658.0, "step": 1939 }, { "epoch": 2.6691910499139415, "grad_norm": 0.710078239440918, "learning_rate": 1.2610055000047683e-06, "loss": 0.9875, "mean_token_accuracy": 0.7935100197792053, "num_tokens": 366397302.0, "step": 1940 }, { "epoch": 2.670567986230637, "grad_norm": 0.7363475561141968, "learning_rate": 1.2506297001831568e-06, "loss": 0.8888, "mean_token_accuracy": 0.8126700520515442, "num_tokens": 366533783.0, "step": 1941 }, { "epoch": 2.671944922547332, "grad_norm": 0.18314048647880554, "learning_rate": 1.240295386071253e-06, "loss": 1.4233, "mean_token_accuracy": 0.73854910582304, "num_tokens": 366789215.0, "step": 1942 }, { "epoch": 2.6733218588640275, "grad_norm": 0.615715503692627, "learning_rate": 1.2300025805352277e-06, "loss": 1.7374, "mean_token_accuracy": 0.6593687385320663, "num_tokens": 367028431.0, "step": 1943 }, { "epoch": 2.674698795180723, "grad_norm": 0.6806591153144836, "learning_rate": 1.2197513063494082e-06, "loss": 1.5145, "mean_token_accuracy": 0.6980579644441605, "num_tokens": 367218753.0, "step": 1944 }, { "epoch": 2.676075731497418, "grad_norm": 0.7155870795249939, "learning_rate": 1.2095415861962367e-06, "loss": 1.2968, "mean_token_accuracy": 0.7365723848342896, "num_tokens": 367386017.0, "step": 1945 }, { "epoch": 2.6774526678141135, "grad_norm": 0.7102594375610352, "learning_rate": 1.1993734426661985e-06, "loss": 1.0288, "mean_token_accuracy": 0.78189317882061, "num_tokens": 367538370.0, "step": 1946 }, { "epoch": 2.678829604130809, "grad_norm": 0.7340806126594543, "learning_rate": 1.1892468982577899e-06, "loss": 0.8771, "mean_token_accuracy": 0.8138488158583641, "num_tokens": 367677771.0, "step": 1947 }, { "epoch": 2.680206540447504, "grad_norm": 0.253791481256485, "learning_rate": 1.1791619753774653e-06, "loss": 1.2482, "mean_token_accuracy": 0.7735838368535042, "num_tokens": 367888080.0, "step": 1948 }, { "epoch": 2.6815834767641995, "grad_norm": 0.5747764706611633, "learning_rate": 1.1691186963395861e-06, "loss": 1.7504, "mean_token_accuracy": 0.656780481338501, "num_tokens": 368155755.0, "step": 1949 }, { "epoch": 2.6829604130808953, "grad_norm": 0.6689324975013733, "learning_rate": 1.1591170833663655e-06, "loss": 1.5896, "mean_token_accuracy": 0.6838538274168968, "num_tokens": 368355316.0, "step": 1950 }, { "epoch": 2.68433734939759, "grad_norm": 0.7073647379875183, "learning_rate": 1.1491571585878281e-06, "loss": 1.3175, "mean_token_accuracy": 0.7315395250916481, "num_tokens": 368526959.0, "step": 1951 }, { "epoch": 2.685714285714286, "grad_norm": 0.7201231122016907, "learning_rate": 1.1392389440417584e-06, "loss": 1.0948, "mean_token_accuracy": 0.7732421606779099, "num_tokens": 368681984.0, "step": 1952 }, { "epoch": 2.687091222030981, "grad_norm": 0.7249655723571777, "learning_rate": 1.1293624616736464e-06, "loss": 0.9102, "mean_token_accuracy": 0.8054459765553474, "num_tokens": 368824617.0, "step": 1953 }, { "epoch": 2.6884681583476766, "grad_norm": 0.8863624334335327, "learning_rate": 1.1195277333366517e-06, "loss": 0.9568, "mean_token_accuracy": 0.7996673062443733, "num_tokens": 368949564.0, "step": 1954 }, { "epoch": 2.689845094664372, "grad_norm": 0.519594669342041, "learning_rate": 1.1097347807915359e-06, "loss": 1.5888, "mean_token_accuracy": 0.6736690327525139, "num_tokens": 369274573.0, "step": 1955 }, { "epoch": 2.6912220309810673, "grad_norm": 0.6434212327003479, "learning_rate": 1.099983625706631e-06, "loss": 1.66, "mean_token_accuracy": 0.671598955988884, "num_tokens": 369485323.0, "step": 1956 }, { "epoch": 2.6925989672977626, "grad_norm": 0.7002913951873779, "learning_rate": 1.0902742896577912e-06, "loss": 1.4099, "mean_token_accuracy": 0.7148083746433258, "num_tokens": 369663016.0, "step": 1957 }, { "epoch": 2.693975903614458, "grad_norm": 0.7163001298904419, "learning_rate": 1.0806067941283295e-06, "loss": 1.1087, "mean_token_accuracy": 0.7673701420426369, "num_tokens": 369821808.0, "step": 1958 }, { "epoch": 2.6953528399311533, "grad_norm": 0.7063849568367004, "learning_rate": 1.0709811605089904e-06, "loss": 0.9331, "mean_token_accuracy": 0.8035457208752632, "num_tokens": 369967573.0, "step": 1959 }, { "epoch": 2.6967297762478486, "grad_norm": 0.7743335962295532, "learning_rate": 1.0613974100978885e-06, "loss": 0.8906, "mean_token_accuracy": 0.8132505416870117, "num_tokens": 370098605.0, "step": 1960 }, { "epoch": 2.698106712564544, "grad_norm": 0.4722658097743988, "learning_rate": 1.0518555641004613e-06, "loss": 1.5068, "mean_token_accuracy": 0.6973554566502571, "num_tokens": 370392410.0, "step": 1961 }, { "epoch": 2.6994836488812393, "grad_norm": 0.6407646536827087, "learning_rate": 1.0423556436294402e-06, "loss": 1.7265, "mean_token_accuracy": 0.659909576177597, "num_tokens": 370618342.0, "step": 1962 }, { "epoch": 2.7008605851979346, "grad_norm": 0.6878456473350525, "learning_rate": 1.0328976697047731e-06, "loss": 1.473, "mean_token_accuracy": 0.7051919773221016, "num_tokens": 370802514.0, "step": 1963 }, { "epoch": 2.70223752151463, "grad_norm": 0.7015199661254883, "learning_rate": 1.0234816632536094e-06, "loss": 1.1695, "mean_token_accuracy": 0.7576495781540871, "num_tokens": 370964978.0, "step": 1964 }, { "epoch": 2.7036144578313253, "grad_norm": 0.7151827216148376, "learning_rate": 1.0141076451102272e-06, "loss": 0.9766, "mean_token_accuracy": 0.7937287539243698, "num_tokens": 371113509.0, "step": 1965 }, { "epoch": 2.7049913941480206, "grad_norm": 0.740776538848877, "learning_rate": 1.004775636016011e-06, "loss": 0.8511, "mean_token_accuracy": 0.8226206079125404, "num_tokens": 371248952.0, "step": 1966 }, { "epoch": 2.706368330464716, "grad_norm": 0.1836763322353363, "learning_rate": 9.95485656619386e-07, "loss": 1.4279, "mean_token_accuracy": 0.73452427983284, "num_tokens": 371506083.0, "step": 1967 }, { "epoch": 2.7077452667814113, "grad_norm": 0.6079369783401489, "learning_rate": 9.862377274757828e-07, "loss": 1.7394, "mean_token_accuracy": 0.6615287214517593, "num_tokens": 371749116.0, "step": 1968 }, { "epoch": 2.7091222030981066, "grad_norm": 0.6816548705101013, "learning_rate": 9.77031869047591e-07, "loss": 1.5255, "mean_token_accuracy": 0.6968459337949753, "num_tokens": 371940624.0, "step": 1969 }, { "epoch": 2.710499139414802, "grad_norm": 0.7148515582084656, "learning_rate": 9.678681017041125e-07, "loss": 1.2569, "mean_token_accuracy": 0.7417391464114189, "num_tokens": 372107426.0, "step": 1970 }, { "epoch": 2.7118760757314977, "grad_norm": 0.7250866293907166, "learning_rate": 9.587464457215146e-07, "loss": 1.0249, "mean_token_accuracy": 0.7853161543607712, "num_tokens": 372258955.0, "step": 1971 }, { "epoch": 2.7132530120481926, "grad_norm": 0.7513755559921265, "learning_rate": 9.496669212827903e-07, "loss": 0.8778, "mean_token_accuracy": 0.8152144476771355, "num_tokens": 372397728.0, "step": 1972 }, { "epoch": 2.7146299483648884, "grad_norm": 0.27348434925079346, "learning_rate": 9.406295484777073e-07, "loss": 1.2317, "mean_token_accuracy": 0.771148294210434, "num_tokens": 372603643.0, "step": 1973 }, { "epoch": 2.7160068846815832, "grad_norm": 0.5815503001213074, "learning_rate": 9.316343473027656e-07, "loss": 1.7492, "mean_token_accuracy": 0.6585545614361763, "num_tokens": 372869886.0, "step": 1974 }, { "epoch": 2.717383820998279, "grad_norm": 0.6755785942077637, "learning_rate": 9.226813376611599e-07, "loss": 1.6265, "mean_token_accuracy": 0.677994854748249, "num_tokens": 373070141.0, "step": 1975 }, { "epoch": 2.718760757314974, "grad_norm": 0.7002668380737305, "learning_rate": 9.137705393627239e-07, "loss": 1.3436, "mean_token_accuracy": 0.7252058833837509, "num_tokens": 373242384.0, "step": 1976 }, { "epoch": 2.7201376936316697, "grad_norm": 0.7259037494659424, "learning_rate": 9.049019721238972e-07, "loss": 1.0946, "mean_token_accuracy": 0.7718681171536446, "num_tokens": 373397885.0, "step": 1977 }, { "epoch": 2.721514629948365, "grad_norm": 0.7250061631202698, "learning_rate": 8.960756555676764e-07, "loss": 0.93, "mean_token_accuracy": 0.8040464073419571, "num_tokens": 373540673.0, "step": 1978 }, { "epoch": 2.7228915662650603, "grad_norm": 0.8871768712997437, "learning_rate": 8.872916092235662e-07, "loss": 0.9549, "mean_token_accuracy": 0.7991655319929123, "num_tokens": 373665019.0, "step": 1979 }, { "epoch": 2.7242685025817557, "grad_norm": 0.5285206437110901, "learning_rate": 8.785498525275505e-07, "loss": 1.5869, "mean_token_accuracy": 0.6736835092306137, "num_tokens": 373989702.0, "step": 1980 }, { "epoch": 2.725645438898451, "grad_norm": 0.6670492887496948, "learning_rate": 8.698504048220391e-07, "loss": 1.6759, "mean_token_accuracy": 0.6688327416777611, "num_tokens": 374199536.0, "step": 1981 }, { "epoch": 2.7270223752151463, "grad_norm": 0.7040827870368958, "learning_rate": 8.611932853558236e-07, "loss": 1.3804, "mean_token_accuracy": 0.7207533791661263, "num_tokens": 374376393.0, "step": 1982 }, { "epoch": 2.7283993115318417, "grad_norm": 0.7087607383728027, "learning_rate": 8.525785132840391e-07, "loss": 1.1115, "mean_token_accuracy": 0.7684153169393539, "num_tokens": 374534799.0, "step": 1983 }, { "epoch": 2.729776247848537, "grad_norm": 0.7067899107933044, "learning_rate": 8.440061076681272e-07, "loss": 0.9654, "mean_token_accuracy": 0.7964450716972351, "num_tokens": 374680513.0, "step": 1984 }, { "epoch": 2.7311531841652323, "grad_norm": 0.7769688963890076, "learning_rate": 8.354760874757772e-07, "loss": 0.8689, "mean_token_accuracy": 0.8188998773694038, "num_tokens": 374811440.0, "step": 1985 }, { "epoch": 2.7325301204819277, "grad_norm": 0.47712579369544983, "learning_rate": 8.269884715809029e-07, "loss": 1.5318, "mean_token_accuracy": 0.693647600710392, "num_tokens": 375110320.0, "step": 1986 }, { "epoch": 2.733907056798623, "grad_norm": 0.6316239237785339, "learning_rate": 8.185432787635905e-07, "loss": 1.7219, "mean_token_accuracy": 0.662549689412117, "num_tokens": 375337257.0, "step": 1987 }, { "epoch": 2.7352839931153183, "grad_norm": 0.6898048520088196, "learning_rate": 8.101405277100549e-07, "loss": 1.4702, "mean_token_accuracy": 0.704453818500042, "num_tokens": 375521915.0, "step": 1988 }, { "epoch": 2.7366609294320137, "grad_norm": 0.7172072529792786, "learning_rate": 8.017802370126037e-07, "loss": 1.1926, "mean_token_accuracy": 0.7554534077644348, "num_tokens": 375685085.0, "step": 1989 }, { "epoch": 2.738037865748709, "grad_norm": 0.7112855911254883, "learning_rate": 7.934624251695999e-07, "loss": 0.9945, "mean_token_accuracy": 0.7914550676941872, "num_tokens": 375834283.0, "step": 1990 }, { "epoch": 2.7394148020654043, "grad_norm": 0.7585837841033936, "learning_rate": 7.851871105854125e-07, "loss": 0.8745, "mean_token_accuracy": 0.8194985762238503, "num_tokens": 375970039.0, "step": 1991 }, { "epoch": 2.7407917383820997, "grad_norm": 0.18834951519966125, "learning_rate": 7.769543115703771e-07, "loss": 1.4142, "mean_token_accuracy": 0.7314117923378944, "num_tokens": 376226934.0, "step": 1992 }, { "epoch": 2.742168674698795, "grad_norm": 0.6210713386535645, "learning_rate": 7.687640463407597e-07, "loss": 1.7846, "mean_token_accuracy": 0.6532099097967148, "num_tokens": 376467562.0, "step": 1993 }, { "epoch": 2.7435456110154908, "grad_norm": 0.6880428791046143, "learning_rate": 7.606163330187155e-07, "loss": 1.536, "mean_token_accuracy": 0.695578821003437, "num_tokens": 376658533.0, "step": 1994 }, { "epoch": 2.7449225473321857, "grad_norm": 0.7067397832870483, "learning_rate": 7.525111896322479e-07, "loss": 1.2477, "mean_token_accuracy": 0.7445135861635208, "num_tokens": 376825540.0, "step": 1995 }, { "epoch": 2.7462994836488814, "grad_norm": 0.7168175578117371, "learning_rate": 7.444486341151602e-07, "loss": 1.0163, "mean_token_accuracy": 0.7877990454435349, "num_tokens": 376977502.0, "step": 1996 }, { "epoch": 2.7476764199655763, "grad_norm": 0.7417207360267639, "learning_rate": 7.364286843070312e-07, "loss": 0.8956, "mean_token_accuracy": 0.813263475894928, "num_tokens": 377116584.0, "step": 1997 }, { "epoch": 2.749053356282272, "grad_norm": 0.25620874762535095, "learning_rate": 7.28451357953166e-07, "loss": 1.2325, "mean_token_accuracy": 0.7680191323161125, "num_tokens": 377327259.0, "step": 1998 }, { "epoch": 2.7504302925989674, "grad_norm": 0.5800641775131226, "learning_rate": 7.205166727045631e-07, "loss": 1.7512, "mean_token_accuracy": 0.6562517061829567, "num_tokens": 377598870.0, "step": 1999 }, { "epoch": 2.7518072289156628, "grad_norm": 0.6705427169799805, "learning_rate": 7.126246461178609e-07, "loss": 1.6294, "mean_token_accuracy": 0.6780501306056976, "num_tokens": 377799595.0, "step": 2000 }, { "epoch": 2.753184165232358, "grad_norm": 0.7062921524047852, "learning_rate": 7.047752956553267e-07, "loss": 1.3232, "mean_token_accuracy": 0.7314512804150581, "num_tokens": 377971370.0, "step": 2001 }, { "epoch": 2.7545611015490534, "grad_norm": 0.7042173743247986, "learning_rate": 6.969686386847852e-07, "loss": 1.0517, "mean_token_accuracy": 0.7808914184570312, "num_tokens": 378126345.0, "step": 2002 }, { "epoch": 2.7559380378657488, "grad_norm": 0.7238186597824097, "learning_rate": 6.892046924796037e-07, "loss": 0.9038, "mean_token_accuracy": 0.8083808347582817, "num_tokens": 378269017.0, "step": 2003 }, { "epoch": 2.757314974182444, "grad_norm": 0.8951687216758728, "learning_rate": 6.814834742186361e-07, "loss": 0.9619, "mean_token_accuracy": 0.7949137389659882, "num_tokens": 378393629.0, "step": 2004 }, { "epoch": 2.7586919104991394, "grad_norm": 0.527163028717041, "learning_rate": 6.738050009862052e-07, "loss": 1.5803, "mean_token_accuracy": 0.6759164035320282, "num_tokens": 378719354.0, "step": 2005 }, { "epoch": 2.7600688468158348, "grad_norm": 0.6530824303627014, "learning_rate": 6.661692897720517e-07, "loss": 1.683, "mean_token_accuracy": 0.6686783134937286, "num_tokens": 378929393.0, "step": 2006 }, { "epoch": 2.76144578313253, "grad_norm": 0.7001327276229858, "learning_rate": 6.585763574712945e-07, "loss": 1.3864, "mean_token_accuracy": 0.7192200943827629, "num_tokens": 379107085.0, "step": 2007 }, { "epoch": 2.7628227194492254, "grad_norm": 0.7146303653717041, "learning_rate": 6.510262208844031e-07, "loss": 1.1276, "mean_token_accuracy": 0.7654188275337219, "num_tokens": 379265991.0, "step": 2008 }, { "epoch": 2.7641996557659207, "grad_norm": 0.7025536298751831, "learning_rate": 6.435188967171524e-07, "loss": 0.9365, "mean_token_accuracy": 0.8014422133564949, "num_tokens": 379411979.0, "step": 2009 }, { "epoch": 2.765576592082616, "grad_norm": 0.7917940616607666, "learning_rate": 6.36054401580588e-07, "loss": 0.8797, "mean_token_accuracy": 0.8154914230108261, "num_tokens": 379543189.0, "step": 2010 }, { "epoch": 2.7669535283993114, "grad_norm": 0.46835407614707947, "learning_rate": 6.286327519909985e-07, "loss": 1.4834, "mean_token_accuracy": 0.7006205841898918, "num_tokens": 379840349.0, "step": 2011 }, { "epoch": 2.7683304647160067, "grad_norm": 0.6340617537498474, "learning_rate": 6.212539643698546e-07, "loss": 1.7299, "mean_token_accuracy": 0.6620654463768005, "num_tokens": 380064840.0, "step": 2012 }, { "epoch": 2.769707401032702, "grad_norm": 0.6916467547416687, "learning_rate": 6.139180550438006e-07, "loss": 1.4724, "mean_token_accuracy": 0.7057976499199867, "num_tokens": 380248206.0, "step": 2013 }, { "epoch": 2.7710843373493974, "grad_norm": 0.7124943137168884, "learning_rate": 6.066250402446039e-07, "loss": 1.1871, "mean_token_accuracy": 0.7551316395401955, "num_tokens": 380410661.0, "step": 2014 }, { "epoch": 2.772461273666093, "grad_norm": 0.7106195092201233, "learning_rate": 5.993749361091206e-07, "loss": 0.9948, "mean_token_accuracy": 0.7922666221857071, "num_tokens": 380559308.0, "step": 2015 }, { "epoch": 2.773838209982788, "grad_norm": 0.748979926109314, "learning_rate": 5.921677586792607e-07, "loss": 0.8774, "mean_token_accuracy": 0.816713385283947, "num_tokens": 380694895.0, "step": 2016 }, { "epoch": 2.775215146299484, "grad_norm": 0.18257112801074982, "learning_rate": 5.850035239019524e-07, "loss": 1.4047, "mean_token_accuracy": 0.7350507080554962, "num_tokens": 380950889.0, "step": 2017 }, { "epoch": 2.7765920826161787, "grad_norm": 0.604890763759613, "learning_rate": 5.77882247629109e-07, "loss": 1.7273, "mean_token_accuracy": 0.6612015590071678, "num_tokens": 381190979.0, "step": 2018 }, { "epoch": 2.7779690189328745, "grad_norm": 0.6868957281112671, "learning_rate": 5.708039456175907e-07, "loss": 1.4885, "mean_token_accuracy": 0.7005837261676788, "num_tokens": 381380743.0, "step": 2019 }, { "epoch": 2.77934595524957, "grad_norm": 0.7249525785446167, "learning_rate": 5.63768633529167e-07, "loss": 1.2689, "mean_token_accuracy": 0.7416115924715996, "num_tokens": 381547045.0, "step": 2020 }, { "epoch": 2.780722891566265, "grad_norm": 0.7100666165351868, "learning_rate": 5.567763269304927e-07, "loss": 1.0075, "mean_token_accuracy": 0.7899357378482819, "num_tokens": 381698708.0, "step": 2021 }, { "epoch": 2.7820998278829605, "grad_norm": 0.7372847199440002, "learning_rate": 5.498270412930629e-07, "loss": 0.8636, "mean_token_accuracy": 0.8203412368893623, "num_tokens": 381838229.0, "step": 2022 }, { "epoch": 2.783476764199656, "grad_norm": 0.25674867630004883, "learning_rate": 5.429207919931801e-07, "loss": 1.251, "mean_token_accuracy": 0.7685097828507423, "num_tokens": 382048341.0, "step": 2023 }, { "epoch": 2.784853700516351, "grad_norm": 0.5776864886283875, "learning_rate": 5.360575943119317e-07, "loss": 1.7317, "mean_token_accuracy": 0.657738484442234, "num_tokens": 382317478.0, "step": 2024 }, { "epoch": 2.7862306368330465, "grad_norm": 0.6731990575790405, "learning_rate": 5.292374634351371e-07, "loss": 1.6281, "mean_token_accuracy": 0.6773140653967857, "num_tokens": 382517172.0, "step": 2025 }, { "epoch": 2.787607573149742, "grad_norm": 0.702387809753418, "learning_rate": 5.224604144533274e-07, "loss": 1.3192, "mean_token_accuracy": 0.731514498591423, "num_tokens": 382688690.0, "step": 2026 }, { "epoch": 2.788984509466437, "grad_norm": 0.7188317775726318, "learning_rate": 5.157264623617119e-07, "loss": 1.097, "mean_token_accuracy": 0.7707481309771538, "num_tokens": 382843894.0, "step": 2027 }, { "epoch": 2.7903614457831325, "grad_norm": 0.7242744565010071, "learning_rate": 5.090356220601389e-07, "loss": 0.8801, "mean_token_accuracy": 0.8133730813860893, "num_tokens": 382986352.0, "step": 2028 }, { "epoch": 2.791738382099828, "grad_norm": 0.9143649339675903, "learning_rate": 5.023879083530636e-07, "loss": 0.9803, "mean_token_accuracy": 0.7934044599533081, "num_tokens": 383110888.0, "step": 2029 }, { "epoch": 2.793115318416523, "grad_norm": 0.5255395174026489, "learning_rate": 4.957833359495246e-07, "loss": 1.5638, "mean_token_accuracy": 0.6776519790291786, "num_tokens": 383437084.0, "step": 2030 }, { "epoch": 2.7944922547332185, "grad_norm": 0.6473753452301025, "learning_rate": 4.892219194630943e-07, "loss": 1.6837, "mean_token_accuracy": 0.669155664741993, "num_tokens": 383648642.0, "step": 2031 }, { "epoch": 2.795869191049914, "grad_norm": 0.6903766393661499, "learning_rate": 4.82703673411864e-07, "loss": 1.3736, "mean_token_accuracy": 0.7214004024863243, "num_tokens": 383826733.0, "step": 2032 }, { "epoch": 2.797246127366609, "grad_norm": 0.7103862166404724, "learning_rate": 4.762286122184012e-07, "loss": 1.1162, "mean_token_accuracy": 0.7667590379714966, "num_tokens": 383985773.0, "step": 2033 }, { "epoch": 2.7986230636833045, "grad_norm": 0.727574348449707, "learning_rate": 4.6979675020971895e-07, "loss": 0.9477, "mean_token_accuracy": 0.7981488108634949, "num_tokens": 384131531.0, "step": 2034 }, { "epoch": 2.8, "grad_norm": 0.7953712344169617, "learning_rate": 4.634081016172509e-07, "loss": 0.8881, "mean_token_accuracy": 0.8131375014781952, "num_tokens": 384262600.0, "step": 2035 }, { "epoch": 2.8013769363166956, "grad_norm": 0.48292794823646545, "learning_rate": 4.570626805768119e-07, "loss": 1.5251, "mean_token_accuracy": 0.6958710476756096, "num_tokens": 384554479.0, "step": 2036 }, { "epoch": 2.8027538726333905, "grad_norm": 0.6333779096603394, "learning_rate": 4.507605011285643e-07, "loss": 1.6859, "mean_token_accuracy": 0.6702176108956337, "num_tokens": 384778544.0, "step": 2037 }, { "epoch": 2.8041308089500863, "grad_norm": 0.6980260014533997, "learning_rate": 4.4450157721699803e-07, "loss": 1.4525, "mean_token_accuracy": 0.7076320871710777, "num_tokens": 384962187.0, "step": 2038 }, { "epoch": 2.805507745266781, "grad_norm": 0.7155051827430725, "learning_rate": 4.3828592269089975e-07, "loss": 1.1978, "mean_token_accuracy": 0.7551355436444283, "num_tokens": 385125082.0, "step": 2039 }, { "epoch": 2.806884681583477, "grad_norm": 0.7061944007873535, "learning_rate": 4.3211355130330147e-07, "loss": 0.9758, "mean_token_accuracy": 0.7939661964774132, "num_tokens": 385274145.0, "step": 2040 }, { "epoch": 2.8082616179001723, "grad_norm": 0.7623984217643738, "learning_rate": 4.259844767114762e-07, "loss": 0.8775, "mean_token_accuracy": 0.8165731951594353, "num_tokens": 385409940.0, "step": 2041 }, { "epoch": 2.8096385542168676, "grad_norm": 0.18243198096752167, "learning_rate": 4.198987124768938e-07, "loss": 1.401, "mean_token_accuracy": 0.7313174903392792, "num_tokens": 385672876.0, "step": 2042 }, { "epoch": 2.811015490533563, "grad_norm": 0.6185261011123657, "learning_rate": 4.1385627206519396e-07, "loss": 1.7465, "mean_token_accuracy": 0.6565422117710114, "num_tokens": 385914918.0, "step": 2043 }, { "epoch": 2.8123924268502583, "grad_norm": 0.6906319856643677, "learning_rate": 4.07857168846153e-07, "loss": 1.5067, "mean_token_accuracy": 0.7001138627529144, "num_tokens": 386105372.0, "step": 2044 }, { "epoch": 2.8137693631669536, "grad_norm": 0.69684898853302, "learning_rate": 4.0190141609365963e-07, "loss": 1.2509, "mean_token_accuracy": 0.7429342493414879, "num_tokens": 386272445.0, "step": 2045 }, { "epoch": 2.815146299483649, "grad_norm": 0.7148314714431763, "learning_rate": 3.9598902698568367e-07, "loss": 1.0383, "mean_token_accuracy": 0.7838262766599655, "num_tokens": 386424801.0, "step": 2046 }, { "epoch": 2.8165232358003442, "grad_norm": 0.7152018547058105, "learning_rate": 3.901200146042472e-07, "loss": 0.8523, "mean_token_accuracy": 0.8209054172039032, "num_tokens": 386564716.0, "step": 2047 }, { "epoch": 2.8179001721170396, "grad_norm": 0.2631148099899292, "learning_rate": 3.842943919353914e-07, "loss": 1.2211, "mean_token_accuracy": 0.7734421938657761, "num_tokens": 386774190.0, "step": 2048 }, { "epoch": 2.819277108433735, "grad_norm": 0.576949417591095, "learning_rate": 3.7851217186915645e-07, "loss": 1.731, "mean_token_accuracy": 0.6586603671312332, "num_tokens": 387045187.0, "step": 2049 }, { "epoch": 2.8206540447504302, "grad_norm": 0.6692036986351013, "learning_rate": 3.7277336719954593e-07, "loss": 1.6121, "mean_token_accuracy": 0.6806850284337997, "num_tokens": 387246028.0, "step": 2050 }, { "epoch": 2.8220309810671256, "grad_norm": 0.710793673992157, "learning_rate": 3.670779906244981e-07, "loss": 1.3208, "mean_token_accuracy": 0.7288904339075089, "num_tokens": 387419076.0, "step": 2051 }, { "epoch": 2.823407917383821, "grad_norm": 0.7294498085975647, "learning_rate": 3.614260547458659e-07, "loss": 1.1165, "mean_token_accuracy": 0.7679166793823242, "num_tokens": 387575289.0, "step": 2052 }, { "epoch": 2.8247848537005162, "grad_norm": 0.730133056640625, "learning_rate": 3.558175720693768e-07, "loss": 0.9204, "mean_token_accuracy": 0.8080978095531464, "num_tokens": 387718488.0, "step": 2053 }, { "epoch": 2.8261617900172116, "grad_norm": 0.8876115083694458, "learning_rate": 3.5025255500461544e-07, "loss": 0.9453, "mean_token_accuracy": 0.7970060408115387, "num_tokens": 387843505.0, "step": 2054 }, { "epoch": 2.827538726333907, "grad_norm": 0.5224396586418152, "learning_rate": 3.447310158649897e-07, "loss": 1.5748, "mean_token_accuracy": 0.6754656881093979, "num_tokens": 388167951.0, "step": 2055 }, { "epoch": 2.8289156626506022, "grad_norm": 0.6646923422813416, "learning_rate": 3.392529668677114e-07, "loss": 1.6989, "mean_token_accuracy": 0.6636364683508873, "num_tokens": 388379401.0, "step": 2056 }, { "epoch": 2.830292598967298, "grad_norm": 0.7032209038734436, "learning_rate": 3.33818420133758e-07, "loss": 1.3773, "mean_token_accuracy": 0.7206544727087021, "num_tokens": 388556657.0, "step": 2057 }, { "epoch": 2.831669535283993, "grad_norm": 0.716389000415802, "learning_rate": 3.2842738768785744e-07, "loss": 1.1402, "mean_token_accuracy": 0.7603052631020546, "num_tokens": 388715190.0, "step": 2058 }, { "epoch": 2.8330464716006887, "grad_norm": 0.7018716335296631, "learning_rate": 3.230798814584502e-07, "loss": 0.9209, "mean_token_accuracy": 0.8057087734341621, "num_tokens": 388860747.0, "step": 2059 }, { "epoch": 2.8344234079173836, "grad_norm": 0.7905487418174744, "learning_rate": 3.177759132776781e-07, "loss": 0.8488, "mean_token_accuracy": 0.8227215856313705, "num_tokens": 388991561.0, "step": 2060 }, { "epoch": 2.8358003442340793, "grad_norm": 0.4678185284137726, "learning_rate": 3.1251549488133805e-07, "loss": 1.4903, "mean_token_accuracy": 0.6983000561594963, "num_tokens": 389287831.0, "step": 2061 }, { "epoch": 2.8371772805507747, "grad_norm": 0.631415843963623, "learning_rate": 3.0729863790886824e-07, "loss": 1.6891, "mean_token_accuracy": 0.6677605882287025, "num_tokens": 389512440.0, "step": 2062 }, { "epoch": 2.83855421686747, "grad_norm": 0.6915248036384583, "learning_rate": 3.021253539033309e-07, "loss": 1.4776, "mean_token_accuracy": 0.7001409754157066, "num_tokens": 389696475.0, "step": 2063 }, { "epoch": 2.8399311531841653, "grad_norm": 0.7181739807128906, "learning_rate": 2.969956543113628e-07, "loss": 1.1847, "mean_token_accuracy": 0.7562448009848595, "num_tokens": 389859401.0, "step": 2064 }, { "epoch": 2.8413080895008607, "grad_norm": 0.7274524569511414, "learning_rate": 2.919095504831737e-07, "loss": 0.9672, "mean_token_accuracy": 0.7959790453314781, "num_tokens": 390008257.0, "step": 2065 }, { "epoch": 2.842685025817556, "grad_norm": 0.7580775022506714, "learning_rate": 2.8686705367250824e-07, "loss": 0.8801, "mean_token_accuracy": 0.8198358714580536, "num_tokens": 390143726.0, "step": 2066 }, { "epoch": 2.8440619621342513, "grad_norm": 0.18676821887493134, "learning_rate": 2.818681750366259e-07, "loss": 1.3593, "mean_token_accuracy": 0.7418858036398888, "num_tokens": 390399732.0, "step": 2067 }, { "epoch": 2.8454388984509467, "grad_norm": 0.6136084198951721, "learning_rate": 2.7691292563627016e-07, "loss": 1.7636, "mean_token_accuracy": 0.6582169011235237, "num_tokens": 390642051.0, "step": 2068 }, { "epoch": 2.846815834767642, "grad_norm": 0.6696892380714417, "learning_rate": 2.7200131643565055e-07, "loss": 1.5006, "mean_token_accuracy": 0.6993548944592476, "num_tokens": 390833962.0, "step": 2069 }, { "epoch": 2.8481927710843373, "grad_norm": 0.7174932956695557, "learning_rate": 2.671333583024205e-07, "loss": 1.2604, "mean_token_accuracy": 0.742204524576664, "num_tokens": 391001457.0, "step": 2070 }, { "epoch": 2.8495697074010327, "grad_norm": 0.7044008374214172, "learning_rate": 2.6230906200764406e-07, "loss": 1.0255, "mean_token_accuracy": 0.7864366620779037, "num_tokens": 391153448.0, "step": 2071 }, { "epoch": 2.850946643717728, "grad_norm": 0.741597056388855, "learning_rate": 2.575284382257781e-07, "loss": 0.9041, "mean_token_accuracy": 0.8122463747859001, "num_tokens": 391292989.0, "step": 2072 }, { "epoch": 2.8523235800344233, "grad_norm": 0.26605549454689026, "learning_rate": 2.5279149753464794e-07, "loss": 1.2487, "mean_token_accuracy": 0.7673855945467949, "num_tokens": 391498725.0, "step": 2073 }, { "epoch": 2.8537005163511187, "grad_norm": 0.5885834693908691, "learning_rate": 2.48098250415425e-07, "loss": 1.7857, "mean_token_accuracy": 0.6504311189055443, "num_tokens": 391767055.0, "step": 2074 }, { "epoch": 2.855077452667814, "grad_norm": 0.6754207015037537, "learning_rate": 2.4344870725260264e-07, "loss": 1.6207, "mean_token_accuracy": 0.6794070079922676, "num_tokens": 391967367.0, "step": 2075 }, { "epoch": 2.8564543889845093, "grad_norm": 0.6972692608833313, "learning_rate": 2.3884287833396915e-07, "loss": 1.3142, "mean_token_accuracy": 0.7345927134156227, "num_tokens": 392139230.0, "step": 2076 }, { "epoch": 2.8578313253012047, "grad_norm": 0.7115241289138794, "learning_rate": 2.3428077385059255e-07, "loss": 1.0685, "mean_token_accuracy": 0.7781840562820435, "num_tokens": 392294009.0, "step": 2077 }, { "epoch": 2.8592082616179004, "grad_norm": 0.7251147627830505, "learning_rate": 2.297624038967916e-07, "loss": 0.9112, "mean_token_accuracy": 0.8063132762908936, "num_tokens": 392436510.0, "step": 2078 }, { "epoch": 2.8605851979345953, "grad_norm": 0.9277305006980896, "learning_rate": 2.2528777847011796e-07, "loss": 0.9704, "mean_token_accuracy": 0.7977848947048187, "num_tokens": 392561130.0, "step": 2079 }, { "epoch": 2.861962134251291, "grad_norm": 0.5205933451652527, "learning_rate": 2.208569074713318e-07, "loss": 1.5772, "mean_token_accuracy": 0.674488291144371, "num_tokens": 392890507.0, "step": 2080 }, { "epoch": 2.863339070567986, "grad_norm": 0.6665334701538086, "learning_rate": 2.1646980070437973e-07, "loss": 1.6794, "mean_token_accuracy": 0.6681771725416183, "num_tokens": 393100698.0, "step": 2081 }, { "epoch": 2.8647160068846818, "grad_norm": 0.6979389190673828, "learning_rate": 2.121264678763746e-07, "loss": 1.3769, "mean_token_accuracy": 0.7205251604318619, "num_tokens": 393277761.0, "step": 2082 }, { "epoch": 2.866092943201377, "grad_norm": 0.7080696225166321, "learning_rate": 2.0782691859756898e-07, "loss": 1.1252, "mean_token_accuracy": 0.7666004449129105, "num_tokens": 393436482.0, "step": 2083 }, { "epoch": 2.8674698795180724, "grad_norm": 0.7295445799827576, "learning_rate": 2.0357116238134633e-07, "loss": 0.9507, "mean_token_accuracy": 0.7992531880736351, "num_tokens": 393582375.0, "step": 2084 }, { "epoch": 2.8688468158347677, "grad_norm": 0.7953742742538452, "learning_rate": 1.993592086441809e-07, "loss": 0.8656, "mean_token_accuracy": 0.8196282312273979, "num_tokens": 393713429.0, "step": 2085 }, { "epoch": 2.870223752151463, "grad_norm": 0.47408002614974976, "learning_rate": 1.951910667056378e-07, "loss": 1.5444, "mean_token_accuracy": 0.6958937495946884, "num_tokens": 394004037.0, "step": 2086 }, { "epoch": 2.8716006884681584, "grad_norm": 0.6170153021812439, "learning_rate": 1.9106674578833306e-07, "loss": 1.7136, "mean_token_accuracy": 0.6644507274031639, "num_tokens": 394229435.0, "step": 2087 }, { "epoch": 2.8729776247848537, "grad_norm": 0.6923982501029968, "learning_rate": 1.869862550179291e-07, "loss": 1.4546, "mean_token_accuracy": 0.7085531577467918, "num_tokens": 394413871.0, "step": 2088 }, { "epoch": 2.874354561101549, "grad_norm": 0.7200195789337158, "learning_rate": 1.829496034231082e-07, "loss": 1.2104, "mean_token_accuracy": 0.7505840808153152, "num_tokens": 394576892.0, "step": 2089 }, { "epoch": 2.8757314974182444, "grad_norm": 0.7030972242355347, "learning_rate": 1.789567999355457e-07, "loss": 0.9668, "mean_token_accuracy": 0.7959944605827332, "num_tokens": 394725711.0, "step": 2090 }, { "epoch": 2.8771084337349397, "grad_norm": 0.764281690120697, "learning_rate": 1.750078533898991e-07, "loss": 0.8762, "mean_token_accuracy": 0.8165703490376472, "num_tokens": 394861202.0, "step": 2091 }, { "epoch": 2.878485370051635, "grad_norm": 0.18511918187141418, "learning_rate": 1.7110277252379238e-07, "loss": 1.4112, "mean_token_accuracy": 0.7330933809280396, "num_tokens": 395118041.0, "step": 2092 }, { "epoch": 2.8798623063683304, "grad_norm": 0.6107594966888428, "learning_rate": 1.6724156597778042e-07, "loss": 1.7542, "mean_token_accuracy": 0.6581893712282181, "num_tokens": 395361575.0, "step": 2093 }, { "epoch": 2.8812392426850257, "grad_norm": 0.6893260478973389, "learning_rate": 1.6342424229534691e-07, "loss": 1.5455, "mean_token_accuracy": 0.6930405497550964, "num_tokens": 395553493.0, "step": 2094 }, { "epoch": 2.882616179001721, "grad_norm": 0.7080364227294922, "learning_rate": 1.5965080992287329e-07, "loss": 1.2777, "mean_token_accuracy": 0.7358921319246292, "num_tokens": 395721530.0, "step": 2095 }, { "epoch": 2.8839931153184164, "grad_norm": 0.7135997414588928, "learning_rate": 1.559212772096319e-07, "loss": 1.0323, "mean_token_accuracy": 0.7862193286418915, "num_tokens": 395874245.0, "step": 2096 }, { "epoch": 2.8853700516351117, "grad_norm": 0.74049973487854, "learning_rate": 1.5223565240775062e-07, "loss": 0.8925, "mean_token_accuracy": 0.8111562281847, "num_tokens": 396013982.0, "step": 2097 }, { "epoch": 2.886746987951807, "grad_norm": 0.2568226754665375, "learning_rate": 1.4859394367221725e-07, "loss": 1.2362, "mean_token_accuracy": 0.775275357067585, "num_tokens": 396223267.0, "step": 2098 }, { "epoch": 2.888123924268503, "grad_norm": 0.5761784911155701, "learning_rate": 1.449961590608373e-07, "loss": 1.7075, "mean_token_accuracy": 0.6661471128463745, "num_tokens": 396494114.0, "step": 2099 }, { "epoch": 2.8895008605851977, "grad_norm": 0.6677254438400269, "learning_rate": 1.4144230653423408e-07, "loss": 1.608, "mean_token_accuracy": 0.6805535554885864, "num_tokens": 396694752.0, "step": 2100 }, { "epoch": 2.8908777969018935, "grad_norm": 0.7117964625358582, "learning_rate": 1.3793239395582413e-07, "loss": 1.3099, "mean_token_accuracy": 0.7350323051214218, "num_tokens": 396866802.0, "step": 2101 }, { "epoch": 2.8922547332185884, "grad_norm": 0.7203536033630371, "learning_rate": 1.3446642909180186e-07, "loss": 1.0634, "mean_token_accuracy": 0.7782750502228737, "num_tokens": 397022105.0, "step": 2102 }, { "epoch": 2.893631669535284, "grad_norm": 0.7185733914375305, "learning_rate": 1.310444196111127e-07, "loss": 0.9141, "mean_token_accuracy": 0.8097512125968933, "num_tokens": 397165119.0, "step": 2103 }, { "epoch": 2.8950086058519795, "grad_norm": 0.8703638911247253, "learning_rate": 1.276663730854555e-07, "loss": 0.9424, "mean_token_accuracy": 0.80014668405056, "num_tokens": 397290267.0, "step": 2104 }, { "epoch": 2.896385542168675, "grad_norm": 0.5179038047790527, "learning_rate": 1.2433229698924686e-07, "loss": 1.5819, "mean_token_accuracy": 0.6740756705403328, "num_tokens": 397619295.0, "step": 2105 }, { "epoch": 2.89776247848537, "grad_norm": 0.648863673210144, "learning_rate": 1.2104219869961685e-07, "loss": 1.6592, "mean_token_accuracy": 0.672499805688858, "num_tokens": 397829558.0, "step": 2106 }, { "epoch": 2.8991394148020655, "grad_norm": 0.7128008008003235, "learning_rate": 1.1779608549638666e-07, "loss": 1.4161, "mean_token_accuracy": 0.7167481482028961, "num_tokens": 398007426.0, "step": 2107 }, { "epoch": 2.900516351118761, "grad_norm": 0.7043711543083191, "learning_rate": 1.1459396456205307e-07, "loss": 1.1246, "mean_token_accuracy": 0.7669165134429932, "num_tokens": 398166452.0, "step": 2108 }, { "epoch": 2.901893287435456, "grad_norm": 0.7060954570770264, "learning_rate": 1.1143584298177523e-07, "loss": 0.9348, "mean_token_accuracy": 0.8039215430617332, "num_tokens": 398312216.0, "step": 2109 }, { "epoch": 2.9032702237521515, "grad_norm": 0.7781714797019958, "learning_rate": 1.0832172774335902e-07, "loss": 0.8825, "mean_token_accuracy": 0.8144431486725807, "num_tokens": 398443154.0, "step": 2110 }, { "epoch": 2.904647160068847, "grad_norm": 0.4726144075393677, "learning_rate": 1.0525162573723269e-07, "loss": 1.5018, "mean_token_accuracy": 0.6987911909818649, "num_tokens": 398736483.0, "step": 2111 }, { "epoch": 2.906024096385542, "grad_norm": 0.6363417506217957, "learning_rate": 1.0222554375644677e-07, "loss": 1.6684, "mean_token_accuracy": 0.6701172888278961, "num_tokens": 398961883.0, "step": 2112 }, { "epoch": 2.9074010327022375, "grad_norm": 0.7019259929656982, "learning_rate": 9.924348849664534e-08, "loss": 1.4827, "mean_token_accuracy": 0.7016695439815521, "num_tokens": 399146211.0, "step": 2113 }, { "epoch": 2.908777969018933, "grad_norm": 0.711423933506012, "learning_rate": 9.630546655606365e-08, "loss": 1.2088, "mean_token_accuracy": 0.7518921121954918, "num_tokens": 399309160.0, "step": 2114 }, { "epoch": 2.910154905335628, "grad_norm": 0.7196462154388428, "learning_rate": 9.34114844354972e-08, "loss": 0.9892, "mean_token_accuracy": 0.7932283952832222, "num_tokens": 399458480.0, "step": 2115 }, { "epoch": 2.9115318416523235, "grad_norm": 0.7411069273948669, "learning_rate": 9.056154853830823e-08, "loss": 0.84, "mean_token_accuracy": 0.82362100481987, "num_tokens": 399594787.0, "step": 2116 }, { "epoch": 2.912908777969019, "grad_norm": 0.1903555542230606, "learning_rate": 8.775566517039036e-08, "loss": 1.4007, "mean_token_accuracy": 0.7349238842725754, "num_tokens": 399847892.0, "step": 2117 }, { "epoch": 2.914285714285714, "grad_norm": 0.6101546287536621, "learning_rate": 8.499384054017067e-08, "loss": 1.7504, "mean_token_accuracy": 0.6605448052287102, "num_tokens": 400090551.0, "step": 2118 }, { "epoch": 2.9156626506024095, "grad_norm": 0.6759142279624939, "learning_rate": 8.227608075858984e-08, "loss": 1.5037, "mean_token_accuracy": 0.6966723501682281, "num_tokens": 400282503.0, "step": 2119 }, { "epoch": 2.9170395869191053, "grad_norm": 0.6956368684768677, "learning_rate": 7.960239183908202e-08, "loss": 1.2552, "mean_token_accuracy": 0.7457203194499016, "num_tokens": 400450156.0, "step": 2120 }, { "epoch": 2.9184165232358, "grad_norm": 0.7090383172035217, "learning_rate": 7.697277969757722e-08, "loss": 1.0275, "mean_token_accuracy": 0.7831647098064423, "num_tokens": 400602784.0, "step": 2121 }, { "epoch": 2.919793459552496, "grad_norm": 0.7178573608398438, "learning_rate": 7.438725015247228e-08, "loss": 0.8726, "mean_token_accuracy": 0.8171307370066643, "num_tokens": 400742422.0, "step": 2122 }, { "epoch": 2.921170395869191, "grad_norm": 0.25572454929351807, "learning_rate": 7.184580892462877e-08, "loss": 1.1961, "mean_token_accuracy": 0.7771441116929054, "num_tokens": 400951774.0, "step": 2123 }, { "epoch": 2.9225473321858866, "grad_norm": 0.5783873796463013, "learning_rate": 6.934846163735298e-08, "loss": 1.7243, "mean_token_accuracy": 0.6570136323571205, "num_tokens": 401218134.0, "step": 2124 }, { "epoch": 2.923924268502582, "grad_norm": 0.6774641871452332, "learning_rate": 6.689521381639363e-08, "loss": 1.5935, "mean_token_accuracy": 0.6832430437207222, "num_tokens": 401416264.0, "step": 2125 }, { "epoch": 2.9253012048192772, "grad_norm": 0.7055312991142273, "learning_rate": 6.448607088991532e-08, "loss": 1.3006, "mean_token_accuracy": 0.7346298769116402, "num_tokens": 401587687.0, "step": 2126 }, { "epoch": 2.9266781411359726, "grad_norm": 0.7250730395317078, "learning_rate": 6.212103818849625e-08, "loss": 1.0892, "mean_token_accuracy": 0.7752754911780357, "num_tokens": 401743035.0, "step": 2127 }, { "epoch": 2.928055077452668, "grad_norm": 0.7232088446617126, "learning_rate": 5.980012094511933e-08, "loss": 0.8976, "mean_token_accuracy": 0.8123901411890984, "num_tokens": 401885709.0, "step": 2128 }, { "epoch": 2.9294320137693632, "grad_norm": 0.9116280674934387, "learning_rate": 5.7523324295147845e-08, "loss": 1.0257, "mean_token_accuracy": 0.7853526622056961, "num_tokens": 402010015.0, "step": 2129 }, { "epoch": 2.9308089500860586, "grad_norm": 0.5092731714248657, "learning_rate": 5.529065327632532e-08, "loss": 1.5657, "mean_token_accuracy": 0.6725831925868988, "num_tokens": 402334630.0, "step": 2130 }, { "epoch": 2.932185886402754, "grad_norm": 0.6588203310966492, "learning_rate": 5.310211282876676e-08, "loss": 1.6495, "mean_token_accuracy": 0.6725225523114204, "num_tokens": 402544989.0, "step": 2131 }, { "epoch": 2.9335628227194492, "grad_norm": 0.6907715201377869, "learning_rate": 5.09577077949297e-08, "loss": 1.3579, "mean_token_accuracy": 0.7222354784607887, "num_tokens": 402722080.0, "step": 2132 }, { "epoch": 2.9349397590361446, "grad_norm": 0.7279479503631592, "learning_rate": 4.885744291962535e-08, "loss": 1.1247, "mean_token_accuracy": 0.7637469619512558, "num_tokens": 402880357.0, "step": 2133 }, { "epoch": 2.93631669535284, "grad_norm": 0.7187070250511169, "learning_rate": 4.680132284999639e-08, "loss": 0.9383, "mean_token_accuracy": 0.8006767258048058, "num_tokens": 403025902.0, "step": 2134 }, { "epoch": 2.9376936316695352, "grad_norm": 0.8044910430908203, "learning_rate": 4.4789352135505834e-08, "loss": 0.858, "mean_token_accuracy": 0.8207897767424583, "num_tokens": 403157113.0, "step": 2135 }, { "epoch": 2.9390705679862306, "grad_norm": 0.4767455756664276, "learning_rate": 4.2821535227930424e-08, "loss": 1.5595, "mean_token_accuracy": 0.6969205662608147, "num_tokens": 403450552.0, "step": 2136 }, { "epoch": 2.940447504302926, "grad_norm": 0.6325563788414001, "learning_rate": 4.08978764813539e-08, "loss": 1.7186, "mean_token_accuracy": 0.6640539616346359, "num_tokens": 403676046.0, "step": 2137 }, { "epoch": 2.9418244406196212, "grad_norm": 0.6993392705917358, "learning_rate": 3.9018380152149313e-08, "loss": 1.4545, "mean_token_accuracy": 0.7075598686933517, "num_tokens": 403859628.0, "step": 2138 }, { "epoch": 2.9432013769363166, "grad_norm": 0.7151709198951721, "learning_rate": 3.718305039897896e-08, "loss": 1.1897, "mean_token_accuracy": 0.7547161132097244, "num_tokens": 404021949.0, "step": 2139 }, { "epoch": 2.944578313253012, "grad_norm": 0.7093547582626343, "learning_rate": 3.539189128277221e-08, "loss": 0.9825, "mean_token_accuracy": 0.7946733832359314, "num_tokens": 404170674.0, "step": 2140 }, { "epoch": 2.9459552495697077, "grad_norm": 0.7327467799186707, "learning_rate": 3.3644906766734374e-08, "loss": 0.8643, "mean_token_accuracy": 0.8170072808861732, "num_tokens": 404306164.0, "step": 2141 }, { "epoch": 2.9473321858864026, "grad_norm": 0.19308508932590485, "learning_rate": 3.194210071631787e-08, "loss": 1.433, "mean_token_accuracy": 0.7293608337640762, "num_tokens": 404559064.0, "step": 2142 }, { "epoch": 2.9487091222030983, "grad_norm": 0.6087179183959961, "learning_rate": 3.028347689923328e-08, "loss": 1.7702, "mean_token_accuracy": 0.6541821956634521, "num_tokens": 404800075.0, "step": 2143 }, { "epoch": 2.950086058519793, "grad_norm": 0.6832478642463684, "learning_rate": 2.8669038985422724e-08, "loss": 1.5077, "mean_token_accuracy": 0.6988482028245926, "num_tokens": 404991097.0, "step": 2144 }, { "epoch": 2.951462994836489, "grad_norm": 0.7122058272361755, "learning_rate": 2.7098790547068765e-08, "loss": 1.2619, "mean_token_accuracy": 0.7439946606755257, "num_tokens": 405158078.0, "step": 2145 }, { "epoch": 2.9528399311531843, "grad_norm": 0.7064540982246399, "learning_rate": 2.557273505856994e-08, "loss": 1.0137, "mean_token_accuracy": 0.7863739505410194, "num_tokens": 405310131.0, "step": 2146 }, { "epoch": 2.9542168674698797, "grad_norm": 0.7294560670852661, "learning_rate": 2.4090875896551903e-08, "loss": 0.8721, "mean_token_accuracy": 0.8169639483094215, "num_tokens": 405449473.0, "step": 2147 }, { "epoch": 2.955593803786575, "grad_norm": 0.2543903887271881, "learning_rate": 2.2653216339840746e-08, "loss": 1.2736, "mean_token_accuracy": 0.7692638486623764, "num_tokens": 405659655.0, "step": 2148 }, { "epoch": 2.9569707401032703, "grad_norm": 0.567044734954834, "learning_rate": 2.1259759569467463e-08, "loss": 1.7024, "mean_token_accuracy": 0.6634880006313324, "num_tokens": 405928620.0, "step": 2149 }, { "epoch": 2.9583476764199657, "grad_norm": 0.6734067797660828, "learning_rate": 1.991050866865685e-08, "loss": 1.5656, "mean_token_accuracy": 0.6879629343748093, "num_tokens": 406128648.0, "step": 2150 }, { "epoch": 2.959724612736661, "grad_norm": 0.7059922218322754, "learning_rate": 1.860546662282525e-08, "loss": 1.3299, "mean_token_accuracy": 0.7286324128508568, "num_tokens": 406300402.0, "step": 2151 }, { "epoch": 2.9611015490533563, "grad_norm": 0.7123343348503113, "learning_rate": 1.7344636319565067e-08, "loss": 1.1033, "mean_token_accuracy": 0.7701897472143173, "num_tokens": 406455899.0, "step": 2152 }, { "epoch": 2.9624784853700517, "grad_norm": 0.7277344465255737, "learning_rate": 1.6128020548649148e-08, "loss": 0.9402, "mean_token_accuracy": 0.8056737631559372, "num_tokens": 406598814.0, "step": 2153 }, { "epoch": 2.963855421686747, "grad_norm": 0.9169206023216248, "learning_rate": 1.49556220020175e-08, "loss": 0.9836, "mean_token_accuracy": 0.7928073853254318, "num_tokens": 406723564.0, "step": 2154 }, { "epoch": 2.9652323580034423, "grad_norm": 0.5129714608192444, "learning_rate": 1.3827443273768393e-08, "loss": 1.5557, "mean_token_accuracy": 0.6791412979364395, "num_tokens": 407047110.0, "step": 2155 }, { "epoch": 2.9666092943201376, "grad_norm": 0.6601217985153198, "learning_rate": 1.2743486860165022e-08, "loss": 1.6581, "mean_token_accuracy": 0.6738254800438881, "num_tokens": 407256266.0, "step": 2156 }, { "epoch": 2.967986230636833, "grad_norm": 0.7051712274551392, "learning_rate": 1.1703755159619967e-08, "loss": 1.3644, "mean_token_accuracy": 0.7229912132024765, "num_tokens": 407432852.0, "step": 2157 }, { "epoch": 2.9693631669535283, "grad_norm": 0.707111656665802, "learning_rate": 1.070825047268631e-08, "loss": 1.096, "mean_token_accuracy": 0.7736361920833588, "num_tokens": 407591142.0, "step": 2158 }, { "epoch": 2.9707401032702236, "grad_norm": 0.7054973244667053, "learning_rate": 9.756975002066515e-09, "loss": 0.9467, "mean_token_accuracy": 0.8017539381980896, "num_tokens": 407736542.0, "step": 2159 }, { "epoch": 2.972117039586919, "grad_norm": 0.7890355587005615, "learning_rate": 8.849930852596889e-09, "loss": 0.8645, "mean_token_accuracy": 0.8170957714319229, "num_tokens": 407867307.0, "step": 2160 }, { "epoch": 2.9734939759036143, "grad_norm": 0.47276580333709717, "learning_rate": 7.987120031238693e-09, "loss": 1.5435, "mean_token_accuracy": 0.6926775425672531, "num_tokens": 408164651.0, "step": 2161 }, { "epoch": 2.97487091222031, "grad_norm": 0.6224261522293091, "learning_rate": 7.168544447093695e-09, "loss": 1.697, "mean_token_accuracy": 0.6674415767192841, "num_tokens": 408391761.0, "step": 2162 }, { "epoch": 2.976247848537005, "grad_norm": 0.692267894744873, "learning_rate": 6.3942059113730745e-09, "loss": 1.451, "mean_token_accuracy": 0.7079981565475464, "num_tokens": 408576291.0, "step": 2163 }, { "epoch": 2.9776247848537007, "grad_norm": 0.7106713652610779, "learning_rate": 5.664106137419634e-09, "loss": 1.1985, "mean_token_accuracy": 0.7536637261509895, "num_tokens": 408739246.0, "step": 2164 }, { "epoch": 2.9790017211703956, "grad_norm": 0.721775472164154, "learning_rate": 4.978246740678927e-09, "loss": 0.988, "mean_token_accuracy": 0.7895259633660316, "num_tokens": 408888414.0, "step": 2165 }, { "epoch": 2.9803786574870914, "grad_norm": 0.7528623938560486, "learning_rate": 4.336629238719248e-09, "loss": 0.8805, "mean_token_accuracy": 0.8140000030398369, "num_tokens": 409024177.0, "step": 2166 }, { "epoch": 2.9817555938037867, "grad_norm": 0.1834622621536255, "learning_rate": 3.739255051211643e-09, "loss": 1.42, "mean_token_accuracy": 0.7306899800896645, "num_tokens": 409283101.0, "step": 2167 }, { "epoch": 2.983132530120482, "grad_norm": 0.6118700504302979, "learning_rate": 3.186125499934356e-09, "loss": 1.755, "mean_token_accuracy": 0.6568222567439079, "num_tokens": 409523992.0, "step": 2168 }, { "epoch": 2.9845094664371774, "grad_norm": 0.6854743957519531, "learning_rate": 2.6772418087639417e-09, "loss": 1.529, "mean_token_accuracy": 0.695137269794941, "num_tokens": 409713924.0, "step": 2169 }, { "epoch": 2.9858864027538727, "grad_norm": 0.6959836483001709, "learning_rate": 2.2126051036819306e-09, "loss": 1.2254, "mean_token_accuracy": 0.7485772669315338, "num_tokens": 409879962.0, "step": 2170 }, { "epoch": 2.987263339070568, "grad_norm": 0.7159447073936462, "learning_rate": 1.7922164127659457e-09, "loss": 1.0091, "mean_token_accuracy": 0.7854340523481369, "num_tokens": 410031674.0, "step": 2171 }, { "epoch": 2.9886402753872634, "grad_norm": 0.7227118611335754, "learning_rate": 1.4160766661830416e-09, "loss": 0.8706, "mean_token_accuracy": 0.8158926144242287, "num_tokens": 410170922.0, "step": 2172 }, { "epoch": 2.9900172117039587, "grad_norm": 0.26748570799827576, "learning_rate": 1.0841866962008063e-09, "loss": 1.252, "mean_token_accuracy": 0.768527016043663, "num_tokens": 410376329.0, "step": 2173 }, { "epoch": 2.991394148020654, "grad_norm": 0.5846934914588928, "learning_rate": 7.965472371718186e-10, "loss": 1.7821, "mean_token_accuracy": 0.6527567431330681, "num_tokens": 410645193.0, "step": 2174 }, { "epoch": 2.9927710843373494, "grad_norm": 0.6728100180625916, "learning_rate": 5.531589255403092e-10, "loss": 1.5898, "mean_token_accuracy": 0.6842018663883209, "num_tokens": 410843336.0, "step": 2175 }, { "epoch": 2.9941480206540447, "grad_norm": 0.7064998149871826, "learning_rate": 3.540222998399401e-10, "loss": 1.3073, "mean_token_accuracy": 0.7327628210186958, "num_tokens": 411013835.0, "step": 2176 }, { "epoch": 2.99552495697074, "grad_norm": 0.7168886661529541, "learning_rate": 1.9913780068936407e-10, "loss": 1.0694, "mean_token_accuracy": 0.7764119878411293, "num_tokens": 411167962.0, "step": 2177 }, { "epoch": 2.9969018932874354, "grad_norm": 0.7224218249320984, "learning_rate": 8.850577079222434e-11, "loss": 0.9069, "mean_token_accuracy": 0.8093838170170784, "num_tokens": 411310219.0, "step": 2178 } ], "logging_steps": 1, "max_steps": 2178, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8705548266270884e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }