{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.604010207801677, "eval_steps": 500, "global_step": 5500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002916514764855997, "grad_norm": 16.414438247680664, "learning_rate": 0.0, "loss": 7.9527, "step": 1 }, { "epoch": 0.0005833029529711994, "grad_norm": 19.759525299072266, "learning_rate": 4e-05, "loss": 8.3494, "step": 2 }, { "epoch": 0.0008749544294567992, "grad_norm": 20.793371200561523, "learning_rate": 8e-05, "loss": 8.188, "step": 3 }, { "epoch": 0.0011666059059423988, "grad_norm": 154.26425170898438, "learning_rate": 0.00012, "loss": 7.3825, "step": 4 }, { "epoch": 0.0014582573824279985, "grad_norm": 141.8572540283203, "learning_rate": 0.00016, "loss": 6.7963, "step": 5 }, { "epoch": 0.0017499088589135983, "grad_norm": 18.762121200561523, "learning_rate": 0.0002, "loss": 6.5293, "step": 6 }, { "epoch": 0.002041560335399198, "grad_norm": 28.779830932617188, "learning_rate": 0.00019998054853141413, "loss": 6.2418, "step": 7 }, { "epoch": 0.0023332118118847976, "grad_norm": 27.985748291015625, "learning_rate": 0.00019996109706282825, "loss": 6.7501, "step": 8 }, { "epoch": 0.0026248632883703973, "grad_norm": 12.921512603759766, "learning_rate": 0.00019994164559424237, "loss": 6.0619, "step": 9 }, { "epoch": 0.002916514764855997, "grad_norm": 8.12353801727295, "learning_rate": 0.0001999221941256565, "loss": 5.6675, "step": 10 }, { "epoch": 0.0032081662413415966, "grad_norm": 7.871522903442383, "learning_rate": 0.00019990274265707062, "loss": 5.9127, "step": 11 }, { "epoch": 0.0034998177178271967, "grad_norm": 5.419699668884277, "learning_rate": 0.00019988329118848474, "loss": 5.5482, "step": 12 }, { "epoch": 0.0037914691943127963, "grad_norm": 8.595728874206543, "learning_rate": 0.00019986383971989886, "loss": 5.9463, "step": 13 }, { "epoch": 0.004083120670798396, "grad_norm": 3.71962571144104, "learning_rate": 0.00019984438825131298, "loss": 5.6955, "step": 14 }, { "epoch": 0.004374772147283996, "grad_norm": 5.772061824798584, "learning_rate": 0.0001998249367827271, "loss": 5.6603, "step": 15 }, { "epoch": 0.004666423623769595, "grad_norm": 5.029808521270752, "learning_rate": 0.00019980548531414123, "loss": 5.9216, "step": 16 }, { "epoch": 0.004958075100255195, "grad_norm": 5.2702717781066895, "learning_rate": 0.00019978603384555535, "loss": 5.7627, "step": 17 }, { "epoch": 0.005249726576740795, "grad_norm": 4.539237022399902, "learning_rate": 0.00019976658237696947, "loss": 5.355, "step": 18 }, { "epoch": 0.005541378053226394, "grad_norm": 3.4533181190490723, "learning_rate": 0.0001997471309083836, "loss": 5.3057, "step": 19 }, { "epoch": 0.005833029529711994, "grad_norm": 4.49119234085083, "learning_rate": 0.0001997276794397977, "loss": 5.471, "step": 20 }, { "epoch": 0.0061246810061975935, "grad_norm": 4.865241527557373, "learning_rate": 0.00019970822797121183, "loss": 5.3359, "step": 21 }, { "epoch": 0.006416332482683193, "grad_norm": 7.495031356811523, "learning_rate": 0.00019968877650262596, "loss": 5.8991, "step": 22 }, { "epoch": 0.006707983959168794, "grad_norm": 4.314780235290527, "learning_rate": 0.00019966932503404008, "loss": 5.1877, "step": 23 }, { "epoch": 0.006999635435654393, "grad_norm": 4.645003318786621, "learning_rate": 0.0001996498735654542, "loss": 5.4811, "step": 24 }, { "epoch": 0.007291286912139993, "grad_norm": 5.93269681930542, "learning_rate": 0.00019963042209686832, "loss": 5.1065, "step": 25 }, { "epoch": 0.007582938388625593, "grad_norm": 5.6493659019470215, "learning_rate": 0.00019961097062828244, "loss": 5.6438, "step": 26 }, { "epoch": 0.007874589865111192, "grad_norm": 7.376254081726074, "learning_rate": 0.00019959151915969656, "loss": 5.2033, "step": 27 }, { "epoch": 0.008166241341596792, "grad_norm": 4.436602592468262, "learning_rate": 0.00019957206769111068, "loss": 5.274, "step": 28 }, { "epoch": 0.008457892818082392, "grad_norm": 4.457859039306641, "learning_rate": 0.0001995526162225248, "loss": 5.5059, "step": 29 }, { "epoch": 0.008749544294567991, "grad_norm": 5.812066078186035, "learning_rate": 0.00019953316475393893, "loss": 5.623, "step": 30 }, { "epoch": 0.009041195771053591, "grad_norm": 4.582685947418213, "learning_rate": 0.00019951371328535305, "loss": 5.5386, "step": 31 }, { "epoch": 0.00933284724753919, "grad_norm": 6.701116561889648, "learning_rate": 0.00019949426181676717, "loss": 5.2596, "step": 32 }, { "epoch": 0.00962449872402479, "grad_norm": 4.321029186248779, "learning_rate": 0.0001994748103481813, "loss": 5.4782, "step": 33 }, { "epoch": 0.00991615020051039, "grad_norm": 4.403057098388672, "learning_rate": 0.00019945535887959541, "loss": 5.3637, "step": 34 }, { "epoch": 0.01020780167699599, "grad_norm": 3.0161266326904297, "learning_rate": 0.00019943590741100954, "loss": 5.4921, "step": 35 }, { "epoch": 0.01049945315348159, "grad_norm": 6.888625144958496, "learning_rate": 0.00019941645594242366, "loss": 5.4306, "step": 36 }, { "epoch": 0.010791104629967189, "grad_norm": 4.63026237487793, "learning_rate": 0.00019939700447383778, "loss": 5.0159, "step": 37 }, { "epoch": 0.011082756106452788, "grad_norm": 6.093297481536865, "learning_rate": 0.00019937755300525193, "loss": 5.3365, "step": 38 }, { "epoch": 0.011374407582938388, "grad_norm": 3.6189231872558594, "learning_rate": 0.00019935810153666602, "loss": 5.2186, "step": 39 }, { "epoch": 0.011666059059423988, "grad_norm": 7.820289134979248, "learning_rate": 0.00019933865006808014, "loss": 5.162, "step": 40 }, { "epoch": 0.011957710535909587, "grad_norm": 8.196646690368652, "learning_rate": 0.00019931919859949427, "loss": 5.3822, "step": 41 }, { "epoch": 0.012249362012395187, "grad_norm": 4.929930686950684, "learning_rate": 0.0001992997471309084, "loss": 5.0969, "step": 42 }, { "epoch": 0.012541013488880787, "grad_norm": 5.643044471740723, "learning_rate": 0.0001992802956623225, "loss": 5.4066, "step": 43 }, { "epoch": 0.012832664965366386, "grad_norm": 4.61903715133667, "learning_rate": 0.00019926084419373663, "loss": 4.9828, "step": 44 }, { "epoch": 0.013124316441851986, "grad_norm": 3.6604957580566406, "learning_rate": 0.00019924139272515078, "loss": 5.1689, "step": 45 }, { "epoch": 0.013415967918337587, "grad_norm": 2.9471001625061035, "learning_rate": 0.00019922194125656487, "loss": 5.1396, "step": 46 }, { "epoch": 0.013707619394823187, "grad_norm": 6.661954402923584, "learning_rate": 0.000199202489787979, "loss": 5.129, "step": 47 }, { "epoch": 0.013999270871308787, "grad_norm": 4.815802574157715, "learning_rate": 0.00019918303831939314, "loss": 5.3188, "step": 48 }, { "epoch": 0.014290922347794386, "grad_norm": 3.835651159286499, "learning_rate": 0.00019916358685080724, "loss": 4.8205, "step": 49 }, { "epoch": 0.014582573824279986, "grad_norm": 3.2757277488708496, "learning_rate": 0.00019914413538222136, "loss": 5.2257, "step": 50 }, { "epoch": 0.014874225300765586, "grad_norm": 2.7755329608917236, "learning_rate": 0.00019912468391363548, "loss": 5.0553, "step": 51 }, { "epoch": 0.015165876777251185, "grad_norm": 3.2506415843963623, "learning_rate": 0.00019910523244504963, "loss": 5.1455, "step": 52 }, { "epoch": 0.015457528253736785, "grad_norm": 3.2655768394470215, "learning_rate": 0.00019908578097646372, "loss": 5.2591, "step": 53 }, { "epoch": 0.015749179730222385, "grad_norm": 4.3707075119018555, "learning_rate": 0.00019906632950787785, "loss": 5.2289, "step": 54 }, { "epoch": 0.016040831206707983, "grad_norm": 3.800015687942505, "learning_rate": 0.000199046878039292, "loss": 5.0829, "step": 55 }, { "epoch": 0.016332482683193584, "grad_norm": 3.5299184322357178, "learning_rate": 0.0001990274265707061, "loss": 5.1265, "step": 56 }, { "epoch": 0.016624134159679182, "grad_norm": 3.4997024536132812, "learning_rate": 0.0001990079751021202, "loss": 5.0747, "step": 57 }, { "epoch": 0.016915785636164783, "grad_norm": 4.166904449462891, "learning_rate": 0.00019898852363353436, "loss": 5.2736, "step": 58 }, { "epoch": 0.01720743711265038, "grad_norm": 4.3089823722839355, "learning_rate": 0.00019896907216494845, "loss": 5.2383, "step": 59 }, { "epoch": 0.017499088589135983, "grad_norm": 6.229640960693359, "learning_rate": 0.00019894962069636258, "loss": 5.1537, "step": 60 }, { "epoch": 0.017790740065621584, "grad_norm": 3.0097970962524414, "learning_rate": 0.0001989301692277767, "loss": 4.8217, "step": 61 }, { "epoch": 0.018082391542107182, "grad_norm": 3.216181993484497, "learning_rate": 0.00019891071775919085, "loss": 5.1919, "step": 62 }, { "epoch": 0.018374043018592783, "grad_norm": 5.76234245300293, "learning_rate": 0.00019889126629060494, "loss": 5.0472, "step": 63 }, { "epoch": 0.01866569449507838, "grad_norm": 3.174882650375366, "learning_rate": 0.00019887181482201906, "loss": 4.9581, "step": 64 }, { "epoch": 0.018957345971563982, "grad_norm": 5.084100723266602, "learning_rate": 0.0001988523633534332, "loss": 4.7746, "step": 65 }, { "epoch": 0.01924899744804958, "grad_norm": 4.715723514556885, "learning_rate": 0.0001988329118848473, "loss": 5.3356, "step": 66 }, { "epoch": 0.019540648924535182, "grad_norm": 4.600905895233154, "learning_rate": 0.00019881346041626143, "loss": 4.8503, "step": 67 }, { "epoch": 0.01983230040102078, "grad_norm": 3.585688591003418, "learning_rate": 0.00019879400894767558, "loss": 5.179, "step": 68 }, { "epoch": 0.02012395187750638, "grad_norm": 3.094789505004883, "learning_rate": 0.0001987745574790897, "loss": 5.069, "step": 69 }, { "epoch": 0.02041560335399198, "grad_norm": 3.1266374588012695, "learning_rate": 0.0001987551060105038, "loss": 5.1906, "step": 70 }, { "epoch": 0.02070725483047758, "grad_norm": 5.068209171295166, "learning_rate": 0.0001987356545419179, "loss": 5.1845, "step": 71 }, { "epoch": 0.02099890630696318, "grad_norm": 4.004126071929932, "learning_rate": 0.00019871620307333206, "loss": 5.1679, "step": 72 }, { "epoch": 0.02129055778344878, "grad_norm": 3.871622085571289, "learning_rate": 0.00019869675160474616, "loss": 4.8692, "step": 73 }, { "epoch": 0.021582209259934378, "grad_norm": 5.422293663024902, "learning_rate": 0.00019867730013616028, "loss": 4.9281, "step": 74 }, { "epoch": 0.02187386073641998, "grad_norm": 4.1079816818237305, "learning_rate": 0.00019865784866757443, "loss": 5.1744, "step": 75 }, { "epoch": 0.022165512212905577, "grad_norm": 5.287847995758057, "learning_rate": 0.00019863839719898855, "loss": 5.119, "step": 76 }, { "epoch": 0.02245716368939118, "grad_norm": 3.4588711261749268, "learning_rate": 0.00019861894573040264, "loss": 4.8687, "step": 77 }, { "epoch": 0.022748815165876776, "grad_norm": 3.623326301574707, "learning_rate": 0.0001985994942618168, "loss": 5.0499, "step": 78 }, { "epoch": 0.023040466642362378, "grad_norm": 4.892202377319336, "learning_rate": 0.0001985800427932309, "loss": 5.1649, "step": 79 }, { "epoch": 0.023332118118847976, "grad_norm": 3.885995864868164, "learning_rate": 0.000198560591324645, "loss": 5.0227, "step": 80 }, { "epoch": 0.023623769595333577, "grad_norm": 5.170047760009766, "learning_rate": 0.00019854113985605913, "loss": 4.8294, "step": 81 }, { "epoch": 0.023915421071819175, "grad_norm": 4.546537399291992, "learning_rate": 0.00019852168838747328, "loss": 5.0578, "step": 82 }, { "epoch": 0.024207072548304776, "grad_norm": 4.045368671417236, "learning_rate": 0.00019850223691888737, "loss": 5.2737, "step": 83 }, { "epoch": 0.024498724024790374, "grad_norm": 3.0393455028533936, "learning_rate": 0.0001984827854503015, "loss": 4.9347, "step": 84 }, { "epoch": 0.024790375501275976, "grad_norm": 4.093234062194824, "learning_rate": 0.00019846333398171564, "loss": 4.8293, "step": 85 }, { "epoch": 0.025082026977761573, "grad_norm": 2.7594780921936035, "learning_rate": 0.00019844388251312976, "loss": 5.0701, "step": 86 }, { "epoch": 0.025373678454247175, "grad_norm": 3.962042808532715, "learning_rate": 0.00019842443104454386, "loss": 4.8717, "step": 87 }, { "epoch": 0.025665329930732773, "grad_norm": 3.9724669456481934, "learning_rate": 0.00019840497957595798, "loss": 4.9848, "step": 88 }, { "epoch": 0.025956981407218374, "grad_norm": 4.083434581756592, "learning_rate": 0.00019838552810737213, "loss": 5.323, "step": 89 }, { "epoch": 0.026248632883703972, "grad_norm": 4.656790256500244, "learning_rate": 0.00019836607663878622, "loss": 4.98, "step": 90 }, { "epoch": 0.026540284360189573, "grad_norm": 4.449328899383545, "learning_rate": 0.00019834662517020035, "loss": 4.806, "step": 91 }, { "epoch": 0.026831935836675175, "grad_norm": 3.3707408905029297, "learning_rate": 0.0001983271737016145, "loss": 5.261, "step": 92 }, { "epoch": 0.027123587313160773, "grad_norm": 3.324280261993408, "learning_rate": 0.00019830772223302862, "loss": 4.7909, "step": 93 }, { "epoch": 0.027415238789646374, "grad_norm": 3.296072006225586, "learning_rate": 0.0001982882707644427, "loss": 4.9188, "step": 94 }, { "epoch": 0.027706890266131972, "grad_norm": 2.7474851608276367, "learning_rate": 0.00019826881929585686, "loss": 5.0354, "step": 95 }, { "epoch": 0.027998541742617573, "grad_norm": 4.2157111167907715, "learning_rate": 0.00019824936782727098, "loss": 4.8458, "step": 96 }, { "epoch": 0.02829019321910317, "grad_norm": 3.022955894470215, "learning_rate": 0.00019822991635868507, "loss": 4.7598, "step": 97 }, { "epoch": 0.028581844695588773, "grad_norm": 2.469153881072998, "learning_rate": 0.0001982104648900992, "loss": 4.9443, "step": 98 }, { "epoch": 0.02887349617207437, "grad_norm": 3.191643714904785, "learning_rate": 0.00019819101342151334, "loss": 4.6843, "step": 99 }, { "epoch": 0.029165147648559972, "grad_norm": 2.831871509552002, "learning_rate": 0.00019817156195292747, "loss": 4.7748, "step": 100 }, { "epoch": 0.02945679912504557, "grad_norm": 3.8723671436309814, "learning_rate": 0.00019815211048434156, "loss": 4.8554, "step": 101 }, { "epoch": 0.02974845060153117, "grad_norm": 2.744079351425171, "learning_rate": 0.0001981326590157557, "loss": 4.8349, "step": 102 }, { "epoch": 0.03004010207801677, "grad_norm": 3.916684865951538, "learning_rate": 0.00019811320754716983, "loss": 4.5747, "step": 103 }, { "epoch": 0.03033175355450237, "grad_norm": 4.069173336029053, "learning_rate": 0.00019809375607858393, "loss": 4.992, "step": 104 }, { "epoch": 0.03062340503098797, "grad_norm": 5.31610107421875, "learning_rate": 0.00019807430460999807, "loss": 4.9012, "step": 105 }, { "epoch": 0.03091505650747357, "grad_norm": 5.239981174468994, "learning_rate": 0.0001980548531414122, "loss": 4.9665, "step": 106 }, { "epoch": 0.031206707983959168, "grad_norm": 10.584851264953613, "learning_rate": 0.0001980354016728263, "loss": 4.6935, "step": 107 }, { "epoch": 0.03149835946044477, "grad_norm": 3.892777681350708, "learning_rate": 0.0001980159502042404, "loss": 5.2157, "step": 108 }, { "epoch": 0.03179001093693037, "grad_norm": 2.671393632888794, "learning_rate": 0.00019799649873565456, "loss": 4.62, "step": 109 }, { "epoch": 0.032081662413415965, "grad_norm": 2.4834694862365723, "learning_rate": 0.00019797704726706868, "loss": 4.7652, "step": 110 }, { "epoch": 0.03237331388990157, "grad_norm": 6.28948450088501, "learning_rate": 0.00019795759579848278, "loss": 5.0893, "step": 111 }, { "epoch": 0.03266496536638717, "grad_norm": 5.9128499031066895, "learning_rate": 0.00019793814432989693, "loss": 4.9234, "step": 112 }, { "epoch": 0.032956616842872766, "grad_norm": 5.77049446105957, "learning_rate": 0.00019791869286131105, "loss": 4.79, "step": 113 }, { "epoch": 0.033248268319358364, "grad_norm": 5.037501335144043, "learning_rate": 0.00019789924139272514, "loss": 4.8665, "step": 114 }, { "epoch": 0.03353991979584397, "grad_norm": 4.729011535644531, "learning_rate": 0.0001978797899241393, "loss": 4.8175, "step": 115 }, { "epoch": 0.033831571272329566, "grad_norm": 3.9499034881591797, "learning_rate": 0.0001978603384555534, "loss": 5.0241, "step": 116 }, { "epoch": 0.034123222748815164, "grad_norm": 2.794339895248413, "learning_rate": 0.00019784088698696753, "loss": 4.8601, "step": 117 }, { "epoch": 0.03441487422530076, "grad_norm": 3.4785914421081543, "learning_rate": 0.00019782143551838163, "loss": 4.9147, "step": 118 }, { "epoch": 0.03470652570178637, "grad_norm": 2.864555835723877, "learning_rate": 0.00019780198404979578, "loss": 4.9952, "step": 119 }, { "epoch": 0.034998177178271965, "grad_norm": 3.581209897994995, "learning_rate": 0.0001977825325812099, "loss": 4.4224, "step": 120 }, { "epoch": 0.03528982865475756, "grad_norm": 2.7270421981811523, "learning_rate": 0.000197763081112624, "loss": 5.0382, "step": 121 }, { "epoch": 0.03558148013124317, "grad_norm": 3.476796865463257, "learning_rate": 0.00019774362964403814, "loss": 4.9126, "step": 122 }, { "epoch": 0.035873131607728766, "grad_norm": 4.477820873260498, "learning_rate": 0.00019772417817545226, "loss": 4.2991, "step": 123 }, { "epoch": 0.036164783084214364, "grad_norm": 2.3631224632263184, "learning_rate": 0.00019770472670686638, "loss": 4.7467, "step": 124 }, { "epoch": 0.03645643456069996, "grad_norm": 3.621644973754883, "learning_rate": 0.0001976852752382805, "loss": 4.8219, "step": 125 }, { "epoch": 0.036748086037185566, "grad_norm": 2.905380964279175, "learning_rate": 0.00019766582376969463, "loss": 4.8571, "step": 126 }, { "epoch": 0.037039737513671164, "grad_norm": 4.762263774871826, "learning_rate": 0.00019764637230110875, "loss": 5.1347, "step": 127 }, { "epoch": 0.03733138899015676, "grad_norm": 3.166132926940918, "learning_rate": 0.00019762692083252284, "loss": 4.7157, "step": 128 }, { "epoch": 0.03762304046664236, "grad_norm": 3.4076318740844727, "learning_rate": 0.000197607469363937, "loss": 4.7238, "step": 129 }, { "epoch": 0.037914691943127965, "grad_norm": 3.1313138008117676, "learning_rate": 0.00019758801789535111, "loss": 4.7099, "step": 130 }, { "epoch": 0.03820634341961356, "grad_norm": 4.487920761108398, "learning_rate": 0.00019756856642676524, "loss": 4.7953, "step": 131 }, { "epoch": 0.03849799489609916, "grad_norm": 2.783236265182495, "learning_rate": 0.00019754911495817936, "loss": 4.6832, "step": 132 }, { "epoch": 0.03878964637258476, "grad_norm": 5.5125041007995605, "learning_rate": 0.00019752966348959348, "loss": 4.8761, "step": 133 }, { "epoch": 0.039081297849070364, "grad_norm": 3.2930171489715576, "learning_rate": 0.0001975102120210076, "loss": 4.3151, "step": 134 }, { "epoch": 0.03937294932555596, "grad_norm": 6.780705451965332, "learning_rate": 0.00019749076055242172, "loss": 4.9145, "step": 135 }, { "epoch": 0.03966460080204156, "grad_norm": 3.9956400394439697, "learning_rate": 0.00019747130908383584, "loss": 5.0834, "step": 136 }, { "epoch": 0.03995625227852716, "grad_norm": 4.022609233856201, "learning_rate": 0.00019745185761524997, "loss": 4.908, "step": 137 }, { "epoch": 0.04024790375501276, "grad_norm": 3.3174099922180176, "learning_rate": 0.00019743240614666406, "loss": 4.5881, "step": 138 }, { "epoch": 0.04053955523149836, "grad_norm": 3.395751953125, "learning_rate": 0.0001974129546780782, "loss": 4.8242, "step": 139 }, { "epoch": 0.04083120670798396, "grad_norm": 2.609846353530884, "learning_rate": 0.00019739350320949233, "loss": 4.6323, "step": 140 }, { "epoch": 0.041122858184469556, "grad_norm": 3.877764940261841, "learning_rate": 0.00019737405174090645, "loss": 4.8657, "step": 141 }, { "epoch": 0.04141450966095516, "grad_norm": 2.5183775424957275, "learning_rate": 0.00019735460027232057, "loss": 4.8716, "step": 142 }, { "epoch": 0.04170616113744076, "grad_norm": 3.0500574111938477, "learning_rate": 0.0001973351488037347, "loss": 4.9475, "step": 143 }, { "epoch": 0.04199781261392636, "grad_norm": 2.0173606872558594, "learning_rate": 0.00019731569733514882, "loss": 4.7378, "step": 144 }, { "epoch": 0.042289464090411955, "grad_norm": 3.7338593006134033, "learning_rate": 0.00019729624586656294, "loss": 4.664, "step": 145 }, { "epoch": 0.04258111556689756, "grad_norm": 3.281522512435913, "learning_rate": 0.00019727679439797706, "loss": 4.9284, "step": 146 }, { "epoch": 0.04287276704338316, "grad_norm": 2.798964023590088, "learning_rate": 0.00019725734292939118, "loss": 4.7993, "step": 147 }, { "epoch": 0.043164418519868755, "grad_norm": 3.437518358230591, "learning_rate": 0.0001972378914608053, "loss": 4.6755, "step": 148 }, { "epoch": 0.04345606999635435, "grad_norm": 3.024007558822632, "learning_rate": 0.00019721843999221942, "loss": 4.9108, "step": 149 }, { "epoch": 0.04374772147283996, "grad_norm": 3.682771921157837, "learning_rate": 0.00019719898852363355, "loss": 5.0804, "step": 150 }, { "epoch": 0.044039372949325556, "grad_norm": 3.978522300720215, "learning_rate": 0.00019717953705504767, "loss": 4.7246, "step": 151 }, { "epoch": 0.044331024425811154, "grad_norm": 5.216152667999268, "learning_rate": 0.0001971600855864618, "loss": 5.0158, "step": 152 }, { "epoch": 0.04462267590229676, "grad_norm": 3.4110636711120605, "learning_rate": 0.0001971406341178759, "loss": 4.743, "step": 153 }, { "epoch": 0.04491432737878236, "grad_norm": 2.888540744781494, "learning_rate": 0.00019712118264929003, "loss": 4.8148, "step": 154 }, { "epoch": 0.045205978855267955, "grad_norm": 4.582914352416992, "learning_rate": 0.00019710173118070415, "loss": 4.5543, "step": 155 }, { "epoch": 0.04549763033175355, "grad_norm": 3.3754498958587646, "learning_rate": 0.00019708227971211828, "loss": 4.7203, "step": 156 }, { "epoch": 0.04578928180823916, "grad_norm": 3.618863582611084, "learning_rate": 0.0001970628282435324, "loss": 4.8012, "step": 157 }, { "epoch": 0.046080933284724755, "grad_norm": 2.9040188789367676, "learning_rate": 0.00019704337677494652, "loss": 4.8128, "step": 158 }, { "epoch": 0.04637258476121035, "grad_norm": 3.3440024852752686, "learning_rate": 0.00019702392530636064, "loss": 4.9171, "step": 159 }, { "epoch": 0.04666423623769595, "grad_norm": 2.995412588119507, "learning_rate": 0.00019700447383777476, "loss": 4.6833, "step": 160 }, { "epoch": 0.046955887714181556, "grad_norm": 2.556804656982422, "learning_rate": 0.00019698502236918888, "loss": 4.8821, "step": 161 }, { "epoch": 0.047247539190667154, "grad_norm": 2.774730920791626, "learning_rate": 0.000196965570900603, "loss": 4.689, "step": 162 }, { "epoch": 0.04753919066715275, "grad_norm": 2.971433401107788, "learning_rate": 0.00019694611943201713, "loss": 4.761, "step": 163 }, { "epoch": 0.04783084214363835, "grad_norm": 3.415595769882202, "learning_rate": 0.00019692666796343125, "loss": 4.9141, "step": 164 }, { "epoch": 0.048122493620123954, "grad_norm": 3.7800793647766113, "learning_rate": 0.00019690721649484537, "loss": 4.655, "step": 165 }, { "epoch": 0.04841414509660955, "grad_norm": 2.7145912647247314, "learning_rate": 0.0001968877650262595, "loss": 4.9598, "step": 166 }, { "epoch": 0.04870579657309515, "grad_norm": 3.2894575595855713, "learning_rate": 0.0001968683135576736, "loss": 4.3858, "step": 167 }, { "epoch": 0.04899744804958075, "grad_norm": 4.536457061767578, "learning_rate": 0.00019684886208908773, "loss": 4.7246, "step": 168 }, { "epoch": 0.04928909952606635, "grad_norm": 2.8026583194732666, "learning_rate": 0.00019682941062050186, "loss": 4.9215, "step": 169 }, { "epoch": 0.04958075100255195, "grad_norm": 5.243438243865967, "learning_rate": 0.00019680995915191598, "loss": 4.6463, "step": 170 }, { "epoch": 0.04987240247903755, "grad_norm": 3.9599621295928955, "learning_rate": 0.0001967905076833301, "loss": 4.9841, "step": 171 }, { "epoch": 0.05016405395552315, "grad_norm": 2.403648614883423, "learning_rate": 0.00019677105621474422, "loss": 4.548, "step": 172 }, { "epoch": 0.05045570543200875, "grad_norm": 3.0962436199188232, "learning_rate": 0.00019675160474615834, "loss": 5.0583, "step": 173 }, { "epoch": 0.05074735690849435, "grad_norm": 3.8097147941589355, "learning_rate": 0.00019673215327757246, "loss": 4.7836, "step": 174 }, { "epoch": 0.05103900838497995, "grad_norm": 4.776021957397461, "learning_rate": 0.00019671270180898659, "loss": 4.5905, "step": 175 }, { "epoch": 0.051330659861465545, "grad_norm": 2.9202258586883545, "learning_rate": 0.0001966932503404007, "loss": 4.6141, "step": 176 }, { "epoch": 0.05162231133795115, "grad_norm": 3.6275815963745117, "learning_rate": 0.00019667379887181483, "loss": 4.8995, "step": 177 }, { "epoch": 0.05191396281443675, "grad_norm": 2.1192002296447754, "learning_rate": 0.00019665434740322895, "loss": 4.4915, "step": 178 }, { "epoch": 0.052205614290922346, "grad_norm": 4.557978630065918, "learning_rate": 0.00019663489593464307, "loss": 4.7263, "step": 179 }, { "epoch": 0.052497265767407944, "grad_norm": 5.12597131729126, "learning_rate": 0.0001966154444660572, "loss": 4.5682, "step": 180 }, { "epoch": 0.05278891724389355, "grad_norm": 4.005878925323486, "learning_rate": 0.00019659599299747132, "loss": 4.8687, "step": 181 }, { "epoch": 0.05308056872037915, "grad_norm": 3.6628952026367188, "learning_rate": 0.00019657654152888544, "loss": 4.7474, "step": 182 }, { "epoch": 0.053372220196864745, "grad_norm": 4.2477192878723145, "learning_rate": 0.00019655709006029956, "loss": 4.9409, "step": 183 }, { "epoch": 0.05366387167335035, "grad_norm": 4.177287578582764, "learning_rate": 0.00019653763859171368, "loss": 4.5149, "step": 184 }, { "epoch": 0.05395552314983595, "grad_norm": 2.6852266788482666, "learning_rate": 0.0001965181871231278, "loss": 4.6173, "step": 185 }, { "epoch": 0.054247174626321545, "grad_norm": 3.697601318359375, "learning_rate": 0.00019649873565454192, "loss": 4.8331, "step": 186 }, { "epoch": 0.05453882610280714, "grad_norm": 5.223504066467285, "learning_rate": 0.00019647928418595605, "loss": 4.8045, "step": 187 }, { "epoch": 0.05483047757929275, "grad_norm": 3.378572702407837, "learning_rate": 0.00019645983271737017, "loss": 4.701, "step": 188 }, { "epoch": 0.055122129055778346, "grad_norm": 2.8503661155700684, "learning_rate": 0.0001964403812487843, "loss": 4.8101, "step": 189 }, { "epoch": 0.055413780532263944, "grad_norm": 3.673322916030884, "learning_rate": 0.0001964209297801984, "loss": 4.8585, "step": 190 }, { "epoch": 0.05570543200874954, "grad_norm": 4.9313063621521, "learning_rate": 0.00019640147831161253, "loss": 4.886, "step": 191 }, { "epoch": 0.05599708348523515, "grad_norm": 3.6467201709747314, "learning_rate": 0.00019638202684302668, "loss": 4.7424, "step": 192 }, { "epoch": 0.056288734961720745, "grad_norm": 3.24381422996521, "learning_rate": 0.00019636257537444077, "loss": 4.6363, "step": 193 }, { "epoch": 0.05658038643820634, "grad_norm": 3.488929033279419, "learning_rate": 0.0001963431239058549, "loss": 4.8984, "step": 194 }, { "epoch": 0.05687203791469194, "grad_norm": 3.949713706970215, "learning_rate": 0.00019632367243726902, "loss": 4.6621, "step": 195 }, { "epoch": 0.057163689391177545, "grad_norm": 2.7937984466552734, "learning_rate": 0.00019630422096868314, "loss": 4.5593, "step": 196 }, { "epoch": 0.05745534086766314, "grad_norm": 6.74245548248291, "learning_rate": 0.00019628476950009726, "loss": 4.8967, "step": 197 }, { "epoch": 0.05774699234414874, "grad_norm": 2.752370595932007, "learning_rate": 0.00019626531803151138, "loss": 4.719, "step": 198 }, { "epoch": 0.05803864382063434, "grad_norm": 3.298980236053467, "learning_rate": 0.0001962458665629255, "loss": 4.8082, "step": 199 }, { "epoch": 0.058330295297119944, "grad_norm": 3.9131977558135986, "learning_rate": 0.00019622641509433963, "loss": 4.5207, "step": 200 }, { "epoch": 0.05862194677360554, "grad_norm": 2.989313840866089, "learning_rate": 0.00019620696362575375, "loss": 4.6566, "step": 201 }, { "epoch": 0.05891359825009114, "grad_norm": 2.278000831604004, "learning_rate": 0.0001961875121571679, "loss": 4.7488, "step": 202 }, { "epoch": 0.05920524972657674, "grad_norm": 3.8926479816436768, "learning_rate": 0.000196168060688582, "loss": 4.4491, "step": 203 }, { "epoch": 0.05949690120306234, "grad_norm": 3.324033260345459, "learning_rate": 0.0001961486092199961, "loss": 4.628, "step": 204 }, { "epoch": 0.05978855267954794, "grad_norm": 2.9942760467529297, "learning_rate": 0.00019612915775141023, "loss": 4.624, "step": 205 }, { "epoch": 0.06008020415603354, "grad_norm": 2.6430258750915527, "learning_rate": 0.00019610970628282436, "loss": 4.6391, "step": 206 }, { "epoch": 0.060371855632519136, "grad_norm": 4.3733391761779785, "learning_rate": 0.00019609025481423848, "loss": 4.74, "step": 207 }, { "epoch": 0.06066350710900474, "grad_norm": 3.6343955993652344, "learning_rate": 0.0001960708033456526, "loss": 4.6226, "step": 208 }, { "epoch": 0.06095515858549034, "grad_norm": 3.3466055393218994, "learning_rate": 0.00019605135187706675, "loss": 4.6624, "step": 209 }, { "epoch": 0.06124681006197594, "grad_norm": 7.2834038734436035, "learning_rate": 0.00019603190040848084, "loss": 4.5907, "step": 210 }, { "epoch": 0.06153846153846154, "grad_norm": 4.301766872406006, "learning_rate": 0.00019601244893989496, "loss": 4.8576, "step": 211 }, { "epoch": 0.06183011301494714, "grad_norm": 4.776557445526123, "learning_rate": 0.0001959929974713091, "loss": 4.7528, "step": 212 }, { "epoch": 0.06212176449143274, "grad_norm": 3.7908501625061035, "learning_rate": 0.0001959735460027232, "loss": 4.693, "step": 213 }, { "epoch": 0.062413415967918336, "grad_norm": 4.114233493804932, "learning_rate": 0.00019595409453413733, "loss": 4.9188, "step": 214 }, { "epoch": 0.06270506744440393, "grad_norm": 4.075882911682129, "learning_rate": 0.00019593464306555145, "loss": 5.1271, "step": 215 }, { "epoch": 0.06299671892088954, "grad_norm": 6.4170002937316895, "learning_rate": 0.0001959151915969656, "loss": 4.7683, "step": 216 }, { "epoch": 0.06328837039737514, "grad_norm": 4.042845726013184, "learning_rate": 0.0001958957401283797, "loss": 4.6717, "step": 217 }, { "epoch": 0.06358002187386073, "grad_norm": 2.2285616397857666, "learning_rate": 0.00019587628865979381, "loss": 4.4787, "step": 218 }, { "epoch": 0.06387167335034634, "grad_norm": 3.4165778160095215, "learning_rate": 0.00019585683719120796, "loss": 4.7106, "step": 219 }, { "epoch": 0.06416332482683193, "grad_norm": 3.3771297931671143, "learning_rate": 0.00019583738572262206, "loss": 4.9262, "step": 220 }, { "epoch": 0.06445497630331753, "grad_norm": 3.944932699203491, "learning_rate": 0.00019581793425403618, "loss": 4.7227, "step": 221 }, { "epoch": 0.06474662777980314, "grad_norm": 3.0989463329315186, "learning_rate": 0.00019579848278545033, "loss": 4.7909, "step": 222 }, { "epoch": 0.06503827925628873, "grad_norm": 3.6651737689971924, "learning_rate": 0.00019577903131686445, "loss": 4.887, "step": 223 }, { "epoch": 0.06532993073277434, "grad_norm": 4.740444183349609, "learning_rate": 0.00019575957984827854, "loss": 5.0046, "step": 224 }, { "epoch": 0.06562158220925994, "grad_norm": 3.940077543258667, "learning_rate": 0.00019574012837969267, "loss": 4.5371, "step": 225 }, { "epoch": 0.06591323368574553, "grad_norm": 2.7288267612457275, "learning_rate": 0.00019572067691110681, "loss": 4.6473, "step": 226 }, { "epoch": 0.06620488516223114, "grad_norm": 4.231914043426514, "learning_rate": 0.0001957012254425209, "loss": 4.7246, "step": 227 }, { "epoch": 0.06649653663871673, "grad_norm": 3.4270825386047363, "learning_rate": 0.00019568177397393503, "loss": 4.8249, "step": 228 }, { "epoch": 0.06678818811520233, "grad_norm": 6.292290210723877, "learning_rate": 0.00019566232250534918, "loss": 4.5854, "step": 229 }, { "epoch": 0.06707983959168794, "grad_norm": 2.2854278087615967, "learning_rate": 0.00019564287103676327, "loss": 4.515, "step": 230 }, { "epoch": 0.06737149106817353, "grad_norm": 4.46990966796875, "learning_rate": 0.0001956234195681774, "loss": 4.7337, "step": 231 }, { "epoch": 0.06766314254465913, "grad_norm": 3.0182275772094727, "learning_rate": 0.00019560396809959154, "loss": 4.5608, "step": 232 }, { "epoch": 0.06795479402114474, "grad_norm": 3.670341968536377, "learning_rate": 0.00019558451663100567, "loss": 4.976, "step": 233 }, { "epoch": 0.06824644549763033, "grad_norm": 3.408461570739746, "learning_rate": 0.00019556506516241976, "loss": 4.6873, "step": 234 }, { "epoch": 0.06853809697411593, "grad_norm": 3.310580253601074, "learning_rate": 0.00019554561369383388, "loss": 4.7882, "step": 235 }, { "epoch": 0.06882974845060152, "grad_norm": 3.7846269607543945, "learning_rate": 0.00019552616222524803, "loss": 4.7603, "step": 236 }, { "epoch": 0.06912139992708713, "grad_norm": 3.5698342323303223, "learning_rate": 0.00019550671075666212, "loss": 4.8663, "step": 237 }, { "epoch": 0.06941305140357273, "grad_norm": 2.5568552017211914, "learning_rate": 0.00019548725928807625, "loss": 4.5338, "step": 238 }, { "epoch": 0.06970470288005833, "grad_norm": 2.0983262062072754, "learning_rate": 0.0001954678078194904, "loss": 4.5659, "step": 239 }, { "epoch": 0.06999635435654393, "grad_norm": 3.552135467529297, "learning_rate": 0.00019544835635090452, "loss": 4.8945, "step": 240 }, { "epoch": 0.07028800583302953, "grad_norm": 3.0007147789001465, "learning_rate": 0.0001954289048823186, "loss": 4.6278, "step": 241 }, { "epoch": 0.07057965730951513, "grad_norm": 3.5922021865844727, "learning_rate": 0.00019540945341373276, "loss": 4.8851, "step": 242 }, { "epoch": 0.07087130878600073, "grad_norm": 4.7101616859436035, "learning_rate": 0.00019539000194514688, "loss": 4.6862, "step": 243 }, { "epoch": 0.07116296026248634, "grad_norm": 2.440412998199463, "learning_rate": 0.00019537055047656098, "loss": 4.7277, "step": 244 }, { "epoch": 0.07145461173897193, "grad_norm": 3.617727518081665, "learning_rate": 0.0001953510990079751, "loss": 4.5874, "step": 245 }, { "epoch": 0.07174626321545753, "grad_norm": 2.9946746826171875, "learning_rate": 0.00019533164753938925, "loss": 4.3742, "step": 246 }, { "epoch": 0.07203791469194312, "grad_norm": 3.2224831581115723, "learning_rate": 0.00019531219607080337, "loss": 4.2716, "step": 247 }, { "epoch": 0.07232956616842873, "grad_norm": 3.504857063293457, "learning_rate": 0.00019529274460221746, "loss": 4.87, "step": 248 }, { "epoch": 0.07262121764491433, "grad_norm": 2.6164348125457764, "learning_rate": 0.0001952732931336316, "loss": 4.6554, "step": 249 }, { "epoch": 0.07291286912139992, "grad_norm": 3.222731828689575, "learning_rate": 0.00019525384166504573, "loss": 4.7493, "step": 250 }, { "epoch": 0.07320452059788553, "grad_norm": 2.396941661834717, "learning_rate": 0.00019523439019645983, "loss": 4.67, "step": 251 }, { "epoch": 0.07349617207437113, "grad_norm": 2.4573915004730225, "learning_rate": 0.00019521493872787395, "loss": 4.6723, "step": 252 }, { "epoch": 0.07378782355085672, "grad_norm": 4.403919219970703, "learning_rate": 0.0001951954872592881, "loss": 4.6143, "step": 253 }, { "epoch": 0.07407947502734233, "grad_norm": 3.3172433376312256, "learning_rate": 0.0001951760357907022, "loss": 4.848, "step": 254 }, { "epoch": 0.07437112650382792, "grad_norm": 3.384395122528076, "learning_rate": 0.00019515658432211631, "loss": 4.6624, "step": 255 }, { "epoch": 0.07466277798031352, "grad_norm": 3.3942575454711914, "learning_rate": 0.00019513713285353046, "loss": 4.7087, "step": 256 }, { "epoch": 0.07495442945679913, "grad_norm": 3.9758498668670654, "learning_rate": 0.00019511768138494458, "loss": 4.8357, "step": 257 }, { "epoch": 0.07524608093328472, "grad_norm": 3.2680184841156006, "learning_rate": 0.00019509822991635868, "loss": 4.7555, "step": 258 }, { "epoch": 0.07553773240977033, "grad_norm": 4.418920993804932, "learning_rate": 0.00019507877844777283, "loss": 4.4454, "step": 259 }, { "epoch": 0.07582938388625593, "grad_norm": 2.662158250808716, "learning_rate": 0.00019505932697918695, "loss": 4.6025, "step": 260 }, { "epoch": 0.07612103536274152, "grad_norm": 3.6071372032165527, "learning_rate": 0.00019503987551060104, "loss": 4.8332, "step": 261 }, { "epoch": 0.07641268683922713, "grad_norm": 4.804398059844971, "learning_rate": 0.00019502042404201516, "loss": 4.6861, "step": 262 }, { "epoch": 0.07670433831571272, "grad_norm": 4.186119079589844, "learning_rate": 0.0001950009725734293, "loss": 4.713, "step": 263 }, { "epoch": 0.07699598979219832, "grad_norm": 4.358138084411621, "learning_rate": 0.00019498152110484344, "loss": 4.8186, "step": 264 }, { "epoch": 0.07728764126868393, "grad_norm": 3.0141568183898926, "learning_rate": 0.00019496206963625753, "loss": 4.847, "step": 265 }, { "epoch": 0.07757929274516952, "grad_norm": 5.378215789794922, "learning_rate": 0.00019494261816767168, "loss": 4.9041, "step": 266 }, { "epoch": 0.07787094422165512, "grad_norm": 11.842477798461914, "learning_rate": 0.0001949231666990858, "loss": 4.7335, "step": 267 }, { "epoch": 0.07816259569814073, "grad_norm": 2.8514225482940674, "learning_rate": 0.0001949037152304999, "loss": 4.4724, "step": 268 }, { "epoch": 0.07845424717462632, "grad_norm": 2.3388633728027344, "learning_rate": 0.00019488426376191404, "loss": 4.7462, "step": 269 }, { "epoch": 0.07874589865111192, "grad_norm": 3.8558509349823, "learning_rate": 0.00019486481229332816, "loss": 4.6844, "step": 270 }, { "epoch": 0.07903755012759753, "grad_norm": 2.709312677383423, "learning_rate": 0.00019484536082474229, "loss": 4.4626, "step": 271 }, { "epoch": 0.07932920160408312, "grad_norm": 3.6714487075805664, "learning_rate": 0.00019482590935615638, "loss": 4.6332, "step": 272 }, { "epoch": 0.07962085308056872, "grad_norm": 2.0513763427734375, "learning_rate": 0.00019480645788757053, "loss": 4.8516, "step": 273 }, { "epoch": 0.07991250455705431, "grad_norm": 2.744565725326538, "learning_rate": 0.00019478700641898465, "loss": 4.7982, "step": 274 }, { "epoch": 0.08020415603353992, "grad_norm": 2.7696619033813477, "learning_rate": 0.00019476755495039875, "loss": 4.6148, "step": 275 }, { "epoch": 0.08049580751002552, "grad_norm": 3.436234951019287, "learning_rate": 0.0001947481034818129, "loss": 4.5705, "step": 276 }, { "epoch": 0.08078745898651112, "grad_norm": 2.9915554523468018, "learning_rate": 0.00019472865201322702, "loss": 4.5311, "step": 277 }, { "epoch": 0.08107911046299672, "grad_norm": 2.7451577186584473, "learning_rate": 0.00019470920054464114, "loss": 4.8513, "step": 278 }, { "epoch": 0.08137076193948233, "grad_norm": 3.2365424633026123, "learning_rate": 0.00019468974907605526, "loss": 4.6764, "step": 279 }, { "epoch": 0.08166241341596792, "grad_norm": 3.6207687854766846, "learning_rate": 0.00019467029760746938, "loss": 4.5958, "step": 280 }, { "epoch": 0.08195406489245352, "grad_norm": 2.6951112747192383, "learning_rate": 0.0001946508461388835, "loss": 4.7621, "step": 281 }, { "epoch": 0.08224571636893911, "grad_norm": 18.47077178955078, "learning_rate": 0.0001946313946702976, "loss": 4.9141, "step": 282 }, { "epoch": 0.08253736784542472, "grad_norm": 4.8374738693237305, "learning_rate": 0.00019461194320171175, "loss": 4.6963, "step": 283 }, { "epoch": 0.08282901932191032, "grad_norm": 2.4302830696105957, "learning_rate": 0.00019459249173312587, "loss": 4.7531, "step": 284 }, { "epoch": 0.08312067079839591, "grad_norm": 2.244938611984253, "learning_rate": 0.00019457304026453996, "loss": 4.664, "step": 285 }, { "epoch": 0.08341232227488152, "grad_norm": 3.566474676132202, "learning_rate": 0.0001945535887959541, "loss": 4.8558, "step": 286 }, { "epoch": 0.08370397375136712, "grad_norm": 4.761203765869141, "learning_rate": 0.00019453413732736823, "loss": 4.5949, "step": 287 }, { "epoch": 0.08399562522785271, "grad_norm": 2.763244152069092, "learning_rate": 0.00019451468585878235, "loss": 4.4771, "step": 288 }, { "epoch": 0.08428727670433832, "grad_norm": 2.284223794937134, "learning_rate": 0.00019449523439019647, "loss": 4.543, "step": 289 }, { "epoch": 0.08457892818082391, "grad_norm": 4.4081010818481445, "learning_rate": 0.0001944757829216106, "loss": 4.544, "step": 290 }, { "epoch": 0.08487057965730951, "grad_norm": 3.0221195220947266, "learning_rate": 0.00019445633145302472, "loss": 4.8145, "step": 291 }, { "epoch": 0.08516223113379512, "grad_norm": 3.244807243347168, "learning_rate": 0.0001944368799844388, "loss": 4.9551, "step": 292 }, { "epoch": 0.08545388261028071, "grad_norm": 2.336052179336548, "learning_rate": 0.00019441742851585296, "loss": 4.656, "step": 293 }, { "epoch": 0.08574553408676631, "grad_norm": 2.8653674125671387, "learning_rate": 0.00019439797704726708, "loss": 4.8984, "step": 294 }, { "epoch": 0.08603718556325192, "grad_norm": 2.9375057220458984, "learning_rate": 0.0001943785255786812, "loss": 4.8241, "step": 295 }, { "epoch": 0.08632883703973751, "grad_norm": 4.766423225402832, "learning_rate": 0.00019435907411009533, "loss": 4.6621, "step": 296 }, { "epoch": 0.08662048851622312, "grad_norm": 3.2495522499084473, "learning_rate": 0.00019433962264150945, "loss": 4.4896, "step": 297 }, { "epoch": 0.0869121399927087, "grad_norm": 3.450291633605957, "learning_rate": 0.00019432017117292357, "loss": 4.5862, "step": 298 }, { "epoch": 0.08720379146919431, "grad_norm": 3.0337040424346924, "learning_rate": 0.0001943007197043377, "loss": 4.6036, "step": 299 }, { "epoch": 0.08749544294567992, "grad_norm": 2.9175710678100586, "learning_rate": 0.0001942812682357518, "loss": 4.8459, "step": 300 }, { "epoch": 0.08778709442216551, "grad_norm": 4.640604019165039, "learning_rate": 0.00019426181676716593, "loss": 4.7217, "step": 301 }, { "epoch": 0.08807874589865111, "grad_norm": 2.5526700019836426, "learning_rate": 0.00019424236529858006, "loss": 4.7152, "step": 302 }, { "epoch": 0.08837039737513672, "grad_norm": 2.6795997619628906, "learning_rate": 0.00019422291382999418, "loss": 4.7224, "step": 303 }, { "epoch": 0.08866204885162231, "grad_norm": 3.780064582824707, "learning_rate": 0.0001942034623614083, "loss": 4.9312, "step": 304 }, { "epoch": 0.08895370032810791, "grad_norm": 5.694493770599365, "learning_rate": 0.00019418401089282242, "loss": 4.5114, "step": 305 }, { "epoch": 0.08924535180459352, "grad_norm": 3.259495973587036, "learning_rate": 0.00019416455942423654, "loss": 5.0344, "step": 306 }, { "epoch": 0.08953700328107911, "grad_norm": 3.1601338386535645, "learning_rate": 0.00019414510795565066, "loss": 4.9521, "step": 307 }, { "epoch": 0.08982865475756471, "grad_norm": 3.480398178100586, "learning_rate": 0.00019412565648706479, "loss": 4.4706, "step": 308 }, { "epoch": 0.0901203062340503, "grad_norm": 3.3123891353607178, "learning_rate": 0.0001941062050184789, "loss": 4.7, "step": 309 }, { "epoch": 0.09041195771053591, "grad_norm": 2.9874155521392822, "learning_rate": 0.00019408675354989303, "loss": 4.6514, "step": 310 }, { "epoch": 0.09070360918702151, "grad_norm": 2.9311928749084473, "learning_rate": 0.00019406730208130715, "loss": 4.6238, "step": 311 }, { "epoch": 0.0909952606635071, "grad_norm": 4.2490668296813965, "learning_rate": 0.00019404785061272127, "loss": 4.7224, "step": 312 }, { "epoch": 0.09128691213999271, "grad_norm": 2.907639503479004, "learning_rate": 0.0001940283991441354, "loss": 4.4592, "step": 313 }, { "epoch": 0.09157856361647831, "grad_norm": 2.884443998336792, "learning_rate": 0.00019400894767554951, "loss": 4.7449, "step": 314 }, { "epoch": 0.0918702150929639, "grad_norm": 3.219919204711914, "learning_rate": 0.00019398949620696364, "loss": 4.5651, "step": 315 }, { "epoch": 0.09216186656944951, "grad_norm": 2.4171881675720215, "learning_rate": 0.00019397004473837776, "loss": 4.4245, "step": 316 }, { "epoch": 0.0924535180459351, "grad_norm": 3.575878143310547, "learning_rate": 0.00019395059326979188, "loss": 4.7412, "step": 317 }, { "epoch": 0.0927451695224207, "grad_norm": 3.5516090393066406, "learning_rate": 0.000193931141801206, "loss": 4.8135, "step": 318 }, { "epoch": 0.09303682099890631, "grad_norm": 3.9379847049713135, "learning_rate": 0.00019391169033262012, "loss": 4.4589, "step": 319 }, { "epoch": 0.0933284724753919, "grad_norm": 3.4649713039398193, "learning_rate": 0.00019389223886403424, "loss": 4.6282, "step": 320 }, { "epoch": 0.0936201239518775, "grad_norm": 3.2376515865325928, "learning_rate": 0.00019387278739544837, "loss": 4.7286, "step": 321 }, { "epoch": 0.09391177542836311, "grad_norm": 2.762099027633667, "learning_rate": 0.0001938533359268625, "loss": 4.4506, "step": 322 }, { "epoch": 0.0942034269048487, "grad_norm": 1.6559114456176758, "learning_rate": 0.0001938338844582766, "loss": 4.719, "step": 323 }, { "epoch": 0.09449507838133431, "grad_norm": 2.819185733795166, "learning_rate": 0.00019381443298969073, "loss": 4.6758, "step": 324 }, { "epoch": 0.0947867298578199, "grad_norm": 6.718408107757568, "learning_rate": 0.00019379498152110485, "loss": 4.8634, "step": 325 }, { "epoch": 0.0950783813343055, "grad_norm": 3.222621440887451, "learning_rate": 0.00019377553005251897, "loss": 4.6115, "step": 326 }, { "epoch": 0.09537003281079111, "grad_norm": 3.2231178283691406, "learning_rate": 0.0001937560785839331, "loss": 4.5377, "step": 327 }, { "epoch": 0.0956616842872767, "grad_norm": 2.9750940799713135, "learning_rate": 0.00019373662711534722, "loss": 4.8738, "step": 328 }, { "epoch": 0.0959533357637623, "grad_norm": 3.050295829772949, "learning_rate": 0.00019371717564676134, "loss": 4.7481, "step": 329 }, { "epoch": 0.09624498724024791, "grad_norm": 3.8282337188720703, "learning_rate": 0.00019369772417817546, "loss": 4.9622, "step": 330 }, { "epoch": 0.0965366387167335, "grad_norm": 3.302602767944336, "learning_rate": 0.00019367827270958958, "loss": 4.7353, "step": 331 }, { "epoch": 0.0968282901932191, "grad_norm": 3.8155040740966797, "learning_rate": 0.0001936588212410037, "loss": 4.8287, "step": 332 }, { "epoch": 0.09711994166970471, "grad_norm": 4.609050273895264, "learning_rate": 0.00019363936977241783, "loss": 5.0228, "step": 333 }, { "epoch": 0.0974115931461903, "grad_norm": 3.716909408569336, "learning_rate": 0.00019361991830383195, "loss": 4.8328, "step": 334 }, { "epoch": 0.0977032446226759, "grad_norm": 2.9868643283843994, "learning_rate": 0.00019360046683524607, "loss": 4.8028, "step": 335 }, { "epoch": 0.0979948960991615, "grad_norm": 3.2756991386413574, "learning_rate": 0.0001935810153666602, "loss": 4.6674, "step": 336 }, { "epoch": 0.0982865475756471, "grad_norm": 4.441425800323486, "learning_rate": 0.0001935615638980743, "loss": 4.522, "step": 337 }, { "epoch": 0.0985781990521327, "grad_norm": 3.0000412464141846, "learning_rate": 0.00019354211242948843, "loss": 4.6807, "step": 338 }, { "epoch": 0.0988698505286183, "grad_norm": 3.145289897918701, "learning_rate": 0.00019352266096090255, "loss": 4.4437, "step": 339 }, { "epoch": 0.0991615020051039, "grad_norm": 3.2336301803588867, "learning_rate": 0.00019350320949231668, "loss": 4.9232, "step": 340 }, { "epoch": 0.0994531534815895, "grad_norm": 3.5684351921081543, "learning_rate": 0.0001934837580237308, "loss": 4.5746, "step": 341 }, { "epoch": 0.0997448049580751, "grad_norm": 2.833815336227417, "learning_rate": 0.00019346430655514492, "loss": 4.7545, "step": 342 }, { "epoch": 0.1000364564345607, "grad_norm": 2.3592467308044434, "learning_rate": 0.00019344485508655904, "loss": 4.7484, "step": 343 }, { "epoch": 0.1003281079110463, "grad_norm": 2.742006778717041, "learning_rate": 0.00019342540361797316, "loss": 4.8234, "step": 344 }, { "epoch": 0.1006197593875319, "grad_norm": 3.556426525115967, "learning_rate": 0.00019340595214938728, "loss": 4.6295, "step": 345 }, { "epoch": 0.1009114108640175, "grad_norm": 3.1205265522003174, "learning_rate": 0.0001933865006808014, "loss": 4.3443, "step": 346 }, { "epoch": 0.1012030623405031, "grad_norm": 3.135064125061035, "learning_rate": 0.00019336704921221553, "loss": 4.4234, "step": 347 }, { "epoch": 0.1014947138169887, "grad_norm": 3.510084629058838, "learning_rate": 0.00019334759774362965, "loss": 4.5372, "step": 348 }, { "epoch": 0.1017863652934743, "grad_norm": 2.6597461700439453, "learning_rate": 0.00019332814627504377, "loss": 4.4574, "step": 349 }, { "epoch": 0.1020780167699599, "grad_norm": 4.7024712562561035, "learning_rate": 0.0001933086948064579, "loss": 4.6274, "step": 350 }, { "epoch": 0.1023696682464455, "grad_norm": 4.618969440460205, "learning_rate": 0.00019328924333787201, "loss": 5.0903, "step": 351 }, { "epoch": 0.10266131972293109, "grad_norm": 3.5818464756011963, "learning_rate": 0.00019326979186928614, "loss": 4.8043, "step": 352 }, { "epoch": 0.1029529711994167, "grad_norm": 2.565622329711914, "learning_rate": 0.00019325034040070026, "loss": 4.8352, "step": 353 }, { "epoch": 0.1032446226759023, "grad_norm": 2.1009931564331055, "learning_rate": 0.00019323088893211438, "loss": 4.5445, "step": 354 }, { "epoch": 0.10353627415238789, "grad_norm": 4.07314395904541, "learning_rate": 0.0001932114374635285, "loss": 4.7378, "step": 355 }, { "epoch": 0.1038279256288735, "grad_norm": 3.1302175521850586, "learning_rate": 0.00019319198599494265, "loss": 4.5499, "step": 356 }, { "epoch": 0.1041195771053591, "grad_norm": 3.070889949798584, "learning_rate": 0.00019317253452635674, "loss": 4.6326, "step": 357 }, { "epoch": 0.10441122858184469, "grad_norm": 5.264571666717529, "learning_rate": 0.00019315308305777086, "loss": 4.8269, "step": 358 }, { "epoch": 0.1047028800583303, "grad_norm": 6.522325038909912, "learning_rate": 0.00019313363158918499, "loss": 4.7993, "step": 359 }, { "epoch": 0.10499453153481589, "grad_norm": 3.126035213470459, "learning_rate": 0.0001931141801205991, "loss": 4.6702, "step": 360 }, { "epoch": 0.10528618301130149, "grad_norm": 2.859683036804199, "learning_rate": 0.00019309472865201323, "loss": 4.6381, "step": 361 }, { "epoch": 0.1055778344877871, "grad_norm": 3.5438621044158936, "learning_rate": 0.00019307527718342735, "loss": 5.0339, "step": 362 }, { "epoch": 0.10586948596427269, "grad_norm": 2.056342363357544, "learning_rate": 0.0001930558257148415, "loss": 4.6383, "step": 363 }, { "epoch": 0.1061611374407583, "grad_norm": 3.7751927375793457, "learning_rate": 0.0001930363742462556, "loss": 4.63, "step": 364 }, { "epoch": 0.1064527889172439, "grad_norm": 5.761468410491943, "learning_rate": 0.00019301692277766972, "loss": 4.715, "step": 365 }, { "epoch": 0.10674444039372949, "grad_norm": 3.7869679927825928, "learning_rate": 0.00019299747130908386, "loss": 4.7125, "step": 366 }, { "epoch": 0.1070360918702151, "grad_norm": 3.5277693271636963, "learning_rate": 0.00019297801984049796, "loss": 4.7548, "step": 367 }, { "epoch": 0.1073277433467007, "grad_norm": 2.613450288772583, "learning_rate": 0.00019295856837191208, "loss": 4.6297, "step": 368 }, { "epoch": 0.10761939482318629, "grad_norm": 2.255612850189209, "learning_rate": 0.0001929391169033262, "loss": 4.4428, "step": 369 }, { "epoch": 0.1079110462996719, "grad_norm": 3.5260050296783447, "learning_rate": 0.00019291966543474035, "loss": 4.9608, "step": 370 }, { "epoch": 0.10820269777615749, "grad_norm": 2.6680591106414795, "learning_rate": 0.00019290021396615445, "loss": 4.8874, "step": 371 }, { "epoch": 0.10849434925264309, "grad_norm": 3.31095290184021, "learning_rate": 0.00019288076249756857, "loss": 4.7814, "step": 372 }, { "epoch": 0.1087860007291287, "grad_norm": 4.33250617980957, "learning_rate": 0.00019286131102898272, "loss": 4.5224, "step": 373 }, { "epoch": 0.10907765220561429, "grad_norm": 4.3986334800720215, "learning_rate": 0.0001928418595603968, "loss": 4.8116, "step": 374 }, { "epoch": 0.10936930368209989, "grad_norm": 5.943734645843506, "learning_rate": 0.00019282240809181093, "loss": 4.7141, "step": 375 }, { "epoch": 0.1096609551585855, "grad_norm": 3.4450385570526123, "learning_rate": 0.00019280295662322508, "loss": 4.5488, "step": 376 }, { "epoch": 0.10995260663507109, "grad_norm": 2.6267900466918945, "learning_rate": 0.00019278350515463918, "loss": 4.7601, "step": 377 }, { "epoch": 0.11024425811155669, "grad_norm": 3.8454322814941406, "learning_rate": 0.0001927640536860533, "loss": 4.6669, "step": 378 }, { "epoch": 0.11053590958804228, "grad_norm": 2.229508638381958, "learning_rate": 0.00019274460221746742, "loss": 4.6143, "step": 379 }, { "epoch": 0.11082756106452789, "grad_norm": 3.0657124519348145, "learning_rate": 0.00019272515074888157, "loss": 4.6653, "step": 380 }, { "epoch": 0.11111921254101349, "grad_norm": 3.210451364517212, "learning_rate": 0.00019270569928029566, "loss": 4.7143, "step": 381 }, { "epoch": 0.11141086401749908, "grad_norm": 3.795510768890381, "learning_rate": 0.00019268624781170978, "loss": 4.7236, "step": 382 }, { "epoch": 0.11170251549398469, "grad_norm": 2.614006519317627, "learning_rate": 0.00019266679634312393, "loss": 4.471, "step": 383 }, { "epoch": 0.1119941669704703, "grad_norm": 3.350501298904419, "learning_rate": 0.00019264734487453803, "loss": 4.9229, "step": 384 }, { "epoch": 0.11228581844695588, "grad_norm": 3.380019426345825, "learning_rate": 0.00019262789340595215, "loss": 4.7446, "step": 385 }, { "epoch": 0.11257746992344149, "grad_norm": 3.155155658721924, "learning_rate": 0.0001926084419373663, "loss": 4.7897, "step": 386 }, { "epoch": 0.11286912139992708, "grad_norm": 2.2815022468566895, "learning_rate": 0.00019258899046878042, "loss": 4.3213, "step": 387 }, { "epoch": 0.11316077287641269, "grad_norm": 2.328812837600708, "learning_rate": 0.0001925695390001945, "loss": 4.66, "step": 388 }, { "epoch": 0.11345242435289829, "grad_norm": 2.504554033279419, "learning_rate": 0.00019255008753160863, "loss": 4.6765, "step": 389 }, { "epoch": 0.11374407582938388, "grad_norm": 3.895245313644409, "learning_rate": 0.00019253063606302278, "loss": 4.7642, "step": 390 }, { "epoch": 0.11403572730586949, "grad_norm": 3.6395151615142822, "learning_rate": 0.00019251118459443688, "loss": 4.3701, "step": 391 }, { "epoch": 0.11432737878235509, "grad_norm": 3.2197787761688232, "learning_rate": 0.000192491733125851, "loss": 4.5553, "step": 392 }, { "epoch": 0.11461903025884068, "grad_norm": 3.7138917446136475, "learning_rate": 0.00019247228165726515, "loss": 4.7593, "step": 393 }, { "epoch": 0.11491068173532629, "grad_norm": 4.130615711212158, "learning_rate": 0.00019245283018867927, "loss": 4.9652, "step": 394 }, { "epoch": 0.11520233321181189, "grad_norm": 3.1377007961273193, "learning_rate": 0.00019243337872009336, "loss": 4.6912, "step": 395 }, { "epoch": 0.11549398468829748, "grad_norm": 2.6413867473602295, "learning_rate": 0.0001924139272515075, "loss": 4.6126, "step": 396 }, { "epoch": 0.11578563616478309, "grad_norm": 2.9382095336914062, "learning_rate": 0.00019239447578292163, "loss": 4.7848, "step": 397 }, { "epoch": 0.11607728764126868, "grad_norm": 2.6897225379943848, "learning_rate": 0.00019237502431433573, "loss": 4.5613, "step": 398 }, { "epoch": 0.11636893911775428, "grad_norm": 3.609869956970215, "learning_rate": 0.00019235557284574985, "loss": 4.4661, "step": 399 }, { "epoch": 0.11666059059423989, "grad_norm": 2.837736129760742, "learning_rate": 0.000192336121377164, "loss": 4.4727, "step": 400 }, { "epoch": 0.11695224207072548, "grad_norm": 2.221576690673828, "learning_rate": 0.0001923166699085781, "loss": 4.56, "step": 401 }, { "epoch": 0.11724389354721108, "grad_norm": 2.3918964862823486, "learning_rate": 0.00019229721843999221, "loss": 4.4049, "step": 402 }, { "epoch": 0.11753554502369669, "grad_norm": 2.726630926132202, "learning_rate": 0.00019227776697140636, "loss": 4.6911, "step": 403 }, { "epoch": 0.11782719650018228, "grad_norm": 3.143047332763672, "learning_rate": 0.00019225831550282049, "loss": 4.8027, "step": 404 }, { "epoch": 0.11811884797666788, "grad_norm": 4.570982933044434, "learning_rate": 0.00019223886403423458, "loss": 4.7134, "step": 405 }, { "epoch": 0.11841049945315348, "grad_norm": 6.358057975769043, "learning_rate": 0.00019221941256564873, "loss": 4.9322, "step": 406 }, { "epoch": 0.11870215092963908, "grad_norm": 4.6042280197143555, "learning_rate": 0.00019219996109706285, "loss": 4.8617, "step": 407 }, { "epoch": 0.11899380240612469, "grad_norm": 2.500908136367798, "learning_rate": 0.00019218050962847694, "loss": 4.6078, "step": 408 }, { "epoch": 0.11928545388261028, "grad_norm": 3.5713884830474854, "learning_rate": 0.00019216105815989107, "loss": 4.5056, "step": 409 }, { "epoch": 0.11957710535909588, "grad_norm": 3.7049076557159424, "learning_rate": 0.00019214160669130521, "loss": 4.7696, "step": 410 }, { "epoch": 0.11986875683558149, "grad_norm": 3.4996488094329834, "learning_rate": 0.00019212215522271934, "loss": 4.7031, "step": 411 }, { "epoch": 0.12016040831206708, "grad_norm": 2.789867877960205, "learning_rate": 0.00019210270375413343, "loss": 4.5491, "step": 412 }, { "epoch": 0.12045205978855268, "grad_norm": 3.8946642875671387, "learning_rate": 0.00019208325228554758, "loss": 4.5002, "step": 413 }, { "epoch": 0.12074371126503827, "grad_norm": 2.8941266536712646, "learning_rate": 0.0001920638008169617, "loss": 4.5108, "step": 414 }, { "epoch": 0.12103536274152388, "grad_norm": 3.1839709281921387, "learning_rate": 0.0001920443493483758, "loss": 4.6121, "step": 415 }, { "epoch": 0.12132701421800948, "grad_norm": 2.7988791465759277, "learning_rate": 0.00019202489787978992, "loss": 4.3726, "step": 416 }, { "epoch": 0.12161866569449507, "grad_norm": 2.7462360858917236, "learning_rate": 0.00019200544641120407, "loss": 4.6386, "step": 417 }, { "epoch": 0.12191031717098068, "grad_norm": 4.876325607299805, "learning_rate": 0.0001919859949426182, "loss": 4.7313, "step": 418 }, { "epoch": 0.12220196864746628, "grad_norm": 4.21843147277832, "learning_rate": 0.00019196654347403228, "loss": 4.9746, "step": 419 }, { "epoch": 0.12249362012395187, "grad_norm": 4.178854465484619, "learning_rate": 0.00019194709200544643, "loss": 4.5262, "step": 420 }, { "epoch": 0.12278527160043748, "grad_norm": 3.1641030311584473, "learning_rate": 0.00019192764053686055, "loss": 4.8505, "step": 421 }, { "epoch": 0.12307692307692308, "grad_norm": 3.897005796432495, "learning_rate": 0.00019190818906827465, "loss": 4.7068, "step": 422 }, { "epoch": 0.12336857455340867, "grad_norm": 2.587770700454712, "learning_rate": 0.0001918887375996888, "loss": 4.4265, "step": 423 }, { "epoch": 0.12366022602989428, "grad_norm": 2.9774086475372314, "learning_rate": 0.00019186928613110292, "loss": 4.6895, "step": 424 }, { "epoch": 0.12395187750637987, "grad_norm": 2.671832323074341, "learning_rate": 0.000191849834662517, "loss": 4.5574, "step": 425 }, { "epoch": 0.12424352898286548, "grad_norm": 2.7587478160858154, "learning_rate": 0.00019183038319393113, "loss": 4.3728, "step": 426 }, { "epoch": 0.12453518045935108, "grad_norm": 2.8490169048309326, "learning_rate": 0.00019181093172534528, "loss": 4.8107, "step": 427 }, { "epoch": 0.12482683193583667, "grad_norm": 4.774936199188232, "learning_rate": 0.0001917914802567594, "loss": 4.3211, "step": 428 }, { "epoch": 0.12511848341232226, "grad_norm": 3.231813430786133, "learning_rate": 0.0001917720287881735, "loss": 4.5257, "step": 429 }, { "epoch": 0.12541013488880787, "grad_norm": 3.3284261226654053, "learning_rate": 0.00019175257731958765, "loss": 4.7809, "step": 430 }, { "epoch": 0.12570178636529347, "grad_norm": 4.575500965118408, "learning_rate": 0.00019173312585100177, "loss": 4.8047, "step": 431 }, { "epoch": 0.12599343784177908, "grad_norm": 2.3104889392852783, "learning_rate": 0.00019171367438241586, "loss": 4.618, "step": 432 }, { "epoch": 0.12628508931826468, "grad_norm": 3.1838290691375732, "learning_rate": 0.00019169422291383, "loss": 4.4081, "step": 433 }, { "epoch": 0.1265767407947503, "grad_norm": 2.3359789848327637, "learning_rate": 0.00019167477144524413, "loss": 4.6484, "step": 434 }, { "epoch": 0.12686839227123586, "grad_norm": 3.127880096435547, "learning_rate": 0.00019165531997665825, "loss": 4.6015, "step": 435 }, { "epoch": 0.12716004374772147, "grad_norm": 3.196791887283325, "learning_rate": 0.00019163586850807235, "loss": 4.6646, "step": 436 }, { "epoch": 0.12745169522420707, "grad_norm": 3.5657782554626465, "learning_rate": 0.0001916164170394865, "loss": 4.6059, "step": 437 }, { "epoch": 0.12774334670069268, "grad_norm": 3.9331247806549072, "learning_rate": 0.00019159696557090062, "loss": 4.9303, "step": 438 }, { "epoch": 0.12803499817717828, "grad_norm": 2.7160286903381348, "learning_rate": 0.00019157751410231471, "loss": 4.753, "step": 439 }, { "epoch": 0.12832664965366386, "grad_norm": 2.7744052410125732, "learning_rate": 0.00019155806263372886, "loss": 4.5471, "step": 440 }, { "epoch": 0.12861830113014946, "grad_norm": 3.170248508453369, "learning_rate": 0.00019153861116514298, "loss": 4.6244, "step": 441 }, { "epoch": 0.12890995260663507, "grad_norm": 3.1329219341278076, "learning_rate": 0.0001915191596965571, "loss": 4.6092, "step": 442 }, { "epoch": 0.12920160408312067, "grad_norm": 3.332566022872925, "learning_rate": 0.00019149970822797123, "loss": 4.5984, "step": 443 }, { "epoch": 0.12949325555960628, "grad_norm": 3.5031394958496094, "learning_rate": 0.00019148025675938535, "loss": 4.607, "step": 444 }, { "epoch": 0.12978490703609186, "grad_norm": 3.4088852405548096, "learning_rate": 0.00019146080529079947, "loss": 4.4933, "step": 445 }, { "epoch": 0.13007655851257746, "grad_norm": 3.1001057624816895, "learning_rate": 0.00019144135382221357, "loss": 4.6979, "step": 446 }, { "epoch": 0.13036820998906307, "grad_norm": 2.79952335357666, "learning_rate": 0.00019142190235362771, "loss": 4.5852, "step": 447 }, { "epoch": 0.13065986146554867, "grad_norm": 3.8515095710754395, "learning_rate": 0.00019140245088504184, "loss": 4.8019, "step": 448 }, { "epoch": 0.13095151294203428, "grad_norm": 6.422250270843506, "learning_rate": 0.00019138299941645596, "loss": 4.705, "step": 449 }, { "epoch": 0.13124316441851988, "grad_norm": 3.3716847896575928, "learning_rate": 0.00019136354794787008, "loss": 4.7552, "step": 450 }, { "epoch": 0.13153481589500546, "grad_norm": 3.6769912242889404, "learning_rate": 0.0001913440964792842, "loss": 4.4873, "step": 451 }, { "epoch": 0.13182646737149106, "grad_norm": 2.2977747917175293, "learning_rate": 0.00019132464501069832, "loss": 4.5726, "step": 452 }, { "epoch": 0.13211811884797667, "grad_norm": 2.6574201583862305, "learning_rate": 0.00019130519354211244, "loss": 4.4021, "step": 453 }, { "epoch": 0.13240977032446227, "grad_norm": 2.346829652786255, "learning_rate": 0.00019128574207352656, "loss": 4.4522, "step": 454 }, { "epoch": 0.13270142180094788, "grad_norm": 2.3006410598754883, "learning_rate": 0.0001912662906049407, "loss": 4.4613, "step": 455 }, { "epoch": 0.13299307327743345, "grad_norm": 3.6945579051971436, "learning_rate": 0.00019124683913635478, "loss": 4.5233, "step": 456 }, { "epoch": 0.13328472475391906, "grad_norm": 2.871180772781372, "learning_rate": 0.00019122738766776893, "loss": 4.9005, "step": 457 }, { "epoch": 0.13357637623040466, "grad_norm": 6.778193950653076, "learning_rate": 0.00019120793619918305, "loss": 4.5932, "step": 458 }, { "epoch": 0.13386802770689027, "grad_norm": 5.0842976570129395, "learning_rate": 0.00019118848473059717, "loss": 4.7084, "step": 459 }, { "epoch": 0.13415967918337587, "grad_norm": 3.634028196334839, "learning_rate": 0.0001911690332620113, "loss": 4.3537, "step": 460 }, { "epoch": 0.13445133065986148, "grad_norm": 2.424978494644165, "learning_rate": 0.00019114958179342542, "loss": 4.4581, "step": 461 }, { "epoch": 0.13474298213634706, "grad_norm": 2.1800954341888428, "learning_rate": 0.00019113013032483954, "loss": 4.4797, "step": 462 }, { "epoch": 0.13503463361283266, "grad_norm": 3.0263118743896484, "learning_rate": 0.00019111067885625366, "loss": 4.6167, "step": 463 }, { "epoch": 0.13532628508931827, "grad_norm": 2.8931994438171387, "learning_rate": 0.00019109122738766778, "loss": 4.6461, "step": 464 }, { "epoch": 0.13561793656580387, "grad_norm": 5.102127552032471, "learning_rate": 0.0001910717759190819, "loss": 4.7828, "step": 465 }, { "epoch": 0.13590958804228948, "grad_norm": 2.667389154434204, "learning_rate": 0.00019105232445049602, "loss": 4.429, "step": 466 }, { "epoch": 0.13620123951877505, "grad_norm": 3.4943718910217285, "learning_rate": 0.00019103287298191015, "loss": 4.676, "step": 467 }, { "epoch": 0.13649289099526066, "grad_norm": 3.3478949069976807, "learning_rate": 0.00019101342151332427, "loss": 4.5488, "step": 468 }, { "epoch": 0.13678454247174626, "grad_norm": 3.2986929416656494, "learning_rate": 0.0001909939700447384, "loss": 4.4084, "step": 469 }, { "epoch": 0.13707619394823187, "grad_norm": 1.9929161071777344, "learning_rate": 0.0001909745185761525, "loss": 4.5166, "step": 470 }, { "epoch": 0.13736784542471747, "grad_norm": 2.4314489364624023, "learning_rate": 0.00019095506710756663, "loss": 4.2224, "step": 471 }, { "epoch": 0.13765949690120305, "grad_norm": 3.474910259246826, "learning_rate": 0.00019093561563898075, "loss": 4.825, "step": 472 }, { "epoch": 0.13795114837768865, "grad_norm": 2.542742967605591, "learning_rate": 0.00019091616417039488, "loss": 4.3931, "step": 473 }, { "epoch": 0.13824279985417426, "grad_norm": 2.8264899253845215, "learning_rate": 0.000190896712701809, "loss": 4.7344, "step": 474 }, { "epoch": 0.13853445133065986, "grad_norm": 2.5735552310943604, "learning_rate": 0.00019087726123322312, "loss": 4.42, "step": 475 }, { "epoch": 0.13882610280714547, "grad_norm": 3.6205849647521973, "learning_rate": 0.00019085780976463724, "loss": 4.5983, "step": 476 }, { "epoch": 0.13911775428363107, "grad_norm": 3.231921911239624, "learning_rate": 0.00019083835829605136, "loss": 4.6319, "step": 477 }, { "epoch": 0.13940940576011665, "grad_norm": 3.27740216255188, "learning_rate": 0.00019081890682746548, "loss": 4.6396, "step": 478 }, { "epoch": 0.13970105723660226, "grad_norm": 3.273685932159424, "learning_rate": 0.0001907994553588796, "loss": 4.8445, "step": 479 }, { "epoch": 0.13999270871308786, "grad_norm": 2.3555538654327393, "learning_rate": 0.00019078000389029373, "loss": 4.6594, "step": 480 }, { "epoch": 0.14028436018957346, "grad_norm": 3.45271372795105, "learning_rate": 0.00019076055242170785, "loss": 4.4583, "step": 481 }, { "epoch": 0.14057601166605907, "grad_norm": 3.143744945526123, "learning_rate": 0.00019074110095312197, "loss": 4.5665, "step": 482 }, { "epoch": 0.14086766314254465, "grad_norm": 3.995802164077759, "learning_rate": 0.0001907216494845361, "loss": 4.6622, "step": 483 }, { "epoch": 0.14115931461903025, "grad_norm": 4.105745792388916, "learning_rate": 0.0001907021980159502, "loss": 4.6374, "step": 484 }, { "epoch": 0.14145096609551586, "grad_norm": 4.389105319976807, "learning_rate": 0.00019068274654736433, "loss": 4.5818, "step": 485 }, { "epoch": 0.14174261757200146, "grad_norm": 4.7971601486206055, "learning_rate": 0.00019066329507877846, "loss": 4.7525, "step": 486 }, { "epoch": 0.14203426904848707, "grad_norm": 3.1032824516296387, "learning_rate": 0.00019064384361019258, "loss": 4.8364, "step": 487 }, { "epoch": 0.14232592052497267, "grad_norm": 1.9772573709487915, "learning_rate": 0.0001906243921416067, "loss": 4.7575, "step": 488 }, { "epoch": 0.14261757200145825, "grad_norm": 2.044980049133301, "learning_rate": 0.00019060494067302082, "loss": 4.5359, "step": 489 }, { "epoch": 0.14290922347794385, "grad_norm": 3.601925849914551, "learning_rate": 0.00019058548920443494, "loss": 4.5427, "step": 490 }, { "epoch": 0.14320087495442946, "grad_norm": 2.1400558948516846, "learning_rate": 0.00019056603773584906, "loss": 4.5254, "step": 491 }, { "epoch": 0.14349252643091506, "grad_norm": 5.658024311065674, "learning_rate": 0.00019054658626726319, "loss": 4.7312, "step": 492 }, { "epoch": 0.14378417790740067, "grad_norm": 5.837948322296143, "learning_rate": 0.0001905271347986773, "loss": 4.8606, "step": 493 }, { "epoch": 0.14407582938388624, "grad_norm": 3.977720022201538, "learning_rate": 0.00019050768333009143, "loss": 4.7481, "step": 494 }, { "epoch": 0.14436748086037185, "grad_norm": 2.818211555480957, "learning_rate": 0.00019048823186150555, "loss": 4.7371, "step": 495 }, { "epoch": 0.14465913233685745, "grad_norm": 3.469208240509033, "learning_rate": 0.00019046878039291967, "loss": 5.1479, "step": 496 }, { "epoch": 0.14495078381334306, "grad_norm": 4.103572845458984, "learning_rate": 0.0001904493289243338, "loss": 4.6394, "step": 497 }, { "epoch": 0.14524243528982866, "grad_norm": 3.443567991256714, "learning_rate": 0.00019042987745574792, "loss": 4.9119, "step": 498 }, { "epoch": 0.14553408676631424, "grad_norm": 3.5618796348571777, "learning_rate": 0.00019041042598716204, "loss": 4.8648, "step": 499 }, { "epoch": 0.14582573824279985, "grad_norm": 2.1778645515441895, "learning_rate": 0.00019039097451857616, "loss": 4.7058, "step": 500 }, { "epoch": 0.14611738971928545, "grad_norm": 7.1377482414245605, "learning_rate": 0.00019037152304999028, "loss": 4.626, "step": 501 }, { "epoch": 0.14640904119577106, "grad_norm": 4.531087398529053, "learning_rate": 0.0001903520715814044, "loss": 4.5241, "step": 502 }, { "epoch": 0.14670069267225666, "grad_norm": 3.051013231277466, "learning_rate": 0.00019033262011281852, "loss": 4.6937, "step": 503 }, { "epoch": 0.14699234414874227, "grad_norm": 3.6335387229919434, "learning_rate": 0.00019031316864423264, "loss": 4.6851, "step": 504 }, { "epoch": 0.14728399562522784, "grad_norm": 3.148088216781616, "learning_rate": 0.00019029371717564677, "loss": 4.5532, "step": 505 }, { "epoch": 0.14757564710171345, "grad_norm": 2.7948524951934814, "learning_rate": 0.0001902742657070609, "loss": 4.7331, "step": 506 }, { "epoch": 0.14786729857819905, "grad_norm": 3.577493906021118, "learning_rate": 0.000190254814238475, "loss": 4.5979, "step": 507 }, { "epoch": 0.14815895005468466, "grad_norm": 3.9706740379333496, "learning_rate": 0.00019023536276988913, "loss": 4.8843, "step": 508 }, { "epoch": 0.14845060153117026, "grad_norm": 2.795764207839966, "learning_rate": 0.00019021591130130325, "loss": 4.5881, "step": 509 }, { "epoch": 0.14874225300765584, "grad_norm": 2.351940393447876, "learning_rate": 0.0001901964598327174, "loss": 4.2137, "step": 510 }, { "epoch": 0.14903390448414144, "grad_norm": 3.4490065574645996, "learning_rate": 0.0001901770083641315, "loss": 4.6189, "step": 511 }, { "epoch": 0.14932555596062705, "grad_norm": 2.570460081100464, "learning_rate": 0.00019015755689554562, "loss": 4.5548, "step": 512 }, { "epoch": 0.14961720743711265, "grad_norm": 4.5603532791137695, "learning_rate": 0.00019013810542695974, "loss": 4.6562, "step": 513 }, { "epoch": 0.14990885891359826, "grad_norm": 3.8804543018341064, "learning_rate": 0.00019011865395837386, "loss": 4.7996, "step": 514 }, { "epoch": 0.15020051039008386, "grad_norm": 2.9861841201782227, "learning_rate": 0.00019009920248978798, "loss": 4.5711, "step": 515 }, { "epoch": 0.15049216186656944, "grad_norm": 6.319651126861572, "learning_rate": 0.0001900797510212021, "loss": 4.8051, "step": 516 }, { "epoch": 0.15078381334305505, "grad_norm": 2.9608216285705566, "learning_rate": 0.00019006029955261623, "loss": 4.4823, "step": 517 }, { "epoch": 0.15107546481954065, "grad_norm": 2.776097297668457, "learning_rate": 0.00019004084808403035, "loss": 4.7044, "step": 518 }, { "epoch": 0.15136711629602626, "grad_norm": 3.118093490600586, "learning_rate": 0.00019002139661544447, "loss": 4.5742, "step": 519 }, { "epoch": 0.15165876777251186, "grad_norm": 4.802477836608887, "learning_rate": 0.00019000194514685862, "loss": 4.5711, "step": 520 }, { "epoch": 0.15195041924899744, "grad_norm": 3.1876189708709717, "learning_rate": 0.0001899824936782727, "loss": 4.6913, "step": 521 }, { "epoch": 0.15224207072548304, "grad_norm": 3.4085171222686768, "learning_rate": 0.00018996304220968683, "loss": 4.5941, "step": 522 }, { "epoch": 0.15253372220196865, "grad_norm": 2.6446216106414795, "learning_rate": 0.00018994359074110095, "loss": 4.5668, "step": 523 }, { "epoch": 0.15282537367845425, "grad_norm": 2.2282896041870117, "learning_rate": 0.00018992413927251508, "loss": 4.3466, "step": 524 }, { "epoch": 0.15311702515493986, "grad_norm": 3.2351644039154053, "learning_rate": 0.0001899046878039292, "loss": 4.8119, "step": 525 }, { "epoch": 0.15340867663142543, "grad_norm": 2.2451322078704834, "learning_rate": 0.00018988523633534332, "loss": 4.4239, "step": 526 }, { "epoch": 0.15370032810791104, "grad_norm": 2.7645018100738525, "learning_rate": 0.00018986578486675747, "loss": 4.8063, "step": 527 }, { "epoch": 0.15399197958439664, "grad_norm": 3.597245693206787, "learning_rate": 0.00018984633339817156, "loss": 4.5596, "step": 528 }, { "epoch": 0.15428363106088225, "grad_norm": 2.206756830215454, "learning_rate": 0.00018982688192958568, "loss": 4.551, "step": 529 }, { "epoch": 0.15457528253736785, "grad_norm": 3.3251848220825195, "learning_rate": 0.00018980743046099983, "loss": 4.5933, "step": 530 }, { "epoch": 0.15486693401385346, "grad_norm": 2.5595338344573975, "learning_rate": 0.00018978797899241393, "loss": 4.3339, "step": 531 }, { "epoch": 0.15515858549033903, "grad_norm": 3.148743152618408, "learning_rate": 0.00018976852752382805, "loss": 4.7748, "step": 532 }, { "epoch": 0.15545023696682464, "grad_norm": 6.097256183624268, "learning_rate": 0.00018974907605524217, "loss": 4.6075, "step": 533 }, { "epoch": 0.15574188844331024, "grad_norm": 5.444161415100098, "learning_rate": 0.00018972962458665632, "loss": 4.5372, "step": 534 }, { "epoch": 0.15603353991979585, "grad_norm": 3.0859375, "learning_rate": 0.00018971017311807041, "loss": 4.5904, "step": 535 }, { "epoch": 0.15632519139628145, "grad_norm": 3.6680972576141357, "learning_rate": 0.00018969072164948454, "loss": 4.6288, "step": 536 }, { "epoch": 0.15661684287276703, "grad_norm": 3.3787848949432373, "learning_rate": 0.00018967127018089868, "loss": 4.431, "step": 537 }, { "epoch": 0.15690849434925264, "grad_norm": 2.9840028285980225, "learning_rate": 0.00018965181871231278, "loss": 4.4421, "step": 538 }, { "epoch": 0.15720014582573824, "grad_norm": 2.5077617168426514, "learning_rate": 0.0001896323672437269, "loss": 4.4895, "step": 539 }, { "epoch": 0.15749179730222385, "grad_norm": 2.6565470695495605, "learning_rate": 0.00018961291577514105, "loss": 4.4374, "step": 540 }, { "epoch": 0.15778344877870945, "grad_norm": 3.7739250659942627, "learning_rate": 0.00018959346430655517, "loss": 4.4222, "step": 541 }, { "epoch": 0.15807510025519506, "grad_norm": 2.848496675491333, "learning_rate": 0.00018957401283796927, "loss": 4.6025, "step": 542 }, { "epoch": 0.15836675173168063, "grad_norm": 3.3667349815368652, "learning_rate": 0.0001895545613693834, "loss": 4.5293, "step": 543 }, { "epoch": 0.15865840320816624, "grad_norm": 3.3874528408050537, "learning_rate": 0.00018953510990079754, "loss": 4.6658, "step": 544 }, { "epoch": 0.15895005468465184, "grad_norm": 3.9956984519958496, "learning_rate": 0.00018951565843221163, "loss": 4.7495, "step": 545 }, { "epoch": 0.15924170616113745, "grad_norm": 2.721524715423584, "learning_rate": 0.00018949620696362575, "loss": 4.7796, "step": 546 }, { "epoch": 0.15953335763762305, "grad_norm": 3.064499855041504, "learning_rate": 0.0001894767554950399, "loss": 4.6285, "step": 547 }, { "epoch": 0.15982500911410863, "grad_norm": 2.605491876602173, "learning_rate": 0.000189457304026454, "loss": 4.5181, "step": 548 }, { "epoch": 0.16011666059059423, "grad_norm": 3.150367021560669, "learning_rate": 0.00018943785255786812, "loss": 4.7941, "step": 549 }, { "epoch": 0.16040831206707984, "grad_norm": 2.6047489643096924, "learning_rate": 0.00018941840108928227, "loss": 4.6363, "step": 550 }, { "epoch": 0.16069996354356544, "grad_norm": 2.53788685798645, "learning_rate": 0.0001893989496206964, "loss": 4.4579, "step": 551 }, { "epoch": 0.16099161502005105, "grad_norm": 3.6476316452026367, "learning_rate": 0.00018937949815211048, "loss": 4.4497, "step": 552 }, { "epoch": 0.16128326649653663, "grad_norm": 3.0124099254608154, "learning_rate": 0.0001893600466835246, "loss": 4.7189, "step": 553 }, { "epoch": 0.16157491797302223, "grad_norm": 2.1160964965820312, "learning_rate": 0.00018934059521493875, "loss": 4.5891, "step": 554 }, { "epoch": 0.16186656944950784, "grad_norm": 2.8185276985168457, "learning_rate": 0.00018932114374635285, "loss": 4.4306, "step": 555 }, { "epoch": 0.16215822092599344, "grad_norm": 2.8464012145996094, "learning_rate": 0.00018930169227776697, "loss": 4.5456, "step": 556 }, { "epoch": 0.16244987240247905, "grad_norm": 3.7114083766937256, "learning_rate": 0.00018928224080918112, "loss": 4.8416, "step": 557 }, { "epoch": 0.16274152387896465, "grad_norm": 3.525038242340088, "learning_rate": 0.00018926278934059524, "loss": 4.771, "step": 558 }, { "epoch": 0.16303317535545023, "grad_norm": 3.318035125732422, "learning_rate": 0.00018924333787200933, "loss": 4.4613, "step": 559 }, { "epoch": 0.16332482683193583, "grad_norm": 3.71954083442688, "learning_rate": 0.00018922388640342348, "loss": 4.4378, "step": 560 }, { "epoch": 0.16361647830842144, "grad_norm": 3.215571880340576, "learning_rate": 0.0001892044349348376, "loss": 4.6356, "step": 561 }, { "epoch": 0.16390812978490704, "grad_norm": 2.849475383758545, "learning_rate": 0.0001891849834662517, "loss": 4.9485, "step": 562 }, { "epoch": 0.16419978126139265, "grad_norm": 3.084679126739502, "learning_rate": 0.00018916553199766582, "loss": 4.521, "step": 563 }, { "epoch": 0.16449143273787822, "grad_norm": 3.3428761959075928, "learning_rate": 0.00018914608052907997, "loss": 4.6831, "step": 564 }, { "epoch": 0.16478308421436383, "grad_norm": 2.731468677520752, "learning_rate": 0.0001891266290604941, "loss": 4.706, "step": 565 }, { "epoch": 0.16507473569084943, "grad_norm": 3.222507953643799, "learning_rate": 0.00018910717759190818, "loss": 4.3969, "step": 566 }, { "epoch": 0.16536638716733504, "grad_norm": 2.8675901889801025, "learning_rate": 0.00018908772612332233, "loss": 4.6465, "step": 567 }, { "epoch": 0.16565803864382064, "grad_norm": 3.3803517818450928, "learning_rate": 0.00018906827465473645, "loss": 4.1242, "step": 568 }, { "epoch": 0.16594969012030622, "grad_norm": 2.652334690093994, "learning_rate": 0.00018904882318615055, "loss": 4.8853, "step": 569 }, { "epoch": 0.16624134159679183, "grad_norm": 3.6649065017700195, "learning_rate": 0.0001890293717175647, "loss": 4.4866, "step": 570 }, { "epoch": 0.16653299307327743, "grad_norm": 3.5787675380706787, "learning_rate": 0.00018900992024897882, "loss": 4.4923, "step": 571 }, { "epoch": 0.16682464454976303, "grad_norm": 3.119187831878662, "learning_rate": 0.0001889904687803929, "loss": 4.3956, "step": 572 }, { "epoch": 0.16711629602624864, "grad_norm": 3.5303168296813965, "learning_rate": 0.00018897101731180703, "loss": 4.6909, "step": 573 }, { "epoch": 0.16740794750273424, "grad_norm": 2.2574052810668945, "learning_rate": 0.00018895156584322118, "loss": 4.5607, "step": 574 }, { "epoch": 0.16769959897921982, "grad_norm": 2.444497585296631, "learning_rate": 0.0001889321143746353, "loss": 4.5466, "step": 575 }, { "epoch": 0.16799125045570543, "grad_norm": 3.646522283554077, "learning_rate": 0.0001889126629060494, "loss": 4.8883, "step": 576 }, { "epoch": 0.16828290193219103, "grad_norm": 3.169597864151001, "learning_rate": 0.00018889321143746355, "loss": 4.6631, "step": 577 }, { "epoch": 0.16857455340867664, "grad_norm": 3.083824396133423, "learning_rate": 0.00018887375996887767, "loss": 4.6211, "step": 578 }, { "epoch": 0.16886620488516224, "grad_norm": 2.7449772357940674, "learning_rate": 0.00018885430850029176, "loss": 4.7378, "step": 579 }, { "epoch": 0.16915785636164782, "grad_norm": 3.5369606018066406, "learning_rate": 0.00018883485703170589, "loss": 4.6576, "step": 580 }, { "epoch": 0.16944950783813342, "grad_norm": 4.087699890136719, "learning_rate": 0.00018881540556312003, "loss": 4.4413, "step": 581 }, { "epoch": 0.16974115931461903, "grad_norm": 2.7086215019226074, "learning_rate": 0.00018879595409453416, "loss": 4.4391, "step": 582 }, { "epoch": 0.17003281079110463, "grad_norm": 2.8197808265686035, "learning_rate": 0.00018877650262594825, "loss": 4.8348, "step": 583 }, { "epoch": 0.17032446226759024, "grad_norm": 3.2810144424438477, "learning_rate": 0.0001887570511573624, "loss": 4.5934, "step": 584 }, { "epoch": 0.17061611374407584, "grad_norm": 3.4460465908050537, "learning_rate": 0.00018873759968877652, "loss": 4.5317, "step": 585 }, { "epoch": 0.17090776522056142, "grad_norm": 2.5300745964050293, "learning_rate": 0.00018871814822019062, "loss": 4.7029, "step": 586 }, { "epoch": 0.17119941669704702, "grad_norm": 4.322498321533203, "learning_rate": 0.00018869869675160476, "loss": 4.6568, "step": 587 }, { "epoch": 0.17149106817353263, "grad_norm": 3.610750675201416, "learning_rate": 0.00018867924528301889, "loss": 4.5233, "step": 588 }, { "epoch": 0.17178271965001823, "grad_norm": 4.325952053070068, "learning_rate": 0.000188659793814433, "loss": 4.696, "step": 589 }, { "epoch": 0.17207437112650384, "grad_norm": 2.3732073307037354, "learning_rate": 0.0001886403423458471, "loss": 4.6441, "step": 590 }, { "epoch": 0.17236602260298942, "grad_norm": 3.8870480060577393, "learning_rate": 0.00018862089087726125, "loss": 4.5103, "step": 591 }, { "epoch": 0.17265767407947502, "grad_norm": 2.577425003051758, "learning_rate": 0.00018860143940867537, "loss": 4.3057, "step": 592 }, { "epoch": 0.17294932555596063, "grad_norm": 3.7582345008850098, "learning_rate": 0.00018858198794008947, "loss": 4.6547, "step": 593 }, { "epoch": 0.17324097703244623, "grad_norm": 3.7345097064971924, "learning_rate": 0.00018856253647150362, "loss": 4.6569, "step": 594 }, { "epoch": 0.17353262850893184, "grad_norm": 3.663644790649414, "learning_rate": 0.00018854308500291774, "loss": 4.7613, "step": 595 }, { "epoch": 0.1738242799854174, "grad_norm": 3.660203695297241, "learning_rate": 0.00018852363353433186, "loss": 4.835, "step": 596 }, { "epoch": 0.17411593146190302, "grad_norm": 3.222712993621826, "learning_rate": 0.00018850418206574598, "loss": 4.6657, "step": 597 }, { "epoch": 0.17440758293838862, "grad_norm": 2.643040180206299, "learning_rate": 0.0001884847305971601, "loss": 4.4167, "step": 598 }, { "epoch": 0.17469923441487423, "grad_norm": 3.562634229660034, "learning_rate": 0.00018846527912857422, "loss": 4.7466, "step": 599 }, { "epoch": 0.17499088589135983, "grad_norm": 2.9751503467559814, "learning_rate": 0.00018844582765998832, "loss": 4.6042, "step": 600 }, { "epoch": 0.17528253736784544, "grad_norm": 2.790027379989624, "learning_rate": 0.00018842637619140247, "loss": 4.6669, "step": 601 }, { "epoch": 0.17557418884433101, "grad_norm": 3.722815990447998, "learning_rate": 0.0001884069247228166, "loss": 4.8478, "step": 602 }, { "epoch": 0.17586584032081662, "grad_norm": 3.819136142730713, "learning_rate": 0.00018838747325423068, "loss": 4.7765, "step": 603 }, { "epoch": 0.17615749179730222, "grad_norm": 3.289628744125366, "learning_rate": 0.00018836802178564483, "loss": 4.7139, "step": 604 }, { "epoch": 0.17644914327378783, "grad_norm": 3.6192362308502197, "learning_rate": 0.00018834857031705895, "loss": 4.787, "step": 605 }, { "epoch": 0.17674079475027343, "grad_norm": 3.3440685272216797, "learning_rate": 0.00018832911884847307, "loss": 4.8415, "step": 606 }, { "epoch": 0.177032446226759, "grad_norm": 4.521566867828369, "learning_rate": 0.0001883096673798872, "loss": 4.6141, "step": 607 }, { "epoch": 0.17732409770324462, "grad_norm": 6.254881381988525, "learning_rate": 0.00018829021591130132, "loss": 4.8808, "step": 608 }, { "epoch": 0.17761574917973022, "grad_norm": 3.706969738006592, "learning_rate": 0.00018827076444271544, "loss": 4.7277, "step": 609 }, { "epoch": 0.17790740065621582, "grad_norm": 2.366523265838623, "learning_rate": 0.00018825131297412953, "loss": 4.4855, "step": 610 }, { "epoch": 0.17819905213270143, "grad_norm": 2.9035117626190186, "learning_rate": 0.00018823186150554368, "loss": 4.8907, "step": 611 }, { "epoch": 0.17849070360918703, "grad_norm": 3.9948368072509766, "learning_rate": 0.0001882124100369578, "loss": 4.7338, "step": 612 }, { "epoch": 0.1787823550856726, "grad_norm": 3.4792466163635254, "learning_rate": 0.00018819295856837193, "loss": 4.6561, "step": 613 }, { "epoch": 0.17907400656215822, "grad_norm": 3.49200701713562, "learning_rate": 0.00018817350709978605, "loss": 4.5276, "step": 614 }, { "epoch": 0.17936565803864382, "grad_norm": 3.078486442565918, "learning_rate": 0.00018815405563120017, "loss": 4.7687, "step": 615 }, { "epoch": 0.17965730951512943, "grad_norm": 2.484663248062134, "learning_rate": 0.0001881346041626143, "loss": 4.5021, "step": 616 }, { "epoch": 0.17994896099161503, "grad_norm": 2.7570674419403076, "learning_rate": 0.0001881151526940284, "loss": 4.5515, "step": 617 }, { "epoch": 0.1802406124681006, "grad_norm": 3.219447612762451, "learning_rate": 0.00018809570122544253, "loss": 4.5329, "step": 618 }, { "epoch": 0.1805322639445862, "grad_norm": 4.390439510345459, "learning_rate": 0.00018807624975685666, "loss": 4.5944, "step": 619 }, { "epoch": 0.18082391542107182, "grad_norm": 3.9557864665985107, "learning_rate": 0.00018805679828827078, "loss": 5.0022, "step": 620 }, { "epoch": 0.18111556689755742, "grad_norm": 3.08716082572937, "learning_rate": 0.0001880373468196849, "loss": 4.5433, "step": 621 }, { "epoch": 0.18140721837404303, "grad_norm": 3.3325228691101074, "learning_rate": 0.00018801789535109902, "loss": 4.8576, "step": 622 }, { "epoch": 0.1816988698505286, "grad_norm": 2.8664073944091797, "learning_rate": 0.00018799844388251314, "loss": 4.3915, "step": 623 }, { "epoch": 0.1819905213270142, "grad_norm": 4.884296417236328, "learning_rate": 0.00018797899241392726, "loss": 4.5123, "step": 624 }, { "epoch": 0.18228217280349981, "grad_norm": 2.1718978881835938, "learning_rate": 0.00018795954094534138, "loss": 4.2427, "step": 625 }, { "epoch": 0.18257382427998542, "grad_norm": 2.4223806858062744, "learning_rate": 0.0001879400894767555, "loss": 4.4784, "step": 626 }, { "epoch": 0.18286547575647102, "grad_norm": 1.8622790575027466, "learning_rate": 0.00018792063800816963, "loss": 4.5399, "step": 627 }, { "epoch": 0.18315712723295663, "grad_norm": 2.266139268875122, "learning_rate": 0.00018790118653958375, "loss": 4.4854, "step": 628 }, { "epoch": 0.1834487787094422, "grad_norm": 4.574977397918701, "learning_rate": 0.00018788173507099787, "loss": 4.7456, "step": 629 }, { "epoch": 0.1837404301859278, "grad_norm": 2.6676788330078125, "learning_rate": 0.000187862283602412, "loss": 4.727, "step": 630 }, { "epoch": 0.18403208166241342, "grad_norm": 4.47611665725708, "learning_rate": 0.00018784283213382611, "loss": 4.596, "step": 631 }, { "epoch": 0.18432373313889902, "grad_norm": 2.5077505111694336, "learning_rate": 0.00018782338066524024, "loss": 4.5156, "step": 632 }, { "epoch": 0.18461538461538463, "grad_norm": 3.3039302825927734, "learning_rate": 0.00018780392919665436, "loss": 4.9082, "step": 633 }, { "epoch": 0.1849070360918702, "grad_norm": 3.1767868995666504, "learning_rate": 0.00018778447772806848, "loss": 4.7401, "step": 634 }, { "epoch": 0.1851986875683558, "grad_norm": 2.693634033203125, "learning_rate": 0.0001877650262594826, "loss": 4.7712, "step": 635 }, { "epoch": 0.1854903390448414, "grad_norm": 2.9141695499420166, "learning_rate": 0.00018774557479089672, "loss": 4.3002, "step": 636 }, { "epoch": 0.18578199052132702, "grad_norm": 3.60662579536438, "learning_rate": 0.00018772612332231084, "loss": 4.5655, "step": 637 }, { "epoch": 0.18607364199781262, "grad_norm": 2.9903082847595215, "learning_rate": 0.00018770667185372497, "loss": 4.7543, "step": 638 }, { "epoch": 0.18636529347429823, "grad_norm": 3.3168880939483643, "learning_rate": 0.0001876872203851391, "loss": 4.6594, "step": 639 }, { "epoch": 0.1866569449507838, "grad_norm": 2.971438407897949, "learning_rate": 0.0001876677689165532, "loss": 4.449, "step": 640 }, { "epoch": 0.1869485964272694, "grad_norm": 2.422393560409546, "learning_rate": 0.00018764831744796733, "loss": 4.4864, "step": 641 }, { "epoch": 0.187240247903755, "grad_norm": 3.291090488433838, "learning_rate": 0.00018762886597938145, "loss": 4.7474, "step": 642 }, { "epoch": 0.18753189938024062, "grad_norm": 3.0077943801879883, "learning_rate": 0.00018760941451079557, "loss": 4.4874, "step": 643 }, { "epoch": 0.18782355085672622, "grad_norm": 3.1237118244171143, "learning_rate": 0.0001875899630422097, "loss": 4.8468, "step": 644 }, { "epoch": 0.1881152023332118, "grad_norm": 4.647111892700195, "learning_rate": 0.00018757051157362382, "loss": 4.7506, "step": 645 }, { "epoch": 0.1884068538096974, "grad_norm": 2.422553539276123, "learning_rate": 0.00018755106010503794, "loss": 4.5828, "step": 646 }, { "epoch": 0.188698505286183, "grad_norm": 2.510828971862793, "learning_rate": 0.00018753160863645206, "loss": 4.6438, "step": 647 }, { "epoch": 0.18899015676266862, "grad_norm": 3.2256765365600586, "learning_rate": 0.00018751215716786618, "loss": 4.5122, "step": 648 }, { "epoch": 0.18928180823915422, "grad_norm": 3.5184805393218994, "learning_rate": 0.0001874927056992803, "loss": 4.616, "step": 649 }, { "epoch": 0.1895734597156398, "grad_norm": 3.0099148750305176, "learning_rate": 0.00018747325423069442, "loss": 4.2848, "step": 650 }, { "epoch": 0.1898651111921254, "grad_norm": 3.2419188022613525, "learning_rate": 0.00018745380276210855, "loss": 4.5677, "step": 651 }, { "epoch": 0.190156762668611, "grad_norm": 3.1540980339050293, "learning_rate": 0.00018743435129352267, "loss": 4.5497, "step": 652 }, { "epoch": 0.1904484141450966, "grad_norm": 2.779902458190918, "learning_rate": 0.0001874148998249368, "loss": 4.4334, "step": 653 }, { "epoch": 0.19074006562158222, "grad_norm": 2.7975714206695557, "learning_rate": 0.0001873954483563509, "loss": 4.5651, "step": 654 }, { "epoch": 0.19103171709806782, "grad_norm": 3.532451868057251, "learning_rate": 0.00018737599688776503, "loss": 4.8306, "step": 655 }, { "epoch": 0.1913233685745534, "grad_norm": 2.576568603515625, "learning_rate": 0.00018735654541917915, "loss": 4.1825, "step": 656 }, { "epoch": 0.191615020051039, "grad_norm": 3.7216835021972656, "learning_rate": 0.0001873370939505933, "loss": 4.3464, "step": 657 }, { "epoch": 0.1919066715275246, "grad_norm": 3.615727424621582, "learning_rate": 0.0001873176424820074, "loss": 4.6756, "step": 658 }, { "epoch": 0.1921983230040102, "grad_norm": 3.0763440132141113, "learning_rate": 0.00018729819101342152, "loss": 4.3229, "step": 659 }, { "epoch": 0.19248997448049582, "grad_norm": 4.843511581420898, "learning_rate": 0.00018727873954483564, "loss": 4.5437, "step": 660 }, { "epoch": 0.1927816259569814, "grad_norm": 2.931809425354004, "learning_rate": 0.00018725928807624976, "loss": 4.5791, "step": 661 }, { "epoch": 0.193073277433467, "grad_norm": 2.7375307083129883, "learning_rate": 0.00018723983660766388, "loss": 4.5051, "step": 662 }, { "epoch": 0.1933649289099526, "grad_norm": 2.751859664916992, "learning_rate": 0.000187220385139078, "loss": 4.2908, "step": 663 }, { "epoch": 0.1936565803864382, "grad_norm": 3.7624051570892334, "learning_rate": 0.00018720093367049213, "loss": 4.662, "step": 664 }, { "epoch": 0.19394823186292381, "grad_norm": 2.8341257572174072, "learning_rate": 0.00018718148220190625, "loss": 4.6737, "step": 665 }, { "epoch": 0.19423988333940942, "grad_norm": 3.436225414276123, "learning_rate": 0.00018716203073332037, "loss": 4.8292, "step": 666 }, { "epoch": 0.194531534815895, "grad_norm": 2.566385269165039, "learning_rate": 0.0001871425792647345, "loss": 4.4935, "step": 667 }, { "epoch": 0.1948231862923806, "grad_norm": 2.387192487716675, "learning_rate": 0.0001871231277961486, "loss": 4.9607, "step": 668 }, { "epoch": 0.1951148377688662, "grad_norm": 3.110121488571167, "learning_rate": 0.00018710367632756273, "loss": 4.7985, "step": 669 }, { "epoch": 0.1954064892453518, "grad_norm": 3.0479259490966797, "learning_rate": 0.00018708422485897686, "loss": 4.9659, "step": 670 }, { "epoch": 0.19569814072183742, "grad_norm": 2.9323301315307617, "learning_rate": 0.00018706477339039098, "loss": 4.5088, "step": 671 }, { "epoch": 0.195989792198323, "grad_norm": 2.8888635635375977, "learning_rate": 0.0001870453219218051, "loss": 4.6899, "step": 672 }, { "epoch": 0.1962814436748086, "grad_norm": 3.3222408294677734, "learning_rate": 0.00018702587045321922, "loss": 4.5193, "step": 673 }, { "epoch": 0.1965730951512942, "grad_norm": 3.4004952907562256, "learning_rate": 0.00018700641898463337, "loss": 4.3743, "step": 674 }, { "epoch": 0.1968647466277798, "grad_norm": 2.95086669921875, "learning_rate": 0.00018698696751604746, "loss": 4.5267, "step": 675 }, { "epoch": 0.1971563981042654, "grad_norm": 1.6148622035980225, "learning_rate": 0.00018696751604746159, "loss": 4.4974, "step": 676 }, { "epoch": 0.197448049580751, "grad_norm": 3.118873119354248, "learning_rate": 0.0001869480645788757, "loss": 4.3982, "step": 677 }, { "epoch": 0.1977397010572366, "grad_norm": 3.500903606414795, "learning_rate": 0.00018692861311028983, "loss": 4.4877, "step": 678 }, { "epoch": 0.1980313525337222, "grad_norm": 3.411710500717163, "learning_rate": 0.00018690916164170395, "loss": 4.729, "step": 679 }, { "epoch": 0.1983230040102078, "grad_norm": 2.9313700199127197, "learning_rate": 0.00018688971017311807, "loss": 4.6544, "step": 680 }, { "epoch": 0.1986146554866934, "grad_norm": 3.0305533409118652, "learning_rate": 0.00018687025870453222, "loss": 4.4624, "step": 681 }, { "epoch": 0.198906306963179, "grad_norm": 2.4705615043640137, "learning_rate": 0.00018685080723594632, "loss": 4.7432, "step": 682 }, { "epoch": 0.1991979584396646, "grad_norm": 5.059365749359131, "learning_rate": 0.00018683135576736044, "loss": 4.9177, "step": 683 }, { "epoch": 0.1994896099161502, "grad_norm": 2.892500400543213, "learning_rate": 0.00018681190429877459, "loss": 4.7365, "step": 684 }, { "epoch": 0.1997812613926358, "grad_norm": 3.5583925247192383, "learning_rate": 0.00018679245283018868, "loss": 4.7238, "step": 685 }, { "epoch": 0.2000729128691214, "grad_norm": 4.012203693389893, "learning_rate": 0.0001867730013616028, "loss": 4.3365, "step": 686 }, { "epoch": 0.200364564345607, "grad_norm": 3.1043074131011963, "learning_rate": 0.00018675354989301692, "loss": 4.3985, "step": 687 }, { "epoch": 0.2006562158220926, "grad_norm": 2.8020436763763428, "learning_rate": 0.00018673409842443107, "loss": 4.4039, "step": 688 }, { "epoch": 0.2009478672985782, "grad_norm": 3.180589199066162, "learning_rate": 0.00018671464695584517, "loss": 4.8673, "step": 689 }, { "epoch": 0.2012395187750638, "grad_norm": 3.750607967376709, "learning_rate": 0.0001866951954872593, "loss": 4.9104, "step": 690 }, { "epoch": 0.2015311702515494, "grad_norm": 2.609916925430298, "learning_rate": 0.00018667574401867344, "loss": 4.3131, "step": 691 }, { "epoch": 0.201822821728035, "grad_norm": 3.8368589878082275, "learning_rate": 0.00018665629255008753, "loss": 4.6965, "step": 692 }, { "epoch": 0.2021144732045206, "grad_norm": 2.6046977043151855, "learning_rate": 0.00018663684108150165, "loss": 4.248, "step": 693 }, { "epoch": 0.2024061246810062, "grad_norm": 4.2925705909729, "learning_rate": 0.0001866173896129158, "loss": 4.5872, "step": 694 }, { "epoch": 0.2026977761574918, "grad_norm": 2.848599672317505, "learning_rate": 0.0001865979381443299, "loss": 4.6682, "step": 695 }, { "epoch": 0.2029894276339774, "grad_norm": 4.0271453857421875, "learning_rate": 0.00018657848667574402, "loss": 4.8003, "step": 696 }, { "epoch": 0.203281079110463, "grad_norm": 3.3750953674316406, "learning_rate": 0.00018655903520715814, "loss": 4.5296, "step": 697 }, { "epoch": 0.2035727305869486, "grad_norm": 2.133457899093628, "learning_rate": 0.0001865395837385723, "loss": 4.6423, "step": 698 }, { "epoch": 0.20386438206343419, "grad_norm": 2.220770835876465, "learning_rate": 0.00018652013226998638, "loss": 4.4706, "step": 699 }, { "epoch": 0.2041560335399198, "grad_norm": 5.301233291625977, "learning_rate": 0.0001865006808014005, "loss": 4.3589, "step": 700 }, { "epoch": 0.2044476850164054, "grad_norm": 2.9282689094543457, "learning_rate": 0.00018648122933281465, "loss": 4.5367, "step": 701 }, { "epoch": 0.204739336492891, "grad_norm": 2.8660924434661865, "learning_rate": 0.00018646177786422875, "loss": 4.6521, "step": 702 }, { "epoch": 0.2050309879693766, "grad_norm": 2.9097461700439453, "learning_rate": 0.00018644232639564287, "loss": 4.7352, "step": 703 }, { "epoch": 0.20532263944586218, "grad_norm": 2.808509588241577, "learning_rate": 0.00018642287492705702, "loss": 4.3848, "step": 704 }, { "epoch": 0.2056142909223478, "grad_norm": 2.5345253944396973, "learning_rate": 0.00018640342345847114, "loss": 4.5672, "step": 705 }, { "epoch": 0.2059059423988334, "grad_norm": 3.344050168991089, "learning_rate": 0.00018638397198988523, "loss": 4.5155, "step": 706 }, { "epoch": 0.206197593875319, "grad_norm": 4.288127899169922, "learning_rate": 0.00018636452052129936, "loss": 4.457, "step": 707 }, { "epoch": 0.2064892453518046, "grad_norm": 2.531038522720337, "learning_rate": 0.0001863450690527135, "loss": 4.8635, "step": 708 }, { "epoch": 0.2067808968282902, "grad_norm": 4.220776081085205, "learning_rate": 0.0001863256175841276, "loss": 4.7248, "step": 709 }, { "epoch": 0.20707254830477578, "grad_norm": 2.9945616722106934, "learning_rate": 0.00018630616611554172, "loss": 4.3864, "step": 710 }, { "epoch": 0.2073641997812614, "grad_norm": 2.730329990386963, "learning_rate": 0.00018628671464695587, "loss": 4.6528, "step": 711 }, { "epoch": 0.207655851257747, "grad_norm": 2.786471128463745, "learning_rate": 0.00018626726317837, "loss": 4.278, "step": 712 }, { "epoch": 0.2079475027342326, "grad_norm": 2.8507487773895264, "learning_rate": 0.00018624781170978408, "loss": 4.6915, "step": 713 }, { "epoch": 0.2082391542107182, "grad_norm": 2.4309983253479004, "learning_rate": 0.00018622836024119823, "loss": 4.568, "step": 714 }, { "epoch": 0.20853080568720378, "grad_norm": 2.8405544757843018, "learning_rate": 0.00018620890877261236, "loss": 4.5632, "step": 715 }, { "epoch": 0.20882245716368938, "grad_norm": 2.5228710174560547, "learning_rate": 0.00018618945730402645, "loss": 4.2649, "step": 716 }, { "epoch": 0.209114108640175, "grad_norm": 4.159340858459473, "learning_rate": 0.00018617000583544057, "loss": 5.0209, "step": 717 }, { "epoch": 0.2094057601166606, "grad_norm": 2.9400475025177, "learning_rate": 0.00018615055436685472, "loss": 4.6248, "step": 718 }, { "epoch": 0.2096974115931462, "grad_norm": 3.8663737773895264, "learning_rate": 0.00018613110289826881, "loss": 4.2011, "step": 719 }, { "epoch": 0.20998906306963178, "grad_norm": 2.6275527477264404, "learning_rate": 0.00018611165142968294, "loss": 4.1487, "step": 720 }, { "epoch": 0.21028071454611738, "grad_norm": 2.2887380123138428, "learning_rate": 0.00018609219996109708, "loss": 4.2063, "step": 721 }, { "epoch": 0.21057236602260299, "grad_norm": 3.1969480514526367, "learning_rate": 0.0001860727484925112, "loss": 4.6946, "step": 722 }, { "epoch": 0.2108640174990886, "grad_norm": 2.6498360633850098, "learning_rate": 0.0001860532970239253, "loss": 4.6739, "step": 723 }, { "epoch": 0.2111556689755742, "grad_norm": 2.729609727859497, "learning_rate": 0.00018603384555533945, "loss": 4.4401, "step": 724 }, { "epoch": 0.2114473204520598, "grad_norm": 3.074009656906128, "learning_rate": 0.00018601439408675357, "loss": 4.7226, "step": 725 }, { "epoch": 0.21173897192854538, "grad_norm": 11.95627212524414, "learning_rate": 0.00018599494261816767, "loss": 4.3319, "step": 726 }, { "epoch": 0.21203062340503098, "grad_norm": 2.290714979171753, "learning_rate": 0.0001859754911495818, "loss": 4.5338, "step": 727 }, { "epoch": 0.2123222748815166, "grad_norm": 3.106574296951294, "learning_rate": 0.00018595603968099594, "loss": 4.5044, "step": 728 }, { "epoch": 0.2126139263580022, "grad_norm": 3.163475513458252, "learning_rate": 0.00018593658821241006, "loss": 4.6482, "step": 729 }, { "epoch": 0.2129055778344878, "grad_norm": 3.9468994140625, "learning_rate": 0.00018591713674382415, "loss": 4.6049, "step": 730 }, { "epoch": 0.21319722931097337, "grad_norm": 3.9274308681488037, "learning_rate": 0.0001858976852752383, "loss": 4.6569, "step": 731 }, { "epoch": 0.21348888078745898, "grad_norm": 3.0346522331237793, "learning_rate": 0.00018587823380665242, "loss": 4.6851, "step": 732 }, { "epoch": 0.21378053226394458, "grad_norm": 4.085484027862549, "learning_rate": 0.00018585878233806652, "loss": 4.6273, "step": 733 }, { "epoch": 0.2140721837404302, "grad_norm": 3.4246275424957275, "learning_rate": 0.00018583933086948067, "loss": 4.5688, "step": 734 }, { "epoch": 0.2143638352169158, "grad_norm": 3.0418763160705566, "learning_rate": 0.0001858198794008948, "loss": 4.4875, "step": 735 }, { "epoch": 0.2146554866934014, "grad_norm": 3.0768215656280518, "learning_rate": 0.0001858004279323089, "loss": 4.8606, "step": 736 }, { "epoch": 0.21494713816988698, "grad_norm": 3.921767473220825, "learning_rate": 0.000185780976463723, "loss": 4.6263, "step": 737 }, { "epoch": 0.21523878964637258, "grad_norm": 2.53562068939209, "learning_rate": 0.00018576152499513715, "loss": 4.6521, "step": 738 }, { "epoch": 0.21553044112285819, "grad_norm": 2.744770050048828, "learning_rate": 0.00018574207352655127, "loss": 4.4062, "step": 739 }, { "epoch": 0.2158220925993438, "grad_norm": 2.673628091812134, "learning_rate": 0.00018572262205796537, "loss": 4.7552, "step": 740 }, { "epoch": 0.2161137440758294, "grad_norm": 2.7652063369750977, "learning_rate": 0.00018570317058937952, "loss": 4.7331, "step": 741 }, { "epoch": 0.21640539555231497, "grad_norm": 2.395751953125, "learning_rate": 0.00018568371912079364, "loss": 4.555, "step": 742 }, { "epoch": 0.21669704702880058, "grad_norm": 4.208797454833984, "learning_rate": 0.00018566426765220773, "loss": 4.5032, "step": 743 }, { "epoch": 0.21698869850528618, "grad_norm": 2.4764039516448975, "learning_rate": 0.00018564481618362185, "loss": 4.4026, "step": 744 }, { "epoch": 0.2172803499817718, "grad_norm": 2.8367743492126465, "learning_rate": 0.000185625364715036, "loss": 4.8006, "step": 745 }, { "epoch": 0.2175720014582574, "grad_norm": 3.1486856937408447, "learning_rate": 0.00018560591324645012, "loss": 4.7474, "step": 746 }, { "epoch": 0.21786365293474297, "grad_norm": 2.7762880325317383, "learning_rate": 0.00018558646177786422, "loss": 4.4712, "step": 747 }, { "epoch": 0.21815530441122857, "grad_norm": 3.9860737323760986, "learning_rate": 0.00018556701030927837, "loss": 4.6209, "step": 748 }, { "epoch": 0.21844695588771418, "grad_norm": 3.4660637378692627, "learning_rate": 0.0001855475588406925, "loss": 4.6308, "step": 749 }, { "epoch": 0.21873860736419978, "grad_norm": 2.580972671508789, "learning_rate": 0.00018552810737210658, "loss": 4.7296, "step": 750 }, { "epoch": 0.2190302588406854, "grad_norm": 3.0775108337402344, "learning_rate": 0.00018550865590352073, "loss": 4.5977, "step": 751 }, { "epoch": 0.219321910317171, "grad_norm": 2.714320421218872, "learning_rate": 0.00018548920443493485, "loss": 4.6983, "step": 752 }, { "epoch": 0.21961356179365657, "grad_norm": 2.3963844776153564, "learning_rate": 0.00018546975296634898, "loss": 4.5647, "step": 753 }, { "epoch": 0.21990521327014217, "grad_norm": 2.567695140838623, "learning_rate": 0.00018545030149776307, "loss": 4.6879, "step": 754 }, { "epoch": 0.22019686474662778, "grad_norm": 2.6042070388793945, "learning_rate": 0.00018543085002917722, "loss": 4.7808, "step": 755 }, { "epoch": 0.22048851622311338, "grad_norm": 3.748924970626831, "learning_rate": 0.00018541139856059134, "loss": 4.653, "step": 756 }, { "epoch": 0.220780167699599, "grad_norm": 2.838731527328491, "learning_rate": 0.00018539194709200544, "loss": 4.4208, "step": 757 }, { "epoch": 0.22107181917608457, "grad_norm": 2.920016288757324, "learning_rate": 0.00018537249562341958, "loss": 4.6668, "step": 758 }, { "epoch": 0.22136347065257017, "grad_norm": 2.8891570568084717, "learning_rate": 0.0001853530441548337, "loss": 4.5958, "step": 759 }, { "epoch": 0.22165512212905578, "grad_norm": 2.829120397567749, "learning_rate": 0.00018533359268624783, "loss": 4.7083, "step": 760 }, { "epoch": 0.22194677360554138, "grad_norm": 3.001934051513672, "learning_rate": 0.00018531414121766195, "loss": 4.6174, "step": 761 }, { "epoch": 0.22223842508202699, "grad_norm": 3.8507306575775146, "learning_rate": 0.00018529468974907607, "loss": 4.6725, "step": 762 }, { "epoch": 0.2225300765585126, "grad_norm": 4.010118007659912, "learning_rate": 0.0001852752382804902, "loss": 4.2497, "step": 763 }, { "epoch": 0.22282172803499817, "grad_norm": 2.8058691024780273, "learning_rate": 0.00018525578681190429, "loss": 4.3637, "step": 764 }, { "epoch": 0.22311337951148377, "grad_norm": 5.19416618347168, "learning_rate": 0.00018523633534331843, "loss": 4.5532, "step": 765 }, { "epoch": 0.22340503098796938, "grad_norm": 4.079737663269043, "learning_rate": 0.00018521688387473256, "loss": 4.3943, "step": 766 }, { "epoch": 0.22369668246445498, "grad_norm": 2.589343547821045, "learning_rate": 0.00018519743240614668, "loss": 4.7579, "step": 767 }, { "epoch": 0.2239883339409406, "grad_norm": 3.342949151992798, "learning_rate": 0.0001851779809375608, "loss": 4.7606, "step": 768 }, { "epoch": 0.22427998541742616, "grad_norm": 3.3996431827545166, "learning_rate": 0.00018515852946897492, "loss": 4.7075, "step": 769 }, { "epoch": 0.22457163689391177, "grad_norm": 2.6282482147216797, "learning_rate": 0.00018513907800038904, "loss": 4.7431, "step": 770 }, { "epoch": 0.22486328837039737, "grad_norm": 2.599902868270874, "learning_rate": 0.00018511962653180316, "loss": 4.318, "step": 771 }, { "epoch": 0.22515493984688298, "grad_norm": 3.1583664417266846, "learning_rate": 0.00018510017506321729, "loss": 4.4484, "step": 772 }, { "epoch": 0.22544659132336858, "grad_norm": 3.6342434883117676, "learning_rate": 0.0001850807235946314, "loss": 4.4865, "step": 773 }, { "epoch": 0.22573824279985416, "grad_norm": 2.7037546634674072, "learning_rate": 0.0001850612721260455, "loss": 4.5828, "step": 774 }, { "epoch": 0.22602989427633977, "grad_norm": 2.7461729049682617, "learning_rate": 0.00018504182065745965, "loss": 4.4567, "step": 775 }, { "epoch": 0.22632154575282537, "grad_norm": 3.501742362976074, "learning_rate": 0.00018502236918887377, "loss": 4.7571, "step": 776 }, { "epoch": 0.22661319722931098, "grad_norm": 2.636225461959839, "learning_rate": 0.0001850029177202879, "loss": 4.6214, "step": 777 }, { "epoch": 0.22690484870579658, "grad_norm": 2.8184280395507812, "learning_rate": 0.00018498346625170202, "loss": 4.5483, "step": 778 }, { "epoch": 0.22719650018228219, "grad_norm": 3.044334650039673, "learning_rate": 0.00018496401478311614, "loss": 4.8139, "step": 779 }, { "epoch": 0.22748815165876776, "grad_norm": 2.850149631500244, "learning_rate": 0.00018494456331453026, "loss": 4.5475, "step": 780 }, { "epoch": 0.22777980313525337, "grad_norm": 2.9877946376800537, "learning_rate": 0.00018492511184594438, "loss": 4.4749, "step": 781 }, { "epoch": 0.22807145461173897, "grad_norm": 3.3858983516693115, "learning_rate": 0.0001849056603773585, "loss": 4.7788, "step": 782 }, { "epoch": 0.22836310608822458, "grad_norm": 2.8404958248138428, "learning_rate": 0.00018488620890877262, "loss": 4.5768, "step": 783 }, { "epoch": 0.22865475756471018, "grad_norm": 2.3659164905548096, "learning_rate": 0.00018486675744018675, "loss": 4.5367, "step": 784 }, { "epoch": 0.22894640904119576, "grad_norm": 3.5543313026428223, "learning_rate": 0.00018484730597160087, "loss": 4.6814, "step": 785 }, { "epoch": 0.22923806051768136, "grad_norm": 2.370296001434326, "learning_rate": 0.000184827854503015, "loss": 4.6056, "step": 786 }, { "epoch": 0.22952971199416697, "grad_norm": 2.7585551738739014, "learning_rate": 0.0001848084030344291, "loss": 4.6521, "step": 787 }, { "epoch": 0.22982136347065257, "grad_norm": 3.58742618560791, "learning_rate": 0.00018478895156584323, "loss": 4.1765, "step": 788 }, { "epoch": 0.23011301494713818, "grad_norm": 2.3788907527923584, "learning_rate": 0.00018476950009725735, "loss": 4.5337, "step": 789 }, { "epoch": 0.23040466642362378, "grad_norm": 3.7014002799987793, "learning_rate": 0.00018475004862867147, "loss": 4.7868, "step": 790 }, { "epoch": 0.23069631790010936, "grad_norm": 3.3734829425811768, "learning_rate": 0.0001847305971600856, "loss": 4.5534, "step": 791 }, { "epoch": 0.23098796937659496, "grad_norm": 2.5478570461273193, "learning_rate": 0.00018471114569149972, "loss": 3.9989, "step": 792 }, { "epoch": 0.23127962085308057, "grad_norm": 5.6430230140686035, "learning_rate": 0.00018469169422291384, "loss": 4.5423, "step": 793 }, { "epoch": 0.23157127232956617, "grad_norm": 3.681939125061035, "learning_rate": 0.00018467224275432796, "loss": 4.947, "step": 794 }, { "epoch": 0.23186292380605178, "grad_norm": 3.3415205478668213, "learning_rate": 0.00018465279128574208, "loss": 4.5231, "step": 795 }, { "epoch": 0.23215457528253736, "grad_norm": 3.5222063064575195, "learning_rate": 0.0001846333398171562, "loss": 4.6415, "step": 796 }, { "epoch": 0.23244622675902296, "grad_norm": 6.184996604919434, "learning_rate": 0.00018461388834857033, "loss": 4.5442, "step": 797 }, { "epoch": 0.23273787823550857, "grad_norm": 3.966951608657837, "learning_rate": 0.00018459443687998445, "loss": 4.5828, "step": 798 }, { "epoch": 0.23302952971199417, "grad_norm": 3.594588041305542, "learning_rate": 0.00018457498541139857, "loss": 4.5804, "step": 799 }, { "epoch": 0.23332118118847978, "grad_norm": 2.2958900928497314, "learning_rate": 0.0001845555339428127, "loss": 4.5387, "step": 800 }, { "epoch": 0.23361283266496535, "grad_norm": 3.744041681289673, "learning_rate": 0.0001845360824742268, "loss": 4.2912, "step": 801 }, { "epoch": 0.23390448414145096, "grad_norm": 3.7400238513946533, "learning_rate": 0.00018451663100564093, "loss": 4.4669, "step": 802 }, { "epoch": 0.23419613561793656, "grad_norm": 2.9110944271087646, "learning_rate": 0.00018449717953705506, "loss": 4.5346, "step": 803 }, { "epoch": 0.23448778709442217, "grad_norm": 4.102070331573486, "learning_rate": 0.00018447772806846918, "loss": 4.7891, "step": 804 }, { "epoch": 0.23477943857090777, "grad_norm": 3.197807788848877, "learning_rate": 0.0001844582765998833, "loss": 4.4562, "step": 805 }, { "epoch": 0.23507109004739338, "grad_norm": 2.9298534393310547, "learning_rate": 0.00018443882513129742, "loss": 3.9685, "step": 806 }, { "epoch": 0.23536274152387895, "grad_norm": 3.5369277000427246, "learning_rate": 0.00018441937366271154, "loss": 4.6127, "step": 807 }, { "epoch": 0.23565439300036456, "grad_norm": 4.15557861328125, "learning_rate": 0.00018439992219412566, "loss": 4.5176, "step": 808 }, { "epoch": 0.23594604447685016, "grad_norm": 2.918419122695923, "learning_rate": 0.00018438047072553979, "loss": 4.4006, "step": 809 }, { "epoch": 0.23623769595333577, "grad_norm": 2.151205539703369, "learning_rate": 0.0001843610192569539, "loss": 4.6052, "step": 810 }, { "epoch": 0.23652934742982137, "grad_norm": 5.383347988128662, "learning_rate": 0.00018434156778836803, "loss": 4.563, "step": 811 }, { "epoch": 0.23682099890630695, "grad_norm": 4.1631317138671875, "learning_rate": 0.00018432211631978215, "loss": 4.3865, "step": 812 }, { "epoch": 0.23711265038279256, "grad_norm": 3.258519411087036, "learning_rate": 0.00018430266485119627, "loss": 4.517, "step": 813 }, { "epoch": 0.23740430185927816, "grad_norm": 3.5818071365356445, "learning_rate": 0.0001842832133826104, "loss": 4.372, "step": 814 }, { "epoch": 0.23769595333576377, "grad_norm": 2.5143957138061523, "learning_rate": 0.00018426376191402451, "loss": 4.4919, "step": 815 }, { "epoch": 0.23798760481224937, "grad_norm": 3.699991226196289, "learning_rate": 0.00018424431044543864, "loss": 4.4356, "step": 816 }, { "epoch": 0.23827925628873498, "grad_norm": 4.3801589012146, "learning_rate": 0.00018422485897685276, "loss": 4.641, "step": 817 }, { "epoch": 0.23857090776522055, "grad_norm": 3.607011556625366, "learning_rate": 0.00018420540750826688, "loss": 4.6265, "step": 818 }, { "epoch": 0.23886255924170616, "grad_norm": 3.728663444519043, "learning_rate": 0.000184185956039681, "loss": 4.5719, "step": 819 }, { "epoch": 0.23915421071819176, "grad_norm": 4.349853515625, "learning_rate": 0.00018416650457109512, "loss": 4.6728, "step": 820 }, { "epoch": 0.23944586219467737, "grad_norm": 5.003712177276611, "learning_rate": 0.00018414705310250927, "loss": 4.8329, "step": 821 }, { "epoch": 0.23973751367116297, "grad_norm": 3.2356479167938232, "learning_rate": 0.00018412760163392337, "loss": 4.56, "step": 822 }, { "epoch": 0.24002916514764855, "grad_norm": 2.8915910720825195, "learning_rate": 0.0001841081501653375, "loss": 4.5496, "step": 823 }, { "epoch": 0.24032081662413415, "grad_norm": 2.8627171516418457, "learning_rate": 0.0001840886986967516, "loss": 4.4626, "step": 824 }, { "epoch": 0.24061246810061976, "grad_norm": 2.509336233139038, "learning_rate": 0.00018406924722816573, "loss": 4.6728, "step": 825 }, { "epoch": 0.24090411957710536, "grad_norm": 3.280853271484375, "learning_rate": 0.00018404979575957985, "loss": 4.6998, "step": 826 }, { "epoch": 0.24119577105359097, "grad_norm": 3.8401193618774414, "learning_rate": 0.00018403034429099397, "loss": 4.3321, "step": 827 }, { "epoch": 0.24148742253007655, "grad_norm": 2.178219795227051, "learning_rate": 0.00018401089282240812, "loss": 4.6476, "step": 828 }, { "epoch": 0.24177907400656215, "grad_norm": 3.4799318313598633, "learning_rate": 0.00018399144135382222, "loss": 4.4984, "step": 829 }, { "epoch": 0.24207072548304776, "grad_norm": 2.4823853969573975, "learning_rate": 0.00018397198988523634, "loss": 4.7547, "step": 830 }, { "epoch": 0.24236237695953336, "grad_norm": 2.5169806480407715, "learning_rate": 0.00018395253841665046, "loss": 4.5384, "step": 831 }, { "epoch": 0.24265402843601896, "grad_norm": 2.731765031814575, "learning_rate": 0.00018393308694806458, "loss": 4.1388, "step": 832 }, { "epoch": 0.24294567991250457, "grad_norm": 3.0954301357269287, "learning_rate": 0.0001839136354794787, "loss": 4.4911, "step": 833 }, { "epoch": 0.24323733138899015, "grad_norm": 3.052668333053589, "learning_rate": 0.00018389418401089282, "loss": 4.6142, "step": 834 }, { "epoch": 0.24352898286547575, "grad_norm": 4.294070720672607, "learning_rate": 0.00018387473254230695, "loss": 4.8172, "step": 835 }, { "epoch": 0.24382063434196136, "grad_norm": 3.550797939300537, "learning_rate": 0.00018385528107372107, "loss": 4.4064, "step": 836 }, { "epoch": 0.24411228581844696, "grad_norm": 2.6012916564941406, "learning_rate": 0.0001838358296051352, "loss": 4.6249, "step": 837 }, { "epoch": 0.24440393729493257, "grad_norm": 2.763291120529175, "learning_rate": 0.00018381637813654934, "loss": 4.4655, "step": 838 }, { "epoch": 0.24469558877141814, "grad_norm": 3.2272274494171143, "learning_rate": 0.00018379692666796343, "loss": 4.4942, "step": 839 }, { "epoch": 0.24498724024790375, "grad_norm": 3.0952701568603516, "learning_rate": 0.00018377747519937755, "loss": 4.4053, "step": 840 }, { "epoch": 0.24527889172438935, "grad_norm": 2.454678535461426, "learning_rate": 0.00018375802373079168, "loss": 4.8968, "step": 841 }, { "epoch": 0.24557054320087496, "grad_norm": 5.348843574523926, "learning_rate": 0.0001837385722622058, "loss": 4.7154, "step": 842 }, { "epoch": 0.24586219467736056, "grad_norm": 2.8753981590270996, "learning_rate": 0.00018371912079361992, "loss": 4.3657, "step": 843 }, { "epoch": 0.24615384615384617, "grad_norm": 2.5538721084594727, "learning_rate": 0.00018369966932503404, "loss": 4.9454, "step": 844 }, { "epoch": 0.24644549763033174, "grad_norm": 4.416300296783447, "learning_rate": 0.0001836802178564482, "loss": 4.6792, "step": 845 }, { "epoch": 0.24673714910681735, "grad_norm": 2.912442684173584, "learning_rate": 0.00018366076638786228, "loss": 4.3974, "step": 846 }, { "epoch": 0.24702880058330295, "grad_norm": 2.5420446395874023, "learning_rate": 0.0001836413149192764, "loss": 4.4927, "step": 847 }, { "epoch": 0.24732045205978856, "grad_norm": 3.1180198192596436, "learning_rate": 0.00018362186345069055, "loss": 4.2925, "step": 848 }, { "epoch": 0.24761210353627416, "grad_norm": 2.595703125, "learning_rate": 0.00018360241198210465, "loss": 4.7155, "step": 849 }, { "epoch": 0.24790375501275974, "grad_norm": 2.711313247680664, "learning_rate": 0.00018358296051351877, "loss": 4.7321, "step": 850 }, { "epoch": 0.24819540648924535, "grad_norm": 3.9545421600341797, "learning_rate": 0.0001835635090449329, "loss": 4.7523, "step": 851 }, { "epoch": 0.24848705796573095, "grad_norm": 4.417956352233887, "learning_rate": 0.00018354405757634704, "loss": 4.5344, "step": 852 }, { "epoch": 0.24877870944221656, "grad_norm": 1.7431962490081787, "learning_rate": 0.00018352460610776114, "loss": 4.5215, "step": 853 }, { "epoch": 0.24907036091870216, "grad_norm": 2.2470366954803467, "learning_rate": 0.00018350515463917526, "loss": 4.4987, "step": 854 }, { "epoch": 0.24936201239518774, "grad_norm": 3.847504138946533, "learning_rate": 0.0001834857031705894, "loss": 4.8048, "step": 855 }, { "epoch": 0.24965366387167334, "grad_norm": 3.4686625003814697, "learning_rate": 0.0001834662517020035, "loss": 4.5989, "step": 856 }, { "epoch": 0.24994531534815895, "grad_norm": 2.956010341644287, "learning_rate": 0.00018344680023341762, "loss": 4.5792, "step": 857 }, { "epoch": 0.2502369668246445, "grad_norm": 3.1322412490844727, "learning_rate": 0.00018342734876483177, "loss": 4.1936, "step": 858 }, { "epoch": 0.25052861830113016, "grad_norm": 3.9839093685150146, "learning_rate": 0.0001834078972962459, "loss": 4.8369, "step": 859 }, { "epoch": 0.25082026977761573, "grad_norm": 2.7421228885650635, "learning_rate": 0.00018338844582765999, "loss": 4.623, "step": 860 }, { "epoch": 0.25111192125410137, "grad_norm": 3.2044830322265625, "learning_rate": 0.0001833689943590741, "loss": 4.6095, "step": 861 }, { "epoch": 0.25140357273058694, "grad_norm": 4.964054584503174, "learning_rate": 0.00018334954289048826, "loss": 4.5462, "step": 862 }, { "epoch": 0.2516952242070725, "grad_norm": 3.920336961746216, "learning_rate": 0.00018333009142190235, "loss": 4.765, "step": 863 }, { "epoch": 0.25198687568355815, "grad_norm": 2.9010140895843506, "learning_rate": 0.00018331063995331647, "loss": 4.5159, "step": 864 }, { "epoch": 0.25227852716004373, "grad_norm": 4.012448310852051, "learning_rate": 0.00018329118848473062, "loss": 4.7174, "step": 865 }, { "epoch": 0.25257017863652936, "grad_norm": 3.625417709350586, "learning_rate": 0.00018327173701614472, "loss": 4.7252, "step": 866 }, { "epoch": 0.25286183011301494, "grad_norm": 2.0909242630004883, "learning_rate": 0.00018325228554755884, "loss": 4.3605, "step": 867 }, { "epoch": 0.2531534815895006, "grad_norm": 2.3438351154327393, "learning_rate": 0.00018323283407897299, "loss": 4.3914, "step": 868 }, { "epoch": 0.25344513306598615, "grad_norm": 2.9618444442749023, "learning_rate": 0.0001832133826103871, "loss": 4.776, "step": 869 }, { "epoch": 0.2537367845424717, "grad_norm": 3.6006245613098145, "learning_rate": 0.0001831939311418012, "loss": 4.5151, "step": 870 }, { "epoch": 0.25402843601895736, "grad_norm": 2.0657124519348145, "learning_rate": 0.00018317447967321532, "loss": 4.3472, "step": 871 }, { "epoch": 0.25432008749544294, "grad_norm": 2.8908467292785645, "learning_rate": 0.00018315502820462947, "loss": 3.9566, "step": 872 }, { "epoch": 0.25461173897192857, "grad_norm": 2.419646739959717, "learning_rate": 0.00018313557673604357, "loss": 4.2703, "step": 873 }, { "epoch": 0.25490339044841415, "grad_norm": 3.7889678478240967, "learning_rate": 0.0001831161252674577, "loss": 4.7407, "step": 874 }, { "epoch": 0.2551950419248997, "grad_norm": 3.26938796043396, "learning_rate": 0.00018309667379887184, "loss": 4.3023, "step": 875 }, { "epoch": 0.25548669340138536, "grad_norm": 3.2155704498291016, "learning_rate": 0.00018307722233028596, "loss": 4.4427, "step": 876 }, { "epoch": 0.25577834487787093, "grad_norm": 3.1780219078063965, "learning_rate": 0.00018305777086170005, "loss": 4.8456, "step": 877 }, { "epoch": 0.25606999635435657, "grad_norm": 2.7616970539093018, "learning_rate": 0.0001830383193931142, "loss": 4.232, "step": 878 }, { "epoch": 0.25636164783084214, "grad_norm": 2.6607024669647217, "learning_rate": 0.00018301886792452832, "loss": 4.8142, "step": 879 }, { "epoch": 0.2566532993073277, "grad_norm": 3.1132004261016846, "learning_rate": 0.00018299941645594242, "loss": 4.645, "step": 880 }, { "epoch": 0.25694495078381335, "grad_norm": 2.6189589500427246, "learning_rate": 0.00018297996498735654, "loss": 4.5167, "step": 881 }, { "epoch": 0.25723660226029893, "grad_norm": 3.8827085494995117, "learning_rate": 0.0001829605135187707, "loss": 4.5651, "step": 882 }, { "epoch": 0.25752825373678456, "grad_norm": 3.2167246341705322, "learning_rate": 0.0001829410620501848, "loss": 4.7039, "step": 883 }, { "epoch": 0.25781990521327014, "grad_norm": 2.7825639247894287, "learning_rate": 0.0001829216105815989, "loss": 4.747, "step": 884 }, { "epoch": 0.2581115566897557, "grad_norm": 2.925217390060425, "learning_rate": 0.00018290215911301305, "loss": 4.6408, "step": 885 }, { "epoch": 0.25840320816624135, "grad_norm": 2.9718449115753174, "learning_rate": 0.00018288270764442717, "loss": 4.5864, "step": 886 }, { "epoch": 0.2586948596427269, "grad_norm": 3.395251512527466, "learning_rate": 0.00018286325617584127, "loss": 4.5866, "step": 887 }, { "epoch": 0.25898651111921256, "grad_norm": 4.3940911293029785, "learning_rate": 0.00018284380470725542, "loss": 4.8636, "step": 888 }, { "epoch": 0.25927816259569814, "grad_norm": 2.8419206142425537, "learning_rate": 0.00018282435323866954, "loss": 4.5937, "step": 889 }, { "epoch": 0.2595698140721837, "grad_norm": 3.215470314025879, "learning_rate": 0.00018280490177008363, "loss": 4.5062, "step": 890 }, { "epoch": 0.25986146554866935, "grad_norm": 2.569932222366333, "learning_rate": 0.00018278545030149776, "loss": 4.7046, "step": 891 }, { "epoch": 0.2601531170251549, "grad_norm": 3.7216272354125977, "learning_rate": 0.0001827659988329119, "loss": 4.463, "step": 892 }, { "epoch": 0.26044476850164056, "grad_norm": 2.6667356491088867, "learning_rate": 0.00018274654736432603, "loss": 4.1829, "step": 893 }, { "epoch": 0.26073641997812613, "grad_norm": 2.8750052452087402, "learning_rate": 0.00018272709589574012, "loss": 4.2373, "step": 894 }, { "epoch": 0.26102807145461177, "grad_norm": 2.3844337463378906, "learning_rate": 0.00018270764442715427, "loss": 4.1885, "step": 895 }, { "epoch": 0.26131972293109734, "grad_norm": 2.433546543121338, "learning_rate": 0.0001826881929585684, "loss": 4.5789, "step": 896 }, { "epoch": 0.2616113744075829, "grad_norm": 2.5693373680114746, "learning_rate": 0.00018266874148998249, "loss": 4.4635, "step": 897 }, { "epoch": 0.26190302588406855, "grad_norm": 3.873786211013794, "learning_rate": 0.00018264929002139663, "loss": 4.8223, "step": 898 }, { "epoch": 0.26219467736055413, "grad_norm": 2.4009275436401367, "learning_rate": 0.00018262983855281076, "loss": 4.5458, "step": 899 }, { "epoch": 0.26248632883703976, "grad_norm": 2.610816240310669, "learning_rate": 0.00018261038708422488, "loss": 4.4188, "step": 900 }, { "epoch": 0.26277798031352534, "grad_norm": 2.9148659706115723, "learning_rate": 0.00018259093561563897, "loss": 4.5987, "step": 901 }, { "epoch": 0.2630696317900109, "grad_norm": 2.8000524044036865, "learning_rate": 0.00018257148414705312, "loss": 4.4465, "step": 902 }, { "epoch": 0.26336128326649655, "grad_norm": 2.602782726287842, "learning_rate": 0.00018255203267846724, "loss": 4.6005, "step": 903 }, { "epoch": 0.2636529347429821, "grad_norm": 2.379239797592163, "learning_rate": 0.00018253258120988134, "loss": 4.3147, "step": 904 }, { "epoch": 0.26394458621946776, "grad_norm": 3.1066794395446777, "learning_rate": 0.00018251312974129549, "loss": 4.6475, "step": 905 }, { "epoch": 0.26423623769595334, "grad_norm": 2.6031246185302734, "learning_rate": 0.0001824936782727096, "loss": 4.5116, "step": 906 }, { "epoch": 0.2645278891724389, "grad_norm": 3.2965428829193115, "learning_rate": 0.00018247422680412373, "loss": 4.5324, "step": 907 }, { "epoch": 0.26481954064892455, "grad_norm": 4.224785804748535, "learning_rate": 0.00018245477533553782, "loss": 4.4726, "step": 908 }, { "epoch": 0.2651111921254101, "grad_norm": 3.0473365783691406, "learning_rate": 0.00018243532386695197, "loss": 4.5758, "step": 909 }, { "epoch": 0.26540284360189575, "grad_norm": 4.659058570861816, "learning_rate": 0.0001824158723983661, "loss": 4.5515, "step": 910 }, { "epoch": 0.26569449507838133, "grad_norm": 2.091400146484375, "learning_rate": 0.0001823964209297802, "loss": 4.6833, "step": 911 }, { "epoch": 0.2659861465548669, "grad_norm": 2.790565252304077, "learning_rate": 0.00018237696946119434, "loss": 4.3474, "step": 912 }, { "epoch": 0.26627779803135254, "grad_norm": 3.460287570953369, "learning_rate": 0.00018235751799260846, "loss": 4.6665, "step": 913 }, { "epoch": 0.2665694495078381, "grad_norm": 2.427500009536743, "learning_rate": 0.00018233806652402258, "loss": 4.7318, "step": 914 }, { "epoch": 0.26686110098432375, "grad_norm": 3.569716453552246, "learning_rate": 0.0001823186150554367, "loss": 4.4439, "step": 915 }, { "epoch": 0.26715275246080933, "grad_norm": 2.8687901496887207, "learning_rate": 0.00018229916358685082, "loss": 4.6927, "step": 916 }, { "epoch": 0.2674444039372949, "grad_norm": 4.1426544189453125, "learning_rate": 0.00018227971211826494, "loss": 4.7318, "step": 917 }, { "epoch": 0.26773605541378054, "grad_norm": 3.5155019760131836, "learning_rate": 0.00018226026064967904, "loss": 4.7996, "step": 918 }, { "epoch": 0.2680277068902661, "grad_norm": 2.2897627353668213, "learning_rate": 0.0001822408091810932, "loss": 4.4055, "step": 919 }, { "epoch": 0.26831935836675175, "grad_norm": 3.3724234104156494, "learning_rate": 0.0001822213577125073, "loss": 4.736, "step": 920 }, { "epoch": 0.2686110098432373, "grad_norm": 2.5665218830108643, "learning_rate": 0.0001822019062439214, "loss": 4.6478, "step": 921 }, { "epoch": 0.26890266131972296, "grad_norm": 2.2701001167297363, "learning_rate": 0.00018218245477533555, "loss": 4.7562, "step": 922 }, { "epoch": 0.26919431279620853, "grad_norm": 3.289482831954956, "learning_rate": 0.00018216300330674967, "loss": 4.7541, "step": 923 }, { "epoch": 0.2694859642726941, "grad_norm": 2.9095354080200195, "learning_rate": 0.0001821435518381638, "loss": 4.6016, "step": 924 }, { "epoch": 0.26977761574917974, "grad_norm": 3.742574691772461, "learning_rate": 0.00018212410036957792, "loss": 4.4034, "step": 925 }, { "epoch": 0.2700692672256653, "grad_norm": 2.856580972671509, "learning_rate": 0.00018210464890099204, "loss": 4.0784, "step": 926 }, { "epoch": 0.27036091870215095, "grad_norm": 3.126695156097412, "learning_rate": 0.00018208519743240616, "loss": 4.4571, "step": 927 }, { "epoch": 0.27065257017863653, "grad_norm": 3.541738986968994, "learning_rate": 0.00018206574596382025, "loss": 4.2841, "step": 928 }, { "epoch": 0.2709442216551221, "grad_norm": 3.6731019020080566, "learning_rate": 0.0001820462944952344, "loss": 4.625, "step": 929 }, { "epoch": 0.27123587313160774, "grad_norm": 2.9029083251953125, "learning_rate": 0.00018202684302664852, "loss": 4.4976, "step": 930 }, { "epoch": 0.2715275246080933, "grad_norm": 3.1331851482391357, "learning_rate": 0.00018200739155806265, "loss": 4.7081, "step": 931 }, { "epoch": 0.27181917608457895, "grad_norm": 3.2019622325897217, "learning_rate": 0.00018198794008947677, "loss": 4.5701, "step": 932 }, { "epoch": 0.2721108275610645, "grad_norm": 2.3046936988830566, "learning_rate": 0.0001819684886208909, "loss": 4.739, "step": 933 }, { "epoch": 0.2724024790375501, "grad_norm": 3.830655336380005, "learning_rate": 0.000181949037152305, "loss": 4.7978, "step": 934 }, { "epoch": 0.27269413051403574, "grad_norm": 3.829442024230957, "learning_rate": 0.00018192958568371913, "loss": 4.5926, "step": 935 }, { "epoch": 0.2729857819905213, "grad_norm": 2.277233600616455, "learning_rate": 0.00018191013421513325, "loss": 4.1369, "step": 936 }, { "epoch": 0.27327743346700695, "grad_norm": 4.655283451080322, "learning_rate": 0.00018189068274654738, "loss": 4.4857, "step": 937 }, { "epoch": 0.2735690849434925, "grad_norm": 4.009754657745361, "learning_rate": 0.0001818712312779615, "loss": 4.2749, "step": 938 }, { "epoch": 0.2738607364199781, "grad_norm": 2.55291748046875, "learning_rate": 0.00018185177980937562, "loss": 4.4687, "step": 939 }, { "epoch": 0.27415238789646373, "grad_norm": 2.2513427734375, "learning_rate": 0.00018183232834078974, "loss": 4.601, "step": 940 }, { "epoch": 0.2744440393729493, "grad_norm": 2.537051200866699, "learning_rate": 0.00018181287687220386, "loss": 4.3243, "step": 941 }, { "epoch": 0.27473569084943494, "grad_norm": 3.2681825160980225, "learning_rate": 0.00018179342540361798, "loss": 4.57, "step": 942 }, { "epoch": 0.2750273423259205, "grad_norm": 3.6844348907470703, "learning_rate": 0.0001817739739350321, "loss": 4.5957, "step": 943 }, { "epoch": 0.2753189938024061, "grad_norm": 2.854861259460449, "learning_rate": 0.00018175452246644623, "loss": 4.7107, "step": 944 }, { "epoch": 0.27561064527889173, "grad_norm": 3.360569715499878, "learning_rate": 0.00018173507099786035, "loss": 4.4852, "step": 945 }, { "epoch": 0.2759022967553773, "grad_norm": 3.04850435256958, "learning_rate": 0.00018171561952927447, "loss": 4.465, "step": 946 }, { "epoch": 0.27619394823186294, "grad_norm": 2.734900951385498, "learning_rate": 0.0001816961680606886, "loss": 4.3639, "step": 947 }, { "epoch": 0.2764855997083485, "grad_norm": 3.6183159351348877, "learning_rate": 0.00018167671659210271, "loss": 4.5184, "step": 948 }, { "epoch": 0.27677725118483415, "grad_norm": 3.3481202125549316, "learning_rate": 0.00018165726512351684, "loss": 4.1628, "step": 949 }, { "epoch": 0.2770689026613197, "grad_norm": 3.39790415763855, "learning_rate": 0.00018163781365493096, "loss": 4.4729, "step": 950 }, { "epoch": 0.2773605541378053, "grad_norm": 2.2381324768066406, "learning_rate": 0.00018161836218634508, "loss": 4.4794, "step": 951 }, { "epoch": 0.27765220561429094, "grad_norm": 3.3942277431488037, "learning_rate": 0.0001815989107177592, "loss": 4.4363, "step": 952 }, { "epoch": 0.2779438570907765, "grad_norm": 2.76975679397583, "learning_rate": 0.00018157945924917332, "loss": 4.7184, "step": 953 }, { "epoch": 0.27823550856726215, "grad_norm": 2.502422332763672, "learning_rate": 0.00018156000778058744, "loss": 4.3215, "step": 954 }, { "epoch": 0.2785271600437477, "grad_norm": 3.035336494445801, "learning_rate": 0.00018154055631200156, "loss": 4.3069, "step": 955 }, { "epoch": 0.2788188115202333, "grad_norm": 3.259874105453491, "learning_rate": 0.00018152110484341569, "loss": 4.3728, "step": 956 }, { "epoch": 0.27911046299671893, "grad_norm": 2.2581865787506104, "learning_rate": 0.0001815016533748298, "loss": 4.5635, "step": 957 }, { "epoch": 0.2794021144732045, "grad_norm": 3.4779093265533447, "learning_rate": 0.00018148220190624393, "loss": 4.526, "step": 958 }, { "epoch": 0.27969376594969014, "grad_norm": 2.7343342304229736, "learning_rate": 0.00018146275043765805, "loss": 4.4734, "step": 959 }, { "epoch": 0.2799854174261757, "grad_norm": 2.9250471591949463, "learning_rate": 0.00018144329896907217, "loss": 4.5288, "step": 960 }, { "epoch": 0.2802770689026613, "grad_norm": 2.8235647678375244, "learning_rate": 0.0001814238475004863, "loss": 4.6288, "step": 961 }, { "epoch": 0.28056872037914693, "grad_norm": 3.281961441040039, "learning_rate": 0.00018140439603190042, "loss": 4.6519, "step": 962 }, { "epoch": 0.2808603718556325, "grad_norm": 3.4258739948272705, "learning_rate": 0.00018138494456331454, "loss": 4.83, "step": 963 }, { "epoch": 0.28115202333211814, "grad_norm": 2.9684700965881348, "learning_rate": 0.00018136549309472866, "loss": 4.3324, "step": 964 }, { "epoch": 0.2814436748086037, "grad_norm": 4.1014018058776855, "learning_rate": 0.00018134604162614278, "loss": 4.4051, "step": 965 }, { "epoch": 0.2817353262850893, "grad_norm": 3.681142568588257, "learning_rate": 0.0001813265901575569, "loss": 4.7034, "step": 966 }, { "epoch": 0.2820269777615749, "grad_norm": 3.3199033737182617, "learning_rate": 0.00018130713868897102, "loss": 4.5935, "step": 967 }, { "epoch": 0.2823186292380605, "grad_norm": 3.086571216583252, "learning_rate": 0.00018128768722038515, "loss": 4.8147, "step": 968 }, { "epoch": 0.28261028071454614, "grad_norm": 2.8419129848480225, "learning_rate": 0.00018126823575179927, "loss": 4.6157, "step": 969 }, { "epoch": 0.2829019321910317, "grad_norm": 2.0946695804595947, "learning_rate": 0.0001812487842832134, "loss": 4.3477, "step": 970 }, { "epoch": 0.2831935836675173, "grad_norm": 2.8986430168151855, "learning_rate": 0.0001812293328146275, "loss": 4.6701, "step": 971 }, { "epoch": 0.2834852351440029, "grad_norm": 2.9014179706573486, "learning_rate": 0.00018120988134604163, "loss": 4.5341, "step": 972 }, { "epoch": 0.2837768866204885, "grad_norm": 2.376711368560791, "learning_rate": 0.00018119042987745575, "loss": 4.632, "step": 973 }, { "epoch": 0.28406853809697413, "grad_norm": 2.6001265048980713, "learning_rate": 0.00018117097840886988, "loss": 4.2876, "step": 974 }, { "epoch": 0.2843601895734597, "grad_norm": 2.2956936359405518, "learning_rate": 0.00018115152694028402, "loss": 4.482, "step": 975 }, { "epoch": 0.28465184104994534, "grad_norm": 2.754587411880493, "learning_rate": 0.00018113207547169812, "loss": 4.3845, "step": 976 }, { "epoch": 0.2849434925264309, "grad_norm": 2.7269506454467773, "learning_rate": 0.00018111262400311224, "loss": 4.285, "step": 977 }, { "epoch": 0.2852351440029165, "grad_norm": 4.895108222961426, "learning_rate": 0.00018109317253452636, "loss": 4.6614, "step": 978 }, { "epoch": 0.28552679547940213, "grad_norm": 3.6166257858276367, "learning_rate": 0.00018107372106594048, "loss": 4.4627, "step": 979 }, { "epoch": 0.2858184469558877, "grad_norm": 2.7193949222564697, "learning_rate": 0.0001810542695973546, "loss": 4.8937, "step": 980 }, { "epoch": 0.28611009843237334, "grad_norm": 2.4634039402008057, "learning_rate": 0.00018103481812876873, "loss": 4.4493, "step": 981 }, { "epoch": 0.2864017499088589, "grad_norm": 2.531376838684082, "learning_rate": 0.00018101536666018285, "loss": 4.564, "step": 982 }, { "epoch": 0.2866934013853445, "grad_norm": 2.405830144882202, "learning_rate": 0.00018099591519159697, "loss": 4.1424, "step": 983 }, { "epoch": 0.2869850528618301, "grad_norm": 4.284375190734863, "learning_rate": 0.0001809764637230111, "loss": 4.8184, "step": 984 }, { "epoch": 0.2872767043383157, "grad_norm": 3.1471104621887207, "learning_rate": 0.00018095701225442524, "loss": 4.1365, "step": 985 }, { "epoch": 0.28756835581480134, "grad_norm": 3.1491692066192627, "learning_rate": 0.00018093756078583933, "loss": 4.6006, "step": 986 }, { "epoch": 0.2878600072912869, "grad_norm": 2.9614932537078857, "learning_rate": 0.00018091810931725346, "loss": 4.5722, "step": 987 }, { "epoch": 0.2881516587677725, "grad_norm": 2.9936399459838867, "learning_rate": 0.00018089865784866758, "loss": 4.6972, "step": 988 }, { "epoch": 0.2884433102442581, "grad_norm": 2.7780163288116455, "learning_rate": 0.0001808792063800817, "loss": 4.8164, "step": 989 }, { "epoch": 0.2887349617207437, "grad_norm": 3.660188674926758, "learning_rate": 0.00018085975491149582, "loss": 4.6018, "step": 990 }, { "epoch": 0.28902661319722933, "grad_norm": 3.1855087280273438, "learning_rate": 0.00018084030344290994, "loss": 4.2168, "step": 991 }, { "epoch": 0.2893182646737149, "grad_norm": 3.1112072467803955, "learning_rate": 0.0001808208519743241, "loss": 4.5317, "step": 992 }, { "epoch": 0.2896099161502005, "grad_norm": 2.471564531326294, "learning_rate": 0.00018080140050573819, "loss": 4.1739, "step": 993 }, { "epoch": 0.2899015676266861, "grad_norm": 2.671708583831787, "learning_rate": 0.0001807819490371523, "loss": 4.6216, "step": 994 }, { "epoch": 0.2901932191031717, "grad_norm": 2.776989221572876, "learning_rate": 0.00018076249756856643, "loss": 4.5638, "step": 995 }, { "epoch": 0.29048487057965733, "grad_norm": 2.198660373687744, "learning_rate": 0.00018074304609998055, "loss": 4.4853, "step": 996 }, { "epoch": 0.2907765220561429, "grad_norm": 2.159093141555786, "learning_rate": 0.00018072359463139467, "loss": 4.4363, "step": 997 }, { "epoch": 0.2910681735326285, "grad_norm": 2.9357781410217285, "learning_rate": 0.0001807041431628088, "loss": 4.6865, "step": 998 }, { "epoch": 0.2913598250091141, "grad_norm": 3.009221315383911, "learning_rate": 0.00018068469169422294, "loss": 4.5114, "step": 999 }, { "epoch": 0.2916514764855997, "grad_norm": 2.4230310916900635, "learning_rate": 0.00018066524022563704, "loss": 4.6931, "step": 1000 }, { "epoch": 0.2919431279620853, "grad_norm": 3.5892746448516846, "learning_rate": 0.00018064578875705116, "loss": 4.3576, "step": 1001 }, { "epoch": 0.2922347794385709, "grad_norm": 2.172351598739624, "learning_rate": 0.0001806263372884653, "loss": 4.3922, "step": 1002 }, { "epoch": 0.29252643091505653, "grad_norm": 2.7057747840881348, "learning_rate": 0.0001806068858198794, "loss": 4.7105, "step": 1003 }, { "epoch": 0.2928180823915421, "grad_norm": 3.9026920795440674, "learning_rate": 0.00018058743435129352, "loss": 4.533, "step": 1004 }, { "epoch": 0.2931097338680277, "grad_norm": 3.901470422744751, "learning_rate": 0.00018056798288270764, "loss": 4.2332, "step": 1005 }, { "epoch": 0.2934013853445133, "grad_norm": 2.5886404514312744, "learning_rate": 0.0001805485314141218, "loss": 4.4069, "step": 1006 }, { "epoch": 0.2936930368209989, "grad_norm": 6.505303859710693, "learning_rate": 0.0001805290799455359, "loss": 4.8214, "step": 1007 }, { "epoch": 0.29398468829748453, "grad_norm": 3.4017651081085205, "learning_rate": 0.00018050962847695, "loss": 4.397, "step": 1008 }, { "epoch": 0.2942763397739701, "grad_norm": 1.9845612049102783, "learning_rate": 0.00018049017700836416, "loss": 4.2863, "step": 1009 }, { "epoch": 0.2945679912504557, "grad_norm": 2.679077386856079, "learning_rate": 0.00018047072553977825, "loss": 4.5839, "step": 1010 }, { "epoch": 0.2948596427269413, "grad_norm": 3.6313836574554443, "learning_rate": 0.00018045127407119237, "loss": 4.6826, "step": 1011 }, { "epoch": 0.2951512942034269, "grad_norm": 4.18046236038208, "learning_rate": 0.00018043182260260652, "loss": 4.3975, "step": 1012 }, { "epoch": 0.2954429456799125, "grad_norm": 2.3462743759155273, "learning_rate": 0.00018041237113402062, "loss": 4.4575, "step": 1013 }, { "epoch": 0.2957345971563981, "grad_norm": 4.3241376876831055, "learning_rate": 0.00018039291966543474, "loss": 4.4517, "step": 1014 }, { "epoch": 0.2960262486328837, "grad_norm": 2.081380844116211, "learning_rate": 0.00018037346819684886, "loss": 4.5181, "step": 1015 }, { "epoch": 0.2963179001093693, "grad_norm": 2.674269676208496, "learning_rate": 0.000180354016728263, "loss": 4.489, "step": 1016 }, { "epoch": 0.2966095515858549, "grad_norm": 5.15725040435791, "learning_rate": 0.0001803345652596771, "loss": 4.5212, "step": 1017 }, { "epoch": 0.2969012030623405, "grad_norm": 4.408202648162842, "learning_rate": 0.00018031511379109123, "loss": 4.9358, "step": 1018 }, { "epoch": 0.2971928545388261, "grad_norm": 2.6191744804382324, "learning_rate": 0.00018029566232250537, "loss": 4.4199, "step": 1019 }, { "epoch": 0.2974845060153117, "grad_norm": 2.574160575866699, "learning_rate": 0.00018027621085391947, "loss": 4.6369, "step": 1020 }, { "epoch": 0.2977761574917973, "grad_norm": 1.9898086786270142, "learning_rate": 0.0001802567593853336, "loss": 4.3638, "step": 1021 }, { "epoch": 0.2980678089682829, "grad_norm": 4.105470180511475, "learning_rate": 0.00018023730791674774, "loss": 4.6551, "step": 1022 }, { "epoch": 0.2983594604447685, "grad_norm": 3.902003526687622, "learning_rate": 0.00018021785644816186, "loss": 4.6197, "step": 1023 }, { "epoch": 0.2986511119212541, "grad_norm": 2.60406494140625, "learning_rate": 0.00018019840497957595, "loss": 4.5843, "step": 1024 }, { "epoch": 0.2989427633977397, "grad_norm": 2.471078872680664, "learning_rate": 0.00018017895351099008, "loss": 4.3981, "step": 1025 }, { "epoch": 0.2992344148742253, "grad_norm": 3.0818331241607666, "learning_rate": 0.00018015950204240423, "loss": 4.6157, "step": 1026 }, { "epoch": 0.2995260663507109, "grad_norm": 2.4437153339385986, "learning_rate": 0.00018014005057381832, "loss": 4.4958, "step": 1027 }, { "epoch": 0.2998177178271965, "grad_norm": 3.781250476837158, "learning_rate": 0.00018012059910523244, "loss": 4.6602, "step": 1028 }, { "epoch": 0.3001093693036821, "grad_norm": 2.3718297481536865, "learning_rate": 0.0001801011476366466, "loss": 4.4787, "step": 1029 }, { "epoch": 0.3004010207801677, "grad_norm": 3.6673803329467773, "learning_rate": 0.0001800816961680607, "loss": 4.3257, "step": 1030 }, { "epoch": 0.3006926722566533, "grad_norm": 2.3554539680480957, "learning_rate": 0.0001800622446994748, "loss": 4.4623, "step": 1031 }, { "epoch": 0.3009843237331389, "grad_norm": 2.5130558013916016, "learning_rate": 0.00018004279323088895, "loss": 4.5661, "step": 1032 }, { "epoch": 0.3012759752096245, "grad_norm": 3.996344566345215, "learning_rate": 0.00018002334176230308, "loss": 4.5131, "step": 1033 }, { "epoch": 0.3015676266861101, "grad_norm": 2.2256553173065186, "learning_rate": 0.00018000389029371717, "loss": 4.5091, "step": 1034 }, { "epoch": 0.3018592781625957, "grad_norm": 2.329571485519409, "learning_rate": 0.0001799844388251313, "loss": 4.5101, "step": 1035 }, { "epoch": 0.3021509296390813, "grad_norm": 3.072093963623047, "learning_rate": 0.00017996498735654544, "loss": 4.4542, "step": 1036 }, { "epoch": 0.3024425811155669, "grad_norm": 3.157684564590454, "learning_rate": 0.00017994553588795954, "loss": 4.3021, "step": 1037 }, { "epoch": 0.3027342325920525, "grad_norm": 3.718353271484375, "learning_rate": 0.00017992608441937366, "loss": 4.2246, "step": 1038 }, { "epoch": 0.3030258840685381, "grad_norm": 3.138277530670166, "learning_rate": 0.0001799066329507878, "loss": 4.5891, "step": 1039 }, { "epoch": 0.3033175355450237, "grad_norm": 2.1776888370513916, "learning_rate": 0.00017988718148220193, "loss": 4.4976, "step": 1040 }, { "epoch": 0.3036091870215093, "grad_norm": 2.4615564346313477, "learning_rate": 0.00017986773001361602, "loss": 4.2674, "step": 1041 }, { "epoch": 0.3039008384979949, "grad_norm": 2.5812864303588867, "learning_rate": 0.00017984827854503017, "loss": 4.6748, "step": 1042 }, { "epoch": 0.3041924899744805, "grad_norm": 3.08701229095459, "learning_rate": 0.0001798288270764443, "loss": 4.7263, "step": 1043 }, { "epoch": 0.3044841414509661, "grad_norm": 2.826693058013916, "learning_rate": 0.0001798093756078584, "loss": 4.4728, "step": 1044 }, { "epoch": 0.3047757929274517, "grad_norm": 3.346087694168091, "learning_rate": 0.0001797899241392725, "loss": 4.4366, "step": 1045 }, { "epoch": 0.3050674444039373, "grad_norm": 2.4186623096466064, "learning_rate": 0.00017977047267068666, "loss": 4.2966, "step": 1046 }, { "epoch": 0.30535909588042287, "grad_norm": 2.689502239227295, "learning_rate": 0.00017975102120210078, "loss": 4.2344, "step": 1047 }, { "epoch": 0.3056507473569085, "grad_norm": 2.3369789123535156, "learning_rate": 0.00017973156973351487, "loss": 4.6088, "step": 1048 }, { "epoch": 0.3059423988333941, "grad_norm": 2.6013059616088867, "learning_rate": 0.00017971211826492902, "loss": 4.6601, "step": 1049 }, { "epoch": 0.3062340503098797, "grad_norm": 2.2491989135742188, "learning_rate": 0.00017969266679634314, "loss": 4.3814, "step": 1050 }, { "epoch": 0.3065257017863653, "grad_norm": 2.7879269123077393, "learning_rate": 0.00017967321532775724, "loss": 4.7267, "step": 1051 }, { "epoch": 0.30681735326285087, "grad_norm": 3.453599691390991, "learning_rate": 0.0001796537638591714, "loss": 4.5652, "step": 1052 }, { "epoch": 0.3071090047393365, "grad_norm": 2.402369976043701, "learning_rate": 0.0001796343123905855, "loss": 4.5466, "step": 1053 }, { "epoch": 0.3074006562158221, "grad_norm": 3.3110909461975098, "learning_rate": 0.00017961486092199963, "loss": 4.5351, "step": 1054 }, { "epoch": 0.3076923076923077, "grad_norm": 2.590883493423462, "learning_rate": 0.00017959540945341372, "loss": 4.0904, "step": 1055 }, { "epoch": 0.3079839591687933, "grad_norm": 2.4325168132781982, "learning_rate": 0.00017957595798482787, "loss": 4.2543, "step": 1056 }, { "epoch": 0.3082756106452789, "grad_norm": 2.208306074142456, "learning_rate": 0.000179556506516242, "loss": 4.7516, "step": 1057 }, { "epoch": 0.3085672621217645, "grad_norm": 3.8309485912323, "learning_rate": 0.0001795370550476561, "loss": 4.5568, "step": 1058 }, { "epoch": 0.3088589135982501, "grad_norm": 2.577030658721924, "learning_rate": 0.00017951760357907024, "loss": 4.4789, "step": 1059 }, { "epoch": 0.3091505650747357, "grad_norm": 3.4294164180755615, "learning_rate": 0.00017949815211048436, "loss": 4.5486, "step": 1060 }, { "epoch": 0.3094422165512213, "grad_norm": 3.4210143089294434, "learning_rate": 0.00017947870064189845, "loss": 4.7131, "step": 1061 }, { "epoch": 0.3097338680277069, "grad_norm": 2.86308217048645, "learning_rate": 0.0001794592491733126, "loss": 4.6802, "step": 1062 }, { "epoch": 0.3100255195041925, "grad_norm": 3.1586666107177734, "learning_rate": 0.00017943979770472672, "loss": 4.7129, "step": 1063 }, { "epoch": 0.31031717098067807, "grad_norm": 2.17549204826355, "learning_rate": 0.00017942034623614085, "loss": 4.4941, "step": 1064 }, { "epoch": 0.3106088224571637, "grad_norm": 2.222010850906372, "learning_rate": 0.00017940089476755494, "loss": 4.4879, "step": 1065 }, { "epoch": 0.3109004739336493, "grad_norm": 2.8590240478515625, "learning_rate": 0.0001793814432989691, "loss": 4.589, "step": 1066 }, { "epoch": 0.3111921254101349, "grad_norm": 2.8061201572418213, "learning_rate": 0.0001793619918303832, "loss": 4.4179, "step": 1067 }, { "epoch": 0.3114837768866205, "grad_norm": 2.563183307647705, "learning_rate": 0.0001793425403617973, "loss": 4.5651, "step": 1068 }, { "epoch": 0.31177542836310607, "grad_norm": 2.491063356399536, "learning_rate": 0.00017932308889321145, "loss": 4.4151, "step": 1069 }, { "epoch": 0.3120670798395917, "grad_norm": 2.9937868118286133, "learning_rate": 0.00017930363742462558, "loss": 4.9328, "step": 1070 }, { "epoch": 0.3123587313160773, "grad_norm": 2.885955572128296, "learning_rate": 0.0001792841859560397, "loss": 4.6452, "step": 1071 }, { "epoch": 0.3126503827925629, "grad_norm": 3.2374236583709717, "learning_rate": 0.0001792647344874538, "loss": 4.6774, "step": 1072 }, { "epoch": 0.3129420342690485, "grad_norm": 2.668114423751831, "learning_rate": 0.00017924528301886794, "loss": 4.5091, "step": 1073 }, { "epoch": 0.31323368574553406, "grad_norm": 2.9258241653442383, "learning_rate": 0.00017922583155028206, "loss": 4.3766, "step": 1074 }, { "epoch": 0.3135253372220197, "grad_norm": 2.5529367923736572, "learning_rate": 0.00017920638008169616, "loss": 4.1747, "step": 1075 }, { "epoch": 0.3138169886985053, "grad_norm": 3.1663520336151123, "learning_rate": 0.0001791869286131103, "loss": 4.5265, "step": 1076 }, { "epoch": 0.3141086401749909, "grad_norm": 6.998344421386719, "learning_rate": 0.00017916747714452443, "loss": 4.5405, "step": 1077 }, { "epoch": 0.3144002916514765, "grad_norm": 3.3080575466156006, "learning_rate": 0.00017914802567593855, "loss": 4.9159, "step": 1078 }, { "epoch": 0.31469194312796206, "grad_norm": 2.4695544242858887, "learning_rate": 0.00017912857420735267, "loss": 4.3078, "step": 1079 }, { "epoch": 0.3149835946044477, "grad_norm": 2.872800350189209, "learning_rate": 0.0001791091227387668, "loss": 4.4999, "step": 1080 }, { "epoch": 0.31527524608093327, "grad_norm": 3.0500426292419434, "learning_rate": 0.0001790896712701809, "loss": 4.5141, "step": 1081 }, { "epoch": 0.3155668975574189, "grad_norm": 2.859193801879883, "learning_rate": 0.000179070219801595, "loss": 4.4476, "step": 1082 }, { "epoch": 0.3158585490339045, "grad_norm": 2.561886787414551, "learning_rate": 0.00017905076833300916, "loss": 4.6415, "step": 1083 }, { "epoch": 0.3161502005103901, "grad_norm": 2.2389800548553467, "learning_rate": 0.00017903131686442328, "loss": 4.4914, "step": 1084 }, { "epoch": 0.3164418519868757, "grad_norm": 2.1097018718719482, "learning_rate": 0.0001790118653958374, "loss": 4.3274, "step": 1085 }, { "epoch": 0.31673350346336127, "grad_norm": 1.99626624584198, "learning_rate": 0.00017899241392725152, "loss": 4.5344, "step": 1086 }, { "epoch": 0.3170251549398469, "grad_norm": 3.380662441253662, "learning_rate": 0.00017897296245866564, "loss": 4.358, "step": 1087 }, { "epoch": 0.3173168064163325, "grad_norm": 2.405761480331421, "learning_rate": 0.00017895351099007976, "loss": 4.5419, "step": 1088 }, { "epoch": 0.3176084578928181, "grad_norm": 2.393461227416992, "learning_rate": 0.00017893405952149389, "loss": 4.5248, "step": 1089 }, { "epoch": 0.3179001093693037, "grad_norm": 2.0993311405181885, "learning_rate": 0.000178914608052908, "loss": 4.424, "step": 1090 }, { "epoch": 0.31819176084578926, "grad_norm": 2.801583766937256, "learning_rate": 0.00017889515658432213, "loss": 4.4097, "step": 1091 }, { "epoch": 0.3184834123222749, "grad_norm": 2.5001306533813477, "learning_rate": 0.00017887570511573622, "loss": 4.5657, "step": 1092 }, { "epoch": 0.31877506379876047, "grad_norm": 2.9304285049438477, "learning_rate": 0.00017885625364715037, "loss": 4.5794, "step": 1093 }, { "epoch": 0.3190667152752461, "grad_norm": 2.10331130027771, "learning_rate": 0.0001788368021785645, "loss": 4.5605, "step": 1094 }, { "epoch": 0.3193583667517317, "grad_norm": 2.0157623291015625, "learning_rate": 0.00017881735070997862, "loss": 4.3139, "step": 1095 }, { "epoch": 0.31965001822821726, "grad_norm": 2.2043492794036865, "learning_rate": 0.00017879789924139274, "loss": 4.7022, "step": 1096 }, { "epoch": 0.3199416697047029, "grad_norm": 2.89178729057312, "learning_rate": 0.00017877844777280686, "loss": 4.4125, "step": 1097 }, { "epoch": 0.32023332118118847, "grad_norm": 2.9423182010650635, "learning_rate": 0.00017875899630422098, "loss": 4.5354, "step": 1098 }, { "epoch": 0.3205249726576741, "grad_norm": 2.1678922176361084, "learning_rate": 0.0001787395448356351, "loss": 4.7421, "step": 1099 }, { "epoch": 0.3208166241341597, "grad_norm": 3.0321528911590576, "learning_rate": 0.00017872009336704922, "loss": 4.4262, "step": 1100 }, { "epoch": 0.32110827561064526, "grad_norm": 2.693889617919922, "learning_rate": 0.00017870064189846334, "loss": 4.6579, "step": 1101 }, { "epoch": 0.3213999270871309, "grad_norm": 2.7115190029144287, "learning_rate": 0.00017868119042987747, "loss": 4.3972, "step": 1102 }, { "epoch": 0.32169157856361646, "grad_norm": 2.8831443786621094, "learning_rate": 0.0001786617389612916, "loss": 4.4472, "step": 1103 }, { "epoch": 0.3219832300401021, "grad_norm": 3.1059701442718506, "learning_rate": 0.0001786422874927057, "loss": 4.5308, "step": 1104 }, { "epoch": 0.3222748815165877, "grad_norm": 2.739820957183838, "learning_rate": 0.00017862283602411983, "loss": 4.3993, "step": 1105 }, { "epoch": 0.32256653299307325, "grad_norm": 2.448288679122925, "learning_rate": 0.00017860338455553395, "loss": 4.3352, "step": 1106 }, { "epoch": 0.3228581844695589, "grad_norm": 2.8825342655181885, "learning_rate": 0.00017858393308694807, "loss": 4.6526, "step": 1107 }, { "epoch": 0.32314983594604446, "grad_norm": 4.759485721588135, "learning_rate": 0.0001785644816183622, "loss": 4.6884, "step": 1108 }, { "epoch": 0.3234414874225301, "grad_norm": 2.212366819381714, "learning_rate": 0.00017854503014977632, "loss": 4.46, "step": 1109 }, { "epoch": 0.32373313889901567, "grad_norm": 3.352062463760376, "learning_rate": 0.00017852557868119044, "loss": 4.4499, "step": 1110 }, { "epoch": 0.3240247903755013, "grad_norm": 3.779519557952881, "learning_rate": 0.00017850612721260456, "loss": 4.6569, "step": 1111 }, { "epoch": 0.3243164418519869, "grad_norm": 2.711879014968872, "learning_rate": 0.00017848667574401868, "loss": 4.4582, "step": 1112 }, { "epoch": 0.32460809332847246, "grad_norm": 2.5148355960845947, "learning_rate": 0.0001784672242754328, "loss": 4.1027, "step": 1113 }, { "epoch": 0.3248997448049581, "grad_norm": 2.549851417541504, "learning_rate": 0.00017844777280684693, "loss": 4.451, "step": 1114 }, { "epoch": 0.32519139628144367, "grad_norm": 2.1883106231689453, "learning_rate": 0.00017842832133826105, "loss": 4.4155, "step": 1115 }, { "epoch": 0.3254830477579293, "grad_norm": 1.7071841955184937, "learning_rate": 0.00017840886986967517, "loss": 4.421, "step": 1116 }, { "epoch": 0.3257746992344149, "grad_norm": 3.982304096221924, "learning_rate": 0.0001783894184010893, "loss": 4.5772, "step": 1117 }, { "epoch": 0.32606635071090045, "grad_norm": 2.9020838737487793, "learning_rate": 0.0001783699669325034, "loss": 4.4121, "step": 1118 }, { "epoch": 0.3263580021873861, "grad_norm": 2.3561794757843018, "learning_rate": 0.00017835051546391753, "loss": 4.7298, "step": 1119 }, { "epoch": 0.32664965366387166, "grad_norm": 3.60958194732666, "learning_rate": 0.00017833106399533165, "loss": 4.6759, "step": 1120 }, { "epoch": 0.3269413051403573, "grad_norm": 2.356527328491211, "learning_rate": 0.00017831161252674578, "loss": 4.7316, "step": 1121 }, { "epoch": 0.3272329566168429, "grad_norm": 3.0893924236297607, "learning_rate": 0.0001782921610581599, "loss": 4.6241, "step": 1122 }, { "epoch": 0.32752460809332845, "grad_norm": 3.808152675628662, "learning_rate": 0.00017827270958957402, "loss": 4.3683, "step": 1123 }, { "epoch": 0.3278162595698141, "grad_norm": 2.7183010578155518, "learning_rate": 0.00017825325812098814, "loss": 4.4949, "step": 1124 }, { "epoch": 0.32810791104629966, "grad_norm": 2.2837793827056885, "learning_rate": 0.00017823380665240226, "loss": 4.5206, "step": 1125 }, { "epoch": 0.3283995625227853, "grad_norm": 2.2845382690429688, "learning_rate": 0.00017821435518381638, "loss": 4.6476, "step": 1126 }, { "epoch": 0.32869121399927087, "grad_norm": 2.2997233867645264, "learning_rate": 0.0001781949037152305, "loss": 4.7176, "step": 1127 }, { "epoch": 0.32898286547575645, "grad_norm": 1.8410285711288452, "learning_rate": 0.00017817545224664463, "loss": 4.503, "step": 1128 }, { "epoch": 0.3292745169522421, "grad_norm": 2.811007022857666, "learning_rate": 0.00017815600077805875, "loss": 4.4934, "step": 1129 }, { "epoch": 0.32956616842872766, "grad_norm": 2.7285263538360596, "learning_rate": 0.00017813654930947287, "loss": 4.3629, "step": 1130 }, { "epoch": 0.3298578199052133, "grad_norm": 3.136348247528076, "learning_rate": 0.000178117097840887, "loss": 4.4112, "step": 1131 }, { "epoch": 0.33014947138169887, "grad_norm": 3.487299919128418, "learning_rate": 0.00017809764637230111, "loss": 4.5025, "step": 1132 }, { "epoch": 0.33044112285818444, "grad_norm": 2.741180419921875, "learning_rate": 0.00017807819490371524, "loss": 4.4221, "step": 1133 }, { "epoch": 0.3307327743346701, "grad_norm": 2.1465492248535156, "learning_rate": 0.00017805874343512936, "loss": 4.3773, "step": 1134 }, { "epoch": 0.33102442581115565, "grad_norm": 3.3313214778900146, "learning_rate": 0.00017803929196654348, "loss": 4.4929, "step": 1135 }, { "epoch": 0.3313160772876413, "grad_norm": 3.54565167427063, "learning_rate": 0.0001780198404979576, "loss": 4.3173, "step": 1136 }, { "epoch": 0.33160772876412686, "grad_norm": 2.343043804168701, "learning_rate": 0.00017800038902937172, "loss": 4.4953, "step": 1137 }, { "epoch": 0.33189938024061244, "grad_norm": 3.1847617626190186, "learning_rate": 0.00017798093756078584, "loss": 4.3298, "step": 1138 }, { "epoch": 0.3321910317170981, "grad_norm": 2.8231887817382812, "learning_rate": 0.0001779614860922, "loss": 4.6274, "step": 1139 }, { "epoch": 0.33248268319358365, "grad_norm": 2.550316095352173, "learning_rate": 0.0001779420346236141, "loss": 4.5321, "step": 1140 }, { "epoch": 0.3327743346700693, "grad_norm": 2.5577282905578613, "learning_rate": 0.0001779225831550282, "loss": 4.4308, "step": 1141 }, { "epoch": 0.33306598614655486, "grad_norm": 3.7577762603759766, "learning_rate": 0.00017790313168644233, "loss": 4.5729, "step": 1142 }, { "epoch": 0.3333576376230405, "grad_norm": 3.2185347080230713, "learning_rate": 0.00017788368021785645, "loss": 4.4749, "step": 1143 }, { "epoch": 0.33364928909952607, "grad_norm": 3.540191650390625, "learning_rate": 0.00017786422874927057, "loss": 4.4913, "step": 1144 }, { "epoch": 0.33394094057601165, "grad_norm": 2.1254236698150635, "learning_rate": 0.0001778447772806847, "loss": 4.4862, "step": 1145 }, { "epoch": 0.3342325920524973, "grad_norm": 2.141792058944702, "learning_rate": 0.00017782532581209884, "loss": 4.3605, "step": 1146 }, { "epoch": 0.33452424352898286, "grad_norm": 2.7703208923339844, "learning_rate": 0.00017780587434351294, "loss": 4.5781, "step": 1147 }, { "epoch": 0.3348158950054685, "grad_norm": 2.863664150238037, "learning_rate": 0.00017778642287492706, "loss": 4.5582, "step": 1148 }, { "epoch": 0.33510754648195407, "grad_norm": 3.724827289581299, "learning_rate": 0.00017776697140634118, "loss": 4.6561, "step": 1149 }, { "epoch": 0.33539919795843964, "grad_norm": 2.8234915733337402, "learning_rate": 0.0001777475199377553, "loss": 4.5951, "step": 1150 }, { "epoch": 0.3356908494349253, "grad_norm": 3.25119686126709, "learning_rate": 0.00017772806846916942, "loss": 4.8054, "step": 1151 }, { "epoch": 0.33598250091141085, "grad_norm": 2.0966732501983643, "learning_rate": 0.00017770861700058355, "loss": 4.6626, "step": 1152 }, { "epoch": 0.3362741523878965, "grad_norm": 3.1200156211853027, "learning_rate": 0.0001776891655319977, "loss": 4.563, "step": 1153 }, { "epoch": 0.33656580386438206, "grad_norm": 3.075165271759033, "learning_rate": 0.0001776697140634118, "loss": 4.6877, "step": 1154 }, { "epoch": 0.33685745534086764, "grad_norm": 2.0727813243865967, "learning_rate": 0.0001776502625948259, "loss": 4.4545, "step": 1155 }, { "epoch": 0.3371491068173533, "grad_norm": 2.687572956085205, "learning_rate": 0.00017763081112624006, "loss": 4.8519, "step": 1156 }, { "epoch": 0.33744075829383885, "grad_norm": 3.2346508502960205, "learning_rate": 0.00017761135965765415, "loss": 4.5716, "step": 1157 }, { "epoch": 0.3377324097703245, "grad_norm": 2.958810567855835, "learning_rate": 0.00017759190818906828, "loss": 4.387, "step": 1158 }, { "epoch": 0.33802406124681006, "grad_norm": 4.273870468139648, "learning_rate": 0.0001775724567204824, "loss": 4.4548, "step": 1159 }, { "epoch": 0.33831571272329564, "grad_norm": 2.5627903938293457, "learning_rate": 0.00017755300525189652, "loss": 4.4476, "step": 1160 }, { "epoch": 0.33860736419978127, "grad_norm": 2.5192909240722656, "learning_rate": 0.00017753355378331064, "loss": 4.273, "step": 1161 }, { "epoch": 0.33889901567626685, "grad_norm": 2.592876434326172, "learning_rate": 0.00017751410231472476, "loss": 4.355, "step": 1162 }, { "epoch": 0.3391906671527525, "grad_norm": 3.4373741149902344, "learning_rate": 0.0001774946508461389, "loss": 4.7145, "step": 1163 }, { "epoch": 0.33948231862923806, "grad_norm": 2.9488818645477295, "learning_rate": 0.000177475199377553, "loss": 4.3278, "step": 1164 }, { "epoch": 0.33977397010572363, "grad_norm": 2.661316394805908, "learning_rate": 0.00017745574790896713, "loss": 4.6008, "step": 1165 }, { "epoch": 0.34006562158220927, "grad_norm": 3.5569725036621094, "learning_rate": 0.00017743629644038128, "loss": 4.4155, "step": 1166 }, { "epoch": 0.34035727305869484, "grad_norm": 3.0264768600463867, "learning_rate": 0.00017741684497179537, "loss": 4.387, "step": 1167 }, { "epoch": 0.3406489245351805, "grad_norm": 2.6230413913726807, "learning_rate": 0.0001773973935032095, "loss": 4.4192, "step": 1168 }, { "epoch": 0.34094057601166605, "grad_norm": 2.9166624546051025, "learning_rate": 0.0001773779420346236, "loss": 4.2743, "step": 1169 }, { "epoch": 0.3412322274881517, "grad_norm": 2.8672988414764404, "learning_rate": 0.00017735849056603776, "loss": 4.4568, "step": 1170 }, { "epoch": 0.34152387896463726, "grad_norm": 2.933450222015381, "learning_rate": 0.00017733903909745186, "loss": 4.616, "step": 1171 }, { "epoch": 0.34181553044112284, "grad_norm": 3.4038174152374268, "learning_rate": 0.00017731958762886598, "loss": 4.5857, "step": 1172 }, { "epoch": 0.34210718191760847, "grad_norm": 2.076707363128662, "learning_rate": 0.00017730013616028013, "loss": 4.4963, "step": 1173 }, { "epoch": 0.34239883339409405, "grad_norm": 4.998654842376709, "learning_rate": 0.00017728068469169422, "loss": 4.7331, "step": 1174 }, { "epoch": 0.3426904848705797, "grad_norm": 2.7374157905578613, "learning_rate": 0.00017726123322310834, "loss": 4.6559, "step": 1175 }, { "epoch": 0.34298213634706526, "grad_norm": 3.927427053451538, "learning_rate": 0.0001772417817545225, "loss": 4.6962, "step": 1176 }, { "epoch": 0.34327378782355084, "grad_norm": 2.3067500591278076, "learning_rate": 0.0001772223302859366, "loss": 4.507, "step": 1177 }, { "epoch": 0.34356543930003647, "grad_norm": 2.502756118774414, "learning_rate": 0.0001772028788173507, "loss": 4.5398, "step": 1178 }, { "epoch": 0.34385709077652205, "grad_norm": 2.7696332931518555, "learning_rate": 0.00017718342734876483, "loss": 4.4311, "step": 1179 }, { "epoch": 0.3441487422530077, "grad_norm": 3.350590944290161, "learning_rate": 0.00017716397588017898, "loss": 4.4533, "step": 1180 }, { "epoch": 0.34444039372949325, "grad_norm": 2.922180414199829, "learning_rate": 0.00017714452441159307, "loss": 4.6322, "step": 1181 }, { "epoch": 0.34473204520597883, "grad_norm": 2.850008249282837, "learning_rate": 0.0001771250729430072, "loss": 4.4518, "step": 1182 }, { "epoch": 0.34502369668246446, "grad_norm": 2.5730996131896973, "learning_rate": 0.00017710562147442134, "loss": 4.5387, "step": 1183 }, { "epoch": 0.34531534815895004, "grad_norm": 1.4006218910217285, "learning_rate": 0.00017708617000583544, "loss": 4.3034, "step": 1184 }, { "epoch": 0.3456069996354357, "grad_norm": 2.911205768585205, "learning_rate": 0.00017706671853724956, "loss": 4.5768, "step": 1185 }, { "epoch": 0.34589865111192125, "grad_norm": 2.3314497470855713, "learning_rate": 0.0001770472670686637, "loss": 4.6392, "step": 1186 }, { "epoch": 0.34619030258840683, "grad_norm": 2.920581817626953, "learning_rate": 0.00017702781560007783, "loss": 4.6807, "step": 1187 }, { "epoch": 0.34648195406489246, "grad_norm": 2.71354341506958, "learning_rate": 0.00017700836413149192, "loss": 4.7952, "step": 1188 }, { "epoch": 0.34677360554137804, "grad_norm": 4.95272159576416, "learning_rate": 0.00017698891266290604, "loss": 4.2405, "step": 1189 }, { "epoch": 0.34706525701786367, "grad_norm": 4.100567817687988, "learning_rate": 0.0001769694611943202, "loss": 4.4195, "step": 1190 }, { "epoch": 0.34735690849434925, "grad_norm": 3.9103384017944336, "learning_rate": 0.0001769500097257343, "loss": 4.3446, "step": 1191 }, { "epoch": 0.3476485599708348, "grad_norm": 2.5346784591674805, "learning_rate": 0.0001769305582571484, "loss": 4.4844, "step": 1192 }, { "epoch": 0.34794021144732046, "grad_norm": 2.796638011932373, "learning_rate": 0.00017691110678856256, "loss": 4.5472, "step": 1193 }, { "epoch": 0.34823186292380603, "grad_norm": 5.5765886306762695, "learning_rate": 0.00017689165531997668, "loss": 4.6532, "step": 1194 }, { "epoch": 0.34852351440029167, "grad_norm": 3.282116174697876, "learning_rate": 0.00017687220385139077, "loss": 4.5867, "step": 1195 }, { "epoch": 0.34881516587677724, "grad_norm": 2.6672122478485107, "learning_rate": 0.00017685275238280492, "loss": 4.338, "step": 1196 }, { "epoch": 0.3491068173532629, "grad_norm": 3.6286933422088623, "learning_rate": 0.00017683330091421904, "loss": 4.7237, "step": 1197 }, { "epoch": 0.34939846882974845, "grad_norm": 3.800868034362793, "learning_rate": 0.00017681384944563314, "loss": 4.5715, "step": 1198 }, { "epoch": 0.34969012030623403, "grad_norm": 2.050668716430664, "learning_rate": 0.00017679439797704726, "loss": 4.3804, "step": 1199 }, { "epoch": 0.34998177178271966, "grad_norm": 3.45704984664917, "learning_rate": 0.0001767749465084614, "loss": 4.3203, "step": 1200 }, { "epoch": 0.35027342325920524, "grad_norm": 2.506547212600708, "learning_rate": 0.00017675549503987553, "loss": 4.318, "step": 1201 }, { "epoch": 0.3505650747356909, "grad_norm": 4.062066555023193, "learning_rate": 0.00017673604357128963, "loss": 4.498, "step": 1202 }, { "epoch": 0.35085672621217645, "grad_norm": 5.124155044555664, "learning_rate": 0.00017671659210270377, "loss": 4.5065, "step": 1203 }, { "epoch": 0.35114837768866203, "grad_norm": 4.127408981323242, "learning_rate": 0.0001766971406341179, "loss": 4.2891, "step": 1204 }, { "epoch": 0.35144002916514766, "grad_norm": 3.1936168670654297, "learning_rate": 0.000176677689165532, "loss": 4.4879, "step": 1205 }, { "epoch": 0.35173168064163324, "grad_norm": 2.357872247695923, "learning_rate": 0.00017665823769694614, "loss": 4.4104, "step": 1206 }, { "epoch": 0.35202333211811887, "grad_norm": 2.6832196712493896, "learning_rate": 0.00017663878622836026, "loss": 4.5152, "step": 1207 }, { "epoch": 0.35231498359460445, "grad_norm": 1.903099775314331, "learning_rate": 0.00017661933475977436, "loss": 4.3706, "step": 1208 }, { "epoch": 0.35260663507109, "grad_norm": 2.174852132797241, "learning_rate": 0.00017659988329118848, "loss": 4.3357, "step": 1209 }, { "epoch": 0.35289828654757566, "grad_norm": 1.9975764751434326, "learning_rate": 0.00017658043182260263, "loss": 4.3768, "step": 1210 }, { "epoch": 0.35318993802406123, "grad_norm": 2.952012062072754, "learning_rate": 0.00017656098035401675, "loss": 4.8351, "step": 1211 }, { "epoch": 0.35348158950054687, "grad_norm": 3.0893056392669678, "learning_rate": 0.00017654152888543084, "loss": 4.349, "step": 1212 }, { "epoch": 0.35377324097703244, "grad_norm": 2.706712484359741, "learning_rate": 0.000176522077416845, "loss": 4.4518, "step": 1213 }, { "epoch": 0.354064892453518, "grad_norm": 2.1332595348358154, "learning_rate": 0.0001765026259482591, "loss": 4.5844, "step": 1214 }, { "epoch": 0.35435654393000365, "grad_norm": 2.1260931491851807, "learning_rate": 0.0001764831744796732, "loss": 4.4941, "step": 1215 }, { "epoch": 0.35464819540648923, "grad_norm": 3.226846218109131, "learning_rate": 0.00017646372301108736, "loss": 4.4839, "step": 1216 }, { "epoch": 0.35493984688297486, "grad_norm": 1.7958468198776245, "learning_rate": 0.00017644427154250148, "loss": 4.2814, "step": 1217 }, { "epoch": 0.35523149835946044, "grad_norm": 2.817455768585205, "learning_rate": 0.0001764248200739156, "loss": 4.6794, "step": 1218 }, { "epoch": 0.355523149835946, "grad_norm": 3.4859426021575928, "learning_rate": 0.0001764053686053297, "loss": 4.6218, "step": 1219 }, { "epoch": 0.35581480131243165, "grad_norm": 2.8786661624908447, "learning_rate": 0.00017638591713674384, "loss": 4.5495, "step": 1220 }, { "epoch": 0.3561064527889172, "grad_norm": 3.78843355178833, "learning_rate": 0.00017636646566815796, "loss": 4.5616, "step": 1221 }, { "epoch": 0.35639810426540286, "grad_norm": 2.819976568222046, "learning_rate": 0.00017634701419957206, "loss": 4.5116, "step": 1222 }, { "epoch": 0.35668975574188844, "grad_norm": 2.22172474861145, "learning_rate": 0.0001763275627309862, "loss": 4.4941, "step": 1223 }, { "epoch": 0.35698140721837407, "grad_norm": 2.2595088481903076, "learning_rate": 0.00017630811126240033, "loss": 4.6169, "step": 1224 }, { "epoch": 0.35727305869485965, "grad_norm": 2.504345178604126, "learning_rate": 0.00017628865979381445, "loss": 4.4441, "step": 1225 }, { "epoch": 0.3575647101713452, "grad_norm": 2.023341417312622, "learning_rate": 0.00017626920832522857, "loss": 4.0554, "step": 1226 }, { "epoch": 0.35785636164783086, "grad_norm": 3.0917649269104004, "learning_rate": 0.0001762497568566427, "loss": 4.3576, "step": 1227 }, { "epoch": 0.35814801312431643, "grad_norm": 1.9751224517822266, "learning_rate": 0.00017623030538805681, "loss": 4.3732, "step": 1228 }, { "epoch": 0.35843966460080207, "grad_norm": 2.354872226715088, "learning_rate": 0.0001762108539194709, "loss": 4.4433, "step": 1229 }, { "epoch": 0.35873131607728764, "grad_norm": 2.709127902984619, "learning_rate": 0.00017619140245088506, "loss": 4.5297, "step": 1230 }, { "epoch": 0.3590229675537732, "grad_norm": 3.193901777267456, "learning_rate": 0.00017617195098229918, "loss": 4.4451, "step": 1231 }, { "epoch": 0.35931461903025885, "grad_norm": 4.517493724822998, "learning_rate": 0.0001761524995137133, "loss": 4.7654, "step": 1232 }, { "epoch": 0.35960627050674443, "grad_norm": 2.9823601245880127, "learning_rate": 0.00017613304804512742, "loss": 4.6681, "step": 1233 }, { "epoch": 0.35989792198323006, "grad_norm": 2.061765193939209, "learning_rate": 0.00017611359657654154, "loss": 4.541, "step": 1234 }, { "epoch": 0.36018957345971564, "grad_norm": 2.8424623012542725, "learning_rate": 0.00017609414510795567, "loss": 4.6351, "step": 1235 }, { "epoch": 0.3604812249362012, "grad_norm": 3.4036829471588135, "learning_rate": 0.00017607469363936976, "loss": 4.5036, "step": 1236 }, { "epoch": 0.36077287641268685, "grad_norm": 2.7642269134521484, "learning_rate": 0.0001760552421707839, "loss": 4.6524, "step": 1237 }, { "epoch": 0.3610645278891724, "grad_norm": 2.7102463245391846, "learning_rate": 0.00017603579070219803, "loss": 4.4515, "step": 1238 }, { "epoch": 0.36135617936565806, "grad_norm": 2.498394250869751, "learning_rate": 0.00017601633923361212, "loss": 4.0817, "step": 1239 }, { "epoch": 0.36164783084214364, "grad_norm": 3.6616926193237305, "learning_rate": 0.00017599688776502627, "loss": 4.613, "step": 1240 }, { "epoch": 0.3619394823186292, "grad_norm": 2.639096736907959, "learning_rate": 0.0001759774362964404, "loss": 4.4293, "step": 1241 }, { "epoch": 0.36223113379511485, "grad_norm": 2.2976789474487305, "learning_rate": 0.00017595798482785452, "loss": 4.3594, "step": 1242 }, { "epoch": 0.3625227852716004, "grad_norm": 3.129788875579834, "learning_rate": 0.00017593853335926864, "loss": 4.4864, "step": 1243 }, { "epoch": 0.36281443674808606, "grad_norm": 2.771576166152954, "learning_rate": 0.00017591908189068276, "loss": 4.5495, "step": 1244 }, { "epoch": 0.36310608822457163, "grad_norm": 2.9392614364624023, "learning_rate": 0.00017589963042209688, "loss": 4.2806, "step": 1245 }, { "epoch": 0.3633977397010572, "grad_norm": 2.9986560344696045, "learning_rate": 0.00017588017895351098, "loss": 4.28, "step": 1246 }, { "epoch": 0.36368939117754284, "grad_norm": 2.6872398853302, "learning_rate": 0.00017586072748492512, "loss": 4.6701, "step": 1247 }, { "epoch": 0.3639810426540284, "grad_norm": 2.700333833694458, "learning_rate": 0.00017584127601633925, "loss": 4.8035, "step": 1248 }, { "epoch": 0.36427269413051405, "grad_norm": 3.1276817321777344, "learning_rate": 0.00017582182454775337, "loss": 4.5677, "step": 1249 }, { "epoch": 0.36456434560699963, "grad_norm": 2.951049327850342, "learning_rate": 0.0001758023730791675, "loss": 4.5659, "step": 1250 }, { "epoch": 0.36485599708348526, "grad_norm": 2.385820150375366, "learning_rate": 0.0001757829216105816, "loss": 4.2896, "step": 1251 }, { "epoch": 0.36514764855997084, "grad_norm": 3.0691781044006348, "learning_rate": 0.00017576347014199573, "loss": 4.4775, "step": 1252 }, { "epoch": 0.3654393000364564, "grad_norm": 2.588818073272705, "learning_rate": 0.00017574401867340985, "loss": 4.5008, "step": 1253 }, { "epoch": 0.36573095151294205, "grad_norm": 2.6958911418914795, "learning_rate": 0.00017572456720482398, "loss": 4.5839, "step": 1254 }, { "epoch": 0.3660226029894276, "grad_norm": 3.2136223316192627, "learning_rate": 0.0001757051157362381, "loss": 4.4292, "step": 1255 }, { "epoch": 0.36631425446591326, "grad_norm": 3.051199197769165, "learning_rate": 0.00017568566426765222, "loss": 4.7238, "step": 1256 }, { "epoch": 0.36660590594239884, "grad_norm": 3.1484320163726807, "learning_rate": 0.00017566621279906634, "loss": 4.3659, "step": 1257 }, { "epoch": 0.3668975574188844, "grad_norm": 2.455749750137329, "learning_rate": 0.00017564676133048046, "loss": 4.5529, "step": 1258 }, { "epoch": 0.36718920889537005, "grad_norm": 2.9875826835632324, "learning_rate": 0.00017562730986189458, "loss": 4.7718, "step": 1259 }, { "epoch": 0.3674808603718556, "grad_norm": 3.7640507221221924, "learning_rate": 0.0001756078583933087, "loss": 4.8655, "step": 1260 }, { "epoch": 0.36777251184834125, "grad_norm": 2.1438419818878174, "learning_rate": 0.00017558840692472283, "loss": 4.4878, "step": 1261 }, { "epoch": 0.36806416332482683, "grad_norm": 4.133961200714111, "learning_rate": 0.00017556895545613695, "loss": 4.4664, "step": 1262 }, { "epoch": 0.3683558148013124, "grad_norm": 2.1371355056762695, "learning_rate": 0.00017554950398755107, "loss": 4.23, "step": 1263 }, { "epoch": 0.36864746627779804, "grad_norm": 2.8648788928985596, "learning_rate": 0.0001755300525189652, "loss": 4.5331, "step": 1264 }, { "epoch": 0.3689391177542836, "grad_norm": 3.318657875061035, "learning_rate": 0.0001755106010503793, "loss": 4.6453, "step": 1265 }, { "epoch": 0.36923076923076925, "grad_norm": 3.1688196659088135, "learning_rate": 0.00017549114958179343, "loss": 4.5956, "step": 1266 }, { "epoch": 0.36952242070725483, "grad_norm": 2.9021358489990234, "learning_rate": 0.00017547169811320756, "loss": 4.5049, "step": 1267 }, { "epoch": 0.3698140721837404, "grad_norm": 2.7046890258789062, "learning_rate": 0.00017545224664462168, "loss": 4.542, "step": 1268 }, { "epoch": 0.37010572366022604, "grad_norm": 3.8749022483825684, "learning_rate": 0.0001754327951760358, "loss": 4.8399, "step": 1269 }, { "epoch": 0.3703973751367116, "grad_norm": 2.905812978744507, "learning_rate": 0.00017541334370744992, "loss": 4.6378, "step": 1270 }, { "epoch": 0.37068902661319725, "grad_norm": 4.491177558898926, "learning_rate": 0.00017539389223886404, "loss": 4.8089, "step": 1271 }, { "epoch": 0.3709806780896828, "grad_norm": 3.838050127029419, "learning_rate": 0.00017537444077027816, "loss": 4.3138, "step": 1272 }, { "epoch": 0.3712723295661684, "grad_norm": 2.8107051849365234, "learning_rate": 0.00017535498930169229, "loss": 4.4998, "step": 1273 }, { "epoch": 0.37156398104265403, "grad_norm": 1.8959779739379883, "learning_rate": 0.0001753355378331064, "loss": 4.3174, "step": 1274 }, { "epoch": 0.3718556325191396, "grad_norm": 3.001485824584961, "learning_rate": 0.00017531608636452053, "loss": 4.5434, "step": 1275 }, { "epoch": 0.37214728399562524, "grad_norm": 2.605036497116089, "learning_rate": 0.00017529663489593465, "loss": 4.2816, "step": 1276 }, { "epoch": 0.3724389354721108, "grad_norm": 3.5271499156951904, "learning_rate": 0.00017527718342734877, "loss": 4.425, "step": 1277 }, { "epoch": 0.37273058694859645, "grad_norm": 2.7763915061950684, "learning_rate": 0.0001752577319587629, "loss": 4.4109, "step": 1278 }, { "epoch": 0.37302223842508203, "grad_norm": 4.100903034210205, "learning_rate": 0.00017523828049017702, "loss": 4.7894, "step": 1279 }, { "epoch": 0.3733138899015676, "grad_norm": 2.8597800731658936, "learning_rate": 0.00017521882902159114, "loss": 4.3607, "step": 1280 }, { "epoch": 0.37360554137805324, "grad_norm": 2.19535493850708, "learning_rate": 0.00017519937755300526, "loss": 4.4963, "step": 1281 }, { "epoch": 0.3738971928545388, "grad_norm": 2.5095391273498535, "learning_rate": 0.00017517992608441938, "loss": 4.5022, "step": 1282 }, { "epoch": 0.37418884433102445, "grad_norm": 2.901801109313965, "learning_rate": 0.0001751604746158335, "loss": 4.8616, "step": 1283 }, { "epoch": 0.37448049580751, "grad_norm": 2.9061219692230225, "learning_rate": 0.00017514102314724762, "loss": 4.4548, "step": 1284 }, { "epoch": 0.3747721472839956, "grad_norm": 2.3612983226776123, "learning_rate": 0.00017512157167866175, "loss": 4.4014, "step": 1285 }, { "epoch": 0.37506379876048124, "grad_norm": 2.6931040287017822, "learning_rate": 0.00017510212021007587, "loss": 4.461, "step": 1286 }, { "epoch": 0.3753554502369668, "grad_norm": 2.63236927986145, "learning_rate": 0.00017508266874149, "loss": 4.4005, "step": 1287 }, { "epoch": 0.37564710171345245, "grad_norm": 2.3327295780181885, "learning_rate": 0.0001750632172729041, "loss": 4.5157, "step": 1288 }, { "epoch": 0.375938753189938, "grad_norm": 4.032588005065918, "learning_rate": 0.00017504376580431823, "loss": 4.5769, "step": 1289 }, { "epoch": 0.3762304046664236, "grad_norm": 2.79107403755188, "learning_rate": 0.00017502431433573235, "loss": 4.3058, "step": 1290 }, { "epoch": 0.37652205614290923, "grad_norm": 3.3477067947387695, "learning_rate": 0.00017500486286714647, "loss": 4.6372, "step": 1291 }, { "epoch": 0.3768137076193948, "grad_norm": 2.688896417617798, "learning_rate": 0.0001749854113985606, "loss": 4.3945, "step": 1292 }, { "epoch": 0.37710535909588044, "grad_norm": 3.2177398204803467, "learning_rate": 0.00017496595992997474, "loss": 4.4174, "step": 1293 }, { "epoch": 0.377397010572366, "grad_norm": 2.920994520187378, "learning_rate": 0.00017494650846138884, "loss": 4.3991, "step": 1294 }, { "epoch": 0.3776886620488516, "grad_norm": 2.7373435497283936, "learning_rate": 0.00017492705699280296, "loss": 4.5973, "step": 1295 }, { "epoch": 0.37798031352533723, "grad_norm": 3.7929327487945557, "learning_rate": 0.00017490760552421708, "loss": 4.6507, "step": 1296 }, { "epoch": 0.3782719650018228, "grad_norm": 3.019359827041626, "learning_rate": 0.0001748881540556312, "loss": 4.3843, "step": 1297 }, { "epoch": 0.37856361647830844, "grad_norm": 1.877858281135559, "learning_rate": 0.00017486870258704533, "loss": 4.3123, "step": 1298 }, { "epoch": 0.378855267954794, "grad_norm": 1.9111145734786987, "learning_rate": 0.00017484925111845945, "loss": 4.4583, "step": 1299 }, { "epoch": 0.3791469194312796, "grad_norm": 2.899517774581909, "learning_rate": 0.00017482979964987357, "loss": 4.3771, "step": 1300 }, { "epoch": 0.3794385709077652, "grad_norm": 2.8186111450195312, "learning_rate": 0.0001748103481812877, "loss": 4.6283, "step": 1301 }, { "epoch": 0.3797302223842508, "grad_norm": 3.023254871368408, "learning_rate": 0.0001747908967127018, "loss": 4.6447, "step": 1302 }, { "epoch": 0.38002187386073644, "grad_norm": 2.004204273223877, "learning_rate": 0.00017477144524411596, "loss": 4.5504, "step": 1303 }, { "epoch": 0.380313525337222, "grad_norm": 2.0571863651275635, "learning_rate": 0.00017475199377553006, "loss": 4.3604, "step": 1304 }, { "epoch": 0.38060517681370765, "grad_norm": 2.6661884784698486, "learning_rate": 0.00017473254230694418, "loss": 4.55, "step": 1305 }, { "epoch": 0.3808968282901932, "grad_norm": 2.8553314208984375, "learning_rate": 0.0001747130908383583, "loss": 4.6189, "step": 1306 }, { "epoch": 0.3811884797666788, "grad_norm": 1.8085530996322632, "learning_rate": 0.00017469363936977242, "loss": 4.5416, "step": 1307 }, { "epoch": 0.38148013124316443, "grad_norm": 2.87648344039917, "learning_rate": 0.00017467418790118654, "loss": 4.4649, "step": 1308 }, { "epoch": 0.38177178271965, "grad_norm": 3.4572391510009766, "learning_rate": 0.00017465473643260066, "loss": 4.4601, "step": 1309 }, { "epoch": 0.38206343419613564, "grad_norm": 3.0717103481292725, "learning_rate": 0.0001746352849640148, "loss": 4.5165, "step": 1310 }, { "epoch": 0.3823550856726212, "grad_norm": 2.2125744819641113, "learning_rate": 0.0001746158334954289, "loss": 4.4995, "step": 1311 }, { "epoch": 0.3826467371491068, "grad_norm": 2.2923905849456787, "learning_rate": 0.00017459638202684303, "loss": 4.5522, "step": 1312 }, { "epoch": 0.38293838862559243, "grad_norm": 1.6171849966049194, "learning_rate": 0.00017457693055825715, "loss": 4.421, "step": 1313 }, { "epoch": 0.383230040102078, "grad_norm": 2.9998481273651123, "learning_rate": 0.00017455747908967127, "loss": 4.7808, "step": 1314 }, { "epoch": 0.38352169157856364, "grad_norm": 3.727531671524048, "learning_rate": 0.0001745380276210854, "loss": 4.666, "step": 1315 }, { "epoch": 0.3838133430550492, "grad_norm": 2.2775981426239014, "learning_rate": 0.00017451857615249951, "loss": 4.4986, "step": 1316 }, { "epoch": 0.3841049945315348, "grad_norm": 2.60878324508667, "learning_rate": 0.00017449912468391366, "loss": 4.7174, "step": 1317 }, { "epoch": 0.3843966460080204, "grad_norm": 3.20064115524292, "learning_rate": 0.00017447967321532776, "loss": 4.7291, "step": 1318 }, { "epoch": 0.384688297484506, "grad_norm": 3.1437392234802246, "learning_rate": 0.00017446022174674188, "loss": 4.402, "step": 1319 }, { "epoch": 0.38497994896099164, "grad_norm": 2.004809617996216, "learning_rate": 0.00017444077027815603, "loss": 4.4279, "step": 1320 }, { "epoch": 0.3852716004374772, "grad_norm": 2.0728678703308105, "learning_rate": 0.00017442131880957012, "loss": 4.4502, "step": 1321 }, { "epoch": 0.3855632519139628, "grad_norm": 2.811269760131836, "learning_rate": 0.00017440186734098424, "loss": 4.5432, "step": 1322 }, { "epoch": 0.3858549033904484, "grad_norm": 4.379106521606445, "learning_rate": 0.00017438241587239837, "loss": 4.5097, "step": 1323 }, { "epoch": 0.386146554866934, "grad_norm": 2.1892120838165283, "learning_rate": 0.00017436296440381251, "loss": 4.4447, "step": 1324 }, { "epoch": 0.38643820634341963, "grad_norm": 2.981058359146118, "learning_rate": 0.0001743435129352266, "loss": 4.5018, "step": 1325 }, { "epoch": 0.3867298578199052, "grad_norm": 2.1059415340423584, "learning_rate": 0.00017432406146664073, "loss": 4.2588, "step": 1326 }, { "epoch": 0.3870215092963908, "grad_norm": 2.780829668045044, "learning_rate": 0.00017430460999805488, "loss": 4.4806, "step": 1327 }, { "epoch": 0.3873131607728764, "grad_norm": 2.761164665222168, "learning_rate": 0.00017428515852946897, "loss": 4.5981, "step": 1328 }, { "epoch": 0.387604812249362, "grad_norm": 5.620452880859375, "learning_rate": 0.0001742657070608831, "loss": 4.7125, "step": 1329 }, { "epoch": 0.38789646372584763, "grad_norm": 2.576843500137329, "learning_rate": 0.00017424625559229724, "loss": 4.3718, "step": 1330 }, { "epoch": 0.3881881152023332, "grad_norm": 3.2907779216766357, "learning_rate": 0.00017422680412371134, "loss": 4.556, "step": 1331 }, { "epoch": 0.38847976667881884, "grad_norm": 2.8948826789855957, "learning_rate": 0.00017420735265512546, "loss": 4.5129, "step": 1332 }, { "epoch": 0.3887714181553044, "grad_norm": 3.206678867340088, "learning_rate": 0.00017418790118653958, "loss": 4.5829, "step": 1333 }, { "epoch": 0.38906306963179, "grad_norm": 2.6877543926239014, "learning_rate": 0.00017416844971795373, "loss": 4.6339, "step": 1334 }, { "epoch": 0.3893547211082756, "grad_norm": 2.2380189895629883, "learning_rate": 0.00017414899824936782, "loss": 4.5398, "step": 1335 }, { "epoch": 0.3896463725847612, "grad_norm": 4.172061920166016, "learning_rate": 0.00017412954678078195, "loss": 4.5284, "step": 1336 }, { "epoch": 0.38993802406124684, "grad_norm": 2.721508264541626, "learning_rate": 0.0001741100953121961, "loss": 4.2183, "step": 1337 }, { "epoch": 0.3902296755377324, "grad_norm": 2.499545097351074, "learning_rate": 0.0001740906438436102, "loss": 4.3203, "step": 1338 }, { "epoch": 0.390521327014218, "grad_norm": 2.269761085510254, "learning_rate": 0.0001740711923750243, "loss": 4.5341, "step": 1339 }, { "epoch": 0.3908129784907036, "grad_norm": 3.6112546920776367, "learning_rate": 0.00017405174090643846, "loss": 4.6593, "step": 1340 }, { "epoch": 0.3911046299671892, "grad_norm": 2.447064161300659, "learning_rate": 0.00017403228943785258, "loss": 4.7116, "step": 1341 }, { "epoch": 0.39139628144367483, "grad_norm": 3.1545209884643555, "learning_rate": 0.00017401283796926668, "loss": 4.6766, "step": 1342 }, { "epoch": 0.3916879329201604, "grad_norm": 3.1354851722717285, "learning_rate": 0.0001739933865006808, "loss": 4.5168, "step": 1343 }, { "epoch": 0.391979584396646, "grad_norm": 3.1610312461853027, "learning_rate": 0.00017397393503209495, "loss": 5.0154, "step": 1344 }, { "epoch": 0.3922712358731316, "grad_norm": 2.5744707584381104, "learning_rate": 0.00017395448356350904, "loss": 4.0116, "step": 1345 }, { "epoch": 0.3925628873496172, "grad_norm": 4.499542236328125, "learning_rate": 0.00017393503209492316, "loss": 4.4439, "step": 1346 }, { "epoch": 0.39285453882610283, "grad_norm": 4.399485111236572, "learning_rate": 0.0001739155806263373, "loss": 4.2148, "step": 1347 }, { "epoch": 0.3931461903025884, "grad_norm": 5.565404415130615, "learning_rate": 0.00017389612915775143, "loss": 4.5645, "step": 1348 }, { "epoch": 0.393437841779074, "grad_norm": 2.819756507873535, "learning_rate": 0.00017387667768916553, "loss": 4.5851, "step": 1349 }, { "epoch": 0.3937294932555596, "grad_norm": 3.6942856311798096, "learning_rate": 0.00017385722622057968, "loss": 4.7651, "step": 1350 }, { "epoch": 0.3940211447320452, "grad_norm": 2.5444259643554688, "learning_rate": 0.0001738377747519938, "loss": 4.6206, "step": 1351 }, { "epoch": 0.3943127962085308, "grad_norm": 3.205679178237915, "learning_rate": 0.0001738183232834079, "loss": 4.5685, "step": 1352 }, { "epoch": 0.3946044476850164, "grad_norm": 2.5269935131073, "learning_rate": 0.000173798871814822, "loss": 4.1945, "step": 1353 }, { "epoch": 0.394896099161502, "grad_norm": 2.450829029083252, "learning_rate": 0.00017377942034623616, "loss": 4.2905, "step": 1354 }, { "epoch": 0.3951877506379876, "grad_norm": 2.0542409420013428, "learning_rate": 0.00017375996887765026, "loss": 4.343, "step": 1355 }, { "epoch": 0.3954794021144732, "grad_norm": 2.0842254161834717, "learning_rate": 0.00017374051740906438, "loss": 4.693, "step": 1356 }, { "epoch": 0.3957710535909588, "grad_norm": 3.1108481884002686, "learning_rate": 0.00017372106594047853, "loss": 4.6007, "step": 1357 }, { "epoch": 0.3960627050674444, "grad_norm": 4.112346172332764, "learning_rate": 0.00017370161447189265, "loss": 4.5029, "step": 1358 }, { "epoch": 0.39635435654393003, "grad_norm": 2.711804151535034, "learning_rate": 0.00017368216300330674, "loss": 4.3137, "step": 1359 }, { "epoch": 0.3966460080204156, "grad_norm": 3.032407760620117, "learning_rate": 0.0001736627115347209, "loss": 4.5765, "step": 1360 }, { "epoch": 0.3969376594969012, "grad_norm": 3.0277085304260254, "learning_rate": 0.000173643260066135, "loss": 4.482, "step": 1361 }, { "epoch": 0.3972293109733868, "grad_norm": 2.5072784423828125, "learning_rate": 0.0001736238085975491, "loss": 4.32, "step": 1362 }, { "epoch": 0.3975209624498724, "grad_norm": 2.445128917694092, "learning_rate": 0.00017360435712896323, "loss": 4.5383, "step": 1363 }, { "epoch": 0.397812613926358, "grad_norm": 2.0463757514953613, "learning_rate": 0.00017358490566037738, "loss": 4.6464, "step": 1364 }, { "epoch": 0.3981042654028436, "grad_norm": 2.6678659915924072, "learning_rate": 0.0001735654541917915, "loss": 4.4004, "step": 1365 }, { "epoch": 0.3983959168793292, "grad_norm": 2.725379467010498, "learning_rate": 0.0001735460027232056, "loss": 4.4802, "step": 1366 }, { "epoch": 0.3986875683558148, "grad_norm": 2.8974671363830566, "learning_rate": 0.00017352655125461974, "loss": 4.597, "step": 1367 }, { "epoch": 0.3989792198323004, "grad_norm": 2.9844775199890137, "learning_rate": 0.00017350709978603386, "loss": 4.3439, "step": 1368 }, { "epoch": 0.399270871308786, "grad_norm": 2.4372615814208984, "learning_rate": 0.00017348764831744796, "loss": 4.324, "step": 1369 }, { "epoch": 0.3995625227852716, "grad_norm": 2.3564388751983643, "learning_rate": 0.0001734681968488621, "loss": 4.6684, "step": 1370 }, { "epoch": 0.3998541742617572, "grad_norm": 1.7240569591522217, "learning_rate": 0.00017344874538027623, "loss": 4.3064, "step": 1371 }, { "epoch": 0.4001458257382428, "grad_norm": 2.3921701908111572, "learning_rate": 0.00017342929391169035, "loss": 4.6245, "step": 1372 }, { "epoch": 0.4004374772147284, "grad_norm": 2.930800676345825, "learning_rate": 0.00017340984244310445, "loss": 4.418, "step": 1373 }, { "epoch": 0.400729128691214, "grad_norm": 3.0173308849334717, "learning_rate": 0.0001733903909745186, "loss": 4.6118, "step": 1374 }, { "epoch": 0.4010207801676996, "grad_norm": 2.3539669513702393, "learning_rate": 0.00017337093950593272, "loss": 4.6489, "step": 1375 }, { "epoch": 0.4013124316441852, "grad_norm": 2.574017286300659, "learning_rate": 0.0001733514880373468, "loss": 4.4908, "step": 1376 }, { "epoch": 0.4016040831206708, "grad_norm": 2.1166722774505615, "learning_rate": 0.00017333203656876096, "loss": 4.1689, "step": 1377 }, { "epoch": 0.4018957345971564, "grad_norm": 2.6104211807250977, "learning_rate": 0.00017331258510017508, "loss": 4.6146, "step": 1378 }, { "epoch": 0.402187386073642, "grad_norm": 2.5416104793548584, "learning_rate": 0.0001732931336315892, "loss": 4.6227, "step": 1379 }, { "epoch": 0.4024790375501276, "grad_norm": 3.046062469482422, "learning_rate": 0.00017327368216300332, "loss": 4.6651, "step": 1380 }, { "epoch": 0.40277068902661317, "grad_norm": 3.6967551708221436, "learning_rate": 0.00017325423069441745, "loss": 4.3864, "step": 1381 }, { "epoch": 0.4030623405030988, "grad_norm": 3.289407730102539, "learning_rate": 0.00017323477922583157, "loss": 4.713, "step": 1382 }, { "epoch": 0.4033539919795844, "grad_norm": 2.5590314865112305, "learning_rate": 0.00017321532775724566, "loss": 4.4533, "step": 1383 }, { "epoch": 0.40364564345607, "grad_norm": 2.6076860427856445, "learning_rate": 0.0001731958762886598, "loss": 4.7415, "step": 1384 }, { "epoch": 0.4039372949325556, "grad_norm": 2.522352695465088, "learning_rate": 0.00017317642482007393, "loss": 4.5419, "step": 1385 }, { "epoch": 0.4042289464090412, "grad_norm": 2.938706636428833, "learning_rate": 0.00017315697335148803, "loss": 4.5206, "step": 1386 }, { "epoch": 0.4045205978855268, "grad_norm": 2.3080291748046875, "learning_rate": 0.00017313752188290217, "loss": 4.1959, "step": 1387 }, { "epoch": 0.4048122493620124, "grad_norm": 2.482694625854492, "learning_rate": 0.0001731180704143163, "loss": 4.4226, "step": 1388 }, { "epoch": 0.405103900838498, "grad_norm": 2.2907590866088867, "learning_rate": 0.00017309861894573042, "loss": 4.5123, "step": 1389 }, { "epoch": 0.4053955523149836, "grad_norm": 2.790625810623169, "learning_rate": 0.00017307916747714454, "loss": 4.5642, "step": 1390 }, { "epoch": 0.4056872037914692, "grad_norm": 2.2563273906707764, "learning_rate": 0.00017305971600855866, "loss": 4.4915, "step": 1391 }, { "epoch": 0.4059788552679548, "grad_norm": 3.1624157428741455, "learning_rate": 0.00017304026453997278, "loss": 4.5846, "step": 1392 }, { "epoch": 0.4062705067444404, "grad_norm": 2.6615991592407227, "learning_rate": 0.00017302081307138688, "loss": 4.5372, "step": 1393 }, { "epoch": 0.406562158220926, "grad_norm": 2.607020616531372, "learning_rate": 0.00017300136160280103, "loss": 4.5841, "step": 1394 }, { "epoch": 0.4068538096974116, "grad_norm": 3.9241392612457275, "learning_rate": 0.00017298191013421515, "loss": 4.4081, "step": 1395 }, { "epoch": 0.4071454611738972, "grad_norm": 1.8952196836471558, "learning_rate": 0.00017296245866562927, "loss": 4.477, "step": 1396 }, { "epoch": 0.4074371126503828, "grad_norm": 2.6270079612731934, "learning_rate": 0.0001729430071970434, "loss": 4.5425, "step": 1397 }, { "epoch": 0.40772876412686837, "grad_norm": 3.25103497505188, "learning_rate": 0.0001729235557284575, "loss": 4.6752, "step": 1398 }, { "epoch": 0.408020415603354, "grad_norm": 2.6809027194976807, "learning_rate": 0.00017290410425987163, "loss": 4.3353, "step": 1399 }, { "epoch": 0.4083120670798396, "grad_norm": 2.900933265686035, "learning_rate": 0.00017288465279128573, "loss": 4.3826, "step": 1400 }, { "epoch": 0.4086037185563252, "grad_norm": 2.3961126804351807, "learning_rate": 0.00017286520132269988, "loss": 4.2305, "step": 1401 }, { "epoch": 0.4088953700328108, "grad_norm": 3.2276129722595215, "learning_rate": 0.000172845749854114, "loss": 4.1041, "step": 1402 }, { "epoch": 0.40918702150929637, "grad_norm": 2.775151491165161, "learning_rate": 0.00017282629838552812, "loss": 4.3109, "step": 1403 }, { "epoch": 0.409478672985782, "grad_norm": 2.887735366821289, "learning_rate": 0.00017280684691694224, "loss": 4.5772, "step": 1404 }, { "epoch": 0.4097703244622676, "grad_norm": 2.694093942642212, "learning_rate": 0.00017278739544835636, "loss": 4.3533, "step": 1405 }, { "epoch": 0.4100619759387532, "grad_norm": 2.3980820178985596, "learning_rate": 0.00017276794397977049, "loss": 4.1741, "step": 1406 }, { "epoch": 0.4103536274152388, "grad_norm": 2.9887959957122803, "learning_rate": 0.0001727484925111846, "loss": 4.4938, "step": 1407 }, { "epoch": 0.41064527889172436, "grad_norm": 3.4835283756256104, "learning_rate": 0.00017272904104259873, "loss": 4.3953, "step": 1408 }, { "epoch": 0.41093693036821, "grad_norm": 2.8985214233398438, "learning_rate": 0.00017270958957401285, "loss": 4.6456, "step": 1409 }, { "epoch": 0.4112285818446956, "grad_norm": 2.5493223667144775, "learning_rate": 0.00017269013810542694, "loss": 4.5893, "step": 1410 }, { "epoch": 0.4115202333211812, "grad_norm": 2.4553027153015137, "learning_rate": 0.0001726706866368411, "loss": 4.3353, "step": 1411 }, { "epoch": 0.4118118847976668, "grad_norm": 2.710907220840454, "learning_rate": 0.00017265123516825521, "loss": 4.468, "step": 1412 }, { "epoch": 0.4121035362741524, "grad_norm": 2.536273956298828, "learning_rate": 0.00017263178369966934, "loss": 4.7881, "step": 1413 }, { "epoch": 0.412395187750638, "grad_norm": 3.410524845123291, "learning_rate": 0.00017261233223108346, "loss": 4.3702, "step": 1414 }, { "epoch": 0.41268683922712357, "grad_norm": 3.3330187797546387, "learning_rate": 0.00017259288076249758, "loss": 3.9445, "step": 1415 }, { "epoch": 0.4129784907036092, "grad_norm": 2.8401365280151367, "learning_rate": 0.0001725734292939117, "loss": 4.5707, "step": 1416 }, { "epoch": 0.4132701421800948, "grad_norm": 2.314526081085205, "learning_rate": 0.00017255397782532582, "loss": 4.4412, "step": 1417 }, { "epoch": 0.4135617936565804, "grad_norm": 3.55175518989563, "learning_rate": 0.00017253452635673994, "loss": 4.605, "step": 1418 }, { "epoch": 0.413853445133066, "grad_norm": 2.056422472000122, "learning_rate": 0.00017251507488815407, "loss": 4.4779, "step": 1419 }, { "epoch": 0.41414509660955157, "grad_norm": 3.029111862182617, "learning_rate": 0.0001724956234195682, "loss": 4.6697, "step": 1420 }, { "epoch": 0.4144367480860372, "grad_norm": 2.235849618911743, "learning_rate": 0.0001724761719509823, "loss": 4.2005, "step": 1421 }, { "epoch": 0.4147283995625228, "grad_norm": 2.6585822105407715, "learning_rate": 0.00017245672048239643, "loss": 4.5101, "step": 1422 }, { "epoch": 0.4150200510390084, "grad_norm": 2.135563850402832, "learning_rate": 0.00017243726901381055, "loss": 4.5494, "step": 1423 }, { "epoch": 0.415311702515494, "grad_norm": 2.9669992923736572, "learning_rate": 0.00017241781754522467, "loss": 4.6513, "step": 1424 }, { "epoch": 0.41560335399197956, "grad_norm": 2.9781107902526855, "learning_rate": 0.0001723983660766388, "loss": 4.5519, "step": 1425 }, { "epoch": 0.4158950054684652, "grad_norm": 4.018941879272461, "learning_rate": 0.00017237891460805292, "loss": 3.8685, "step": 1426 }, { "epoch": 0.4161866569449508, "grad_norm": 3.416755437850952, "learning_rate": 0.00017235946313946704, "loss": 4.6777, "step": 1427 }, { "epoch": 0.4164783084214364, "grad_norm": 4.568665504455566, "learning_rate": 0.00017234001167088116, "loss": 4.5613, "step": 1428 }, { "epoch": 0.416769959897922, "grad_norm": 1.9452449083328247, "learning_rate": 0.00017232056020229528, "loss": 4.4649, "step": 1429 }, { "epoch": 0.41706161137440756, "grad_norm": 3.123260021209717, "learning_rate": 0.0001723011087337094, "loss": 4.394, "step": 1430 }, { "epoch": 0.4173532628508932, "grad_norm": 3.2838659286499023, "learning_rate": 0.00017228165726512352, "loss": 4.2875, "step": 1431 }, { "epoch": 0.41764491432737877, "grad_norm": 3.007009506225586, "learning_rate": 0.00017226220579653765, "loss": 4.5973, "step": 1432 }, { "epoch": 0.4179365658038644, "grad_norm": 2.0569963455200195, "learning_rate": 0.00017224275432795177, "loss": 4.4154, "step": 1433 }, { "epoch": 0.41822821728035, "grad_norm": 2.7815358638763428, "learning_rate": 0.0001722233028593659, "loss": 4.1743, "step": 1434 }, { "epoch": 0.41851986875683556, "grad_norm": 2.233372449874878, "learning_rate": 0.00017220385139078, "loss": 4.4611, "step": 1435 }, { "epoch": 0.4188115202333212, "grad_norm": 1.7795745134353638, "learning_rate": 0.00017218439992219413, "loss": 4.5124, "step": 1436 }, { "epoch": 0.41910317170980677, "grad_norm": 2.223618984222412, "learning_rate": 0.00017216494845360825, "loss": 4.4284, "step": 1437 }, { "epoch": 0.4193948231862924, "grad_norm": 2.679556369781494, "learning_rate": 0.00017214549698502238, "loss": 4.4034, "step": 1438 }, { "epoch": 0.419686474662778, "grad_norm": 3.0321426391601562, "learning_rate": 0.0001721260455164365, "loss": 4.3235, "step": 1439 }, { "epoch": 0.41997812613926355, "grad_norm": 3.5299484729766846, "learning_rate": 0.00017210659404785062, "loss": 4.6687, "step": 1440 }, { "epoch": 0.4202697776157492, "grad_norm": 3.952692985534668, "learning_rate": 0.00017208714257926474, "loss": 4.5081, "step": 1441 }, { "epoch": 0.42056142909223476, "grad_norm": 3.0566163063049316, "learning_rate": 0.00017206769111067886, "loss": 4.2796, "step": 1442 }, { "epoch": 0.4208530805687204, "grad_norm": 3.5320327281951904, "learning_rate": 0.00017204823964209298, "loss": 4.6079, "step": 1443 }, { "epoch": 0.42114473204520597, "grad_norm": 2.3637638092041016, "learning_rate": 0.0001720287881735071, "loss": 4.7151, "step": 1444 }, { "epoch": 0.4214363835216916, "grad_norm": 3.232579231262207, "learning_rate": 0.00017200933670492123, "loss": 4.5832, "step": 1445 }, { "epoch": 0.4217280349981772, "grad_norm": 2.5499489307403564, "learning_rate": 0.00017198988523633535, "loss": 4.5231, "step": 1446 }, { "epoch": 0.42201968647466276, "grad_norm": 2.551872730255127, "learning_rate": 0.00017197043376774947, "loss": 4.0384, "step": 1447 }, { "epoch": 0.4223113379511484, "grad_norm": 3.566239595413208, "learning_rate": 0.0001719509822991636, "loss": 4.6296, "step": 1448 }, { "epoch": 0.42260298942763397, "grad_norm": 2.881608724594116, "learning_rate": 0.0001719315308305777, "loss": 4.7847, "step": 1449 }, { "epoch": 0.4228946409041196, "grad_norm": 2.539767265319824, "learning_rate": 0.00017191207936199184, "loss": 4.5011, "step": 1450 }, { "epoch": 0.4231862923806052, "grad_norm": 1.9920523166656494, "learning_rate": 0.00017189262789340596, "loss": 4.3899, "step": 1451 }, { "epoch": 0.42347794385709076, "grad_norm": 2.1425530910491943, "learning_rate": 0.00017187317642482008, "loss": 4.5691, "step": 1452 }, { "epoch": 0.4237695953335764, "grad_norm": 2.4585931301116943, "learning_rate": 0.0001718537249562342, "loss": 4.6431, "step": 1453 }, { "epoch": 0.42406124681006196, "grad_norm": 3.2692041397094727, "learning_rate": 0.00017183427348764832, "loss": 4.542, "step": 1454 }, { "epoch": 0.4243528982865476, "grad_norm": 2.479025363922119, "learning_rate": 0.00017181482201906244, "loss": 4.5589, "step": 1455 }, { "epoch": 0.4246445497630332, "grad_norm": 3.7543792724609375, "learning_rate": 0.00017179537055047656, "loss": 4.5061, "step": 1456 }, { "epoch": 0.42493620123951875, "grad_norm": 1.927085518836975, "learning_rate": 0.0001717759190818907, "loss": 4.7687, "step": 1457 }, { "epoch": 0.4252278527160044, "grad_norm": 3.2601523399353027, "learning_rate": 0.0001717564676133048, "loss": 4.5296, "step": 1458 }, { "epoch": 0.42551950419248996, "grad_norm": 2.4126994609832764, "learning_rate": 0.00017173701614471893, "loss": 4.5159, "step": 1459 }, { "epoch": 0.4258111556689756, "grad_norm": 2.4034805297851562, "learning_rate": 0.00017171756467613305, "loss": 4.3901, "step": 1460 }, { "epoch": 0.42610280714546117, "grad_norm": 2.759125232696533, "learning_rate": 0.00017169811320754717, "loss": 4.0347, "step": 1461 }, { "epoch": 0.42639445862194675, "grad_norm": 2.8188023567199707, "learning_rate": 0.0001716786617389613, "loss": 4.5024, "step": 1462 }, { "epoch": 0.4266861100984324, "grad_norm": 3.269064426422119, "learning_rate": 0.00017165921027037542, "loss": 4.68, "step": 1463 }, { "epoch": 0.42697776157491796, "grad_norm": 2.759246826171875, "learning_rate": 0.00017163975880178956, "loss": 4.3532, "step": 1464 }, { "epoch": 0.4272694130514036, "grad_norm": 2.4753894805908203, "learning_rate": 0.00017162030733320366, "loss": 4.6519, "step": 1465 }, { "epoch": 0.42756106452788917, "grad_norm": 1.8815137147903442, "learning_rate": 0.00017160085586461778, "loss": 4.5244, "step": 1466 }, { "epoch": 0.42785271600437474, "grad_norm": 2.734689712524414, "learning_rate": 0.00017158140439603193, "loss": 4.4439, "step": 1467 }, { "epoch": 0.4281443674808604, "grad_norm": 3.543675184249878, "learning_rate": 0.00017156195292744602, "loss": 4.4914, "step": 1468 }, { "epoch": 0.42843601895734595, "grad_norm": 2.836369752883911, "learning_rate": 0.00017154250145886015, "loss": 4.5756, "step": 1469 }, { "epoch": 0.4287276704338316, "grad_norm": 3.0343494415283203, "learning_rate": 0.00017152304999027427, "loss": 4.7104, "step": 1470 }, { "epoch": 0.42901932191031716, "grad_norm": 2.24696946144104, "learning_rate": 0.00017150359852168842, "loss": 4.46, "step": 1471 }, { "epoch": 0.4293109733868028, "grad_norm": 2.563318967819214, "learning_rate": 0.0001714841470531025, "loss": 4.4522, "step": 1472 }, { "epoch": 0.4296026248632884, "grad_norm": 2.626352310180664, "learning_rate": 0.00017146469558451663, "loss": 4.5151, "step": 1473 }, { "epoch": 0.42989427633977395, "grad_norm": 2.1928443908691406, "learning_rate": 0.00017144524411593078, "loss": 4.4871, "step": 1474 }, { "epoch": 0.4301859278162596, "grad_norm": 2.5068676471710205, "learning_rate": 0.00017142579264734487, "loss": 4.5262, "step": 1475 }, { "epoch": 0.43047757929274516, "grad_norm": 3.8974225521087646, "learning_rate": 0.000171406341178759, "loss": 4.4703, "step": 1476 }, { "epoch": 0.4307692307692308, "grad_norm": 2.745652914047241, "learning_rate": 0.00017138688971017312, "loss": 4.5174, "step": 1477 }, { "epoch": 0.43106088224571637, "grad_norm": 2.353938102722168, "learning_rate": 0.00017136743824158724, "loss": 4.5099, "step": 1478 }, { "epoch": 0.43135253372220195, "grad_norm": 2.49061918258667, "learning_rate": 0.00017134798677300136, "loss": 4.6789, "step": 1479 }, { "epoch": 0.4316441851986876, "grad_norm": 3.759302854537964, "learning_rate": 0.00017132853530441548, "loss": 4.581, "step": 1480 }, { "epoch": 0.43193583667517316, "grad_norm": 2.655848741531372, "learning_rate": 0.00017130908383582963, "loss": 4.2488, "step": 1481 }, { "epoch": 0.4322274881516588, "grad_norm": 3.514204740524292, "learning_rate": 0.00017128963236724373, "loss": 4.4859, "step": 1482 }, { "epoch": 0.43251913962814437, "grad_norm": 2.474724292755127, "learning_rate": 0.00017127018089865785, "loss": 4.4845, "step": 1483 }, { "epoch": 0.43281079110462994, "grad_norm": 2.88610577583313, "learning_rate": 0.000171250729430072, "loss": 4.4505, "step": 1484 }, { "epoch": 0.4331024425811156, "grad_norm": 2.454451560974121, "learning_rate": 0.0001712312779614861, "loss": 4.5337, "step": 1485 }, { "epoch": 0.43339409405760115, "grad_norm": 2.0708038806915283, "learning_rate": 0.0001712118264929002, "loss": 4.3664, "step": 1486 }, { "epoch": 0.4336857455340868, "grad_norm": 3.619753122329712, "learning_rate": 0.00017119237502431433, "loss": 4.6041, "step": 1487 }, { "epoch": 0.43397739701057236, "grad_norm": 2.5957186222076416, "learning_rate": 0.00017117292355572848, "loss": 4.522, "step": 1488 }, { "epoch": 0.43426904848705794, "grad_norm": 3.4386672973632812, "learning_rate": 0.00017115347208714258, "loss": 4.4476, "step": 1489 }, { "epoch": 0.4345606999635436, "grad_norm": 2.386714458465576, "learning_rate": 0.0001711340206185567, "loss": 4.4116, "step": 1490 }, { "epoch": 0.43485235144002915, "grad_norm": 2.7172205448150635, "learning_rate": 0.00017111456914997085, "loss": 4.3809, "step": 1491 }, { "epoch": 0.4351440029165148, "grad_norm": 2.095721483230591, "learning_rate": 0.00017109511768138494, "loss": 4.2757, "step": 1492 }, { "epoch": 0.43543565439300036, "grad_norm": 2.7614569664001465, "learning_rate": 0.00017107566621279906, "loss": 4.6438, "step": 1493 }, { "epoch": 0.43572730586948594, "grad_norm": 2.107954502105713, "learning_rate": 0.0001710562147442132, "loss": 4.5639, "step": 1494 }, { "epoch": 0.43601895734597157, "grad_norm": 2.675074577331543, "learning_rate": 0.00017103676327562733, "loss": 4.4099, "step": 1495 }, { "epoch": 0.43631060882245715, "grad_norm": 4.116441249847412, "learning_rate": 0.00017101731180704143, "loss": 4.6245, "step": 1496 }, { "epoch": 0.4366022602989428, "grad_norm": 3.214047431945801, "learning_rate": 0.00017099786033845555, "loss": 4.3644, "step": 1497 }, { "epoch": 0.43689391177542836, "grad_norm": 2.312257766723633, "learning_rate": 0.0001709784088698697, "loss": 4.5912, "step": 1498 }, { "epoch": 0.437185563251914, "grad_norm": 3.6364331245422363, "learning_rate": 0.0001709589574012838, "loss": 4.6252, "step": 1499 }, { "epoch": 0.43747721472839957, "grad_norm": 2.5296998023986816, "learning_rate": 0.00017093950593269791, "loss": 4.4678, "step": 1500 }, { "epoch": 0.43776886620488514, "grad_norm": 2.397819995880127, "learning_rate": 0.00017092005446411206, "loss": 4.2443, "step": 1501 }, { "epoch": 0.4380605176813708, "grad_norm": 2.5445666313171387, "learning_rate": 0.00017090060299552616, "loss": 4.4984, "step": 1502 }, { "epoch": 0.43835216915785635, "grad_norm": 3.3043060302734375, "learning_rate": 0.00017088115152694028, "loss": 4.4347, "step": 1503 }, { "epoch": 0.438643820634342, "grad_norm": 2.504504442214966, "learning_rate": 0.00017086170005835443, "loss": 4.5991, "step": 1504 }, { "epoch": 0.43893547211082756, "grad_norm": 2.8330867290496826, "learning_rate": 0.00017084224858976855, "loss": 4.48, "step": 1505 }, { "epoch": 0.43922712358731314, "grad_norm": 2.349926471710205, "learning_rate": 0.00017082279712118264, "loss": 4.3808, "step": 1506 }, { "epoch": 0.4395187750637988, "grad_norm": 2.068993330001831, "learning_rate": 0.00017080334565259677, "loss": 4.2591, "step": 1507 }, { "epoch": 0.43981042654028435, "grad_norm": 2.9419660568237305, "learning_rate": 0.00017078389418401091, "loss": 4.3647, "step": 1508 }, { "epoch": 0.44010207801677, "grad_norm": 2.8261399269104004, "learning_rate": 0.000170764442715425, "loss": 4.4105, "step": 1509 }, { "epoch": 0.44039372949325556, "grad_norm": 2.86095929145813, "learning_rate": 0.00017074499124683913, "loss": 4.332, "step": 1510 }, { "epoch": 0.44068538096974114, "grad_norm": 2.8382537364959717, "learning_rate": 0.00017072553977825328, "loss": 4.443, "step": 1511 }, { "epoch": 0.44097703244622677, "grad_norm": 2.2347335815429688, "learning_rate": 0.0001707060883096674, "loss": 4.6755, "step": 1512 }, { "epoch": 0.44126868392271235, "grad_norm": 3.613821506500244, "learning_rate": 0.0001706866368410815, "loss": 4.6457, "step": 1513 }, { "epoch": 0.441560335399198, "grad_norm": 2.460024356842041, "learning_rate": 0.00017066718537249564, "loss": 4.4475, "step": 1514 }, { "epoch": 0.44185198687568356, "grad_norm": 2.6869351863861084, "learning_rate": 0.00017064773390390977, "loss": 4.4238, "step": 1515 }, { "epoch": 0.44214363835216913, "grad_norm": 2.9888343811035156, "learning_rate": 0.00017062828243532386, "loss": 4.4963, "step": 1516 }, { "epoch": 0.44243528982865477, "grad_norm": 2.6265852451324463, "learning_rate": 0.00017060883096673798, "loss": 4.5047, "step": 1517 }, { "epoch": 0.44272694130514034, "grad_norm": 3.1353752613067627, "learning_rate": 0.00017058937949815213, "loss": 4.5373, "step": 1518 }, { "epoch": 0.443018592781626, "grad_norm": 3.2772510051727295, "learning_rate": 0.00017056992802956625, "loss": 4.2138, "step": 1519 }, { "epoch": 0.44331024425811155, "grad_norm": 2.409822940826416, "learning_rate": 0.00017055047656098035, "loss": 4.5345, "step": 1520 }, { "epoch": 0.44360189573459713, "grad_norm": 2.5150344371795654, "learning_rate": 0.0001705310250923945, "loss": 4.3121, "step": 1521 }, { "epoch": 0.44389354721108276, "grad_norm": 2.6488542556762695, "learning_rate": 0.00017051157362380862, "loss": 4.6777, "step": 1522 }, { "epoch": 0.44418519868756834, "grad_norm": 1.9796192646026611, "learning_rate": 0.0001704921221552227, "loss": 4.5644, "step": 1523 }, { "epoch": 0.44447685016405397, "grad_norm": 2.263807535171509, "learning_rate": 0.00017047267068663686, "loss": 4.338, "step": 1524 }, { "epoch": 0.44476850164053955, "grad_norm": 3.6913297176361084, "learning_rate": 0.00017045321921805098, "loss": 4.2766, "step": 1525 }, { "epoch": 0.4450601531170252, "grad_norm": 2.535554885864258, "learning_rate": 0.00017043376774946508, "loss": 4.4923, "step": 1526 }, { "epoch": 0.44535180459351076, "grad_norm": 1.694980263710022, "learning_rate": 0.0001704143162808792, "loss": 4.2757, "step": 1527 }, { "epoch": 0.44564345606999634, "grad_norm": 4.626744747161865, "learning_rate": 0.00017039486481229335, "loss": 4.6737, "step": 1528 }, { "epoch": 0.44593510754648197, "grad_norm": 3.089491605758667, "learning_rate": 0.00017037541334370747, "loss": 4.5992, "step": 1529 }, { "epoch": 0.44622675902296755, "grad_norm": 2.3680622577667236, "learning_rate": 0.00017035596187512156, "loss": 4.6953, "step": 1530 }, { "epoch": 0.4465184104994532, "grad_norm": 3.1658310890197754, "learning_rate": 0.0001703365104065357, "loss": 4.3876, "step": 1531 }, { "epoch": 0.44681006197593875, "grad_norm": 2.9166102409362793, "learning_rate": 0.00017031705893794983, "loss": 4.4138, "step": 1532 }, { "epoch": 0.44710171345242433, "grad_norm": 3.987621545791626, "learning_rate": 0.00017029760746936393, "loss": 4.1971, "step": 1533 }, { "epoch": 0.44739336492890996, "grad_norm": 4.108159065246582, "learning_rate": 0.00017027815600077808, "loss": 4.4232, "step": 1534 }, { "epoch": 0.44768501640539554, "grad_norm": 2.4291372299194336, "learning_rate": 0.0001702587045321922, "loss": 4.502, "step": 1535 }, { "epoch": 0.4479766678818812, "grad_norm": 2.6166560649871826, "learning_rate": 0.00017023925306360632, "loss": 4.6787, "step": 1536 }, { "epoch": 0.44826831935836675, "grad_norm": 2.8644604682922363, "learning_rate": 0.00017021980159502041, "loss": 4.306, "step": 1537 }, { "epoch": 0.44855997083485233, "grad_norm": 3.245999336242676, "learning_rate": 0.00017020035012643456, "loss": 4.4137, "step": 1538 }, { "epoch": 0.44885162231133796, "grad_norm": 2.925076723098755, "learning_rate": 0.00017018089865784868, "loss": 4.075, "step": 1539 }, { "epoch": 0.44914327378782354, "grad_norm": 3.349163770675659, "learning_rate": 0.00017016144718926278, "loss": 4.4389, "step": 1540 }, { "epoch": 0.44943492526430917, "grad_norm": 2.1112611293792725, "learning_rate": 0.00017014199572067693, "loss": 4.1924, "step": 1541 }, { "epoch": 0.44972657674079475, "grad_norm": 3.1416962146759033, "learning_rate": 0.00017012254425209105, "loss": 4.56, "step": 1542 }, { "epoch": 0.4500182282172803, "grad_norm": 2.1659913063049316, "learning_rate": 0.00017010309278350517, "loss": 4.4549, "step": 1543 }, { "epoch": 0.45030987969376596, "grad_norm": 2.915001392364502, "learning_rate": 0.0001700836413149193, "loss": 4.3345, "step": 1544 }, { "epoch": 0.45060153117025153, "grad_norm": 2.76812744140625, "learning_rate": 0.00017006418984633341, "loss": 4.1329, "step": 1545 }, { "epoch": 0.45089318264673717, "grad_norm": 2.32857084274292, "learning_rate": 0.00017004473837774754, "loss": 4.5205, "step": 1546 }, { "epoch": 0.45118483412322274, "grad_norm": 1.9454349279403687, "learning_rate": 0.00017002528690916163, "loss": 4.4679, "step": 1547 }, { "epoch": 0.4514764855997083, "grad_norm": 2.5472614765167236, "learning_rate": 0.00017000583544057578, "loss": 4.4446, "step": 1548 }, { "epoch": 0.45176813707619395, "grad_norm": 2.681445360183716, "learning_rate": 0.0001699863839719899, "loss": 4.4394, "step": 1549 }, { "epoch": 0.45205978855267953, "grad_norm": 2.6119585037231445, "learning_rate": 0.00016996693250340402, "loss": 4.6661, "step": 1550 }, { "epoch": 0.45235144002916516, "grad_norm": 2.1177375316619873, "learning_rate": 0.00016994748103481814, "loss": 4.6103, "step": 1551 }, { "epoch": 0.45264309150565074, "grad_norm": 3.9427530765533447, "learning_rate": 0.00016992802956623226, "loss": 4.5256, "step": 1552 }, { "epoch": 0.4529347429821364, "grad_norm": 4.303971767425537, "learning_rate": 0.00016990857809764639, "loss": 4.4869, "step": 1553 }, { "epoch": 0.45322639445862195, "grad_norm": 3.0154030323028564, "learning_rate": 0.0001698891266290605, "loss": 4.7874, "step": 1554 }, { "epoch": 0.4535180459351075, "grad_norm": 3.244389772415161, "learning_rate": 0.00016986967516047463, "loss": 4.4318, "step": 1555 }, { "epoch": 0.45380969741159316, "grad_norm": 2.417712688446045, "learning_rate": 0.00016985022369188875, "loss": 4.6624, "step": 1556 }, { "epoch": 0.45410134888807874, "grad_norm": 3.292527675628662, "learning_rate": 0.00016983077222330285, "loss": 4.7813, "step": 1557 }, { "epoch": 0.45439300036456437, "grad_norm": 2.2487096786499023, "learning_rate": 0.000169811320754717, "loss": 4.171, "step": 1558 }, { "epoch": 0.45468465184104995, "grad_norm": 2.4701106548309326, "learning_rate": 0.00016979186928613112, "loss": 4.1359, "step": 1559 }, { "epoch": 0.4549763033175355, "grad_norm": 1.7634177207946777, "learning_rate": 0.00016977241781754524, "loss": 4.2403, "step": 1560 }, { "epoch": 0.45526795479402116, "grad_norm": 2.6408984661102295, "learning_rate": 0.00016975296634895936, "loss": 4.4394, "step": 1561 }, { "epoch": 0.45555960627050673, "grad_norm": 4.409041404724121, "learning_rate": 0.00016973351488037348, "loss": 4.5582, "step": 1562 }, { "epoch": 0.45585125774699237, "grad_norm": 2.6671242713928223, "learning_rate": 0.0001697140634117876, "loss": 4.6104, "step": 1563 }, { "epoch": 0.45614290922347794, "grad_norm": 2.3707025051116943, "learning_rate": 0.0001696946119432017, "loss": 4.3898, "step": 1564 }, { "epoch": 0.4564345606999635, "grad_norm": 2.7907862663269043, "learning_rate": 0.00016967516047461585, "loss": 4.0662, "step": 1565 }, { "epoch": 0.45672621217644915, "grad_norm": 2.30145263671875, "learning_rate": 0.00016965570900602997, "loss": 4.418, "step": 1566 }, { "epoch": 0.45701786365293473, "grad_norm": 3.171985149383545, "learning_rate": 0.0001696362575374441, "loss": 4.2438, "step": 1567 }, { "epoch": 0.45730951512942036, "grad_norm": 3.9732723236083984, "learning_rate": 0.0001696168060688582, "loss": 4.7065, "step": 1568 }, { "epoch": 0.45760116660590594, "grad_norm": 2.1142637729644775, "learning_rate": 0.00016959735460027233, "loss": 4.4766, "step": 1569 }, { "epoch": 0.4578928180823915, "grad_norm": 1.7309470176696777, "learning_rate": 0.00016957790313168645, "loss": 4.2896, "step": 1570 }, { "epoch": 0.45818446955887715, "grad_norm": 3.622689962387085, "learning_rate": 0.00016955845166310058, "loss": 4.5151, "step": 1571 }, { "epoch": 0.4584761210353627, "grad_norm": 2.45910382270813, "learning_rate": 0.0001695390001945147, "loss": 3.8862, "step": 1572 }, { "epoch": 0.45876777251184836, "grad_norm": 2.930189847946167, "learning_rate": 0.00016951954872592882, "loss": 4.5258, "step": 1573 }, { "epoch": 0.45905942398833394, "grad_norm": 2.0354020595550537, "learning_rate": 0.00016950009725734294, "loss": 4.6443, "step": 1574 }, { "epoch": 0.4593510754648195, "grad_norm": 2.5338714122772217, "learning_rate": 0.00016948064578875706, "loss": 4.6023, "step": 1575 }, { "epoch": 0.45964272694130515, "grad_norm": 2.197129726409912, "learning_rate": 0.00016946119432017118, "loss": 4.5915, "step": 1576 }, { "epoch": 0.4599343784177907, "grad_norm": 2.5400075912475586, "learning_rate": 0.0001694417428515853, "loss": 4.7677, "step": 1577 }, { "epoch": 0.46022602989427636, "grad_norm": 2.682941198348999, "learning_rate": 0.00016942229138299943, "loss": 4.3614, "step": 1578 }, { "epoch": 0.46051768137076193, "grad_norm": 3.2124595642089844, "learning_rate": 0.00016940283991441355, "loss": 4.587, "step": 1579 }, { "epoch": 0.46080933284724757, "grad_norm": 1.9325346946716309, "learning_rate": 0.00016938338844582767, "loss": 4.538, "step": 1580 }, { "epoch": 0.46110098432373314, "grad_norm": 3.1374475955963135, "learning_rate": 0.0001693639369772418, "loss": 4.5173, "step": 1581 }, { "epoch": 0.4613926358002187, "grad_norm": 2.3502933979034424, "learning_rate": 0.0001693444855086559, "loss": 4.4969, "step": 1582 }, { "epoch": 0.46168428727670435, "grad_norm": 3.162824869155884, "learning_rate": 0.00016932503404007003, "loss": 4.4214, "step": 1583 }, { "epoch": 0.46197593875318993, "grad_norm": 2.273456573486328, "learning_rate": 0.00016930558257148416, "loss": 4.6496, "step": 1584 }, { "epoch": 0.46226759022967556, "grad_norm": 2.56491756439209, "learning_rate": 0.00016928613110289828, "loss": 4.5563, "step": 1585 }, { "epoch": 0.46255924170616114, "grad_norm": 1.9051166772842407, "learning_rate": 0.0001692666796343124, "loss": 4.6693, "step": 1586 }, { "epoch": 0.4628508931826467, "grad_norm": 2.233158826828003, "learning_rate": 0.00016924722816572652, "loss": 4.5101, "step": 1587 }, { "epoch": 0.46314254465913235, "grad_norm": 2.7625064849853516, "learning_rate": 0.00016922777669714064, "loss": 4.5192, "step": 1588 }, { "epoch": 0.4634341961356179, "grad_norm": 2.6961376667022705, "learning_rate": 0.00016920832522855476, "loss": 4.4721, "step": 1589 }, { "epoch": 0.46372584761210356, "grad_norm": 1.9258490800857544, "learning_rate": 0.00016918887375996889, "loss": 4.5942, "step": 1590 }, { "epoch": 0.46401749908858914, "grad_norm": 3.277998447418213, "learning_rate": 0.000169169422291383, "loss": 4.2461, "step": 1591 }, { "epoch": 0.4643091505650747, "grad_norm": 2.6314949989318848, "learning_rate": 0.00016914997082279713, "loss": 4.2127, "step": 1592 }, { "epoch": 0.46460080204156035, "grad_norm": 2.3910024166107178, "learning_rate": 0.00016913051935421125, "loss": 4.3699, "step": 1593 }, { "epoch": 0.4648924535180459, "grad_norm": 3.05875301361084, "learning_rate": 0.00016911106788562537, "loss": 4.35, "step": 1594 }, { "epoch": 0.46518410499453156, "grad_norm": 2.0519471168518066, "learning_rate": 0.0001690916164170395, "loss": 4.5087, "step": 1595 }, { "epoch": 0.46547575647101713, "grad_norm": 2.250986337661743, "learning_rate": 0.00016907216494845361, "loss": 4.5371, "step": 1596 }, { "epoch": 0.4657674079475027, "grad_norm": 3.2536230087280273, "learning_rate": 0.00016905271347986774, "loss": 4.4465, "step": 1597 }, { "epoch": 0.46605905942398834, "grad_norm": 2.377401113510132, "learning_rate": 0.00016903326201128186, "loss": 4.2263, "step": 1598 }, { "epoch": 0.4663507109004739, "grad_norm": 2.495924472808838, "learning_rate": 0.00016901381054269598, "loss": 4.4101, "step": 1599 }, { "epoch": 0.46664236237695955, "grad_norm": 2.6485209465026855, "learning_rate": 0.0001689943590741101, "loss": 4.5272, "step": 1600 }, { "epoch": 0.46693401385344513, "grad_norm": 3.5430798530578613, "learning_rate": 0.00016897490760552422, "loss": 4.4421, "step": 1601 }, { "epoch": 0.4672256653299307, "grad_norm": 2.9309494495391846, "learning_rate": 0.00016895545613693834, "loss": 4.5522, "step": 1602 }, { "epoch": 0.46751731680641634, "grad_norm": 2.3674228191375732, "learning_rate": 0.00016893600466835247, "loss": 4.6255, "step": 1603 }, { "epoch": 0.4678089682829019, "grad_norm": 1.874340295791626, "learning_rate": 0.0001689165531997666, "loss": 4.4722, "step": 1604 }, { "epoch": 0.46810061975938755, "grad_norm": 2.3240504264831543, "learning_rate": 0.0001688971017311807, "loss": 4.3425, "step": 1605 }, { "epoch": 0.4683922712358731, "grad_norm": 2.5977625846862793, "learning_rate": 0.00016887765026259483, "loss": 4.4959, "step": 1606 }, { "epoch": 0.46868392271235876, "grad_norm": 2.9052412509918213, "learning_rate": 0.00016885819879400895, "loss": 4.6181, "step": 1607 }, { "epoch": 0.46897557418884434, "grad_norm": 2.7959394454956055, "learning_rate": 0.00016883874732542307, "loss": 4.5495, "step": 1608 }, { "epoch": 0.4692672256653299, "grad_norm": 2.463561773300171, "learning_rate": 0.0001688192958568372, "loss": 4.5318, "step": 1609 }, { "epoch": 0.46955887714181554, "grad_norm": 3.205758810043335, "learning_rate": 0.00016879984438825132, "loss": 4.4827, "step": 1610 }, { "epoch": 0.4698505286183011, "grad_norm": 2.973374843597412, "learning_rate": 0.00016878039291966547, "loss": 4.3354, "step": 1611 }, { "epoch": 0.47014218009478675, "grad_norm": 3.5650064945220947, "learning_rate": 0.00016876094145107956, "loss": 4.5051, "step": 1612 }, { "epoch": 0.47043383157127233, "grad_norm": 2.9197375774383545, "learning_rate": 0.00016874148998249368, "loss": 4.5178, "step": 1613 }, { "epoch": 0.4707254830477579, "grad_norm": 2.5021235942840576, "learning_rate": 0.0001687220385139078, "loss": 4.5885, "step": 1614 }, { "epoch": 0.47101713452424354, "grad_norm": 2.7029082775115967, "learning_rate": 0.00016870258704532193, "loss": 4.5037, "step": 1615 }, { "epoch": 0.4713087860007291, "grad_norm": 2.674102783203125, "learning_rate": 0.00016868313557673605, "loss": 4.5636, "step": 1616 }, { "epoch": 0.47160043747721475, "grad_norm": 3.4539144039154053, "learning_rate": 0.00016866368410815017, "loss": 4.29, "step": 1617 }, { "epoch": 0.47189208895370033, "grad_norm": 2.209900140762329, "learning_rate": 0.0001686442326395643, "loss": 4.2124, "step": 1618 }, { "epoch": 0.4721837404301859, "grad_norm": 3.3737196922302246, "learning_rate": 0.0001686247811709784, "loss": 4.741, "step": 1619 }, { "epoch": 0.47247539190667154, "grad_norm": 3.704805374145508, "learning_rate": 0.00016860532970239253, "loss": 4.2776, "step": 1620 }, { "epoch": 0.4727670433831571, "grad_norm": 2.272569417953491, "learning_rate": 0.00016858587823380668, "loss": 4.6037, "step": 1621 }, { "epoch": 0.47305869485964275, "grad_norm": 1.922649621963501, "learning_rate": 0.00016856642676522078, "loss": 4.2811, "step": 1622 }, { "epoch": 0.4733503463361283, "grad_norm": 2.912355661392212, "learning_rate": 0.0001685469752966349, "loss": 4.6421, "step": 1623 }, { "epoch": 0.4736419978126139, "grad_norm": 3.261472225189209, "learning_rate": 0.00016852752382804902, "loss": 4.3907, "step": 1624 }, { "epoch": 0.47393364928909953, "grad_norm": 2.1401491165161133, "learning_rate": 0.00016850807235946314, "loss": 4.3841, "step": 1625 }, { "epoch": 0.4742253007655851, "grad_norm": 2.8387067317962646, "learning_rate": 0.00016848862089087726, "loss": 4.5603, "step": 1626 }, { "epoch": 0.47451695224207074, "grad_norm": 2.7605020999908447, "learning_rate": 0.00016846916942229138, "loss": 4.3951, "step": 1627 }, { "epoch": 0.4748086037185563, "grad_norm": 2.299298048019409, "learning_rate": 0.00016844971795370553, "loss": 4.6276, "step": 1628 }, { "epoch": 0.4751002551950419, "grad_norm": 2.434077739715576, "learning_rate": 0.00016843026648511963, "loss": 4.4263, "step": 1629 }, { "epoch": 0.47539190667152753, "grad_norm": 2.3801004886627197, "learning_rate": 0.00016841081501653375, "loss": 4.4079, "step": 1630 }, { "epoch": 0.4756835581480131, "grad_norm": 2.0509603023529053, "learning_rate": 0.0001683913635479479, "loss": 4.4995, "step": 1631 }, { "epoch": 0.47597520962449874, "grad_norm": 3.910961866378784, "learning_rate": 0.000168371912079362, "loss": 4.4297, "step": 1632 }, { "epoch": 0.4762668611009843, "grad_norm": 2.3980071544647217, "learning_rate": 0.00016835246061077611, "loss": 4.2411, "step": 1633 }, { "epoch": 0.47655851257746995, "grad_norm": 2.2309746742248535, "learning_rate": 0.00016833300914219024, "loss": 4.676, "step": 1634 }, { "epoch": 0.4768501640539555, "grad_norm": 2.3161940574645996, "learning_rate": 0.00016831355767360438, "loss": 4.4559, "step": 1635 }, { "epoch": 0.4771418155304411, "grad_norm": 2.271172285079956, "learning_rate": 0.00016829410620501848, "loss": 4.3255, "step": 1636 }, { "epoch": 0.47743346700692674, "grad_norm": 3.1353678703308105, "learning_rate": 0.0001682746547364326, "loss": 4.3073, "step": 1637 }, { "epoch": 0.4777251184834123, "grad_norm": 2.455713987350464, "learning_rate": 0.00016825520326784675, "loss": 4.1155, "step": 1638 }, { "epoch": 0.47801676995989795, "grad_norm": 3.1328864097595215, "learning_rate": 0.00016823575179926084, "loss": 4.2951, "step": 1639 }, { "epoch": 0.4783084214363835, "grad_norm": 2.9178130626678467, "learning_rate": 0.00016821630033067497, "loss": 4.3071, "step": 1640 }, { "epoch": 0.4786000729128691, "grad_norm": 2.583307981491089, "learning_rate": 0.0001681968488620891, "loss": 4.3767, "step": 1641 }, { "epoch": 0.47889172438935473, "grad_norm": 2.9128427505493164, "learning_rate": 0.00016817739739350324, "loss": 4.5536, "step": 1642 }, { "epoch": 0.4791833758658403, "grad_norm": 2.613022804260254, "learning_rate": 0.00016815794592491733, "loss": 4.408, "step": 1643 }, { "epoch": 0.47947502734232594, "grad_norm": 3.323932647705078, "learning_rate": 0.00016813849445633145, "loss": 4.7458, "step": 1644 }, { "epoch": 0.4797666788188115, "grad_norm": 3.1241910457611084, "learning_rate": 0.0001681190429877456, "loss": 4.4658, "step": 1645 }, { "epoch": 0.4800583302952971, "grad_norm": 4.709049224853516, "learning_rate": 0.0001680995915191597, "loss": 4.3705, "step": 1646 }, { "epoch": 0.48034998177178273, "grad_norm": 2.768061637878418, "learning_rate": 0.00016808014005057382, "loss": 4.5215, "step": 1647 }, { "epoch": 0.4806416332482683, "grad_norm": 2.362276315689087, "learning_rate": 0.00016806068858198796, "loss": 4.4935, "step": 1648 }, { "epoch": 0.48093328472475394, "grad_norm": 2.4819185733795166, "learning_rate": 0.00016804123711340206, "loss": 4.2494, "step": 1649 }, { "epoch": 0.4812249362012395, "grad_norm": 1.7927018404006958, "learning_rate": 0.00016802178564481618, "loss": 4.2005, "step": 1650 }, { "epoch": 0.4815165876777251, "grad_norm": 2.515878677368164, "learning_rate": 0.0001680023341762303, "loss": 4.5075, "step": 1651 }, { "epoch": 0.4818082391542107, "grad_norm": 2.488715648651123, "learning_rate": 0.00016798288270764445, "loss": 4.555, "step": 1652 }, { "epoch": 0.4820998906306963, "grad_norm": 2.6351990699768066, "learning_rate": 0.00016796343123905855, "loss": 4.43, "step": 1653 }, { "epoch": 0.48239154210718194, "grad_norm": 2.8231143951416016, "learning_rate": 0.00016794397977047267, "loss": 4.7431, "step": 1654 }, { "epoch": 0.4826831935836675, "grad_norm": 3.2316930294036865, "learning_rate": 0.00016792452830188682, "loss": 4.3457, "step": 1655 }, { "epoch": 0.4829748450601531, "grad_norm": 2.3163530826568604, "learning_rate": 0.0001679050768333009, "loss": 4.19, "step": 1656 }, { "epoch": 0.4832664965366387, "grad_norm": 2.6923718452453613, "learning_rate": 0.00016788562536471503, "loss": 4.4649, "step": 1657 }, { "epoch": 0.4835581480131243, "grad_norm": 2.39280104637146, "learning_rate": 0.00016786617389612918, "loss": 4.2301, "step": 1658 }, { "epoch": 0.48384979948960993, "grad_norm": 2.309494733810425, "learning_rate": 0.0001678467224275433, "loss": 4.4145, "step": 1659 }, { "epoch": 0.4841414509660955, "grad_norm": 2.703267812728882, "learning_rate": 0.0001678272709589574, "loss": 4.5179, "step": 1660 }, { "epoch": 0.48443310244258114, "grad_norm": 2.8593404293060303, "learning_rate": 0.00016780781949037152, "loss": 4.3895, "step": 1661 }, { "epoch": 0.4847247539190667, "grad_norm": 2.739142894744873, "learning_rate": 0.00016778836802178567, "loss": 4.7413, "step": 1662 }, { "epoch": 0.4850164053955523, "grad_norm": 2.0358009338378906, "learning_rate": 0.00016776891655319976, "loss": 4.7199, "step": 1663 }, { "epoch": 0.48530805687203793, "grad_norm": 3.4050233364105225, "learning_rate": 0.00016774946508461388, "loss": 4.587, "step": 1664 }, { "epoch": 0.4855997083485235, "grad_norm": 2.849076271057129, "learning_rate": 0.00016773001361602803, "loss": 4.6099, "step": 1665 }, { "epoch": 0.48589135982500914, "grad_norm": 2.3704919815063477, "learning_rate": 0.00016771056214744215, "loss": 4.3437, "step": 1666 }, { "epoch": 0.4861830113014947, "grad_norm": 2.2899012565612793, "learning_rate": 0.00016769111067885625, "loss": 4.5939, "step": 1667 }, { "epoch": 0.4864746627779803, "grad_norm": 2.831625461578369, "learning_rate": 0.0001676716592102704, "loss": 4.4338, "step": 1668 }, { "epoch": 0.4867663142544659, "grad_norm": 3.395745038986206, "learning_rate": 0.00016765220774168452, "loss": 4.4774, "step": 1669 }, { "epoch": 0.4870579657309515, "grad_norm": 3.2790756225585938, "learning_rate": 0.0001676327562730986, "loss": 4.4311, "step": 1670 }, { "epoch": 0.48734961720743714, "grad_norm": 2.636113166809082, "learning_rate": 0.00016761330480451273, "loss": 4.2436, "step": 1671 }, { "epoch": 0.4876412686839227, "grad_norm": 2.6493561267852783, "learning_rate": 0.00016759385333592688, "loss": 4.4807, "step": 1672 }, { "epoch": 0.4879329201604083, "grad_norm": 2.9684741497039795, "learning_rate": 0.00016757440186734098, "loss": 4.7092, "step": 1673 }, { "epoch": 0.4882245716368939, "grad_norm": 2.065124988555908, "learning_rate": 0.0001675549503987551, "loss": 4.2647, "step": 1674 }, { "epoch": 0.4885162231133795, "grad_norm": 2.1902313232421875, "learning_rate": 0.00016753549893016925, "loss": 4.2272, "step": 1675 }, { "epoch": 0.48880787458986513, "grad_norm": 2.946654796600342, "learning_rate": 0.00016751604746158337, "loss": 4.6913, "step": 1676 }, { "epoch": 0.4890995260663507, "grad_norm": 1.8565019369125366, "learning_rate": 0.00016749659599299746, "loss": 4.4744, "step": 1677 }, { "epoch": 0.4893911775428363, "grad_norm": 3.2434334754943848, "learning_rate": 0.0001674771445244116, "loss": 4.4356, "step": 1678 }, { "epoch": 0.4896828290193219, "grad_norm": 3.076382875442505, "learning_rate": 0.00016745769305582573, "loss": 4.3056, "step": 1679 }, { "epoch": 0.4899744804958075, "grad_norm": 2.7160727977752686, "learning_rate": 0.00016743824158723983, "loss": 4.3937, "step": 1680 }, { "epoch": 0.49026613197229313, "grad_norm": 2.430354595184326, "learning_rate": 0.00016741879011865395, "loss": 4.3146, "step": 1681 }, { "epoch": 0.4905577834487787, "grad_norm": 3.214609146118164, "learning_rate": 0.0001673993386500681, "loss": 4.1497, "step": 1682 }, { "epoch": 0.4908494349252643, "grad_norm": 2.1312625408172607, "learning_rate": 0.00016737988718148222, "loss": 4.5401, "step": 1683 }, { "epoch": 0.4911410864017499, "grad_norm": 3.518298625946045, "learning_rate": 0.00016736043571289632, "loss": 4.7525, "step": 1684 }, { "epoch": 0.4914327378782355, "grad_norm": 2.7940781116485596, "learning_rate": 0.00016734098424431046, "loss": 4.4336, "step": 1685 }, { "epoch": 0.4917243893547211, "grad_norm": 3.288437604904175, "learning_rate": 0.00016732153277572459, "loss": 4.51, "step": 1686 }, { "epoch": 0.4920160408312067, "grad_norm": 1.9189974069595337, "learning_rate": 0.00016730208130713868, "loss": 4.7396, "step": 1687 }, { "epoch": 0.49230769230769234, "grad_norm": 3.1210386753082275, "learning_rate": 0.00016728262983855283, "loss": 4.4259, "step": 1688 }, { "epoch": 0.4925993437841779, "grad_norm": 2.8563549518585205, "learning_rate": 0.00016726317836996695, "loss": 4.4628, "step": 1689 }, { "epoch": 0.4928909952606635, "grad_norm": 2.556227207183838, "learning_rate": 0.00016724372690138107, "loss": 4.4976, "step": 1690 }, { "epoch": 0.4931826467371491, "grad_norm": 1.745159387588501, "learning_rate": 0.00016722427543279517, "loss": 4.5701, "step": 1691 }, { "epoch": 0.4934742982136347, "grad_norm": 2.686854362487793, "learning_rate": 0.00016720482396420932, "loss": 4.6856, "step": 1692 }, { "epoch": 0.49376594969012033, "grad_norm": 3.836987018585205, "learning_rate": 0.00016718537249562344, "loss": 4.365, "step": 1693 }, { "epoch": 0.4940576011666059, "grad_norm": 2.50612211227417, "learning_rate": 0.00016716592102703753, "loss": 4.5239, "step": 1694 }, { "epoch": 0.4943492526430915, "grad_norm": 3.3775880336761475, "learning_rate": 0.00016714646955845168, "loss": 4.8713, "step": 1695 }, { "epoch": 0.4946409041195771, "grad_norm": 2.145292282104492, "learning_rate": 0.0001671270180898658, "loss": 4.5127, "step": 1696 }, { "epoch": 0.4949325555960627, "grad_norm": 2.612708330154419, "learning_rate": 0.00016710756662127992, "loss": 4.3611, "step": 1697 }, { "epoch": 0.49522420707254833, "grad_norm": 4.453402996063232, "learning_rate": 0.00016708811515269404, "loss": 4.6637, "step": 1698 }, { "epoch": 0.4955158585490339, "grad_norm": 2.666440486907959, "learning_rate": 0.00016706866368410817, "loss": 4.4146, "step": 1699 }, { "epoch": 0.4958075100255195, "grad_norm": 3.836228132247925, "learning_rate": 0.0001670492122155223, "loss": 4.6119, "step": 1700 }, { "epoch": 0.4960991615020051, "grad_norm": 2.0900533199310303, "learning_rate": 0.00016702976074693638, "loss": 4.6449, "step": 1701 }, { "epoch": 0.4963908129784907, "grad_norm": 2.357597589492798, "learning_rate": 0.00016701030927835053, "loss": 4.3806, "step": 1702 }, { "epoch": 0.4966824644549763, "grad_norm": 2.470543146133423, "learning_rate": 0.00016699085780976465, "loss": 4.388, "step": 1703 }, { "epoch": 0.4969741159314619, "grad_norm": 2.110642910003662, "learning_rate": 0.00016697140634117875, "loss": 4.2655, "step": 1704 }, { "epoch": 0.4972657674079475, "grad_norm": 3.5704402923583984, "learning_rate": 0.0001669519548725929, "loss": 4.3613, "step": 1705 }, { "epoch": 0.4975574188844331, "grad_norm": 3.470667600631714, "learning_rate": 0.00016693250340400702, "loss": 4.2727, "step": 1706 }, { "epoch": 0.4978490703609187, "grad_norm": 4.321897983551025, "learning_rate": 0.00016691305193542114, "loss": 4.6333, "step": 1707 }, { "epoch": 0.4981407218374043, "grad_norm": 2.488215684890747, "learning_rate": 0.00016689360046683526, "loss": 4.1715, "step": 1708 }, { "epoch": 0.4984323733138899, "grad_norm": 5.064474582672119, "learning_rate": 0.00016687414899824938, "loss": 4.3708, "step": 1709 }, { "epoch": 0.4987240247903755, "grad_norm": 3.7401227951049805, "learning_rate": 0.0001668546975296635, "loss": 4.3096, "step": 1710 }, { "epoch": 0.4990156762668611, "grad_norm": 2.3441576957702637, "learning_rate": 0.0001668352460610776, "loss": 4.4056, "step": 1711 }, { "epoch": 0.4993073277433467, "grad_norm": 2.5148980617523193, "learning_rate": 0.00016681579459249175, "loss": 4.5779, "step": 1712 }, { "epoch": 0.4995989792198323, "grad_norm": 2.207287311553955, "learning_rate": 0.00016679634312390587, "loss": 3.9973, "step": 1713 }, { "epoch": 0.4998906306963179, "grad_norm": 1.8896124362945557, "learning_rate": 0.00016677689165532, "loss": 4.4583, "step": 1714 }, { "epoch": 0.5001822821728035, "grad_norm": 2.9943153858184814, "learning_rate": 0.0001667574401867341, "loss": 4.308, "step": 1715 }, { "epoch": 0.500473933649289, "grad_norm": 5.350209712982178, "learning_rate": 0.00016673798871814823, "loss": 4.0645, "step": 1716 }, { "epoch": 0.5007655851257747, "grad_norm": 4.051778793334961, "learning_rate": 0.00016671853724956235, "loss": 4.8268, "step": 1717 }, { "epoch": 0.5010572366022603, "grad_norm": 2.7786123752593994, "learning_rate": 0.00016669908578097648, "loss": 4.4734, "step": 1718 }, { "epoch": 0.5013488880787459, "grad_norm": 3.697319507598877, "learning_rate": 0.0001666796343123906, "loss": 4.5942, "step": 1719 }, { "epoch": 0.5016405395552315, "grad_norm": 2.900470018386841, "learning_rate": 0.00016666018284380472, "loss": 4.453, "step": 1720 }, { "epoch": 0.501932191031717, "grad_norm": 3.0270514488220215, "learning_rate": 0.00016664073137521884, "loss": 4.1447, "step": 1721 }, { "epoch": 0.5022238425082027, "grad_norm": 4.151137828826904, "learning_rate": 0.00016662127990663296, "loss": 4.4071, "step": 1722 }, { "epoch": 0.5025154939846883, "grad_norm": 2.253283977508545, "learning_rate": 0.00016660182843804708, "loss": 4.2124, "step": 1723 }, { "epoch": 0.5028071454611739, "grad_norm": 2.6119635105133057, "learning_rate": 0.0001665823769694612, "loss": 4.3974, "step": 1724 }, { "epoch": 0.5030987969376595, "grad_norm": 2.22757625579834, "learning_rate": 0.00016656292550087533, "loss": 4.5865, "step": 1725 }, { "epoch": 0.503390448414145, "grad_norm": 2.138556718826294, "learning_rate": 0.00016654347403228945, "loss": 3.9842, "step": 1726 }, { "epoch": 0.5036820998906307, "grad_norm": 1.719115138053894, "learning_rate": 0.00016652402256370357, "loss": 4.4872, "step": 1727 }, { "epoch": 0.5039737513671163, "grad_norm": 3.178161859512329, "learning_rate": 0.00016650457109511767, "loss": 4.1583, "step": 1728 }, { "epoch": 0.5042654028436019, "grad_norm": 3.309722423553467, "learning_rate": 0.00016648511962653181, "loss": 4.7822, "step": 1729 }, { "epoch": 0.5045570543200875, "grad_norm": 2.5806849002838135, "learning_rate": 0.00016646566815794594, "loss": 4.6818, "step": 1730 }, { "epoch": 0.5048487057965731, "grad_norm": 3.1456868648529053, "learning_rate": 0.00016644621668936006, "loss": 4.5301, "step": 1731 }, { "epoch": 0.5051403572730587, "grad_norm": 1.845948576927185, "learning_rate": 0.00016642676522077418, "loss": 4.5941, "step": 1732 }, { "epoch": 0.5054320087495443, "grad_norm": 3.1842167377471924, "learning_rate": 0.0001664073137521883, "loss": 4.6113, "step": 1733 }, { "epoch": 0.5057236602260299, "grad_norm": 2.392902374267578, "learning_rate": 0.00016638786228360242, "loss": 4.2957, "step": 1734 }, { "epoch": 0.5060153117025155, "grad_norm": 2.7591028213500977, "learning_rate": 0.00016636841081501654, "loss": 4.2749, "step": 1735 }, { "epoch": 0.5063069631790011, "grad_norm": 2.161865711212158, "learning_rate": 0.00016634895934643067, "loss": 4.5443, "step": 1736 }, { "epoch": 0.5065986146554867, "grad_norm": 2.754452705383301, "learning_rate": 0.0001663295078778448, "loss": 4.3933, "step": 1737 }, { "epoch": 0.5068902661319723, "grad_norm": 2.2845919132232666, "learning_rate": 0.0001663100564092589, "loss": 4.056, "step": 1738 }, { "epoch": 0.5071819176084579, "grad_norm": 2.909348249435425, "learning_rate": 0.00016629060494067303, "loss": 4.8459, "step": 1739 }, { "epoch": 0.5074735690849435, "grad_norm": 2.733834981918335, "learning_rate": 0.00016627115347208715, "loss": 4.2949, "step": 1740 }, { "epoch": 0.5077652205614291, "grad_norm": 3.12203049659729, "learning_rate": 0.00016625170200350127, "loss": 4.5738, "step": 1741 }, { "epoch": 0.5080568720379147, "grad_norm": 2.6664133071899414, "learning_rate": 0.0001662322505349154, "loss": 4.3651, "step": 1742 }, { "epoch": 0.5083485235144003, "grad_norm": 2.3456835746765137, "learning_rate": 0.00016621279906632952, "loss": 4.262, "step": 1743 }, { "epoch": 0.5086401749908859, "grad_norm": 2.3372178077697754, "learning_rate": 0.00016619334759774364, "loss": 4.3447, "step": 1744 }, { "epoch": 0.5089318264673715, "grad_norm": 2.5392863750457764, "learning_rate": 0.00016617389612915776, "loss": 4.517, "step": 1745 }, { "epoch": 0.5092234779438571, "grad_norm": 2.0860555171966553, "learning_rate": 0.00016615444466057188, "loss": 4.3794, "step": 1746 }, { "epoch": 0.5095151294203427, "grad_norm": 2.837566375732422, "learning_rate": 0.000166134993191986, "loss": 4.1206, "step": 1747 }, { "epoch": 0.5098067808968283, "grad_norm": 3.388219118118286, "learning_rate": 0.00016611554172340012, "loss": 4.4001, "step": 1748 }, { "epoch": 0.5100984323733139, "grad_norm": 2.6656508445739746, "learning_rate": 0.00016609609025481425, "loss": 4.1327, "step": 1749 }, { "epoch": 0.5103900838497994, "grad_norm": 2.3003954887390137, "learning_rate": 0.00016607663878622837, "loss": 4.4748, "step": 1750 }, { "epoch": 0.5106817353262851, "grad_norm": 3.042309522628784, "learning_rate": 0.0001660571873176425, "loss": 4.2806, "step": 1751 }, { "epoch": 0.5109733868027707, "grad_norm": 2.923701524734497, "learning_rate": 0.0001660377358490566, "loss": 4.2298, "step": 1752 }, { "epoch": 0.5112650382792563, "grad_norm": 3.38179087638855, "learning_rate": 0.00016601828438047073, "loss": 4.2518, "step": 1753 }, { "epoch": 0.5115566897557419, "grad_norm": 2.91685152053833, "learning_rate": 0.00016599883291188485, "loss": 4.3169, "step": 1754 }, { "epoch": 0.5118483412322274, "grad_norm": 2.74375057220459, "learning_rate": 0.00016597938144329898, "loss": 4.5261, "step": 1755 }, { "epoch": 0.5121399927087131, "grad_norm": 3.6301941871643066, "learning_rate": 0.0001659599299747131, "loss": 4.5404, "step": 1756 }, { "epoch": 0.5124316441851987, "grad_norm": 2.4009039402008057, "learning_rate": 0.00016594047850612722, "loss": 4.3972, "step": 1757 }, { "epoch": 0.5127232956616843, "grad_norm": 2.119097948074341, "learning_rate": 0.00016592102703754134, "loss": 4.471, "step": 1758 }, { "epoch": 0.5130149471381699, "grad_norm": 2.5719246864318848, "learning_rate": 0.00016590157556895546, "loss": 4.5855, "step": 1759 }, { "epoch": 0.5133065986146554, "grad_norm": 2.4004123210906982, "learning_rate": 0.00016588212410036958, "loss": 4.4413, "step": 1760 }, { "epoch": 0.5135982500911411, "grad_norm": 3.2533621788024902, "learning_rate": 0.0001658626726317837, "loss": 4.5316, "step": 1761 }, { "epoch": 0.5138899015676267, "grad_norm": 3.567354202270508, "learning_rate": 0.00016584322116319783, "loss": 4.508, "step": 1762 }, { "epoch": 0.5141815530441123, "grad_norm": 2.5760886669158936, "learning_rate": 0.00016582376969461195, "loss": 4.3291, "step": 1763 }, { "epoch": 0.5144732045205979, "grad_norm": 2.605013608932495, "learning_rate": 0.00016580431822602607, "loss": 4.2749, "step": 1764 }, { "epoch": 0.5147648559970834, "grad_norm": 2.471440076828003, "learning_rate": 0.0001657848667574402, "loss": 4.6197, "step": 1765 }, { "epoch": 0.5150565074735691, "grad_norm": 2.372479200363159, "learning_rate": 0.0001657654152888543, "loss": 4.2605, "step": 1766 }, { "epoch": 0.5153481589500547, "grad_norm": 5.731393337249756, "learning_rate": 0.00016574596382026843, "loss": 4.218, "step": 1767 }, { "epoch": 0.5156398104265403, "grad_norm": 2.534742593765259, "learning_rate": 0.00016572651235168256, "loss": 4.2694, "step": 1768 }, { "epoch": 0.5159314619030259, "grad_norm": 2.6292364597320557, "learning_rate": 0.00016570706088309668, "loss": 4.5373, "step": 1769 }, { "epoch": 0.5162231133795114, "grad_norm": 3.2613344192504883, "learning_rate": 0.0001656876094145108, "loss": 4.3573, "step": 1770 }, { "epoch": 0.5165147648559971, "grad_norm": 2.9583232402801514, "learning_rate": 0.00016566815794592492, "loss": 4.476, "step": 1771 }, { "epoch": 0.5168064163324827, "grad_norm": 2.427100658416748, "learning_rate": 0.00016564870647733904, "loss": 4.5046, "step": 1772 }, { "epoch": 0.5170980678089683, "grad_norm": 3.3320870399475098, "learning_rate": 0.00016562925500875316, "loss": 4.4486, "step": 1773 }, { "epoch": 0.5173897192854539, "grad_norm": 2.8858981132507324, "learning_rate": 0.00016560980354016729, "loss": 4.5214, "step": 1774 }, { "epoch": 0.5176813707619394, "grad_norm": 3.016476631164551, "learning_rate": 0.00016559035207158143, "loss": 4.2016, "step": 1775 }, { "epoch": 0.5179730222384251, "grad_norm": 2.1788530349731445, "learning_rate": 0.00016557090060299553, "loss": 4.4321, "step": 1776 }, { "epoch": 0.5182646737149107, "grad_norm": 3.3769004344940186, "learning_rate": 0.00016555144913440965, "loss": 4.4075, "step": 1777 }, { "epoch": 0.5185563251913963, "grad_norm": 3.4399044513702393, "learning_rate": 0.00016553199766582377, "loss": 4.4147, "step": 1778 }, { "epoch": 0.5188479766678818, "grad_norm": 2.052940845489502, "learning_rate": 0.0001655125461972379, "loss": 4.4742, "step": 1779 }, { "epoch": 0.5191396281443674, "grad_norm": 2.2280406951904297, "learning_rate": 0.00016549309472865202, "loss": 4.4614, "step": 1780 }, { "epoch": 0.5194312796208531, "grad_norm": 2.249501943588257, "learning_rate": 0.00016547364326006614, "loss": 4.5959, "step": 1781 }, { "epoch": 0.5197229310973387, "grad_norm": 2.4690370559692383, "learning_rate": 0.00016545419179148029, "loss": 4.3438, "step": 1782 }, { "epoch": 0.5200145825738243, "grad_norm": 2.2575490474700928, "learning_rate": 0.00016543474032289438, "loss": 4.2975, "step": 1783 }, { "epoch": 0.5203062340503098, "grad_norm": 2.1861374378204346, "learning_rate": 0.0001654152888543085, "loss": 4.4522, "step": 1784 }, { "epoch": 0.5205978855267955, "grad_norm": 2.050799608230591, "learning_rate": 0.00016539583738572265, "loss": 4.5356, "step": 1785 }, { "epoch": 0.5208895370032811, "grad_norm": 5.080857276916504, "learning_rate": 0.00016537638591713674, "loss": 4.3931, "step": 1786 }, { "epoch": 0.5211811884797667, "grad_norm": 4.341304302215576, "learning_rate": 0.00016535693444855087, "loss": 4.4254, "step": 1787 }, { "epoch": 0.5214728399562523, "grad_norm": 2.3332319259643555, "learning_rate": 0.000165337482979965, "loss": 4.2784, "step": 1788 }, { "epoch": 0.5217644914327378, "grad_norm": 3.074981212615967, "learning_rate": 0.00016531803151137914, "loss": 4.6141, "step": 1789 }, { "epoch": 0.5220561429092235, "grad_norm": 2.7330033779144287, "learning_rate": 0.00016529858004279323, "loss": 4.1679, "step": 1790 }, { "epoch": 0.5223477943857091, "grad_norm": 1.9947489500045776, "learning_rate": 0.00016527912857420735, "loss": 4.2821, "step": 1791 }, { "epoch": 0.5226394458621947, "grad_norm": 1.9314813613891602, "learning_rate": 0.0001652596771056215, "loss": 4.4301, "step": 1792 }, { "epoch": 0.5229310973386803, "grad_norm": 3.2701921463012695, "learning_rate": 0.0001652402256370356, "loss": 4.1368, "step": 1793 }, { "epoch": 0.5232227488151658, "grad_norm": 2.26045560836792, "learning_rate": 0.00016522077416844972, "loss": 4.5024, "step": 1794 }, { "epoch": 0.5235144002916515, "grad_norm": 2.771430015563965, "learning_rate": 0.00016520132269986387, "loss": 4.5359, "step": 1795 }, { "epoch": 0.5238060517681371, "grad_norm": 2.7485263347625732, "learning_rate": 0.00016518187123127796, "loss": 4.5334, "step": 1796 }, { "epoch": 0.5240977032446227, "grad_norm": 2.307821273803711, "learning_rate": 0.00016516241976269208, "loss": 4.3129, "step": 1797 }, { "epoch": 0.5243893547211083, "grad_norm": 3.4900546073913574, "learning_rate": 0.0001651429682941062, "loss": 4.5515, "step": 1798 }, { "epoch": 0.5246810061975938, "grad_norm": 2.996000051498413, "learning_rate": 0.00016512351682552035, "loss": 4.6648, "step": 1799 }, { "epoch": 0.5249726576740795, "grad_norm": 1.9217612743377686, "learning_rate": 0.00016510406535693445, "loss": 4.3865, "step": 1800 }, { "epoch": 0.5252643091505651, "grad_norm": 2.3929569721221924, "learning_rate": 0.00016508461388834857, "loss": 4.5574, "step": 1801 }, { "epoch": 0.5255559606270507, "grad_norm": 2.789442777633667, "learning_rate": 0.00016506516241976272, "loss": 4.6617, "step": 1802 }, { "epoch": 0.5258476121035363, "grad_norm": 2.663844108581543, "learning_rate": 0.0001650457109511768, "loss": 4.3434, "step": 1803 }, { "epoch": 0.5261392635800218, "grad_norm": 3.5875015258789062, "learning_rate": 0.00016502625948259093, "loss": 4.5536, "step": 1804 }, { "epoch": 0.5264309150565075, "grad_norm": 2.925231695175171, "learning_rate": 0.00016500680801400506, "loss": 4.4679, "step": 1805 }, { "epoch": 0.5267225665329931, "grad_norm": 2.457911252975464, "learning_rate": 0.0001649873565454192, "loss": 4.4191, "step": 1806 }, { "epoch": 0.5270142180094787, "grad_norm": 2.5424065589904785, "learning_rate": 0.0001649679050768333, "loss": 4.57, "step": 1807 }, { "epoch": 0.5273058694859643, "grad_norm": 2.152331829071045, "learning_rate": 0.00016494845360824742, "loss": 4.3826, "step": 1808 }, { "epoch": 0.5275975209624498, "grad_norm": 2.2238926887512207, "learning_rate": 0.00016492900213966157, "loss": 4.6008, "step": 1809 }, { "epoch": 0.5278891724389355, "grad_norm": 3.029357433319092, "learning_rate": 0.00016490955067107566, "loss": 4.3922, "step": 1810 }, { "epoch": 0.5281808239154211, "grad_norm": 2.863779067993164, "learning_rate": 0.00016489009920248978, "loss": 4.4141, "step": 1811 }, { "epoch": 0.5284724753919067, "grad_norm": 3.365574598312378, "learning_rate": 0.00016487064773390393, "loss": 4.6893, "step": 1812 }, { "epoch": 0.5287641268683922, "grad_norm": 2.296304225921631, "learning_rate": 0.00016485119626531806, "loss": 4.3277, "step": 1813 }, { "epoch": 0.5290557783448778, "grad_norm": 2.3453638553619385, "learning_rate": 0.00016483174479673215, "loss": 4.4729, "step": 1814 }, { "epoch": 0.5293474298213635, "grad_norm": 2.762958526611328, "learning_rate": 0.00016481229332814627, "loss": 4.5093, "step": 1815 }, { "epoch": 0.5296390812978491, "grad_norm": 1.9847302436828613, "learning_rate": 0.00016479284185956042, "loss": 4.3721, "step": 1816 }, { "epoch": 0.5299307327743347, "grad_norm": 2.2937066555023193, "learning_rate": 0.00016477339039097451, "loss": 4.2942, "step": 1817 }, { "epoch": 0.5302223842508202, "grad_norm": 3.0134506225585938, "learning_rate": 0.00016475393892238864, "loss": 4.3601, "step": 1818 }, { "epoch": 0.5305140357273058, "grad_norm": 3.5886178016662598, "learning_rate": 0.00016473448745380278, "loss": 4.5075, "step": 1819 }, { "epoch": 0.5308056872037915, "grad_norm": 3.5287699699401855, "learning_rate": 0.00016471503598521688, "loss": 4.5901, "step": 1820 }, { "epoch": 0.5310973386802771, "grad_norm": 2.422670602798462, "learning_rate": 0.000164695584516631, "loss": 4.3097, "step": 1821 }, { "epoch": 0.5313889901567627, "grad_norm": 3.3849802017211914, "learning_rate": 0.00016467613304804515, "loss": 4.3835, "step": 1822 }, { "epoch": 0.5316806416332482, "grad_norm": 2.9997777938842773, "learning_rate": 0.00016465668157945927, "loss": 4.7896, "step": 1823 }, { "epoch": 0.5319722931097338, "grad_norm": 1.8922972679138184, "learning_rate": 0.00016463723011087337, "loss": 4.0215, "step": 1824 }, { "epoch": 0.5322639445862195, "grad_norm": 2.163541555404663, "learning_rate": 0.0001646177786422875, "loss": 4.0962, "step": 1825 }, { "epoch": 0.5325555960627051, "grad_norm": 6.621460437774658, "learning_rate": 0.00016459832717370164, "loss": 4.5464, "step": 1826 }, { "epoch": 0.5328472475391907, "grad_norm": 3.1586618423461914, "learning_rate": 0.00016457887570511573, "loss": 4.5895, "step": 1827 }, { "epoch": 0.5331388990156762, "grad_norm": 2.786712646484375, "learning_rate": 0.00016455942423652985, "loss": 4.4848, "step": 1828 }, { "epoch": 0.5334305504921618, "grad_norm": 3.767674446105957, "learning_rate": 0.000164539972767944, "loss": 4.3801, "step": 1829 }, { "epoch": 0.5337222019686475, "grad_norm": 2.3487069606781006, "learning_rate": 0.00016452052129935812, "loss": 4.2817, "step": 1830 }, { "epoch": 0.5340138534451331, "grad_norm": 2.733771562576294, "learning_rate": 0.00016450106983077222, "loss": 4.6817, "step": 1831 }, { "epoch": 0.5343055049216187, "grad_norm": 2.2502517700195312, "learning_rate": 0.00016448161836218637, "loss": 4.5383, "step": 1832 }, { "epoch": 0.5345971563981042, "grad_norm": 3.2017555236816406, "learning_rate": 0.0001644621668936005, "loss": 4.6019, "step": 1833 }, { "epoch": 0.5348888078745898, "grad_norm": 2.679551124572754, "learning_rate": 0.00016444271542501458, "loss": 4.4865, "step": 1834 }, { "epoch": 0.5351804593510755, "grad_norm": 3.124843120574951, "learning_rate": 0.0001644232639564287, "loss": 4.4357, "step": 1835 }, { "epoch": 0.5354721108275611, "grad_norm": 2.843515396118164, "learning_rate": 0.00016440381248784285, "loss": 4.5018, "step": 1836 }, { "epoch": 0.5357637623040467, "grad_norm": 2.7090797424316406, "learning_rate": 0.00016438436101925697, "loss": 4.396, "step": 1837 }, { "epoch": 0.5360554137805322, "grad_norm": 1.990626573562622, "learning_rate": 0.00016436490955067107, "loss": 4.6556, "step": 1838 }, { "epoch": 0.5363470652570179, "grad_norm": 2.9319045543670654, "learning_rate": 0.00016434545808208522, "loss": 4.1928, "step": 1839 }, { "epoch": 0.5366387167335035, "grad_norm": 2.2402334213256836, "learning_rate": 0.00016432600661349934, "loss": 4.5995, "step": 1840 }, { "epoch": 0.5369303682099891, "grad_norm": 2.1020593643188477, "learning_rate": 0.00016430655514491343, "loss": 4.1321, "step": 1841 }, { "epoch": 0.5372220196864746, "grad_norm": 2.2812588214874268, "learning_rate": 0.00016428710367632758, "loss": 4.2034, "step": 1842 }, { "epoch": 0.5375136711629602, "grad_norm": 3.3313004970550537, "learning_rate": 0.0001642676522077417, "loss": 4.3836, "step": 1843 }, { "epoch": 0.5378053226394459, "grad_norm": 1.9380820989608765, "learning_rate": 0.0001642482007391558, "loss": 4.4387, "step": 1844 }, { "epoch": 0.5380969741159315, "grad_norm": 1.9918708801269531, "learning_rate": 0.00016422874927056992, "loss": 4.3883, "step": 1845 }, { "epoch": 0.5383886255924171, "grad_norm": 4.616386890411377, "learning_rate": 0.00016420929780198407, "loss": 4.5023, "step": 1846 }, { "epoch": 0.5386802770689026, "grad_norm": 3.120316982269287, "learning_rate": 0.0001641898463333982, "loss": 4.1717, "step": 1847 }, { "epoch": 0.5389719285453882, "grad_norm": 2.3358476161956787, "learning_rate": 0.00016417039486481228, "loss": 4.3531, "step": 1848 }, { "epoch": 0.5392635800218739, "grad_norm": 2.440647602081299, "learning_rate": 0.00016415094339622643, "loss": 4.6862, "step": 1849 }, { "epoch": 0.5395552314983595, "grad_norm": 3.1291322708129883, "learning_rate": 0.00016413149192764055, "loss": 4.5436, "step": 1850 }, { "epoch": 0.5398468829748451, "grad_norm": 2.1020286083221436, "learning_rate": 0.00016411204045905465, "loss": 4.4204, "step": 1851 }, { "epoch": 0.5401385344513306, "grad_norm": 1.7530839443206787, "learning_rate": 0.0001640925889904688, "loss": 4.3644, "step": 1852 }, { "epoch": 0.5404301859278162, "grad_norm": 2.402013063430786, "learning_rate": 0.00016407313752188292, "loss": 4.296, "step": 1853 }, { "epoch": 0.5407218374043019, "grad_norm": 2.363417148590088, "learning_rate": 0.00016405368605329704, "loss": 4.4353, "step": 1854 }, { "epoch": 0.5410134888807875, "grad_norm": 2.7639219760894775, "learning_rate": 0.00016403423458471113, "loss": 4.4884, "step": 1855 }, { "epoch": 0.5413051403572731, "grad_norm": 2.1559252738952637, "learning_rate": 0.00016401478311612528, "loss": 4.0763, "step": 1856 }, { "epoch": 0.5415967918337586, "grad_norm": 2.249889373779297, "learning_rate": 0.0001639953316475394, "loss": 4.5353, "step": 1857 }, { "epoch": 0.5418884433102442, "grad_norm": 1.9930373430252075, "learning_rate": 0.0001639758801789535, "loss": 4.0783, "step": 1858 }, { "epoch": 0.5421800947867299, "grad_norm": 2.5342812538146973, "learning_rate": 0.00016395642871036765, "loss": 4.51, "step": 1859 }, { "epoch": 0.5424717462632155, "grad_norm": 2.539196014404297, "learning_rate": 0.00016393697724178177, "loss": 4.753, "step": 1860 }, { "epoch": 0.5427633977397011, "grad_norm": 2.2923383712768555, "learning_rate": 0.0001639175257731959, "loss": 4.148, "step": 1861 }, { "epoch": 0.5430550492161866, "grad_norm": 2.6668105125427246, "learning_rate": 0.00016389807430461, "loss": 4.4016, "step": 1862 }, { "epoch": 0.5433467006926722, "grad_norm": 3.323136806488037, "learning_rate": 0.00016387862283602413, "loss": 4.5473, "step": 1863 }, { "epoch": 0.5436383521691579, "grad_norm": 3.826636552810669, "learning_rate": 0.00016385917136743826, "loss": 4.6526, "step": 1864 }, { "epoch": 0.5439300036456435, "grad_norm": 2.3087868690490723, "learning_rate": 0.00016383971989885235, "loss": 4.3657, "step": 1865 }, { "epoch": 0.544221655122129, "grad_norm": 2.7869179248809814, "learning_rate": 0.0001638202684302665, "loss": 4.2362, "step": 1866 }, { "epoch": 0.5445133065986146, "grad_norm": 3.1192626953125, "learning_rate": 0.00016380081696168062, "loss": 4.4879, "step": 1867 }, { "epoch": 0.5448049580751002, "grad_norm": 2.451077699661255, "learning_rate": 0.00016378136549309474, "loss": 4.3353, "step": 1868 }, { "epoch": 0.5450966095515859, "grad_norm": 2.1042792797088623, "learning_rate": 0.00016376191402450886, "loss": 4.5662, "step": 1869 }, { "epoch": 0.5453882610280715, "grad_norm": 1.861940622329712, "learning_rate": 0.00016374246255592299, "loss": 4.4709, "step": 1870 }, { "epoch": 0.545679912504557, "grad_norm": 2.4429590702056885, "learning_rate": 0.0001637230110873371, "loss": 4.2779, "step": 1871 }, { "epoch": 0.5459715639810426, "grad_norm": 2.604729652404785, "learning_rate": 0.00016370355961875123, "loss": 4.5584, "step": 1872 }, { "epoch": 0.5462632154575282, "grad_norm": 2.4938788414001465, "learning_rate": 0.00016368410815016535, "loss": 4.38, "step": 1873 }, { "epoch": 0.5465548669340139, "grad_norm": 1.8939335346221924, "learning_rate": 0.00016366465668157947, "loss": 4.3091, "step": 1874 }, { "epoch": 0.5468465184104995, "grad_norm": 6.579743385314941, "learning_rate": 0.00016364520521299357, "loss": 4.5512, "step": 1875 }, { "epoch": 0.547138169886985, "grad_norm": 2.4229767322540283, "learning_rate": 0.00016362575374440772, "loss": 4.647, "step": 1876 }, { "epoch": 0.5474298213634706, "grad_norm": 2.4490559101104736, "learning_rate": 0.00016360630227582184, "loss": 4.775, "step": 1877 }, { "epoch": 0.5477214728399562, "grad_norm": 3.229632616043091, "learning_rate": 0.00016358685080723596, "loss": 4.5694, "step": 1878 }, { "epoch": 0.5480131243164419, "grad_norm": 2.5524628162384033, "learning_rate": 0.00016356739933865008, "loss": 4.2943, "step": 1879 }, { "epoch": 0.5483047757929275, "grad_norm": 2.4401752948760986, "learning_rate": 0.0001635479478700642, "loss": 4.3637, "step": 1880 }, { "epoch": 0.548596427269413, "grad_norm": 2.0960233211517334, "learning_rate": 0.00016352849640147832, "loss": 4.5485, "step": 1881 }, { "epoch": 0.5488880787458986, "grad_norm": 3.5352299213409424, "learning_rate": 0.00016350904493289245, "loss": 4.5714, "step": 1882 }, { "epoch": 0.5491797302223842, "grad_norm": 2.1738648414611816, "learning_rate": 0.00016348959346430657, "loss": 4.5748, "step": 1883 }, { "epoch": 0.5494713816988699, "grad_norm": 2.1155638694763184, "learning_rate": 0.0001634701419957207, "loss": 4.4354, "step": 1884 }, { "epoch": 0.5497630331753555, "grad_norm": 3.446930408477783, "learning_rate": 0.0001634506905271348, "loss": 4.6194, "step": 1885 }, { "epoch": 0.550054684651841, "grad_norm": 1.770503282546997, "learning_rate": 0.00016343123905854893, "loss": 4.4031, "step": 1886 }, { "epoch": 0.5503463361283266, "grad_norm": 3.4617836475372314, "learning_rate": 0.00016341178758996305, "loss": 4.3463, "step": 1887 }, { "epoch": 0.5506379876048122, "grad_norm": 3.707359552383423, "learning_rate": 0.00016339233612137717, "loss": 4.8102, "step": 1888 }, { "epoch": 0.5509296390812979, "grad_norm": 2.3936915397644043, "learning_rate": 0.0001633728846527913, "loss": 4.5318, "step": 1889 }, { "epoch": 0.5512212905577835, "grad_norm": 2.1688809394836426, "learning_rate": 0.00016335343318420542, "loss": 4.3463, "step": 1890 }, { "epoch": 0.551512942034269, "grad_norm": 1.821177363395691, "learning_rate": 0.00016333398171561954, "loss": 4.579, "step": 1891 }, { "epoch": 0.5518045935107546, "grad_norm": 3.2514991760253906, "learning_rate": 0.00016331453024703366, "loss": 4.4547, "step": 1892 }, { "epoch": 0.5520962449872403, "grad_norm": 3.3151228427886963, "learning_rate": 0.00016329507877844778, "loss": 4.4189, "step": 1893 }, { "epoch": 0.5523878964637259, "grad_norm": 4.805200576782227, "learning_rate": 0.0001632756273098619, "loss": 4.1373, "step": 1894 }, { "epoch": 0.5526795479402115, "grad_norm": 2.9507312774658203, "learning_rate": 0.00016325617584127603, "loss": 4.4996, "step": 1895 }, { "epoch": 0.552971199416697, "grad_norm": 2.1608073711395264, "learning_rate": 0.00016323672437269015, "loss": 3.9042, "step": 1896 }, { "epoch": 0.5532628508931826, "grad_norm": 2.467256784439087, "learning_rate": 0.00016321727290410427, "loss": 4.5594, "step": 1897 }, { "epoch": 0.5535545023696683, "grad_norm": 2.8472137451171875, "learning_rate": 0.0001631978214355184, "loss": 4.2653, "step": 1898 }, { "epoch": 0.5538461538461539, "grad_norm": 3.0456390380859375, "learning_rate": 0.0001631783699669325, "loss": 4.4703, "step": 1899 }, { "epoch": 0.5541378053226395, "grad_norm": 2.28765606880188, "learning_rate": 0.00016315891849834663, "loss": 4.8342, "step": 1900 }, { "epoch": 0.554429456799125, "grad_norm": 1.6868144273757935, "learning_rate": 0.00016313946702976076, "loss": 4.4219, "step": 1901 }, { "epoch": 0.5547211082756106, "grad_norm": 3.725825071334839, "learning_rate": 0.00016312001556117488, "loss": 4.4211, "step": 1902 }, { "epoch": 0.5550127597520963, "grad_norm": 2.901487112045288, "learning_rate": 0.000163100564092589, "loss": 4.2893, "step": 1903 }, { "epoch": 0.5553044112285819, "grad_norm": 2.1586551666259766, "learning_rate": 0.00016308111262400312, "loss": 4.2427, "step": 1904 }, { "epoch": 0.5555960627050675, "grad_norm": 2.7070741653442383, "learning_rate": 0.00016306166115541724, "loss": 4.357, "step": 1905 }, { "epoch": 0.555887714181553, "grad_norm": 2.188530921936035, "learning_rate": 0.00016304220968683136, "loss": 4.3609, "step": 1906 }, { "epoch": 0.5561793656580386, "grad_norm": 3.5525941848754883, "learning_rate": 0.00016302275821824548, "loss": 4.8006, "step": 1907 }, { "epoch": 0.5564710171345243, "grad_norm": 2.3895044326782227, "learning_rate": 0.0001630033067496596, "loss": 4.295, "step": 1908 }, { "epoch": 0.5567626686110099, "grad_norm": 2.62111759185791, "learning_rate": 0.00016298385528107373, "loss": 4.3469, "step": 1909 }, { "epoch": 0.5570543200874954, "grad_norm": 2.4315738677978516, "learning_rate": 0.00016296440381248785, "loss": 4.3519, "step": 1910 }, { "epoch": 0.557345971563981, "grad_norm": 4.440062046051025, "learning_rate": 0.00016294495234390197, "loss": 4.503, "step": 1911 }, { "epoch": 0.5576376230404666, "grad_norm": 2.313378095626831, "learning_rate": 0.0001629255008753161, "loss": 4.2726, "step": 1912 }, { "epoch": 0.5579292745169523, "grad_norm": 2.4244751930236816, "learning_rate": 0.00016290604940673021, "loss": 4.3181, "step": 1913 }, { "epoch": 0.5582209259934379, "grad_norm": 2.9103806018829346, "learning_rate": 0.00016288659793814434, "loss": 4.4725, "step": 1914 }, { "epoch": 0.5585125774699234, "grad_norm": 2.5105457305908203, "learning_rate": 0.00016286714646955846, "loss": 4.7583, "step": 1915 }, { "epoch": 0.558804228946409, "grad_norm": 3.061160087585449, "learning_rate": 0.00016284769500097258, "loss": 4.372, "step": 1916 }, { "epoch": 0.5590958804228946, "grad_norm": 2.452355146408081, "learning_rate": 0.0001628282435323867, "loss": 4.3008, "step": 1917 }, { "epoch": 0.5593875318993803, "grad_norm": 2.670449733734131, "learning_rate": 0.00016280879206380082, "loss": 4.4424, "step": 1918 }, { "epoch": 0.5596791833758659, "grad_norm": 3.9293596744537354, "learning_rate": 0.00016278934059521494, "loss": 4.6334, "step": 1919 }, { "epoch": 0.5599708348523514, "grad_norm": 2.5282013416290283, "learning_rate": 0.00016276988912662907, "loss": 4.4969, "step": 1920 }, { "epoch": 0.560262486328837, "grad_norm": 3.0080440044403076, "learning_rate": 0.0001627504376580432, "loss": 4.4113, "step": 1921 }, { "epoch": 0.5605541378053226, "grad_norm": 2.2298660278320312, "learning_rate": 0.0001627309861894573, "loss": 4.3306, "step": 1922 }, { "epoch": 0.5608457892818083, "grad_norm": 2.918893814086914, "learning_rate": 0.00016271153472087143, "loss": 4.4796, "step": 1923 }, { "epoch": 0.5611374407582939, "grad_norm": 1.9416778087615967, "learning_rate": 0.00016269208325228555, "loss": 4.4246, "step": 1924 }, { "epoch": 0.5614290922347794, "grad_norm": 3.2357699871063232, "learning_rate": 0.00016267263178369967, "loss": 4.4433, "step": 1925 }, { "epoch": 0.561720743711265, "grad_norm": 2.501141309738159, "learning_rate": 0.0001626531803151138, "loss": 4.4392, "step": 1926 }, { "epoch": 0.5620123951877506, "grad_norm": 2.046938419342041, "learning_rate": 0.00016263372884652792, "loss": 4.077, "step": 1927 }, { "epoch": 0.5623040466642363, "grad_norm": 2.1078014373779297, "learning_rate": 0.00016261427737794204, "loss": 4.1719, "step": 1928 }, { "epoch": 0.5625956981407219, "grad_norm": 2.2803680896759033, "learning_rate": 0.0001625948259093562, "loss": 4.5161, "step": 1929 }, { "epoch": 0.5628873496172074, "grad_norm": 2.8503451347351074, "learning_rate": 0.00016257537444077028, "loss": 4.5405, "step": 1930 }, { "epoch": 0.563179001093693, "grad_norm": 2.293189764022827, "learning_rate": 0.0001625559229721844, "loss": 4.3533, "step": 1931 }, { "epoch": 0.5634706525701786, "grad_norm": 2.0671825408935547, "learning_rate": 0.00016253647150359852, "loss": 4.4504, "step": 1932 }, { "epoch": 0.5637623040466643, "grad_norm": 2.4015097618103027, "learning_rate": 0.00016251702003501265, "loss": 4.5084, "step": 1933 }, { "epoch": 0.5640539555231499, "grad_norm": 2.6346352100372314, "learning_rate": 0.00016249756856642677, "loss": 4.5333, "step": 1934 }, { "epoch": 0.5643456069996354, "grad_norm": 2.6567351818084717, "learning_rate": 0.0001624781170978409, "loss": 4.5191, "step": 1935 }, { "epoch": 0.564637258476121, "grad_norm": 2.815065622329712, "learning_rate": 0.00016245866562925504, "loss": 4.2598, "step": 1936 }, { "epoch": 0.5649289099526066, "grad_norm": 2.8364429473876953, "learning_rate": 0.00016243921416066913, "loss": 4.4093, "step": 1937 }, { "epoch": 0.5652205614290923, "grad_norm": 2.5176990032196045, "learning_rate": 0.00016241976269208325, "loss": 4.4062, "step": 1938 }, { "epoch": 0.5655122129055778, "grad_norm": 2.623054265975952, "learning_rate": 0.0001624003112234974, "loss": 4.1744, "step": 1939 }, { "epoch": 0.5658038643820634, "grad_norm": 2.6827285289764404, "learning_rate": 0.0001623808597549115, "loss": 4.5878, "step": 1940 }, { "epoch": 0.566095515858549, "grad_norm": 2.329421043395996, "learning_rate": 0.00016236140828632562, "loss": 4.5084, "step": 1941 }, { "epoch": 0.5663871673350346, "grad_norm": 2.669257879257202, "learning_rate": 0.00016234195681773974, "loss": 4.2148, "step": 1942 }, { "epoch": 0.5666788188115203, "grad_norm": 3.5096287727355957, "learning_rate": 0.00016232250534915386, "loss": 4.3273, "step": 1943 }, { "epoch": 0.5669704702880058, "grad_norm": 2.336224317550659, "learning_rate": 0.00016230305388056798, "loss": 4.333, "step": 1944 }, { "epoch": 0.5672621217644914, "grad_norm": 2.3507301807403564, "learning_rate": 0.0001622836024119821, "loss": 4.5104, "step": 1945 }, { "epoch": 0.567553773240977, "grad_norm": 2.3554694652557373, "learning_rate": 0.00016226415094339625, "loss": 4.5191, "step": 1946 }, { "epoch": 0.5678454247174626, "grad_norm": 2.5249998569488525, "learning_rate": 0.00016224469947481035, "loss": 4.6063, "step": 1947 }, { "epoch": 0.5681370761939483, "grad_norm": 1.8941625356674194, "learning_rate": 0.00016222524800622447, "loss": 4.375, "step": 1948 }, { "epoch": 0.5684287276704338, "grad_norm": 2.6512293815612793, "learning_rate": 0.00016220579653763862, "loss": 4.4704, "step": 1949 }, { "epoch": 0.5687203791469194, "grad_norm": 3.1128551959991455, "learning_rate": 0.0001621863450690527, "loss": 4.4739, "step": 1950 }, { "epoch": 0.569012030623405, "grad_norm": 2.006643056869507, "learning_rate": 0.00016216689360046684, "loss": 4.1508, "step": 1951 }, { "epoch": 0.5693036820998907, "grad_norm": 3.7308802604675293, "learning_rate": 0.00016214744213188096, "loss": 4.3847, "step": 1952 }, { "epoch": 0.5695953335763763, "grad_norm": 2.460527181625366, "learning_rate": 0.0001621279906632951, "loss": 4.475, "step": 1953 }, { "epoch": 0.5698869850528618, "grad_norm": 2.6713881492614746, "learning_rate": 0.0001621085391947092, "loss": 4.4041, "step": 1954 }, { "epoch": 0.5701786365293474, "grad_norm": 1.8802825212478638, "learning_rate": 0.00016208908772612332, "loss": 3.9791, "step": 1955 }, { "epoch": 0.570470288005833, "grad_norm": 5.414878845214844, "learning_rate": 0.00016206963625753747, "loss": 4.3163, "step": 1956 }, { "epoch": 0.5707619394823187, "grad_norm": 2.1542067527770996, "learning_rate": 0.00016205018478895156, "loss": 4.5641, "step": 1957 }, { "epoch": 0.5710535909588043, "grad_norm": 1.9597783088684082, "learning_rate": 0.00016203073332036569, "loss": 4.2728, "step": 1958 }, { "epoch": 0.5713452424352898, "grad_norm": 2.1915674209594727, "learning_rate": 0.00016201128185177983, "loss": 4.2407, "step": 1959 }, { "epoch": 0.5716368939117754, "grad_norm": 2.011504888534546, "learning_rate": 0.00016199183038319396, "loss": 4.37, "step": 1960 }, { "epoch": 0.571928545388261, "grad_norm": 4.748679161071777, "learning_rate": 0.00016197237891460805, "loss": 4.5783, "step": 1961 }, { "epoch": 0.5722201968647467, "grad_norm": 3.223508834838867, "learning_rate": 0.00016195292744602217, "loss": 4.3752, "step": 1962 }, { "epoch": 0.5725118483412323, "grad_norm": 3.683262348175049, "learning_rate": 0.00016193347597743632, "loss": 4.3989, "step": 1963 }, { "epoch": 0.5728034998177178, "grad_norm": 2.904700517654419, "learning_rate": 0.00016191402450885042, "loss": 4.325, "step": 1964 }, { "epoch": 0.5730951512942034, "grad_norm": 2.862579107284546, "learning_rate": 0.00016189457304026454, "loss": 4.4495, "step": 1965 }, { "epoch": 0.573386802770689, "grad_norm": 1.9862879514694214, "learning_rate": 0.00016187512157167869, "loss": 4.0458, "step": 1966 }, { "epoch": 0.5736784542471747, "grad_norm": 4.094940662384033, "learning_rate": 0.00016185567010309278, "loss": 4.4457, "step": 1967 }, { "epoch": 0.5739701057236603, "grad_norm": 3.049807548522949, "learning_rate": 0.0001618362186345069, "loss": 3.9204, "step": 1968 }, { "epoch": 0.5742617572001458, "grad_norm": 3.6428513526916504, "learning_rate": 0.00016181676716592102, "loss": 4.433, "step": 1969 }, { "epoch": 0.5745534086766314, "grad_norm": 4.0428147315979, "learning_rate": 0.00016179731569733517, "loss": 4.4043, "step": 1970 }, { "epoch": 0.574845060153117, "grad_norm": 4.272934913635254, "learning_rate": 0.00016177786422874927, "loss": 4.3705, "step": 1971 }, { "epoch": 0.5751367116296027, "grad_norm": 2.655813455581665, "learning_rate": 0.0001617584127601634, "loss": 4.6021, "step": 1972 }, { "epoch": 0.5754283631060882, "grad_norm": 3.7610244750976562, "learning_rate": 0.00016173896129157754, "loss": 4.4551, "step": 1973 }, { "epoch": 0.5757200145825738, "grad_norm": 1.9032502174377441, "learning_rate": 0.00016171950982299163, "loss": 4.6108, "step": 1974 }, { "epoch": 0.5760116660590594, "grad_norm": 5.3712358474731445, "learning_rate": 0.00016170005835440575, "loss": 4.4779, "step": 1975 }, { "epoch": 0.576303317535545, "grad_norm": 3.326528787612915, "learning_rate": 0.0001616806068858199, "loss": 4.4401, "step": 1976 }, { "epoch": 0.5765949690120307, "grad_norm": 2.973543405532837, "learning_rate": 0.00016166115541723402, "loss": 4.3688, "step": 1977 }, { "epoch": 0.5768866204885162, "grad_norm": 3.711071491241455, "learning_rate": 0.00016164170394864812, "loss": 4.3212, "step": 1978 }, { "epoch": 0.5771782719650018, "grad_norm": 4.697476387023926, "learning_rate": 0.00016162225248006224, "loss": 4.5509, "step": 1979 }, { "epoch": 0.5774699234414874, "grad_norm": 3.5494606494903564, "learning_rate": 0.0001616028010114764, "loss": 4.444, "step": 1980 }, { "epoch": 0.577761574917973, "grad_norm": 2.3469533920288086, "learning_rate": 0.00016158334954289048, "loss": 4.1656, "step": 1981 }, { "epoch": 0.5780532263944587, "grad_norm": 2.2894532680511475, "learning_rate": 0.0001615638980743046, "loss": 4.6755, "step": 1982 }, { "epoch": 0.5783448778709442, "grad_norm": 2.291663408279419, "learning_rate": 0.00016154444660571875, "loss": 4.2727, "step": 1983 }, { "epoch": 0.5786365293474298, "grad_norm": 3.6882476806640625, "learning_rate": 0.00016152499513713287, "loss": 4.5703, "step": 1984 }, { "epoch": 0.5789281808239154, "grad_norm": 2.5997650623321533, "learning_rate": 0.00016150554366854697, "loss": 4.4219, "step": 1985 }, { "epoch": 0.579219832300401, "grad_norm": 1.8419091701507568, "learning_rate": 0.00016148609219996112, "loss": 4.5007, "step": 1986 }, { "epoch": 0.5795114837768867, "grad_norm": 3.6659159660339355, "learning_rate": 0.00016146664073137524, "loss": 4.2117, "step": 1987 }, { "epoch": 0.5798031352533722, "grad_norm": 2.965646266937256, "learning_rate": 0.00016144718926278933, "loss": 4.5058, "step": 1988 }, { "epoch": 0.5800947867298578, "grad_norm": 2.3507604598999023, "learning_rate": 0.00016142773779420346, "loss": 4.3095, "step": 1989 }, { "epoch": 0.5803864382063434, "grad_norm": 1.9784704446792603, "learning_rate": 0.0001614082863256176, "loss": 4.2268, "step": 1990 }, { "epoch": 0.580678089682829, "grad_norm": 2.9521381855010986, "learning_rate": 0.0001613888348570317, "loss": 4.3437, "step": 1991 }, { "epoch": 0.5809697411593147, "grad_norm": 2.5010905265808105, "learning_rate": 0.00016136938338844582, "loss": 4.1508, "step": 1992 }, { "epoch": 0.5812613926358002, "grad_norm": 3.034726142883301, "learning_rate": 0.00016134993191985997, "loss": 4.5587, "step": 1993 }, { "epoch": 0.5815530441122858, "grad_norm": 1.8856585025787354, "learning_rate": 0.0001613304804512741, "loss": 4.3745, "step": 1994 }, { "epoch": 0.5818446955887714, "grad_norm": 2.235912322998047, "learning_rate": 0.00016131102898268819, "loss": 4.4068, "step": 1995 }, { "epoch": 0.582136347065257, "grad_norm": 2.6451666355133057, "learning_rate": 0.00016129157751410233, "loss": 4.4913, "step": 1996 }, { "epoch": 0.5824279985417427, "grad_norm": 3.131521463394165, "learning_rate": 0.00016127212604551646, "loss": 4.3747, "step": 1997 }, { "epoch": 0.5827196500182282, "grad_norm": 2.67465877532959, "learning_rate": 0.00016125267457693055, "loss": 4.2881, "step": 1998 }, { "epoch": 0.5830113014947138, "grad_norm": 3.1334469318389893, "learning_rate": 0.00016123322310834467, "loss": 4.2481, "step": 1999 }, { "epoch": 0.5833029529711994, "grad_norm": 2.274250030517578, "learning_rate": 0.00016121377163975882, "loss": 4.4052, "step": 2000 }, { "epoch": 0.583594604447685, "grad_norm": 2.873162269592285, "learning_rate": 0.00016119432017117294, "loss": 4.5825, "step": 2001 }, { "epoch": 0.5838862559241706, "grad_norm": 2.7017040252685547, "learning_rate": 0.00016117486870258704, "loss": 4.5322, "step": 2002 }, { "epoch": 0.5841779074006562, "grad_norm": 3.7162435054779053, "learning_rate": 0.00016115541723400118, "loss": 4.4429, "step": 2003 }, { "epoch": 0.5844695588771418, "grad_norm": 3.1199467182159424, "learning_rate": 0.0001611359657654153, "loss": 4.5417, "step": 2004 }, { "epoch": 0.5847612103536274, "grad_norm": 3.5474600791931152, "learning_rate": 0.0001611165142968294, "loss": 4.5755, "step": 2005 }, { "epoch": 0.5850528618301131, "grad_norm": 2.5497453212738037, "learning_rate": 0.00016109706282824355, "loss": 4.3847, "step": 2006 }, { "epoch": 0.5853445133065986, "grad_norm": 3.488986015319824, "learning_rate": 0.00016107761135965767, "loss": 4.417, "step": 2007 }, { "epoch": 0.5856361647830842, "grad_norm": 2.38079571723938, "learning_rate": 0.0001610581598910718, "loss": 4.3713, "step": 2008 }, { "epoch": 0.5859278162595698, "grad_norm": 3.3115811347961426, "learning_rate": 0.0001610387084224859, "loss": 4.4082, "step": 2009 }, { "epoch": 0.5862194677360554, "grad_norm": 3.774425745010376, "learning_rate": 0.00016101925695390004, "loss": 4.2597, "step": 2010 }, { "epoch": 0.5865111192125411, "grad_norm": 3.0726678371429443, "learning_rate": 0.00016099980548531416, "loss": 4.5848, "step": 2011 }, { "epoch": 0.5868027706890266, "grad_norm": 3.2236955165863037, "learning_rate": 0.00016098035401672825, "loss": 4.3427, "step": 2012 }, { "epoch": 0.5870944221655122, "grad_norm": 2.449772357940674, "learning_rate": 0.0001609609025481424, "loss": 4.6024, "step": 2013 }, { "epoch": 0.5873860736419978, "grad_norm": 4.136505603790283, "learning_rate": 0.00016094145107955652, "loss": 4.3679, "step": 2014 }, { "epoch": 0.5876777251184834, "grad_norm": 2.6302196979522705, "learning_rate": 0.00016092199961097064, "loss": 4.264, "step": 2015 }, { "epoch": 0.5879693765949691, "grad_norm": 2.241518497467041, "learning_rate": 0.00016090254814238477, "loss": 4.525, "step": 2016 }, { "epoch": 0.5882610280714546, "grad_norm": 2.0726263523101807, "learning_rate": 0.0001608830966737989, "loss": 4.4643, "step": 2017 }, { "epoch": 0.5885526795479402, "grad_norm": 2.9032857418060303, "learning_rate": 0.000160863645205213, "loss": 4.4367, "step": 2018 }, { "epoch": 0.5888443310244258, "grad_norm": 3.141486883163452, "learning_rate": 0.0001608441937366271, "loss": 4.5411, "step": 2019 }, { "epoch": 0.5891359825009114, "grad_norm": 2.8911542892456055, "learning_rate": 0.00016082474226804125, "loss": 4.5311, "step": 2020 }, { "epoch": 0.5894276339773971, "grad_norm": 2.2248547077178955, "learning_rate": 0.00016080529079945537, "loss": 4.3123, "step": 2021 }, { "epoch": 0.5897192854538826, "grad_norm": 2.198749303817749, "learning_rate": 0.00016078583933086947, "loss": 4.4244, "step": 2022 }, { "epoch": 0.5900109369303682, "grad_norm": 2.5441465377807617, "learning_rate": 0.00016076638786228362, "loss": 4.3911, "step": 2023 }, { "epoch": 0.5903025884068538, "grad_norm": 2.8929338455200195, "learning_rate": 0.00016074693639369774, "loss": 4.3198, "step": 2024 }, { "epoch": 0.5905942398833394, "grad_norm": 3.6960737705230713, "learning_rate": 0.00016072748492511186, "loss": 3.9565, "step": 2025 }, { "epoch": 0.590885891359825, "grad_norm": 2.4138071537017822, "learning_rate": 0.00016070803345652598, "loss": 4.6199, "step": 2026 }, { "epoch": 0.5911775428363106, "grad_norm": 6.406846523284912, "learning_rate": 0.0001606885819879401, "loss": 4.7167, "step": 2027 }, { "epoch": 0.5914691943127962, "grad_norm": 1.8282833099365234, "learning_rate": 0.00016066913051935422, "loss": 4.6745, "step": 2028 }, { "epoch": 0.5917608457892818, "grad_norm": 4.29815149307251, "learning_rate": 0.00016064967905076832, "loss": 4.5837, "step": 2029 }, { "epoch": 0.5920524972657674, "grad_norm": 4.86417293548584, "learning_rate": 0.00016063022758218247, "loss": 4.3275, "step": 2030 }, { "epoch": 0.592344148742253, "grad_norm": 3.3536081314086914, "learning_rate": 0.0001606107761135966, "loss": 4.1939, "step": 2031 }, { "epoch": 0.5926358002187386, "grad_norm": 2.4762113094329834, "learning_rate": 0.0001605913246450107, "loss": 4.3572, "step": 2032 }, { "epoch": 0.5929274516952242, "grad_norm": 4.618116855621338, "learning_rate": 0.00016057187317642483, "loss": 4.3752, "step": 2033 }, { "epoch": 0.5932191031717098, "grad_norm": 3.2075130939483643, "learning_rate": 0.00016055242170783895, "loss": 4.3195, "step": 2034 }, { "epoch": 0.5935107546481954, "grad_norm": 2.8272790908813477, "learning_rate": 0.00016053297023925308, "loss": 4.3857, "step": 2035 }, { "epoch": 0.593802406124681, "grad_norm": 1.6300307512283325, "learning_rate": 0.0001605135187706672, "loss": 4.4175, "step": 2036 }, { "epoch": 0.5940940576011666, "grad_norm": 2.0918898582458496, "learning_rate": 0.00016049406730208132, "loss": 4.4962, "step": 2037 }, { "epoch": 0.5943857090776522, "grad_norm": 2.2646167278289795, "learning_rate": 0.00016047461583349544, "loss": 4.4196, "step": 2038 }, { "epoch": 0.5946773605541378, "grad_norm": 2.384415864944458, "learning_rate": 0.00016045516436490956, "loss": 4.0258, "step": 2039 }, { "epoch": 0.5949690120306234, "grad_norm": 2.555375576019287, "learning_rate": 0.00016043571289632368, "loss": 4.4099, "step": 2040 }, { "epoch": 0.595260663507109, "grad_norm": 2.3194053173065186, "learning_rate": 0.0001604162614277378, "loss": 4.5018, "step": 2041 }, { "epoch": 0.5955523149835946, "grad_norm": 2.9269330501556396, "learning_rate": 0.00016039680995915193, "loss": 4.4216, "step": 2042 }, { "epoch": 0.5958439664600802, "grad_norm": 2.2326390743255615, "learning_rate": 0.00016037735849056605, "loss": 4.4017, "step": 2043 }, { "epoch": 0.5961356179365658, "grad_norm": 2.5432791709899902, "learning_rate": 0.00016035790702198017, "loss": 4.505, "step": 2044 }, { "epoch": 0.5964272694130514, "grad_norm": 2.853121280670166, "learning_rate": 0.0001603384555533943, "loss": 4.5117, "step": 2045 }, { "epoch": 0.596718920889537, "grad_norm": 2.4210028648376465, "learning_rate": 0.0001603190040848084, "loss": 4.5355, "step": 2046 }, { "epoch": 0.5970105723660226, "grad_norm": 2.4960672855377197, "learning_rate": 0.00016029955261622254, "loss": 4.3131, "step": 2047 }, { "epoch": 0.5973022238425082, "grad_norm": 2.230085611343384, "learning_rate": 0.00016028010114763666, "loss": 4.3622, "step": 2048 }, { "epoch": 0.5975938753189938, "grad_norm": 2.859434127807617, "learning_rate": 0.00016026064967905078, "loss": 4.3055, "step": 2049 }, { "epoch": 0.5978855267954793, "grad_norm": 2.6134145259857178, "learning_rate": 0.0001602411982104649, "loss": 4.6233, "step": 2050 }, { "epoch": 0.598177178271965, "grad_norm": 2.8293001651763916, "learning_rate": 0.00016022174674187902, "loss": 4.1323, "step": 2051 }, { "epoch": 0.5984688297484506, "grad_norm": 2.512322425842285, "learning_rate": 0.00016020229527329314, "loss": 4.372, "step": 2052 }, { "epoch": 0.5987604812249362, "grad_norm": 3.075317621231079, "learning_rate": 0.00016018284380470726, "loss": 4.4117, "step": 2053 }, { "epoch": 0.5990521327014218, "grad_norm": 3.0408005714416504, "learning_rate": 0.00016016339233612139, "loss": 4.5838, "step": 2054 }, { "epoch": 0.5993437841779073, "grad_norm": 2.344205379486084, "learning_rate": 0.0001601439408675355, "loss": 4.2143, "step": 2055 }, { "epoch": 0.599635435654393, "grad_norm": 2.281528949737549, "learning_rate": 0.00016012448939894963, "loss": 4.3913, "step": 2056 }, { "epoch": 0.5999270871308786, "grad_norm": 2.773210048675537, "learning_rate": 0.00016010503793036375, "loss": 4.3892, "step": 2057 }, { "epoch": 0.6002187386073642, "grad_norm": 2.1700620651245117, "learning_rate": 0.00016008558646177787, "loss": 4.6926, "step": 2058 }, { "epoch": 0.6005103900838498, "grad_norm": 2.3355703353881836, "learning_rate": 0.000160066134993192, "loss": 4.3545, "step": 2059 }, { "epoch": 0.6008020415603355, "grad_norm": 2.2490155696868896, "learning_rate": 0.00016004668352460612, "loss": 4.6488, "step": 2060 }, { "epoch": 0.601093693036821, "grad_norm": 1.734222412109375, "learning_rate": 0.00016002723205602024, "loss": 4.2049, "step": 2061 }, { "epoch": 0.6013853445133066, "grad_norm": 2.460561752319336, "learning_rate": 0.00016000778058743436, "loss": 4.5358, "step": 2062 }, { "epoch": 0.6016769959897922, "grad_norm": 2.4372975826263428, "learning_rate": 0.00015998832911884848, "loss": 4.4574, "step": 2063 }, { "epoch": 0.6019686474662778, "grad_norm": 2.3150854110717773, "learning_rate": 0.0001599688776502626, "loss": 4.3342, "step": 2064 }, { "epoch": 0.6022602989427635, "grad_norm": 2.394489288330078, "learning_rate": 0.00015994942618167672, "loss": 4.1353, "step": 2065 }, { "epoch": 0.602551950419249, "grad_norm": 2.9140114784240723, "learning_rate": 0.00015992997471309085, "loss": 4.5796, "step": 2066 }, { "epoch": 0.6028436018957346, "grad_norm": 1.9464188814163208, "learning_rate": 0.00015991052324450497, "loss": 4.1716, "step": 2067 }, { "epoch": 0.6031352533722202, "grad_norm": 3.4251065254211426, "learning_rate": 0.0001598910717759191, "loss": 4.3199, "step": 2068 }, { "epoch": 0.6034269048487058, "grad_norm": 2.4387285709381104, "learning_rate": 0.0001598716203073332, "loss": 4.6993, "step": 2069 }, { "epoch": 0.6037185563251914, "grad_norm": 2.412602424621582, "learning_rate": 0.00015985216883874733, "loss": 4.2885, "step": 2070 }, { "epoch": 0.604010207801677, "grad_norm": 2.723435640335083, "learning_rate": 0.00015983271737016145, "loss": 4.3078, "step": 2071 }, { "epoch": 0.6043018592781626, "grad_norm": 4.251087188720703, "learning_rate": 0.00015981326590157557, "loss": 4.4883, "step": 2072 }, { "epoch": 0.6045935107546482, "grad_norm": 2.752960681915283, "learning_rate": 0.0001597938144329897, "loss": 4.4344, "step": 2073 }, { "epoch": 0.6048851622311338, "grad_norm": 2.0718939304351807, "learning_rate": 0.00015977436296440382, "loss": 4.4809, "step": 2074 }, { "epoch": 0.6051768137076194, "grad_norm": 2.5887999534606934, "learning_rate": 0.00015975491149581794, "loss": 4.2928, "step": 2075 }, { "epoch": 0.605468465184105, "grad_norm": 1.9622628688812256, "learning_rate": 0.00015973546002723206, "loss": 4.8126, "step": 2076 }, { "epoch": 0.6057601166605906, "grad_norm": 2.4959943294525146, "learning_rate": 0.00015971600855864618, "loss": 4.5803, "step": 2077 }, { "epoch": 0.6060517681370762, "grad_norm": 2.4680190086364746, "learning_rate": 0.0001596965570900603, "loss": 4.5186, "step": 2078 }, { "epoch": 0.6063434196135618, "grad_norm": 3.034837484359741, "learning_rate": 0.00015967710562147443, "loss": 4.5359, "step": 2079 }, { "epoch": 0.6066350710900474, "grad_norm": 2.571540594100952, "learning_rate": 0.00015965765415288855, "loss": 4.347, "step": 2080 }, { "epoch": 0.606926722566533, "grad_norm": 1.7622931003570557, "learning_rate": 0.00015963820268430267, "loss": 4.6102, "step": 2081 }, { "epoch": 0.6072183740430186, "grad_norm": 2.9573092460632324, "learning_rate": 0.0001596187512157168, "loss": 4.4165, "step": 2082 }, { "epoch": 0.6075100255195042, "grad_norm": 5.45331335067749, "learning_rate": 0.0001595992997471309, "loss": 4.6718, "step": 2083 }, { "epoch": 0.6078016769959897, "grad_norm": 2.5730743408203125, "learning_rate": 0.00015957984827854503, "loss": 4.378, "step": 2084 }, { "epoch": 0.6080933284724754, "grad_norm": 2.9125401973724365, "learning_rate": 0.00015956039680995916, "loss": 4.6265, "step": 2085 }, { "epoch": 0.608384979948961, "grad_norm": 2.365567684173584, "learning_rate": 0.00015954094534137328, "loss": 4.436, "step": 2086 }, { "epoch": 0.6086766314254466, "grad_norm": 4.161392688751221, "learning_rate": 0.0001595214938727874, "loss": 4.6864, "step": 2087 }, { "epoch": 0.6089682829019322, "grad_norm": 2.0402450561523438, "learning_rate": 0.00015950204240420152, "loss": 4.5226, "step": 2088 }, { "epoch": 0.6092599343784177, "grad_norm": 2.3988637924194336, "learning_rate": 0.00015948259093561564, "loss": 4.2413, "step": 2089 }, { "epoch": 0.6095515858549034, "grad_norm": 2.8210196495056152, "learning_rate": 0.00015946313946702976, "loss": 4.232, "step": 2090 }, { "epoch": 0.609843237331389, "grad_norm": 2.7119712829589844, "learning_rate": 0.00015944368799844389, "loss": 4.285, "step": 2091 }, { "epoch": 0.6101348888078746, "grad_norm": 3.117579460144043, "learning_rate": 0.000159424236529858, "loss": 4.0851, "step": 2092 }, { "epoch": 0.6104265402843602, "grad_norm": 3.1409711837768555, "learning_rate": 0.00015940478506127216, "loss": 4.5209, "step": 2093 }, { "epoch": 0.6107181917608457, "grad_norm": 3.3945369720458984, "learning_rate": 0.00015938533359268625, "loss": 4.4918, "step": 2094 }, { "epoch": 0.6110098432373314, "grad_norm": 2.0241799354553223, "learning_rate": 0.00015936588212410037, "loss": 4.4741, "step": 2095 }, { "epoch": 0.611301494713817, "grad_norm": 2.1152396202087402, "learning_rate": 0.0001593464306555145, "loss": 4.2654, "step": 2096 }, { "epoch": 0.6115931461903026, "grad_norm": 2.547137498855591, "learning_rate": 0.00015932697918692861, "loss": 4.5158, "step": 2097 }, { "epoch": 0.6118847976667882, "grad_norm": 2.5195884704589844, "learning_rate": 0.00015930752771834274, "loss": 4.5441, "step": 2098 }, { "epoch": 0.6121764491432737, "grad_norm": 3.656815767288208, "learning_rate": 0.00015928807624975686, "loss": 4.1291, "step": 2099 }, { "epoch": 0.6124681006197594, "grad_norm": 2.996337652206421, "learning_rate": 0.000159268624781171, "loss": 4.4631, "step": 2100 }, { "epoch": 0.612759752096245, "grad_norm": 3.7551865577697754, "learning_rate": 0.0001592491733125851, "loss": 4.8076, "step": 2101 }, { "epoch": 0.6130514035727306, "grad_norm": 2.6226024627685547, "learning_rate": 0.00015922972184399922, "loss": 4.478, "step": 2102 }, { "epoch": 0.6133430550492162, "grad_norm": 3.464700222015381, "learning_rate": 0.00015921027037541337, "loss": 4.5126, "step": 2103 }, { "epoch": 0.6136347065257017, "grad_norm": 2.052961826324463, "learning_rate": 0.00015919081890682747, "loss": 4.4715, "step": 2104 }, { "epoch": 0.6139263580021874, "grad_norm": 2.1044161319732666, "learning_rate": 0.0001591713674382416, "loss": 3.9894, "step": 2105 }, { "epoch": 0.614218009478673, "grad_norm": 3.479292392730713, "learning_rate": 0.0001591519159696557, "loss": 4.5092, "step": 2106 }, { "epoch": 0.6145096609551586, "grad_norm": 2.730619430541992, "learning_rate": 0.00015913246450106986, "loss": 4.1045, "step": 2107 }, { "epoch": 0.6148013124316442, "grad_norm": 1.8660814762115479, "learning_rate": 0.00015911301303248395, "loss": 4.1041, "step": 2108 }, { "epoch": 0.6150929639081297, "grad_norm": 2.49281907081604, "learning_rate": 0.00015909356156389807, "loss": 4.6007, "step": 2109 }, { "epoch": 0.6153846153846154, "grad_norm": 2.6716179847717285, "learning_rate": 0.00015907411009531222, "loss": 4.5796, "step": 2110 }, { "epoch": 0.615676266861101, "grad_norm": 2.497832775115967, "learning_rate": 0.00015905465862672632, "loss": 4.2916, "step": 2111 }, { "epoch": 0.6159679183375866, "grad_norm": 2.6061480045318604, "learning_rate": 0.00015903520715814044, "loss": 4.453, "step": 2112 }, { "epoch": 0.6162595698140722, "grad_norm": 2.7143571376800537, "learning_rate": 0.0001590157556895546, "loss": 4.2491, "step": 2113 }, { "epoch": 0.6165512212905578, "grad_norm": 3.101837158203125, "learning_rate": 0.00015899630422096868, "loss": 4.2623, "step": 2114 }, { "epoch": 0.6168428727670434, "grad_norm": 3.0875422954559326, "learning_rate": 0.0001589768527523828, "loss": 4.224, "step": 2115 }, { "epoch": 0.617134524243529, "grad_norm": 2.4365859031677246, "learning_rate": 0.00015895740128379693, "loss": 4.3646, "step": 2116 }, { "epoch": 0.6174261757200146, "grad_norm": 2.114365577697754, "learning_rate": 0.00015893794981521107, "loss": 4.4638, "step": 2117 }, { "epoch": 0.6177178271965001, "grad_norm": 2.3727262020111084, "learning_rate": 0.00015891849834662517, "loss": 4.4049, "step": 2118 }, { "epoch": 0.6180094786729858, "grad_norm": 1.9838407039642334, "learning_rate": 0.0001588990468780393, "loss": 4.3112, "step": 2119 }, { "epoch": 0.6183011301494714, "grad_norm": 2.8058254718780518, "learning_rate": 0.00015887959540945344, "loss": 4.502, "step": 2120 }, { "epoch": 0.618592781625957, "grad_norm": 3.5561773777008057, "learning_rate": 0.00015886014394086753, "loss": 4.3628, "step": 2121 }, { "epoch": 0.6188844331024426, "grad_norm": 2.465651512145996, "learning_rate": 0.00015884069247228165, "loss": 4.5796, "step": 2122 }, { "epoch": 0.6191760845789281, "grad_norm": 2.605046510696411, "learning_rate": 0.0001588212410036958, "loss": 4.6, "step": 2123 }, { "epoch": 0.6194677360554138, "grad_norm": 3.4881019592285156, "learning_rate": 0.00015880178953510992, "loss": 4.2465, "step": 2124 }, { "epoch": 0.6197593875318994, "grad_norm": 2.937269449234009, "learning_rate": 0.00015878233806652402, "loss": 4.3682, "step": 2125 }, { "epoch": 0.620051039008385, "grad_norm": 3.2734129428863525, "learning_rate": 0.00015876288659793814, "loss": 4.6742, "step": 2126 }, { "epoch": 0.6203426904848706, "grad_norm": 2.500852346420288, "learning_rate": 0.0001587434351293523, "loss": 4.5006, "step": 2127 }, { "epoch": 0.6206343419613561, "grad_norm": 2.250678539276123, "learning_rate": 0.00015872398366076638, "loss": 4.2466, "step": 2128 }, { "epoch": 0.6209259934378418, "grad_norm": 2.3127479553222656, "learning_rate": 0.0001587045321921805, "loss": 4.6416, "step": 2129 }, { "epoch": 0.6212176449143274, "grad_norm": 2.6009654998779297, "learning_rate": 0.00015868508072359465, "loss": 4.5437, "step": 2130 }, { "epoch": 0.621509296390813, "grad_norm": 3.133873224258423, "learning_rate": 0.00015866562925500878, "loss": 4.4545, "step": 2131 }, { "epoch": 0.6218009478672986, "grad_norm": 2.307788133621216, "learning_rate": 0.00015864617778642287, "loss": 4.3826, "step": 2132 }, { "epoch": 0.6220925993437841, "grad_norm": 2.369298219680786, "learning_rate": 0.000158626726317837, "loss": 4.4538, "step": 2133 }, { "epoch": 0.6223842508202698, "grad_norm": 2.0109808444976807, "learning_rate": 0.00015860727484925114, "loss": 3.8792, "step": 2134 }, { "epoch": 0.6226759022967554, "grad_norm": 2.121673583984375, "learning_rate": 0.00015858782338066524, "loss": 4.4064, "step": 2135 }, { "epoch": 0.622967553773241, "grad_norm": 1.6207703351974487, "learning_rate": 0.00015856837191207936, "loss": 4.4281, "step": 2136 }, { "epoch": 0.6232592052497266, "grad_norm": 2.4020817279815674, "learning_rate": 0.0001585489204434935, "loss": 4.406, "step": 2137 }, { "epoch": 0.6235508567262121, "grad_norm": 2.3365976810455322, "learning_rate": 0.0001585294689749076, "loss": 4.1562, "step": 2138 }, { "epoch": 0.6238425082026978, "grad_norm": 3.155766725540161, "learning_rate": 0.00015851001750632172, "loss": 4.3863, "step": 2139 }, { "epoch": 0.6241341596791834, "grad_norm": 2.50474214553833, "learning_rate": 0.00015849056603773587, "loss": 4.4269, "step": 2140 }, { "epoch": 0.624425811155669, "grad_norm": 1.928035855293274, "learning_rate": 0.00015847111456915, "loss": 4.406, "step": 2141 }, { "epoch": 0.6247174626321546, "grad_norm": 2.327632427215576, "learning_rate": 0.0001584516631005641, "loss": 4.4245, "step": 2142 }, { "epoch": 0.6250091141086401, "grad_norm": 3.1765644550323486, "learning_rate": 0.0001584322116319782, "loss": 4.4776, "step": 2143 }, { "epoch": 0.6253007655851258, "grad_norm": 3.2010953426361084, "learning_rate": 0.00015841276016339236, "loss": 4.4506, "step": 2144 }, { "epoch": 0.6255924170616114, "grad_norm": 2.3374831676483154, "learning_rate": 0.00015839330869480645, "loss": 4.4953, "step": 2145 }, { "epoch": 0.625884068538097, "grad_norm": 2.1472134590148926, "learning_rate": 0.00015837385722622057, "loss": 3.8958, "step": 2146 }, { "epoch": 0.6261757200145825, "grad_norm": 2.093501567840576, "learning_rate": 0.00015835440575763472, "loss": 3.8215, "step": 2147 }, { "epoch": 0.6264673714910681, "grad_norm": 3.8907310962677, "learning_rate": 0.00015833495428904884, "loss": 4.0077, "step": 2148 }, { "epoch": 0.6267590229675538, "grad_norm": 3.2228574752807617, "learning_rate": 0.00015831550282046294, "loss": 4.733, "step": 2149 }, { "epoch": 0.6270506744440394, "grad_norm": 2.821068525314331, "learning_rate": 0.00015829605135187709, "loss": 4.5362, "step": 2150 }, { "epoch": 0.627342325920525, "grad_norm": 2.2403564453125, "learning_rate": 0.0001582765998832912, "loss": 4.6964, "step": 2151 }, { "epoch": 0.6276339773970105, "grad_norm": 3.171522378921509, "learning_rate": 0.0001582571484147053, "loss": 4.2451, "step": 2152 }, { "epoch": 0.6279256288734961, "grad_norm": 2.245216131210327, "learning_rate": 0.00015823769694611942, "loss": 4.3815, "step": 2153 }, { "epoch": 0.6282172803499818, "grad_norm": 1.8952711820602417, "learning_rate": 0.00015821824547753357, "loss": 4.3197, "step": 2154 }, { "epoch": 0.6285089318264674, "grad_norm": 4.116556167602539, "learning_rate": 0.0001581987940089477, "loss": 4.351, "step": 2155 }, { "epoch": 0.628800583302953, "grad_norm": 2.175063133239746, "learning_rate": 0.0001581793425403618, "loss": 4.3117, "step": 2156 }, { "epoch": 0.6290922347794385, "grad_norm": 2.7533111572265625, "learning_rate": 0.00015815989107177594, "loss": 4.5232, "step": 2157 }, { "epoch": 0.6293838862559241, "grad_norm": 2.4933247566223145, "learning_rate": 0.00015814043960319006, "loss": 4.3785, "step": 2158 }, { "epoch": 0.6296755377324098, "grad_norm": 2.176405906677246, "learning_rate": 0.00015812098813460415, "loss": 4.3878, "step": 2159 }, { "epoch": 0.6299671892088954, "grad_norm": 3.117621660232544, "learning_rate": 0.0001581015366660183, "loss": 4.3843, "step": 2160 }, { "epoch": 0.630258840685381, "grad_norm": 2.521998643875122, "learning_rate": 0.00015808208519743242, "loss": 4.3387, "step": 2161 }, { "epoch": 0.6305504921618665, "grad_norm": 2.5553300380706787, "learning_rate": 0.00015806263372884655, "loss": 4.7025, "step": 2162 }, { "epoch": 0.6308421436383521, "grad_norm": 2.502566337585449, "learning_rate": 0.00015804318226026064, "loss": 4.3429, "step": 2163 }, { "epoch": 0.6311337951148378, "grad_norm": 3.273075819015503, "learning_rate": 0.0001580237307916748, "loss": 4.3617, "step": 2164 }, { "epoch": 0.6314254465913234, "grad_norm": 2.8707096576690674, "learning_rate": 0.0001580042793230889, "loss": 4.7446, "step": 2165 }, { "epoch": 0.631717098067809, "grad_norm": 2.4246103763580322, "learning_rate": 0.000157984827854503, "loss": 4.2209, "step": 2166 }, { "epoch": 0.6320087495442945, "grad_norm": 2.8556606769561768, "learning_rate": 0.00015796537638591715, "loss": 4.5369, "step": 2167 }, { "epoch": 0.6323004010207802, "grad_norm": 3.130080461502075, "learning_rate": 0.00015794592491733128, "loss": 4.3129, "step": 2168 }, { "epoch": 0.6325920524972658, "grad_norm": 1.68805992603302, "learning_rate": 0.00015792647344874537, "loss": 4.1784, "step": 2169 }, { "epoch": 0.6328837039737514, "grad_norm": 3.8886594772338867, "learning_rate": 0.00015790702198015952, "loss": 4.2958, "step": 2170 }, { "epoch": 0.633175355450237, "grad_norm": 2.187824010848999, "learning_rate": 0.00015788757051157364, "loss": 4.5393, "step": 2171 }, { "epoch": 0.6334670069267225, "grad_norm": 3.176471471786499, "learning_rate": 0.00015786811904298776, "loss": 4.2896, "step": 2172 }, { "epoch": 0.6337586584032082, "grad_norm": 4.638916969299316, "learning_rate": 0.00015784866757440186, "loss": 4.6871, "step": 2173 }, { "epoch": 0.6340503098796938, "grad_norm": 4.090147972106934, "learning_rate": 0.000157829216105816, "loss": 4.668, "step": 2174 }, { "epoch": 0.6343419613561794, "grad_norm": 2.4662930965423584, "learning_rate": 0.00015780976463723013, "loss": 4.4995, "step": 2175 }, { "epoch": 0.634633612832665, "grad_norm": 2.6010782718658447, "learning_rate": 0.00015779031316864422, "loss": 4.3438, "step": 2176 }, { "epoch": 0.6349252643091505, "grad_norm": 3.7584915161132812, "learning_rate": 0.00015777086170005837, "loss": 4.3231, "step": 2177 }, { "epoch": 0.6352169157856362, "grad_norm": 3.7129640579223633, "learning_rate": 0.0001577514102314725, "loss": 4.3877, "step": 2178 }, { "epoch": 0.6355085672621218, "grad_norm": 3.494943857192993, "learning_rate": 0.0001577319587628866, "loss": 4.6926, "step": 2179 }, { "epoch": 0.6358002187386074, "grad_norm": 2.845355987548828, "learning_rate": 0.00015771250729430073, "loss": 4.469, "step": 2180 }, { "epoch": 0.636091870215093, "grad_norm": 2.3909752368927, "learning_rate": 0.00015769305582571486, "loss": 4.3305, "step": 2181 }, { "epoch": 0.6363835216915785, "grad_norm": 2.268322706222534, "learning_rate": 0.00015767360435712898, "loss": 4.376, "step": 2182 }, { "epoch": 0.6366751731680642, "grad_norm": 2.1692357063293457, "learning_rate": 0.00015765415288854307, "loss": 4.3515, "step": 2183 }, { "epoch": 0.6369668246445498, "grad_norm": 3.4869918823242188, "learning_rate": 0.00015763470141995722, "loss": 4.1748, "step": 2184 }, { "epoch": 0.6372584761210354, "grad_norm": 1.6745244264602661, "learning_rate": 0.00015761524995137134, "loss": 4.3258, "step": 2185 }, { "epoch": 0.6375501275975209, "grad_norm": 2.5688228607177734, "learning_rate": 0.00015759579848278546, "loss": 4.433, "step": 2186 }, { "epoch": 0.6378417790740065, "grad_norm": 1.6316113471984863, "learning_rate": 0.00015757634701419959, "loss": 4.1563, "step": 2187 }, { "epoch": 0.6381334305504922, "grad_norm": 2.6174371242523193, "learning_rate": 0.0001575568955456137, "loss": 4.3767, "step": 2188 }, { "epoch": 0.6384250820269778, "grad_norm": 1.9268962144851685, "learning_rate": 0.00015753744407702783, "loss": 4.4248, "step": 2189 }, { "epoch": 0.6387167335034634, "grad_norm": 2.0624332427978516, "learning_rate": 0.00015751799260844195, "loss": 4.3768, "step": 2190 }, { "epoch": 0.6390083849799489, "grad_norm": 2.587205171585083, "learning_rate": 0.00015749854113985607, "loss": 4.5514, "step": 2191 }, { "epoch": 0.6393000364564345, "grad_norm": 2.8921754360198975, "learning_rate": 0.0001574790896712702, "loss": 4.4264, "step": 2192 }, { "epoch": 0.6395916879329202, "grad_norm": 2.439490556716919, "learning_rate": 0.0001574596382026843, "loss": 4.4982, "step": 2193 }, { "epoch": 0.6398833394094058, "grad_norm": 2.3657472133636475, "learning_rate": 0.00015744018673409844, "loss": 4.6673, "step": 2194 }, { "epoch": 0.6401749908858914, "grad_norm": 3.0442068576812744, "learning_rate": 0.00015742073526551256, "loss": 4.5541, "step": 2195 }, { "epoch": 0.6404666423623769, "grad_norm": 2.4139909744262695, "learning_rate": 0.00015740128379692668, "loss": 4.6607, "step": 2196 }, { "epoch": 0.6407582938388625, "grad_norm": 2.273526906967163, "learning_rate": 0.0001573818323283408, "loss": 4.5581, "step": 2197 }, { "epoch": 0.6410499453153482, "grad_norm": 2.280839443206787, "learning_rate": 0.00015736238085975492, "loss": 4.4109, "step": 2198 }, { "epoch": 0.6413415967918338, "grad_norm": 1.6544018983840942, "learning_rate": 0.00015734292939116904, "loss": 4.2717, "step": 2199 }, { "epoch": 0.6416332482683194, "grad_norm": 2.8559248447418213, "learning_rate": 0.00015732347792258317, "loss": 4.1212, "step": 2200 }, { "epoch": 0.6419248997448049, "grad_norm": 2.5558478832244873, "learning_rate": 0.0001573040264539973, "loss": 4.3455, "step": 2201 }, { "epoch": 0.6422165512212905, "grad_norm": 2.503641128540039, "learning_rate": 0.0001572845749854114, "loss": 4.6614, "step": 2202 }, { "epoch": 0.6425082026977762, "grad_norm": 2.9628355503082275, "learning_rate": 0.00015726512351682553, "loss": 4.5787, "step": 2203 }, { "epoch": 0.6427998541742618, "grad_norm": 2.5839645862579346, "learning_rate": 0.00015724567204823965, "loss": 4.7444, "step": 2204 }, { "epoch": 0.6430915056507474, "grad_norm": 2.082859516143799, "learning_rate": 0.00015722622057965377, "loss": 4.2288, "step": 2205 }, { "epoch": 0.6433831571272329, "grad_norm": 1.973238468170166, "learning_rate": 0.0001572067691110679, "loss": 4.5594, "step": 2206 }, { "epoch": 0.6436748086037185, "grad_norm": 2.501220703125, "learning_rate": 0.00015718731764248202, "loss": 4.4242, "step": 2207 }, { "epoch": 0.6439664600802042, "grad_norm": 2.453627347946167, "learning_rate": 0.00015716786617389614, "loss": 4.3535, "step": 2208 }, { "epoch": 0.6442581115566898, "grad_norm": 2.2259511947631836, "learning_rate": 0.00015714841470531026, "loss": 4.5714, "step": 2209 }, { "epoch": 0.6445497630331753, "grad_norm": 2.716102361679077, "learning_rate": 0.00015712896323672438, "loss": 4.6282, "step": 2210 }, { "epoch": 0.6448414145096609, "grad_norm": 2.458571672439575, "learning_rate": 0.0001571095117681385, "loss": 4.7455, "step": 2211 }, { "epoch": 0.6451330659861465, "grad_norm": 2.9455881118774414, "learning_rate": 0.00015709006029955263, "loss": 4.3981, "step": 2212 }, { "epoch": 0.6454247174626322, "grad_norm": 2.615732431411743, "learning_rate": 0.00015707060883096675, "loss": 4.2502, "step": 2213 }, { "epoch": 0.6457163689391178, "grad_norm": 2.9549455642700195, "learning_rate": 0.00015705115736238087, "loss": 4.3653, "step": 2214 }, { "epoch": 0.6460080204156033, "grad_norm": 2.641829013824463, "learning_rate": 0.000157031705893795, "loss": 4.5234, "step": 2215 }, { "epoch": 0.6462996718920889, "grad_norm": 2.585648536682129, "learning_rate": 0.0001570122544252091, "loss": 4.4156, "step": 2216 }, { "epoch": 0.6465913233685745, "grad_norm": 2.233513832092285, "learning_rate": 0.00015699280295662323, "loss": 4.6956, "step": 2217 }, { "epoch": 0.6468829748450602, "grad_norm": 1.973343014717102, "learning_rate": 0.00015697335148803735, "loss": 4.4519, "step": 2218 }, { "epoch": 0.6471746263215458, "grad_norm": 2.163283586502075, "learning_rate": 0.00015695390001945148, "loss": 4.5178, "step": 2219 }, { "epoch": 0.6474662777980313, "grad_norm": 3.0247225761413574, "learning_rate": 0.0001569344485508656, "loss": 4.598, "step": 2220 }, { "epoch": 0.6477579292745169, "grad_norm": 2.714186668395996, "learning_rate": 0.00015691499708227972, "loss": 4.6013, "step": 2221 }, { "epoch": 0.6480495807510026, "grad_norm": 2.1075544357299805, "learning_rate": 0.00015689554561369384, "loss": 4.4467, "step": 2222 }, { "epoch": 0.6483412322274882, "grad_norm": 3.5137343406677246, "learning_rate": 0.00015687609414510796, "loss": 4.4601, "step": 2223 }, { "epoch": 0.6486328837039738, "grad_norm": 2.405052661895752, "learning_rate": 0.00015685664267652208, "loss": 4.4074, "step": 2224 }, { "epoch": 0.6489245351804593, "grad_norm": 2.6980836391448975, "learning_rate": 0.0001568371912079362, "loss": 4.6068, "step": 2225 }, { "epoch": 0.6492161866569449, "grad_norm": 2.4056031703948975, "learning_rate": 0.00015681773973935033, "loss": 4.4817, "step": 2226 }, { "epoch": 0.6495078381334306, "grad_norm": 3.453946113586426, "learning_rate": 0.00015679828827076445, "loss": 4.1673, "step": 2227 }, { "epoch": 0.6497994896099162, "grad_norm": 2.1087052822113037, "learning_rate": 0.00015677883680217857, "loss": 4.5401, "step": 2228 }, { "epoch": 0.6500911410864018, "grad_norm": 2.4831202030181885, "learning_rate": 0.0001567593853335927, "loss": 4.558, "step": 2229 }, { "epoch": 0.6503827925628873, "grad_norm": 3.0353572368621826, "learning_rate": 0.00015673993386500681, "loss": 4.2327, "step": 2230 }, { "epoch": 0.6506744440393729, "grad_norm": 3.4438679218292236, "learning_rate": 0.00015672048239642094, "loss": 4.3849, "step": 2231 }, { "epoch": 0.6509660955158586, "grad_norm": 2.6628425121307373, "learning_rate": 0.00015670103092783506, "loss": 4.5375, "step": 2232 }, { "epoch": 0.6512577469923442, "grad_norm": 2.7874603271484375, "learning_rate": 0.00015668157945924918, "loss": 4.3246, "step": 2233 }, { "epoch": 0.6515493984688298, "grad_norm": 2.304753065109253, "learning_rate": 0.0001566621279906633, "loss": 4.4029, "step": 2234 }, { "epoch": 0.6518410499453153, "grad_norm": 3.018535852432251, "learning_rate": 0.00015664267652207742, "loss": 4.4195, "step": 2235 }, { "epoch": 0.6521327014218009, "grad_norm": 2.8916120529174805, "learning_rate": 0.00015662322505349154, "loss": 4.5204, "step": 2236 }, { "epoch": 0.6524243528982866, "grad_norm": 2.2448527812957764, "learning_rate": 0.00015660377358490567, "loss": 4.4382, "step": 2237 }, { "epoch": 0.6527160043747722, "grad_norm": 2.4887945652008057, "learning_rate": 0.0001565843221163198, "loss": 4.5953, "step": 2238 }, { "epoch": 0.6530076558512578, "grad_norm": 1.9287234544754028, "learning_rate": 0.0001565648706477339, "loss": 4.4569, "step": 2239 }, { "epoch": 0.6532993073277433, "grad_norm": 2.1243464946746826, "learning_rate": 0.00015654541917914803, "loss": 4.3467, "step": 2240 }, { "epoch": 0.6535909588042289, "grad_norm": 3.644710063934326, "learning_rate": 0.00015652596771056215, "loss": 4.508, "step": 2241 }, { "epoch": 0.6538826102807146, "grad_norm": 3.040483236312866, "learning_rate": 0.00015650651624197627, "loss": 4.5126, "step": 2242 }, { "epoch": 0.6541742617572002, "grad_norm": 2.780414581298828, "learning_rate": 0.0001564870647733904, "loss": 4.423, "step": 2243 }, { "epoch": 0.6544659132336857, "grad_norm": 2.0564002990722656, "learning_rate": 0.00015646761330480452, "loss": 4.362, "step": 2244 }, { "epoch": 0.6547575647101713, "grad_norm": 2.8643815517425537, "learning_rate": 0.00015644816183621864, "loss": 4.3375, "step": 2245 }, { "epoch": 0.6550492161866569, "grad_norm": 3.637454032897949, "learning_rate": 0.00015642871036763276, "loss": 4.5957, "step": 2246 }, { "epoch": 0.6553408676631426, "grad_norm": 2.292072057723999, "learning_rate": 0.0001564092588990469, "loss": 4.5336, "step": 2247 }, { "epoch": 0.6556325191396282, "grad_norm": 2.248558759689331, "learning_rate": 0.000156389807430461, "loss": 4.5848, "step": 2248 }, { "epoch": 0.6559241706161137, "grad_norm": 2.402109384536743, "learning_rate": 0.00015637035596187512, "loss": 4.5027, "step": 2249 }, { "epoch": 0.6562158220925993, "grad_norm": 2.2580204010009766, "learning_rate": 0.00015635090449328925, "loss": 4.0344, "step": 2250 }, { "epoch": 0.6565074735690849, "grad_norm": 2.258934736251831, "learning_rate": 0.00015633145302470337, "loss": 4.4213, "step": 2251 }, { "epoch": 0.6567991250455706, "grad_norm": 2.2228848934173584, "learning_rate": 0.0001563120015561175, "loss": 4.5821, "step": 2252 }, { "epoch": 0.6570907765220562, "grad_norm": 2.1203629970550537, "learning_rate": 0.0001562925500875316, "loss": 4.1937, "step": 2253 }, { "epoch": 0.6573824279985417, "grad_norm": 1.8420816659927368, "learning_rate": 0.00015627309861894576, "loss": 4.4328, "step": 2254 }, { "epoch": 0.6576740794750273, "grad_norm": 2.0723321437835693, "learning_rate": 0.00015625364715035985, "loss": 4.7798, "step": 2255 }, { "epoch": 0.6579657309515129, "grad_norm": 2.826841354370117, "learning_rate": 0.00015623419568177398, "loss": 4.5, "step": 2256 }, { "epoch": 0.6582573824279986, "grad_norm": 2.869523763656616, "learning_rate": 0.00015621474421318812, "loss": 4.2795, "step": 2257 }, { "epoch": 0.6585490339044842, "grad_norm": 2.6421375274658203, "learning_rate": 0.00015619529274460222, "loss": 4.4508, "step": 2258 }, { "epoch": 0.6588406853809697, "grad_norm": 2.4398486614227295, "learning_rate": 0.00015617584127601634, "loss": 4.3329, "step": 2259 }, { "epoch": 0.6591323368574553, "grad_norm": 2.7608206272125244, "learning_rate": 0.00015615638980743046, "loss": 4.5523, "step": 2260 }, { "epoch": 0.6594239883339409, "grad_norm": 2.667217254638672, "learning_rate": 0.00015613693833884458, "loss": 4.5543, "step": 2261 }, { "epoch": 0.6597156398104266, "grad_norm": 1.9833370447158813, "learning_rate": 0.0001561174868702587, "loss": 4.4554, "step": 2262 }, { "epoch": 0.6600072912869122, "grad_norm": 2.3345370292663574, "learning_rate": 0.00015609803540167283, "loss": 4.646, "step": 2263 }, { "epoch": 0.6602989427633977, "grad_norm": 2.535107374191284, "learning_rate": 0.00015607858393308698, "loss": 4.3019, "step": 2264 }, { "epoch": 0.6605905942398833, "grad_norm": 2.0544347763061523, "learning_rate": 0.00015605913246450107, "loss": 4.393, "step": 2265 }, { "epoch": 0.6608822457163689, "grad_norm": 3.2944562435150146, "learning_rate": 0.0001560396809959152, "loss": 4.6839, "step": 2266 }, { "epoch": 0.6611738971928546, "grad_norm": 1.946864128112793, "learning_rate": 0.00015602022952732934, "loss": 4.4743, "step": 2267 }, { "epoch": 0.6614655486693402, "grad_norm": 2.4377260208129883, "learning_rate": 0.00015600077805874343, "loss": 4.1788, "step": 2268 }, { "epoch": 0.6617572001458257, "grad_norm": 2.3113765716552734, "learning_rate": 0.00015598132659015756, "loss": 4.246, "step": 2269 }, { "epoch": 0.6620488516223113, "grad_norm": 2.8579249382019043, "learning_rate": 0.00015596187512157168, "loss": 4.5467, "step": 2270 }, { "epoch": 0.6623405030987969, "grad_norm": 3.4354469776153564, "learning_rate": 0.00015594242365298583, "loss": 4.4318, "step": 2271 }, { "epoch": 0.6626321545752826, "grad_norm": 2.948143720626831, "learning_rate": 0.00015592297218439992, "loss": 4.2358, "step": 2272 }, { "epoch": 0.6629238060517681, "grad_norm": 2.5259745121002197, "learning_rate": 0.00015590352071581404, "loss": 4.3758, "step": 2273 }, { "epoch": 0.6632154575282537, "grad_norm": 2.4340081214904785, "learning_rate": 0.0001558840692472282, "loss": 4.2879, "step": 2274 }, { "epoch": 0.6635071090047393, "grad_norm": 2.252383232116699, "learning_rate": 0.00015586461777864229, "loss": 4.3899, "step": 2275 }, { "epoch": 0.6637987604812249, "grad_norm": 2.6932284832000732, "learning_rate": 0.0001558451663100564, "loss": 4.2867, "step": 2276 }, { "epoch": 0.6640904119577106, "grad_norm": 4.72757625579834, "learning_rate": 0.00015582571484147056, "loss": 4.2028, "step": 2277 }, { "epoch": 0.6643820634341961, "grad_norm": 3.4220082759857178, "learning_rate": 0.00015580626337288468, "loss": 4.4206, "step": 2278 }, { "epoch": 0.6646737149106817, "grad_norm": 2.165372848510742, "learning_rate": 0.00015578681190429877, "loss": 4.3237, "step": 2279 }, { "epoch": 0.6649653663871673, "grad_norm": 3.566659927368164, "learning_rate": 0.0001557673604357129, "loss": 4.5616, "step": 2280 }, { "epoch": 0.665257017863653, "grad_norm": 2.6880602836608887, "learning_rate": 0.00015574790896712704, "loss": 4.4899, "step": 2281 }, { "epoch": 0.6655486693401386, "grad_norm": 2.246066093444824, "learning_rate": 0.00015572845749854114, "loss": 3.8948, "step": 2282 }, { "epoch": 0.6658403208166241, "grad_norm": 2.5072929859161377, "learning_rate": 0.00015570900602995526, "loss": 4.5075, "step": 2283 }, { "epoch": 0.6661319722931097, "grad_norm": 2.4770820140838623, "learning_rate": 0.0001556895545613694, "loss": 4.6724, "step": 2284 }, { "epoch": 0.6664236237695953, "grad_norm": 2.7337646484375, "learning_rate": 0.0001556701030927835, "loss": 4.4267, "step": 2285 }, { "epoch": 0.666715275246081, "grad_norm": 2.6353633403778076, "learning_rate": 0.00015565065162419762, "loss": 4.5795, "step": 2286 }, { "epoch": 0.6670069267225666, "grad_norm": 5.011528015136719, "learning_rate": 0.00015563120015561177, "loss": 4.4043, "step": 2287 }, { "epoch": 0.6672985781990521, "grad_norm": 2.410961151123047, "learning_rate": 0.0001556117486870259, "loss": 3.9827, "step": 2288 }, { "epoch": 0.6675902296755377, "grad_norm": 3.275545358657837, "learning_rate": 0.00015559229721844, "loss": 4.4307, "step": 2289 }, { "epoch": 0.6678818811520233, "grad_norm": 2.343069076538086, "learning_rate": 0.0001555728457498541, "loss": 4.3263, "step": 2290 }, { "epoch": 0.668173532628509, "grad_norm": 2.7001376152038574, "learning_rate": 0.00015555339428126826, "loss": 4.2273, "step": 2291 }, { "epoch": 0.6684651841049946, "grad_norm": 2.5925474166870117, "learning_rate": 0.00015553394281268235, "loss": 4.6043, "step": 2292 }, { "epoch": 0.6687568355814801, "grad_norm": 1.8295079469680786, "learning_rate": 0.00015551449134409647, "loss": 4.2319, "step": 2293 }, { "epoch": 0.6690484870579657, "grad_norm": 3.006027936935425, "learning_rate": 0.00015549503987551062, "loss": 4.2101, "step": 2294 }, { "epoch": 0.6693401385344513, "grad_norm": 2.333090305328369, "learning_rate": 0.00015547558840692474, "loss": 4.3996, "step": 2295 }, { "epoch": 0.669631790010937, "grad_norm": 4.346424579620361, "learning_rate": 0.00015545613693833884, "loss": 4.5275, "step": 2296 }, { "epoch": 0.6699234414874226, "grad_norm": 4.003733158111572, "learning_rate": 0.00015543668546975296, "loss": 4.3217, "step": 2297 }, { "epoch": 0.6702150929639081, "grad_norm": 3.201472520828247, "learning_rate": 0.0001554172340011671, "loss": 4.5625, "step": 2298 }, { "epoch": 0.6705067444403937, "grad_norm": 2.8730506896972656, "learning_rate": 0.0001553977825325812, "loss": 4.2207, "step": 2299 }, { "epoch": 0.6707983959168793, "grad_norm": 2.6847143173217773, "learning_rate": 0.00015537833106399533, "loss": 4.3311, "step": 2300 }, { "epoch": 0.671090047393365, "grad_norm": 1.9940379858016968, "learning_rate": 0.00015535887959540947, "loss": 4.2438, "step": 2301 }, { "epoch": 0.6713816988698506, "grad_norm": 1.8871147632598877, "learning_rate": 0.0001553394281268236, "loss": 4.5458, "step": 2302 }, { "epoch": 0.6716733503463361, "grad_norm": 2.539755344390869, "learning_rate": 0.0001553199766582377, "loss": 4.3167, "step": 2303 }, { "epoch": 0.6719650018228217, "grad_norm": 3.736846923828125, "learning_rate": 0.00015530052518965184, "loss": 4.3304, "step": 2304 }, { "epoch": 0.6722566532993073, "grad_norm": 3.2489991188049316, "learning_rate": 0.00015528107372106596, "loss": 4.5258, "step": 2305 }, { "epoch": 0.672548304775793, "grad_norm": 2.1257436275482178, "learning_rate": 0.00015526162225248006, "loss": 4.3188, "step": 2306 }, { "epoch": 0.6728399562522785, "grad_norm": 2.333522319793701, "learning_rate": 0.00015524217078389418, "loss": 4.5075, "step": 2307 }, { "epoch": 0.6731316077287641, "grad_norm": 2.438828945159912, "learning_rate": 0.00015522271931530833, "loss": 4.2953, "step": 2308 }, { "epoch": 0.6734232592052497, "grad_norm": 2.153489828109741, "learning_rate": 0.00015520326784672242, "loss": 4.3522, "step": 2309 }, { "epoch": 0.6737149106817353, "grad_norm": 2.696702480316162, "learning_rate": 0.00015518381637813654, "loss": 4.6603, "step": 2310 }, { "epoch": 0.674006562158221, "grad_norm": 2.2397687435150146, "learning_rate": 0.0001551643649095507, "loss": 4.3198, "step": 2311 }, { "epoch": 0.6742982136347065, "grad_norm": 1.9369601011276245, "learning_rate": 0.0001551449134409648, "loss": 4.5714, "step": 2312 }, { "epoch": 0.6745898651111921, "grad_norm": 2.4124555587768555, "learning_rate": 0.0001551254619723789, "loss": 4.5767, "step": 2313 }, { "epoch": 0.6748815165876777, "grad_norm": 2.0459516048431396, "learning_rate": 0.00015510601050379305, "loss": 4.2361, "step": 2314 }, { "epoch": 0.6751731680641633, "grad_norm": 2.342749834060669, "learning_rate": 0.00015508655903520718, "loss": 4.2486, "step": 2315 }, { "epoch": 0.675464819540649, "grad_norm": 2.37141489982605, "learning_rate": 0.00015506710756662127, "loss": 4.4474, "step": 2316 }, { "epoch": 0.6757564710171345, "grad_norm": 2.982640504837036, "learning_rate": 0.0001550476560980354, "loss": 4.2688, "step": 2317 }, { "epoch": 0.6760481224936201, "grad_norm": 2.2247140407562256, "learning_rate": 0.00015502820462944954, "loss": 4.31, "step": 2318 }, { "epoch": 0.6763397739701057, "grad_norm": 3.0162243843078613, "learning_rate": 0.00015500875316086366, "loss": 4.7247, "step": 2319 }, { "epoch": 0.6766314254465913, "grad_norm": 2.5737905502319336, "learning_rate": 0.00015498930169227776, "loss": 4.5655, "step": 2320 }, { "epoch": 0.676923076923077, "grad_norm": 2.200572967529297, "learning_rate": 0.0001549698502236919, "loss": 4.2649, "step": 2321 }, { "epoch": 0.6772147283995625, "grad_norm": 2.6385679244995117, "learning_rate": 0.00015495039875510603, "loss": 4.3361, "step": 2322 }, { "epoch": 0.6775063798760481, "grad_norm": 2.385308027267456, "learning_rate": 0.00015493094728652012, "loss": 4.1275, "step": 2323 }, { "epoch": 0.6777980313525337, "grad_norm": 1.744091510772705, "learning_rate": 0.00015491149581793427, "loss": 4.2655, "step": 2324 }, { "epoch": 0.6780896828290193, "grad_norm": 5.963216781616211, "learning_rate": 0.0001548920443493484, "loss": 4.3612, "step": 2325 }, { "epoch": 0.678381334305505, "grad_norm": 3.693180799484253, "learning_rate": 0.00015487259288076251, "loss": 4.3373, "step": 2326 }, { "epoch": 0.6786729857819905, "grad_norm": 2.9833743572235107, "learning_rate": 0.0001548531414121766, "loss": 4.3591, "step": 2327 }, { "epoch": 0.6789646372584761, "grad_norm": 1.904102087020874, "learning_rate": 0.00015483368994359076, "loss": 4.3956, "step": 2328 }, { "epoch": 0.6792562887349617, "grad_norm": 2.511021852493286, "learning_rate": 0.00015481423847500488, "loss": 4.1665, "step": 2329 }, { "epoch": 0.6795479402114473, "grad_norm": 2.0641796588897705, "learning_rate": 0.00015479478700641897, "loss": 4.3296, "step": 2330 }, { "epoch": 0.679839591687933, "grad_norm": 2.626007080078125, "learning_rate": 0.00015477533553783312, "loss": 4.5459, "step": 2331 }, { "epoch": 0.6801312431644185, "grad_norm": 2.7110652923583984, "learning_rate": 0.00015475588406924724, "loss": 4.4238, "step": 2332 }, { "epoch": 0.6804228946409041, "grad_norm": 2.926737070083618, "learning_rate": 0.00015473643260066137, "loss": 4.3539, "step": 2333 }, { "epoch": 0.6807145461173897, "grad_norm": 3.196803331375122, "learning_rate": 0.0001547169811320755, "loss": 4.6883, "step": 2334 }, { "epoch": 0.6810061975938754, "grad_norm": 2.552319049835205, "learning_rate": 0.0001546975296634896, "loss": 4.4429, "step": 2335 }, { "epoch": 0.681297849070361, "grad_norm": 2.214672565460205, "learning_rate": 0.00015467807819490373, "loss": 4.3963, "step": 2336 }, { "epoch": 0.6815895005468465, "grad_norm": 2.055748701095581, "learning_rate": 0.00015465862672631782, "loss": 4.3994, "step": 2337 }, { "epoch": 0.6818811520233321, "grad_norm": 2.25826358795166, "learning_rate": 0.00015463917525773197, "loss": 4.1371, "step": 2338 }, { "epoch": 0.6821728034998177, "grad_norm": 3.0523812770843506, "learning_rate": 0.0001546197237891461, "loss": 4.5022, "step": 2339 }, { "epoch": 0.6824644549763034, "grad_norm": 3.249765157699585, "learning_rate": 0.0001546002723205602, "loss": 4.5467, "step": 2340 }, { "epoch": 0.682756106452789, "grad_norm": 2.067251682281494, "learning_rate": 0.00015458082085197434, "loss": 4.2417, "step": 2341 }, { "epoch": 0.6830477579292745, "grad_norm": 3.0523970127105713, "learning_rate": 0.00015456136938338846, "loss": 4.2445, "step": 2342 }, { "epoch": 0.6833394094057601, "grad_norm": 2.3751792907714844, "learning_rate": 0.00015454191791480258, "loss": 4.4721, "step": 2343 }, { "epoch": 0.6836310608822457, "grad_norm": 2.5193684101104736, "learning_rate": 0.0001545224664462167, "loss": 4.5361, "step": 2344 }, { "epoch": 0.6839227123587314, "grad_norm": 2.279595136642456, "learning_rate": 0.00015450301497763082, "loss": 4.3442, "step": 2345 }, { "epoch": 0.6842143638352169, "grad_norm": 2.2745449542999268, "learning_rate": 0.00015448356350904495, "loss": 4.1932, "step": 2346 }, { "epoch": 0.6845060153117025, "grad_norm": 3.0858850479125977, "learning_rate": 0.00015446411204045904, "loss": 4.6349, "step": 2347 }, { "epoch": 0.6847976667881881, "grad_norm": 1.9158724546432495, "learning_rate": 0.0001544446605718732, "loss": 4.2037, "step": 2348 }, { "epoch": 0.6850893182646737, "grad_norm": 2.249324321746826, "learning_rate": 0.0001544252091032873, "loss": 4.4174, "step": 2349 }, { "epoch": 0.6853809697411594, "grad_norm": 2.2359108924865723, "learning_rate": 0.00015440575763470143, "loss": 4.2248, "step": 2350 }, { "epoch": 0.6856726212176449, "grad_norm": 1.840477466583252, "learning_rate": 0.00015438630616611555, "loss": 4.369, "step": 2351 }, { "epoch": 0.6859642726941305, "grad_norm": 2.367276668548584, "learning_rate": 0.00015436685469752968, "loss": 4.4586, "step": 2352 }, { "epoch": 0.6862559241706161, "grad_norm": 2.3356311321258545, "learning_rate": 0.0001543474032289438, "loss": 4.4076, "step": 2353 }, { "epoch": 0.6865475756471017, "grad_norm": 2.845879554748535, "learning_rate": 0.00015432795176035792, "loss": 4.321, "step": 2354 }, { "epoch": 0.6868392271235874, "grad_norm": 3.0647170543670654, "learning_rate": 0.00015430850029177204, "loss": 4.2794, "step": 2355 }, { "epoch": 0.6871308786000729, "grad_norm": 2.5351595878601074, "learning_rate": 0.00015428904882318616, "loss": 4.2079, "step": 2356 }, { "epoch": 0.6874225300765585, "grad_norm": 2.653730869293213, "learning_rate": 0.00015426959735460028, "loss": 4.3529, "step": 2357 }, { "epoch": 0.6877141815530441, "grad_norm": 3.161334276199341, "learning_rate": 0.0001542501458860144, "loss": 4.266, "step": 2358 }, { "epoch": 0.6880058330295297, "grad_norm": 2.04544997215271, "learning_rate": 0.00015423069441742853, "loss": 4.1717, "step": 2359 }, { "epoch": 0.6882974845060154, "grad_norm": 2.4355430603027344, "learning_rate": 0.00015421124294884265, "loss": 4.4341, "step": 2360 }, { "epoch": 0.6885891359825009, "grad_norm": 2.0975780487060547, "learning_rate": 0.00015419179148025677, "loss": 4.1721, "step": 2361 }, { "epoch": 0.6888807874589865, "grad_norm": 2.231262445449829, "learning_rate": 0.0001541723400116709, "loss": 4.3151, "step": 2362 }, { "epoch": 0.6891724389354721, "grad_norm": 2.868478536605835, "learning_rate": 0.000154152888543085, "loss": 4.4214, "step": 2363 }, { "epoch": 0.6894640904119577, "grad_norm": 2.528679370880127, "learning_rate": 0.00015413343707449913, "loss": 4.2459, "step": 2364 }, { "epoch": 0.6897557418884434, "grad_norm": 2.8783247470855713, "learning_rate": 0.00015411398560591326, "loss": 4.5162, "step": 2365 }, { "epoch": 0.6900473933649289, "grad_norm": 3.4748759269714355, "learning_rate": 0.00015409453413732738, "loss": 4.1691, "step": 2366 }, { "epoch": 0.6903390448414145, "grad_norm": 2.6941637992858887, "learning_rate": 0.0001540750826687415, "loss": 4.2871, "step": 2367 }, { "epoch": 0.6906306963179001, "grad_norm": 2.936803102493286, "learning_rate": 0.00015405563120015562, "loss": 4.5769, "step": 2368 }, { "epoch": 0.6909223477943857, "grad_norm": 2.4327940940856934, "learning_rate": 0.00015403617973156974, "loss": 4.3363, "step": 2369 }, { "epoch": 0.6912139992708713, "grad_norm": 2.706592559814453, "learning_rate": 0.00015401672826298386, "loss": 4.48, "step": 2370 }, { "epoch": 0.6915056507473569, "grad_norm": 2.019561529159546, "learning_rate": 0.00015399727679439799, "loss": 4.3086, "step": 2371 }, { "epoch": 0.6917973022238425, "grad_norm": 4.30936336517334, "learning_rate": 0.0001539778253258121, "loss": 4.3201, "step": 2372 }, { "epoch": 0.6920889537003281, "grad_norm": 2.1608691215515137, "learning_rate": 0.00015395837385722623, "loss": 4.6168, "step": 2373 }, { "epoch": 0.6923806051768137, "grad_norm": 1.8376600742340088, "learning_rate": 0.00015393892238864035, "loss": 4.5463, "step": 2374 }, { "epoch": 0.6926722566532993, "grad_norm": 3.1054673194885254, "learning_rate": 0.00015391947092005447, "loss": 4.3932, "step": 2375 }, { "epoch": 0.6929639081297849, "grad_norm": 3.183250665664673, "learning_rate": 0.0001539000194514686, "loss": 4.5532, "step": 2376 }, { "epoch": 0.6932555596062705, "grad_norm": 3.983808994293213, "learning_rate": 0.00015388056798288272, "loss": 4.4452, "step": 2377 }, { "epoch": 0.6935472110827561, "grad_norm": 3.747873544692993, "learning_rate": 0.00015386111651429684, "loss": 4.5488, "step": 2378 }, { "epoch": 0.6938388625592417, "grad_norm": 3.3091320991516113, "learning_rate": 0.00015384166504571096, "loss": 4.2021, "step": 2379 }, { "epoch": 0.6941305140357273, "grad_norm": 2.4746828079223633, "learning_rate": 0.00015382221357712508, "loss": 4.5732, "step": 2380 }, { "epoch": 0.6944221655122129, "grad_norm": 3.1630070209503174, "learning_rate": 0.0001538027621085392, "loss": 4.2442, "step": 2381 }, { "epoch": 0.6947138169886985, "grad_norm": 2.6582350730895996, "learning_rate": 0.00015378331063995332, "loss": 4.4468, "step": 2382 }, { "epoch": 0.6950054684651841, "grad_norm": 2.157139301300049, "learning_rate": 0.00015376385917136744, "loss": 4.568, "step": 2383 }, { "epoch": 0.6952971199416697, "grad_norm": 4.408022403717041, "learning_rate": 0.00015374440770278157, "loss": 4.3734, "step": 2384 }, { "epoch": 0.6955887714181553, "grad_norm": 2.854814291000366, "learning_rate": 0.0001537249562341957, "loss": 4.5545, "step": 2385 }, { "epoch": 0.6958804228946409, "grad_norm": 3.1970314979553223, "learning_rate": 0.0001537055047656098, "loss": 4.2995, "step": 2386 }, { "epoch": 0.6961720743711265, "grad_norm": 3.3407344818115234, "learning_rate": 0.00015368605329702393, "loss": 4.6169, "step": 2387 }, { "epoch": 0.6964637258476121, "grad_norm": 2.484602928161621, "learning_rate": 0.00015366660182843805, "loss": 4.4767, "step": 2388 }, { "epoch": 0.6967553773240978, "grad_norm": 1.9268450736999512, "learning_rate": 0.00015364715035985217, "loss": 4.3081, "step": 2389 }, { "epoch": 0.6970470288005833, "grad_norm": 2.360761880874634, "learning_rate": 0.0001536276988912663, "loss": 4.5911, "step": 2390 }, { "epoch": 0.6973386802770689, "grad_norm": 3.2361466884613037, "learning_rate": 0.00015360824742268042, "loss": 4.2915, "step": 2391 }, { "epoch": 0.6976303317535545, "grad_norm": 2.194753885269165, "learning_rate": 0.00015358879595409454, "loss": 4.4088, "step": 2392 }, { "epoch": 0.6979219832300401, "grad_norm": 1.890602707862854, "learning_rate": 0.00015356934448550866, "loss": 4.3282, "step": 2393 }, { "epoch": 0.6982136347065258, "grad_norm": 3.0933830738067627, "learning_rate": 0.00015354989301692278, "loss": 4.6159, "step": 2394 }, { "epoch": 0.6985052861830113, "grad_norm": 3.5377025604248047, "learning_rate": 0.0001535304415483369, "loss": 4.5114, "step": 2395 }, { "epoch": 0.6987969376594969, "grad_norm": 1.9224460124969482, "learning_rate": 0.00015351099007975103, "loss": 4.5139, "step": 2396 }, { "epoch": 0.6990885891359825, "grad_norm": 2.9942047595977783, "learning_rate": 0.00015349153861116515, "loss": 4.4467, "step": 2397 }, { "epoch": 0.6993802406124681, "grad_norm": 2.3077588081359863, "learning_rate": 0.00015347208714257927, "loss": 4.4168, "step": 2398 }, { "epoch": 0.6996718920889538, "grad_norm": 2.797964334487915, "learning_rate": 0.0001534526356739934, "loss": 4.5364, "step": 2399 }, { "epoch": 0.6999635435654393, "grad_norm": 2.0835788249969482, "learning_rate": 0.0001534331842054075, "loss": 4.4609, "step": 2400 }, { "epoch": 0.7002551950419249, "grad_norm": 2.9533727169036865, "learning_rate": 0.00015341373273682163, "loss": 4.8001, "step": 2401 }, { "epoch": 0.7005468465184105, "grad_norm": 2.641092538833618, "learning_rate": 0.00015339428126823576, "loss": 4.3574, "step": 2402 }, { "epoch": 0.7008384979948961, "grad_norm": 2.677605628967285, "learning_rate": 0.00015337482979964988, "loss": 4.4181, "step": 2403 }, { "epoch": 0.7011301494713817, "grad_norm": 2.132357358932495, "learning_rate": 0.000153355378331064, "loss": 4.5235, "step": 2404 }, { "epoch": 0.7014218009478673, "grad_norm": 2.2957069873809814, "learning_rate": 0.00015333592686247812, "loss": 4.2668, "step": 2405 }, { "epoch": 0.7017134524243529, "grad_norm": 2.34346079826355, "learning_rate": 0.00015331647539389224, "loss": 4.4415, "step": 2406 }, { "epoch": 0.7020051039008385, "grad_norm": 2.5418381690979004, "learning_rate": 0.00015329702392530636, "loss": 4.5074, "step": 2407 }, { "epoch": 0.7022967553773241, "grad_norm": 3.494810104370117, "learning_rate": 0.00015327757245672048, "loss": 4.3651, "step": 2408 }, { "epoch": 0.7025884068538097, "grad_norm": 3.118584632873535, "learning_rate": 0.0001532581209881346, "loss": 3.8822, "step": 2409 }, { "epoch": 0.7028800583302953, "grad_norm": 2.4980978965759277, "learning_rate": 0.00015323866951954873, "loss": 4.1379, "step": 2410 }, { "epoch": 0.7031717098067809, "grad_norm": 2.665921688079834, "learning_rate": 0.00015321921805096288, "loss": 4.6025, "step": 2411 }, { "epoch": 0.7034633612832665, "grad_norm": 2.6509456634521484, "learning_rate": 0.00015319976658237697, "loss": 4.3441, "step": 2412 }, { "epoch": 0.703755012759752, "grad_norm": 2.8297107219696045, "learning_rate": 0.0001531803151137911, "loss": 4.1657, "step": 2413 }, { "epoch": 0.7040466642362377, "grad_norm": 2.8971009254455566, "learning_rate": 0.00015316086364520521, "loss": 4.5168, "step": 2414 }, { "epoch": 0.7043383157127233, "grad_norm": 2.3571014404296875, "learning_rate": 0.00015314141217661934, "loss": 4.3023, "step": 2415 }, { "epoch": 0.7046299671892089, "grad_norm": 3.2060117721557617, "learning_rate": 0.00015312196070803346, "loss": 4.6749, "step": 2416 }, { "epoch": 0.7049216186656945, "grad_norm": 1.9423112869262695, "learning_rate": 0.00015310250923944758, "loss": 4.4907, "step": 2417 }, { "epoch": 0.70521327014218, "grad_norm": 2.6705591678619385, "learning_rate": 0.00015308305777086173, "loss": 4.5263, "step": 2418 }, { "epoch": 0.7055049216186657, "grad_norm": 2.730862855911255, "learning_rate": 0.00015306360630227582, "loss": 4.777, "step": 2419 }, { "epoch": 0.7057965730951513, "grad_norm": 3.285917282104492, "learning_rate": 0.00015304415483368994, "loss": 4.6208, "step": 2420 }, { "epoch": 0.7060882245716369, "grad_norm": 2.3843443393707275, "learning_rate": 0.0001530247033651041, "loss": 4.5559, "step": 2421 }, { "epoch": 0.7063798760481225, "grad_norm": 1.9667627811431885, "learning_rate": 0.0001530052518965182, "loss": 4.5034, "step": 2422 }, { "epoch": 0.706671527524608, "grad_norm": 2.413362979888916, "learning_rate": 0.0001529858004279323, "loss": 4.7087, "step": 2423 }, { "epoch": 0.7069631790010937, "grad_norm": 1.9664363861083984, "learning_rate": 0.00015296634895934643, "loss": 4.184, "step": 2424 }, { "epoch": 0.7072548304775793, "grad_norm": 2.9186642169952393, "learning_rate": 0.00015294689749076058, "loss": 4.1758, "step": 2425 }, { "epoch": 0.7075464819540649, "grad_norm": 2.8377459049224854, "learning_rate": 0.00015292744602217467, "loss": 4.7494, "step": 2426 }, { "epoch": 0.7078381334305505, "grad_norm": 2.7269301414489746, "learning_rate": 0.0001529079945535888, "loss": 4.4378, "step": 2427 }, { "epoch": 0.708129784907036, "grad_norm": 1.8369779586791992, "learning_rate": 0.00015288854308500294, "loss": 4.1458, "step": 2428 }, { "epoch": 0.7084214363835217, "grad_norm": 2.3662285804748535, "learning_rate": 0.00015286909161641704, "loss": 4.3066, "step": 2429 }, { "epoch": 0.7087130878600073, "grad_norm": 2.538971185684204, "learning_rate": 0.00015284964014783116, "loss": 4.3985, "step": 2430 }, { "epoch": 0.7090047393364929, "grad_norm": 3.5649831295013428, "learning_rate": 0.0001528301886792453, "loss": 3.9576, "step": 2431 }, { "epoch": 0.7092963908129785, "grad_norm": 1.9177734851837158, "learning_rate": 0.0001528107372106594, "loss": 4.2701, "step": 2432 }, { "epoch": 0.709588042289464, "grad_norm": 2.899134874343872, "learning_rate": 0.00015279128574207352, "loss": 4.3824, "step": 2433 }, { "epoch": 0.7098796937659497, "grad_norm": 2.873622179031372, "learning_rate": 0.00015277183427348765, "loss": 4.4638, "step": 2434 }, { "epoch": 0.7101713452424353, "grad_norm": 3.1010515689849854, "learning_rate": 0.0001527523828049018, "loss": 4.4696, "step": 2435 }, { "epoch": 0.7104629967189209, "grad_norm": 2.1056623458862305, "learning_rate": 0.0001527329313363159, "loss": 4.6165, "step": 2436 }, { "epoch": 0.7107546481954065, "grad_norm": 1.7831603288650513, "learning_rate": 0.00015271347986773, "loss": 4.5751, "step": 2437 }, { "epoch": 0.711046299671892, "grad_norm": 2.765665054321289, "learning_rate": 0.00015269402839914416, "loss": 4.4924, "step": 2438 }, { "epoch": 0.7113379511483777, "grad_norm": 3.7052817344665527, "learning_rate": 0.00015267457693055825, "loss": 4.2953, "step": 2439 }, { "epoch": 0.7116296026248633, "grad_norm": 3.0056302547454834, "learning_rate": 0.00015265512546197238, "loss": 4.4823, "step": 2440 }, { "epoch": 0.7119212541013489, "grad_norm": 3.7908923625946045, "learning_rate": 0.00015263567399338652, "loss": 4.5791, "step": 2441 }, { "epoch": 0.7122129055778345, "grad_norm": 2.7172698974609375, "learning_rate": 0.00015261622252480065, "loss": 4.3917, "step": 2442 }, { "epoch": 0.7125045570543201, "grad_norm": 2.179227113723755, "learning_rate": 0.00015259677105621474, "loss": 4.1685, "step": 2443 }, { "epoch": 0.7127962085308057, "grad_norm": 2.0455360412597656, "learning_rate": 0.00015257731958762886, "loss": 4.545, "step": 2444 }, { "epoch": 0.7130878600072913, "grad_norm": 2.4804975986480713, "learning_rate": 0.000152557868119043, "loss": 4.575, "step": 2445 }, { "epoch": 0.7133795114837769, "grad_norm": 2.051804780960083, "learning_rate": 0.0001525384166504571, "loss": 4.0911, "step": 2446 }, { "epoch": 0.7136711629602625, "grad_norm": 2.386639356613159, "learning_rate": 0.00015251896518187123, "loss": 4.5799, "step": 2447 }, { "epoch": 0.7139628144367481, "grad_norm": 2.983159065246582, "learning_rate": 0.00015249951371328538, "loss": 4.4495, "step": 2448 }, { "epoch": 0.7142544659132337, "grad_norm": 3.9037516117095947, "learning_rate": 0.0001524800622446995, "loss": 4.7038, "step": 2449 }, { "epoch": 0.7145461173897193, "grad_norm": 2.7036349773406982, "learning_rate": 0.0001524606107761136, "loss": 4.3257, "step": 2450 }, { "epoch": 0.7148377688662049, "grad_norm": 2.106462001800537, "learning_rate": 0.00015244115930752774, "loss": 4.3855, "step": 2451 }, { "epoch": 0.7151294203426904, "grad_norm": 2.8050148487091064, "learning_rate": 0.00015242170783894186, "loss": 4.3076, "step": 2452 }, { "epoch": 0.7154210718191761, "grad_norm": 3.0473101139068604, "learning_rate": 0.00015240225637035596, "loss": 4.5743, "step": 2453 }, { "epoch": 0.7157127232956617, "grad_norm": 3.0623786449432373, "learning_rate": 0.00015238280490177008, "loss": 4.191, "step": 2454 }, { "epoch": 0.7160043747721473, "grad_norm": 2.4538490772247314, "learning_rate": 0.00015236335343318423, "loss": 4.4429, "step": 2455 }, { "epoch": 0.7162960262486329, "grad_norm": 1.909372329711914, "learning_rate": 0.00015234390196459832, "loss": 4.5466, "step": 2456 }, { "epoch": 0.7165876777251184, "grad_norm": 3.7510573863983154, "learning_rate": 0.00015232445049601244, "loss": 4.3494, "step": 2457 }, { "epoch": 0.7168793292016041, "grad_norm": 2.947935104370117, "learning_rate": 0.0001523049990274266, "loss": 4.5168, "step": 2458 }, { "epoch": 0.7171709806780897, "grad_norm": 2.1839373111724854, "learning_rate": 0.0001522855475588407, "loss": 4.564, "step": 2459 }, { "epoch": 0.7174626321545753, "grad_norm": 2.0927932262420654, "learning_rate": 0.0001522660960902548, "loss": 3.8484, "step": 2460 }, { "epoch": 0.7177542836310609, "grad_norm": 2.2488772869110107, "learning_rate": 0.00015224664462166893, "loss": 4.2776, "step": 2461 }, { "epoch": 0.7180459351075464, "grad_norm": 2.788658618927002, "learning_rate": 0.00015222719315308308, "loss": 4.1194, "step": 2462 }, { "epoch": 0.7183375865840321, "grad_norm": 3.712092399597168, "learning_rate": 0.00015220774168449717, "loss": 4.6854, "step": 2463 }, { "epoch": 0.7186292380605177, "grad_norm": 3.2961173057556152, "learning_rate": 0.0001521882902159113, "loss": 4.2345, "step": 2464 }, { "epoch": 0.7189208895370033, "grad_norm": 2.8532004356384277, "learning_rate": 0.00015216883874732544, "loss": 4.4134, "step": 2465 }, { "epoch": 0.7192125410134889, "grad_norm": 2.0746328830718994, "learning_rate": 0.00015214938727873956, "loss": 4.2437, "step": 2466 }, { "epoch": 0.7195041924899744, "grad_norm": 1.9168422222137451, "learning_rate": 0.00015212993581015366, "loss": 4.461, "step": 2467 }, { "epoch": 0.7197958439664601, "grad_norm": 2.8829660415649414, "learning_rate": 0.0001521104843415678, "loss": 4.2748, "step": 2468 }, { "epoch": 0.7200874954429457, "grad_norm": 2.7287662029266357, "learning_rate": 0.00015209103287298193, "loss": 4.4943, "step": 2469 }, { "epoch": 0.7203791469194313, "grad_norm": 2.9598748683929443, "learning_rate": 0.00015207158140439602, "loss": 4.1136, "step": 2470 }, { "epoch": 0.7206707983959169, "grad_norm": 2.77040433883667, "learning_rate": 0.00015205212993581015, "loss": 4.4788, "step": 2471 }, { "epoch": 0.7209624498724024, "grad_norm": 2.2436928749084473, "learning_rate": 0.0001520326784672243, "loss": 4.5515, "step": 2472 }, { "epoch": 0.7212541013488881, "grad_norm": 2.142392158508301, "learning_rate": 0.00015201322699863842, "loss": 4.6362, "step": 2473 }, { "epoch": 0.7215457528253737, "grad_norm": 2.482677459716797, "learning_rate": 0.0001519937755300525, "loss": 4.4149, "step": 2474 }, { "epoch": 0.7218374043018593, "grad_norm": 2.6646342277526855, "learning_rate": 0.00015197432406146666, "loss": 4.3822, "step": 2475 }, { "epoch": 0.7221290557783449, "grad_norm": 3.336907386779785, "learning_rate": 0.00015195487259288078, "loss": 4.357, "step": 2476 }, { "epoch": 0.7224207072548304, "grad_norm": 2.299849271774292, "learning_rate": 0.00015193542112429487, "loss": 4.3576, "step": 2477 }, { "epoch": 0.7227123587313161, "grad_norm": 2.1935248374938965, "learning_rate": 0.00015191596965570902, "loss": 4.3169, "step": 2478 }, { "epoch": 0.7230040102078017, "grad_norm": 2.1051907539367676, "learning_rate": 0.00015189651818712315, "loss": 4.2118, "step": 2479 }, { "epoch": 0.7232956616842873, "grad_norm": 2.616067886352539, "learning_rate": 0.00015187706671853727, "loss": 4.2067, "step": 2480 }, { "epoch": 0.7235873131607728, "grad_norm": 3.4021403789520264, "learning_rate": 0.00015185761524995136, "loss": 4.5349, "step": 2481 }, { "epoch": 0.7238789646372584, "grad_norm": 3.519183397293091, "learning_rate": 0.0001518381637813655, "loss": 4.3055, "step": 2482 }, { "epoch": 0.7241706161137441, "grad_norm": 2.6832196712493896, "learning_rate": 0.00015181871231277963, "loss": 4.4909, "step": 2483 }, { "epoch": 0.7244622675902297, "grad_norm": 2.743666648864746, "learning_rate": 0.00015179926084419373, "loss": 4.2311, "step": 2484 }, { "epoch": 0.7247539190667153, "grad_norm": 6.166667938232422, "learning_rate": 0.00015177980937560787, "loss": 4.7073, "step": 2485 }, { "epoch": 0.7250455705432008, "grad_norm": 2.1623528003692627, "learning_rate": 0.000151760357907022, "loss": 4.4569, "step": 2486 }, { "epoch": 0.7253372220196864, "grad_norm": 2.2586164474487305, "learning_rate": 0.0001517409064384361, "loss": 4.2482, "step": 2487 }, { "epoch": 0.7256288734961721, "grad_norm": 2.654231309890747, "learning_rate": 0.00015172145496985024, "loss": 4.1031, "step": 2488 }, { "epoch": 0.7259205249726577, "grad_norm": 3.006989002227783, "learning_rate": 0.00015170200350126436, "loss": 4.4323, "step": 2489 }, { "epoch": 0.7262121764491433, "grad_norm": 4.012389183044434, "learning_rate": 0.00015168255203267848, "loss": 4.3016, "step": 2490 }, { "epoch": 0.7265038279256288, "grad_norm": 3.213592052459717, "learning_rate": 0.00015166310056409258, "loss": 4.5464, "step": 2491 }, { "epoch": 0.7267954794021144, "grad_norm": 1.9080443382263184, "learning_rate": 0.00015164364909550673, "loss": 4.4964, "step": 2492 }, { "epoch": 0.7270871308786001, "grad_norm": 2.9068989753723145, "learning_rate": 0.00015162419762692085, "loss": 4.5211, "step": 2493 }, { "epoch": 0.7273787823550857, "grad_norm": 2.892639636993408, "learning_rate": 0.00015160474615833494, "loss": 4.2375, "step": 2494 }, { "epoch": 0.7276704338315713, "grad_norm": 2.444861888885498, "learning_rate": 0.0001515852946897491, "loss": 4.7689, "step": 2495 }, { "epoch": 0.7279620853080568, "grad_norm": 1.4792031049728394, "learning_rate": 0.0001515658432211632, "loss": 4.0987, "step": 2496 }, { "epoch": 0.7282537367845425, "grad_norm": 1.9943393468856812, "learning_rate": 0.00015154639175257733, "loss": 4.6188, "step": 2497 }, { "epoch": 0.7285453882610281, "grad_norm": 2.0389087200164795, "learning_rate": 0.00015152694028399146, "loss": 4.3206, "step": 2498 }, { "epoch": 0.7288370397375137, "grad_norm": 2.1280531883239746, "learning_rate": 0.00015150748881540558, "loss": 4.1992, "step": 2499 }, { "epoch": 0.7291286912139993, "grad_norm": 2.8128292560577393, "learning_rate": 0.0001514880373468197, "loss": 4.3401, "step": 2500 }, { "epoch": 0.7294203426904848, "grad_norm": 3.27543306350708, "learning_rate": 0.0001514685858782338, "loss": 4.6573, "step": 2501 }, { "epoch": 0.7297119941669705, "grad_norm": 3.0491392612457275, "learning_rate": 0.00015144913440964794, "loss": 4.5398, "step": 2502 }, { "epoch": 0.7300036456434561, "grad_norm": 2.4554221630096436, "learning_rate": 0.00015142968294106206, "loss": 4.5896, "step": 2503 }, { "epoch": 0.7302952971199417, "grad_norm": 2.72151517868042, "learning_rate": 0.00015141023147247618, "loss": 4.5714, "step": 2504 }, { "epoch": 0.7305869485964273, "grad_norm": 2.408367872238159, "learning_rate": 0.0001513907800038903, "loss": 4.1197, "step": 2505 }, { "epoch": 0.7308786000729128, "grad_norm": 2.189120292663574, "learning_rate": 0.00015137132853530443, "loss": 4.3652, "step": 2506 }, { "epoch": 0.7311702515493985, "grad_norm": 2.6977298259735107, "learning_rate": 0.00015135187706671855, "loss": 4.2133, "step": 2507 }, { "epoch": 0.7314619030258841, "grad_norm": 1.7989650964736938, "learning_rate": 0.00015133242559813267, "loss": 4.228, "step": 2508 }, { "epoch": 0.7317535545023697, "grad_norm": 2.354710578918457, "learning_rate": 0.0001513129741295468, "loss": 4.3489, "step": 2509 }, { "epoch": 0.7320452059788553, "grad_norm": 2.91229248046875, "learning_rate": 0.00015129352266096091, "loss": 4.563, "step": 2510 }, { "epoch": 0.7323368574553408, "grad_norm": 3.5529937744140625, "learning_rate": 0.000151274071192375, "loss": 4.5673, "step": 2511 }, { "epoch": 0.7326285089318265, "grad_norm": 4.80015754699707, "learning_rate": 0.00015125461972378916, "loss": 4.4224, "step": 2512 }, { "epoch": 0.7329201604083121, "grad_norm": 1.9141250848770142, "learning_rate": 0.00015123516825520328, "loss": 4.4967, "step": 2513 }, { "epoch": 0.7332118118847977, "grad_norm": 2.330155611038208, "learning_rate": 0.0001512157167866174, "loss": 4.3732, "step": 2514 }, { "epoch": 0.7335034633612832, "grad_norm": 2.64730167388916, "learning_rate": 0.00015119626531803152, "loss": 4.5736, "step": 2515 }, { "epoch": 0.7337951148377688, "grad_norm": 2.3277840614318848, "learning_rate": 0.00015117681384944564, "loss": 4.6258, "step": 2516 }, { "epoch": 0.7340867663142545, "grad_norm": 2.456047296524048, "learning_rate": 0.00015115736238085977, "loss": 4.5319, "step": 2517 }, { "epoch": 0.7343784177907401, "grad_norm": 2.2502269744873047, "learning_rate": 0.0001511379109122739, "loss": 4.561, "step": 2518 }, { "epoch": 0.7346700692672257, "grad_norm": 3.851377010345459, "learning_rate": 0.000151118459443688, "loss": 4.2302, "step": 2519 }, { "epoch": 0.7349617207437112, "grad_norm": 2.623746395111084, "learning_rate": 0.00015109900797510213, "loss": 4.6807, "step": 2520 }, { "epoch": 0.7352533722201968, "grad_norm": 3.2555370330810547, "learning_rate": 0.00015107955650651625, "loss": 4.4573, "step": 2521 }, { "epoch": 0.7355450236966825, "grad_norm": 2.389580488204956, "learning_rate": 0.00015106010503793037, "loss": 4.3903, "step": 2522 }, { "epoch": 0.7358366751731681, "grad_norm": 2.487711191177368, "learning_rate": 0.0001510406535693445, "loss": 4.5264, "step": 2523 }, { "epoch": 0.7361283266496537, "grad_norm": 2.8673009872436523, "learning_rate": 0.00015102120210075862, "loss": 4.4616, "step": 2524 }, { "epoch": 0.7364199781261392, "grad_norm": 3.2879140377044678, "learning_rate": 0.00015100175063217274, "loss": 4.6505, "step": 2525 }, { "epoch": 0.7367116296026248, "grad_norm": 2.031306028366089, "learning_rate": 0.00015098229916358686, "loss": 4.4866, "step": 2526 }, { "epoch": 0.7370032810791105, "grad_norm": 2.754295587539673, "learning_rate": 0.00015096284769500098, "loss": 4.5423, "step": 2527 }, { "epoch": 0.7372949325555961, "grad_norm": 2.0576207637786865, "learning_rate": 0.0001509433962264151, "loss": 4.3751, "step": 2528 }, { "epoch": 0.7375865840320817, "grad_norm": 2.5293495655059814, "learning_rate": 0.00015092394475782922, "loss": 4.3068, "step": 2529 }, { "epoch": 0.7378782355085672, "grad_norm": 3.351627826690674, "learning_rate": 0.00015090449328924335, "loss": 4.7179, "step": 2530 }, { "epoch": 0.7381698869850528, "grad_norm": 2.454860210418701, "learning_rate": 0.00015088504182065747, "loss": 4.3633, "step": 2531 }, { "epoch": 0.7384615384615385, "grad_norm": 3.20382022857666, "learning_rate": 0.0001508655903520716, "loss": 4.2449, "step": 2532 }, { "epoch": 0.7387531899380241, "grad_norm": 2.2893733978271484, "learning_rate": 0.0001508461388834857, "loss": 4.4071, "step": 2533 }, { "epoch": 0.7390448414145097, "grad_norm": 2.7135190963745117, "learning_rate": 0.00015082668741489983, "loss": 4.1784, "step": 2534 }, { "epoch": 0.7393364928909952, "grad_norm": 2.2014260292053223, "learning_rate": 0.00015080723594631395, "loss": 4.4314, "step": 2535 }, { "epoch": 0.7396281443674808, "grad_norm": 2.2257750034332275, "learning_rate": 0.00015078778447772808, "loss": 4.4173, "step": 2536 }, { "epoch": 0.7399197958439665, "grad_norm": 2.3861968517303467, "learning_rate": 0.0001507683330091422, "loss": 4.4798, "step": 2537 }, { "epoch": 0.7402114473204521, "grad_norm": 3.350811243057251, "learning_rate": 0.00015074888154055632, "loss": 4.3382, "step": 2538 }, { "epoch": 0.7405030987969377, "grad_norm": 3.491464853286743, "learning_rate": 0.00015072943007197044, "loss": 4.3167, "step": 2539 }, { "epoch": 0.7407947502734232, "grad_norm": 2.5795419216156006, "learning_rate": 0.00015070997860338456, "loss": 4.3823, "step": 2540 }, { "epoch": 0.7410864017499088, "grad_norm": 3.551936149597168, "learning_rate": 0.00015069052713479868, "loss": 4.5211, "step": 2541 }, { "epoch": 0.7413780532263945, "grad_norm": 2.3369219303131104, "learning_rate": 0.0001506710756662128, "loss": 4.3135, "step": 2542 }, { "epoch": 0.7416697047028801, "grad_norm": 1.785071611404419, "learning_rate": 0.00015065162419762693, "loss": 4.18, "step": 2543 }, { "epoch": 0.7419613561793656, "grad_norm": 1.9330154657363892, "learning_rate": 0.00015063217272904105, "loss": 4.4413, "step": 2544 }, { "epoch": 0.7422530076558512, "grad_norm": 4.2632341384887695, "learning_rate": 0.00015061272126045517, "loss": 4.7019, "step": 2545 }, { "epoch": 0.7425446591323368, "grad_norm": 2.3958170413970947, "learning_rate": 0.0001505932697918693, "loss": 4.4783, "step": 2546 }, { "epoch": 0.7428363106088225, "grad_norm": 2.7379770278930664, "learning_rate": 0.0001505738183232834, "loss": 3.953, "step": 2547 }, { "epoch": 0.7431279620853081, "grad_norm": 2.573438882827759, "learning_rate": 0.00015055436685469753, "loss": 4.657, "step": 2548 }, { "epoch": 0.7434196135617936, "grad_norm": 2.200711250305176, "learning_rate": 0.00015053491538611166, "loss": 4.3571, "step": 2549 }, { "epoch": 0.7437112650382792, "grad_norm": 2.156052589416504, "learning_rate": 0.00015051546391752578, "loss": 4.2926, "step": 2550 }, { "epoch": 0.7440029165147649, "grad_norm": 1.9511581659317017, "learning_rate": 0.0001504960124489399, "loss": 4.1399, "step": 2551 }, { "epoch": 0.7442945679912505, "grad_norm": 2.4679152965545654, "learning_rate": 0.00015047656098035402, "loss": 4.2698, "step": 2552 }, { "epoch": 0.7445862194677361, "grad_norm": 2.2492122650146484, "learning_rate": 0.00015045710951176814, "loss": 4.4791, "step": 2553 }, { "epoch": 0.7448778709442216, "grad_norm": 2.4647247791290283, "learning_rate": 0.00015043765804318226, "loss": 4.4197, "step": 2554 }, { "epoch": 0.7451695224207072, "grad_norm": 2.4546120166778564, "learning_rate": 0.00015041820657459639, "loss": 4.425, "step": 2555 }, { "epoch": 0.7454611738971929, "grad_norm": 2.166318416595459, "learning_rate": 0.0001503987551060105, "loss": 4.5035, "step": 2556 }, { "epoch": 0.7457528253736785, "grad_norm": 2.340508222579956, "learning_rate": 0.00015037930363742463, "loss": 4.3393, "step": 2557 }, { "epoch": 0.7460444768501641, "grad_norm": 2.4203436374664307, "learning_rate": 0.00015035985216883875, "loss": 4.233, "step": 2558 }, { "epoch": 0.7463361283266496, "grad_norm": 2.5152928829193115, "learning_rate": 0.00015034040070025287, "loss": 4.446, "step": 2559 }, { "epoch": 0.7466277798031352, "grad_norm": 1.9174275398254395, "learning_rate": 0.000150320949231667, "loss": 4.4793, "step": 2560 }, { "epoch": 0.7469194312796209, "grad_norm": 1.731989860534668, "learning_rate": 0.00015030149776308112, "loss": 4.43, "step": 2561 }, { "epoch": 0.7472110827561065, "grad_norm": 3.0934271812438965, "learning_rate": 0.00015028204629449524, "loss": 4.6695, "step": 2562 }, { "epoch": 0.7475027342325921, "grad_norm": 1.7175320386886597, "learning_rate": 0.00015026259482590936, "loss": 4.4263, "step": 2563 }, { "epoch": 0.7477943857090776, "grad_norm": 1.9586858749389648, "learning_rate": 0.00015024314335732348, "loss": 4.223, "step": 2564 }, { "epoch": 0.7480860371855632, "grad_norm": 1.9741307497024536, "learning_rate": 0.00015022369188873763, "loss": 4.2853, "step": 2565 }, { "epoch": 0.7483776886620489, "grad_norm": 2.512439727783203, "learning_rate": 0.00015020424042015172, "loss": 4.3686, "step": 2566 }, { "epoch": 0.7486693401385345, "grad_norm": 2.698857069015503, "learning_rate": 0.00015018478895156585, "loss": 4.2286, "step": 2567 }, { "epoch": 0.74896099161502, "grad_norm": 2.2837977409362793, "learning_rate": 0.00015016533748297997, "loss": 4.5319, "step": 2568 }, { "epoch": 0.7492526430915056, "grad_norm": 2.8764264583587646, "learning_rate": 0.0001501458860143941, "loss": 4.5384, "step": 2569 }, { "epoch": 0.7495442945679912, "grad_norm": 2.215221881866455, "learning_rate": 0.0001501264345458082, "loss": 4.59, "step": 2570 }, { "epoch": 0.7498359460444769, "grad_norm": 2.588426351547241, "learning_rate": 0.00015010698307722233, "loss": 4.3583, "step": 2571 }, { "epoch": 0.7501275975209625, "grad_norm": 2.6747703552246094, "learning_rate": 0.00015008753160863648, "loss": 4.3123, "step": 2572 }, { "epoch": 0.750419248997448, "grad_norm": 2.4299211502075195, "learning_rate": 0.00015006808014005057, "loss": 4.5279, "step": 2573 }, { "epoch": 0.7507109004739336, "grad_norm": 1.9714173078536987, "learning_rate": 0.0001500486286714647, "loss": 4.1995, "step": 2574 }, { "epoch": 0.7510025519504192, "grad_norm": 2.560520887374878, "learning_rate": 0.00015002917720287885, "loss": 4.4513, "step": 2575 }, { "epoch": 0.7512942034269049, "grad_norm": 2.4149935245513916, "learning_rate": 0.00015000972573429294, "loss": 4.3276, "step": 2576 }, { "epoch": 0.7515858549033905, "grad_norm": 2.3523502349853516, "learning_rate": 0.00014999027426570706, "loss": 4.2937, "step": 2577 }, { "epoch": 0.751877506379876, "grad_norm": 1.8668893575668335, "learning_rate": 0.00014997082279712118, "loss": 4.3965, "step": 2578 }, { "epoch": 0.7521691578563616, "grad_norm": 2.630133867263794, "learning_rate": 0.0001499513713285353, "loss": 4.4303, "step": 2579 }, { "epoch": 0.7524608093328472, "grad_norm": 2.175570487976074, "learning_rate": 0.00014993191985994943, "loss": 4.6205, "step": 2580 }, { "epoch": 0.7527524608093329, "grad_norm": 2.311983823776245, "learning_rate": 0.00014991246839136355, "loss": 4.3181, "step": 2581 }, { "epoch": 0.7530441122858185, "grad_norm": 2.3136041164398193, "learning_rate": 0.0001498930169227777, "loss": 4.5681, "step": 2582 }, { "epoch": 0.753335763762304, "grad_norm": 1.6228424310684204, "learning_rate": 0.0001498735654541918, "loss": 4.2127, "step": 2583 }, { "epoch": 0.7536274152387896, "grad_norm": 1.8283442258834839, "learning_rate": 0.0001498541139856059, "loss": 4.4317, "step": 2584 }, { "epoch": 0.7539190667152752, "grad_norm": 2.853330373764038, "learning_rate": 0.00014983466251702006, "loss": 4.4381, "step": 2585 }, { "epoch": 0.7542107181917609, "grad_norm": 2.0398528575897217, "learning_rate": 0.00014981521104843416, "loss": 4.3575, "step": 2586 }, { "epoch": 0.7545023696682465, "grad_norm": 2.1052005290985107, "learning_rate": 0.00014979575957984828, "loss": 4.573, "step": 2587 }, { "epoch": 0.754794021144732, "grad_norm": 2.457582473754883, "learning_rate": 0.0001497763081112624, "loss": 4.5729, "step": 2588 }, { "epoch": 0.7550856726212176, "grad_norm": 2.348501443862915, "learning_rate": 0.00014975685664267655, "loss": 4.6718, "step": 2589 }, { "epoch": 0.7553773240977032, "grad_norm": 1.9891310930252075, "learning_rate": 0.00014973740517409064, "loss": 4.4835, "step": 2590 }, { "epoch": 0.7556689755741889, "grad_norm": 2.260352373123169, "learning_rate": 0.00014971795370550476, "loss": 4.0953, "step": 2591 }, { "epoch": 0.7559606270506745, "grad_norm": 2.5519297122955322, "learning_rate": 0.0001496985022369189, "loss": 4.2248, "step": 2592 }, { "epoch": 0.75625227852716, "grad_norm": 2.260202407836914, "learning_rate": 0.000149679050768333, "loss": 4.4188, "step": 2593 }, { "epoch": 0.7565439300036456, "grad_norm": 3.0118587017059326, "learning_rate": 0.00014965959929974713, "loss": 4.4791, "step": 2594 }, { "epoch": 0.7568355814801312, "grad_norm": 2.044414758682251, "learning_rate": 0.00014964014783116128, "loss": 4.179, "step": 2595 }, { "epoch": 0.7571272329566169, "grad_norm": 2.3002800941467285, "learning_rate": 0.0001496206963625754, "loss": 4.4128, "step": 2596 }, { "epoch": 0.7574188844331025, "grad_norm": 2.06858229637146, "learning_rate": 0.0001496012448939895, "loss": 4.1121, "step": 2597 }, { "epoch": 0.757710535909588, "grad_norm": 2.1476612091064453, "learning_rate": 0.00014958179342540361, "loss": 4.4002, "step": 2598 }, { "epoch": 0.7580021873860736, "grad_norm": 1.9278197288513184, "learning_rate": 0.00014956234195681776, "loss": 4.5416, "step": 2599 }, { "epoch": 0.7582938388625592, "grad_norm": 3.2266902923583984, "learning_rate": 0.00014954289048823186, "loss": 4.2859, "step": 2600 }, { "epoch": 0.7585854903390449, "grad_norm": 2.0428555011749268, "learning_rate": 0.00014952343901964598, "loss": 4.0481, "step": 2601 }, { "epoch": 0.7588771418155305, "grad_norm": 3.077526330947876, "learning_rate": 0.00014950398755106013, "loss": 4.3295, "step": 2602 }, { "epoch": 0.759168793292016, "grad_norm": 2.7813191413879395, "learning_rate": 0.00014948453608247422, "loss": 4.2366, "step": 2603 }, { "epoch": 0.7594604447685016, "grad_norm": 2.1143674850463867, "learning_rate": 0.00014946508461388834, "loss": 4.1664, "step": 2604 }, { "epoch": 0.7597520962449872, "grad_norm": 2.5403287410736084, "learning_rate": 0.0001494456331453025, "loss": 4.5605, "step": 2605 }, { "epoch": 0.7600437477214729, "grad_norm": 2.1761064529418945, "learning_rate": 0.00014942618167671661, "loss": 4.4216, "step": 2606 }, { "epoch": 0.7603353991979585, "grad_norm": 2.2008280754089355, "learning_rate": 0.0001494067302081307, "loss": 4.7318, "step": 2607 }, { "epoch": 0.760627050674444, "grad_norm": 2.261990547180176, "learning_rate": 0.00014938727873954483, "loss": 4.5407, "step": 2608 }, { "epoch": 0.7609187021509296, "grad_norm": 1.999077320098877, "learning_rate": 0.00014936782727095898, "loss": 4.4835, "step": 2609 }, { "epoch": 0.7612103536274153, "grad_norm": 3.192777633666992, "learning_rate": 0.00014934837580237307, "loss": 4.4682, "step": 2610 }, { "epoch": 0.7615020051039009, "grad_norm": 2.3221678733825684, "learning_rate": 0.0001493289243337872, "loss": 4.37, "step": 2611 }, { "epoch": 0.7617936565803864, "grad_norm": 3.2160255908966064, "learning_rate": 0.00014930947286520134, "loss": 4.6294, "step": 2612 }, { "epoch": 0.762085308056872, "grad_norm": 2.412095308303833, "learning_rate": 0.00014929002139661547, "loss": 4.5589, "step": 2613 }, { "epoch": 0.7623769595333576, "grad_norm": 2.2397403717041016, "learning_rate": 0.00014927056992802956, "loss": 4.7157, "step": 2614 }, { "epoch": 0.7626686110098433, "grad_norm": 3.383363723754883, "learning_rate": 0.0001492511184594437, "loss": 4.3313, "step": 2615 }, { "epoch": 0.7629602624863289, "grad_norm": 3.7269222736358643, "learning_rate": 0.00014923166699085783, "loss": 4.2557, "step": 2616 }, { "epoch": 0.7632519139628144, "grad_norm": 3.3269898891448975, "learning_rate": 0.00014921221552227192, "loss": 4.3258, "step": 2617 }, { "epoch": 0.7635435654393, "grad_norm": 2.3037893772125244, "learning_rate": 0.00014919276405368605, "loss": 4.4888, "step": 2618 }, { "epoch": 0.7638352169157856, "grad_norm": 1.9640206098556519, "learning_rate": 0.0001491733125851002, "loss": 4.5742, "step": 2619 }, { "epoch": 0.7641268683922713, "grad_norm": 2.953092336654663, "learning_rate": 0.00014915386111651432, "loss": 4.2986, "step": 2620 }, { "epoch": 0.7644185198687569, "grad_norm": 3.389009475708008, "learning_rate": 0.0001491344096479284, "loss": 4.56, "step": 2621 }, { "epoch": 0.7647101713452424, "grad_norm": 4.165693759918213, "learning_rate": 0.00014911495817934256, "loss": 4.1718, "step": 2622 }, { "epoch": 0.765001822821728, "grad_norm": 2.410318374633789, "learning_rate": 0.00014909550671075668, "loss": 4.2228, "step": 2623 }, { "epoch": 0.7652934742982136, "grad_norm": 2.337794303894043, "learning_rate": 0.00014907605524217078, "loss": 4.5071, "step": 2624 }, { "epoch": 0.7655851257746993, "grad_norm": 2.490621328353882, "learning_rate": 0.0001490566037735849, "loss": 4.3789, "step": 2625 }, { "epoch": 0.7658767772511849, "grad_norm": 1.9605070352554321, "learning_rate": 0.00014903715230499905, "loss": 4.3857, "step": 2626 }, { "epoch": 0.7661684287276704, "grad_norm": 2.428678512573242, "learning_rate": 0.00014901770083641314, "loss": 4.3008, "step": 2627 }, { "epoch": 0.766460080204156, "grad_norm": 1.9955047369003296, "learning_rate": 0.00014899824936782726, "loss": 4.3917, "step": 2628 }, { "epoch": 0.7667517316806416, "grad_norm": 2.7258059978485107, "learning_rate": 0.0001489787978992414, "loss": 4.4493, "step": 2629 }, { "epoch": 0.7670433831571273, "grad_norm": 3.3363802433013916, "learning_rate": 0.00014895934643065553, "loss": 4.2992, "step": 2630 }, { "epoch": 0.7673350346336129, "grad_norm": 2.1759140491485596, "learning_rate": 0.00014893989496206963, "loss": 4.3446, "step": 2631 }, { "epoch": 0.7676266861100984, "grad_norm": 2.5120561122894287, "learning_rate": 0.00014892044349348378, "loss": 4.2935, "step": 2632 }, { "epoch": 0.767918337586584, "grad_norm": 1.6992019414901733, "learning_rate": 0.0001489009920248979, "loss": 4.5706, "step": 2633 }, { "epoch": 0.7682099890630696, "grad_norm": 2.1700103282928467, "learning_rate": 0.000148881540556312, "loss": 4.2669, "step": 2634 }, { "epoch": 0.7685016405395553, "grad_norm": 1.7107188701629639, "learning_rate": 0.00014886208908772611, "loss": 4.2448, "step": 2635 }, { "epoch": 0.7687932920160409, "grad_norm": 1.8933542966842651, "learning_rate": 0.00014884263761914026, "loss": 4.3977, "step": 2636 }, { "epoch": 0.7690849434925264, "grad_norm": 2.065427303314209, "learning_rate": 0.00014882318615055438, "loss": 4.1782, "step": 2637 }, { "epoch": 0.769376594969012, "grad_norm": 3.0617923736572266, "learning_rate": 0.00014880373468196848, "loss": 4.3588, "step": 2638 }, { "epoch": 0.7696682464454976, "grad_norm": 3.8835442066192627, "learning_rate": 0.00014878428321338263, "loss": 4.181, "step": 2639 }, { "epoch": 0.7699598979219833, "grad_norm": 2.3070785999298096, "learning_rate": 0.00014876483174479675, "loss": 4.0093, "step": 2640 }, { "epoch": 0.7702515493984688, "grad_norm": 3.674779176712036, "learning_rate": 0.00014874538027621084, "loss": 4.2347, "step": 2641 }, { "epoch": 0.7705432008749544, "grad_norm": 2.254138469696045, "learning_rate": 0.000148725928807625, "loss": 4.501, "step": 2642 }, { "epoch": 0.77083485235144, "grad_norm": 2.2196688652038574, "learning_rate": 0.0001487064773390391, "loss": 4.2943, "step": 2643 }, { "epoch": 0.7711265038279256, "grad_norm": 2.2137110233306885, "learning_rate": 0.00014868702587045324, "loss": 4.4949, "step": 2644 }, { "epoch": 0.7714181553044113, "grad_norm": 3.3340213298797607, "learning_rate": 0.00014866757440186733, "loss": 4.5357, "step": 2645 }, { "epoch": 0.7717098067808968, "grad_norm": 2.8309783935546875, "learning_rate": 0.00014864812293328148, "loss": 4.3272, "step": 2646 }, { "epoch": 0.7720014582573824, "grad_norm": 2.306119441986084, "learning_rate": 0.0001486286714646956, "loss": 4.2266, "step": 2647 }, { "epoch": 0.772293109733868, "grad_norm": 1.9352866411209106, "learning_rate": 0.0001486092199961097, "loss": 4.1099, "step": 2648 }, { "epoch": 0.7725847612103536, "grad_norm": 2.8549890518188477, "learning_rate": 0.00014858976852752384, "loss": 4.325, "step": 2649 }, { "epoch": 0.7728764126868393, "grad_norm": 2.165755271911621, "learning_rate": 0.00014857031705893796, "loss": 4.4118, "step": 2650 }, { "epoch": 0.7731680641633248, "grad_norm": 2.70467472076416, "learning_rate": 0.00014855086559035209, "loss": 4.5081, "step": 2651 }, { "epoch": 0.7734597156398104, "grad_norm": 1.932212233543396, "learning_rate": 0.0001485314141217662, "loss": 4.3532, "step": 2652 }, { "epoch": 0.773751367116296, "grad_norm": 2.5301408767700195, "learning_rate": 0.00014851196265318033, "loss": 4.259, "step": 2653 }, { "epoch": 0.7740430185927816, "grad_norm": 2.4629085063934326, "learning_rate": 0.00014849251118459445, "loss": 4.1954, "step": 2654 }, { "epoch": 0.7743346700692673, "grad_norm": 2.220374345779419, "learning_rate": 0.00014847305971600855, "loss": 4.1817, "step": 2655 }, { "epoch": 0.7746263215457528, "grad_norm": 2.2650363445281982, "learning_rate": 0.0001484536082474227, "loss": 4.2556, "step": 2656 }, { "epoch": 0.7749179730222384, "grad_norm": 3.3331074714660645, "learning_rate": 0.00014843415677883682, "loss": 4.4214, "step": 2657 }, { "epoch": 0.775209624498724, "grad_norm": 1.9340697526931763, "learning_rate": 0.0001484147053102509, "loss": 4.1983, "step": 2658 }, { "epoch": 0.7755012759752096, "grad_norm": 2.173015594482422, "learning_rate": 0.00014839525384166506, "loss": 4.1595, "step": 2659 }, { "epoch": 0.7757929274516953, "grad_norm": 2.430969476699829, "learning_rate": 0.00014837580237307918, "loss": 4.2315, "step": 2660 }, { "epoch": 0.7760845789281808, "grad_norm": 2.4736995697021484, "learning_rate": 0.0001483563509044933, "loss": 4.2856, "step": 2661 }, { "epoch": 0.7763762304046664, "grad_norm": 2.1475958824157715, "learning_rate": 0.00014833689943590742, "loss": 4.4765, "step": 2662 }, { "epoch": 0.776667881881152, "grad_norm": 2.341376543045044, "learning_rate": 0.00014831744796732155, "loss": 4.0557, "step": 2663 }, { "epoch": 0.7769595333576377, "grad_norm": 2.047982692718506, "learning_rate": 0.00014829799649873567, "loss": 4.4349, "step": 2664 }, { "epoch": 0.7772511848341233, "grad_norm": 1.9214835166931152, "learning_rate": 0.00014827854503014976, "loss": 4.1755, "step": 2665 }, { "epoch": 0.7775428363106088, "grad_norm": 2.4037094116210938, "learning_rate": 0.0001482590935615639, "loss": 4.558, "step": 2666 }, { "epoch": 0.7778344877870944, "grad_norm": 5.184794902801514, "learning_rate": 0.00014823964209297803, "loss": 4.5791, "step": 2667 }, { "epoch": 0.77812613926358, "grad_norm": 3.348909854888916, "learning_rate": 0.00014822019062439215, "loss": 4.4718, "step": 2668 }, { "epoch": 0.7784177907400657, "grad_norm": 3.093944549560547, "learning_rate": 0.00014820073915580627, "loss": 4.521, "step": 2669 }, { "epoch": 0.7787094422165513, "grad_norm": 2.511300802230835, "learning_rate": 0.0001481812876872204, "loss": 4.6369, "step": 2670 }, { "epoch": 0.7790010936930368, "grad_norm": 2.6372663974761963, "learning_rate": 0.00014816183621863452, "loss": 4.1814, "step": 2671 }, { "epoch": 0.7792927451695224, "grad_norm": 2.7611749172210693, "learning_rate": 0.00014814238475004864, "loss": 4.3997, "step": 2672 }, { "epoch": 0.779584396646008, "grad_norm": 2.66988205909729, "learning_rate": 0.00014812293328146276, "loss": 4.3161, "step": 2673 }, { "epoch": 0.7798760481224937, "grad_norm": 1.8330650329589844, "learning_rate": 0.00014810348181287688, "loss": 4.3643, "step": 2674 }, { "epoch": 0.7801676995989792, "grad_norm": 2.280526876449585, "learning_rate": 0.000148084030344291, "loss": 4.3506, "step": 2675 }, { "epoch": 0.7804593510754648, "grad_norm": 4.237382411956787, "learning_rate": 0.00014806457887570513, "loss": 4.3707, "step": 2676 }, { "epoch": 0.7807510025519504, "grad_norm": 2.672053337097168, "learning_rate": 0.00014804512740711925, "loss": 4.645, "step": 2677 }, { "epoch": 0.781042654028436, "grad_norm": 2.9069340229034424, "learning_rate": 0.00014802567593853337, "loss": 4.3737, "step": 2678 }, { "epoch": 0.7813343055049217, "grad_norm": 2.7803750038146973, "learning_rate": 0.0001480062244699475, "loss": 4.5758, "step": 2679 }, { "epoch": 0.7816259569814072, "grad_norm": 3.9649910926818848, "learning_rate": 0.0001479867730013616, "loss": 4.562, "step": 2680 }, { "epoch": 0.7819176084578928, "grad_norm": 2.179276704788208, "learning_rate": 0.00014796732153277573, "loss": 4.3436, "step": 2681 }, { "epoch": 0.7822092599343784, "grad_norm": 2.465590238571167, "learning_rate": 0.00014794787006418986, "loss": 4.5166, "step": 2682 }, { "epoch": 0.782500911410864, "grad_norm": 2.1283650398254395, "learning_rate": 0.00014792841859560398, "loss": 4.5623, "step": 2683 }, { "epoch": 0.7827925628873497, "grad_norm": 2.2230348587036133, "learning_rate": 0.0001479089671270181, "loss": 4.4866, "step": 2684 }, { "epoch": 0.7830842143638352, "grad_norm": 2.352076768875122, "learning_rate": 0.00014788951565843222, "loss": 4.5161, "step": 2685 }, { "epoch": 0.7833758658403208, "grad_norm": 2.088242530822754, "learning_rate": 0.00014787006418984634, "loss": 4.2795, "step": 2686 }, { "epoch": 0.7836675173168064, "grad_norm": 2.553769111633301, "learning_rate": 0.00014785061272126046, "loss": 4.3552, "step": 2687 }, { "epoch": 0.783959168793292, "grad_norm": 2.4642679691314697, "learning_rate": 0.00014783116125267459, "loss": 4.4414, "step": 2688 }, { "epoch": 0.7842508202697777, "grad_norm": 3.130713939666748, "learning_rate": 0.0001478117097840887, "loss": 4.3209, "step": 2689 }, { "epoch": 0.7845424717462632, "grad_norm": 2.4005677700042725, "learning_rate": 0.00014779225831550283, "loss": 4.4097, "step": 2690 }, { "epoch": 0.7848341232227488, "grad_norm": 2.2695508003234863, "learning_rate": 0.00014777280684691695, "loss": 4.6433, "step": 2691 }, { "epoch": 0.7851257746992344, "grad_norm": 2.072512626647949, "learning_rate": 0.00014775335537833107, "loss": 4.5608, "step": 2692 }, { "epoch": 0.78541742617572, "grad_norm": 2.648627519607544, "learning_rate": 0.0001477339039097452, "loss": 4.4393, "step": 2693 }, { "epoch": 0.7857090776522057, "grad_norm": 1.766588568687439, "learning_rate": 0.00014771445244115931, "loss": 4.2704, "step": 2694 }, { "epoch": 0.7860007291286912, "grad_norm": 2.142263412475586, "learning_rate": 0.00014769500097257344, "loss": 4.0813, "step": 2695 }, { "epoch": 0.7862923806051768, "grad_norm": 2.6213228702545166, "learning_rate": 0.00014767554950398756, "loss": 4.4218, "step": 2696 }, { "epoch": 0.7865840320816624, "grad_norm": 1.7448598146438599, "learning_rate": 0.00014765609803540168, "loss": 4.1037, "step": 2697 }, { "epoch": 0.786875683558148, "grad_norm": 2.846231460571289, "learning_rate": 0.0001476366465668158, "loss": 4.6316, "step": 2698 }, { "epoch": 0.7871673350346337, "grad_norm": 2.4942286014556885, "learning_rate": 0.00014761719509822992, "loss": 4.1977, "step": 2699 }, { "epoch": 0.7874589865111192, "grad_norm": 2.1270546913146973, "learning_rate": 0.00014759774362964404, "loss": 4.4343, "step": 2700 }, { "epoch": 0.7877506379876048, "grad_norm": 2.8405227661132812, "learning_rate": 0.00014757829216105817, "loss": 4.4517, "step": 2701 }, { "epoch": 0.7880422894640904, "grad_norm": 2.2319021224975586, "learning_rate": 0.0001475588406924723, "loss": 4.2055, "step": 2702 }, { "epoch": 0.788333940940576, "grad_norm": 1.8203620910644531, "learning_rate": 0.0001475393892238864, "loss": 4.4264, "step": 2703 }, { "epoch": 0.7886255924170616, "grad_norm": 2.1274237632751465, "learning_rate": 0.00014751993775530053, "loss": 4.1921, "step": 2704 }, { "epoch": 0.7889172438935472, "grad_norm": 2.0734095573425293, "learning_rate": 0.00014750048628671465, "loss": 4.3277, "step": 2705 }, { "epoch": 0.7892088953700328, "grad_norm": 2.2385380268096924, "learning_rate": 0.00014748103481812877, "loss": 4.438, "step": 2706 }, { "epoch": 0.7895005468465184, "grad_norm": 2.907902479171753, "learning_rate": 0.0001474615833495429, "loss": 4.4831, "step": 2707 }, { "epoch": 0.789792198323004, "grad_norm": 3.276188611984253, "learning_rate": 0.00014744213188095702, "loss": 4.2925, "step": 2708 }, { "epoch": 0.7900838497994896, "grad_norm": 1.9432767629623413, "learning_rate": 0.00014742268041237114, "loss": 4.2178, "step": 2709 }, { "epoch": 0.7903755012759752, "grad_norm": 2.7607250213623047, "learning_rate": 0.00014740322894378526, "loss": 4.5827, "step": 2710 }, { "epoch": 0.7906671527524608, "grad_norm": 1.6210548877716064, "learning_rate": 0.00014738377747519938, "loss": 4.374, "step": 2711 }, { "epoch": 0.7909588042289464, "grad_norm": 2.4093239307403564, "learning_rate": 0.0001473643260066135, "loss": 4.3348, "step": 2712 }, { "epoch": 0.791250455705432, "grad_norm": 2.1778159141540527, "learning_rate": 0.00014734487453802763, "loss": 4.4491, "step": 2713 }, { "epoch": 0.7915421071819176, "grad_norm": 2.40640926361084, "learning_rate": 0.00014732542306944175, "loss": 4.6195, "step": 2714 }, { "epoch": 0.7918337586584032, "grad_norm": 2.953805923461914, "learning_rate": 0.00014730597160085587, "loss": 4.6112, "step": 2715 }, { "epoch": 0.7921254101348888, "grad_norm": 2.000847578048706, "learning_rate": 0.00014728652013227, "loss": 4.2747, "step": 2716 }, { "epoch": 0.7924170616113744, "grad_norm": 3.075915575027466, "learning_rate": 0.0001472670686636841, "loss": 4.4536, "step": 2717 }, { "epoch": 0.7927087130878601, "grad_norm": 2.234755277633667, "learning_rate": 0.00014724761719509823, "loss": 4.2811, "step": 2718 }, { "epoch": 0.7930003645643456, "grad_norm": 2.372243881225586, "learning_rate": 0.00014722816572651235, "loss": 4.5084, "step": 2719 }, { "epoch": 0.7932920160408312, "grad_norm": 2.2752795219421387, "learning_rate": 0.00014720871425792648, "loss": 4.331, "step": 2720 }, { "epoch": 0.7935836675173168, "grad_norm": 2.6117711067199707, "learning_rate": 0.0001471892627893406, "loss": 4.29, "step": 2721 }, { "epoch": 0.7938753189938024, "grad_norm": 2.209355115890503, "learning_rate": 0.00014716981132075472, "loss": 4.3354, "step": 2722 }, { "epoch": 0.7941669704702881, "grad_norm": 2.3292195796966553, "learning_rate": 0.00014715035985216884, "loss": 4.3599, "step": 2723 }, { "epoch": 0.7944586219467736, "grad_norm": 2.2322793006896973, "learning_rate": 0.00014713090838358296, "loss": 4.4236, "step": 2724 }, { "epoch": 0.7947502734232592, "grad_norm": 2.077883720397949, "learning_rate": 0.00014711145691499708, "loss": 4.4738, "step": 2725 }, { "epoch": 0.7950419248997448, "grad_norm": 2.3727076053619385, "learning_rate": 0.0001470920054464112, "loss": 4.2267, "step": 2726 }, { "epoch": 0.7953335763762304, "grad_norm": 3.818071126937866, "learning_rate": 0.00014707255397782533, "loss": 4.6729, "step": 2727 }, { "epoch": 0.795625227852716, "grad_norm": 3.410290002822876, "learning_rate": 0.00014705310250923945, "loss": 4.3173, "step": 2728 }, { "epoch": 0.7959168793292016, "grad_norm": 2.5613207817077637, "learning_rate": 0.0001470336510406536, "loss": 4.6593, "step": 2729 }, { "epoch": 0.7962085308056872, "grad_norm": 2.3325135707855225, "learning_rate": 0.0001470141995720677, "loss": 4.4816, "step": 2730 }, { "epoch": 0.7965001822821728, "grad_norm": 1.853758692741394, "learning_rate": 0.00014699474810348181, "loss": 3.7596, "step": 2731 }, { "epoch": 0.7967918337586584, "grad_norm": 2.0903377532958984, "learning_rate": 0.00014697529663489594, "loss": 4.2751, "step": 2732 }, { "epoch": 0.797083485235144, "grad_norm": 3.467167377471924, "learning_rate": 0.00014695584516631006, "loss": 4.2386, "step": 2733 }, { "epoch": 0.7973751367116296, "grad_norm": 3.366417646408081, "learning_rate": 0.00014693639369772418, "loss": 4.5783, "step": 2734 }, { "epoch": 0.7976667881881152, "grad_norm": 2.73396897315979, "learning_rate": 0.0001469169422291383, "loss": 4.1724, "step": 2735 }, { "epoch": 0.7979584396646008, "grad_norm": 3.3771026134490967, "learning_rate": 0.00014689749076055245, "loss": 4.1811, "step": 2736 }, { "epoch": 0.7982500911410864, "grad_norm": 3.060520648956299, "learning_rate": 0.00014687803929196654, "loss": 4.4575, "step": 2737 }, { "epoch": 0.798541742617572, "grad_norm": 2.3645734786987305, "learning_rate": 0.00014685858782338066, "loss": 4.3076, "step": 2738 }, { "epoch": 0.7988333940940576, "grad_norm": 2.4960615634918213, "learning_rate": 0.00014683913635479481, "loss": 4.2466, "step": 2739 }, { "epoch": 0.7991250455705432, "grad_norm": 3.2011029720306396, "learning_rate": 0.0001468196848862089, "loss": 4.6807, "step": 2740 }, { "epoch": 0.7994166970470288, "grad_norm": 2.5132803916931152, "learning_rate": 0.00014680023341762303, "loss": 4.0018, "step": 2741 }, { "epoch": 0.7997083485235144, "grad_norm": 2.7028188705444336, "learning_rate": 0.00014678078194903715, "loss": 4.4993, "step": 2742 }, { "epoch": 0.8, "grad_norm": 1.9762917757034302, "learning_rate": 0.0001467613304804513, "loss": 4.4127, "step": 2743 }, { "epoch": 0.8002916514764856, "grad_norm": 2.5142664909362793, "learning_rate": 0.0001467418790118654, "loss": 4.3237, "step": 2744 }, { "epoch": 0.8005833029529712, "grad_norm": 2.2105867862701416, "learning_rate": 0.00014672242754327952, "loss": 4.4301, "step": 2745 }, { "epoch": 0.8008749544294568, "grad_norm": 2.0689358711242676, "learning_rate": 0.00014670297607469366, "loss": 4.5103, "step": 2746 }, { "epoch": 0.8011666059059424, "grad_norm": 1.9372881650924683, "learning_rate": 0.00014668352460610776, "loss": 4.292, "step": 2747 }, { "epoch": 0.801458257382428, "grad_norm": 2.311852216720581, "learning_rate": 0.00014666407313752188, "loss": 3.9273, "step": 2748 }, { "epoch": 0.8017499088589136, "grad_norm": 3.405014991760254, "learning_rate": 0.00014664462166893603, "loss": 4.4859, "step": 2749 }, { "epoch": 0.8020415603353992, "grad_norm": 2.607384204864502, "learning_rate": 0.00014662517020035012, "loss": 4.0965, "step": 2750 }, { "epoch": 0.8023332118118848, "grad_norm": 2.7967336177825928, "learning_rate": 0.00014660571873176425, "loss": 4.3429, "step": 2751 }, { "epoch": 0.8026248632883703, "grad_norm": 2.3427586555480957, "learning_rate": 0.00014658626726317837, "loss": 4.6198, "step": 2752 }, { "epoch": 0.802916514764856, "grad_norm": 3.3676888942718506, "learning_rate": 0.00014656681579459252, "loss": 4.5185, "step": 2753 }, { "epoch": 0.8032081662413416, "grad_norm": 2.4695355892181396, "learning_rate": 0.0001465473643260066, "loss": 4.5267, "step": 2754 }, { "epoch": 0.8034998177178272, "grad_norm": 2.6803486347198486, "learning_rate": 0.00014652791285742073, "loss": 4.4544, "step": 2755 }, { "epoch": 0.8037914691943128, "grad_norm": 2.3630003929138184, "learning_rate": 0.00014650846138883488, "loss": 4.5557, "step": 2756 }, { "epoch": 0.8040831206707983, "grad_norm": 3.1077864170074463, "learning_rate": 0.00014648900992024898, "loss": 4.4327, "step": 2757 }, { "epoch": 0.804374772147284, "grad_norm": 2.168970823287964, "learning_rate": 0.0001464695584516631, "loss": 4.542, "step": 2758 }, { "epoch": 0.8046664236237696, "grad_norm": 1.9900200366973877, "learning_rate": 0.00014645010698307725, "loss": 4.1703, "step": 2759 }, { "epoch": 0.8049580751002552, "grad_norm": 3.5072805881500244, "learning_rate": 0.00014643065551449137, "loss": 4.5716, "step": 2760 }, { "epoch": 0.8052497265767408, "grad_norm": 2.671905755996704, "learning_rate": 0.00014641120404590546, "loss": 4.1766, "step": 2761 }, { "epoch": 0.8055413780532263, "grad_norm": 2.3011648654937744, "learning_rate": 0.00014639175257731958, "loss": 4.3972, "step": 2762 }, { "epoch": 0.805833029529712, "grad_norm": 2.5998213291168213, "learning_rate": 0.00014637230110873373, "loss": 3.8796, "step": 2763 }, { "epoch": 0.8061246810061976, "grad_norm": 2.505222797393799, "learning_rate": 0.00014635284964014783, "loss": 4.3326, "step": 2764 }, { "epoch": 0.8064163324826832, "grad_norm": 2.0253148078918457, "learning_rate": 0.00014633339817156195, "loss": 4.4463, "step": 2765 }, { "epoch": 0.8067079839591688, "grad_norm": 3.186161518096924, "learning_rate": 0.0001463139467029761, "loss": 4.4274, "step": 2766 }, { "epoch": 0.8069996354356543, "grad_norm": 1.6318639516830444, "learning_rate": 0.00014629449523439022, "loss": 4.3352, "step": 2767 }, { "epoch": 0.80729128691214, "grad_norm": 2.51831316947937, "learning_rate": 0.0001462750437658043, "loss": 4.5423, "step": 2768 }, { "epoch": 0.8075829383886256, "grad_norm": 3.1927902698516846, "learning_rate": 0.00014625559229721846, "loss": 4.6236, "step": 2769 }, { "epoch": 0.8078745898651112, "grad_norm": 2.4012365341186523, "learning_rate": 0.00014623614082863258, "loss": 4.4502, "step": 2770 }, { "epoch": 0.8081662413415968, "grad_norm": 2.471726417541504, "learning_rate": 0.00014621668936004668, "loss": 4.6043, "step": 2771 }, { "epoch": 0.8084578928180824, "grad_norm": 2.1522772312164307, "learning_rate": 0.0001461972378914608, "loss": 4.0553, "step": 2772 }, { "epoch": 0.808749544294568, "grad_norm": 1.7482630014419556, "learning_rate": 0.00014617778642287495, "loss": 4.2989, "step": 2773 }, { "epoch": 0.8090411957710536, "grad_norm": 2.2597954273223877, "learning_rate": 0.00014615833495428904, "loss": 4.3729, "step": 2774 }, { "epoch": 0.8093328472475392, "grad_norm": 2.894932508468628, "learning_rate": 0.00014613888348570316, "loss": 4.5238, "step": 2775 }, { "epoch": 0.8096244987240248, "grad_norm": 2.256605386734009, "learning_rate": 0.0001461194320171173, "loss": 4.2969, "step": 2776 }, { "epoch": 0.8099161502005104, "grad_norm": 2.9160401821136475, "learning_rate": 0.00014609998054853143, "loss": 4.2629, "step": 2777 }, { "epoch": 0.810207801676996, "grad_norm": 1.9043325185775757, "learning_rate": 0.00014608052907994553, "loss": 4.4376, "step": 2778 }, { "epoch": 0.8104994531534816, "grad_norm": 1.8067340850830078, "learning_rate": 0.00014606107761135968, "loss": 4.3064, "step": 2779 }, { "epoch": 0.8107911046299672, "grad_norm": 2.6493403911590576, "learning_rate": 0.0001460416261427738, "loss": 4.3255, "step": 2780 }, { "epoch": 0.8110827561064528, "grad_norm": 2.03721022605896, "learning_rate": 0.0001460221746741879, "loss": 4.2973, "step": 2781 }, { "epoch": 0.8113744075829384, "grad_norm": 2.3256824016571045, "learning_rate": 0.00014600272320560202, "loss": 4.4335, "step": 2782 }, { "epoch": 0.811666059059424, "grad_norm": 1.6560168266296387, "learning_rate": 0.00014598327173701616, "loss": 4.2513, "step": 2783 }, { "epoch": 0.8119577105359096, "grad_norm": 2.0325944423675537, "learning_rate": 0.00014596382026843029, "loss": 4.6485, "step": 2784 }, { "epoch": 0.8122493620123952, "grad_norm": 2.7935924530029297, "learning_rate": 0.00014594436879984438, "loss": 4.4897, "step": 2785 }, { "epoch": 0.8125410134888807, "grad_norm": 1.8947951793670654, "learning_rate": 0.00014592491733125853, "loss": 4.4292, "step": 2786 }, { "epoch": 0.8128326649653664, "grad_norm": 2.827157497406006, "learning_rate": 0.00014590546586267265, "loss": 4.4078, "step": 2787 }, { "epoch": 0.813124316441852, "grad_norm": 2.4228687286376953, "learning_rate": 0.00014588601439408674, "loss": 4.4333, "step": 2788 }, { "epoch": 0.8134159679183376, "grad_norm": 2.3034489154815674, "learning_rate": 0.00014586656292550087, "loss": 4.4228, "step": 2789 }, { "epoch": 0.8137076193948232, "grad_norm": 1.810688853263855, "learning_rate": 0.00014584711145691501, "loss": 4.3033, "step": 2790 }, { "epoch": 0.8139992708713087, "grad_norm": 2.4121947288513184, "learning_rate": 0.00014582765998832914, "loss": 4.4707, "step": 2791 }, { "epoch": 0.8142909223477944, "grad_norm": 2.413628101348877, "learning_rate": 0.00014580820851974323, "loss": 4.112, "step": 2792 }, { "epoch": 0.81458257382428, "grad_norm": 2.7288103103637695, "learning_rate": 0.00014578875705115738, "loss": 4.4735, "step": 2793 }, { "epoch": 0.8148742253007656, "grad_norm": 2.194776773452759, "learning_rate": 0.0001457693055825715, "loss": 4.3271, "step": 2794 }, { "epoch": 0.8151658767772512, "grad_norm": 3.272624969482422, "learning_rate": 0.0001457498541139856, "loss": 4.4429, "step": 2795 }, { "epoch": 0.8154575282537367, "grad_norm": 2.1059389114379883, "learning_rate": 0.00014573040264539974, "loss": 4.6723, "step": 2796 }, { "epoch": 0.8157491797302224, "grad_norm": 2.601898670196533, "learning_rate": 0.00014571095117681387, "loss": 4.4379, "step": 2797 }, { "epoch": 0.816040831206708, "grad_norm": 2.2081775665283203, "learning_rate": 0.000145691499708228, "loss": 4.5069, "step": 2798 }, { "epoch": 0.8163324826831936, "grad_norm": 2.9994001388549805, "learning_rate": 0.00014567204823964208, "loss": 4.3823, "step": 2799 }, { "epoch": 0.8166241341596792, "grad_norm": 2.6409640312194824, "learning_rate": 0.00014565259677105623, "loss": 4.4485, "step": 2800 }, { "epoch": 0.8169157856361647, "grad_norm": 4.256936073303223, "learning_rate": 0.00014563314530247035, "loss": 4.4961, "step": 2801 }, { "epoch": 0.8172074371126504, "grad_norm": 2.0296835899353027, "learning_rate": 0.00014561369383388445, "loss": 4.3469, "step": 2802 }, { "epoch": 0.817499088589136, "grad_norm": 2.415555477142334, "learning_rate": 0.0001455942423652986, "loss": 4.3602, "step": 2803 }, { "epoch": 0.8177907400656216, "grad_norm": 3.0131826400756836, "learning_rate": 0.00014557479089671272, "loss": 4.3829, "step": 2804 }, { "epoch": 0.8180823915421072, "grad_norm": 2.886007308959961, "learning_rate": 0.0001455553394281268, "loss": 4.5193, "step": 2805 }, { "epoch": 0.8183740430185927, "grad_norm": 2.9811649322509766, "learning_rate": 0.00014553588795954096, "loss": 4.3989, "step": 2806 }, { "epoch": 0.8186656944950784, "grad_norm": 2.6099650859832764, "learning_rate": 0.00014551643649095508, "loss": 4.6302, "step": 2807 }, { "epoch": 0.818957345971564, "grad_norm": 2.267900228500366, "learning_rate": 0.0001454969850223692, "loss": 4.2331, "step": 2808 }, { "epoch": 0.8192489974480496, "grad_norm": 2.381770372390747, "learning_rate": 0.0001454775335537833, "loss": 4.1826, "step": 2809 }, { "epoch": 0.8195406489245352, "grad_norm": 4.58405065536499, "learning_rate": 0.00014545808208519745, "loss": 4.2557, "step": 2810 }, { "epoch": 0.8198323004010207, "grad_norm": 2.966214895248413, "learning_rate": 0.00014543863061661157, "loss": 4.6611, "step": 2811 }, { "epoch": 0.8201239518775064, "grad_norm": 3.703815221786499, "learning_rate": 0.00014541917914802566, "loss": 4.3853, "step": 2812 }, { "epoch": 0.820415603353992, "grad_norm": 1.9237865209579468, "learning_rate": 0.0001453997276794398, "loss": 4.3427, "step": 2813 }, { "epoch": 0.8207072548304776, "grad_norm": 2.8054046630859375, "learning_rate": 0.00014538027621085393, "loss": 4.3954, "step": 2814 }, { "epoch": 0.8209989063069631, "grad_norm": 2.085038185119629, "learning_rate": 0.00014536082474226805, "loss": 4.4064, "step": 2815 }, { "epoch": 0.8212905577834487, "grad_norm": 2.4939305782318115, "learning_rate": 0.00014534137327368218, "loss": 4.2466, "step": 2816 }, { "epoch": 0.8215822092599344, "grad_norm": 3.131471633911133, "learning_rate": 0.0001453219218050963, "loss": 4.3382, "step": 2817 }, { "epoch": 0.82187386073642, "grad_norm": 2.1726925373077393, "learning_rate": 0.00014530247033651042, "loss": 4.3452, "step": 2818 }, { "epoch": 0.8221655122129056, "grad_norm": 1.7365738153457642, "learning_rate": 0.00014528301886792451, "loss": 4.3325, "step": 2819 }, { "epoch": 0.8224571636893911, "grad_norm": 2.737602472305298, "learning_rate": 0.00014526356739933866, "loss": 4.5434, "step": 2820 }, { "epoch": 0.8227488151658767, "grad_norm": 4.796483993530273, "learning_rate": 0.00014524411593075278, "loss": 4.4445, "step": 2821 }, { "epoch": 0.8230404666423624, "grad_norm": 2.9413390159606934, "learning_rate": 0.0001452246644621669, "loss": 4.5093, "step": 2822 }, { "epoch": 0.823332118118848, "grad_norm": 3.0930917263031006, "learning_rate": 0.00014520521299358103, "loss": 4.5502, "step": 2823 }, { "epoch": 0.8236237695953336, "grad_norm": 2.825031042098999, "learning_rate": 0.00014518576152499515, "loss": 4.3132, "step": 2824 }, { "epoch": 0.8239154210718191, "grad_norm": 2.824826717376709, "learning_rate": 0.00014516631005640927, "loss": 4.2866, "step": 2825 }, { "epoch": 0.8242070725483048, "grad_norm": 2.443206787109375, "learning_rate": 0.0001451468585878234, "loss": 4.5227, "step": 2826 }, { "epoch": 0.8244987240247904, "grad_norm": 2.849616289138794, "learning_rate": 0.00014512740711923751, "loss": 4.2527, "step": 2827 }, { "epoch": 0.824790375501276, "grad_norm": 2.419835329055786, "learning_rate": 0.00014510795565065164, "loss": 4.4345, "step": 2828 }, { "epoch": 0.8250820269777616, "grad_norm": 3.0325570106506348, "learning_rate": 0.00014508850418206573, "loss": 4.655, "step": 2829 }, { "epoch": 0.8253736784542471, "grad_norm": 2.057835817337036, "learning_rate": 0.00014506905271347988, "loss": 4.2578, "step": 2830 }, { "epoch": 0.8256653299307328, "grad_norm": 2.428495168685913, "learning_rate": 0.000145049601244894, "loss": 4.6549, "step": 2831 }, { "epoch": 0.8259569814072184, "grad_norm": 5.111625671386719, "learning_rate": 0.00014503014977630812, "loss": 4.5144, "step": 2832 }, { "epoch": 0.826248632883704, "grad_norm": 2.551715850830078, "learning_rate": 0.00014501069830772224, "loss": 4.6005, "step": 2833 }, { "epoch": 0.8265402843601896, "grad_norm": 2.607114315032959, "learning_rate": 0.00014499124683913637, "loss": 4.3482, "step": 2834 }, { "epoch": 0.8268319358366751, "grad_norm": 2.2505087852478027, "learning_rate": 0.0001449717953705505, "loss": 4.2013, "step": 2835 }, { "epoch": 0.8271235873131608, "grad_norm": 2.528418779373169, "learning_rate": 0.0001449523439019646, "loss": 4.2533, "step": 2836 }, { "epoch": 0.8274152387896464, "grad_norm": 3.1723556518554688, "learning_rate": 0.00014493289243337873, "loss": 4.1776, "step": 2837 }, { "epoch": 0.827706890266132, "grad_norm": 3.9704477787017822, "learning_rate": 0.00014491344096479285, "loss": 4.5174, "step": 2838 }, { "epoch": 0.8279985417426176, "grad_norm": 2.2245469093322754, "learning_rate": 0.00014489398949620697, "loss": 4.3135, "step": 2839 }, { "epoch": 0.8282901932191031, "grad_norm": 3.054760456085205, "learning_rate": 0.0001448745380276211, "loss": 4.3625, "step": 2840 }, { "epoch": 0.8285818446955888, "grad_norm": 4.408637523651123, "learning_rate": 0.00014485508655903522, "loss": 4.0964, "step": 2841 }, { "epoch": 0.8288734961720744, "grad_norm": 3.310960292816162, "learning_rate": 0.00014483563509044934, "loss": 4.3938, "step": 2842 }, { "epoch": 0.82916514764856, "grad_norm": 3.0085151195526123, "learning_rate": 0.00014481618362186346, "loss": 4.2845, "step": 2843 }, { "epoch": 0.8294567991250456, "grad_norm": 1.9595386981964111, "learning_rate": 0.00014479673215327758, "loss": 4.1909, "step": 2844 }, { "epoch": 0.8297484506015311, "grad_norm": 2.0461153984069824, "learning_rate": 0.0001447772806846917, "loss": 4.5682, "step": 2845 }, { "epoch": 0.8300401020780168, "grad_norm": 2.9187827110290527, "learning_rate": 0.00014475782921610582, "loss": 4.3852, "step": 2846 }, { "epoch": 0.8303317535545024, "grad_norm": 3.0962259769439697, "learning_rate": 0.00014473837774751995, "loss": 4.2529, "step": 2847 }, { "epoch": 0.830623405030988, "grad_norm": 3.538548231124878, "learning_rate": 0.00014471892627893407, "loss": 4.5987, "step": 2848 }, { "epoch": 0.8309150565074735, "grad_norm": 2.4735093116760254, "learning_rate": 0.0001446994748103482, "loss": 4.2792, "step": 2849 }, { "epoch": 0.8312067079839591, "grad_norm": 3.9951980113983154, "learning_rate": 0.0001446800233417623, "loss": 4.1537, "step": 2850 }, { "epoch": 0.8314983594604448, "grad_norm": 3.036729335784912, "learning_rate": 0.00014466057187317643, "loss": 4.4777, "step": 2851 }, { "epoch": 0.8317900109369304, "grad_norm": 2.6479032039642334, "learning_rate": 0.00014464112040459055, "loss": 4.8889, "step": 2852 }, { "epoch": 0.832081662413416, "grad_norm": 2.5582168102264404, "learning_rate": 0.00014462166893600468, "loss": 4.4001, "step": 2853 }, { "epoch": 0.8323733138899015, "grad_norm": 2.61088490486145, "learning_rate": 0.0001446022174674188, "loss": 4.0517, "step": 2854 }, { "epoch": 0.8326649653663871, "grad_norm": 2.4445199966430664, "learning_rate": 0.00014458276599883292, "loss": 4.2264, "step": 2855 }, { "epoch": 0.8329566168428728, "grad_norm": 2.771050453186035, "learning_rate": 0.00014456331453024704, "loss": 4.3326, "step": 2856 }, { "epoch": 0.8332482683193584, "grad_norm": 1.9264373779296875, "learning_rate": 0.00014454386306166116, "loss": 4.5753, "step": 2857 }, { "epoch": 0.833539919795844, "grad_norm": 1.8514906167984009, "learning_rate": 0.00014452441159307528, "loss": 4.3562, "step": 2858 }, { "epoch": 0.8338315712723295, "grad_norm": 2.8099184036254883, "learning_rate": 0.0001445049601244894, "loss": 4.4106, "step": 2859 }, { "epoch": 0.8341232227488151, "grad_norm": 2.880786180496216, "learning_rate": 0.00014448550865590353, "loss": 4.2701, "step": 2860 }, { "epoch": 0.8344148742253008, "grad_norm": 3.7820637226104736, "learning_rate": 0.00014446605718731765, "loss": 3.7962, "step": 2861 }, { "epoch": 0.8347065257017864, "grad_norm": 2.667553186416626, "learning_rate": 0.00014444660571873177, "loss": 4.3796, "step": 2862 }, { "epoch": 0.834998177178272, "grad_norm": 2.8013381958007812, "learning_rate": 0.0001444271542501459, "loss": 4.4306, "step": 2863 }, { "epoch": 0.8352898286547575, "grad_norm": 2.533017158508301, "learning_rate": 0.00014440770278156, "loss": 4.3215, "step": 2864 }, { "epoch": 0.8355814801312431, "grad_norm": 2.1730117797851562, "learning_rate": 0.00014438825131297413, "loss": 4.419, "step": 2865 }, { "epoch": 0.8358731316077288, "grad_norm": 1.9848321676254272, "learning_rate": 0.00014436879984438826, "loss": 4.4088, "step": 2866 }, { "epoch": 0.8361647830842144, "grad_norm": 2.5379042625427246, "learning_rate": 0.00014434934837580238, "loss": 4.2866, "step": 2867 }, { "epoch": 0.8364564345607, "grad_norm": 2.3836615085601807, "learning_rate": 0.0001443298969072165, "loss": 4.4785, "step": 2868 }, { "epoch": 0.8367480860371855, "grad_norm": 3.6563405990600586, "learning_rate": 0.00014431044543863062, "loss": 4.4108, "step": 2869 }, { "epoch": 0.8370397375136711, "grad_norm": 2.829211950302124, "learning_rate": 0.00014429099397004474, "loss": 3.982, "step": 2870 }, { "epoch": 0.8373313889901568, "grad_norm": 2.2640626430511475, "learning_rate": 0.00014427154250145886, "loss": 4.4509, "step": 2871 }, { "epoch": 0.8376230404666424, "grad_norm": 2.267549991607666, "learning_rate": 0.00014425209103287299, "loss": 4.5105, "step": 2872 }, { "epoch": 0.837914691943128, "grad_norm": 4.075018882751465, "learning_rate": 0.0001442326395642871, "loss": 4.2288, "step": 2873 }, { "epoch": 0.8382063434196135, "grad_norm": 2.1443045139312744, "learning_rate": 0.00014421318809570123, "loss": 4.3271, "step": 2874 }, { "epoch": 0.8384979948960991, "grad_norm": 2.529684543609619, "learning_rate": 0.00014419373662711535, "loss": 4.5067, "step": 2875 }, { "epoch": 0.8387896463725848, "grad_norm": 1.733999252319336, "learning_rate": 0.00014417428515852947, "loss": 4.2782, "step": 2876 }, { "epoch": 0.8390812978490704, "grad_norm": 3.154710292816162, "learning_rate": 0.0001441548336899436, "loss": 4.2928, "step": 2877 }, { "epoch": 0.839372949325556, "grad_norm": 2.848097562789917, "learning_rate": 0.00014413538222135772, "loss": 4.5233, "step": 2878 }, { "epoch": 0.8396646008020415, "grad_norm": 2.2519195079803467, "learning_rate": 0.00014411593075277184, "loss": 4.5908, "step": 2879 }, { "epoch": 0.8399562522785271, "grad_norm": 1.9886702299118042, "learning_rate": 0.00014409647928418596, "loss": 4.2382, "step": 2880 }, { "epoch": 0.8402479037550128, "grad_norm": 2.335707664489746, "learning_rate": 0.00014407702781560008, "loss": 4.2426, "step": 2881 }, { "epoch": 0.8405395552314984, "grad_norm": 1.8174400329589844, "learning_rate": 0.0001440575763470142, "loss": 4.386, "step": 2882 }, { "epoch": 0.840831206707984, "grad_norm": 2.3922557830810547, "learning_rate": 0.00014403812487842835, "loss": 4.4829, "step": 2883 }, { "epoch": 0.8411228581844695, "grad_norm": 2.4659645557403564, "learning_rate": 0.00014401867340984244, "loss": 4.2493, "step": 2884 }, { "epoch": 0.8414145096609552, "grad_norm": 3.324526071548462, "learning_rate": 0.00014399922194125657, "loss": 4.4563, "step": 2885 }, { "epoch": 0.8417061611374408, "grad_norm": 2.505232572555542, "learning_rate": 0.0001439797704726707, "loss": 4.5723, "step": 2886 }, { "epoch": 0.8419978126139264, "grad_norm": 2.1598939895629883, "learning_rate": 0.0001439603190040848, "loss": 4.2827, "step": 2887 }, { "epoch": 0.8422894640904119, "grad_norm": 1.8859760761260986, "learning_rate": 0.00014394086753549893, "loss": 4.1291, "step": 2888 }, { "epoch": 0.8425811155668975, "grad_norm": 2.054196834564209, "learning_rate": 0.00014392141606691305, "loss": 4.3642, "step": 2889 }, { "epoch": 0.8428727670433832, "grad_norm": 2.140888214111328, "learning_rate": 0.0001439019645983272, "loss": 4.3878, "step": 2890 }, { "epoch": 0.8431644185198688, "grad_norm": 2.3467249870300293, "learning_rate": 0.0001438825131297413, "loss": 4.226, "step": 2891 }, { "epoch": 0.8434560699963544, "grad_norm": 1.9710173606872559, "learning_rate": 0.00014386306166115542, "loss": 4.4773, "step": 2892 }, { "epoch": 0.8437477214728399, "grad_norm": 2.6153619289398193, "learning_rate": 0.00014384361019256957, "loss": 4.3358, "step": 2893 }, { "epoch": 0.8440393729493255, "grad_norm": 2.2861287593841553, "learning_rate": 0.00014382415872398366, "loss": 4.3673, "step": 2894 }, { "epoch": 0.8443310244258112, "grad_norm": 2.3595383167266846, "learning_rate": 0.00014380470725539778, "loss": 4.5554, "step": 2895 }, { "epoch": 0.8446226759022968, "grad_norm": 2.11337947845459, "learning_rate": 0.0001437852557868119, "loss": 4.4642, "step": 2896 }, { "epoch": 0.8449143273787824, "grad_norm": 1.9576853513717651, "learning_rate": 0.00014376580431822603, "loss": 4.2179, "step": 2897 }, { "epoch": 0.8452059788552679, "grad_norm": 1.942091941833496, "learning_rate": 0.00014374635284964015, "loss": 4.1832, "step": 2898 }, { "epoch": 0.8454976303317535, "grad_norm": 1.8414256572723389, "learning_rate": 0.00014372690138105427, "loss": 4.4217, "step": 2899 }, { "epoch": 0.8457892818082392, "grad_norm": 1.7646925449371338, "learning_rate": 0.00014370744991246842, "loss": 4.4735, "step": 2900 }, { "epoch": 0.8460809332847248, "grad_norm": 2.854412078857422, "learning_rate": 0.0001436879984438825, "loss": 4.6555, "step": 2901 }, { "epoch": 0.8463725847612104, "grad_norm": 2.6805260181427, "learning_rate": 0.00014366854697529663, "loss": 4.321, "step": 2902 }, { "epoch": 0.8466642362376959, "grad_norm": 3.017512083053589, "learning_rate": 0.00014364909550671078, "loss": 4.4324, "step": 2903 }, { "epoch": 0.8469558877141815, "grad_norm": 2.5616953372955322, "learning_rate": 0.00014362964403812488, "loss": 3.9639, "step": 2904 }, { "epoch": 0.8472475391906672, "grad_norm": 2.037513256072998, "learning_rate": 0.000143610192569539, "loss": 4.3974, "step": 2905 }, { "epoch": 0.8475391906671528, "grad_norm": 2.5363595485687256, "learning_rate": 0.00014359074110095312, "loss": 4.3637, "step": 2906 }, { "epoch": 0.8478308421436384, "grad_norm": 4.265613555908203, "learning_rate": 0.00014357128963236727, "loss": 4.171, "step": 2907 }, { "epoch": 0.8481224936201239, "grad_norm": 2.9961752891540527, "learning_rate": 0.00014355183816378136, "loss": 4.5965, "step": 2908 }, { "epoch": 0.8484141450966095, "grad_norm": 2.582597494125366, "learning_rate": 0.00014353238669519548, "loss": 4.3869, "step": 2909 }, { "epoch": 0.8487057965730952, "grad_norm": 1.898775577545166, "learning_rate": 0.00014351293522660963, "loss": 4.3239, "step": 2910 }, { "epoch": 0.8489974480495808, "grad_norm": 2.568027973175049, "learning_rate": 0.00014349348375802373, "loss": 4.2947, "step": 2911 }, { "epoch": 0.8492890995260663, "grad_norm": 2.2395429611206055, "learning_rate": 0.00014347403228943785, "loss": 4.3538, "step": 2912 }, { "epoch": 0.8495807510025519, "grad_norm": 1.9169821739196777, "learning_rate": 0.000143454580820852, "loss": 4.5736, "step": 2913 }, { "epoch": 0.8498724024790375, "grad_norm": 2.6294825077056885, "learning_rate": 0.00014343512935226612, "loss": 4.2187, "step": 2914 }, { "epoch": 0.8501640539555232, "grad_norm": 2.688359260559082, "learning_rate": 0.00014341567788368021, "loss": 4.351, "step": 2915 }, { "epoch": 0.8504557054320088, "grad_norm": 2.145620107650757, "learning_rate": 0.00014339622641509434, "loss": 4.4225, "step": 2916 }, { "epoch": 0.8507473569084943, "grad_norm": 2.1988158226013184, "learning_rate": 0.00014337677494650848, "loss": 4.4838, "step": 2917 }, { "epoch": 0.8510390083849799, "grad_norm": 2.5381674766540527, "learning_rate": 0.00014335732347792258, "loss": 4.4884, "step": 2918 }, { "epoch": 0.8513306598614655, "grad_norm": 1.891339898109436, "learning_rate": 0.0001433378720093367, "loss": 4.3329, "step": 2919 }, { "epoch": 0.8516223113379512, "grad_norm": 2.2709081172943115, "learning_rate": 0.00014331842054075085, "loss": 4.4059, "step": 2920 }, { "epoch": 0.8519139628144368, "grad_norm": 2.2944536209106445, "learning_rate": 0.00014329896907216494, "loss": 4.3072, "step": 2921 }, { "epoch": 0.8522056142909223, "grad_norm": 2.1849799156188965, "learning_rate": 0.00014327951760357907, "loss": 4.4778, "step": 2922 }, { "epoch": 0.8524972657674079, "grad_norm": 2.0941579341888428, "learning_rate": 0.00014326006613499321, "loss": 4.5435, "step": 2923 }, { "epoch": 0.8527889172438935, "grad_norm": 2.511564016342163, "learning_rate": 0.00014324061466640734, "loss": 4.5224, "step": 2924 }, { "epoch": 0.8530805687203792, "grad_norm": 2.126504898071289, "learning_rate": 0.00014322116319782143, "loss": 4.1835, "step": 2925 }, { "epoch": 0.8533722201968648, "grad_norm": 2.104104518890381, "learning_rate": 0.00014320171172923555, "loss": 4.6059, "step": 2926 }, { "epoch": 0.8536638716733503, "grad_norm": 1.8433274030685425, "learning_rate": 0.0001431822602606497, "loss": 4.6447, "step": 2927 }, { "epoch": 0.8539555231498359, "grad_norm": 1.8213917016983032, "learning_rate": 0.0001431628087920638, "loss": 4.4877, "step": 2928 }, { "epoch": 0.8542471746263215, "grad_norm": 1.913620114326477, "learning_rate": 0.00014314335732347792, "loss": 4.4875, "step": 2929 }, { "epoch": 0.8545388261028072, "grad_norm": 1.9545027017593384, "learning_rate": 0.00014312390585489207, "loss": 4.3744, "step": 2930 }, { "epoch": 0.8548304775792928, "grad_norm": 2.2180066108703613, "learning_rate": 0.0001431044543863062, "loss": 4.6024, "step": 2931 }, { "epoch": 0.8551221290557783, "grad_norm": 2.130373954772949, "learning_rate": 0.00014308500291772028, "loss": 4.0906, "step": 2932 }, { "epoch": 0.8554137805322639, "grad_norm": 2.942690134048462, "learning_rate": 0.00014306555144913443, "loss": 4.5371, "step": 2933 }, { "epoch": 0.8557054320087495, "grad_norm": 2.802647352218628, "learning_rate": 0.00014304609998054855, "loss": 4.4918, "step": 2934 }, { "epoch": 0.8559970834852352, "grad_norm": 2.9518961906433105, "learning_rate": 0.00014302664851196265, "loss": 4.5032, "step": 2935 }, { "epoch": 0.8562887349617208, "grad_norm": 2.886181592941284, "learning_rate": 0.00014300719704337677, "loss": 4.3691, "step": 2936 }, { "epoch": 0.8565803864382063, "grad_norm": 2.1142947673797607, "learning_rate": 0.00014298774557479092, "loss": 4.2087, "step": 2937 }, { "epoch": 0.8568720379146919, "grad_norm": 2.6500797271728516, "learning_rate": 0.00014296829410620504, "loss": 4.3278, "step": 2938 }, { "epoch": 0.8571636893911776, "grad_norm": 2.7210159301757812, "learning_rate": 0.00014294884263761913, "loss": 4.3491, "step": 2939 }, { "epoch": 0.8574553408676632, "grad_norm": 2.880223512649536, "learning_rate": 0.00014292939116903328, "loss": 4.1209, "step": 2940 }, { "epoch": 0.8577469923441488, "grad_norm": 1.9834539890289307, "learning_rate": 0.0001429099397004474, "loss": 4.1789, "step": 2941 }, { "epoch": 0.8580386438206343, "grad_norm": 2.329202890396118, "learning_rate": 0.0001428904882318615, "loss": 4.6045, "step": 2942 }, { "epoch": 0.8583302952971199, "grad_norm": 2.9506876468658447, "learning_rate": 0.00014287103676327565, "loss": 4.3924, "step": 2943 }, { "epoch": 0.8586219467736056, "grad_norm": 2.2026143074035645, "learning_rate": 0.00014285158529468977, "loss": 4.1638, "step": 2944 }, { "epoch": 0.8589135982500912, "grad_norm": 2.346558094024658, "learning_rate": 0.00014283213382610386, "loss": 4.294, "step": 2945 }, { "epoch": 0.8592052497265767, "grad_norm": 2.2414581775665283, "learning_rate": 0.00014281268235751798, "loss": 4.2659, "step": 2946 }, { "epoch": 0.8594969012030623, "grad_norm": 2.088587999343872, "learning_rate": 0.00014279323088893213, "loss": 4.1744, "step": 2947 }, { "epoch": 0.8597885526795479, "grad_norm": 2.0157651901245117, "learning_rate": 0.00014277377942034625, "loss": 4.4379, "step": 2948 }, { "epoch": 0.8600802041560336, "grad_norm": 1.9878125190734863, "learning_rate": 0.00014275432795176035, "loss": 4.3317, "step": 2949 }, { "epoch": 0.8603718556325192, "grad_norm": 1.6565971374511719, "learning_rate": 0.0001427348764831745, "loss": 4.4316, "step": 2950 }, { "epoch": 0.8606635071090047, "grad_norm": 2.8232269287109375, "learning_rate": 0.00014271542501458862, "loss": 4.4294, "step": 2951 }, { "epoch": 0.8609551585854903, "grad_norm": 3.1354682445526123, "learning_rate": 0.0001426959735460027, "loss": 4.2974, "step": 2952 }, { "epoch": 0.8612468100619759, "grad_norm": 5.498987197875977, "learning_rate": 0.00014267652207741683, "loss": 4.1459, "step": 2953 }, { "epoch": 0.8615384615384616, "grad_norm": 2.30434250831604, "learning_rate": 0.00014265707060883098, "loss": 4.3331, "step": 2954 }, { "epoch": 0.8618301130149472, "grad_norm": 2.765925645828247, "learning_rate": 0.0001426376191402451, "loss": 4.1693, "step": 2955 }, { "epoch": 0.8621217644914327, "grad_norm": 1.9108996391296387, "learning_rate": 0.0001426181676716592, "loss": 4.4398, "step": 2956 }, { "epoch": 0.8624134159679183, "grad_norm": 2.4449615478515625, "learning_rate": 0.00014259871620307335, "loss": 4.5506, "step": 2957 }, { "epoch": 0.8627050674444039, "grad_norm": 2.185229539871216, "learning_rate": 0.00014257926473448747, "loss": 4.2229, "step": 2958 }, { "epoch": 0.8629967189208896, "grad_norm": 2.569694995880127, "learning_rate": 0.00014255981326590156, "loss": 4.3525, "step": 2959 }, { "epoch": 0.8632883703973752, "grad_norm": 1.9553889036178589, "learning_rate": 0.0001425403617973157, "loss": 4.1852, "step": 2960 }, { "epoch": 0.8635800218738607, "grad_norm": 2.275172710418701, "learning_rate": 0.00014252091032872983, "loss": 4.2777, "step": 2961 }, { "epoch": 0.8638716733503463, "grad_norm": 4.0084967613220215, "learning_rate": 0.00014250145886014396, "loss": 4.0978, "step": 2962 }, { "epoch": 0.8641633248268319, "grad_norm": 3.533637285232544, "learning_rate": 0.00014248200739155805, "loss": 4.614, "step": 2963 }, { "epoch": 0.8644549763033176, "grad_norm": 2.038362503051758, "learning_rate": 0.0001424625559229722, "loss": 4.3042, "step": 2964 }, { "epoch": 0.8647466277798032, "grad_norm": 2.228748321533203, "learning_rate": 0.00014244310445438632, "loss": 4.3539, "step": 2965 }, { "epoch": 0.8650382792562887, "grad_norm": 2.147510051727295, "learning_rate": 0.00014242365298580042, "loss": 4.2147, "step": 2966 }, { "epoch": 0.8653299307327743, "grad_norm": 2.322099447250366, "learning_rate": 0.00014240420151721456, "loss": 4.2923, "step": 2967 }, { "epoch": 0.8656215822092599, "grad_norm": 2.3988325595855713, "learning_rate": 0.00014238475004862869, "loss": 4.4809, "step": 2968 }, { "epoch": 0.8659132336857456, "grad_norm": 2.0443170070648193, "learning_rate": 0.0001423652985800428, "loss": 4.4431, "step": 2969 }, { "epoch": 0.8662048851622312, "grad_norm": 2.670214891433716, "learning_rate": 0.00014234584711145693, "loss": 4.2518, "step": 2970 }, { "epoch": 0.8664965366387167, "grad_norm": 2.2525758743286133, "learning_rate": 0.00014232639564287105, "loss": 4.6427, "step": 2971 }, { "epoch": 0.8667881881152023, "grad_norm": 2.36582088470459, "learning_rate": 0.00014230694417428517, "loss": 4.3419, "step": 2972 }, { "epoch": 0.8670798395916879, "grad_norm": 3.2335729598999023, "learning_rate": 0.00014228749270569927, "loss": 4.5046, "step": 2973 }, { "epoch": 0.8673714910681736, "grad_norm": 2.177899122238159, "learning_rate": 0.00014226804123711342, "loss": 4.3658, "step": 2974 }, { "epoch": 0.8676631425446591, "grad_norm": 1.9828239679336548, "learning_rate": 0.00014224858976852754, "loss": 4.1699, "step": 2975 }, { "epoch": 0.8679547940211447, "grad_norm": 2.633096218109131, "learning_rate": 0.00014222913829994163, "loss": 4.5673, "step": 2976 }, { "epoch": 0.8682464454976303, "grad_norm": 2.1967766284942627, "learning_rate": 0.00014220968683135578, "loss": 4.2636, "step": 2977 }, { "epoch": 0.8685380969741159, "grad_norm": 2.2016453742980957, "learning_rate": 0.0001421902353627699, "loss": 4.3277, "step": 2978 }, { "epoch": 0.8688297484506016, "grad_norm": 2.5478742122650146, "learning_rate": 0.00014217078389418402, "loss": 4.4395, "step": 2979 }, { "epoch": 0.8691213999270871, "grad_norm": 2.5450685024261475, "learning_rate": 0.00014215133242559814, "loss": 4.473, "step": 2980 }, { "epoch": 0.8694130514035727, "grad_norm": 2.6333260536193848, "learning_rate": 0.00014213188095701227, "loss": 4.2073, "step": 2981 }, { "epoch": 0.8697047028800583, "grad_norm": 2.4105236530303955, "learning_rate": 0.0001421124294884264, "loss": 4.4378, "step": 2982 }, { "epoch": 0.8699963543565439, "grad_norm": 2.2663581371307373, "learning_rate": 0.00014209297801984048, "loss": 4.3927, "step": 2983 }, { "epoch": 0.8702880058330296, "grad_norm": 2.155118942260742, "learning_rate": 0.00014207352655125463, "loss": 4.3567, "step": 2984 }, { "epoch": 0.8705796573095151, "grad_norm": 2.071868658065796, "learning_rate": 0.00014205407508266875, "loss": 4.4991, "step": 2985 }, { "epoch": 0.8708713087860007, "grad_norm": 2.54472279548645, "learning_rate": 0.00014203462361408287, "loss": 4.4466, "step": 2986 }, { "epoch": 0.8711629602624863, "grad_norm": 3.039722204208374, "learning_rate": 0.000142015172145497, "loss": 4.5033, "step": 2987 }, { "epoch": 0.8714546117389719, "grad_norm": 1.8364319801330566, "learning_rate": 0.00014199572067691112, "loss": 4.2534, "step": 2988 }, { "epoch": 0.8717462632154576, "grad_norm": 2.8135013580322266, "learning_rate": 0.00014197626920832524, "loss": 4.6627, "step": 2989 }, { "epoch": 0.8720379146919431, "grad_norm": 1.9398304224014282, "learning_rate": 0.00014195681773973936, "loss": 4.5682, "step": 2990 }, { "epoch": 0.8723295661684287, "grad_norm": 2.199265480041504, "learning_rate": 0.00014193736627115348, "loss": 4.0072, "step": 2991 }, { "epoch": 0.8726212176449143, "grad_norm": 3.219088315963745, "learning_rate": 0.0001419179148025676, "loss": 4.3394, "step": 2992 }, { "epoch": 0.8729128691214, "grad_norm": 2.7596588134765625, "learning_rate": 0.00014189846333398173, "loss": 4.144, "step": 2993 }, { "epoch": 0.8732045205978856, "grad_norm": 2.5671017169952393, "learning_rate": 0.00014187901186539585, "loss": 4.5623, "step": 2994 }, { "epoch": 0.8734961720743711, "grad_norm": 2.2359912395477295, "learning_rate": 0.00014185956039680997, "loss": 4.6037, "step": 2995 }, { "epoch": 0.8737878235508567, "grad_norm": 1.9791306257247925, "learning_rate": 0.0001418401089282241, "loss": 4.4906, "step": 2996 }, { "epoch": 0.8740794750273423, "grad_norm": 2.5341360569000244, "learning_rate": 0.0001418206574596382, "loss": 4.4344, "step": 2997 }, { "epoch": 0.874371126503828, "grad_norm": 2.6497833728790283, "learning_rate": 0.00014180120599105233, "loss": 4.1606, "step": 2998 }, { "epoch": 0.8746627779803136, "grad_norm": 2.851327657699585, "learning_rate": 0.00014178175452246646, "loss": 4.3456, "step": 2999 }, { "epoch": 0.8749544294567991, "grad_norm": 2.43473482131958, "learning_rate": 0.00014176230305388058, "loss": 4.3794, "step": 3000 }, { "epoch": 0.8752460809332847, "grad_norm": 2.8667218685150146, "learning_rate": 0.0001417428515852947, "loss": 4.4594, "step": 3001 }, { "epoch": 0.8755377324097703, "grad_norm": 2.319885015487671, "learning_rate": 0.00014172340011670882, "loss": 4.4049, "step": 3002 }, { "epoch": 0.875829383886256, "grad_norm": 2.323871612548828, "learning_rate": 0.00014170394864812294, "loss": 4.1614, "step": 3003 }, { "epoch": 0.8761210353627416, "grad_norm": 2.904881715774536, "learning_rate": 0.00014168449717953706, "loss": 4.3925, "step": 3004 }, { "epoch": 0.8764126868392271, "grad_norm": 2.6313953399658203, "learning_rate": 0.00014166504571095118, "loss": 4.426, "step": 3005 }, { "epoch": 0.8767043383157127, "grad_norm": 2.702758312225342, "learning_rate": 0.0001416455942423653, "loss": 4.365, "step": 3006 }, { "epoch": 0.8769959897921983, "grad_norm": 2.2184581756591797, "learning_rate": 0.00014162614277377943, "loss": 4.4859, "step": 3007 }, { "epoch": 0.877287641268684, "grad_norm": 3.2936694622039795, "learning_rate": 0.00014160669130519355, "loss": 4.5083, "step": 3008 }, { "epoch": 0.8775792927451695, "grad_norm": 3.0956554412841797, "learning_rate": 0.00014158723983660767, "loss": 4.2443, "step": 3009 }, { "epoch": 0.8778709442216551, "grad_norm": 2.2798445224761963, "learning_rate": 0.0001415677883680218, "loss": 4.3913, "step": 3010 }, { "epoch": 0.8781625956981407, "grad_norm": 2.058321714401245, "learning_rate": 0.00014154833689943591, "loss": 4.2876, "step": 3011 }, { "epoch": 0.8784542471746263, "grad_norm": 1.9158469438552856, "learning_rate": 0.00014152888543085004, "loss": 4.4451, "step": 3012 }, { "epoch": 0.878745898651112, "grad_norm": 1.725974202156067, "learning_rate": 0.00014150943396226416, "loss": 4.2038, "step": 3013 }, { "epoch": 0.8790375501275975, "grad_norm": 3.4189414978027344, "learning_rate": 0.00014148998249367828, "loss": 4.3852, "step": 3014 }, { "epoch": 0.8793292016040831, "grad_norm": 2.522785186767578, "learning_rate": 0.0001414705310250924, "loss": 4.4982, "step": 3015 }, { "epoch": 0.8796208530805687, "grad_norm": 2.748300075531006, "learning_rate": 0.00014145107955650652, "loss": 4.2242, "step": 3016 }, { "epoch": 0.8799125045570543, "grad_norm": 2.2179620265960693, "learning_rate": 0.00014143162808792064, "loss": 4.4724, "step": 3017 }, { "epoch": 0.88020415603354, "grad_norm": 1.840065836906433, "learning_rate": 0.00014141217661933477, "loss": 4.4271, "step": 3018 }, { "epoch": 0.8804958075100255, "grad_norm": 1.9887217283248901, "learning_rate": 0.0001413927251507489, "loss": 4.1579, "step": 3019 }, { "epoch": 0.8807874589865111, "grad_norm": 3.062080144882202, "learning_rate": 0.000141373273682163, "loss": 4.1609, "step": 3020 }, { "epoch": 0.8810791104629967, "grad_norm": 2.1547963619232178, "learning_rate": 0.00014135382221357713, "loss": 4.3944, "step": 3021 }, { "epoch": 0.8813707619394823, "grad_norm": 3.1556358337402344, "learning_rate": 0.00014133437074499125, "loss": 4.4694, "step": 3022 }, { "epoch": 0.881662413415968, "grad_norm": 2.253401756286621, "learning_rate": 0.00014131491927640537, "loss": 4.4115, "step": 3023 }, { "epoch": 0.8819540648924535, "grad_norm": 2.678114652633667, "learning_rate": 0.0001412954678078195, "loss": 4.5957, "step": 3024 }, { "epoch": 0.8822457163689391, "grad_norm": 2.3688976764678955, "learning_rate": 0.00014127601633923362, "loss": 4.0698, "step": 3025 }, { "epoch": 0.8825373678454247, "grad_norm": 2.4964921474456787, "learning_rate": 0.00014125656487064774, "loss": 4.2162, "step": 3026 }, { "epoch": 0.8828290193219103, "grad_norm": 2.8402698040008545, "learning_rate": 0.00014123711340206186, "loss": 4.5258, "step": 3027 }, { "epoch": 0.883120670798396, "grad_norm": 3.3088412284851074, "learning_rate": 0.00014121766193347598, "loss": 4.1209, "step": 3028 }, { "epoch": 0.8834123222748815, "grad_norm": 3.2391834259033203, "learning_rate": 0.0001411982104648901, "loss": 4.2722, "step": 3029 }, { "epoch": 0.8837039737513671, "grad_norm": 1.917966604232788, "learning_rate": 0.00014117875899630422, "loss": 4.3005, "step": 3030 }, { "epoch": 0.8839956252278527, "grad_norm": 2.407363176345825, "learning_rate": 0.00014115930752771835, "loss": 4.4887, "step": 3031 }, { "epoch": 0.8842872767043383, "grad_norm": 2.1290500164031982, "learning_rate": 0.00014113985605913247, "loss": 4.4476, "step": 3032 }, { "epoch": 0.884578928180824, "grad_norm": 2.572460174560547, "learning_rate": 0.0001411204045905466, "loss": 4.3261, "step": 3033 }, { "epoch": 0.8848705796573095, "grad_norm": 2.1555895805358887, "learning_rate": 0.0001411009531219607, "loss": 4.6022, "step": 3034 }, { "epoch": 0.8851622311337951, "grad_norm": 2.8904969692230225, "learning_rate": 0.00014108150165337483, "loss": 4.1647, "step": 3035 }, { "epoch": 0.8854538826102807, "grad_norm": 2.2067272663116455, "learning_rate": 0.00014106205018478895, "loss": 4.16, "step": 3036 }, { "epoch": 0.8857455340867663, "grad_norm": 2.52604603767395, "learning_rate": 0.0001410425987162031, "loss": 4.3132, "step": 3037 }, { "epoch": 0.886037185563252, "grad_norm": 2.0624024868011475, "learning_rate": 0.0001410231472476172, "loss": 4.178, "step": 3038 }, { "epoch": 0.8863288370397375, "grad_norm": 2.353111505508423, "learning_rate": 0.00014100369577903132, "loss": 4.5056, "step": 3039 }, { "epoch": 0.8866204885162231, "grad_norm": 1.9162837266921997, "learning_rate": 0.00014098424431044544, "loss": 4.4422, "step": 3040 }, { "epoch": 0.8869121399927087, "grad_norm": 2.1016733646392822, "learning_rate": 0.00014096479284185956, "loss": 4.3456, "step": 3041 }, { "epoch": 0.8872037914691943, "grad_norm": 2.0147705078125, "learning_rate": 0.00014094534137327368, "loss": 4.0889, "step": 3042 }, { "epoch": 0.88749544294568, "grad_norm": 2.1034247875213623, "learning_rate": 0.0001409258899046878, "loss": 4.3259, "step": 3043 }, { "epoch": 0.8877870944221655, "grad_norm": 1.9712333679199219, "learning_rate": 0.00014090643843610193, "loss": 4.5924, "step": 3044 }, { "epoch": 0.8880787458986511, "grad_norm": 2.1872785091400146, "learning_rate": 0.00014088698696751605, "loss": 4.2678, "step": 3045 }, { "epoch": 0.8883703973751367, "grad_norm": 2.598259925842285, "learning_rate": 0.00014086753549893017, "loss": 4.3912, "step": 3046 }, { "epoch": 0.8886620488516224, "grad_norm": 2.0687386989593506, "learning_rate": 0.00014084808403034432, "loss": 4.4419, "step": 3047 }, { "epoch": 0.8889537003281079, "grad_norm": 3.078871488571167, "learning_rate": 0.0001408286325617584, "loss": 4.6153, "step": 3048 }, { "epoch": 0.8892453518045935, "grad_norm": 1.8331000804901123, "learning_rate": 0.00014080918109317253, "loss": 4.4451, "step": 3049 }, { "epoch": 0.8895370032810791, "grad_norm": 2.4548323154449463, "learning_rate": 0.00014078972962458666, "loss": 4.1355, "step": 3050 }, { "epoch": 0.8898286547575647, "grad_norm": 2.4221482276916504, "learning_rate": 0.00014077027815600078, "loss": 4.2323, "step": 3051 }, { "epoch": 0.8901203062340504, "grad_norm": 2.5374248027801514, "learning_rate": 0.0001407508266874149, "loss": 3.8963, "step": 3052 }, { "epoch": 0.8904119577105359, "grad_norm": 3.511019706726074, "learning_rate": 0.00014073137521882902, "loss": 4.3139, "step": 3053 }, { "epoch": 0.8907036091870215, "grad_norm": 2.749054431915283, "learning_rate": 0.00014071192375024317, "loss": 4.3921, "step": 3054 }, { "epoch": 0.8909952606635071, "grad_norm": 2.4550631046295166, "learning_rate": 0.00014069247228165726, "loss": 4.2623, "step": 3055 }, { "epoch": 0.8912869121399927, "grad_norm": 2.370786666870117, "learning_rate": 0.00014067302081307139, "loss": 4.2641, "step": 3056 }, { "epoch": 0.8915785636164784, "grad_norm": 3.0333263874053955, "learning_rate": 0.00014065356934448553, "loss": 4.5862, "step": 3057 }, { "epoch": 0.8918702150929639, "grad_norm": 2.555424213409424, "learning_rate": 0.00014063411787589963, "loss": 4.3905, "step": 3058 }, { "epoch": 0.8921618665694495, "grad_norm": 2.1991615295410156, "learning_rate": 0.00014061466640731375, "loss": 4.5375, "step": 3059 }, { "epoch": 0.8924535180459351, "grad_norm": 2.497068405151367, "learning_rate": 0.00014059521493872787, "loss": 4.2156, "step": 3060 }, { "epoch": 0.8927451695224207, "grad_norm": 2.777879476547241, "learning_rate": 0.00014057576347014202, "loss": 4.4814, "step": 3061 }, { "epoch": 0.8930368209989064, "grad_norm": 2.096280574798584, "learning_rate": 0.00014055631200155612, "loss": 4.1334, "step": 3062 }, { "epoch": 0.8933284724753919, "grad_norm": 2.2230944633483887, "learning_rate": 0.00014053686053297024, "loss": 4.4204, "step": 3063 }, { "epoch": 0.8936201239518775, "grad_norm": 2.447282075881958, "learning_rate": 0.00014051740906438439, "loss": 4.0404, "step": 3064 }, { "epoch": 0.8939117754283631, "grad_norm": 2.339900255203247, "learning_rate": 0.00014049795759579848, "loss": 4.508, "step": 3065 }, { "epoch": 0.8942034269048487, "grad_norm": 2.4919075965881348, "learning_rate": 0.0001404785061272126, "loss": 4.3574, "step": 3066 }, { "epoch": 0.8944950783813344, "grad_norm": 2.0802791118621826, "learning_rate": 0.00014045905465862675, "loss": 4.4822, "step": 3067 }, { "epoch": 0.8947867298578199, "grad_norm": 3.144242286682129, "learning_rate": 0.00014043960319004085, "loss": 4.3591, "step": 3068 }, { "epoch": 0.8950783813343055, "grad_norm": 2.523522138595581, "learning_rate": 0.00014042015172145497, "loss": 4.4808, "step": 3069 }, { "epoch": 0.8953700328107911, "grad_norm": 1.9886809587478638, "learning_rate": 0.0001404007002528691, "loss": 4.4163, "step": 3070 }, { "epoch": 0.8956616842872767, "grad_norm": 2.5071706771850586, "learning_rate": 0.00014038124878428324, "loss": 4.4774, "step": 3071 }, { "epoch": 0.8959533357637623, "grad_norm": 2.153561592102051, "learning_rate": 0.00014036179731569733, "loss": 4.4868, "step": 3072 }, { "epoch": 0.8962449872402479, "grad_norm": 2.919339179992676, "learning_rate": 0.00014034234584711145, "loss": 4.1778, "step": 3073 }, { "epoch": 0.8965366387167335, "grad_norm": 1.8221698999404907, "learning_rate": 0.0001403228943785256, "loss": 4.2983, "step": 3074 }, { "epoch": 0.8968282901932191, "grad_norm": 1.797849416732788, "learning_rate": 0.0001403034429099397, "loss": 4.2883, "step": 3075 }, { "epoch": 0.8971199416697047, "grad_norm": 2.3847293853759766, "learning_rate": 0.00014028399144135382, "loss": 4.1022, "step": 3076 }, { "epoch": 0.8974115931461903, "grad_norm": 2.80308198928833, "learning_rate": 0.00014026453997276797, "loss": 4.5834, "step": 3077 }, { "epoch": 0.8977032446226759, "grad_norm": 2.286926031112671, "learning_rate": 0.0001402450885041821, "loss": 4.3456, "step": 3078 }, { "epoch": 0.8979948960991615, "grad_norm": 2.9360928535461426, "learning_rate": 0.00014022563703559618, "loss": 4.3328, "step": 3079 }, { "epoch": 0.8982865475756471, "grad_norm": 2.22908616065979, "learning_rate": 0.0001402061855670103, "loss": 4.5085, "step": 3080 }, { "epoch": 0.8985781990521327, "grad_norm": 2.384507417678833, "learning_rate": 0.00014018673409842445, "loss": 4.3823, "step": 3081 }, { "epoch": 0.8988698505286183, "grad_norm": 2.683091163635254, "learning_rate": 0.00014016728262983855, "loss": 4.2679, "step": 3082 }, { "epoch": 0.8991615020051039, "grad_norm": 2.2717947959899902, "learning_rate": 0.00014014783116125267, "loss": 4.1644, "step": 3083 }, { "epoch": 0.8994531534815895, "grad_norm": 2.18493390083313, "learning_rate": 0.00014012837969266682, "loss": 4.0336, "step": 3084 }, { "epoch": 0.8997448049580751, "grad_norm": 1.8693437576293945, "learning_rate": 0.00014010892822408094, "loss": 4.302, "step": 3085 }, { "epoch": 0.9000364564345606, "grad_norm": 2.31697154045105, "learning_rate": 0.00014008947675549503, "loss": 4.2408, "step": 3086 }, { "epoch": 0.9003281079110463, "grad_norm": 2.433759927749634, "learning_rate": 0.00014007002528690918, "loss": 4.4099, "step": 3087 }, { "epoch": 0.9006197593875319, "grad_norm": 2.776618719100952, "learning_rate": 0.0001400505738183233, "loss": 4.2641, "step": 3088 }, { "epoch": 0.9009114108640175, "grad_norm": 2.524244785308838, "learning_rate": 0.0001400311223497374, "loss": 4.2609, "step": 3089 }, { "epoch": 0.9012030623405031, "grad_norm": 2.7718942165374756, "learning_rate": 0.00014001167088115152, "loss": 4.485, "step": 3090 }, { "epoch": 0.9014947138169886, "grad_norm": 2.1829283237457275, "learning_rate": 0.00013999221941256567, "loss": 4.0461, "step": 3091 }, { "epoch": 0.9017863652934743, "grad_norm": 3.0922532081604004, "learning_rate": 0.00013997276794397976, "loss": 3.953, "step": 3092 }, { "epoch": 0.9020780167699599, "grad_norm": 3.0706498622894287, "learning_rate": 0.00013995331647539388, "loss": 4.3386, "step": 3093 }, { "epoch": 0.9023696682464455, "grad_norm": 2.036848306655884, "learning_rate": 0.00013993386500680803, "loss": 4.3181, "step": 3094 }, { "epoch": 0.9026613197229311, "grad_norm": 3.0918021202087402, "learning_rate": 0.00013991441353822216, "loss": 4.228, "step": 3095 }, { "epoch": 0.9029529711994166, "grad_norm": 1.9536834955215454, "learning_rate": 0.00013989496206963625, "loss": 4.3872, "step": 3096 }, { "epoch": 0.9032446226759023, "grad_norm": 3.1551105976104736, "learning_rate": 0.0001398755106010504, "loss": 4.2466, "step": 3097 }, { "epoch": 0.9035362741523879, "grad_norm": 3.055922269821167, "learning_rate": 0.00013985605913246452, "loss": 4.0505, "step": 3098 }, { "epoch": 0.9038279256288735, "grad_norm": 2.077361822128296, "learning_rate": 0.00013983660766387861, "loss": 3.8502, "step": 3099 }, { "epoch": 0.9041195771053591, "grad_norm": 3.415706157684326, "learning_rate": 0.00013981715619529274, "loss": 4.3843, "step": 3100 }, { "epoch": 0.9044112285818448, "grad_norm": 2.041771411895752, "learning_rate": 0.00013979770472670688, "loss": 4.2685, "step": 3101 }, { "epoch": 0.9047028800583303, "grad_norm": 1.801268458366394, "learning_rate": 0.000139778253258121, "loss": 4.3287, "step": 3102 }, { "epoch": 0.9049945315348159, "grad_norm": 4.788239479064941, "learning_rate": 0.0001397588017895351, "loss": 4.325, "step": 3103 }, { "epoch": 0.9052861830113015, "grad_norm": 2.553924560546875, "learning_rate": 0.00013973935032094925, "loss": 4.4951, "step": 3104 }, { "epoch": 0.9055778344877871, "grad_norm": 3.1710095405578613, "learning_rate": 0.00013971989885236337, "loss": 4.0504, "step": 3105 }, { "epoch": 0.9058694859642727, "grad_norm": 2.4574882984161377, "learning_rate": 0.00013970044738377747, "loss": 4.4273, "step": 3106 }, { "epoch": 0.9061611374407583, "grad_norm": 2.3182709217071533, "learning_rate": 0.0001396809959151916, "loss": 4.0842, "step": 3107 }, { "epoch": 0.9064527889172439, "grad_norm": 2.184968948364258, "learning_rate": 0.00013966154444660574, "loss": 4.1037, "step": 3108 }, { "epoch": 0.9067444403937295, "grad_norm": 3.098282814025879, "learning_rate": 0.00013964209297801986, "loss": 4.2935, "step": 3109 }, { "epoch": 0.907036091870215, "grad_norm": 3.915091037750244, "learning_rate": 0.00013962264150943395, "loss": 4.3421, "step": 3110 }, { "epoch": 0.9073277433467007, "grad_norm": 2.35410213470459, "learning_rate": 0.0001396031900408481, "loss": 4.1452, "step": 3111 }, { "epoch": 0.9076193948231863, "grad_norm": 2.754455327987671, "learning_rate": 0.00013958373857226222, "loss": 4.4074, "step": 3112 }, { "epoch": 0.9079110462996719, "grad_norm": 2.3036997318267822, "learning_rate": 0.00013956428710367632, "loss": 4.1238, "step": 3113 }, { "epoch": 0.9082026977761575, "grad_norm": 2.0366125106811523, "learning_rate": 0.00013954483563509047, "loss": 4.5523, "step": 3114 }, { "epoch": 0.908494349252643, "grad_norm": 3.29671049118042, "learning_rate": 0.0001395253841665046, "loss": 4.0117, "step": 3115 }, { "epoch": 0.9087860007291287, "grad_norm": 2.1356661319732666, "learning_rate": 0.0001395059326979187, "loss": 4.1696, "step": 3116 }, { "epoch": 0.9090776522056143, "grad_norm": 2.856205940246582, "learning_rate": 0.0001394864812293328, "loss": 4.3774, "step": 3117 }, { "epoch": 0.9093693036820999, "grad_norm": 2.096492290496826, "learning_rate": 0.00013946702976074695, "loss": 4.458, "step": 3118 }, { "epoch": 0.9096609551585855, "grad_norm": 2.1072001457214355, "learning_rate": 0.00013944757829216107, "loss": 4.4872, "step": 3119 }, { "epoch": 0.909952606635071, "grad_norm": 2.758133888244629, "learning_rate": 0.00013942812682357517, "loss": 4.2759, "step": 3120 }, { "epoch": 0.9102442581115567, "grad_norm": 2.5466153621673584, "learning_rate": 0.00013940867535498932, "loss": 4.1639, "step": 3121 }, { "epoch": 0.9105359095880423, "grad_norm": 3.189558744430542, "learning_rate": 0.00013938922388640344, "loss": 4.4871, "step": 3122 }, { "epoch": 0.9108275610645279, "grad_norm": 1.7281211614608765, "learning_rate": 0.00013936977241781753, "loss": 4.0945, "step": 3123 }, { "epoch": 0.9111192125410135, "grad_norm": 2.2101123332977295, "learning_rate": 0.00013935032094923168, "loss": 4.4228, "step": 3124 }, { "epoch": 0.911410864017499, "grad_norm": 1.897105097770691, "learning_rate": 0.0001393308694806458, "loss": 4.262, "step": 3125 }, { "epoch": 0.9117025154939847, "grad_norm": 2.8538761138916016, "learning_rate": 0.00013931141801205992, "loss": 4.5333, "step": 3126 }, { "epoch": 0.9119941669704703, "grad_norm": 2.3474719524383545, "learning_rate": 0.00013929196654347402, "loss": 4.389, "step": 3127 }, { "epoch": 0.9122858184469559, "grad_norm": 2.6397886276245117, "learning_rate": 0.00013927251507488817, "loss": 4.4009, "step": 3128 }, { "epoch": 0.9125774699234415, "grad_norm": 2.2216813564300537, "learning_rate": 0.0001392530636063023, "loss": 4.471, "step": 3129 }, { "epoch": 0.912869121399927, "grad_norm": 2.5098626613616943, "learning_rate": 0.00013923361213771638, "loss": 4.2183, "step": 3130 }, { "epoch": 0.9131607728764127, "grad_norm": 2.0458388328552246, "learning_rate": 0.00013921416066913053, "loss": 4.0591, "step": 3131 }, { "epoch": 0.9134524243528983, "grad_norm": 2.941148519515991, "learning_rate": 0.00013919470920054465, "loss": 4.284, "step": 3132 }, { "epoch": 0.9137440758293839, "grad_norm": 3.6054162979125977, "learning_rate": 0.00013917525773195878, "loss": 4.1461, "step": 3133 }, { "epoch": 0.9140357273058695, "grad_norm": 2.388540744781494, "learning_rate": 0.0001391558062633729, "loss": 4.3769, "step": 3134 }, { "epoch": 0.914327378782355, "grad_norm": 2.282674551010132, "learning_rate": 0.00013913635479478702, "loss": 4.4355, "step": 3135 }, { "epoch": 0.9146190302588407, "grad_norm": 2.5293984413146973, "learning_rate": 0.00013911690332620114, "loss": 4.4529, "step": 3136 }, { "epoch": 0.9149106817353263, "grad_norm": 2.373725414276123, "learning_rate": 0.00013909745185761524, "loss": 4.2829, "step": 3137 }, { "epoch": 0.9152023332118119, "grad_norm": 2.2922401428222656, "learning_rate": 0.00013907800038902938, "loss": 4.3041, "step": 3138 }, { "epoch": 0.9154939846882975, "grad_norm": 2.471764087677002, "learning_rate": 0.0001390585489204435, "loss": 4.5909, "step": 3139 }, { "epoch": 0.915785636164783, "grad_norm": 1.8948750495910645, "learning_rate": 0.00013903909745185763, "loss": 4.4272, "step": 3140 }, { "epoch": 0.9160772876412687, "grad_norm": 2.6406819820404053, "learning_rate": 0.00013901964598327175, "loss": 4.3369, "step": 3141 }, { "epoch": 0.9163689391177543, "grad_norm": 2.8723881244659424, "learning_rate": 0.00013900019451468587, "loss": 4.3582, "step": 3142 }, { "epoch": 0.9166605905942399, "grad_norm": 2.2180607318878174, "learning_rate": 0.0001389807430461, "loss": 4.4421, "step": 3143 }, { "epoch": 0.9169522420707255, "grad_norm": 2.307070255279541, "learning_rate": 0.0001389612915775141, "loss": 4.1081, "step": 3144 }, { "epoch": 0.917243893547211, "grad_norm": 2.417595148086548, "learning_rate": 0.00013894184010892823, "loss": 4.263, "step": 3145 }, { "epoch": 0.9175355450236967, "grad_norm": 2.0031356811523438, "learning_rate": 0.00013892238864034236, "loss": 4.2385, "step": 3146 }, { "epoch": 0.9178271965001823, "grad_norm": 2.2990849018096924, "learning_rate": 0.00013890293717175645, "loss": 4.4692, "step": 3147 }, { "epoch": 0.9181188479766679, "grad_norm": 2.682993173599243, "learning_rate": 0.0001388834857031706, "loss": 4.4899, "step": 3148 }, { "epoch": 0.9184104994531535, "grad_norm": 1.800432562828064, "learning_rate": 0.00013886403423458472, "loss": 4.4526, "step": 3149 }, { "epoch": 0.918702150929639, "grad_norm": 2.164259672164917, "learning_rate": 0.00013884458276599884, "loss": 4.2879, "step": 3150 }, { "epoch": 0.9189938024061247, "grad_norm": 2.1405746936798096, "learning_rate": 0.00013882513129741296, "loss": 4.3633, "step": 3151 }, { "epoch": 0.9192854538826103, "grad_norm": 2.373901844024658, "learning_rate": 0.00013880567982882709, "loss": 4.2315, "step": 3152 }, { "epoch": 0.9195771053590959, "grad_norm": 3.3071141242980957, "learning_rate": 0.0001387862283602412, "loss": 4.6165, "step": 3153 }, { "epoch": 0.9198687568355814, "grad_norm": 2.7169950008392334, "learning_rate": 0.00013876677689165533, "loss": 4.4611, "step": 3154 }, { "epoch": 0.9201604083120671, "grad_norm": 3.049595832824707, "learning_rate": 0.00013874732542306945, "loss": 4.3741, "step": 3155 }, { "epoch": 0.9204520597885527, "grad_norm": 2.180814504623413, "learning_rate": 0.00013872787395448357, "loss": 4.3271, "step": 3156 }, { "epoch": 0.9207437112650383, "grad_norm": 2.7160093784332275, "learning_rate": 0.0001387084224858977, "loss": 4.3477, "step": 3157 }, { "epoch": 0.9210353627415239, "grad_norm": 2.1995248794555664, "learning_rate": 0.00013868897101731182, "loss": 4.4289, "step": 3158 }, { "epoch": 0.9213270142180094, "grad_norm": 2.152045249938965, "learning_rate": 0.00013866951954872594, "loss": 4.3733, "step": 3159 }, { "epoch": 0.9216186656944951, "grad_norm": 2.34023118019104, "learning_rate": 0.00013865006808014006, "loss": 4.4599, "step": 3160 }, { "epoch": 0.9219103171709807, "grad_norm": 3.1479947566986084, "learning_rate": 0.00013863061661155418, "loss": 4.355, "step": 3161 }, { "epoch": 0.9222019686474663, "grad_norm": 2.114905595779419, "learning_rate": 0.0001386111651429683, "loss": 4.6148, "step": 3162 }, { "epoch": 0.9224936201239519, "grad_norm": 2.6852917671203613, "learning_rate": 0.00013859171367438242, "loss": 4.5089, "step": 3163 }, { "epoch": 0.9227852716004374, "grad_norm": 2.6539573669433594, "learning_rate": 0.00013857226220579655, "loss": 4.4971, "step": 3164 }, { "epoch": 0.9230769230769231, "grad_norm": 3.207108974456787, "learning_rate": 0.00013855281073721067, "loss": 4.3203, "step": 3165 }, { "epoch": 0.9233685745534087, "grad_norm": 2.678898572921753, "learning_rate": 0.0001385333592686248, "loss": 4.2603, "step": 3166 }, { "epoch": 0.9236602260298943, "grad_norm": 2.637181520462036, "learning_rate": 0.0001385139078000389, "loss": 4.5142, "step": 3167 }, { "epoch": 0.9239518775063799, "grad_norm": 3.035229444503784, "learning_rate": 0.00013849445633145303, "loss": 4.3267, "step": 3168 }, { "epoch": 0.9242435289828654, "grad_norm": 2.7162065505981445, "learning_rate": 0.00013847500486286715, "loss": 4.1722, "step": 3169 }, { "epoch": 0.9245351804593511, "grad_norm": 2.900130271911621, "learning_rate": 0.00013845555339428127, "loss": 4.5784, "step": 3170 }, { "epoch": 0.9248268319358367, "grad_norm": 2.86549973487854, "learning_rate": 0.0001384361019256954, "loss": 4.3501, "step": 3171 }, { "epoch": 0.9251184834123223, "grad_norm": 2.317234516143799, "learning_rate": 0.00013841665045710952, "loss": 4.3309, "step": 3172 }, { "epoch": 0.9254101348888079, "grad_norm": 3.0244979858398438, "learning_rate": 0.00013839719898852364, "loss": 4.287, "step": 3173 }, { "epoch": 0.9257017863652934, "grad_norm": 2.368501663208008, "learning_rate": 0.00013837774751993776, "loss": 4.1758, "step": 3174 }, { "epoch": 0.9259934378417791, "grad_norm": 2.1800177097320557, "learning_rate": 0.00013835829605135188, "loss": 4.4485, "step": 3175 }, { "epoch": 0.9262850893182647, "grad_norm": 3.5643506050109863, "learning_rate": 0.000138338844582766, "loss": 4.3645, "step": 3176 }, { "epoch": 0.9265767407947503, "grad_norm": 2.207310914993286, "learning_rate": 0.00013831939311418013, "loss": 4.4374, "step": 3177 }, { "epoch": 0.9268683922712359, "grad_norm": 2.6533005237579346, "learning_rate": 0.00013829994164559425, "loss": 4.4489, "step": 3178 }, { "epoch": 0.9271600437477214, "grad_norm": 2.0246400833129883, "learning_rate": 0.00013828049017700837, "loss": 4.5556, "step": 3179 }, { "epoch": 0.9274516952242071, "grad_norm": 2.955983877182007, "learning_rate": 0.0001382610387084225, "loss": 4.2789, "step": 3180 }, { "epoch": 0.9277433467006927, "grad_norm": 2.8540961742401123, "learning_rate": 0.0001382415872398366, "loss": 3.9647, "step": 3181 }, { "epoch": 0.9280349981771783, "grad_norm": 2.291795015335083, "learning_rate": 0.00013822213577125073, "loss": 4.3743, "step": 3182 }, { "epoch": 0.9283266496536638, "grad_norm": 2.040156364440918, "learning_rate": 0.00013820268430266486, "loss": 4.2203, "step": 3183 }, { "epoch": 0.9286183011301494, "grad_norm": 1.9638795852661133, "learning_rate": 0.00013818323283407898, "loss": 4.2719, "step": 3184 }, { "epoch": 0.9289099526066351, "grad_norm": 2.4727957248687744, "learning_rate": 0.0001381637813654931, "loss": 4.4531, "step": 3185 }, { "epoch": 0.9292016040831207, "grad_norm": 2.201974868774414, "learning_rate": 0.00013814432989690722, "loss": 4.4003, "step": 3186 }, { "epoch": 0.9294932555596063, "grad_norm": 2.098074436187744, "learning_rate": 0.00013812487842832134, "loss": 4.3774, "step": 3187 }, { "epoch": 0.9297849070360918, "grad_norm": 2.3176839351654053, "learning_rate": 0.00013810542695973546, "loss": 4.4736, "step": 3188 }, { "epoch": 0.9300765585125774, "grad_norm": 2.8098037242889404, "learning_rate": 0.00013808597549114959, "loss": 4.1023, "step": 3189 }, { "epoch": 0.9303682099890631, "grad_norm": 2.592623233795166, "learning_rate": 0.0001380665240225637, "loss": 4.336, "step": 3190 }, { "epoch": 0.9306598614655487, "grad_norm": 2.300204277038574, "learning_rate": 0.00013804707255397783, "loss": 4.253, "step": 3191 }, { "epoch": 0.9309515129420343, "grad_norm": 2.233276844024658, "learning_rate": 0.00013802762108539195, "loss": 4.1238, "step": 3192 }, { "epoch": 0.9312431644185198, "grad_norm": 2.3962290287017822, "learning_rate": 0.00013800816961680607, "loss": 4.4029, "step": 3193 }, { "epoch": 0.9315348158950054, "grad_norm": 2.0081751346588135, "learning_rate": 0.0001379887181482202, "loss": 4.0157, "step": 3194 }, { "epoch": 0.9318264673714911, "grad_norm": 2.2461395263671875, "learning_rate": 0.00013796926667963431, "loss": 4.5739, "step": 3195 }, { "epoch": 0.9321181188479767, "grad_norm": 2.348025321960449, "learning_rate": 0.00013794981521104844, "loss": 4.3547, "step": 3196 }, { "epoch": 0.9324097703244623, "grad_norm": 2.3199303150177, "learning_rate": 0.00013793036374246256, "loss": 4.3246, "step": 3197 }, { "epoch": 0.9327014218009478, "grad_norm": 2.2228777408599854, "learning_rate": 0.00013791091227387668, "loss": 4.4422, "step": 3198 }, { "epoch": 0.9329930732774334, "grad_norm": 2.316189765930176, "learning_rate": 0.0001378914608052908, "loss": 4.2954, "step": 3199 }, { "epoch": 0.9332847247539191, "grad_norm": 2.128127336502075, "learning_rate": 0.00013787200933670492, "loss": 4.2496, "step": 3200 }, { "epoch": 0.9335763762304047, "grad_norm": 2.34379506111145, "learning_rate": 0.00013785255786811907, "loss": 4.4127, "step": 3201 }, { "epoch": 0.9338680277068903, "grad_norm": 2.159461498260498, "learning_rate": 0.00013783310639953317, "loss": 4.2693, "step": 3202 }, { "epoch": 0.9341596791833758, "grad_norm": 1.8294377326965332, "learning_rate": 0.0001378136549309473, "loss": 4.3788, "step": 3203 }, { "epoch": 0.9344513306598614, "grad_norm": 2.649506092071533, "learning_rate": 0.0001377942034623614, "loss": 4.5324, "step": 3204 }, { "epoch": 0.9347429821363471, "grad_norm": 2.1108765602111816, "learning_rate": 0.00013777475199377553, "loss": 4.4625, "step": 3205 }, { "epoch": 0.9350346336128327, "grad_norm": 2.7561230659484863, "learning_rate": 0.00013775530052518965, "loss": 4.4433, "step": 3206 }, { "epoch": 0.9353262850893183, "grad_norm": 3.48724102973938, "learning_rate": 0.00013773584905660377, "loss": 4.3154, "step": 3207 }, { "epoch": 0.9356179365658038, "grad_norm": 2.9974098205566406, "learning_rate": 0.00013771639758801792, "loss": 4.4882, "step": 3208 }, { "epoch": 0.9359095880422894, "grad_norm": 2.9091994762420654, "learning_rate": 0.00013769694611943202, "loss": 4.6416, "step": 3209 }, { "epoch": 0.9362012395187751, "grad_norm": 1.965227723121643, "learning_rate": 0.00013767749465084614, "loss": 4.1993, "step": 3210 }, { "epoch": 0.9364928909952607, "grad_norm": 2.4336137771606445, "learning_rate": 0.0001376580431822603, "loss": 4.5545, "step": 3211 }, { "epoch": 0.9367845424717463, "grad_norm": 2.8580660820007324, "learning_rate": 0.00013763859171367438, "loss": 4.3685, "step": 3212 }, { "epoch": 0.9370761939482318, "grad_norm": 2.041257619857788, "learning_rate": 0.0001376191402450885, "loss": 4.3768, "step": 3213 }, { "epoch": 0.9373678454247175, "grad_norm": 1.8812614679336548, "learning_rate": 0.00013759968877650262, "loss": 4.3145, "step": 3214 }, { "epoch": 0.9376594969012031, "grad_norm": 2.783956527709961, "learning_rate": 0.00013758023730791675, "loss": 4.603, "step": 3215 }, { "epoch": 0.9379511483776887, "grad_norm": 2.0251715183258057, "learning_rate": 0.00013756078583933087, "loss": 4.1163, "step": 3216 }, { "epoch": 0.9382427998541742, "grad_norm": 2.391829490661621, "learning_rate": 0.000137541334370745, "loss": 4.4344, "step": 3217 }, { "epoch": 0.9385344513306598, "grad_norm": 3.291602373123169, "learning_rate": 0.00013752188290215914, "loss": 4.1067, "step": 3218 }, { "epoch": 0.9388261028071455, "grad_norm": 2.0027883052825928, "learning_rate": 0.00013750243143357323, "loss": 4.2972, "step": 3219 }, { "epoch": 0.9391177542836311, "grad_norm": 2.617546558380127, "learning_rate": 0.00013748297996498735, "loss": 4.4054, "step": 3220 }, { "epoch": 0.9394094057601167, "grad_norm": 1.9642945528030396, "learning_rate": 0.0001374635284964015, "loss": 4.4038, "step": 3221 }, { "epoch": 0.9397010572366022, "grad_norm": 1.8773020505905151, "learning_rate": 0.0001374440770278156, "loss": 4.2809, "step": 3222 }, { "epoch": 0.9399927087130878, "grad_norm": 2.514008045196533, "learning_rate": 0.00013742462555922972, "loss": 4.3827, "step": 3223 }, { "epoch": 0.9402843601895735, "grad_norm": 3.4770123958587646, "learning_rate": 0.00013740517409064384, "loss": 4.2149, "step": 3224 }, { "epoch": 0.9405760116660591, "grad_norm": 3.086533784866333, "learning_rate": 0.000137385722622058, "loss": 4.4602, "step": 3225 }, { "epoch": 0.9408676631425447, "grad_norm": 2.1796951293945312, "learning_rate": 0.00013736627115347208, "loss": 4.3942, "step": 3226 }, { "epoch": 0.9411593146190302, "grad_norm": 2.2949888706207275, "learning_rate": 0.0001373468196848862, "loss": 4.5949, "step": 3227 }, { "epoch": 0.9414509660955158, "grad_norm": 1.8089815378189087, "learning_rate": 0.00013732736821630035, "loss": 4.2554, "step": 3228 }, { "epoch": 0.9417426175720015, "grad_norm": 6.220850467681885, "learning_rate": 0.00013730791674771445, "loss": 3.9882, "step": 3229 }, { "epoch": 0.9420342690484871, "grad_norm": 2.1407814025878906, "learning_rate": 0.00013728846527912857, "loss": 4.3342, "step": 3230 }, { "epoch": 0.9423259205249727, "grad_norm": 2.595302104949951, "learning_rate": 0.00013726901381054272, "loss": 4.4015, "step": 3231 }, { "epoch": 0.9426175720014582, "grad_norm": 2.5973329544067383, "learning_rate": 0.00013724956234195684, "loss": 4.2471, "step": 3232 }, { "epoch": 0.9429092234779438, "grad_norm": 2.30635929107666, "learning_rate": 0.00013723011087337094, "loss": 4.2466, "step": 3233 }, { "epoch": 0.9432008749544295, "grad_norm": 2.139317512512207, "learning_rate": 0.00013721065940478506, "loss": 4.5338, "step": 3234 }, { "epoch": 0.9434925264309151, "grad_norm": 2.097759962081909, "learning_rate": 0.0001371912079361992, "loss": 4.652, "step": 3235 }, { "epoch": 0.9437841779074007, "grad_norm": 2.970928430557251, "learning_rate": 0.0001371717564676133, "loss": 4.5385, "step": 3236 }, { "epoch": 0.9440758293838862, "grad_norm": 2.577934741973877, "learning_rate": 0.00013715230499902742, "loss": 4.3607, "step": 3237 }, { "epoch": 0.9443674808603718, "grad_norm": 2.1511662006378174, "learning_rate": 0.00013713285353044157, "loss": 4.3546, "step": 3238 }, { "epoch": 0.9446591323368575, "grad_norm": 2.210756778717041, "learning_rate": 0.00013711340206185566, "loss": 4.2496, "step": 3239 }, { "epoch": 0.9449507838133431, "grad_norm": 2.0248732566833496, "learning_rate": 0.00013709395059326979, "loss": 4.3684, "step": 3240 }, { "epoch": 0.9452424352898287, "grad_norm": 1.9479186534881592, "learning_rate": 0.00013707449912468394, "loss": 4.3291, "step": 3241 }, { "epoch": 0.9455340867663142, "grad_norm": 3.119344472885132, "learning_rate": 0.00013705504765609806, "loss": 4.4225, "step": 3242 }, { "epoch": 0.9458257382427998, "grad_norm": 2.1190710067749023, "learning_rate": 0.00013703559618751215, "loss": 4.292, "step": 3243 }, { "epoch": 0.9461173897192855, "grad_norm": 1.9680079221725464, "learning_rate": 0.00013701614471892627, "loss": 4.2558, "step": 3244 }, { "epoch": 0.9464090411957711, "grad_norm": 2.407757520675659, "learning_rate": 0.00013699669325034042, "loss": 4.3915, "step": 3245 }, { "epoch": 0.9467006926722566, "grad_norm": 2.862335681915283, "learning_rate": 0.00013697724178175452, "loss": 4.476, "step": 3246 }, { "epoch": 0.9469923441487422, "grad_norm": 2.1071619987487793, "learning_rate": 0.00013695779031316864, "loss": 4.4169, "step": 3247 }, { "epoch": 0.9472839956252278, "grad_norm": 1.8919931650161743, "learning_rate": 0.00013693833884458279, "loss": 4.2895, "step": 3248 }, { "epoch": 0.9475756471017135, "grad_norm": 2.6624066829681396, "learning_rate": 0.0001369188873759969, "loss": 4.4877, "step": 3249 }, { "epoch": 0.9478672985781991, "grad_norm": 2.2610833644866943, "learning_rate": 0.000136899435907411, "loss": 4.2617, "step": 3250 }, { "epoch": 0.9481589500546846, "grad_norm": 2.292128324508667, "learning_rate": 0.00013687998443882515, "loss": 4.3197, "step": 3251 }, { "epoch": 0.9484506015311702, "grad_norm": 2.1309902667999268, "learning_rate": 0.00013686053297023927, "loss": 4.4031, "step": 3252 }, { "epoch": 0.9487422530076558, "grad_norm": 2.265245199203491, "learning_rate": 0.00013684108150165337, "loss": 4.4865, "step": 3253 }, { "epoch": 0.9490339044841415, "grad_norm": 2.6247448921203613, "learning_rate": 0.0001368216300330675, "loss": 4.1594, "step": 3254 }, { "epoch": 0.9493255559606271, "grad_norm": 2.1384575366973877, "learning_rate": 0.00013680217856448164, "loss": 4.4008, "step": 3255 }, { "epoch": 0.9496172074371126, "grad_norm": 1.9831247329711914, "learning_rate": 0.00013678272709589576, "loss": 4.1883, "step": 3256 }, { "epoch": 0.9499088589135982, "grad_norm": 2.5893006324768066, "learning_rate": 0.00013676327562730985, "loss": 4.2813, "step": 3257 }, { "epoch": 0.9502005103900838, "grad_norm": 1.8961329460144043, "learning_rate": 0.000136743824158724, "loss": 4.0905, "step": 3258 }, { "epoch": 0.9504921618665695, "grad_norm": 2.28524112701416, "learning_rate": 0.00013672437269013812, "loss": 4.325, "step": 3259 }, { "epoch": 0.9507838133430551, "grad_norm": 2.3859126567840576, "learning_rate": 0.00013670492122155222, "loss": 4.2573, "step": 3260 }, { "epoch": 0.9510754648195406, "grad_norm": 2.3822669982910156, "learning_rate": 0.00013668546975296637, "loss": 4.4637, "step": 3261 }, { "epoch": 0.9513671162960262, "grad_norm": 1.9525701999664307, "learning_rate": 0.0001366660182843805, "loss": 4.3195, "step": 3262 }, { "epoch": 0.9516587677725118, "grad_norm": 2.911515951156616, "learning_rate": 0.0001366465668157946, "loss": 4.2753, "step": 3263 }, { "epoch": 0.9519504192489975, "grad_norm": 1.90200674533844, "learning_rate": 0.0001366271153472087, "loss": 4.5435, "step": 3264 }, { "epoch": 0.9522420707254831, "grad_norm": 1.9295296669006348, "learning_rate": 0.00013660766387862285, "loss": 4.2226, "step": 3265 }, { "epoch": 0.9525337222019686, "grad_norm": 2.1597208976745605, "learning_rate": 0.00013658821241003697, "loss": 4.3286, "step": 3266 }, { "epoch": 0.9528253736784542, "grad_norm": 1.8468000888824463, "learning_rate": 0.00013656876094145107, "loss": 4.8067, "step": 3267 }, { "epoch": 0.9531170251549399, "grad_norm": 2.5827226638793945, "learning_rate": 0.00013654930947286522, "loss": 4.4434, "step": 3268 }, { "epoch": 0.9534086766314255, "grad_norm": 2.503272771835327, "learning_rate": 0.00013652985800427934, "loss": 4.6055, "step": 3269 }, { "epoch": 0.953700328107911, "grad_norm": 2.2070119380950928, "learning_rate": 0.00013651040653569343, "loss": 4.3759, "step": 3270 }, { "epoch": 0.9539919795843966, "grad_norm": 3.26188325881958, "learning_rate": 0.00013649095506710756, "loss": 4.4085, "step": 3271 }, { "epoch": 0.9542836310608822, "grad_norm": 4.083334445953369, "learning_rate": 0.0001364715035985217, "loss": 4.4676, "step": 3272 }, { "epoch": 0.9545752825373679, "grad_norm": 2.2067835330963135, "learning_rate": 0.00013645205212993583, "loss": 4.3409, "step": 3273 }, { "epoch": 0.9548669340138535, "grad_norm": 2.3526511192321777, "learning_rate": 0.00013643260066134992, "loss": 4.2049, "step": 3274 }, { "epoch": 0.955158585490339, "grad_norm": 2.779128313064575, "learning_rate": 0.00013641314919276407, "loss": 4.4084, "step": 3275 }, { "epoch": 0.9554502369668246, "grad_norm": 2.1263821125030518, "learning_rate": 0.0001363936977241782, "loss": 4.3606, "step": 3276 }, { "epoch": 0.9557418884433102, "grad_norm": 2.4998795986175537, "learning_rate": 0.00013637424625559229, "loss": 4.2301, "step": 3277 }, { "epoch": 0.9560335399197959, "grad_norm": 1.8637702465057373, "learning_rate": 0.00013635479478700643, "loss": 4.2616, "step": 3278 }, { "epoch": 0.9563251913962815, "grad_norm": 2.1601650714874268, "learning_rate": 0.00013633534331842056, "loss": 4.2554, "step": 3279 }, { "epoch": 0.956616842872767, "grad_norm": 2.7291669845581055, "learning_rate": 0.00013631589184983468, "loss": 4.569, "step": 3280 }, { "epoch": 0.9569084943492526, "grad_norm": 2.0443074703216553, "learning_rate": 0.00013629644038124877, "loss": 4.1914, "step": 3281 }, { "epoch": 0.9572001458257382, "grad_norm": 2.3546652793884277, "learning_rate": 0.00013627698891266292, "loss": 4.1584, "step": 3282 }, { "epoch": 0.9574917973022239, "grad_norm": 2.00570011138916, "learning_rate": 0.00013625753744407704, "loss": 4.3192, "step": 3283 }, { "epoch": 0.9577834487787095, "grad_norm": 2.5785908699035645, "learning_rate": 0.00013623808597549114, "loss": 4.3489, "step": 3284 }, { "epoch": 0.958075100255195, "grad_norm": 2.099228858947754, "learning_rate": 0.00013621863450690529, "loss": 4.3856, "step": 3285 }, { "epoch": 0.9583667517316806, "grad_norm": 2.0528900623321533, "learning_rate": 0.0001361991830383194, "loss": 4.534, "step": 3286 }, { "epoch": 0.9586584032081662, "grad_norm": 2.260377883911133, "learning_rate": 0.00013617973156973353, "loss": 4.4212, "step": 3287 }, { "epoch": 0.9589500546846519, "grad_norm": 2.3532910346984863, "learning_rate": 0.00013616028010114765, "loss": 4.1388, "step": 3288 }, { "epoch": 0.9592417061611375, "grad_norm": 2.211566686630249, "learning_rate": 0.00013614082863256177, "loss": 4.6275, "step": 3289 }, { "epoch": 0.959533357637623, "grad_norm": 2.329662322998047, "learning_rate": 0.0001361213771639759, "loss": 4.3451, "step": 3290 }, { "epoch": 0.9598250091141086, "grad_norm": 2.0786333084106445, "learning_rate": 0.00013610192569539, "loss": 4.4031, "step": 3291 }, { "epoch": 0.9601166605905942, "grad_norm": 2.827014207839966, "learning_rate": 0.00013608247422680414, "loss": 4.0219, "step": 3292 }, { "epoch": 0.9604083120670799, "grad_norm": 1.764486312866211, "learning_rate": 0.00013606302275821826, "loss": 4.4261, "step": 3293 }, { "epoch": 0.9606999635435655, "grad_norm": 2.497175455093384, "learning_rate": 0.00013604357128963235, "loss": 4.3682, "step": 3294 }, { "epoch": 0.960991615020051, "grad_norm": 2.6432688236236572, "learning_rate": 0.0001360241198210465, "loss": 4.3373, "step": 3295 }, { "epoch": 0.9612832664965366, "grad_norm": 2.6006038188934326, "learning_rate": 0.00013600466835246062, "loss": 4.1991, "step": 3296 }, { "epoch": 0.9615749179730222, "grad_norm": 2.7706265449523926, "learning_rate": 0.00013598521688387474, "loss": 4.3926, "step": 3297 }, { "epoch": 0.9618665694495079, "grad_norm": 2.1161727905273438, "learning_rate": 0.00013596576541528887, "loss": 4.492, "step": 3298 }, { "epoch": 0.9621582209259935, "grad_norm": 2.1015625, "learning_rate": 0.000135946313946703, "loss": 4.4036, "step": 3299 }, { "epoch": 0.962449872402479, "grad_norm": 2.6553077697753906, "learning_rate": 0.0001359268624781171, "loss": 3.9288, "step": 3300 }, { "epoch": 0.9627415238789646, "grad_norm": 1.890573263168335, "learning_rate": 0.0001359074110095312, "loss": 4.3495, "step": 3301 }, { "epoch": 0.9630331753554502, "grad_norm": 2.179184675216675, "learning_rate": 0.00013588795954094535, "loss": 4.4646, "step": 3302 }, { "epoch": 0.9633248268319359, "grad_norm": 3.3116352558135986, "learning_rate": 0.00013586850807235947, "loss": 4.3154, "step": 3303 }, { "epoch": 0.9636164783084215, "grad_norm": 2.6989662647247314, "learning_rate": 0.0001358490566037736, "loss": 4.25, "step": 3304 }, { "epoch": 0.963908129784907, "grad_norm": 1.8372505903244019, "learning_rate": 0.00013582960513518772, "loss": 4.5934, "step": 3305 }, { "epoch": 0.9641997812613926, "grad_norm": 2.431187868118286, "learning_rate": 0.00013581015366660184, "loss": 4.3079, "step": 3306 }, { "epoch": 0.9644914327378782, "grad_norm": 2.3974056243896484, "learning_rate": 0.00013579070219801596, "loss": 4.5743, "step": 3307 }, { "epoch": 0.9647830842143639, "grad_norm": 2.6988861560821533, "learning_rate": 0.00013577125072943008, "loss": 4.2252, "step": 3308 }, { "epoch": 0.9650747356908495, "grad_norm": 2.1448378562927246, "learning_rate": 0.0001357517992608442, "loss": 4.4919, "step": 3309 }, { "epoch": 0.965366387167335, "grad_norm": 2.66886568069458, "learning_rate": 0.00013573234779225833, "loss": 4.2396, "step": 3310 }, { "epoch": 0.9656580386438206, "grad_norm": 2.2983505725860596, "learning_rate": 0.00013571289632367245, "loss": 4.3822, "step": 3311 }, { "epoch": 0.9659496901203062, "grad_norm": 2.6471927165985107, "learning_rate": 0.00013569344485508657, "loss": 4.1699, "step": 3312 }, { "epoch": 0.9662413415967919, "grad_norm": 2.2335166931152344, "learning_rate": 0.0001356739933865007, "loss": 4.3419, "step": 3313 }, { "epoch": 0.9665329930732774, "grad_norm": 2.2101211547851562, "learning_rate": 0.0001356545419179148, "loss": 4.2975, "step": 3314 }, { "epoch": 0.966824644549763, "grad_norm": 2.292498826980591, "learning_rate": 0.00013563509044932893, "loss": 4.2985, "step": 3315 }, { "epoch": 0.9671162960262486, "grad_norm": 2.661898612976074, "learning_rate": 0.00013561563898074305, "loss": 4.1856, "step": 3316 }, { "epoch": 0.9674079475027342, "grad_norm": 2.146883249282837, "learning_rate": 0.00013559618751215718, "loss": 4.2023, "step": 3317 }, { "epoch": 0.9676995989792199, "grad_norm": 2.007643699645996, "learning_rate": 0.0001355767360435713, "loss": 4.3344, "step": 3318 }, { "epoch": 0.9679912504557054, "grad_norm": 2.60768985748291, "learning_rate": 0.00013555728457498542, "loss": 4.3142, "step": 3319 }, { "epoch": 0.968282901932191, "grad_norm": 2.516946315765381, "learning_rate": 0.00013553783310639954, "loss": 4.2502, "step": 3320 }, { "epoch": 0.9685745534086766, "grad_norm": 2.359689950942993, "learning_rate": 0.00013551838163781366, "loss": 4.3076, "step": 3321 }, { "epoch": 0.9688662048851623, "grad_norm": 1.9118272066116333, "learning_rate": 0.00013549893016922778, "loss": 4.4095, "step": 3322 }, { "epoch": 0.9691578563616479, "grad_norm": 2.5156238079071045, "learning_rate": 0.0001354794787006419, "loss": 4.3609, "step": 3323 }, { "epoch": 0.9694495078381334, "grad_norm": 2.218918800354004, "learning_rate": 0.00013546002723205603, "loss": 4.2502, "step": 3324 }, { "epoch": 0.969741159314619, "grad_norm": 2.690938711166382, "learning_rate": 0.00013544057576347015, "loss": 4.6114, "step": 3325 }, { "epoch": 0.9700328107911046, "grad_norm": 2.0178277492523193, "learning_rate": 0.00013542112429488427, "loss": 4.3354, "step": 3326 }, { "epoch": 0.9703244622675903, "grad_norm": 3.3865280151367188, "learning_rate": 0.0001354016728262984, "loss": 4.4372, "step": 3327 }, { "epoch": 0.9706161137440759, "grad_norm": 1.8197108507156372, "learning_rate": 0.00013538222135771251, "loss": 4.4075, "step": 3328 }, { "epoch": 0.9709077652205614, "grad_norm": 3.129798173904419, "learning_rate": 0.00013536276988912664, "loss": 4.2307, "step": 3329 }, { "epoch": 0.971199416697047, "grad_norm": 2.4602432250976562, "learning_rate": 0.00013534331842054076, "loss": 4.2478, "step": 3330 }, { "epoch": 0.9714910681735326, "grad_norm": 2.994563341140747, "learning_rate": 0.00013532386695195488, "loss": 4.4569, "step": 3331 }, { "epoch": 0.9717827196500183, "grad_norm": 1.5763145685195923, "learning_rate": 0.000135304415483369, "loss": 4.1776, "step": 3332 }, { "epoch": 0.9720743711265039, "grad_norm": 2.2283148765563965, "learning_rate": 0.00013528496401478312, "loss": 4.303, "step": 3333 }, { "epoch": 0.9723660226029894, "grad_norm": 2.093698024749756, "learning_rate": 0.00013526551254619724, "loss": 4.505, "step": 3334 }, { "epoch": 0.972657674079475, "grad_norm": 1.9983179569244385, "learning_rate": 0.00013524606107761136, "loss": 4.3768, "step": 3335 }, { "epoch": 0.9729493255559606, "grad_norm": 2.5612010955810547, "learning_rate": 0.0001352266096090255, "loss": 4.274, "step": 3336 }, { "epoch": 0.9732409770324463, "grad_norm": 2.682288885116577, "learning_rate": 0.0001352071581404396, "loss": 4.5029, "step": 3337 }, { "epoch": 0.9735326285089319, "grad_norm": 1.758858323097229, "learning_rate": 0.00013518770667185373, "loss": 4.1539, "step": 3338 }, { "epoch": 0.9738242799854174, "grad_norm": 2.0186307430267334, "learning_rate": 0.00013516825520326785, "loss": 4.3043, "step": 3339 }, { "epoch": 0.974115931461903, "grad_norm": 2.5944607257843018, "learning_rate": 0.00013514880373468197, "loss": 4.3999, "step": 3340 }, { "epoch": 0.9744075829383886, "grad_norm": 2.2242677211761475, "learning_rate": 0.0001351293522660961, "loss": 4.473, "step": 3341 }, { "epoch": 0.9746992344148743, "grad_norm": 2.8842613697052, "learning_rate": 0.00013510990079751022, "loss": 4.55, "step": 3342 }, { "epoch": 0.9749908858913598, "grad_norm": 3.756056308746338, "learning_rate": 0.00013509044932892434, "loss": 4.5723, "step": 3343 }, { "epoch": 0.9752825373678454, "grad_norm": 2.567288637161255, "learning_rate": 0.00013507099786033846, "loss": 4.2099, "step": 3344 }, { "epoch": 0.975574188844331, "grad_norm": 2.9030797481536865, "learning_rate": 0.00013505154639175258, "loss": 4.4122, "step": 3345 }, { "epoch": 0.9758658403208166, "grad_norm": 2.305860757827759, "learning_rate": 0.0001350320949231667, "loss": 4.3606, "step": 3346 }, { "epoch": 0.9761574917973023, "grad_norm": 2.688826322555542, "learning_rate": 0.00013501264345458082, "loss": 4.3099, "step": 3347 }, { "epoch": 0.9764491432737878, "grad_norm": 2.161597967147827, "learning_rate": 0.00013499319198599497, "loss": 4.2936, "step": 3348 }, { "epoch": 0.9767407947502734, "grad_norm": 2.9082024097442627, "learning_rate": 0.00013497374051740907, "loss": 4.5383, "step": 3349 }, { "epoch": 0.977032446226759, "grad_norm": 1.6603773832321167, "learning_rate": 0.0001349542890488232, "loss": 4.2277, "step": 3350 }, { "epoch": 0.9773240977032446, "grad_norm": 1.6080988645553589, "learning_rate": 0.0001349348375802373, "loss": 4.1536, "step": 3351 }, { "epoch": 0.9776157491797303, "grad_norm": 1.949001669883728, "learning_rate": 0.00013491538611165143, "loss": 4.336, "step": 3352 }, { "epoch": 0.9779074006562158, "grad_norm": 2.5155091285705566, "learning_rate": 0.00013489593464306555, "loss": 4.4545, "step": 3353 }, { "epoch": 0.9781990521327014, "grad_norm": 1.9257959127426147, "learning_rate": 0.00013487648317447968, "loss": 4.2728, "step": 3354 }, { "epoch": 0.978490703609187, "grad_norm": 2.069377899169922, "learning_rate": 0.00013485703170589382, "loss": 4.2128, "step": 3355 }, { "epoch": 0.9787823550856726, "grad_norm": 3.143242120742798, "learning_rate": 0.00013483758023730792, "loss": 4.0521, "step": 3356 }, { "epoch": 0.9790740065621583, "grad_norm": 1.9779025316238403, "learning_rate": 0.00013481812876872204, "loss": 4.423, "step": 3357 }, { "epoch": 0.9793656580386438, "grad_norm": 2.5320494174957275, "learning_rate": 0.00013479867730013616, "loss": 4.3114, "step": 3358 }, { "epoch": 0.9796573095151294, "grad_norm": 2.7331418991088867, "learning_rate": 0.00013477922583155028, "loss": 4.1467, "step": 3359 }, { "epoch": 0.979948960991615, "grad_norm": 2.6965291500091553, "learning_rate": 0.0001347597743629644, "loss": 4.5223, "step": 3360 }, { "epoch": 0.9802406124681006, "grad_norm": 2.0994315147399902, "learning_rate": 0.00013474032289437853, "loss": 4.5369, "step": 3361 }, { "epoch": 0.9805322639445863, "grad_norm": 3.2121825218200684, "learning_rate": 0.00013472087142579265, "loss": 4.6144, "step": 3362 }, { "epoch": 0.9808239154210718, "grad_norm": 2.1612186431884766, "learning_rate": 0.00013470141995720677, "loss": 4.1224, "step": 3363 }, { "epoch": 0.9811155668975574, "grad_norm": 2.3009915351867676, "learning_rate": 0.0001346819684886209, "loss": 4.2, "step": 3364 }, { "epoch": 0.981407218374043, "grad_norm": 2.0910804271698, "learning_rate": 0.00013466251702003504, "loss": 4.2331, "step": 3365 }, { "epoch": 0.9816988698505286, "grad_norm": 2.146249771118164, "learning_rate": 0.00013464306555144913, "loss": 4.3092, "step": 3366 }, { "epoch": 0.9819905213270143, "grad_norm": 2.4582979679107666, "learning_rate": 0.00013462361408286326, "loss": 4.2032, "step": 3367 }, { "epoch": 0.9822821728034998, "grad_norm": 3.3958046436309814, "learning_rate": 0.00013460416261427738, "loss": 4.624, "step": 3368 }, { "epoch": 0.9825738242799854, "grad_norm": 3.801509380340576, "learning_rate": 0.0001345847111456915, "loss": 4.1764, "step": 3369 }, { "epoch": 0.982865475756471, "grad_norm": 2.3541159629821777, "learning_rate": 0.00013456525967710562, "loss": 4.3319, "step": 3370 }, { "epoch": 0.9831571272329566, "grad_norm": 2.2629382610321045, "learning_rate": 0.00013454580820851974, "loss": 4.2809, "step": 3371 }, { "epoch": 0.9834487787094423, "grad_norm": 2.1117782592773438, "learning_rate": 0.0001345263567399339, "loss": 4.1726, "step": 3372 }, { "epoch": 0.9837404301859278, "grad_norm": 3.030406951904297, "learning_rate": 0.00013450690527134799, "loss": 4.3387, "step": 3373 }, { "epoch": 0.9840320816624134, "grad_norm": 2.385657787322998, "learning_rate": 0.0001344874538027621, "loss": 4.5193, "step": 3374 }, { "epoch": 0.984323733138899, "grad_norm": 2.79909610748291, "learning_rate": 0.00013446800233417626, "loss": 4.5609, "step": 3375 }, { "epoch": 0.9846153846153847, "grad_norm": 2.5138487815856934, "learning_rate": 0.00013444855086559035, "loss": 4.4741, "step": 3376 }, { "epoch": 0.9849070360918702, "grad_norm": 1.9415202140808105, "learning_rate": 0.00013442909939700447, "loss": 4.3013, "step": 3377 }, { "epoch": 0.9851986875683558, "grad_norm": 2.036658763885498, "learning_rate": 0.0001344096479284186, "loss": 4.3199, "step": 3378 }, { "epoch": 0.9854903390448414, "grad_norm": 2.416813850402832, "learning_rate": 0.00013439019645983274, "loss": 4.5685, "step": 3379 }, { "epoch": 0.985781990521327, "grad_norm": 2.3811349868774414, "learning_rate": 0.00013437074499124684, "loss": 4.4523, "step": 3380 }, { "epoch": 0.9860736419978127, "grad_norm": 1.6081657409667969, "learning_rate": 0.00013435129352266096, "loss": 4.4479, "step": 3381 }, { "epoch": 0.9863652934742982, "grad_norm": 3.2071213722229004, "learning_rate": 0.0001343318420540751, "loss": 4.223, "step": 3382 }, { "epoch": 0.9866569449507838, "grad_norm": 2.2453970909118652, "learning_rate": 0.0001343123905854892, "loss": 4.0124, "step": 3383 }, { "epoch": 0.9869485964272694, "grad_norm": 2.2305514812469482, "learning_rate": 0.00013429293911690332, "loss": 4.3628, "step": 3384 }, { "epoch": 0.987240247903755, "grad_norm": 2.8505256175994873, "learning_rate": 0.00013427348764831747, "loss": 4.3726, "step": 3385 }, { "epoch": 0.9875318993802407, "grad_norm": 2.122316598892212, "learning_rate": 0.00013425403617973157, "loss": 4.6364, "step": 3386 }, { "epoch": 0.9878235508567262, "grad_norm": 2.2559115886688232, "learning_rate": 0.0001342345847111457, "loss": 4.0917, "step": 3387 }, { "epoch": 0.9881152023332118, "grad_norm": 1.8466589450836182, "learning_rate": 0.0001342151332425598, "loss": 4.281, "step": 3388 }, { "epoch": 0.9884068538096974, "grad_norm": 1.7688978910446167, "learning_rate": 0.00013419568177397396, "loss": 4.0182, "step": 3389 }, { "epoch": 0.988698505286183, "grad_norm": 2.058769464492798, "learning_rate": 0.00013417623030538805, "loss": 3.9783, "step": 3390 }, { "epoch": 0.9889901567626687, "grad_norm": 4.333552837371826, "learning_rate": 0.00013415677883680217, "loss": 4.3607, "step": 3391 }, { "epoch": 0.9892818082391542, "grad_norm": 3.0012319087982178, "learning_rate": 0.00013413732736821632, "loss": 4.3864, "step": 3392 }, { "epoch": 0.9895734597156398, "grad_norm": 1.8415720462799072, "learning_rate": 0.00013411787589963042, "loss": 4.4152, "step": 3393 }, { "epoch": 0.9898651111921254, "grad_norm": 2.0388059616088867, "learning_rate": 0.00013409842443104454, "loss": 4.0174, "step": 3394 }, { "epoch": 0.990156762668611, "grad_norm": 2.173696279525757, "learning_rate": 0.0001340789729624587, "loss": 4.2281, "step": 3395 }, { "epoch": 0.9904484141450967, "grad_norm": 2.2226810455322266, "learning_rate": 0.0001340595214938728, "loss": 4.4442, "step": 3396 }, { "epoch": 0.9907400656215822, "grad_norm": 2.334852695465088, "learning_rate": 0.0001340400700252869, "loss": 4.2936, "step": 3397 }, { "epoch": 0.9910317170980678, "grad_norm": 1.8775092363357544, "learning_rate": 0.00013402061855670103, "loss": 4.2003, "step": 3398 }, { "epoch": 0.9913233685745534, "grad_norm": 2.199392557144165, "learning_rate": 0.00013400116708811517, "loss": 4.5759, "step": 3399 }, { "epoch": 0.991615020051039, "grad_norm": 3.2689011096954346, "learning_rate": 0.00013398171561952927, "loss": 4.5362, "step": 3400 }, { "epoch": 0.9919066715275247, "grad_norm": 1.9181398153305054, "learning_rate": 0.0001339622641509434, "loss": 4.1359, "step": 3401 }, { "epoch": 0.9921983230040102, "grad_norm": 2.3260409832000732, "learning_rate": 0.00013394281268235754, "loss": 4.3319, "step": 3402 }, { "epoch": 0.9924899744804958, "grad_norm": 2.20883846282959, "learning_rate": 0.00013392336121377166, "loss": 4.4419, "step": 3403 }, { "epoch": 0.9927816259569814, "grad_norm": 1.745679259300232, "learning_rate": 0.00013390390974518575, "loss": 4.2882, "step": 3404 }, { "epoch": 0.993073277433467, "grad_norm": 1.6804050207138062, "learning_rate": 0.0001338844582765999, "loss": 4.4328, "step": 3405 }, { "epoch": 0.9933649289099526, "grad_norm": 2.3120272159576416, "learning_rate": 0.00013386500680801403, "loss": 4.5019, "step": 3406 }, { "epoch": 0.9936565803864382, "grad_norm": 2.2810049057006836, "learning_rate": 0.00013384555533942812, "loss": 4.2739, "step": 3407 }, { "epoch": 0.9939482318629238, "grad_norm": 2.638453960418701, "learning_rate": 0.00013382610387084224, "loss": 3.9183, "step": 3408 }, { "epoch": 0.9942398833394094, "grad_norm": 2.1183972358703613, "learning_rate": 0.0001338066524022564, "loss": 4.369, "step": 3409 }, { "epoch": 0.994531534815895, "grad_norm": 1.8477228879928589, "learning_rate": 0.00013378720093367048, "loss": 4.2782, "step": 3410 }, { "epoch": 0.9948231862923806, "grad_norm": 2.4741878509521484, "learning_rate": 0.0001337677494650846, "loss": 4.1427, "step": 3411 }, { "epoch": 0.9951148377688662, "grad_norm": 1.881752848625183, "learning_rate": 0.00013374829799649875, "loss": 4.3725, "step": 3412 }, { "epoch": 0.9954064892453518, "grad_norm": 1.776281714439392, "learning_rate": 0.00013372884652791288, "loss": 4.2751, "step": 3413 }, { "epoch": 0.9956981407218374, "grad_norm": 3.017777919769287, "learning_rate": 0.00013370939505932697, "loss": 4.3216, "step": 3414 }, { "epoch": 0.995989792198323, "grad_norm": 2.6199123859405518, "learning_rate": 0.00013368994359074112, "loss": 4.3344, "step": 3415 }, { "epoch": 0.9962814436748086, "grad_norm": 2.4073708057403564, "learning_rate": 0.00013367049212215524, "loss": 4.493, "step": 3416 }, { "epoch": 0.9965730951512942, "grad_norm": 3.034299850463867, "learning_rate": 0.00013365104065356934, "loss": 4.2879, "step": 3417 }, { "epoch": 0.9968647466277798, "grad_norm": 2.5459437370300293, "learning_rate": 0.00013363158918498346, "loss": 4.3978, "step": 3418 }, { "epoch": 0.9971563981042654, "grad_norm": 3.386188507080078, "learning_rate": 0.0001336121377163976, "loss": 4.2826, "step": 3419 }, { "epoch": 0.997448049580751, "grad_norm": 2.331772804260254, "learning_rate": 0.00013359268624781173, "loss": 4.5226, "step": 3420 }, { "epoch": 0.9977397010572366, "grad_norm": 3.9631764888763428, "learning_rate": 0.00013357323477922582, "loss": 4.4312, "step": 3421 }, { "epoch": 0.9980313525337222, "grad_norm": 2.6952857971191406, "learning_rate": 0.00013355378331063997, "loss": 4.2005, "step": 3422 }, { "epoch": 0.9983230040102078, "grad_norm": 2.164630174636841, "learning_rate": 0.0001335343318420541, "loss": 4.2183, "step": 3423 }, { "epoch": 0.9986146554866934, "grad_norm": 2.412306547164917, "learning_rate": 0.0001335148803734682, "loss": 4.5236, "step": 3424 }, { "epoch": 0.998906306963179, "grad_norm": 2.019413709640503, "learning_rate": 0.00013349542890488234, "loss": 4.5985, "step": 3425 }, { "epoch": 0.9991979584396646, "grad_norm": 2.560671329498291, "learning_rate": 0.00013347597743629646, "loss": 4.2116, "step": 3426 }, { "epoch": 0.9994896099161502, "grad_norm": 3.646475315093994, "learning_rate": 0.00013345652596771058, "loss": 4.4596, "step": 3427 }, { "epoch": 0.9997812613926358, "grad_norm": 5.713184833526611, "learning_rate": 0.00013343707449912467, "loss": 4.8844, "step": 3428 }, { "epoch": 1.0, "grad_norm": 2.416008234024048, "learning_rate": 0.00013341762303053882, "loss": 4.1038, "step": 3429 }, { "epoch": 1.0002916514764857, "grad_norm": 2.3326210975646973, "learning_rate": 0.00013339817156195294, "loss": 4.1019, "step": 3430 }, { "epoch": 1.0005833029529712, "grad_norm": 1.8867557048797607, "learning_rate": 0.00013337872009336704, "loss": 3.823, "step": 3431 }, { "epoch": 1.0008749544294568, "grad_norm": 2.364210844039917, "learning_rate": 0.0001333592686247812, "loss": 4.1966, "step": 3432 }, { "epoch": 1.0011666059059423, "grad_norm": 2.061522960662842, "learning_rate": 0.0001333398171561953, "loss": 3.9268, "step": 3433 }, { "epoch": 1.001458257382428, "grad_norm": 2.813209295272827, "learning_rate": 0.00013332036568760943, "loss": 3.8014, "step": 3434 }, { "epoch": 1.0017499088589137, "grad_norm": 2.073108434677124, "learning_rate": 0.00013330091421902352, "loss": 3.8287, "step": 3435 }, { "epoch": 1.0020415603353992, "grad_norm": 2.554475784301758, "learning_rate": 0.00013328146275043767, "loss": 4.1206, "step": 3436 }, { "epoch": 1.0023332118118848, "grad_norm": 2.361034870147705, "learning_rate": 0.0001332620112818518, "loss": 4.4184, "step": 3437 }, { "epoch": 1.0026248632883703, "grad_norm": 2.7499959468841553, "learning_rate": 0.0001332425598132659, "loss": 4.2613, "step": 3438 }, { "epoch": 1.002916514764856, "grad_norm": 2.3989450931549072, "learning_rate": 0.00013322310834468004, "loss": 4.0864, "step": 3439 }, { "epoch": 1.0032081662413417, "grad_norm": 2.762314796447754, "learning_rate": 0.00013320365687609416, "loss": 3.8297, "step": 3440 }, { "epoch": 1.0034998177178271, "grad_norm": 2.1619298458099365, "learning_rate": 0.00013318420540750825, "loss": 3.9648, "step": 3441 }, { "epoch": 1.0037914691943128, "grad_norm": 2.1120946407318115, "learning_rate": 0.0001331647539389224, "loss": 4.0067, "step": 3442 }, { "epoch": 1.0040831206707983, "grad_norm": 2.241726875305176, "learning_rate": 0.00013314530247033652, "loss": 4.0437, "step": 3443 }, { "epoch": 1.004374772147284, "grad_norm": 2.2102160453796387, "learning_rate": 0.00013312585100175065, "loss": 3.874, "step": 3444 }, { "epoch": 1.0046664236237697, "grad_norm": 1.9475786685943604, "learning_rate": 0.00013310639953316474, "loss": 3.818, "step": 3445 }, { "epoch": 1.0049580751002551, "grad_norm": 2.465104341506958, "learning_rate": 0.0001330869480645789, "loss": 3.8767, "step": 3446 }, { "epoch": 1.0052497265767408, "grad_norm": 1.9329520463943481, "learning_rate": 0.000133067496595993, "loss": 3.9703, "step": 3447 }, { "epoch": 1.0055413780532263, "grad_norm": 3.4502336978912354, "learning_rate": 0.0001330480451274071, "loss": 3.9905, "step": 3448 }, { "epoch": 1.005833029529712, "grad_norm": 2.5210745334625244, "learning_rate": 0.00013302859365882125, "loss": 4.0026, "step": 3449 }, { "epoch": 1.0061246810061977, "grad_norm": 1.8097997903823853, "learning_rate": 0.00013300914219023538, "loss": 4.0491, "step": 3450 }, { "epoch": 1.0064163324826831, "grad_norm": 2.1037564277648926, "learning_rate": 0.0001329896907216495, "loss": 4.1583, "step": 3451 }, { "epoch": 1.0067079839591688, "grad_norm": 2.2534127235412598, "learning_rate": 0.00013297023925306362, "loss": 3.9757, "step": 3452 }, { "epoch": 1.0069996354356543, "grad_norm": 2.145324468612671, "learning_rate": 0.00013295078778447774, "loss": 3.9729, "step": 3453 }, { "epoch": 1.00729128691214, "grad_norm": 2.2657132148742676, "learning_rate": 0.00013293133631589186, "loss": 4.1803, "step": 3454 }, { "epoch": 1.0075829383886257, "grad_norm": 3.1951446533203125, "learning_rate": 0.00013291188484730596, "loss": 4.1286, "step": 3455 }, { "epoch": 1.0078745898651111, "grad_norm": 1.9480671882629395, "learning_rate": 0.0001328924333787201, "loss": 3.929, "step": 3456 }, { "epoch": 1.0081662413415968, "grad_norm": 2.356607675552368, "learning_rate": 0.00013287298191013423, "loss": 4.213, "step": 3457 }, { "epoch": 1.0084578928180823, "grad_norm": 3.604018211364746, "learning_rate": 0.00013285353044154835, "loss": 4.0556, "step": 3458 }, { "epoch": 1.008749544294568, "grad_norm": 2.105424404144287, "learning_rate": 0.00013283407897296247, "loss": 4.0223, "step": 3459 }, { "epoch": 1.0090411957710537, "grad_norm": 2.0605576038360596, "learning_rate": 0.0001328146275043766, "loss": 4.0876, "step": 3460 }, { "epoch": 1.0093328472475391, "grad_norm": 2.3625588417053223, "learning_rate": 0.0001327951760357907, "loss": 3.9944, "step": 3461 }, { "epoch": 1.0096244987240248, "grad_norm": 2.2712228298187256, "learning_rate": 0.00013277572456720483, "loss": 3.8626, "step": 3462 }, { "epoch": 1.0099161502005103, "grad_norm": 2.0972707271575928, "learning_rate": 0.00013275627309861896, "loss": 3.7791, "step": 3463 }, { "epoch": 1.010207801676996, "grad_norm": 2.1613776683807373, "learning_rate": 0.00013273682163003308, "loss": 4.0142, "step": 3464 }, { "epoch": 1.0104994531534817, "grad_norm": 2.5306122303009033, "learning_rate": 0.00013271737016144717, "loss": 4.0744, "step": 3465 }, { "epoch": 1.0107911046299671, "grad_norm": 2.114380121231079, "learning_rate": 0.00013269791869286132, "loss": 3.7941, "step": 3466 }, { "epoch": 1.0110827561064528, "grad_norm": 3.399566650390625, "learning_rate": 0.00013267846722427544, "loss": 3.9884, "step": 3467 }, { "epoch": 1.0113744075829383, "grad_norm": 2.2822439670562744, "learning_rate": 0.00013265901575568956, "loss": 3.9046, "step": 3468 }, { "epoch": 1.011666059059424, "grad_norm": 2.0032076835632324, "learning_rate": 0.00013263956428710369, "loss": 4.1016, "step": 3469 }, { "epoch": 1.0119577105359097, "grad_norm": 2.138728141784668, "learning_rate": 0.0001326201128185178, "loss": 4.0039, "step": 3470 }, { "epoch": 1.0122493620123951, "grad_norm": 2.070495128631592, "learning_rate": 0.00013260066134993193, "loss": 4.206, "step": 3471 }, { "epoch": 1.0125410134888808, "grad_norm": 2.4441587924957275, "learning_rate": 0.00013258120988134605, "loss": 4.0003, "step": 3472 }, { "epoch": 1.0128326649653663, "grad_norm": 2.053699254989624, "learning_rate": 0.00013256175841276017, "loss": 4.2988, "step": 3473 }, { "epoch": 1.013124316441852, "grad_norm": 3.923792600631714, "learning_rate": 0.0001325423069441743, "loss": 4.3502, "step": 3474 }, { "epoch": 1.0134159679183377, "grad_norm": 2.7436447143554688, "learning_rate": 0.00013252285547558842, "loss": 4.0884, "step": 3475 }, { "epoch": 1.0137076193948231, "grad_norm": 2.1527891159057617, "learning_rate": 0.00013250340400700254, "loss": 4.3831, "step": 3476 }, { "epoch": 1.0139992708713088, "grad_norm": 2.040104389190674, "learning_rate": 0.00013248395253841666, "loss": 4.2566, "step": 3477 }, { "epoch": 1.0142909223477943, "grad_norm": 2.509568691253662, "learning_rate": 0.00013246450106983078, "loss": 3.9972, "step": 3478 }, { "epoch": 1.01458257382428, "grad_norm": 2.2291972637176514, "learning_rate": 0.0001324450496012449, "loss": 4.2423, "step": 3479 }, { "epoch": 1.0148742253007657, "grad_norm": 2.7217116355895996, "learning_rate": 0.00013242559813265902, "loss": 4.1277, "step": 3480 }, { "epoch": 1.0151658767772511, "grad_norm": 2.5122575759887695, "learning_rate": 0.00013240614666407314, "loss": 3.9478, "step": 3481 }, { "epoch": 1.0154575282537368, "grad_norm": 2.0958616733551025, "learning_rate": 0.00013238669519548727, "loss": 4.2337, "step": 3482 }, { "epoch": 1.0157491797302223, "grad_norm": 2.3221218585968018, "learning_rate": 0.0001323672437269014, "loss": 4.4201, "step": 3483 }, { "epoch": 1.016040831206708, "grad_norm": 2.7179267406463623, "learning_rate": 0.0001323477922583155, "loss": 4.0863, "step": 3484 }, { "epoch": 1.0163324826831936, "grad_norm": 2.828907012939453, "learning_rate": 0.00013232834078972963, "loss": 4.009, "step": 3485 }, { "epoch": 1.0166241341596791, "grad_norm": 1.9848181009292603, "learning_rate": 0.00013230888932114375, "loss": 4.0795, "step": 3486 }, { "epoch": 1.0169157856361648, "grad_norm": 2.093888282775879, "learning_rate": 0.00013228943785255787, "loss": 3.9582, "step": 3487 }, { "epoch": 1.0172074371126505, "grad_norm": 1.7861733436584473, "learning_rate": 0.000132269986383972, "loss": 3.9559, "step": 3488 }, { "epoch": 1.017499088589136, "grad_norm": 3.272711753845215, "learning_rate": 0.00013225053491538612, "loss": 4.026, "step": 3489 }, { "epoch": 1.0177907400656216, "grad_norm": 3.1349198818206787, "learning_rate": 0.00013223108344680024, "loss": 3.9057, "step": 3490 }, { "epoch": 1.0180823915421071, "grad_norm": 2.369363307952881, "learning_rate": 0.00013221163197821436, "loss": 4.2099, "step": 3491 }, { "epoch": 1.0183740430185928, "grad_norm": 2.7979862689971924, "learning_rate": 0.00013219218050962848, "loss": 3.8509, "step": 3492 }, { "epoch": 1.0186656944950785, "grad_norm": 2.430081605911255, "learning_rate": 0.0001321727290410426, "loss": 4.1317, "step": 3493 }, { "epoch": 1.018957345971564, "grad_norm": 2.3532679080963135, "learning_rate": 0.00013215327757245673, "loss": 4.14, "step": 3494 }, { "epoch": 1.0192489974480496, "grad_norm": 2.2440614700317383, "learning_rate": 0.00013213382610387085, "loss": 4.0705, "step": 3495 }, { "epoch": 1.019540648924535, "grad_norm": 2.311441421508789, "learning_rate": 0.00013211437463528497, "loss": 4.1615, "step": 3496 }, { "epoch": 1.0198323004010208, "grad_norm": 3.0656111240386963, "learning_rate": 0.0001320949231666991, "loss": 3.9978, "step": 3497 }, { "epoch": 1.0201239518775065, "grad_norm": 3.2095048427581787, "learning_rate": 0.0001320754716981132, "loss": 4.131, "step": 3498 }, { "epoch": 1.020415603353992, "grad_norm": 2.4055562019348145, "learning_rate": 0.00013205602022952733, "loss": 4.0879, "step": 3499 }, { "epoch": 1.0207072548304776, "grad_norm": 2.897052526473999, "learning_rate": 0.00013203656876094146, "loss": 4.003, "step": 3500 }, { "epoch": 1.020998906306963, "grad_norm": 1.9802230596542358, "learning_rate": 0.00013201711729235558, "loss": 3.9862, "step": 3501 }, { "epoch": 1.0212905577834488, "grad_norm": 2.290489912033081, "learning_rate": 0.0001319976658237697, "loss": 4.1314, "step": 3502 }, { "epoch": 1.0215822092599345, "grad_norm": 2.3104827404022217, "learning_rate": 0.00013197821435518382, "loss": 4.1135, "step": 3503 }, { "epoch": 1.02187386073642, "grad_norm": 2.064621686935425, "learning_rate": 0.00013195876288659794, "loss": 4.07, "step": 3504 }, { "epoch": 1.0221655122129056, "grad_norm": 2.8410487174987793, "learning_rate": 0.00013193931141801206, "loss": 3.8922, "step": 3505 }, { "epoch": 1.022457163689391, "grad_norm": 1.8682936429977417, "learning_rate": 0.00013191985994942618, "loss": 4.1378, "step": 3506 }, { "epoch": 1.0227488151658768, "grad_norm": 2.200676918029785, "learning_rate": 0.0001319004084808403, "loss": 4.2078, "step": 3507 }, { "epoch": 1.0230404666423625, "grad_norm": 2.003002166748047, "learning_rate": 0.00013188095701225443, "loss": 3.8158, "step": 3508 }, { "epoch": 1.023332118118848, "grad_norm": 2.208402395248413, "learning_rate": 0.00013186150554366855, "loss": 4.1986, "step": 3509 }, { "epoch": 1.0236237695953336, "grad_norm": 2.9571359157562256, "learning_rate": 0.00013184205407508267, "loss": 4.286, "step": 3510 }, { "epoch": 1.023915421071819, "grad_norm": 3.380676507949829, "learning_rate": 0.0001318226026064968, "loss": 4.1453, "step": 3511 }, { "epoch": 1.0242070725483048, "grad_norm": 3.2041282653808594, "learning_rate": 0.00013180315113791094, "loss": 4.2606, "step": 3512 }, { "epoch": 1.0244987240247905, "grad_norm": 2.2913107872009277, "learning_rate": 0.00013178369966932504, "loss": 3.891, "step": 3513 }, { "epoch": 1.024790375501276, "grad_norm": 2.191253900527954, "learning_rate": 0.00013176424820073916, "loss": 3.9597, "step": 3514 }, { "epoch": 1.0250820269777616, "grad_norm": 1.8912781476974487, "learning_rate": 0.00013174479673215328, "loss": 4.3877, "step": 3515 }, { "epoch": 1.025373678454247, "grad_norm": 3.1324381828308105, "learning_rate": 0.0001317253452635674, "loss": 4.0873, "step": 3516 }, { "epoch": 1.0256653299307328, "grad_norm": 2.3171226978302, "learning_rate": 0.00013170589379498152, "loss": 3.9941, "step": 3517 }, { "epoch": 1.0259569814072185, "grad_norm": 2.273911476135254, "learning_rate": 0.00013168644232639564, "loss": 3.997, "step": 3518 }, { "epoch": 1.026248632883704, "grad_norm": 2.576537847518921, "learning_rate": 0.0001316669908578098, "loss": 4.3809, "step": 3519 }, { "epoch": 1.0265402843601896, "grad_norm": 2.850043296813965, "learning_rate": 0.0001316475393892239, "loss": 4.1824, "step": 3520 }, { "epoch": 1.026831935836675, "grad_norm": 1.872612714767456, "learning_rate": 0.000131628087920638, "loss": 3.9912, "step": 3521 }, { "epoch": 1.0271235873131608, "grad_norm": 2.7295124530792236, "learning_rate": 0.00013160863645205213, "loss": 4.1479, "step": 3522 }, { "epoch": 1.0274152387896465, "grad_norm": 2.028632879257202, "learning_rate": 0.00013158918498346625, "loss": 4.183, "step": 3523 }, { "epoch": 1.027706890266132, "grad_norm": 3.0232629776000977, "learning_rate": 0.00013156973351488037, "loss": 4.2364, "step": 3524 }, { "epoch": 1.0279985417426176, "grad_norm": 2.5391080379486084, "learning_rate": 0.0001315502820462945, "loss": 4.1137, "step": 3525 }, { "epoch": 1.028290193219103, "grad_norm": 3.173473596572876, "learning_rate": 0.00013153083057770864, "loss": 4.1599, "step": 3526 }, { "epoch": 1.0285818446955888, "grad_norm": 1.9686180353164673, "learning_rate": 0.00013151137910912274, "loss": 4.0302, "step": 3527 }, { "epoch": 1.0288734961720745, "grad_norm": 2.044929265975952, "learning_rate": 0.00013149192764053686, "loss": 4.2238, "step": 3528 }, { "epoch": 1.02916514764856, "grad_norm": 2.2928390502929688, "learning_rate": 0.000131472476171951, "loss": 3.9998, "step": 3529 }, { "epoch": 1.0294567991250456, "grad_norm": 1.8191088438034058, "learning_rate": 0.0001314530247033651, "loss": 4.212, "step": 3530 }, { "epoch": 1.029748450601531, "grad_norm": 2.1753323078155518, "learning_rate": 0.00013143357323477922, "loss": 4.2597, "step": 3531 }, { "epoch": 1.0300401020780168, "grad_norm": 1.8814165592193604, "learning_rate": 0.00013141412176619335, "loss": 4.1449, "step": 3532 }, { "epoch": 1.0303317535545025, "grad_norm": 2.356252670288086, "learning_rate": 0.00013139467029760747, "loss": 3.9215, "step": 3533 }, { "epoch": 1.030623405030988, "grad_norm": 2.839491844177246, "learning_rate": 0.0001313752188290216, "loss": 4.0525, "step": 3534 }, { "epoch": 1.0309150565074736, "grad_norm": 1.9469653367996216, "learning_rate": 0.0001313557673604357, "loss": 4.1216, "step": 3535 }, { "epoch": 1.031206707983959, "grad_norm": 2.0754892826080322, "learning_rate": 0.00013133631589184986, "loss": 3.6873, "step": 3536 }, { "epoch": 1.0314983594604448, "grad_norm": 2.179924964904785, "learning_rate": 0.00013131686442326395, "loss": 3.8863, "step": 3537 }, { "epoch": 1.0317900109369305, "grad_norm": 2.168010950088501, "learning_rate": 0.00013129741295467808, "loss": 4.1382, "step": 3538 }, { "epoch": 1.032081662413416, "grad_norm": 2.043322801589966, "learning_rate": 0.00013127796148609222, "loss": 4.1919, "step": 3539 }, { "epoch": 1.0323733138899016, "grad_norm": 2.0355608463287354, "learning_rate": 0.00013125851001750632, "loss": 4.4338, "step": 3540 }, { "epoch": 1.032664965366387, "grad_norm": 3.1718857288360596, "learning_rate": 0.00013123905854892044, "loss": 4.0882, "step": 3541 }, { "epoch": 1.0329566168428728, "grad_norm": 3.051480531692505, "learning_rate": 0.00013121960708033456, "loss": 3.8931, "step": 3542 }, { "epoch": 1.0332482683193585, "grad_norm": 2.6350018978118896, "learning_rate": 0.0001312001556117487, "loss": 4.0101, "step": 3543 }, { "epoch": 1.033539919795844, "grad_norm": 5.007105350494385, "learning_rate": 0.0001311807041431628, "loss": 4.2586, "step": 3544 }, { "epoch": 1.0338315712723296, "grad_norm": 2.4072067737579346, "learning_rate": 0.00013116125267457693, "loss": 4.0177, "step": 3545 }, { "epoch": 1.034123222748815, "grad_norm": 2.2894346714019775, "learning_rate": 0.00013114180120599108, "loss": 4.1517, "step": 3546 }, { "epoch": 1.0344148742253008, "grad_norm": 2.5950119495391846, "learning_rate": 0.00013112234973740517, "loss": 4.0663, "step": 3547 }, { "epoch": 1.0347065257017865, "grad_norm": 2.1868672370910645, "learning_rate": 0.0001311028982688193, "loss": 3.8046, "step": 3548 }, { "epoch": 1.034998177178272, "grad_norm": 2.5395150184631348, "learning_rate": 0.00013108344680023344, "loss": 4.3, "step": 3549 }, { "epoch": 1.0352898286547576, "grad_norm": 2.0507733821868896, "learning_rate": 0.00013106399533164756, "loss": 4.1228, "step": 3550 }, { "epoch": 1.035581480131243, "grad_norm": 2.2777962684631348, "learning_rate": 0.00013104454386306166, "loss": 4.4237, "step": 3551 }, { "epoch": 1.0358731316077288, "grad_norm": 3.6316025257110596, "learning_rate": 0.00013102509239447578, "loss": 4.0732, "step": 3552 }, { "epoch": 1.0361647830842144, "grad_norm": 2.334852933883667, "learning_rate": 0.00013100564092588993, "loss": 3.9437, "step": 3553 }, { "epoch": 1.0364564345607, "grad_norm": 2.249410390853882, "learning_rate": 0.00013098618945730402, "loss": 4.1779, "step": 3554 }, { "epoch": 1.0367480860371856, "grad_norm": 2.30305814743042, "learning_rate": 0.00013096673798871814, "loss": 4.1829, "step": 3555 }, { "epoch": 1.037039737513671, "grad_norm": 3.253514289855957, "learning_rate": 0.0001309472865201323, "loss": 4.1624, "step": 3556 }, { "epoch": 1.0373313889901568, "grad_norm": 2.472224473953247, "learning_rate": 0.00013092783505154639, "loss": 4.1573, "step": 3557 }, { "epoch": 1.0376230404666424, "grad_norm": 2.4491541385650635, "learning_rate": 0.0001309083835829605, "loss": 3.993, "step": 3558 }, { "epoch": 1.037914691943128, "grad_norm": 3.5965380668640137, "learning_rate": 0.00013088893211437466, "loss": 4.1016, "step": 3559 }, { "epoch": 1.0382063434196136, "grad_norm": 3.1952896118164062, "learning_rate": 0.00013086948064578878, "loss": 3.7384, "step": 3560 }, { "epoch": 1.038497994896099, "grad_norm": 2.412155866622925, "learning_rate": 0.00013085002917720287, "loss": 4.2877, "step": 3561 }, { "epoch": 1.0387896463725848, "grad_norm": 2.4900898933410645, "learning_rate": 0.000130830577708617, "loss": 4.1558, "step": 3562 }, { "epoch": 1.0390812978490704, "grad_norm": 2.529109001159668, "learning_rate": 0.00013081112624003114, "loss": 3.8222, "step": 3563 }, { "epoch": 1.039372949325556, "grad_norm": 2.0677695274353027, "learning_rate": 0.00013079167477144524, "loss": 3.8928, "step": 3564 }, { "epoch": 1.0396646008020416, "grad_norm": 1.8793413639068604, "learning_rate": 0.00013077222330285936, "loss": 3.8201, "step": 3565 }, { "epoch": 1.039956252278527, "grad_norm": 2.222069025039673, "learning_rate": 0.0001307527718342735, "loss": 3.9971, "step": 3566 }, { "epoch": 1.0402479037550127, "grad_norm": 3.303497552871704, "learning_rate": 0.00013073332036568763, "loss": 3.9301, "step": 3567 }, { "epoch": 1.0405395552314984, "grad_norm": 2.2603073120117188, "learning_rate": 0.00013071386889710172, "loss": 4.0754, "step": 3568 }, { "epoch": 1.040831206707984, "grad_norm": 2.436617136001587, "learning_rate": 0.00013069441742851587, "loss": 3.8094, "step": 3569 }, { "epoch": 1.0411228581844696, "grad_norm": 1.9966446161270142, "learning_rate": 0.00013067496595993, "loss": 4.1149, "step": 3570 }, { "epoch": 1.041414509660955, "grad_norm": 2.241757392883301, "learning_rate": 0.0001306555144913441, "loss": 3.8107, "step": 3571 }, { "epoch": 1.0417061611374407, "grad_norm": 2.058804512023926, "learning_rate": 0.0001306360630227582, "loss": 4.1102, "step": 3572 }, { "epoch": 1.0419978126139264, "grad_norm": 1.9921201467514038, "learning_rate": 0.00013061661155417236, "loss": 4.0534, "step": 3573 }, { "epoch": 1.042289464090412, "grad_norm": 2.0572948455810547, "learning_rate": 0.00013059716008558648, "loss": 3.8945, "step": 3574 }, { "epoch": 1.0425811155668976, "grad_norm": 2.3395678997039795, "learning_rate": 0.00013057770861700057, "loss": 3.9705, "step": 3575 }, { "epoch": 1.042872767043383, "grad_norm": 3.0236995220184326, "learning_rate": 0.00013055825714841472, "loss": 4.0641, "step": 3576 }, { "epoch": 1.0431644185198687, "grad_norm": 2.2343223094940186, "learning_rate": 0.00013053880567982884, "loss": 3.932, "step": 3577 }, { "epoch": 1.0434560699963544, "grad_norm": 2.349310874938965, "learning_rate": 0.00013051935421124294, "loss": 4.0638, "step": 3578 }, { "epoch": 1.04374772147284, "grad_norm": 2.2989697456359863, "learning_rate": 0.0001304999027426571, "loss": 4.2674, "step": 3579 }, { "epoch": 1.0440393729493256, "grad_norm": 2.0049092769622803, "learning_rate": 0.0001304804512740712, "loss": 4.1303, "step": 3580 }, { "epoch": 1.044331024425811, "grad_norm": 2.1712615489959717, "learning_rate": 0.00013046099980548533, "loss": 4.078, "step": 3581 }, { "epoch": 1.0446226759022967, "grad_norm": 2.2046048641204834, "learning_rate": 0.00013044154833689943, "loss": 4.228, "step": 3582 }, { "epoch": 1.0449143273787824, "grad_norm": 2.0061452388763428, "learning_rate": 0.00013042209686831357, "loss": 4.0512, "step": 3583 }, { "epoch": 1.045205978855268, "grad_norm": 2.2770352363586426, "learning_rate": 0.0001304026453997277, "loss": 4.309, "step": 3584 }, { "epoch": 1.0454976303317536, "grad_norm": 2.9033584594726562, "learning_rate": 0.0001303831939311418, "loss": 4.0753, "step": 3585 }, { "epoch": 1.045789281808239, "grad_norm": 2.40948486328125, "learning_rate": 0.00013036374246255594, "loss": 4.0258, "step": 3586 }, { "epoch": 1.0460809332847247, "grad_norm": 2.5783286094665527, "learning_rate": 0.00013034429099397006, "loss": 4.2103, "step": 3587 }, { "epoch": 1.0463725847612104, "grad_norm": 2.071617364883423, "learning_rate": 0.00013032483952538416, "loss": 4.165, "step": 3588 }, { "epoch": 1.0466642362376959, "grad_norm": 2.2772035598754883, "learning_rate": 0.0001303053880567983, "loss": 4.171, "step": 3589 }, { "epoch": 1.0469558877141816, "grad_norm": 2.2733092308044434, "learning_rate": 0.00013028593658821243, "loss": 4.0551, "step": 3590 }, { "epoch": 1.0472475391906673, "grad_norm": 2.351271629333496, "learning_rate": 0.00013026648511962655, "loss": 4.0362, "step": 3591 }, { "epoch": 1.0475391906671527, "grad_norm": 2.027092933654785, "learning_rate": 0.00013024703365104064, "loss": 4.0683, "step": 3592 }, { "epoch": 1.0478308421436384, "grad_norm": 1.7156187295913696, "learning_rate": 0.0001302275821824548, "loss": 3.9627, "step": 3593 }, { "epoch": 1.0481224936201239, "grad_norm": 2.6239726543426514, "learning_rate": 0.0001302081307138689, "loss": 3.9112, "step": 3594 }, { "epoch": 1.0484141450966096, "grad_norm": 1.846990942955017, "learning_rate": 0.000130188679245283, "loss": 3.963, "step": 3595 }, { "epoch": 1.048705796573095, "grad_norm": 1.9480798244476318, "learning_rate": 0.00013016922777669716, "loss": 4.1628, "step": 3596 }, { "epoch": 1.0489974480495807, "grad_norm": 1.7867913246154785, "learning_rate": 0.00013014977630811128, "loss": 3.9183, "step": 3597 }, { "epoch": 1.0492890995260664, "grad_norm": 2.2480857372283936, "learning_rate": 0.0001301303248395254, "loss": 3.9783, "step": 3598 }, { "epoch": 1.0495807510025519, "grad_norm": 3.0209312438964844, "learning_rate": 0.0001301108733709395, "loss": 3.8579, "step": 3599 }, { "epoch": 1.0498724024790376, "grad_norm": 2.0407180786132812, "learning_rate": 0.00013009142190235364, "loss": 3.8406, "step": 3600 }, { "epoch": 1.0501640539555233, "grad_norm": 2.5291249752044678, "learning_rate": 0.00013007197043376776, "loss": 4.0609, "step": 3601 }, { "epoch": 1.0504557054320087, "grad_norm": 2.348912477493286, "learning_rate": 0.00013005251896518186, "loss": 4.1893, "step": 3602 }, { "epoch": 1.0507473569084944, "grad_norm": 2.4402434825897217, "learning_rate": 0.000130033067496596, "loss": 4.1718, "step": 3603 }, { "epoch": 1.0510390083849799, "grad_norm": 2.0184643268585205, "learning_rate": 0.00013001361602801013, "loss": 3.6915, "step": 3604 }, { "epoch": 1.0513306598614656, "grad_norm": 2.633053779602051, "learning_rate": 0.00012999416455942425, "loss": 4.338, "step": 3605 }, { "epoch": 1.0516223113379513, "grad_norm": 2.9046967029571533, "learning_rate": 0.00012997471309083837, "loss": 4.0709, "step": 3606 }, { "epoch": 1.0519139628144367, "grad_norm": 2.7938003540039062, "learning_rate": 0.0001299552616222525, "loss": 4.3641, "step": 3607 }, { "epoch": 1.0522056142909224, "grad_norm": 3.2420787811279297, "learning_rate": 0.00012993581015366661, "loss": 4.201, "step": 3608 }, { "epoch": 1.0524972657674079, "grad_norm": 1.8700824975967407, "learning_rate": 0.0001299163586850807, "loss": 3.8766, "step": 3609 }, { "epoch": 1.0527889172438936, "grad_norm": 2.7728188037872314, "learning_rate": 0.00012989690721649486, "loss": 3.9584, "step": 3610 }, { "epoch": 1.0530805687203793, "grad_norm": 2.538593292236328, "learning_rate": 0.00012987745574790898, "loss": 4.3779, "step": 3611 }, { "epoch": 1.0533722201968647, "grad_norm": 1.929302453994751, "learning_rate": 0.00012985800427932307, "loss": 4.0782, "step": 3612 }, { "epoch": 1.0536638716733504, "grad_norm": 3.0609242916107178, "learning_rate": 0.00012983855281073722, "loss": 3.999, "step": 3613 }, { "epoch": 1.0539555231498359, "grad_norm": 2.1503701210021973, "learning_rate": 0.00012981910134215134, "loss": 4.0949, "step": 3614 }, { "epoch": 1.0542471746263216, "grad_norm": 1.9994126558303833, "learning_rate": 0.00012979964987356547, "loss": 4.1874, "step": 3615 }, { "epoch": 1.0545388261028072, "grad_norm": 2.1460988521575928, "learning_rate": 0.0001297801984049796, "loss": 4.2076, "step": 3616 }, { "epoch": 1.0548304775792927, "grad_norm": 2.4340686798095703, "learning_rate": 0.0001297607469363937, "loss": 4.3762, "step": 3617 }, { "epoch": 1.0551221290557784, "grad_norm": 2.378307342529297, "learning_rate": 0.00012974129546780783, "loss": 4.4522, "step": 3618 }, { "epoch": 1.0554137805322639, "grad_norm": 2.3210902214050293, "learning_rate": 0.00012972184399922192, "loss": 4.2682, "step": 3619 }, { "epoch": 1.0557054320087496, "grad_norm": 2.3946800231933594, "learning_rate": 0.00012970239253063607, "loss": 4.0113, "step": 3620 }, { "epoch": 1.0559970834852352, "grad_norm": 2.068248748779297, "learning_rate": 0.0001296829410620502, "loss": 4.1837, "step": 3621 }, { "epoch": 1.0562887349617207, "grad_norm": 2.0551655292510986, "learning_rate": 0.00012966348959346432, "loss": 4.225, "step": 3622 }, { "epoch": 1.0565803864382064, "grad_norm": 2.8425660133361816, "learning_rate": 0.00012964403812487844, "loss": 3.9051, "step": 3623 }, { "epoch": 1.0568720379146919, "grad_norm": 2.4270920753479004, "learning_rate": 0.00012962458665629256, "loss": 4.056, "step": 3624 }, { "epoch": 1.0571636893911776, "grad_norm": 1.8518297672271729, "learning_rate": 0.00012960513518770668, "loss": 4.215, "step": 3625 }, { "epoch": 1.0574553408676632, "grad_norm": 2.425413131713867, "learning_rate": 0.0001295856837191208, "loss": 4.087, "step": 3626 }, { "epoch": 1.0577469923441487, "grad_norm": 2.449198007583618, "learning_rate": 0.00012956623225053492, "loss": 4.0576, "step": 3627 }, { "epoch": 1.0580386438206344, "grad_norm": 2.406514883041382, "learning_rate": 0.00012954678078194905, "loss": 3.932, "step": 3628 }, { "epoch": 1.0583302952971199, "grad_norm": 2.393416166305542, "learning_rate": 0.00012952732931336317, "loss": 4.2203, "step": 3629 }, { "epoch": 1.0586219467736055, "grad_norm": 2.2086024284362793, "learning_rate": 0.0001295078778447773, "loss": 4.2168, "step": 3630 }, { "epoch": 1.0589135982500912, "grad_norm": 2.524850606918335, "learning_rate": 0.0001294884263761914, "loss": 3.8559, "step": 3631 }, { "epoch": 1.0592052497265767, "grad_norm": 4.544890403747559, "learning_rate": 0.00012946897490760553, "loss": 3.8923, "step": 3632 }, { "epoch": 1.0594969012030624, "grad_norm": 3.142557144165039, "learning_rate": 0.00012944952343901965, "loss": 4.0785, "step": 3633 }, { "epoch": 1.0597885526795479, "grad_norm": 2.3052828311920166, "learning_rate": 0.00012943007197043378, "loss": 4.0865, "step": 3634 }, { "epoch": 1.0600802041560335, "grad_norm": 2.8074536323547363, "learning_rate": 0.0001294106205018479, "loss": 4.0612, "step": 3635 }, { "epoch": 1.0603718556325192, "grad_norm": 2.3961946964263916, "learning_rate": 0.00012939116903326202, "loss": 4.1498, "step": 3636 }, { "epoch": 1.0606635071090047, "grad_norm": 2.2879765033721924, "learning_rate": 0.00012937171756467614, "loss": 4.0642, "step": 3637 }, { "epoch": 1.0609551585854904, "grad_norm": 2.1749017238616943, "learning_rate": 0.00012935226609609026, "loss": 4.2485, "step": 3638 }, { "epoch": 1.0612468100619759, "grad_norm": 1.8450850248336792, "learning_rate": 0.00012933281462750438, "loss": 3.9234, "step": 3639 }, { "epoch": 1.0615384615384615, "grad_norm": 3.324470281600952, "learning_rate": 0.0001293133631589185, "loss": 3.9024, "step": 3640 }, { "epoch": 1.0618301130149472, "grad_norm": 2.812638998031616, "learning_rate": 0.00012929391169033263, "loss": 4.0182, "step": 3641 }, { "epoch": 1.0621217644914327, "grad_norm": 2.4177346229553223, "learning_rate": 0.00012927446022174675, "loss": 3.9372, "step": 3642 }, { "epoch": 1.0624134159679184, "grad_norm": 2.6764349937438965, "learning_rate": 0.00012925500875316087, "loss": 3.9192, "step": 3643 }, { "epoch": 1.0627050674444039, "grad_norm": 2.040442705154419, "learning_rate": 0.000129235557284575, "loss": 4.0383, "step": 3644 }, { "epoch": 1.0629967189208895, "grad_norm": 2.3141751289367676, "learning_rate": 0.0001292161058159891, "loss": 4.186, "step": 3645 }, { "epoch": 1.0632883703973752, "grad_norm": 1.9237377643585205, "learning_rate": 0.00012919665434740323, "loss": 4.3198, "step": 3646 }, { "epoch": 1.0635800218738607, "grad_norm": 1.9188791513442993, "learning_rate": 0.00012917720287881736, "loss": 4.1333, "step": 3647 }, { "epoch": 1.0638716733503464, "grad_norm": 3.542592763900757, "learning_rate": 0.00012915775141023148, "loss": 3.9752, "step": 3648 }, { "epoch": 1.0641633248268318, "grad_norm": 2.27101993560791, "learning_rate": 0.0001291382999416456, "loss": 3.9506, "step": 3649 }, { "epoch": 1.0644549763033175, "grad_norm": 1.7530509233474731, "learning_rate": 0.00012911884847305972, "loss": 4.1971, "step": 3650 }, { "epoch": 1.0647466277798032, "grad_norm": 2.626396894454956, "learning_rate": 0.00012909939700447384, "loss": 4.0017, "step": 3651 }, { "epoch": 1.0650382792562887, "grad_norm": 1.917130470275879, "learning_rate": 0.00012907994553588796, "loss": 4.1367, "step": 3652 }, { "epoch": 1.0653299307327744, "grad_norm": 2.3441033363342285, "learning_rate": 0.00012906049406730209, "loss": 4.2968, "step": 3653 }, { "epoch": 1.0656215822092598, "grad_norm": 2.299095392227173, "learning_rate": 0.0001290410425987162, "loss": 4.285, "step": 3654 }, { "epoch": 1.0659132336857455, "grad_norm": 2.4602279663085938, "learning_rate": 0.00012902159113013033, "loss": 4.0548, "step": 3655 }, { "epoch": 1.0662048851622312, "grad_norm": 2.033823251724243, "learning_rate": 0.00012900213966154445, "loss": 4.0123, "step": 3656 }, { "epoch": 1.0664965366387167, "grad_norm": 2.081974506378174, "learning_rate": 0.00012898268819295857, "loss": 4.0616, "step": 3657 }, { "epoch": 1.0667881881152024, "grad_norm": 2.362161159515381, "learning_rate": 0.0001289632367243727, "loss": 4.2465, "step": 3658 }, { "epoch": 1.0670798395916878, "grad_norm": 2.2387077808380127, "learning_rate": 0.00012894378525578682, "loss": 4.4832, "step": 3659 }, { "epoch": 1.0673714910681735, "grad_norm": 2.3844571113586426, "learning_rate": 0.00012892433378720094, "loss": 4.3936, "step": 3660 }, { "epoch": 1.0676631425446592, "grad_norm": 2.094517707824707, "learning_rate": 0.00012890488231861506, "loss": 3.9766, "step": 3661 }, { "epoch": 1.0679547940211447, "grad_norm": 2.2085766792297363, "learning_rate": 0.00012888543085002918, "loss": 4.048, "step": 3662 }, { "epoch": 1.0682464454976304, "grad_norm": 2.1386711597442627, "learning_rate": 0.0001288659793814433, "loss": 3.873, "step": 3663 }, { "epoch": 1.0685380969741158, "grad_norm": 1.9744540452957153, "learning_rate": 0.00012884652791285742, "loss": 3.9199, "step": 3664 }, { "epoch": 1.0688297484506015, "grad_norm": 2.621877908706665, "learning_rate": 0.00012882707644427155, "loss": 4.1279, "step": 3665 }, { "epoch": 1.0691213999270872, "grad_norm": 2.400059223175049, "learning_rate": 0.0001288076249756857, "loss": 4.124, "step": 3666 }, { "epoch": 1.0694130514035727, "grad_norm": 2.044935941696167, "learning_rate": 0.0001287881735070998, "loss": 4.1025, "step": 3667 }, { "epoch": 1.0697047028800584, "grad_norm": 2.400033950805664, "learning_rate": 0.0001287687220385139, "loss": 4.0025, "step": 3668 }, { "epoch": 1.0699963543565438, "grad_norm": 2.289691925048828, "learning_rate": 0.00012874927056992803, "loss": 3.9317, "step": 3669 }, { "epoch": 1.0702880058330295, "grad_norm": 2.4783196449279785, "learning_rate": 0.00012872981910134215, "loss": 3.9875, "step": 3670 }, { "epoch": 1.0705796573095152, "grad_norm": 2.208743095397949, "learning_rate": 0.00012871036763275627, "loss": 4.1175, "step": 3671 }, { "epoch": 1.0708713087860007, "grad_norm": 2.6667635440826416, "learning_rate": 0.0001286909161641704, "loss": 4.2274, "step": 3672 }, { "epoch": 1.0711629602624864, "grad_norm": 2.9799394607543945, "learning_rate": 0.00012867146469558454, "loss": 4.1712, "step": 3673 }, { "epoch": 1.0714546117389718, "grad_norm": 2.0460963249206543, "learning_rate": 0.00012865201322699864, "loss": 4.0565, "step": 3674 }, { "epoch": 1.0717462632154575, "grad_norm": 2.3438265323638916, "learning_rate": 0.00012863256175841276, "loss": 4.005, "step": 3675 }, { "epoch": 1.0720379146919432, "grad_norm": 1.8547872304916382, "learning_rate": 0.0001286131102898269, "loss": 3.9953, "step": 3676 }, { "epoch": 1.0723295661684287, "grad_norm": 2.825669288635254, "learning_rate": 0.000128593658821241, "loss": 4.0266, "step": 3677 }, { "epoch": 1.0726212176449144, "grad_norm": 2.0804405212402344, "learning_rate": 0.00012857420735265513, "loss": 4.2905, "step": 3678 }, { "epoch": 1.0729128691213998, "grad_norm": 2.6583008766174316, "learning_rate": 0.00012855475588406925, "loss": 4.0321, "step": 3679 }, { "epoch": 1.0732045205978855, "grad_norm": 2.3651282787323, "learning_rate": 0.00012853530441548337, "loss": 3.9759, "step": 3680 }, { "epoch": 1.0734961720743712, "grad_norm": 2.2303764820098877, "learning_rate": 0.0001285158529468975, "loss": 4.4069, "step": 3681 }, { "epoch": 1.0737878235508567, "grad_norm": 2.1585030555725098, "learning_rate": 0.0001284964014783116, "loss": 4.2179, "step": 3682 }, { "epoch": 1.0740794750273424, "grad_norm": 2.955322027206421, "learning_rate": 0.00012847695000972576, "loss": 4.1647, "step": 3683 }, { "epoch": 1.0743711265038278, "grad_norm": 2.5349717140197754, "learning_rate": 0.00012845749854113986, "loss": 3.8718, "step": 3684 }, { "epoch": 1.0746627779803135, "grad_norm": 2.8969168663024902, "learning_rate": 0.00012843804707255398, "loss": 4.2758, "step": 3685 }, { "epoch": 1.0749544294567992, "grad_norm": 2.084346055984497, "learning_rate": 0.0001284185956039681, "loss": 3.9968, "step": 3686 }, { "epoch": 1.0752460809332847, "grad_norm": 2.0557668209075928, "learning_rate": 0.00012839914413538222, "loss": 3.7973, "step": 3687 }, { "epoch": 1.0755377324097704, "grad_norm": 2.5269646644592285, "learning_rate": 0.00012837969266679634, "loss": 3.7429, "step": 3688 }, { "epoch": 1.0758293838862558, "grad_norm": 3.1683239936828613, "learning_rate": 0.00012836024119821046, "loss": 4.1939, "step": 3689 }, { "epoch": 1.0761210353627415, "grad_norm": 1.9997286796569824, "learning_rate": 0.0001283407897296246, "loss": 3.9235, "step": 3690 }, { "epoch": 1.0764126868392272, "grad_norm": 2.0233678817749023, "learning_rate": 0.0001283213382610387, "loss": 4.1818, "step": 3691 }, { "epoch": 1.0767043383157127, "grad_norm": 2.0936896800994873, "learning_rate": 0.00012830188679245283, "loss": 4.0413, "step": 3692 }, { "epoch": 1.0769959897921983, "grad_norm": 2.264388084411621, "learning_rate": 0.00012828243532386698, "loss": 4.1875, "step": 3693 }, { "epoch": 1.077287641268684, "grad_norm": 2.733729362487793, "learning_rate": 0.00012826298385528107, "loss": 4.0726, "step": 3694 }, { "epoch": 1.0775792927451695, "grad_norm": 2.1931395530700684, "learning_rate": 0.0001282435323866952, "loss": 4.2307, "step": 3695 }, { "epoch": 1.0778709442216552, "grad_norm": 2.2030723094940186, "learning_rate": 0.00012822408091810931, "loss": 4.1984, "step": 3696 }, { "epoch": 1.0781625956981407, "grad_norm": 2.209683418273926, "learning_rate": 0.00012820462944952346, "loss": 4.195, "step": 3697 }, { "epoch": 1.0784542471746263, "grad_norm": 2.6790106296539307, "learning_rate": 0.00012818517798093756, "loss": 4.1116, "step": 3698 }, { "epoch": 1.0787458986511118, "grad_norm": 2.153533697128296, "learning_rate": 0.00012816572651235168, "loss": 4.0405, "step": 3699 }, { "epoch": 1.0790375501275975, "grad_norm": 2.217162847518921, "learning_rate": 0.00012814627504376583, "loss": 4.112, "step": 3700 }, { "epoch": 1.0793292016040832, "grad_norm": 2.238577365875244, "learning_rate": 0.00012812682357517992, "loss": 4.0115, "step": 3701 }, { "epoch": 1.0796208530805687, "grad_norm": 2.0432844161987305, "learning_rate": 0.00012810737210659404, "loss": 3.829, "step": 3702 }, { "epoch": 1.0799125045570543, "grad_norm": 2.828948736190796, "learning_rate": 0.0001280879206380082, "loss": 4.1398, "step": 3703 }, { "epoch": 1.08020415603354, "grad_norm": 2.22357439994812, "learning_rate": 0.0001280684691694223, "loss": 4.1064, "step": 3704 }, { "epoch": 1.0804958075100255, "grad_norm": 2.577380657196045, "learning_rate": 0.0001280490177008364, "loss": 4.0758, "step": 3705 }, { "epoch": 1.0807874589865112, "grad_norm": 2.5443274974823, "learning_rate": 0.00012802956623225053, "loss": 3.7964, "step": 3706 }, { "epoch": 1.0810791104629967, "grad_norm": 2.6534979343414307, "learning_rate": 0.00012801011476366468, "loss": 4.0604, "step": 3707 }, { "epoch": 1.0813707619394823, "grad_norm": 2.8065707683563232, "learning_rate": 0.00012799066329507877, "loss": 4.1587, "step": 3708 }, { "epoch": 1.0816624134159678, "grad_norm": 2.426966667175293, "learning_rate": 0.0001279712118264929, "loss": 4.4106, "step": 3709 }, { "epoch": 1.0819540648924535, "grad_norm": 1.9895684719085693, "learning_rate": 0.00012795176035790704, "loss": 4.0589, "step": 3710 }, { "epoch": 1.0822457163689392, "grad_norm": 2.439221143722534, "learning_rate": 0.00012793230888932114, "loss": 4.1805, "step": 3711 }, { "epoch": 1.0825373678454246, "grad_norm": 2.1334049701690674, "learning_rate": 0.00012791285742073526, "loss": 3.9377, "step": 3712 }, { "epoch": 1.0828290193219103, "grad_norm": 2.040493965148926, "learning_rate": 0.0001278934059521494, "loss": 4.1731, "step": 3713 }, { "epoch": 1.083120670798396, "grad_norm": 2.5639731884002686, "learning_rate": 0.00012787395448356353, "loss": 4.1131, "step": 3714 }, { "epoch": 1.0834123222748815, "grad_norm": 2.3720574378967285, "learning_rate": 0.00012785450301497762, "loss": 4.2466, "step": 3715 }, { "epoch": 1.0837039737513672, "grad_norm": 2.114790916442871, "learning_rate": 0.00012783505154639175, "loss": 4.2504, "step": 3716 }, { "epoch": 1.0839956252278526, "grad_norm": 2.399808406829834, "learning_rate": 0.0001278156000778059, "loss": 4.2713, "step": 3717 }, { "epoch": 1.0842872767043383, "grad_norm": 3.256539821624756, "learning_rate": 0.00012779614860922, "loss": 4.106, "step": 3718 }, { "epoch": 1.0845789281808238, "grad_norm": 2.5010480880737305, "learning_rate": 0.0001277766971406341, "loss": 4.2111, "step": 3719 }, { "epoch": 1.0848705796573095, "grad_norm": 2.5931286811828613, "learning_rate": 0.00012775724567204826, "loss": 4.17, "step": 3720 }, { "epoch": 1.0851622311337952, "grad_norm": 2.145604372024536, "learning_rate": 0.00012773779420346238, "loss": 3.8157, "step": 3721 }, { "epoch": 1.0854538826102806, "grad_norm": 2.017652750015259, "learning_rate": 0.00012771834273487648, "loss": 4.0446, "step": 3722 }, { "epoch": 1.0857455340867663, "grad_norm": 2.118473529815674, "learning_rate": 0.00012769889126629062, "loss": 3.9813, "step": 3723 }, { "epoch": 1.086037185563252, "grad_norm": 2.1391661167144775, "learning_rate": 0.00012767943979770475, "loss": 3.9286, "step": 3724 }, { "epoch": 1.0863288370397375, "grad_norm": 2.263873815536499, "learning_rate": 0.00012765998832911884, "loss": 4.0064, "step": 3725 }, { "epoch": 1.0866204885162232, "grad_norm": 2.2649269104003906, "learning_rate": 0.00012764053686053296, "loss": 4.1057, "step": 3726 }, { "epoch": 1.0869121399927086, "grad_norm": 2.015627145767212, "learning_rate": 0.0001276210853919471, "loss": 4.2801, "step": 3727 }, { "epoch": 1.0872037914691943, "grad_norm": 2.2863447666168213, "learning_rate": 0.0001276016339233612, "loss": 4.004, "step": 3728 }, { "epoch": 1.08749544294568, "grad_norm": 1.9052786827087402, "learning_rate": 0.00012758218245477533, "loss": 3.974, "step": 3729 }, { "epoch": 1.0877870944221655, "grad_norm": 1.87296462059021, "learning_rate": 0.00012756273098618948, "loss": 4.1313, "step": 3730 }, { "epoch": 1.0880787458986512, "grad_norm": 2.067815065383911, "learning_rate": 0.0001275432795176036, "loss": 4.1958, "step": 3731 }, { "epoch": 1.0883703973751366, "grad_norm": 2.751574993133545, "learning_rate": 0.0001275238280490177, "loss": 4.0413, "step": 3732 }, { "epoch": 1.0886620488516223, "grad_norm": 2.670555830001831, "learning_rate": 0.00012750437658043184, "loss": 4.4684, "step": 3733 }, { "epoch": 1.088953700328108, "grad_norm": 2.483023166656494, "learning_rate": 0.00012748492511184596, "loss": 4.2723, "step": 3734 }, { "epoch": 1.0892453518045935, "grad_norm": 1.8637025356292725, "learning_rate": 0.00012746547364326006, "loss": 4.3155, "step": 3735 }, { "epoch": 1.0895370032810792, "grad_norm": 1.7107656002044678, "learning_rate": 0.00012744602217467418, "loss": 4.2138, "step": 3736 }, { "epoch": 1.0898286547575646, "grad_norm": 1.8580517768859863, "learning_rate": 0.00012742657070608833, "loss": 3.999, "step": 3737 }, { "epoch": 1.0901203062340503, "grad_norm": 2.0527381896972656, "learning_rate": 0.00012740711923750245, "loss": 4.1225, "step": 3738 }, { "epoch": 1.090411957710536, "grad_norm": 3.36418080329895, "learning_rate": 0.00012738766776891654, "loss": 4.0627, "step": 3739 }, { "epoch": 1.0907036091870215, "grad_norm": 3.1110999584198, "learning_rate": 0.0001273682163003307, "loss": 4.1451, "step": 3740 }, { "epoch": 1.0909952606635072, "grad_norm": 2.383479595184326, "learning_rate": 0.0001273487648317448, "loss": 4.0797, "step": 3741 }, { "epoch": 1.0912869121399926, "grad_norm": 2.602729082107544, "learning_rate": 0.0001273293133631589, "loss": 4.1109, "step": 3742 }, { "epoch": 1.0915785636164783, "grad_norm": 2.376114845275879, "learning_rate": 0.00012730986189457306, "loss": 4.1848, "step": 3743 }, { "epoch": 1.091870215092964, "grad_norm": 2.250957727432251, "learning_rate": 0.00012729041042598718, "loss": 3.8878, "step": 3744 }, { "epoch": 1.0921618665694495, "grad_norm": 2.186068296432495, "learning_rate": 0.0001272709589574013, "loss": 4.0765, "step": 3745 }, { "epoch": 1.0924535180459352, "grad_norm": 1.810776948928833, "learning_rate": 0.0001272515074888154, "loss": 4.0378, "step": 3746 }, { "epoch": 1.0927451695224206, "grad_norm": 2.6666274070739746, "learning_rate": 0.00012723205602022954, "loss": 4.2117, "step": 3747 }, { "epoch": 1.0930368209989063, "grad_norm": 2.426586866378784, "learning_rate": 0.00012721260455164366, "loss": 4.1251, "step": 3748 }, { "epoch": 1.093328472475392, "grad_norm": 2.5523886680603027, "learning_rate": 0.00012719315308305776, "loss": 4.325, "step": 3749 }, { "epoch": 1.0936201239518775, "grad_norm": 2.4587292671203613, "learning_rate": 0.0001271737016144719, "loss": 4.1361, "step": 3750 }, { "epoch": 1.0939117754283632, "grad_norm": 2.391700267791748, "learning_rate": 0.00012715425014588603, "loss": 4.1539, "step": 3751 }, { "epoch": 1.0942034269048486, "grad_norm": 2.1935012340545654, "learning_rate": 0.00012713479867730015, "loss": 4.1668, "step": 3752 }, { "epoch": 1.0944950783813343, "grad_norm": 1.936406135559082, "learning_rate": 0.00012711534720871427, "loss": 3.9895, "step": 3753 }, { "epoch": 1.09478672985782, "grad_norm": 3.5578646659851074, "learning_rate": 0.0001270958957401284, "loss": 4.0554, "step": 3754 }, { "epoch": 1.0950783813343055, "grad_norm": 2.647141695022583, "learning_rate": 0.00012707644427154252, "loss": 4.2365, "step": 3755 }, { "epoch": 1.0953700328107911, "grad_norm": 2.319776773452759, "learning_rate": 0.0001270569928029566, "loss": 4.4718, "step": 3756 }, { "epoch": 1.0956616842872766, "grad_norm": 2.5689897537231445, "learning_rate": 0.00012703754133437076, "loss": 4.2302, "step": 3757 }, { "epoch": 1.0959533357637623, "grad_norm": 2.0307326316833496, "learning_rate": 0.00012701808986578488, "loss": 4.1466, "step": 3758 }, { "epoch": 1.096244987240248, "grad_norm": 1.8125256299972534, "learning_rate": 0.00012699863839719897, "loss": 3.9664, "step": 3759 }, { "epoch": 1.0965366387167335, "grad_norm": 2.076199769973755, "learning_rate": 0.00012697918692861312, "loss": 4.2294, "step": 3760 }, { "epoch": 1.0968282901932191, "grad_norm": 2.8038058280944824, "learning_rate": 0.00012695973546002725, "loss": 4.302, "step": 3761 }, { "epoch": 1.0971199416697046, "grad_norm": 2.014314651489258, "learning_rate": 0.00012694028399144137, "loss": 4.1107, "step": 3762 }, { "epoch": 1.0974115931461903, "grad_norm": 2.4806911945343018, "learning_rate": 0.00012692083252285546, "loss": 4.3277, "step": 3763 }, { "epoch": 1.097703244622676, "grad_norm": 2.074834108352661, "learning_rate": 0.0001269013810542696, "loss": 4.0597, "step": 3764 }, { "epoch": 1.0979948960991615, "grad_norm": 2.116048812866211, "learning_rate": 0.00012688192958568373, "loss": 3.9102, "step": 3765 }, { "epoch": 1.0982865475756471, "grad_norm": 2.4567980766296387, "learning_rate": 0.00012686247811709783, "loss": 4.0998, "step": 3766 }, { "epoch": 1.0985781990521326, "grad_norm": 2.4135046005249023, "learning_rate": 0.00012684302664851197, "loss": 4.261, "step": 3767 }, { "epoch": 1.0988698505286183, "grad_norm": 2.3346734046936035, "learning_rate": 0.0001268235751799261, "loss": 4.0119, "step": 3768 }, { "epoch": 1.099161502005104, "grad_norm": 2.207634449005127, "learning_rate": 0.00012680412371134022, "loss": 4.2706, "step": 3769 }, { "epoch": 1.0994531534815895, "grad_norm": 1.8305271863937378, "learning_rate": 0.00012678467224275434, "loss": 4.0774, "step": 3770 }, { "epoch": 1.0997448049580751, "grad_norm": 2.636587619781494, "learning_rate": 0.00012676522077416846, "loss": 3.8585, "step": 3771 }, { "epoch": 1.1000364564345606, "grad_norm": 2.563638687133789, "learning_rate": 0.00012674576930558258, "loss": 4.0928, "step": 3772 }, { "epoch": 1.1003281079110463, "grad_norm": 2.9856836795806885, "learning_rate": 0.00012672631783699668, "loss": 4.2584, "step": 3773 }, { "epoch": 1.100619759387532, "grad_norm": 2.51595401763916, "learning_rate": 0.00012670686636841083, "loss": 4.201, "step": 3774 }, { "epoch": 1.1009114108640174, "grad_norm": 2.3098883628845215, "learning_rate": 0.00012668741489982495, "loss": 4.0349, "step": 3775 }, { "epoch": 1.1012030623405031, "grad_norm": 2.548952341079712, "learning_rate": 0.00012666796343123907, "loss": 4.0761, "step": 3776 }, { "epoch": 1.1014947138169886, "grad_norm": 3.2829055786132812, "learning_rate": 0.0001266485119626532, "loss": 4.2265, "step": 3777 }, { "epoch": 1.1017863652934743, "grad_norm": 3.259401321411133, "learning_rate": 0.0001266290604940673, "loss": 3.8634, "step": 3778 }, { "epoch": 1.10207801676996, "grad_norm": 3.141383409500122, "learning_rate": 0.00012660960902548143, "loss": 3.964, "step": 3779 }, { "epoch": 1.1023696682464454, "grad_norm": 3.194380283355713, "learning_rate": 0.00012659015755689556, "loss": 4.1101, "step": 3780 }, { "epoch": 1.1026613197229311, "grad_norm": 2.19527006149292, "learning_rate": 0.00012657070608830968, "loss": 4.0061, "step": 3781 }, { "epoch": 1.1029529711994166, "grad_norm": 2.274373769760132, "learning_rate": 0.0001265512546197238, "loss": 4.0037, "step": 3782 }, { "epoch": 1.1032446226759023, "grad_norm": 2.279019832611084, "learning_rate": 0.0001265318031511379, "loss": 4.0874, "step": 3783 }, { "epoch": 1.103536274152388, "grad_norm": 2.833214044570923, "learning_rate": 0.00012651235168255204, "loss": 4.1178, "step": 3784 }, { "epoch": 1.1038279256288734, "grad_norm": 1.8894836902618408, "learning_rate": 0.00012649290021396616, "loss": 4.1957, "step": 3785 }, { "epoch": 1.1041195771053591, "grad_norm": 2.0564351081848145, "learning_rate": 0.00012647344874538029, "loss": 4.0754, "step": 3786 }, { "epoch": 1.1044112285818446, "grad_norm": 2.4152543544769287, "learning_rate": 0.0001264539972767944, "loss": 4.1613, "step": 3787 }, { "epoch": 1.1047028800583303, "grad_norm": 1.8680418729782104, "learning_rate": 0.00012643454580820853, "loss": 4.0309, "step": 3788 }, { "epoch": 1.104994531534816, "grad_norm": 2.2002201080322266, "learning_rate": 0.00012641509433962265, "loss": 4.0615, "step": 3789 }, { "epoch": 1.1052861830113014, "grad_norm": 2.244812488555908, "learning_rate": 0.00012639564287103677, "loss": 3.9551, "step": 3790 }, { "epoch": 1.1055778344877871, "grad_norm": 2.184509754180908, "learning_rate": 0.0001263761914024509, "loss": 4.268, "step": 3791 }, { "epoch": 1.1058694859642726, "grad_norm": 2.7067883014678955, "learning_rate": 0.00012635673993386501, "loss": 4.2599, "step": 3792 }, { "epoch": 1.1061611374407583, "grad_norm": 4.113669395446777, "learning_rate": 0.00012633728846527914, "loss": 4.1556, "step": 3793 }, { "epoch": 1.106452788917244, "grad_norm": 2.292478322982788, "learning_rate": 0.00012631783699669326, "loss": 4.2261, "step": 3794 }, { "epoch": 1.1067444403937294, "grad_norm": 2.88966703414917, "learning_rate": 0.00012629838552810738, "loss": 4.0957, "step": 3795 }, { "epoch": 1.1070360918702151, "grad_norm": 2.705134153366089, "learning_rate": 0.0001262789340595215, "loss": 4.2701, "step": 3796 }, { "epoch": 1.1073277433467008, "grad_norm": 2.0845186710357666, "learning_rate": 0.00012625948259093562, "loss": 3.9362, "step": 3797 }, { "epoch": 1.1076193948231863, "grad_norm": 2.7366414070129395, "learning_rate": 0.00012624003112234974, "loss": 4.3586, "step": 3798 }, { "epoch": 1.107911046299672, "grad_norm": 1.8460358381271362, "learning_rate": 0.00012622057965376387, "loss": 3.9896, "step": 3799 }, { "epoch": 1.1082026977761574, "grad_norm": 2.3890960216522217, "learning_rate": 0.000126201128185178, "loss": 3.8941, "step": 3800 }, { "epoch": 1.1084943492526431, "grad_norm": 2.663658857345581, "learning_rate": 0.0001261816767165921, "loss": 3.8793, "step": 3801 }, { "epoch": 1.1087860007291286, "grad_norm": 2.2229604721069336, "learning_rate": 0.00012616222524800623, "loss": 4.1102, "step": 3802 }, { "epoch": 1.1090776522056143, "grad_norm": 3.9307522773742676, "learning_rate": 0.00012614277377942035, "loss": 4.1439, "step": 3803 }, { "epoch": 1.1093693036821, "grad_norm": 2.5092711448669434, "learning_rate": 0.00012612332231083447, "loss": 4.0311, "step": 3804 }, { "epoch": 1.1096609551585854, "grad_norm": 2.037515878677368, "learning_rate": 0.0001261038708422486, "loss": 4.1345, "step": 3805 }, { "epoch": 1.1099526066350711, "grad_norm": 2.4826111793518066, "learning_rate": 0.00012608441937366272, "loss": 4.0889, "step": 3806 }, { "epoch": 1.1102442581115568, "grad_norm": 2.175468921661377, "learning_rate": 0.00012606496790507684, "loss": 4.0715, "step": 3807 }, { "epoch": 1.1105359095880423, "grad_norm": 2.274120330810547, "learning_rate": 0.00012604551643649096, "loss": 3.9421, "step": 3808 }, { "epoch": 1.110827561064528, "grad_norm": 3.06394624710083, "learning_rate": 0.00012602606496790508, "loss": 4.0555, "step": 3809 }, { "epoch": 1.1111192125410134, "grad_norm": 2.5345451831817627, "learning_rate": 0.0001260066134993192, "loss": 3.9514, "step": 3810 }, { "epoch": 1.1114108640174991, "grad_norm": 2.6363325119018555, "learning_rate": 0.00012598716203073332, "loss": 4.2181, "step": 3811 }, { "epoch": 1.1117025154939846, "grad_norm": 1.8451052904129028, "learning_rate": 0.00012596771056214745, "loss": 4.1997, "step": 3812 }, { "epoch": 1.1119941669704703, "grad_norm": 2.542494058609009, "learning_rate": 0.00012594825909356157, "loss": 4.1215, "step": 3813 }, { "epoch": 1.112285818446956, "grad_norm": 1.8650463819503784, "learning_rate": 0.0001259288076249757, "loss": 4.1191, "step": 3814 }, { "epoch": 1.1125774699234414, "grad_norm": 2.3131656646728516, "learning_rate": 0.0001259093561563898, "loss": 3.8039, "step": 3815 }, { "epoch": 1.112869121399927, "grad_norm": 3.234999895095825, "learning_rate": 0.00012588990468780393, "loss": 4.0572, "step": 3816 }, { "epoch": 1.1131607728764128, "grad_norm": 3.043445110321045, "learning_rate": 0.00012587045321921805, "loss": 4.2969, "step": 3817 }, { "epoch": 1.1134524243528983, "grad_norm": 2.3870763778686523, "learning_rate": 0.00012585100175063218, "loss": 3.9681, "step": 3818 }, { "epoch": 1.113744075829384, "grad_norm": 2.0752973556518555, "learning_rate": 0.0001258315502820463, "loss": 3.8142, "step": 3819 }, { "epoch": 1.1140357273058694, "grad_norm": 2.1278319358825684, "learning_rate": 0.00012581209881346042, "loss": 4.3383, "step": 3820 }, { "epoch": 1.114327378782355, "grad_norm": 2.6683380603790283, "learning_rate": 0.00012579264734487454, "loss": 4.5381, "step": 3821 }, { "epoch": 1.1146190302588406, "grad_norm": 2.06602144241333, "learning_rate": 0.00012577319587628866, "loss": 4.1278, "step": 3822 }, { "epoch": 1.1149106817353263, "grad_norm": 1.8851748704910278, "learning_rate": 0.00012575374440770278, "loss": 3.7478, "step": 3823 }, { "epoch": 1.115202333211812, "grad_norm": 2.497037172317505, "learning_rate": 0.0001257342929391169, "loss": 4.0365, "step": 3824 }, { "epoch": 1.1154939846882974, "grad_norm": 2.50358247756958, "learning_rate": 0.00012571484147053103, "loss": 4.0862, "step": 3825 }, { "epoch": 1.115785636164783, "grad_norm": 1.9699640274047852, "learning_rate": 0.00012569539000194515, "loss": 3.9677, "step": 3826 }, { "epoch": 1.1160772876412688, "grad_norm": 2.286684274673462, "learning_rate": 0.00012567593853335927, "loss": 4.1837, "step": 3827 }, { "epoch": 1.1163689391177543, "grad_norm": 2.6197593212127686, "learning_rate": 0.0001256564870647734, "loss": 4.1749, "step": 3828 }, { "epoch": 1.11666059059424, "grad_norm": 2.2310361862182617, "learning_rate": 0.00012563703559618751, "loss": 4.2401, "step": 3829 }, { "epoch": 1.1169522420707254, "grad_norm": 2.0583393573760986, "learning_rate": 0.00012561758412760166, "loss": 4.0116, "step": 3830 }, { "epoch": 1.117243893547211, "grad_norm": 2.2762644290924072, "learning_rate": 0.00012559813265901576, "loss": 4.2717, "step": 3831 }, { "epoch": 1.1175355450236968, "grad_norm": 1.8914530277252197, "learning_rate": 0.00012557868119042988, "loss": 3.7991, "step": 3832 }, { "epoch": 1.1178271965001823, "grad_norm": 2.397955894470215, "learning_rate": 0.000125559229721844, "loss": 4.2012, "step": 3833 }, { "epoch": 1.118118847976668, "grad_norm": 2.742084264755249, "learning_rate": 0.00012553977825325812, "loss": 3.8776, "step": 3834 }, { "epoch": 1.1184104994531534, "grad_norm": 2.1327435970306396, "learning_rate": 0.00012552032678467224, "loss": 4.0763, "step": 3835 }, { "epoch": 1.118702150929639, "grad_norm": 1.7967503070831299, "learning_rate": 0.00012550087531608636, "loss": 4.0874, "step": 3836 }, { "epoch": 1.1189938024061248, "grad_norm": 2.2867112159729004, "learning_rate": 0.0001254814238475005, "loss": 3.8923, "step": 3837 }, { "epoch": 1.1192854538826102, "grad_norm": 2.5264527797698975, "learning_rate": 0.0001254619723789146, "loss": 4.1582, "step": 3838 }, { "epoch": 1.119577105359096, "grad_norm": 2.577892541885376, "learning_rate": 0.00012544252091032873, "loss": 4.0869, "step": 3839 }, { "epoch": 1.1198687568355814, "grad_norm": 2.7262344360351562, "learning_rate": 0.00012542306944174288, "loss": 3.7455, "step": 3840 }, { "epoch": 1.120160408312067, "grad_norm": 2.5765764713287354, "learning_rate": 0.00012540361797315697, "loss": 4.2533, "step": 3841 }, { "epoch": 1.1204520597885528, "grad_norm": 3.4521968364715576, "learning_rate": 0.0001253841665045711, "loss": 4.0597, "step": 3842 }, { "epoch": 1.1207437112650382, "grad_norm": 2.4366743564605713, "learning_rate": 0.00012536471503598522, "loss": 4.2348, "step": 3843 }, { "epoch": 1.121035362741524, "grad_norm": 2.0424580574035645, "learning_rate": 0.00012534526356739936, "loss": 4.2877, "step": 3844 }, { "epoch": 1.1213270142180094, "grad_norm": 2.999857187271118, "learning_rate": 0.00012532581209881346, "loss": 4.3077, "step": 3845 }, { "epoch": 1.121618665694495, "grad_norm": 2.9154815673828125, "learning_rate": 0.00012530636063022758, "loss": 4.0099, "step": 3846 }, { "epoch": 1.1219103171709808, "grad_norm": 2.732811689376831, "learning_rate": 0.00012528690916164173, "loss": 4.1847, "step": 3847 }, { "epoch": 1.1222019686474662, "grad_norm": 2.097060441970825, "learning_rate": 0.00012526745769305582, "loss": 4.0659, "step": 3848 }, { "epoch": 1.122493620123952, "grad_norm": 2.998591899871826, "learning_rate": 0.00012524800622446995, "loss": 4.0559, "step": 3849 }, { "epoch": 1.1227852716004374, "grad_norm": 1.8435304164886475, "learning_rate": 0.00012522855475588407, "loss": 3.898, "step": 3850 }, { "epoch": 1.123076923076923, "grad_norm": 2.1321396827697754, "learning_rate": 0.0001252091032872982, "loss": 3.8723, "step": 3851 }, { "epoch": 1.1233685745534088, "grad_norm": 2.749593734741211, "learning_rate": 0.0001251896518187123, "loss": 4.2062, "step": 3852 }, { "epoch": 1.1236602260298942, "grad_norm": 2.182619571685791, "learning_rate": 0.00012517020035012643, "loss": 4.2274, "step": 3853 }, { "epoch": 1.12395187750638, "grad_norm": 2.4502108097076416, "learning_rate": 0.00012515074888154058, "loss": 4.3023, "step": 3854 }, { "epoch": 1.1242435289828654, "grad_norm": 2.3274686336517334, "learning_rate": 0.00012513129741295468, "loss": 4.2636, "step": 3855 }, { "epoch": 1.124535180459351, "grad_norm": 2.3184468746185303, "learning_rate": 0.0001251118459443688, "loss": 4.0624, "step": 3856 }, { "epoch": 1.1248268319358368, "grad_norm": 3.6948812007904053, "learning_rate": 0.00012509239447578295, "loss": 3.914, "step": 3857 }, { "epoch": 1.1251184834123222, "grad_norm": 3.3052573204040527, "learning_rate": 0.00012507294300719704, "loss": 4.183, "step": 3858 }, { "epoch": 1.125410134888808, "grad_norm": 1.915851354598999, "learning_rate": 0.00012505349153861116, "loss": 4.0178, "step": 3859 }, { "epoch": 1.1257017863652934, "grad_norm": 2.5772104263305664, "learning_rate": 0.00012503404007002528, "loss": 3.9587, "step": 3860 }, { "epoch": 1.125993437841779, "grad_norm": 3.13073468208313, "learning_rate": 0.00012501458860143943, "loss": 4.3288, "step": 3861 }, { "epoch": 1.1262850893182648, "grad_norm": 2.023388624191284, "learning_rate": 0.00012499513713285353, "loss": 3.6816, "step": 3862 }, { "epoch": 1.1265767407947502, "grad_norm": 2.3148133754730225, "learning_rate": 0.00012497568566426765, "loss": 3.8245, "step": 3863 }, { "epoch": 1.126868392271236, "grad_norm": 2.4919521808624268, "learning_rate": 0.0001249562341956818, "loss": 3.8889, "step": 3864 }, { "epoch": 1.1271600437477214, "grad_norm": 2.6880834102630615, "learning_rate": 0.0001249367827270959, "loss": 4.1168, "step": 3865 }, { "epoch": 1.127451695224207, "grad_norm": 2.5866708755493164, "learning_rate": 0.00012491733125851, "loss": 4.3671, "step": 3866 }, { "epoch": 1.1277433467006928, "grad_norm": 2.16829514503479, "learning_rate": 0.00012489787978992416, "loss": 3.9002, "step": 3867 }, { "epoch": 1.1280349981771782, "grad_norm": 2.435537576675415, "learning_rate": 0.00012487842832133828, "loss": 3.6927, "step": 3868 }, { "epoch": 1.128326649653664, "grad_norm": 2.615100383758545, "learning_rate": 0.00012485897685275238, "loss": 4.0295, "step": 3869 }, { "epoch": 1.1286183011301494, "grad_norm": 2.5510427951812744, "learning_rate": 0.0001248395253841665, "loss": 3.8855, "step": 3870 }, { "epoch": 1.128909952606635, "grad_norm": 2.1479597091674805, "learning_rate": 0.00012482007391558065, "loss": 4.319, "step": 3871 }, { "epoch": 1.1292016040831208, "grad_norm": 2.367952823638916, "learning_rate": 0.00012480062244699474, "loss": 3.6928, "step": 3872 }, { "epoch": 1.1294932555596062, "grad_norm": 3.0548346042633057, "learning_rate": 0.00012478117097840886, "loss": 3.9801, "step": 3873 }, { "epoch": 1.129784907036092, "grad_norm": 2.489487886428833, "learning_rate": 0.000124761719509823, "loss": 3.9241, "step": 3874 }, { "epoch": 1.1300765585125774, "grad_norm": 2.1270103454589844, "learning_rate": 0.0001247422680412371, "loss": 4.3767, "step": 3875 }, { "epoch": 1.130368209989063, "grad_norm": 2.817770004272461, "learning_rate": 0.00012472281657265123, "loss": 4.1862, "step": 3876 }, { "epoch": 1.1306598614655488, "grad_norm": 2.3943700790405273, "learning_rate": 0.00012470336510406538, "loss": 4.2554, "step": 3877 }, { "epoch": 1.1309515129420342, "grad_norm": 2.715578079223633, "learning_rate": 0.0001246839136354795, "loss": 4.2477, "step": 3878 }, { "epoch": 1.13124316441852, "grad_norm": 1.9143478870391846, "learning_rate": 0.0001246644621668936, "loss": 4.2405, "step": 3879 }, { "epoch": 1.1315348158950054, "grad_norm": 2.735515594482422, "learning_rate": 0.00012464501069830771, "loss": 4.1288, "step": 3880 }, { "epoch": 1.131826467371491, "grad_norm": 2.7355687618255615, "learning_rate": 0.00012462555922972186, "loss": 4.0552, "step": 3881 }, { "epoch": 1.1321181188479768, "grad_norm": 2.201849937438965, "learning_rate": 0.00012460610776113596, "loss": 4.0924, "step": 3882 }, { "epoch": 1.1324097703244622, "grad_norm": 1.9345051050186157, "learning_rate": 0.00012458665629255008, "loss": 3.8532, "step": 3883 }, { "epoch": 1.132701421800948, "grad_norm": 3.1043617725372314, "learning_rate": 0.00012456720482396423, "loss": 3.9628, "step": 3884 }, { "epoch": 1.1329930732774334, "grad_norm": 2.325507640838623, "learning_rate": 0.00012454775335537835, "loss": 4.2593, "step": 3885 }, { "epoch": 1.133284724753919, "grad_norm": 2.387669086456299, "learning_rate": 0.00012452830188679244, "loss": 4.0384, "step": 3886 }, { "epoch": 1.1335763762304047, "grad_norm": 3.2811641693115234, "learning_rate": 0.0001245088504182066, "loss": 3.927, "step": 3887 }, { "epoch": 1.1338680277068902, "grad_norm": 1.9612330198287964, "learning_rate": 0.00012448939894962071, "loss": 4.0335, "step": 3888 }, { "epoch": 1.134159679183376, "grad_norm": 2.2855966091156006, "learning_rate": 0.0001244699474810348, "loss": 4.0623, "step": 3889 }, { "epoch": 1.1344513306598616, "grad_norm": 2.2933530807495117, "learning_rate": 0.00012445049601244893, "loss": 4.193, "step": 3890 }, { "epoch": 1.134742982136347, "grad_norm": 2.3852901458740234, "learning_rate": 0.00012443104454386308, "loss": 4.0512, "step": 3891 }, { "epoch": 1.1350346336128327, "grad_norm": 2.5958352088928223, "learning_rate": 0.0001244115930752772, "loss": 4.0145, "step": 3892 }, { "epoch": 1.1353262850893182, "grad_norm": 2.6902995109558105, "learning_rate": 0.0001243921416066913, "loss": 4.0709, "step": 3893 }, { "epoch": 1.135617936565804, "grad_norm": 2.6426329612731934, "learning_rate": 0.00012437269013810544, "loss": 4.2936, "step": 3894 }, { "epoch": 1.1359095880422894, "grad_norm": 2.2199819087982178, "learning_rate": 0.00012435323866951957, "loss": 4.2102, "step": 3895 }, { "epoch": 1.136201239518775, "grad_norm": 2.3839223384857178, "learning_rate": 0.00012433378720093366, "loss": 3.9799, "step": 3896 }, { "epoch": 1.1364928909952607, "grad_norm": 2.236060380935669, "learning_rate": 0.0001243143357323478, "loss": 4.1096, "step": 3897 }, { "epoch": 1.1367845424717462, "grad_norm": 2.372539758682251, "learning_rate": 0.00012429488426376193, "loss": 4.0361, "step": 3898 }, { "epoch": 1.137076193948232, "grad_norm": 1.8652052879333496, "learning_rate": 0.00012427543279517605, "loss": 4.1681, "step": 3899 }, { "epoch": 1.1373678454247176, "grad_norm": 2.641557455062866, "learning_rate": 0.00012425598132659015, "loss": 4.1899, "step": 3900 }, { "epoch": 1.137659496901203, "grad_norm": 2.77243971824646, "learning_rate": 0.0001242365298580043, "loss": 4.0461, "step": 3901 }, { "epoch": 1.1379511483776887, "grad_norm": 2.7837533950805664, "learning_rate": 0.00012421707838941842, "loss": 4.3299, "step": 3902 }, { "epoch": 1.1382427998541742, "grad_norm": 2.4118974208831787, "learning_rate": 0.0001241976269208325, "loss": 4.1027, "step": 3903 }, { "epoch": 1.13853445133066, "grad_norm": 2.0655617713928223, "learning_rate": 0.00012417817545224666, "loss": 4.2106, "step": 3904 }, { "epoch": 1.1388261028071454, "grad_norm": 2.928440570831299, "learning_rate": 0.00012415872398366078, "loss": 4.1015, "step": 3905 }, { "epoch": 1.139117754283631, "grad_norm": 2.3538601398468018, "learning_rate": 0.00012413927251507488, "loss": 4.2039, "step": 3906 }, { "epoch": 1.1394094057601167, "grad_norm": 2.7193875312805176, "learning_rate": 0.00012411982104648903, "loss": 4.072, "step": 3907 }, { "epoch": 1.1397010572366022, "grad_norm": 2.0938262939453125, "learning_rate": 0.00012410036957790315, "loss": 3.9882, "step": 3908 }, { "epoch": 1.1399927087130879, "grad_norm": 2.009740114212036, "learning_rate": 0.00012408091810931727, "loss": 4.2004, "step": 3909 }, { "epoch": 1.1402843601895736, "grad_norm": 2.404324531555176, "learning_rate": 0.00012406146664073136, "loss": 4.1387, "step": 3910 }, { "epoch": 1.140576011666059, "grad_norm": 1.9602890014648438, "learning_rate": 0.0001240420151721455, "loss": 4.0037, "step": 3911 }, { "epoch": 1.1408676631425447, "grad_norm": 2.5524017810821533, "learning_rate": 0.00012402256370355963, "loss": 3.7253, "step": 3912 }, { "epoch": 1.1411593146190302, "grad_norm": 2.239731788635254, "learning_rate": 0.00012400311223497373, "loss": 4.0209, "step": 3913 }, { "epoch": 1.1414509660955159, "grad_norm": 3.8216962814331055, "learning_rate": 0.00012398366076638788, "loss": 4.1488, "step": 3914 }, { "epoch": 1.1417426175720014, "grad_norm": 2.4327924251556396, "learning_rate": 0.000123964209297802, "loss": 3.9275, "step": 3915 }, { "epoch": 1.142034269048487, "grad_norm": 2.9026143550872803, "learning_rate": 0.00012394475782921612, "loss": 4.2237, "step": 3916 }, { "epoch": 1.1423259205249727, "grad_norm": 3.205751657485962, "learning_rate": 0.00012392530636063024, "loss": 4.2164, "step": 3917 }, { "epoch": 1.1426175720014582, "grad_norm": 2.2994019985198975, "learning_rate": 0.00012390585489204436, "loss": 4.1998, "step": 3918 }, { "epoch": 1.1429092234779439, "grad_norm": 2.5683417320251465, "learning_rate": 0.00012388640342345848, "loss": 3.8664, "step": 3919 }, { "epoch": 1.1432008749544296, "grad_norm": 3.1331112384796143, "learning_rate": 0.00012386695195487258, "loss": 4.2855, "step": 3920 }, { "epoch": 1.143492526430915, "grad_norm": 2.247171640396118, "learning_rate": 0.00012384750048628673, "loss": 3.9234, "step": 3921 }, { "epoch": 1.1437841779074007, "grad_norm": 1.9167840480804443, "learning_rate": 0.00012382804901770085, "loss": 3.9091, "step": 3922 }, { "epoch": 1.1440758293838862, "grad_norm": 1.7662022113800049, "learning_rate": 0.00012380859754911497, "loss": 4.0141, "step": 3923 }, { "epoch": 1.1443674808603719, "grad_norm": 2.1958277225494385, "learning_rate": 0.0001237891460805291, "loss": 3.5178, "step": 3924 }, { "epoch": 1.1446591323368573, "grad_norm": 4.262264728546143, "learning_rate": 0.00012376969461194321, "loss": 3.8663, "step": 3925 }, { "epoch": 1.144950783813343, "grad_norm": 1.900712013244629, "learning_rate": 0.00012375024314335734, "loss": 4.2007, "step": 3926 }, { "epoch": 1.1452424352898287, "grad_norm": 2.690030336380005, "learning_rate": 0.00012373079167477143, "loss": 3.9458, "step": 3927 }, { "epoch": 1.1455340867663142, "grad_norm": 2.8691534996032715, "learning_rate": 0.00012371134020618558, "loss": 3.9052, "step": 3928 }, { "epoch": 1.1458257382427999, "grad_norm": 2.0083553791046143, "learning_rate": 0.0001236918887375997, "loss": 4.2002, "step": 3929 }, { "epoch": 1.1461173897192856, "grad_norm": 2.027391195297241, "learning_rate": 0.0001236724372690138, "loss": 4.1215, "step": 3930 }, { "epoch": 1.146409041195771, "grad_norm": 2.02272629737854, "learning_rate": 0.00012365298580042794, "loss": 4.3046, "step": 3931 }, { "epoch": 1.1467006926722567, "grad_norm": 2.465412139892578, "learning_rate": 0.00012363353433184206, "loss": 4.1907, "step": 3932 }, { "epoch": 1.1469923441487422, "grad_norm": 2.471755027770996, "learning_rate": 0.0001236140828632562, "loss": 4.3, "step": 3933 }, { "epoch": 1.1472839956252279, "grad_norm": 1.9606682062149048, "learning_rate": 0.0001235946313946703, "loss": 3.8125, "step": 3934 }, { "epoch": 1.1475756471017133, "grad_norm": 2.679378032684326, "learning_rate": 0.00012357517992608443, "loss": 4.3537, "step": 3935 }, { "epoch": 1.147867298578199, "grad_norm": 1.9663218259811401, "learning_rate": 0.00012355572845749855, "loss": 4.2405, "step": 3936 }, { "epoch": 1.1481589500546847, "grad_norm": 2.215547800064087, "learning_rate": 0.00012353627698891265, "loss": 4.4184, "step": 3937 }, { "epoch": 1.1484506015311702, "grad_norm": 2.112905979156494, "learning_rate": 0.0001235168255203268, "loss": 4.2542, "step": 3938 }, { "epoch": 1.1487422530076559, "grad_norm": 2.412358283996582, "learning_rate": 0.00012349737405174092, "loss": 4.3127, "step": 3939 }, { "epoch": 1.1490339044841416, "grad_norm": 2.293884515762329, "learning_rate": 0.00012347792258315504, "loss": 4.1116, "step": 3940 }, { "epoch": 1.149325555960627, "grad_norm": 2.0216140747070312, "learning_rate": 0.00012345847111456916, "loss": 4.1736, "step": 3941 }, { "epoch": 1.1496172074371127, "grad_norm": 2.069676160812378, "learning_rate": 0.00012343901964598328, "loss": 4.215, "step": 3942 }, { "epoch": 1.1499088589135982, "grad_norm": 2.0947976112365723, "learning_rate": 0.0001234195681773974, "loss": 4.0836, "step": 3943 }, { "epoch": 1.1502005103900839, "grad_norm": 2.771834135055542, "learning_rate": 0.00012340011670881152, "loss": 4.1443, "step": 3944 }, { "epoch": 1.1504921618665693, "grad_norm": 2.6674342155456543, "learning_rate": 0.00012338066524022565, "loss": 4.1212, "step": 3945 }, { "epoch": 1.150783813343055, "grad_norm": 3.2562785148620605, "learning_rate": 0.00012336121377163977, "loss": 4.1599, "step": 3946 }, { "epoch": 1.1510754648195407, "grad_norm": 2.1841940879821777, "learning_rate": 0.0001233417623030539, "loss": 4.1786, "step": 3947 }, { "epoch": 1.1513671162960262, "grad_norm": 2.3757457733154297, "learning_rate": 0.000123322310834468, "loss": 4.056, "step": 3948 }, { "epoch": 1.1516587677725119, "grad_norm": 2.6106343269348145, "learning_rate": 0.00012330285936588213, "loss": 4.0219, "step": 3949 }, { "epoch": 1.1519504192489975, "grad_norm": 1.9837690591812134, "learning_rate": 0.00012328340789729625, "loss": 4.2275, "step": 3950 }, { "epoch": 1.152242070725483, "grad_norm": 2.174086809158325, "learning_rate": 0.00012326395642871038, "loss": 4.1375, "step": 3951 }, { "epoch": 1.1525337222019687, "grad_norm": 2.110149383544922, "learning_rate": 0.0001232445049601245, "loss": 4.1863, "step": 3952 }, { "epoch": 1.1528253736784542, "grad_norm": 2.6461260318756104, "learning_rate": 0.00012322505349153862, "loss": 4.0499, "step": 3953 }, { "epoch": 1.1531170251549399, "grad_norm": 2.6443755626678467, "learning_rate": 0.00012320560202295274, "loss": 4.1576, "step": 3954 }, { "epoch": 1.1534086766314253, "grad_norm": 2.6690750122070312, "learning_rate": 0.00012318615055436686, "loss": 4.1873, "step": 3955 }, { "epoch": 1.153700328107911, "grad_norm": 1.9231067895889282, "learning_rate": 0.00012316669908578098, "loss": 3.8774, "step": 3956 }, { "epoch": 1.1539919795843967, "grad_norm": 2.012688636779785, "learning_rate": 0.0001231472476171951, "loss": 4.0925, "step": 3957 }, { "epoch": 1.1542836310608822, "grad_norm": 2.2439417839050293, "learning_rate": 0.00012312779614860923, "loss": 4.1344, "step": 3958 }, { "epoch": 1.1545752825373679, "grad_norm": 2.3018596172332764, "learning_rate": 0.00012310834468002335, "loss": 4.0915, "step": 3959 }, { "epoch": 1.1548669340138535, "grad_norm": 1.9194273948669434, "learning_rate": 0.00012308889321143747, "loss": 3.8622, "step": 3960 }, { "epoch": 1.155158585490339, "grad_norm": 1.865108847618103, "learning_rate": 0.0001230694417428516, "loss": 4.1587, "step": 3961 }, { "epoch": 1.1554502369668247, "grad_norm": 2.220984935760498, "learning_rate": 0.0001230499902742657, "loss": 3.8376, "step": 3962 }, { "epoch": 1.1557418884433102, "grad_norm": 2.0492570400238037, "learning_rate": 0.00012303053880567983, "loss": 4.1932, "step": 3963 }, { "epoch": 1.1560335399197958, "grad_norm": 2.4066269397735596, "learning_rate": 0.00012301108733709396, "loss": 3.9927, "step": 3964 }, { "epoch": 1.1563251913962815, "grad_norm": 2.1627907752990723, "learning_rate": 0.00012299163586850808, "loss": 4.2043, "step": 3965 }, { "epoch": 1.156616842872767, "grad_norm": 2.554886817932129, "learning_rate": 0.0001229721843999222, "loss": 4.0112, "step": 3966 }, { "epoch": 1.1569084943492527, "grad_norm": 2.1377573013305664, "learning_rate": 0.00012295273293133632, "loss": 4.1185, "step": 3967 }, { "epoch": 1.1572001458257382, "grad_norm": 2.837275266647339, "learning_rate": 0.00012293328146275044, "loss": 4.0464, "step": 3968 }, { "epoch": 1.1574917973022238, "grad_norm": 2.5823850631713867, "learning_rate": 0.00012291382999416456, "loss": 4.0315, "step": 3969 }, { "epoch": 1.1577834487787095, "grad_norm": 2.1782946586608887, "learning_rate": 0.00012289437852557869, "loss": 4.1901, "step": 3970 }, { "epoch": 1.158075100255195, "grad_norm": 2.1679062843322754, "learning_rate": 0.0001228749270569928, "loss": 3.8122, "step": 3971 }, { "epoch": 1.1583667517316807, "grad_norm": 2.3935933113098145, "learning_rate": 0.00012285547558840693, "loss": 3.9397, "step": 3972 }, { "epoch": 1.1586584032081662, "grad_norm": 2.6993048191070557, "learning_rate": 0.00012283602411982105, "loss": 4.0793, "step": 3973 }, { "epoch": 1.1589500546846518, "grad_norm": 3.788893699645996, "learning_rate": 0.00012281657265123517, "loss": 4.1849, "step": 3974 }, { "epoch": 1.1592417061611375, "grad_norm": 3.074052333831787, "learning_rate": 0.0001227971211826493, "loss": 4.1076, "step": 3975 }, { "epoch": 1.159533357637623, "grad_norm": 2.1850056648254395, "learning_rate": 0.00012277766971406342, "loss": 4.334, "step": 3976 }, { "epoch": 1.1598250091141087, "grad_norm": 2.2725226879119873, "learning_rate": 0.00012275821824547754, "loss": 4.0916, "step": 3977 }, { "epoch": 1.1601166605905942, "grad_norm": 2.5014238357543945, "learning_rate": 0.00012273876677689166, "loss": 4.1249, "step": 3978 }, { "epoch": 1.1604083120670798, "grad_norm": 3.31014347076416, "learning_rate": 0.00012271931530830578, "loss": 3.9408, "step": 3979 }, { "epoch": 1.1606999635435655, "grad_norm": 2.317816734313965, "learning_rate": 0.0001226998638397199, "loss": 4.1365, "step": 3980 }, { "epoch": 1.160991615020051, "grad_norm": 2.1271750926971436, "learning_rate": 0.00012268041237113402, "loss": 3.8473, "step": 3981 }, { "epoch": 1.1612832664965367, "grad_norm": 2.217216730117798, "learning_rate": 0.00012266096090254814, "loss": 4.1489, "step": 3982 }, { "epoch": 1.1615749179730221, "grad_norm": 2.0792219638824463, "learning_rate": 0.00012264150943396227, "loss": 4.1001, "step": 3983 }, { "epoch": 1.1618665694495078, "grad_norm": 2.564265727996826, "learning_rate": 0.00012262205796537641, "loss": 4.1976, "step": 3984 }, { "epoch": 1.1621582209259935, "grad_norm": 2.2547104358673096, "learning_rate": 0.0001226026064967905, "loss": 4.0079, "step": 3985 }, { "epoch": 1.162449872402479, "grad_norm": 2.0246968269348145, "learning_rate": 0.00012258315502820463, "loss": 3.8457, "step": 3986 }, { "epoch": 1.1627415238789647, "grad_norm": 2.4293372631073, "learning_rate": 0.00012256370355961875, "loss": 3.9915, "step": 3987 }, { "epoch": 1.1630331753554501, "grad_norm": 2.420402765274048, "learning_rate": 0.00012254425209103287, "loss": 4.1902, "step": 3988 }, { "epoch": 1.1633248268319358, "grad_norm": 2.1661128997802734, "learning_rate": 0.000122524800622447, "loss": 4.0235, "step": 3989 }, { "epoch": 1.1636164783084215, "grad_norm": 2.3790719509124756, "learning_rate": 0.00012250534915386112, "loss": 3.8664, "step": 3990 }, { "epoch": 1.163908129784907, "grad_norm": 2.149219274520874, "learning_rate": 0.00012248589768527527, "loss": 3.9512, "step": 3991 }, { "epoch": 1.1641997812613927, "grad_norm": 2.128464460372925, "learning_rate": 0.00012246644621668936, "loss": 4.176, "step": 3992 }, { "epoch": 1.1644914327378781, "grad_norm": 2.1838455200195312, "learning_rate": 0.00012244699474810348, "loss": 4.1431, "step": 3993 }, { "epoch": 1.1647830842143638, "grad_norm": 2.759864568710327, "learning_rate": 0.00012242754327951763, "loss": 4.2346, "step": 3994 }, { "epoch": 1.1650747356908495, "grad_norm": 2.325516700744629, "learning_rate": 0.00012240809181093173, "loss": 3.9538, "step": 3995 }, { "epoch": 1.165366387167335, "grad_norm": 2.1296887397766113, "learning_rate": 0.00012238864034234585, "loss": 3.9202, "step": 3996 }, { "epoch": 1.1656580386438207, "grad_norm": 3.327185869216919, "learning_rate": 0.00012236918887375997, "loss": 3.8564, "step": 3997 }, { "epoch": 1.1659496901203061, "grad_norm": 3.5080323219299316, "learning_rate": 0.0001223497374051741, "loss": 4.0568, "step": 3998 }, { "epoch": 1.1662413415967918, "grad_norm": 2.3701770305633545, "learning_rate": 0.0001223302859365882, "loss": 4.184, "step": 3999 }, { "epoch": 1.1665329930732775, "grad_norm": 2.281360149383545, "learning_rate": 0.00012231083446800233, "loss": 4.0721, "step": 4000 }, { "epoch": 1.166824644549763, "grad_norm": 3.0810773372650146, "learning_rate": 0.00012229138299941648, "loss": 4.0624, "step": 4001 }, { "epoch": 1.1671162960262487, "grad_norm": 3.1436023712158203, "learning_rate": 0.00012227193153083058, "loss": 4.2964, "step": 4002 }, { "epoch": 1.1674079475027344, "grad_norm": 2.856478452682495, "learning_rate": 0.0001222524800622447, "loss": 4.0761, "step": 4003 }, { "epoch": 1.1676995989792198, "grad_norm": 3.618701934814453, "learning_rate": 0.00012223302859365885, "loss": 4.2728, "step": 4004 }, { "epoch": 1.1679912504557055, "grad_norm": 2.763300895690918, "learning_rate": 0.00012221357712507294, "loss": 4.0629, "step": 4005 }, { "epoch": 1.168282901932191, "grad_norm": 2.5275537967681885, "learning_rate": 0.00012219412565648706, "loss": 3.9314, "step": 4006 }, { "epoch": 1.1685745534086767, "grad_norm": 2.265462875366211, "learning_rate": 0.00012217467418790118, "loss": 3.9671, "step": 4007 }, { "epoch": 1.1688662048851621, "grad_norm": 2.254957914352417, "learning_rate": 0.00012215522271931533, "loss": 4.0727, "step": 4008 }, { "epoch": 1.1691578563616478, "grad_norm": 2.758779525756836, "learning_rate": 0.00012213577125072943, "loss": 4.176, "step": 4009 }, { "epoch": 1.1694495078381335, "grad_norm": 2.4203763008117676, "learning_rate": 0.00012211631978214355, "loss": 3.9153, "step": 4010 }, { "epoch": 1.169741159314619, "grad_norm": 2.0417308807373047, "learning_rate": 0.0001220968683135577, "loss": 4.1312, "step": 4011 }, { "epoch": 1.1700328107911047, "grad_norm": 2.549193859100342, "learning_rate": 0.0001220774168449718, "loss": 4.103, "step": 4012 }, { "epoch": 1.1703244622675903, "grad_norm": 2.2228310108184814, "learning_rate": 0.00012205796537638591, "loss": 4.1985, "step": 4013 }, { "epoch": 1.1706161137440758, "grad_norm": 2.0882484912872314, "learning_rate": 0.00012203851390780004, "loss": 4.3523, "step": 4014 }, { "epoch": 1.1709077652205615, "grad_norm": 2.988651990890503, "learning_rate": 0.00012201906243921417, "loss": 4.0737, "step": 4015 }, { "epoch": 1.171199416697047, "grad_norm": 2.4614362716674805, "learning_rate": 0.00012199961097062829, "loss": 4.2272, "step": 4016 }, { "epoch": 1.1714910681735327, "grad_norm": 1.877423644065857, "learning_rate": 0.0001219801595020424, "loss": 4.1386, "step": 4017 }, { "epoch": 1.1717827196500181, "grad_norm": 2.478830337524414, "learning_rate": 0.00012196070803345654, "loss": 4.4968, "step": 4018 }, { "epoch": 1.1720743711265038, "grad_norm": 1.6624397039413452, "learning_rate": 0.00012194125656487066, "loss": 3.9293, "step": 4019 }, { "epoch": 1.1723660226029895, "grad_norm": 2.562427282333374, "learning_rate": 0.00012192180509628477, "loss": 3.9558, "step": 4020 }, { "epoch": 1.172657674079475, "grad_norm": 2.775989532470703, "learning_rate": 0.0001219023536276989, "loss": 3.7179, "step": 4021 }, { "epoch": 1.1729493255559607, "grad_norm": 2.591844081878662, "learning_rate": 0.00012188290215911302, "loss": 4.0236, "step": 4022 }, { "epoch": 1.1732409770324463, "grad_norm": 2.3562381267547607, "learning_rate": 0.00012186345069052713, "loss": 4.3325, "step": 4023 }, { "epoch": 1.1735326285089318, "grad_norm": 2.2613794803619385, "learning_rate": 0.00012184399922194125, "loss": 4.1535, "step": 4024 }, { "epoch": 1.1738242799854175, "grad_norm": 2.252082586288452, "learning_rate": 0.00012182454775335539, "loss": 3.845, "step": 4025 }, { "epoch": 1.174115931461903, "grad_norm": 2.699521064758301, "learning_rate": 0.00012180509628476951, "loss": 3.8331, "step": 4026 }, { "epoch": 1.1744075829383887, "grad_norm": 2.199965000152588, "learning_rate": 0.00012178564481618362, "loss": 3.915, "step": 4027 }, { "epoch": 1.1746992344148741, "grad_norm": 1.5550191402435303, "learning_rate": 0.00012176619334759775, "loss": 3.9667, "step": 4028 }, { "epoch": 1.1749908858913598, "grad_norm": 2.3775696754455566, "learning_rate": 0.00012174674187901187, "loss": 4.0092, "step": 4029 }, { "epoch": 1.1752825373678455, "grad_norm": 2.273129940032959, "learning_rate": 0.00012172729041042598, "loss": 4.1224, "step": 4030 }, { "epoch": 1.175574188844331, "grad_norm": 2.2602598667144775, "learning_rate": 0.00012170783894184013, "loss": 3.9476, "step": 4031 }, { "epoch": 1.1758658403208166, "grad_norm": 2.644085168838501, "learning_rate": 0.00012168838747325424, "loss": 4.1459, "step": 4032 }, { "epoch": 1.1761574917973023, "grad_norm": 2.1915810108184814, "learning_rate": 0.00012166893600466836, "loss": 4.2594, "step": 4033 }, { "epoch": 1.1764491432737878, "grad_norm": 2.101219654083252, "learning_rate": 0.00012164948453608247, "loss": 4.0924, "step": 4034 }, { "epoch": 1.1767407947502735, "grad_norm": 2.2400054931640625, "learning_rate": 0.0001216300330674966, "loss": 3.9729, "step": 4035 }, { "epoch": 1.177032446226759, "grad_norm": 2.918583869934082, "learning_rate": 0.00012161058159891072, "loss": 3.9906, "step": 4036 }, { "epoch": 1.1773240977032446, "grad_norm": 2.808084487915039, "learning_rate": 0.00012159113013032483, "loss": 4.1274, "step": 4037 }, { "epoch": 1.17761574917973, "grad_norm": 2.4839835166931152, "learning_rate": 0.00012157167866173898, "loss": 3.9529, "step": 4038 }, { "epoch": 1.1779074006562158, "grad_norm": 1.9475440979003906, "learning_rate": 0.00012155222719315309, "loss": 4.1053, "step": 4039 }, { "epoch": 1.1781990521327015, "grad_norm": 1.8378745317459106, "learning_rate": 0.00012153277572456721, "loss": 4.0995, "step": 4040 }, { "epoch": 1.178490703609187, "grad_norm": 2.698824882507324, "learning_rate": 0.00012151332425598135, "loss": 4.1578, "step": 4041 }, { "epoch": 1.1787823550856726, "grad_norm": 1.8810619115829468, "learning_rate": 0.00012149387278739545, "loss": 3.8549, "step": 4042 }, { "epoch": 1.1790740065621583, "grad_norm": 2.90083646774292, "learning_rate": 0.00012147442131880958, "loss": 4.164, "step": 4043 }, { "epoch": 1.1793656580386438, "grad_norm": 2.1310207843780518, "learning_rate": 0.00012145496985022368, "loss": 3.9792, "step": 4044 }, { "epoch": 1.1796573095151295, "grad_norm": 2.819082260131836, "learning_rate": 0.00012143551838163782, "loss": 4.2099, "step": 4045 }, { "epoch": 1.179948960991615, "grad_norm": 2.223430633544922, "learning_rate": 0.00012141606691305194, "loss": 4.254, "step": 4046 }, { "epoch": 1.1802406124681006, "grad_norm": 3.2651968002319336, "learning_rate": 0.00012139661544446606, "loss": 4.0293, "step": 4047 }, { "epoch": 1.180532263944586, "grad_norm": 2.983783483505249, "learning_rate": 0.0001213771639758802, "loss": 4.0939, "step": 4048 }, { "epoch": 1.1808239154210718, "grad_norm": 1.9520825147628784, "learning_rate": 0.0001213577125072943, "loss": 4.1254, "step": 4049 }, { "epoch": 1.1811155668975575, "grad_norm": 2.545973062515259, "learning_rate": 0.00012133826103870843, "loss": 4.1851, "step": 4050 }, { "epoch": 1.181407218374043, "grad_norm": 2.354886293411255, "learning_rate": 0.00012131880957012256, "loss": 4.2592, "step": 4051 }, { "epoch": 1.1816988698505286, "grad_norm": 2.6179096698760986, "learning_rate": 0.00012129935810153667, "loss": 4.2117, "step": 4052 }, { "epoch": 1.1819905213270143, "grad_norm": 2.1356284618377686, "learning_rate": 0.00012127990663295079, "loss": 4.2378, "step": 4053 }, { "epoch": 1.1822821728034998, "grad_norm": 1.996772050857544, "learning_rate": 0.0001212604551643649, "loss": 3.9917, "step": 4054 }, { "epoch": 1.1825738242799855, "grad_norm": 2.0620925426483154, "learning_rate": 0.00012124100369577905, "loss": 3.9926, "step": 4055 }, { "epoch": 1.182865475756471, "grad_norm": 2.1809468269348145, "learning_rate": 0.00012122155222719316, "loss": 4.2338, "step": 4056 }, { "epoch": 1.1831571272329566, "grad_norm": 2.323641061782837, "learning_rate": 0.00012120210075860728, "loss": 4.188, "step": 4057 }, { "epoch": 1.183448778709442, "grad_norm": 2.5859501361846924, "learning_rate": 0.00012118264929002141, "loss": 3.7764, "step": 4058 }, { "epoch": 1.1837404301859278, "grad_norm": 2.1101789474487305, "learning_rate": 0.00012116319782143552, "loss": 4.2591, "step": 4059 }, { "epoch": 1.1840320816624135, "grad_norm": 2.0963923931121826, "learning_rate": 0.00012114374635284964, "loss": 3.9478, "step": 4060 }, { "epoch": 1.184323733138899, "grad_norm": 2.271765947341919, "learning_rate": 0.00012112429488426378, "loss": 4.2547, "step": 4061 }, { "epoch": 1.1846153846153846, "grad_norm": 1.8995364904403687, "learning_rate": 0.0001211048434156779, "loss": 4.0756, "step": 4062 }, { "epoch": 1.1849070360918703, "grad_norm": 2.1458756923675537, "learning_rate": 0.00012108539194709201, "loss": 3.9549, "step": 4063 }, { "epoch": 1.1851986875683558, "grad_norm": 1.8275220394134521, "learning_rate": 0.00012106594047850613, "loss": 4.1284, "step": 4064 }, { "epoch": 1.1854903390448415, "grad_norm": 2.4734573364257812, "learning_rate": 0.00012104648900992026, "loss": 4.1423, "step": 4065 }, { "epoch": 1.185781990521327, "grad_norm": 2.907109260559082, "learning_rate": 0.00012102703754133437, "loss": 4.0875, "step": 4066 }, { "epoch": 1.1860736419978126, "grad_norm": 1.8809927701950073, "learning_rate": 0.0001210075860727485, "loss": 3.9857, "step": 4067 }, { "epoch": 1.1863652934742983, "grad_norm": 2.2653098106384277, "learning_rate": 0.00012098813460416263, "loss": 4.2117, "step": 4068 }, { "epoch": 1.1866569449507838, "grad_norm": 2.118134021759033, "learning_rate": 0.00012096868313557674, "loss": 3.8745, "step": 4069 }, { "epoch": 1.1869485964272695, "grad_norm": 2.072700023651123, "learning_rate": 0.00012094923166699086, "loss": 3.8159, "step": 4070 }, { "epoch": 1.187240247903755, "grad_norm": 2.3341586589813232, "learning_rate": 0.000120929780198405, "loss": 4.1539, "step": 4071 }, { "epoch": 1.1875318993802406, "grad_norm": 3.078489065170288, "learning_rate": 0.00012091032872981912, "loss": 4.1435, "step": 4072 }, { "epoch": 1.1878235508567263, "grad_norm": 2.095799446105957, "learning_rate": 0.00012089087726123322, "loss": 4.2281, "step": 4073 }, { "epoch": 1.1881152023332118, "grad_norm": 2.342243194580078, "learning_rate": 0.00012087142579264734, "loss": 3.9121, "step": 4074 }, { "epoch": 1.1884068538096975, "grad_norm": 2.4114139080047607, "learning_rate": 0.00012085197432406148, "loss": 4.117, "step": 4075 }, { "epoch": 1.188698505286183, "grad_norm": 2.524322032928467, "learning_rate": 0.00012083252285547559, "loss": 4.2669, "step": 4076 }, { "epoch": 1.1889901567626686, "grad_norm": 3.044973611831665, "learning_rate": 0.00012081307138688971, "loss": 4.2475, "step": 4077 }, { "epoch": 1.1892818082391543, "grad_norm": 2.463224411010742, "learning_rate": 0.00012079361991830384, "loss": 3.9573, "step": 4078 }, { "epoch": 1.1895734597156398, "grad_norm": 2.331392288208008, "learning_rate": 0.00012077416844971797, "loss": 4.3346, "step": 4079 }, { "epoch": 1.1898651111921255, "grad_norm": 2.2910470962524414, "learning_rate": 0.00012075471698113207, "loss": 3.9494, "step": 4080 }, { "epoch": 1.190156762668611, "grad_norm": 2.9296770095825195, "learning_rate": 0.00012073526551254621, "loss": 4.0109, "step": 4081 }, { "epoch": 1.1904484141450966, "grad_norm": 4.714772701263428, "learning_rate": 0.00012071581404396033, "loss": 4.034, "step": 4082 }, { "epoch": 1.1907400656215823, "grad_norm": 2.0426573753356934, "learning_rate": 0.00012069636257537444, "loss": 3.8374, "step": 4083 }, { "epoch": 1.1910317170980678, "grad_norm": 2.1660797595977783, "learning_rate": 0.00012067691110678856, "loss": 3.6162, "step": 4084 }, { "epoch": 1.1913233685745535, "grad_norm": 2.466344118118286, "learning_rate": 0.0001206574596382027, "loss": 4.1158, "step": 4085 }, { "epoch": 1.191615020051039, "grad_norm": 1.9663209915161133, "learning_rate": 0.00012063800816961682, "loss": 4.0894, "step": 4086 }, { "epoch": 1.1919066715275246, "grad_norm": 2.3200666904449463, "learning_rate": 0.00012061855670103093, "loss": 4.1786, "step": 4087 }, { "epoch": 1.1921983230040103, "grad_norm": 1.9573205709457397, "learning_rate": 0.00012059910523244506, "loss": 4.0775, "step": 4088 }, { "epoch": 1.1924899744804958, "grad_norm": 3.1557910442352295, "learning_rate": 0.00012057965376385918, "loss": 4.1118, "step": 4089 }, { "epoch": 1.1927816259569815, "grad_norm": 2.0551466941833496, "learning_rate": 0.00012056020229527329, "loss": 3.9541, "step": 4090 }, { "epoch": 1.193073277433467, "grad_norm": 3.008446216583252, "learning_rate": 0.00012054075082668741, "loss": 4.1649, "step": 4091 }, { "epoch": 1.1933649289099526, "grad_norm": 1.9803087711334229, "learning_rate": 0.00012052129935810155, "loss": 4.035, "step": 4092 }, { "epoch": 1.1936565803864383, "grad_norm": 1.947074055671692, "learning_rate": 0.00012050184788951567, "loss": 3.7912, "step": 4093 }, { "epoch": 1.1939482318629238, "grad_norm": 2.464444875717163, "learning_rate": 0.00012048239642092978, "loss": 3.9898, "step": 4094 }, { "epoch": 1.1942398833394094, "grad_norm": 3.0487959384918213, "learning_rate": 0.00012046294495234391, "loss": 3.9881, "step": 4095 }, { "epoch": 1.194531534815895, "grad_norm": 2.3713629245758057, "learning_rate": 0.00012044349348375803, "loss": 4.0689, "step": 4096 }, { "epoch": 1.1948231862923806, "grad_norm": 2.187082052230835, "learning_rate": 0.00012042404201517214, "loss": 4.1713, "step": 4097 }, { "epoch": 1.1951148377688663, "grad_norm": 2.3020434379577637, "learning_rate": 0.00012040459054658628, "loss": 3.9725, "step": 4098 }, { "epoch": 1.1954064892453518, "grad_norm": 3.7134151458740234, "learning_rate": 0.0001203851390780004, "loss": 4.1322, "step": 4099 }, { "epoch": 1.1956981407218374, "grad_norm": 4.111494064331055, "learning_rate": 0.0001203656876094145, "loss": 4.2557, "step": 4100 }, { "epoch": 1.195989792198323, "grad_norm": 2.950648784637451, "learning_rate": 0.00012034623614082863, "loss": 4.287, "step": 4101 }, { "epoch": 1.1962814436748086, "grad_norm": 2.566547155380249, "learning_rate": 0.00012032678467224276, "loss": 4.1758, "step": 4102 }, { "epoch": 1.1965730951512943, "grad_norm": 2.1043355464935303, "learning_rate": 0.00012030733320365688, "loss": 4.1126, "step": 4103 }, { "epoch": 1.1968647466277798, "grad_norm": 2.37327241897583, "learning_rate": 0.00012028788173507099, "loss": 3.7895, "step": 4104 }, { "epoch": 1.1971563981042654, "grad_norm": 3.3881759643554688, "learning_rate": 0.00012026843026648513, "loss": 4.1718, "step": 4105 }, { "epoch": 1.197448049580751, "grad_norm": 2.8770713806152344, "learning_rate": 0.00012024897879789925, "loss": 4.493, "step": 4106 }, { "epoch": 1.1977397010572366, "grad_norm": 3.609200954437256, "learning_rate": 0.00012022952732931336, "loss": 4.1007, "step": 4107 }, { "epoch": 1.1980313525337223, "grad_norm": 3.1964192390441895, "learning_rate": 0.0001202100758607275, "loss": 3.5371, "step": 4108 }, { "epoch": 1.1983230040102077, "grad_norm": 2.4487836360931396, "learning_rate": 0.00012019062439214161, "loss": 4.1699, "step": 4109 }, { "epoch": 1.1986146554866934, "grad_norm": 2.1934444904327393, "learning_rate": 0.00012017117292355574, "loss": 3.8166, "step": 4110 }, { "epoch": 1.198906306963179, "grad_norm": 2.7097597122192383, "learning_rate": 0.00012015172145496984, "loss": 4.3157, "step": 4111 }, { "epoch": 1.1991979584396646, "grad_norm": 2.1883628368377686, "learning_rate": 0.00012013226998638398, "loss": 3.8432, "step": 4112 }, { "epoch": 1.1994896099161503, "grad_norm": 3.771498203277588, "learning_rate": 0.0001201128185177981, "loss": 4.065, "step": 4113 }, { "epoch": 1.1997812613926357, "grad_norm": 2.630589485168457, "learning_rate": 0.00012009336704921221, "loss": 4.0289, "step": 4114 }, { "epoch": 1.2000729128691214, "grad_norm": 2.3376636505126953, "learning_rate": 0.00012007391558062636, "loss": 3.8121, "step": 4115 }, { "epoch": 1.2003645643456071, "grad_norm": 2.5966389179229736, "learning_rate": 0.00012005446411204047, "loss": 4.0093, "step": 4116 }, { "epoch": 1.2006562158220926, "grad_norm": 3.4214494228363037, "learning_rate": 0.00012003501264345459, "loss": 4.0626, "step": 4117 }, { "epoch": 1.2009478672985783, "grad_norm": 3.208137273788452, "learning_rate": 0.00012001556117486872, "loss": 4.0021, "step": 4118 }, { "epoch": 1.2012395187750637, "grad_norm": 2.651127576828003, "learning_rate": 0.00011999610970628283, "loss": 4.0106, "step": 4119 }, { "epoch": 1.2015311702515494, "grad_norm": 2.328219175338745, "learning_rate": 0.00011997665823769695, "loss": 4.1026, "step": 4120 }, { "epoch": 1.201822821728035, "grad_norm": 1.8185261487960815, "learning_rate": 0.00011995720676911106, "loss": 4.1356, "step": 4121 }, { "epoch": 1.2021144732045206, "grad_norm": 2.0649399757385254, "learning_rate": 0.0001199377553005252, "loss": 4.0563, "step": 4122 }, { "epoch": 1.2024061246810063, "grad_norm": 2.0226123332977295, "learning_rate": 0.00011991830383193932, "loss": 4.2088, "step": 4123 }, { "epoch": 1.2026977761574917, "grad_norm": 2.0907680988311768, "learning_rate": 0.00011989885236335342, "loss": 4.2861, "step": 4124 }, { "epoch": 1.2029894276339774, "grad_norm": 2.2269232273101807, "learning_rate": 0.00011987940089476757, "loss": 4.0876, "step": 4125 }, { "epoch": 1.2032810791104631, "grad_norm": 3.1932241916656494, "learning_rate": 0.00011985994942618168, "loss": 3.9564, "step": 4126 }, { "epoch": 1.2035727305869486, "grad_norm": 2.2908787727355957, "learning_rate": 0.0001198404979575958, "loss": 4.0486, "step": 4127 }, { "epoch": 1.2038643820634343, "grad_norm": 2.548675537109375, "learning_rate": 0.00011982104648900994, "loss": 4.1511, "step": 4128 }, { "epoch": 1.2041560335399197, "grad_norm": 2.000913143157959, "learning_rate": 0.00011980159502042405, "loss": 3.9409, "step": 4129 }, { "epoch": 1.2044476850164054, "grad_norm": 2.291252613067627, "learning_rate": 0.00011978214355183817, "loss": 4.1002, "step": 4130 }, { "epoch": 1.2047393364928909, "grad_norm": 2.5836780071258545, "learning_rate": 0.00011976269208325228, "loss": 4.0785, "step": 4131 }, { "epoch": 1.2050309879693766, "grad_norm": 1.740882396697998, "learning_rate": 0.00011974324061466642, "loss": 4.132, "step": 4132 }, { "epoch": 1.2053226394458623, "grad_norm": 2.339540958404541, "learning_rate": 0.00011972378914608053, "loss": 4.3507, "step": 4133 }, { "epoch": 1.2056142909223477, "grad_norm": 2.2202935218811035, "learning_rate": 0.00011970433767749465, "loss": 4.0248, "step": 4134 }, { "epoch": 1.2059059423988334, "grad_norm": 1.977535605430603, "learning_rate": 0.00011968488620890879, "loss": 3.7649, "step": 4135 }, { "epoch": 1.206197593875319, "grad_norm": 2.987647294998169, "learning_rate": 0.0001196654347403229, "loss": 3.9468, "step": 4136 }, { "epoch": 1.2064892453518046, "grad_norm": 2.289363384246826, "learning_rate": 0.00011964598327173702, "loss": 4.0994, "step": 4137 }, { "epoch": 1.2067808968282903, "grad_norm": 2.4260029792785645, "learning_rate": 0.00011962653180315115, "loss": 4.0654, "step": 4138 }, { "epoch": 1.2070725483047757, "grad_norm": 2.4500174522399902, "learning_rate": 0.00011960708033456528, "loss": 3.7368, "step": 4139 }, { "epoch": 1.2073641997812614, "grad_norm": 2.3199617862701416, "learning_rate": 0.00011958762886597938, "loss": 4.3057, "step": 4140 }, { "epoch": 1.2076558512577469, "grad_norm": 2.406940221786499, "learning_rate": 0.0001195681773973935, "loss": 3.9404, "step": 4141 }, { "epoch": 1.2079475027342326, "grad_norm": 2.381561756134033, "learning_rate": 0.00011954872592880764, "loss": 4.213, "step": 4142 }, { "epoch": 1.2082391542107183, "grad_norm": 3.2024660110473633, "learning_rate": 0.00011952927446022175, "loss": 4.0193, "step": 4143 }, { "epoch": 1.2085308056872037, "grad_norm": 1.880942940711975, "learning_rate": 0.00011950982299163587, "loss": 4.0695, "step": 4144 }, { "epoch": 1.2088224571636894, "grad_norm": 2.309000253677368, "learning_rate": 0.00011949037152305, "loss": 4.1792, "step": 4145 }, { "epoch": 1.209114108640175, "grad_norm": 2.266232967376709, "learning_rate": 0.00011947092005446411, "loss": 3.9301, "step": 4146 }, { "epoch": 1.2094057601166606, "grad_norm": 2.516253709793091, "learning_rate": 0.00011945146858587823, "loss": 4.1801, "step": 4147 }, { "epoch": 1.2096974115931463, "grad_norm": 2.557368278503418, "learning_rate": 0.00011943201711729237, "loss": 4.107, "step": 4148 }, { "epoch": 1.2099890630696317, "grad_norm": 2.3135244846343994, "learning_rate": 0.00011941256564870649, "loss": 4.1154, "step": 4149 }, { "epoch": 1.2102807145461174, "grad_norm": 1.892802119255066, "learning_rate": 0.0001193931141801206, "loss": 3.9571, "step": 4150 }, { "epoch": 1.2105723660226029, "grad_norm": 2.444793224334717, "learning_rate": 0.00011937366271153472, "loss": 3.8548, "step": 4151 }, { "epoch": 1.2108640174990886, "grad_norm": 2.270292282104492, "learning_rate": 0.00011935421124294886, "loss": 4.2077, "step": 4152 }, { "epoch": 1.2111556689755743, "grad_norm": 3.068079948425293, "learning_rate": 0.00011933475977436296, "loss": 3.9363, "step": 4153 }, { "epoch": 1.2114473204520597, "grad_norm": 2.0397226810455322, "learning_rate": 0.00011931530830577709, "loss": 4.3026, "step": 4154 }, { "epoch": 1.2117389719285454, "grad_norm": 2.6696274280548096, "learning_rate": 0.00011929585683719122, "loss": 4.2743, "step": 4155 }, { "epoch": 1.212030623405031, "grad_norm": 3.145875930786133, "learning_rate": 0.00011927640536860534, "loss": 4.2476, "step": 4156 }, { "epoch": 1.2123222748815166, "grad_norm": 2.8631303310394287, "learning_rate": 0.00011925695390001945, "loss": 3.923, "step": 4157 }, { "epoch": 1.2126139263580022, "grad_norm": 2.8744819164276123, "learning_rate": 0.00011923750243143359, "loss": 4.24, "step": 4158 }, { "epoch": 1.2129055778344877, "grad_norm": 2.41605806350708, "learning_rate": 0.00011921805096284771, "loss": 3.9767, "step": 4159 }, { "epoch": 1.2131972293109734, "grad_norm": 2.0304038524627686, "learning_rate": 0.00011919859949426182, "loss": 4.0742, "step": 4160 }, { "epoch": 1.2134888807874589, "grad_norm": 3.1839213371276855, "learning_rate": 0.00011917914802567594, "loss": 4.1077, "step": 4161 }, { "epoch": 1.2137805322639446, "grad_norm": 2.490093946456909, "learning_rate": 0.00011915969655709007, "loss": 4.0556, "step": 4162 }, { "epoch": 1.2140721837404302, "grad_norm": 2.9265971183776855, "learning_rate": 0.0001191402450885042, "loss": 4.0233, "step": 4163 }, { "epoch": 1.2143638352169157, "grad_norm": 2.1112818717956543, "learning_rate": 0.0001191207936199183, "loss": 4.0312, "step": 4164 }, { "epoch": 1.2146554866934014, "grad_norm": 2.303847312927246, "learning_rate": 0.00011910134215133244, "loss": 3.8988, "step": 4165 }, { "epoch": 1.214947138169887, "grad_norm": 2.817095994949341, "learning_rate": 0.00011908189068274656, "loss": 4.2921, "step": 4166 }, { "epoch": 1.2152387896463726, "grad_norm": 2.178354263305664, "learning_rate": 0.00011906243921416067, "loss": 3.9708, "step": 4167 }, { "epoch": 1.2155304411228582, "grad_norm": 2.0142295360565186, "learning_rate": 0.00011904298774557479, "loss": 4.2975, "step": 4168 }, { "epoch": 1.2158220925993437, "grad_norm": 2.4076290130615234, "learning_rate": 0.00011902353627698892, "loss": 4.2648, "step": 4169 }, { "epoch": 1.2161137440758294, "grad_norm": 2.6477248668670654, "learning_rate": 0.00011900408480840303, "loss": 4.1793, "step": 4170 }, { "epoch": 1.2164053955523149, "grad_norm": 3.0403056144714355, "learning_rate": 0.00011898463333981715, "loss": 3.9796, "step": 4171 }, { "epoch": 1.2166970470288005, "grad_norm": 3.053955078125, "learning_rate": 0.00011896518187123129, "loss": 3.9571, "step": 4172 }, { "epoch": 1.2169886985052862, "grad_norm": 2.2461724281311035, "learning_rate": 0.00011894573040264541, "loss": 3.9518, "step": 4173 }, { "epoch": 1.2172803499817717, "grad_norm": 1.8768563270568848, "learning_rate": 0.00011892627893405952, "loss": 4.1423, "step": 4174 }, { "epoch": 1.2175720014582574, "grad_norm": 3.2517709732055664, "learning_rate": 0.00011890682746547365, "loss": 3.9051, "step": 4175 }, { "epoch": 1.217863652934743, "grad_norm": 2.416224479675293, "learning_rate": 0.00011888737599688777, "loss": 4.3456, "step": 4176 }, { "epoch": 1.2181553044112285, "grad_norm": 2.3356592655181885, "learning_rate": 0.00011886792452830188, "loss": 4.1001, "step": 4177 }, { "epoch": 1.2184469558877142, "grad_norm": 2.099792003631592, "learning_rate": 0.000118848473059716, "loss": 4.0387, "step": 4178 }, { "epoch": 1.2187386073641997, "grad_norm": 1.9284166097640991, "learning_rate": 0.00011882902159113014, "loss": 3.8696, "step": 4179 }, { "epoch": 1.2190302588406854, "grad_norm": 3.410888195037842, "learning_rate": 0.00011880957012254426, "loss": 4.0766, "step": 4180 }, { "epoch": 1.219321910317171, "grad_norm": 2.602428913116455, "learning_rate": 0.00011879011865395837, "loss": 4.0262, "step": 4181 }, { "epoch": 1.2196135617936565, "grad_norm": 2.9490573406219482, "learning_rate": 0.0001187706671853725, "loss": 4.1644, "step": 4182 }, { "epoch": 1.2199052132701422, "grad_norm": 4.896134853363037, "learning_rate": 0.00011875121571678663, "loss": 4.0402, "step": 4183 }, { "epoch": 1.2201968647466277, "grad_norm": 2.373335361480713, "learning_rate": 0.00011873176424820073, "loss": 3.9461, "step": 4184 }, { "epoch": 1.2204885162231134, "grad_norm": 2.939681053161621, "learning_rate": 0.00011871231277961488, "loss": 4.0208, "step": 4185 }, { "epoch": 1.220780167699599, "grad_norm": 3.531458854675293, "learning_rate": 0.00011869286131102899, "loss": 4.2655, "step": 4186 }, { "epoch": 1.2210718191760845, "grad_norm": 2.324641704559326, "learning_rate": 0.00011867340984244311, "loss": 3.6149, "step": 4187 }, { "epoch": 1.2213634706525702, "grad_norm": 2.2696099281311035, "learning_rate": 0.00011865395837385722, "loss": 4.2269, "step": 4188 }, { "epoch": 1.2216551221290557, "grad_norm": 3.1901659965515137, "learning_rate": 0.00011863450690527136, "loss": 4.4023, "step": 4189 }, { "epoch": 1.2219467736055414, "grad_norm": 3.2589457035064697, "learning_rate": 0.00011861505543668548, "loss": 4.1311, "step": 4190 }, { "epoch": 1.222238425082027, "grad_norm": 2.5743162631988525, "learning_rate": 0.00011859560396809958, "loss": 3.9953, "step": 4191 }, { "epoch": 1.2225300765585125, "grad_norm": 2.7254903316497803, "learning_rate": 0.00011857615249951372, "loss": 4.1048, "step": 4192 }, { "epoch": 1.2228217280349982, "grad_norm": 3.3069326877593994, "learning_rate": 0.00011855670103092784, "loss": 4.1734, "step": 4193 }, { "epoch": 1.2231133795114837, "grad_norm": 2.392585515975952, "learning_rate": 0.00011853724956234196, "loss": 4.0634, "step": 4194 }, { "epoch": 1.2234050309879694, "grad_norm": 2.242854356765747, "learning_rate": 0.0001185177980937561, "loss": 3.9919, "step": 4195 }, { "epoch": 1.223696682464455, "grad_norm": 2.45263934135437, "learning_rate": 0.0001184983466251702, "loss": 3.8693, "step": 4196 }, { "epoch": 1.2239883339409405, "grad_norm": 2.2669694423675537, "learning_rate": 0.00011847889515658433, "loss": 4.1325, "step": 4197 }, { "epoch": 1.2242799854174262, "grad_norm": 3.046391487121582, "learning_rate": 0.00011845944368799844, "loss": 3.9212, "step": 4198 }, { "epoch": 1.2245716368939117, "grad_norm": 2.473529577255249, "learning_rate": 0.00011843999221941257, "loss": 4.0316, "step": 4199 }, { "epoch": 1.2248632883703974, "grad_norm": 2.1759586334228516, "learning_rate": 0.00011842054075082669, "loss": 4.1894, "step": 4200 }, { "epoch": 1.225154939846883, "grad_norm": 2.530287981033325, "learning_rate": 0.0001184010892822408, "loss": 3.9065, "step": 4201 }, { "epoch": 1.2254465913233685, "grad_norm": 2.1929538249969482, "learning_rate": 0.00011838163781365495, "loss": 4.116, "step": 4202 }, { "epoch": 1.2257382427998542, "grad_norm": 2.5620667934417725, "learning_rate": 0.00011836218634506906, "loss": 4.3358, "step": 4203 }, { "epoch": 1.2260298942763397, "grad_norm": 2.1083736419677734, "learning_rate": 0.00011834273487648318, "loss": 3.8419, "step": 4204 }, { "epoch": 1.2263215457528254, "grad_norm": 2.3795158863067627, "learning_rate": 0.00011832328340789731, "loss": 4.1957, "step": 4205 }, { "epoch": 1.226613197229311, "grad_norm": 1.9319065809249878, "learning_rate": 0.00011830383193931142, "loss": 3.9082, "step": 4206 }, { "epoch": 1.2269048487057965, "grad_norm": 3.2160072326660156, "learning_rate": 0.00011828438047072554, "loss": 3.9022, "step": 4207 }, { "epoch": 1.2271965001822822, "grad_norm": 2.5359532833099365, "learning_rate": 0.00011826492900213965, "loss": 4.2924, "step": 4208 }, { "epoch": 1.2274881516587677, "grad_norm": 1.9544857740402222, "learning_rate": 0.0001182454775335538, "loss": 4.1243, "step": 4209 }, { "epoch": 1.2277798031352534, "grad_norm": 2.473733901977539, "learning_rate": 0.00011822602606496791, "loss": 3.9225, "step": 4210 }, { "epoch": 1.228071454611739, "grad_norm": 2.0371952056884766, "learning_rate": 0.00011820657459638203, "loss": 4.1277, "step": 4211 }, { "epoch": 1.2283631060882245, "grad_norm": 2.2853894233703613, "learning_rate": 0.00011818712312779617, "loss": 3.8342, "step": 4212 }, { "epoch": 1.2286547575647102, "grad_norm": 2.47869873046875, "learning_rate": 0.00011816767165921027, "loss": 3.7934, "step": 4213 }, { "epoch": 1.2289464090411957, "grad_norm": 2.5780816078186035, "learning_rate": 0.0001181482201906244, "loss": 3.7444, "step": 4214 }, { "epoch": 1.2292380605176814, "grad_norm": 2.472120761871338, "learning_rate": 0.00011812876872203853, "loss": 4.1145, "step": 4215 }, { "epoch": 1.229529711994167, "grad_norm": 2.72788143157959, "learning_rate": 0.00011810931725345264, "loss": 4.0137, "step": 4216 }, { "epoch": 1.2298213634706525, "grad_norm": 2.390444040298462, "learning_rate": 0.00011808986578486676, "loss": 3.8911, "step": 4217 }, { "epoch": 1.2301130149471382, "grad_norm": 2.3992772102355957, "learning_rate": 0.00011807041431628088, "loss": 4.1686, "step": 4218 }, { "epoch": 1.230404666423624, "grad_norm": 2.219005584716797, "learning_rate": 0.00011805096284769502, "loss": 3.908, "step": 4219 }, { "epoch": 1.2306963179001094, "grad_norm": 2.281074285507202, "learning_rate": 0.00011803151137910912, "loss": 4.07, "step": 4220 }, { "epoch": 1.230987969376595, "grad_norm": 2.199761390686035, "learning_rate": 0.00011801205991052325, "loss": 3.9255, "step": 4221 }, { "epoch": 1.2312796208530805, "grad_norm": 2.0153753757476807, "learning_rate": 0.00011799260844193738, "loss": 4.0019, "step": 4222 }, { "epoch": 1.2315712723295662, "grad_norm": 3.0232603549957275, "learning_rate": 0.00011797315697335149, "loss": 4.0304, "step": 4223 }, { "epoch": 1.2318629238060517, "grad_norm": 1.9940783977508545, "learning_rate": 0.00011795370550476561, "loss": 4.0373, "step": 4224 }, { "epoch": 1.2321545752825374, "grad_norm": 2.158579111099243, "learning_rate": 0.00011793425403617975, "loss": 3.9479, "step": 4225 }, { "epoch": 1.232446226759023, "grad_norm": 2.1510531902313232, "learning_rate": 0.00011791480256759387, "loss": 3.9354, "step": 4226 }, { "epoch": 1.2327378782355085, "grad_norm": 2.5510852336883545, "learning_rate": 0.00011789535109900798, "loss": 4.1243, "step": 4227 }, { "epoch": 1.2330295297119942, "grad_norm": 2.8241305351257324, "learning_rate": 0.0001178758996304221, "loss": 4.3282, "step": 4228 }, { "epoch": 1.2333211811884799, "grad_norm": 2.2188987731933594, "learning_rate": 0.00011785644816183623, "loss": 4.0109, "step": 4229 }, { "epoch": 1.2336128326649654, "grad_norm": 2.005450487136841, "learning_rate": 0.00011783699669325034, "loss": 4.0056, "step": 4230 }, { "epoch": 1.233904484141451, "grad_norm": 2.580953598022461, "learning_rate": 0.00011781754522466446, "loss": 4.0652, "step": 4231 }, { "epoch": 1.2341961356179365, "grad_norm": 2.428070545196533, "learning_rate": 0.0001177980937560786, "loss": 4.1992, "step": 4232 }, { "epoch": 1.2344877870944222, "grad_norm": 3.0428433418273926, "learning_rate": 0.00011777864228749272, "loss": 3.9177, "step": 4233 }, { "epoch": 1.2347794385709077, "grad_norm": 2.501129150390625, "learning_rate": 0.00011775919081890683, "loss": 4.0545, "step": 4234 }, { "epoch": 1.2350710900473933, "grad_norm": 2.232322931289673, "learning_rate": 0.00011773973935032096, "loss": 4.3732, "step": 4235 }, { "epoch": 1.235362741523879, "grad_norm": 2.9199485778808594, "learning_rate": 0.00011772028788173508, "loss": 4.052, "step": 4236 }, { "epoch": 1.2356543930003645, "grad_norm": 2.3426496982574463, "learning_rate": 0.00011770083641314919, "loss": 4.019, "step": 4237 }, { "epoch": 1.2359460444768502, "grad_norm": 2.441758871078491, "learning_rate": 0.00011768138494456331, "loss": 4.5454, "step": 4238 }, { "epoch": 1.2362376959533359, "grad_norm": 2.165053367614746, "learning_rate": 0.00011766193347597745, "loss": 4.0106, "step": 4239 }, { "epoch": 1.2365293474298213, "grad_norm": 3.8748302459716797, "learning_rate": 0.00011764248200739157, "loss": 3.7329, "step": 4240 }, { "epoch": 1.236820998906307, "grad_norm": 3.0616867542266846, "learning_rate": 0.00011762303053880568, "loss": 4.0072, "step": 4241 }, { "epoch": 1.2371126503827925, "grad_norm": 2.2497541904449463, "learning_rate": 0.00011760357907021981, "loss": 4.3501, "step": 4242 }, { "epoch": 1.2374043018592782, "grad_norm": 2.350374221801758, "learning_rate": 0.00011758412760163393, "loss": 4.1005, "step": 4243 }, { "epoch": 1.2376959533357637, "grad_norm": 2.0503952503204346, "learning_rate": 0.00011756467613304804, "loss": 4.1516, "step": 4244 }, { "epoch": 1.2379876048122493, "grad_norm": 1.9898689985275269, "learning_rate": 0.00011754522466446218, "loss": 4.1963, "step": 4245 }, { "epoch": 1.238279256288735, "grad_norm": 1.8934861421585083, "learning_rate": 0.0001175257731958763, "loss": 4.1444, "step": 4246 }, { "epoch": 1.2385709077652205, "grad_norm": 2.8478286266326904, "learning_rate": 0.00011750632172729041, "loss": 3.6727, "step": 4247 }, { "epoch": 1.2388625592417062, "grad_norm": 1.9717459678649902, "learning_rate": 0.00011748687025870453, "loss": 4.0079, "step": 4248 }, { "epoch": 1.2391542107181919, "grad_norm": 2.3675692081451416, "learning_rate": 0.00011746741879011866, "loss": 4.047, "step": 4249 }, { "epoch": 1.2394458621946773, "grad_norm": 1.9449764490127563, "learning_rate": 0.00011744796732153279, "loss": 4.087, "step": 4250 }, { "epoch": 1.239737513671163, "grad_norm": 2.304633140563965, "learning_rate": 0.0001174285158529469, "loss": 3.9597, "step": 4251 }, { "epoch": 1.2400291651476485, "grad_norm": 2.1240150928497314, "learning_rate": 0.00011740906438436103, "loss": 3.9279, "step": 4252 }, { "epoch": 1.2403208166241342, "grad_norm": 2.7026784420013428, "learning_rate": 0.00011738961291577515, "loss": 3.8604, "step": 4253 }, { "epoch": 1.2406124681006196, "grad_norm": 2.5770113468170166, "learning_rate": 0.00011737016144718926, "loss": 4.1546, "step": 4254 }, { "epoch": 1.2409041195771053, "grad_norm": 2.6850221157073975, "learning_rate": 0.00011735070997860338, "loss": 3.9706, "step": 4255 }, { "epoch": 1.241195771053591, "grad_norm": 1.9216351509094238, "learning_rate": 0.00011733125851001752, "loss": 3.9176, "step": 4256 }, { "epoch": 1.2414874225300765, "grad_norm": 2.159736156463623, "learning_rate": 0.00011731180704143164, "loss": 4.1124, "step": 4257 }, { "epoch": 1.2417790740065622, "grad_norm": 2.653773784637451, "learning_rate": 0.00011729235557284575, "loss": 4.1225, "step": 4258 }, { "epoch": 1.2420707254830479, "grad_norm": 2.6407077312469482, "learning_rate": 0.00011727290410425988, "loss": 4.1308, "step": 4259 }, { "epoch": 1.2423623769595333, "grad_norm": 3.4223413467407227, "learning_rate": 0.000117253452635674, "loss": 4.1348, "step": 4260 }, { "epoch": 1.242654028436019, "grad_norm": 2.246087074279785, "learning_rate": 0.00011723400116708811, "loss": 3.9693, "step": 4261 }, { "epoch": 1.2429456799125045, "grad_norm": 2.0223021507263184, "learning_rate": 0.00011721454969850225, "loss": 3.7634, "step": 4262 }, { "epoch": 1.2432373313889902, "grad_norm": 1.9645020961761475, "learning_rate": 0.00011719509822991637, "loss": 3.8677, "step": 4263 }, { "epoch": 1.2435289828654756, "grad_norm": 2.217189311981201, "learning_rate": 0.00011717564676133049, "loss": 4.1946, "step": 4264 }, { "epoch": 1.2438206343419613, "grad_norm": 2.10685658454895, "learning_rate": 0.0001171561952927446, "loss": 3.9417, "step": 4265 }, { "epoch": 1.244112285818447, "grad_norm": 2.994438648223877, "learning_rate": 0.00011713674382415873, "loss": 4.155, "step": 4266 }, { "epoch": 1.2444039372949325, "grad_norm": 3.0030148029327393, "learning_rate": 0.00011711729235557285, "loss": 3.9293, "step": 4267 }, { "epoch": 1.2446955887714182, "grad_norm": 2.2941503524780273, "learning_rate": 0.00011709784088698696, "loss": 4.1709, "step": 4268 }, { "epoch": 1.2449872402479039, "grad_norm": 2.5313880443573, "learning_rate": 0.0001170783894184011, "loss": 4.0756, "step": 4269 }, { "epoch": 1.2452788917243893, "grad_norm": 2.069908857345581, "learning_rate": 0.00011705893794981522, "loss": 3.6644, "step": 4270 }, { "epoch": 1.245570543200875, "grad_norm": 2.1185569763183594, "learning_rate": 0.00011703948648122933, "loss": 4.1694, "step": 4271 }, { "epoch": 1.2458621946773605, "grad_norm": 2.7680110931396484, "learning_rate": 0.00011702003501264347, "loss": 4.3009, "step": 4272 }, { "epoch": 1.2461538461538462, "grad_norm": 2.2031948566436768, "learning_rate": 0.00011700058354405758, "loss": 3.9466, "step": 4273 }, { "epoch": 1.2464454976303316, "grad_norm": 2.3521316051483154, "learning_rate": 0.0001169811320754717, "loss": 3.8984, "step": 4274 }, { "epoch": 1.2467371491068173, "grad_norm": 2.1416397094726562, "learning_rate": 0.00011696168060688581, "loss": 4.2409, "step": 4275 }, { "epoch": 1.247028800583303, "grad_norm": 1.9563871622085571, "learning_rate": 0.00011694222913829995, "loss": 4.1408, "step": 4276 }, { "epoch": 1.2473204520597885, "grad_norm": 2.459298610687256, "learning_rate": 0.00011692277766971407, "loss": 3.7718, "step": 4277 }, { "epoch": 1.2476121035362742, "grad_norm": 2.7439699172973633, "learning_rate": 0.00011690332620112818, "loss": 4.2005, "step": 4278 }, { "epoch": 1.2479037550127599, "grad_norm": 2.1924734115600586, "learning_rate": 0.00011688387473254233, "loss": 4.0837, "step": 4279 }, { "epoch": 1.2481954064892453, "grad_norm": 2.082099676132202, "learning_rate": 0.00011686442326395643, "loss": 4.121, "step": 4280 }, { "epoch": 1.248487057965731, "grad_norm": 2.6340160369873047, "learning_rate": 0.00011684497179537056, "loss": 4.1758, "step": 4281 }, { "epoch": 1.2487787094422165, "grad_norm": 2.5328471660614014, "learning_rate": 0.00011682552032678469, "loss": 3.9634, "step": 4282 }, { "epoch": 1.2490703609187022, "grad_norm": 2.1403417587280273, "learning_rate": 0.0001168060688581988, "loss": 4.2476, "step": 4283 }, { "epoch": 1.2493620123951876, "grad_norm": 1.9137163162231445, "learning_rate": 0.00011678661738961292, "loss": 4.0267, "step": 4284 }, { "epoch": 1.2496536638716733, "grad_norm": 2.3506722450256348, "learning_rate": 0.00011676716592102703, "loss": 4.2537, "step": 4285 }, { "epoch": 1.249945315348159, "grad_norm": 2.3826653957366943, "learning_rate": 0.00011674771445244118, "loss": 4.0322, "step": 4286 }, { "epoch": 1.2502369668246445, "grad_norm": 2.608276844024658, "learning_rate": 0.00011672826298385528, "loss": 4.1506, "step": 4287 }, { "epoch": 1.2505286183011302, "grad_norm": 2.05513858795166, "learning_rate": 0.0001167088115152694, "loss": 4.2345, "step": 4288 }, { "epoch": 1.2508202697776158, "grad_norm": 1.960141658782959, "learning_rate": 0.00011668936004668354, "loss": 3.9238, "step": 4289 }, { "epoch": 1.2511119212541013, "grad_norm": 2.7255074977874756, "learning_rate": 0.00011666990857809765, "loss": 4.1344, "step": 4290 }, { "epoch": 1.251403572730587, "grad_norm": 2.6346585750579834, "learning_rate": 0.00011665045710951177, "loss": 4.2258, "step": 4291 }, { "epoch": 1.2516952242070725, "grad_norm": 2.7461307048797607, "learning_rate": 0.0001166310056409259, "loss": 3.8377, "step": 4292 }, { "epoch": 1.2519868756835582, "grad_norm": 3.011099100112915, "learning_rate": 0.00011661155417234001, "loss": 4.132, "step": 4293 }, { "epoch": 1.2522785271600436, "grad_norm": 2.2868492603302, "learning_rate": 0.00011659210270375414, "loss": 4.007, "step": 4294 }, { "epoch": 1.2525701786365293, "grad_norm": 2.308698892593384, "learning_rate": 0.00011657265123516826, "loss": 4.1748, "step": 4295 }, { "epoch": 1.252861830113015, "grad_norm": 2.117832899093628, "learning_rate": 0.00011655319976658239, "loss": 3.8036, "step": 4296 }, { "epoch": 1.2531534815895005, "grad_norm": 2.1717119216918945, "learning_rate": 0.0001165337482979965, "loss": 3.9605, "step": 4297 }, { "epoch": 1.2534451330659862, "grad_norm": 2.2978827953338623, "learning_rate": 0.00011651429682941062, "loss": 4.1715, "step": 4298 }, { "epoch": 1.2537367845424718, "grad_norm": 1.9889816045761108, "learning_rate": 0.00011649484536082476, "loss": 4.0859, "step": 4299 }, { "epoch": 1.2540284360189573, "grad_norm": 2.768040895462036, "learning_rate": 0.00011647539389223887, "loss": 3.8078, "step": 4300 }, { "epoch": 1.254320087495443, "grad_norm": 3.7404186725616455, "learning_rate": 0.00011645594242365299, "loss": 3.881, "step": 4301 }, { "epoch": 1.2546117389719287, "grad_norm": 2.2997488975524902, "learning_rate": 0.00011643649095506712, "loss": 4.0381, "step": 4302 }, { "epoch": 1.2549033904484141, "grad_norm": 2.283803701400757, "learning_rate": 0.00011641703948648124, "loss": 4.482, "step": 4303 }, { "epoch": 1.2551950419248996, "grad_norm": 2.2290544509887695, "learning_rate": 0.00011639758801789535, "loss": 4.1836, "step": 4304 }, { "epoch": 1.2554866934013853, "grad_norm": 2.038701295852661, "learning_rate": 0.00011637813654930947, "loss": 4.2351, "step": 4305 }, { "epoch": 1.255778344877871, "grad_norm": 1.8396620750427246, "learning_rate": 0.00011635868508072361, "loss": 4.1089, "step": 4306 }, { "epoch": 1.2560699963543565, "grad_norm": 2.4895095825195312, "learning_rate": 0.00011633923361213772, "loss": 4.0633, "step": 4307 }, { "epoch": 1.2563616478308421, "grad_norm": 2.0151045322418213, "learning_rate": 0.00011631978214355184, "loss": 4.1535, "step": 4308 }, { "epoch": 1.2566532993073278, "grad_norm": 2.2065470218658447, "learning_rate": 0.00011630033067496597, "loss": 4.1921, "step": 4309 }, { "epoch": 1.2569449507838133, "grad_norm": 2.5180768966674805, "learning_rate": 0.0001162808792063801, "loss": 4.2383, "step": 4310 }, { "epoch": 1.257236602260299, "grad_norm": 2.3894574642181396, "learning_rate": 0.0001162614277377942, "loss": 4.0037, "step": 4311 }, { "epoch": 1.2575282537367847, "grad_norm": 2.3862075805664062, "learning_rate": 0.00011624197626920834, "loss": 3.8553, "step": 4312 }, { "epoch": 1.2578199052132701, "grad_norm": 2.55191707611084, "learning_rate": 0.00011622252480062246, "loss": 4.1424, "step": 4313 }, { "epoch": 1.2581115566897556, "grad_norm": 1.8749159574508667, "learning_rate": 0.00011620307333203657, "loss": 4.2406, "step": 4314 }, { "epoch": 1.2584032081662413, "grad_norm": 2.305112600326538, "learning_rate": 0.00011618362186345069, "loss": 3.9015, "step": 4315 }, { "epoch": 1.258694859642727, "grad_norm": 2.784702777862549, "learning_rate": 0.00011616417039486482, "loss": 3.8333, "step": 4316 }, { "epoch": 1.2589865111192124, "grad_norm": 2.362748622894287, "learning_rate": 0.00011614471892627893, "loss": 3.7541, "step": 4317 }, { "epoch": 1.2592781625956981, "grad_norm": 2.884686231613159, "learning_rate": 0.00011612526745769305, "loss": 3.8187, "step": 4318 }, { "epoch": 1.2595698140721838, "grad_norm": 1.981341004371643, "learning_rate": 0.00011610581598910719, "loss": 4.0483, "step": 4319 }, { "epoch": 1.2598614655486693, "grad_norm": 2.2419517040252686, "learning_rate": 0.00011608636452052131, "loss": 3.7658, "step": 4320 }, { "epoch": 1.260153117025155, "grad_norm": 2.367119789123535, "learning_rate": 0.00011606691305193542, "loss": 4.2143, "step": 4321 }, { "epoch": 1.2604447685016407, "grad_norm": 2.055678606033325, "learning_rate": 0.00011604746158334955, "loss": 3.9603, "step": 4322 }, { "epoch": 1.2607364199781261, "grad_norm": 2.2185018062591553, "learning_rate": 0.00011602801011476368, "loss": 4.2813, "step": 4323 }, { "epoch": 1.2610280714546118, "grad_norm": 3.239122152328491, "learning_rate": 0.00011600855864617778, "loss": 3.9643, "step": 4324 }, { "epoch": 1.2613197229310973, "grad_norm": 2.1971752643585205, "learning_rate": 0.0001159891071775919, "loss": 4.0253, "step": 4325 }, { "epoch": 1.261611374407583, "grad_norm": 3.3503623008728027, "learning_rate": 0.00011596965570900604, "loss": 3.7643, "step": 4326 }, { "epoch": 1.2619030258840684, "grad_norm": 2.5999202728271484, "learning_rate": 0.00011595020424042016, "loss": 3.9696, "step": 4327 }, { "epoch": 1.2621946773605541, "grad_norm": 2.4039242267608643, "learning_rate": 0.00011593075277183427, "loss": 3.9524, "step": 4328 }, { "epoch": 1.2624863288370398, "grad_norm": 2.535025119781494, "learning_rate": 0.0001159113013032484, "loss": 4.1784, "step": 4329 }, { "epoch": 1.2627779803135253, "grad_norm": 1.8126307725906372, "learning_rate": 0.00011589184983466253, "loss": 3.9946, "step": 4330 }, { "epoch": 1.263069631790011, "grad_norm": 2.2125556468963623, "learning_rate": 0.00011587239836607664, "loss": 4.1071, "step": 4331 }, { "epoch": 1.2633612832664967, "grad_norm": 2.2719409465789795, "learning_rate": 0.00011585294689749076, "loss": 3.9075, "step": 4332 }, { "epoch": 1.2636529347429821, "grad_norm": 1.8141487836837769, "learning_rate": 0.00011583349542890489, "loss": 4.0401, "step": 4333 }, { "epoch": 1.2639445862194678, "grad_norm": 2.2945733070373535, "learning_rate": 0.00011581404396031901, "loss": 3.9978, "step": 4334 }, { "epoch": 1.2642362376959533, "grad_norm": 1.9899669885635376, "learning_rate": 0.00011579459249173312, "loss": 4.0138, "step": 4335 }, { "epoch": 1.264527889172439, "grad_norm": 1.901266098022461, "learning_rate": 0.00011577514102314726, "loss": 4.031, "step": 4336 }, { "epoch": 1.2648195406489244, "grad_norm": 2.39371657371521, "learning_rate": 0.00011575568955456138, "loss": 4.1803, "step": 4337 }, { "epoch": 1.2651111921254101, "grad_norm": 2.8934130668640137, "learning_rate": 0.00011573623808597549, "loss": 4.1864, "step": 4338 }, { "epoch": 1.2654028436018958, "grad_norm": 2.806544542312622, "learning_rate": 0.00011571678661738962, "loss": 4.1826, "step": 4339 }, { "epoch": 1.2656944950783813, "grad_norm": 2.652540683746338, "learning_rate": 0.00011569733514880374, "loss": 3.801, "step": 4340 }, { "epoch": 1.265986146554867, "grad_norm": 1.942192792892456, "learning_rate": 0.00011567788368021786, "loss": 4.4317, "step": 4341 }, { "epoch": 1.2662777980313527, "grad_norm": 2.634169816970825, "learning_rate": 0.00011565843221163197, "loss": 4.2518, "step": 4342 }, { "epoch": 1.2665694495078381, "grad_norm": 2.2147202491760254, "learning_rate": 0.00011563898074304611, "loss": 4.122, "step": 4343 }, { "epoch": 1.2668611009843238, "grad_norm": 2.114490509033203, "learning_rate": 0.00011561952927446023, "loss": 4.1624, "step": 4344 }, { "epoch": 1.2671527524608093, "grad_norm": 2.307236909866333, "learning_rate": 0.00011560007780587434, "loss": 4.0086, "step": 4345 }, { "epoch": 1.267444403937295, "grad_norm": 2.35206937789917, "learning_rate": 0.00011558062633728847, "loss": 4.1141, "step": 4346 }, { "epoch": 1.2677360554137804, "grad_norm": 2.2241501808166504, "learning_rate": 0.0001155611748687026, "loss": 4.4256, "step": 4347 }, { "epoch": 1.2680277068902661, "grad_norm": 2.208279848098755, "learning_rate": 0.0001155417234001167, "loss": 3.9859, "step": 4348 }, { "epoch": 1.2683193583667518, "grad_norm": 2.1399505138397217, "learning_rate": 0.00011552227193153085, "loss": 3.6886, "step": 4349 }, { "epoch": 1.2686110098432373, "grad_norm": 2.4335789680480957, "learning_rate": 0.00011550282046294496, "loss": 4.1671, "step": 4350 }, { "epoch": 1.268902661319723, "grad_norm": 2.7737507820129395, "learning_rate": 0.00011548336899435908, "loss": 3.866, "step": 4351 }, { "epoch": 1.2691943127962086, "grad_norm": 2.0124175548553467, "learning_rate": 0.00011546391752577319, "loss": 3.8399, "step": 4352 }, { "epoch": 1.2694859642726941, "grad_norm": 2.101879358291626, "learning_rate": 0.00011544446605718732, "loss": 4.033, "step": 4353 }, { "epoch": 1.2697776157491798, "grad_norm": 2.3503735065460205, "learning_rate": 0.00011542501458860145, "loss": 4.2133, "step": 4354 }, { "epoch": 1.2700692672256653, "grad_norm": 2.5589051246643066, "learning_rate": 0.00011540556312001555, "loss": 4.0643, "step": 4355 }, { "epoch": 1.270360918702151, "grad_norm": 3.093217134475708, "learning_rate": 0.0001153861116514297, "loss": 3.8847, "step": 4356 }, { "epoch": 1.2706525701786364, "grad_norm": 2.273609161376953, "learning_rate": 0.00011536666018284381, "loss": 3.9448, "step": 4357 }, { "epoch": 1.270944221655122, "grad_norm": 2.455812454223633, "learning_rate": 0.00011534720871425793, "loss": 4.1782, "step": 4358 }, { "epoch": 1.2712358731316078, "grad_norm": 2.161897897720337, "learning_rate": 0.00011532775724567207, "loss": 4.0445, "step": 4359 }, { "epoch": 1.2715275246080933, "grad_norm": 2.0082736015319824, "learning_rate": 0.00011530830577708617, "loss": 3.8189, "step": 4360 }, { "epoch": 1.271819176084579, "grad_norm": 2.869940996170044, "learning_rate": 0.0001152888543085003, "loss": 4.0689, "step": 4361 }, { "epoch": 1.2721108275610646, "grad_norm": 2.475565195083618, "learning_rate": 0.0001152694028399144, "loss": 3.9616, "step": 4362 }, { "epoch": 1.27240247903755, "grad_norm": 2.0289595127105713, "learning_rate": 0.00011524995137132854, "loss": 3.9767, "step": 4363 }, { "epoch": 1.2726941305140358, "grad_norm": 2.742664098739624, "learning_rate": 0.00011523049990274266, "loss": 4.1533, "step": 4364 }, { "epoch": 1.2729857819905213, "grad_norm": 2.2779979705810547, "learning_rate": 0.00011521104843415678, "loss": 4.1858, "step": 4365 }, { "epoch": 1.273277433467007, "grad_norm": 1.844819188117981, "learning_rate": 0.00011519159696557092, "loss": 3.6646, "step": 4366 }, { "epoch": 1.2735690849434924, "grad_norm": 2.1099417209625244, "learning_rate": 0.00011517214549698503, "loss": 4.0329, "step": 4367 }, { "epoch": 1.273860736419978, "grad_norm": 3.2212345600128174, "learning_rate": 0.00011515269402839915, "loss": 3.9169, "step": 4368 }, { "epoch": 1.2741523878964638, "grad_norm": 2.4406120777130127, "learning_rate": 0.00011513324255981328, "loss": 4.0026, "step": 4369 }, { "epoch": 1.2744440393729493, "grad_norm": 2.6009294986724854, "learning_rate": 0.00011511379109122739, "loss": 3.899, "step": 4370 }, { "epoch": 1.274735690849435, "grad_norm": 2.2523083686828613, "learning_rate": 0.00011509433962264151, "loss": 4.0779, "step": 4371 }, { "epoch": 1.2750273423259206, "grad_norm": 2.5607690811157227, "learning_rate": 0.00011507488815405562, "loss": 4.0051, "step": 4372 }, { "epoch": 1.275318993802406, "grad_norm": 2.430676221847534, "learning_rate": 0.00011505543668546977, "loss": 3.9398, "step": 4373 }, { "epoch": 1.2756106452788918, "grad_norm": 2.2632498741149902, "learning_rate": 0.00011503598521688388, "loss": 4.0921, "step": 4374 }, { "epoch": 1.2759022967553773, "grad_norm": 2.0351715087890625, "learning_rate": 0.000115016533748298, "loss": 3.9878, "step": 4375 }, { "epoch": 1.276193948231863, "grad_norm": 1.9665299654006958, "learning_rate": 0.00011499708227971213, "loss": 3.9306, "step": 4376 }, { "epoch": 1.2764855997083484, "grad_norm": 2.0866546630859375, "learning_rate": 0.00011497763081112624, "loss": 3.9032, "step": 4377 }, { "epoch": 1.276777251184834, "grad_norm": 2.1428375244140625, "learning_rate": 0.00011495817934254036, "loss": 4.0845, "step": 4378 }, { "epoch": 1.2770689026613198, "grad_norm": 3.1221201419830322, "learning_rate": 0.0001149387278739545, "loss": 4.0681, "step": 4379 }, { "epoch": 1.2773605541378052, "grad_norm": 2.14436936378479, "learning_rate": 0.00011491927640536862, "loss": 4.0604, "step": 4380 }, { "epoch": 1.277652205614291, "grad_norm": 2.983029365539551, "learning_rate": 0.00011489982493678273, "loss": 4.3149, "step": 4381 }, { "epoch": 1.2779438570907766, "grad_norm": 2.0458619594573975, "learning_rate": 0.00011488037346819685, "loss": 4.4035, "step": 4382 }, { "epoch": 1.278235508567262, "grad_norm": 2.3732988834381104, "learning_rate": 0.00011486092199961099, "loss": 4.2974, "step": 4383 }, { "epoch": 1.2785271600437478, "grad_norm": 2.9685537815093994, "learning_rate": 0.00011484147053102509, "loss": 4.2618, "step": 4384 }, { "epoch": 1.2788188115202332, "grad_norm": 2.1552841663360596, "learning_rate": 0.00011482201906243921, "loss": 4.3328, "step": 4385 }, { "epoch": 1.279110462996719, "grad_norm": 2.1980700492858887, "learning_rate": 0.00011480256759385335, "loss": 4.0972, "step": 4386 }, { "epoch": 1.2794021144732044, "grad_norm": 2.052067995071411, "learning_rate": 0.00011478311612526747, "loss": 4.1755, "step": 4387 }, { "epoch": 1.27969376594969, "grad_norm": 2.186309814453125, "learning_rate": 0.00011476366465668158, "loss": 3.9473, "step": 4388 }, { "epoch": 1.2799854174261758, "grad_norm": 2.306619167327881, "learning_rate": 0.00011474421318809571, "loss": 4.0187, "step": 4389 }, { "epoch": 1.2802770689026612, "grad_norm": 2.3690290451049805, "learning_rate": 0.00011472476171950984, "loss": 4.1717, "step": 4390 }, { "epoch": 1.280568720379147, "grad_norm": 1.8720006942749023, "learning_rate": 0.00011470531025092394, "loss": 4.0636, "step": 4391 }, { "epoch": 1.2808603718556326, "grad_norm": 2.3922481536865234, "learning_rate": 0.00011468585878233807, "loss": 4.3106, "step": 4392 }, { "epoch": 1.281152023332118, "grad_norm": 2.1757752895355225, "learning_rate": 0.0001146664073137522, "loss": 4.0458, "step": 4393 }, { "epoch": 1.2814436748086038, "grad_norm": 2.1742093563079834, "learning_rate": 0.00011464695584516631, "loss": 4.0633, "step": 4394 }, { "epoch": 1.2817353262850892, "grad_norm": 2.0065975189208984, "learning_rate": 0.00011462750437658043, "loss": 3.7482, "step": 4395 }, { "epoch": 1.282026977761575, "grad_norm": 2.6015594005584717, "learning_rate": 0.00011460805290799457, "loss": 3.8139, "step": 4396 }, { "epoch": 1.2823186292380604, "grad_norm": 2.4264297485351562, "learning_rate": 0.00011458860143940869, "loss": 4.0432, "step": 4397 }, { "epoch": 1.282610280714546, "grad_norm": 3.520951986312866, "learning_rate": 0.0001145691499708228, "loss": 4.0622, "step": 4398 }, { "epoch": 1.2829019321910318, "grad_norm": 2.4543216228485107, "learning_rate": 0.00011454969850223693, "loss": 3.8419, "step": 4399 }, { "epoch": 1.2831935836675172, "grad_norm": 2.7418057918548584, "learning_rate": 0.00011453024703365105, "loss": 3.8607, "step": 4400 }, { "epoch": 1.283485235144003, "grad_norm": 2.774801015853882, "learning_rate": 0.00011451079556506516, "loss": 4.1981, "step": 4401 }, { "epoch": 1.2837768866204886, "grad_norm": 2.263338804244995, "learning_rate": 0.00011449134409647928, "loss": 4.2414, "step": 4402 }, { "epoch": 1.284068538096974, "grad_norm": 2.384979009628296, "learning_rate": 0.00011447189262789342, "loss": 4.1215, "step": 4403 }, { "epoch": 1.2843601895734598, "grad_norm": 2.4093828201293945, "learning_rate": 0.00011445244115930754, "loss": 4.2871, "step": 4404 }, { "epoch": 1.2846518410499455, "grad_norm": 2.49019193649292, "learning_rate": 0.00011443298969072165, "loss": 3.9453, "step": 4405 }, { "epoch": 1.284943492526431, "grad_norm": 2.851592540740967, "learning_rate": 0.00011441353822213578, "loss": 3.8422, "step": 4406 }, { "epoch": 1.2852351440029164, "grad_norm": 2.572335958480835, "learning_rate": 0.0001143940867535499, "loss": 4.3342, "step": 4407 }, { "epoch": 1.285526795479402, "grad_norm": 2.5089869499206543, "learning_rate": 0.00011437463528496401, "loss": 4.0273, "step": 4408 }, { "epoch": 1.2858184469558878, "grad_norm": 2.4965317249298096, "learning_rate": 0.00011435518381637815, "loss": 4.2001, "step": 4409 }, { "epoch": 1.2861100984323732, "grad_norm": 2.3884031772613525, "learning_rate": 0.00011433573234779227, "loss": 4.0948, "step": 4410 }, { "epoch": 1.286401749908859, "grad_norm": 1.9686999320983887, "learning_rate": 0.00011431628087920639, "loss": 4.2893, "step": 4411 }, { "epoch": 1.2866934013853446, "grad_norm": 2.4930977821350098, "learning_rate": 0.0001142968294106205, "loss": 4.1641, "step": 4412 }, { "epoch": 1.28698505286183, "grad_norm": 1.9027012586593628, "learning_rate": 0.00011427737794203463, "loss": 4.2466, "step": 4413 }, { "epoch": 1.2872767043383158, "grad_norm": 2.147552251815796, "learning_rate": 0.00011425792647344875, "loss": 4.0178, "step": 4414 }, { "epoch": 1.2875683558148014, "grad_norm": 1.7885388135910034, "learning_rate": 0.00011423847500486286, "loss": 4.0656, "step": 4415 }, { "epoch": 1.287860007291287, "grad_norm": 2.148052215576172, "learning_rate": 0.000114219023536277, "loss": 3.6661, "step": 4416 }, { "epoch": 1.2881516587677724, "grad_norm": 2.8728926181793213, "learning_rate": 0.00011419957206769112, "loss": 3.9079, "step": 4417 }, { "epoch": 1.288443310244258, "grad_norm": 2.1392695903778076, "learning_rate": 0.00011418012059910523, "loss": 4.0996, "step": 4418 }, { "epoch": 1.2887349617207438, "grad_norm": 2.044429302215576, "learning_rate": 0.00011416066913051935, "loss": 4.1739, "step": 4419 }, { "epoch": 1.2890266131972292, "grad_norm": 2.510075330734253, "learning_rate": 0.00011414121766193348, "loss": 4.0971, "step": 4420 }, { "epoch": 1.289318264673715, "grad_norm": 2.7635631561279297, "learning_rate": 0.0001141217661933476, "loss": 4.2511, "step": 4421 }, { "epoch": 1.2896099161502006, "grad_norm": 2.3111162185668945, "learning_rate": 0.00011410231472476171, "loss": 4.2947, "step": 4422 }, { "epoch": 1.289901567626686, "grad_norm": 1.5943834781646729, "learning_rate": 0.00011408286325617585, "loss": 4.2042, "step": 4423 }, { "epoch": 1.2901932191031718, "grad_norm": 2.781383991241455, "learning_rate": 0.00011406341178758997, "loss": 4.2725, "step": 4424 }, { "epoch": 1.2904848705796574, "grad_norm": 2.5304486751556396, "learning_rate": 0.00011404396031900408, "loss": 3.9693, "step": 4425 }, { "epoch": 1.290776522056143, "grad_norm": 1.7768880128860474, "learning_rate": 0.00011402450885041823, "loss": 4.1861, "step": 4426 }, { "epoch": 1.2910681735326284, "grad_norm": 2.060370922088623, "learning_rate": 0.00011400505738183234, "loss": 3.9166, "step": 4427 }, { "epoch": 1.291359825009114, "grad_norm": 2.622553586959839, "learning_rate": 0.00011398560591324646, "loss": 4.2305, "step": 4428 }, { "epoch": 1.2916514764855997, "grad_norm": 3.0758016109466553, "learning_rate": 0.00011396615444466056, "loss": 4.3382, "step": 4429 }, { "epoch": 1.2919431279620852, "grad_norm": 2.3514788150787354, "learning_rate": 0.0001139467029760747, "loss": 4.2761, "step": 4430 }, { "epoch": 1.292234779438571, "grad_norm": 2.090614080429077, "learning_rate": 0.00011392725150748882, "loss": 4.1961, "step": 4431 }, { "epoch": 1.2925264309150566, "grad_norm": 2.229361057281494, "learning_rate": 0.00011390780003890293, "loss": 4.0293, "step": 4432 }, { "epoch": 1.292818082391542, "grad_norm": 2.3122670650482178, "learning_rate": 0.00011388834857031708, "loss": 3.6103, "step": 4433 }, { "epoch": 1.2931097338680277, "grad_norm": 2.291891098022461, "learning_rate": 0.00011386889710173119, "loss": 4.0554, "step": 4434 }, { "epoch": 1.2934013853445134, "grad_norm": 2.0243847370147705, "learning_rate": 0.00011384944563314531, "loss": 4.0072, "step": 4435 }, { "epoch": 1.293693036820999, "grad_norm": 2.236149787902832, "learning_rate": 0.00011382999416455944, "loss": 4.0935, "step": 4436 }, { "epoch": 1.2939846882974846, "grad_norm": 2.07499361038208, "learning_rate": 0.00011381054269597355, "loss": 4.189, "step": 4437 }, { "epoch": 1.29427633977397, "grad_norm": 2.3443737030029297, "learning_rate": 0.00011379109122738767, "loss": 3.9939, "step": 4438 }, { "epoch": 1.2945679912504557, "grad_norm": 3.921549081802368, "learning_rate": 0.00011377163975880178, "loss": 4.2107, "step": 4439 }, { "epoch": 1.2948596427269412, "grad_norm": 2.273510217666626, "learning_rate": 0.00011375218829021592, "loss": 4.1588, "step": 4440 }, { "epoch": 1.295151294203427, "grad_norm": 2.0619330406188965, "learning_rate": 0.00011373273682163004, "loss": 4.0282, "step": 4441 }, { "epoch": 1.2954429456799126, "grad_norm": 2.464796304702759, "learning_rate": 0.00011371328535304415, "loss": 3.7731, "step": 4442 }, { "epoch": 1.295734597156398, "grad_norm": 2.9435999393463135, "learning_rate": 0.0001136938338844583, "loss": 4.1591, "step": 4443 }, { "epoch": 1.2960262486328837, "grad_norm": 2.7617480754852295, "learning_rate": 0.0001136743824158724, "loss": 4.3262, "step": 4444 }, { "epoch": 1.2963179001093694, "grad_norm": 2.1692442893981934, "learning_rate": 0.00011365493094728652, "loss": 4.3001, "step": 4445 }, { "epoch": 1.296609551585855, "grad_norm": 2.3251161575317383, "learning_rate": 0.00011363547947870066, "loss": 4.1838, "step": 4446 }, { "epoch": 1.2969012030623406, "grad_norm": 2.283352851867676, "learning_rate": 0.00011361602801011477, "loss": 4.3361, "step": 4447 }, { "epoch": 1.297192854538826, "grad_norm": 2.5586397647857666, "learning_rate": 0.00011359657654152889, "loss": 4.2726, "step": 4448 }, { "epoch": 1.2974845060153117, "grad_norm": 2.305912733078003, "learning_rate": 0.000113577125072943, "loss": 4.1797, "step": 4449 }, { "epoch": 1.2977761574917972, "grad_norm": 2.122081995010376, "learning_rate": 0.00011355767360435715, "loss": 3.9344, "step": 4450 }, { "epoch": 1.2980678089682829, "grad_norm": 2.856043815612793, "learning_rate": 0.00011353822213577125, "loss": 4.1016, "step": 4451 }, { "epoch": 1.2983594604447686, "grad_norm": 2.4115235805511475, "learning_rate": 0.00011351877066718538, "loss": 4.264, "step": 4452 }, { "epoch": 1.298651111921254, "grad_norm": 3.0568394660949707, "learning_rate": 0.00011349931919859951, "loss": 4.1691, "step": 4453 }, { "epoch": 1.2989427633977397, "grad_norm": 2.138810157775879, "learning_rate": 0.00011347986773001362, "loss": 4.2571, "step": 4454 }, { "epoch": 1.2992344148742254, "grad_norm": 2.2487120628356934, "learning_rate": 0.00011346041626142774, "loss": 4.0243, "step": 4455 }, { "epoch": 1.2995260663507109, "grad_norm": 3.095036029815674, "learning_rate": 0.00011344096479284187, "loss": 4.2673, "step": 4456 }, { "epoch": 1.2998177178271966, "grad_norm": 2.0056045055389404, "learning_rate": 0.000113421513324256, "loss": 3.8864, "step": 4457 }, { "epoch": 1.300109369303682, "grad_norm": 2.887131929397583, "learning_rate": 0.0001134020618556701, "loss": 4.2103, "step": 4458 }, { "epoch": 1.3004010207801677, "grad_norm": 2.110914468765259, "learning_rate": 0.00011338261038708423, "loss": 4.2883, "step": 4459 }, { "epoch": 1.3006926722566532, "grad_norm": 2.1529600620269775, "learning_rate": 0.00011336315891849836, "loss": 4.1432, "step": 4460 }, { "epoch": 1.3009843237331389, "grad_norm": 2.2453320026397705, "learning_rate": 0.00011334370744991247, "loss": 4.0706, "step": 4461 }, { "epoch": 1.3012759752096246, "grad_norm": 2.339019775390625, "learning_rate": 0.00011332425598132659, "loss": 4.0324, "step": 4462 }, { "epoch": 1.30156762668611, "grad_norm": 1.9363857507705688, "learning_rate": 0.00011330480451274073, "loss": 4.0773, "step": 4463 }, { "epoch": 1.3018592781625957, "grad_norm": 2.480147123336792, "learning_rate": 0.00011328535304415483, "loss": 4.14, "step": 4464 }, { "epoch": 1.3021509296390814, "grad_norm": 2.156346321105957, "learning_rate": 0.00011326590157556896, "loss": 4.1362, "step": 4465 }, { "epoch": 1.3024425811155669, "grad_norm": 1.9537670612335205, "learning_rate": 0.00011324645010698309, "loss": 3.8899, "step": 4466 }, { "epoch": 1.3027342325920526, "grad_norm": 2.021238327026367, "learning_rate": 0.00011322699863839721, "loss": 4.1584, "step": 4467 }, { "epoch": 1.303025884068538, "grad_norm": 2.2090506553649902, "learning_rate": 0.00011320754716981132, "loss": 4.0096, "step": 4468 }, { "epoch": 1.3033175355450237, "grad_norm": 1.8856173753738403, "learning_rate": 0.00011318809570122544, "loss": 3.9894, "step": 4469 }, { "epoch": 1.3036091870215092, "grad_norm": 2.0586354732513428, "learning_rate": 0.00011316864423263958, "loss": 4.0645, "step": 4470 }, { "epoch": 1.3039008384979949, "grad_norm": 2.133326768875122, "learning_rate": 0.00011314919276405369, "loss": 3.9721, "step": 4471 }, { "epoch": 1.3041924899744806, "grad_norm": 2.6376044750213623, "learning_rate": 0.00011312974129546781, "loss": 4.1906, "step": 4472 }, { "epoch": 1.304484141450966, "grad_norm": 2.709764003753662, "learning_rate": 0.00011311028982688194, "loss": 3.7611, "step": 4473 }, { "epoch": 1.3047757929274517, "grad_norm": 2.304621458053589, "learning_rate": 0.00011309083835829606, "loss": 4.0479, "step": 4474 }, { "epoch": 1.3050674444039374, "grad_norm": 2.1909372806549072, "learning_rate": 0.00011307138688971017, "loss": 4.1321, "step": 4475 }, { "epoch": 1.3053590958804229, "grad_norm": 2.1602470874786377, "learning_rate": 0.00011305193542112431, "loss": 4.1126, "step": 4476 }, { "epoch": 1.3056507473569086, "grad_norm": 2.4092202186584473, "learning_rate": 0.00011303248395253843, "loss": 3.7895, "step": 4477 }, { "epoch": 1.305942398833394, "grad_norm": 1.7894848585128784, "learning_rate": 0.00011301303248395254, "loss": 3.5403, "step": 4478 }, { "epoch": 1.3062340503098797, "grad_norm": 3.3498847484588623, "learning_rate": 0.00011299358101536666, "loss": 4.0223, "step": 4479 }, { "epoch": 1.3065257017863652, "grad_norm": 2.1328165531158447, "learning_rate": 0.0001129741295467808, "loss": 3.8668, "step": 4480 }, { "epoch": 1.3068173532628509, "grad_norm": 2.2329256534576416, "learning_rate": 0.00011295467807819491, "loss": 4.2776, "step": 4481 }, { "epoch": 1.3071090047393366, "grad_norm": 2.060706853866577, "learning_rate": 0.00011293522660960902, "loss": 4.0252, "step": 4482 }, { "epoch": 1.307400656215822, "grad_norm": 2.329120397567749, "learning_rate": 0.00011291577514102316, "loss": 4.1134, "step": 4483 }, { "epoch": 1.3076923076923077, "grad_norm": 2.0760340690612793, "learning_rate": 0.00011289632367243728, "loss": 4.0492, "step": 4484 }, { "epoch": 1.3079839591687934, "grad_norm": 2.427769899368286, "learning_rate": 0.00011287687220385139, "loss": 4.1589, "step": 4485 }, { "epoch": 1.3082756106452789, "grad_norm": 2.1082077026367188, "learning_rate": 0.00011285742073526552, "loss": 4.1992, "step": 4486 }, { "epoch": 1.3085672621217646, "grad_norm": 2.912554979324341, "learning_rate": 0.00011283796926667964, "loss": 4.0754, "step": 4487 }, { "epoch": 1.30885891359825, "grad_norm": 2.3527872562408447, "learning_rate": 0.00011281851779809375, "loss": 4.0197, "step": 4488 }, { "epoch": 1.3091505650747357, "grad_norm": 1.8517580032348633, "learning_rate": 0.00011279906632950787, "loss": 3.7446, "step": 4489 }, { "epoch": 1.3094422165512212, "grad_norm": 2.4330942630767822, "learning_rate": 0.00011277961486092201, "loss": 4.2203, "step": 4490 }, { "epoch": 1.3097338680277069, "grad_norm": 3.098151206970215, "learning_rate": 0.00011276016339233613, "loss": 4.1694, "step": 4491 }, { "epoch": 1.3100255195041925, "grad_norm": 1.9277485609054565, "learning_rate": 0.00011274071192375024, "loss": 4.0598, "step": 4492 }, { "epoch": 1.310317170980678, "grad_norm": 2.215672254562378, "learning_rate": 0.00011272126045516437, "loss": 3.9367, "step": 4493 }, { "epoch": 1.3106088224571637, "grad_norm": 2.823711633682251, "learning_rate": 0.0001127018089865785, "loss": 4.0104, "step": 4494 }, { "epoch": 1.3109004739336494, "grad_norm": 1.8393900394439697, "learning_rate": 0.0001126823575179926, "loss": 3.8386, "step": 4495 }, { "epoch": 1.3111921254101349, "grad_norm": 2.3729896545410156, "learning_rate": 0.00011266290604940673, "loss": 4.0152, "step": 4496 }, { "epoch": 1.3114837768866205, "grad_norm": 1.9559115171432495, "learning_rate": 0.00011264345458082086, "loss": 4.0981, "step": 4497 }, { "epoch": 1.311775428363106, "grad_norm": 2.282510280609131, "learning_rate": 0.00011262400311223498, "loss": 4.068, "step": 4498 }, { "epoch": 1.3120670798395917, "grad_norm": 2.8528053760528564, "learning_rate": 0.00011260455164364909, "loss": 4.1811, "step": 4499 }, { "epoch": 1.3123587313160772, "grad_norm": 2.3204848766326904, "learning_rate": 0.00011258510017506323, "loss": 3.9756, "step": 4500 }, { "epoch": 1.3126503827925629, "grad_norm": 2.1338908672332764, "learning_rate": 0.00011256564870647735, "loss": 4.2118, "step": 4501 }, { "epoch": 1.3129420342690485, "grad_norm": 1.804535150527954, "learning_rate": 0.00011254619723789145, "loss": 3.8283, "step": 4502 }, { "epoch": 1.313233685745534, "grad_norm": 2.7127537727355957, "learning_rate": 0.0001125267457693056, "loss": 4.2903, "step": 4503 }, { "epoch": 1.3135253372220197, "grad_norm": 2.0652353763580322, "learning_rate": 0.00011250729430071971, "loss": 4.16, "step": 4504 }, { "epoch": 1.3138169886985054, "grad_norm": 2.6702845096588135, "learning_rate": 0.00011248784283213383, "loss": 4.1557, "step": 4505 }, { "epoch": 1.3141086401749908, "grad_norm": 2.585611581802368, "learning_rate": 0.00011246839136354794, "loss": 4.0421, "step": 4506 }, { "epoch": 1.3144002916514765, "grad_norm": 2.646582841873169, "learning_rate": 0.00011244893989496208, "loss": 4.1454, "step": 4507 }, { "epoch": 1.314691943127962, "grad_norm": 2.145810604095459, "learning_rate": 0.0001124294884263762, "loss": 4.431, "step": 4508 }, { "epoch": 1.3149835946044477, "grad_norm": 2.515505790710449, "learning_rate": 0.0001124100369577903, "loss": 4.2455, "step": 4509 }, { "epoch": 1.3152752460809332, "grad_norm": 2.428920269012451, "learning_rate": 0.00011239058548920444, "loss": 4.1403, "step": 4510 }, { "epoch": 1.3155668975574188, "grad_norm": 2.3988940715789795, "learning_rate": 0.00011237113402061856, "loss": 4.1144, "step": 4511 }, { "epoch": 1.3158585490339045, "grad_norm": 1.920959234237671, "learning_rate": 0.00011235168255203268, "loss": 4.18, "step": 4512 }, { "epoch": 1.31615020051039, "grad_norm": 2.2341134548187256, "learning_rate": 0.00011233223108344682, "loss": 4.0085, "step": 4513 }, { "epoch": 1.3164418519868757, "grad_norm": 3.353471517562866, "learning_rate": 0.00011231277961486093, "loss": 4.5153, "step": 4514 }, { "epoch": 1.3167335034633614, "grad_norm": 2.599095106124878, "learning_rate": 0.00011229332814627505, "loss": 3.9541, "step": 4515 }, { "epoch": 1.3170251549398468, "grad_norm": 2.1889748573303223, "learning_rate": 0.00011227387667768916, "loss": 4.4454, "step": 4516 }, { "epoch": 1.3173168064163325, "grad_norm": 1.881435513496399, "learning_rate": 0.00011225442520910329, "loss": 4.1884, "step": 4517 }, { "epoch": 1.3176084578928182, "grad_norm": 2.0915894508361816, "learning_rate": 0.00011223497374051741, "loss": 3.8321, "step": 4518 }, { "epoch": 1.3179001093693037, "grad_norm": 2.700216770172119, "learning_rate": 0.00011221552227193152, "loss": 4.1772, "step": 4519 }, { "epoch": 1.3181917608457892, "grad_norm": 2.457953691482544, "learning_rate": 0.00011219607080334567, "loss": 4.1509, "step": 4520 }, { "epoch": 1.3184834123222748, "grad_norm": 2.6042044162750244, "learning_rate": 0.00011217661933475978, "loss": 4.2756, "step": 4521 }, { "epoch": 1.3187750637987605, "grad_norm": 2.6639857292175293, "learning_rate": 0.0001121571678661739, "loss": 4.006, "step": 4522 }, { "epoch": 1.319066715275246, "grad_norm": 2.1396660804748535, "learning_rate": 0.00011213771639758804, "loss": 4.2956, "step": 4523 }, { "epoch": 1.3193583667517317, "grad_norm": 2.7097527980804443, "learning_rate": 0.00011211826492900214, "loss": 4.194, "step": 4524 }, { "epoch": 1.3196500182282174, "grad_norm": 3.3023219108581543, "learning_rate": 0.00011209881346041626, "loss": 3.9261, "step": 4525 }, { "epoch": 1.3199416697047028, "grad_norm": 2.2752134799957275, "learning_rate": 0.00011207936199183037, "loss": 4.0252, "step": 4526 }, { "epoch": 1.3202333211811885, "grad_norm": 2.3043479919433594, "learning_rate": 0.00011205991052324452, "loss": 3.9092, "step": 4527 }, { "epoch": 1.3205249726576742, "grad_norm": 4.621598243713379, "learning_rate": 0.00011204045905465863, "loss": 4.0222, "step": 4528 }, { "epoch": 1.3208166241341597, "grad_norm": 2.098848581314087, "learning_rate": 0.00011202100758607275, "loss": 4.1338, "step": 4529 }, { "epoch": 1.3211082756106451, "grad_norm": 2.425908327102661, "learning_rate": 0.00011200155611748689, "loss": 3.9953, "step": 4530 }, { "epoch": 1.3213999270871308, "grad_norm": 2.440596580505371, "learning_rate": 0.000111982104648901, "loss": 4.0564, "step": 4531 }, { "epoch": 1.3216915785636165, "grad_norm": 2.449615001678467, "learning_rate": 0.00011196265318031512, "loss": 3.9263, "step": 4532 }, { "epoch": 1.321983230040102, "grad_norm": 2.19553279876709, "learning_rate": 0.00011194320171172925, "loss": 4.0133, "step": 4533 }, { "epoch": 1.3222748815165877, "grad_norm": 1.7485469579696655, "learning_rate": 0.00011192375024314336, "loss": 4.2225, "step": 4534 }, { "epoch": 1.3225665329930734, "grad_norm": 1.897409439086914, "learning_rate": 0.00011190429877455748, "loss": 4.2944, "step": 4535 }, { "epoch": 1.3228581844695588, "grad_norm": 3.2261812686920166, "learning_rate": 0.0001118848473059716, "loss": 4.1015, "step": 4536 }, { "epoch": 1.3231498359460445, "grad_norm": 2.3372740745544434, "learning_rate": 0.00011186539583738574, "loss": 4.3025, "step": 4537 }, { "epoch": 1.3234414874225302, "grad_norm": 2.394313097000122, "learning_rate": 0.00011184594436879985, "loss": 3.9852, "step": 4538 }, { "epoch": 1.3237331388990157, "grad_norm": 3.187284231185913, "learning_rate": 0.00011182649290021397, "loss": 4.0743, "step": 4539 }, { "epoch": 1.3240247903755014, "grad_norm": 2.3312063217163086, "learning_rate": 0.0001118070414316281, "loss": 3.9688, "step": 4540 }, { "epoch": 1.3243164418519868, "grad_norm": 1.8223392963409424, "learning_rate": 0.00011178758996304221, "loss": 3.8969, "step": 4541 }, { "epoch": 1.3246080933284725, "grad_norm": 1.6305376291275024, "learning_rate": 0.00011176813849445633, "loss": 4.0351, "step": 4542 }, { "epoch": 1.324899744804958, "grad_norm": 3.662501335144043, "learning_rate": 0.00011174868702587047, "loss": 3.91, "step": 4543 }, { "epoch": 1.3251913962814437, "grad_norm": 2.3951878547668457, "learning_rate": 0.00011172923555728459, "loss": 4.0979, "step": 4544 }, { "epoch": 1.3254830477579294, "grad_norm": 2.7768218517303467, "learning_rate": 0.0001117097840886987, "loss": 3.9974, "step": 4545 }, { "epoch": 1.3257746992344148, "grad_norm": 2.08099365234375, "learning_rate": 0.00011169033262011282, "loss": 4.1219, "step": 4546 }, { "epoch": 1.3260663507109005, "grad_norm": 2.5786843299865723, "learning_rate": 0.00011167088115152695, "loss": 3.9217, "step": 4547 }, { "epoch": 1.3263580021873862, "grad_norm": 1.8290517330169678, "learning_rate": 0.00011165142968294106, "loss": 3.9033, "step": 4548 }, { "epoch": 1.3266496536638717, "grad_norm": 3.0696396827697754, "learning_rate": 0.00011163197821435518, "loss": 3.6751, "step": 4549 }, { "epoch": 1.3269413051403574, "grad_norm": 2.426952838897705, "learning_rate": 0.00011161252674576932, "loss": 4.005, "step": 4550 }, { "epoch": 1.3272329566168428, "grad_norm": 2.4452927112579346, "learning_rate": 0.00011159307527718344, "loss": 3.9804, "step": 4551 }, { "epoch": 1.3275246080933285, "grad_norm": 2.751579523086548, "learning_rate": 0.00011157362380859755, "loss": 4.3138, "step": 4552 }, { "epoch": 1.327816259569814, "grad_norm": 2.223646402359009, "learning_rate": 0.00011155417234001168, "loss": 4.0093, "step": 4553 }, { "epoch": 1.3281079110462997, "grad_norm": 2.886730432510376, "learning_rate": 0.0001115347208714258, "loss": 4.13, "step": 4554 }, { "epoch": 1.3283995625227853, "grad_norm": 2.355272054672241, "learning_rate": 0.00011151526940283991, "loss": 4.0082, "step": 4555 }, { "epoch": 1.3286912139992708, "grad_norm": 2.6389708518981934, "learning_rate": 0.00011149581793425403, "loss": 4.0099, "step": 4556 }, { "epoch": 1.3289828654757565, "grad_norm": 2.601902723312378, "learning_rate": 0.00011147636646566817, "loss": 4.0427, "step": 4557 }, { "epoch": 1.3292745169522422, "grad_norm": 2.7885255813598633, "learning_rate": 0.00011145691499708229, "loss": 4.0834, "step": 4558 }, { "epoch": 1.3295661684287277, "grad_norm": 2.2753031253814697, "learning_rate": 0.0001114374635284964, "loss": 3.5347, "step": 4559 }, { "epoch": 1.3298578199052133, "grad_norm": 2.518566370010376, "learning_rate": 0.00011141801205991053, "loss": 3.8407, "step": 4560 }, { "epoch": 1.3301494713816988, "grad_norm": 2.2184619903564453, "learning_rate": 0.00011139856059132466, "loss": 4.2737, "step": 4561 }, { "epoch": 1.3304411228581845, "grad_norm": 2.269468069076538, "learning_rate": 0.00011137910912273876, "loss": 4.3818, "step": 4562 }, { "epoch": 1.33073277433467, "grad_norm": 2.6538946628570557, "learning_rate": 0.0001113596576541529, "loss": 3.9602, "step": 4563 }, { "epoch": 1.3310244258111557, "grad_norm": 2.215968370437622, "learning_rate": 0.00011134020618556702, "loss": 4.2444, "step": 4564 }, { "epoch": 1.3313160772876413, "grad_norm": 3.2295773029327393, "learning_rate": 0.00011132075471698113, "loss": 3.8839, "step": 4565 }, { "epoch": 1.3316077287641268, "grad_norm": 2.062798500061035, "learning_rate": 0.00011130130324839525, "loss": 4.017, "step": 4566 }, { "epoch": 1.3318993802406125, "grad_norm": 2.7076234817504883, "learning_rate": 0.00011128185177980939, "loss": 3.9185, "step": 4567 }, { "epoch": 1.3321910317170982, "grad_norm": 2.4743223190307617, "learning_rate": 0.00011126240031122351, "loss": 4.2124, "step": 4568 }, { "epoch": 1.3324826831935837, "grad_norm": 1.8504455089569092, "learning_rate": 0.00011124294884263762, "loss": 4.1397, "step": 4569 }, { "epoch": 1.3327743346700693, "grad_norm": 1.9710267782211304, "learning_rate": 0.00011122349737405175, "loss": 4.0586, "step": 4570 }, { "epoch": 1.3330659861465548, "grad_norm": 3.2028815746307373, "learning_rate": 0.00011120404590546587, "loss": 3.9365, "step": 4571 }, { "epoch": 1.3333576376230405, "grad_norm": 3.226485013961792, "learning_rate": 0.00011118459443687998, "loss": 4.2203, "step": 4572 }, { "epoch": 1.333649289099526, "grad_norm": 2.0619924068450928, "learning_rate": 0.00011116514296829413, "loss": 4.2279, "step": 4573 }, { "epoch": 1.3339409405760116, "grad_norm": 3.2757604122161865, "learning_rate": 0.00011114569149970824, "loss": 3.985, "step": 4574 }, { "epoch": 1.3342325920524973, "grad_norm": 2.622764825820923, "learning_rate": 0.00011112624003112236, "loss": 4.0667, "step": 4575 }, { "epoch": 1.3345242435289828, "grad_norm": 2.91058611869812, "learning_rate": 0.00011110678856253647, "loss": 3.9609, "step": 4576 }, { "epoch": 1.3348158950054685, "grad_norm": 2.532026767730713, "learning_rate": 0.0001110873370939506, "loss": 4.1862, "step": 4577 }, { "epoch": 1.3351075464819542, "grad_norm": 4.113997936248779, "learning_rate": 0.00011106788562536472, "loss": 3.9925, "step": 4578 }, { "epoch": 1.3353991979584396, "grad_norm": 1.9021482467651367, "learning_rate": 0.00011104843415677883, "loss": 4.0652, "step": 4579 }, { "epoch": 1.3356908494349253, "grad_norm": 2.4094455242156982, "learning_rate": 0.00011102898268819297, "loss": 4.2783, "step": 4580 }, { "epoch": 1.3359825009114108, "grad_norm": 1.9774448871612549, "learning_rate": 0.00011100953121960709, "loss": 4.0702, "step": 4581 }, { "epoch": 1.3362741523878965, "grad_norm": 2.4781694412231445, "learning_rate": 0.00011099007975102121, "loss": 3.949, "step": 4582 }, { "epoch": 1.336565803864382, "grad_norm": 2.3355963230133057, "learning_rate": 0.00011097062828243532, "loss": 3.9528, "step": 4583 }, { "epoch": 1.3368574553408676, "grad_norm": 2.110476016998291, "learning_rate": 0.00011095117681384945, "loss": 4.1853, "step": 4584 }, { "epoch": 1.3371491068173533, "grad_norm": 2.353424549102783, "learning_rate": 0.00011093172534526357, "loss": 3.766, "step": 4585 }, { "epoch": 1.3374407582938388, "grad_norm": 2.3821399211883545, "learning_rate": 0.00011091227387667768, "loss": 3.8957, "step": 4586 }, { "epoch": 1.3377324097703245, "grad_norm": 2.6830711364746094, "learning_rate": 0.00011089282240809182, "loss": 3.9523, "step": 4587 }, { "epoch": 1.3380240612468102, "grad_norm": 2.7794277667999268, "learning_rate": 0.00011087337093950594, "loss": 4.0925, "step": 4588 }, { "epoch": 1.3383157127232956, "grad_norm": 2.0570483207702637, "learning_rate": 0.00011085391947092005, "loss": 4.2465, "step": 4589 }, { "epoch": 1.3386073641997813, "grad_norm": 2.351440668106079, "learning_rate": 0.0001108344680023342, "loss": 4.2012, "step": 4590 }, { "epoch": 1.3388990156762668, "grad_norm": 2.1854984760284424, "learning_rate": 0.0001108150165337483, "loss": 4.3441, "step": 4591 }, { "epoch": 1.3391906671527525, "grad_norm": 2.1014485359191895, "learning_rate": 0.00011079556506516243, "loss": 3.964, "step": 4592 }, { "epoch": 1.339482318629238, "grad_norm": 3.7782957553863525, "learning_rate": 0.00011077611359657653, "loss": 4.2419, "step": 4593 }, { "epoch": 1.3397739701057236, "grad_norm": 2.4277334213256836, "learning_rate": 0.00011075666212799067, "loss": 4.5186, "step": 4594 }, { "epoch": 1.3400656215822093, "grad_norm": 2.2277443408966064, "learning_rate": 0.00011073721065940479, "loss": 4.0429, "step": 4595 }, { "epoch": 1.3403572730586948, "grad_norm": 2.1768863201141357, "learning_rate": 0.0001107177591908189, "loss": 4.0707, "step": 4596 }, { "epoch": 1.3406489245351805, "grad_norm": 2.206695079803467, "learning_rate": 0.00011069830772223305, "loss": 4.1712, "step": 4597 }, { "epoch": 1.3409405760116662, "grad_norm": 1.968135952949524, "learning_rate": 0.00011067885625364715, "loss": 3.9744, "step": 4598 }, { "epoch": 1.3412322274881516, "grad_norm": 2.1528356075286865, "learning_rate": 0.00011065940478506128, "loss": 4.0245, "step": 4599 }, { "epoch": 1.3415238789646373, "grad_norm": 2.143651008605957, "learning_rate": 0.00011063995331647541, "loss": 4.2309, "step": 4600 }, { "epoch": 1.3418155304411228, "grad_norm": 2.5088090896606445, "learning_rate": 0.00011062050184788952, "loss": 4.457, "step": 4601 }, { "epoch": 1.3421071819176085, "grad_norm": 2.4935734272003174, "learning_rate": 0.00011060105037930364, "loss": 4.2939, "step": 4602 }, { "epoch": 1.342398833394094, "grad_norm": 2.875218391418457, "learning_rate": 0.00011058159891071775, "loss": 4.0854, "step": 4603 }, { "epoch": 1.3426904848705796, "grad_norm": 1.9995226860046387, "learning_rate": 0.0001105621474421319, "loss": 4.22, "step": 4604 }, { "epoch": 1.3429821363470653, "grad_norm": 2.005565643310547, "learning_rate": 0.000110542695973546, "loss": 4.2119, "step": 4605 }, { "epoch": 1.3432737878235508, "grad_norm": 2.45864200592041, "learning_rate": 0.00011052324450496013, "loss": 4.1086, "step": 4606 }, { "epoch": 1.3435654393000365, "grad_norm": 2.187485456466675, "learning_rate": 0.00011050379303637426, "loss": 4.1779, "step": 4607 }, { "epoch": 1.3438570907765222, "grad_norm": 3.728726625442505, "learning_rate": 0.00011048434156778837, "loss": 3.8541, "step": 4608 }, { "epoch": 1.3441487422530076, "grad_norm": 1.9538289308547974, "learning_rate": 0.00011046489009920249, "loss": 4.0812, "step": 4609 }, { "epoch": 1.3444403937294933, "grad_norm": 2.051135540008545, "learning_rate": 0.00011044543863061663, "loss": 4.0097, "step": 4610 }, { "epoch": 1.3447320452059788, "grad_norm": 3.3520917892456055, "learning_rate": 0.00011042598716203074, "loss": 3.9683, "step": 4611 }, { "epoch": 1.3450236966824645, "grad_norm": 3.6110453605651855, "learning_rate": 0.00011040653569344486, "loss": 4.0297, "step": 4612 }, { "epoch": 1.34531534815895, "grad_norm": 2.4847819805145264, "learning_rate": 0.00011038708422485898, "loss": 4.1976, "step": 4613 }, { "epoch": 1.3456069996354356, "grad_norm": 3.011664867401123, "learning_rate": 0.00011036763275627311, "loss": 4.0549, "step": 4614 }, { "epoch": 1.3458986511119213, "grad_norm": 2.4131667613983154, "learning_rate": 0.00011034818128768722, "loss": 4.2685, "step": 4615 }, { "epoch": 1.3461903025884068, "grad_norm": 3.625107526779175, "learning_rate": 0.00011032872981910134, "loss": 4.0911, "step": 4616 }, { "epoch": 1.3464819540648925, "grad_norm": 2.926687002182007, "learning_rate": 0.00011030927835051548, "loss": 4.1338, "step": 4617 }, { "epoch": 1.3467736055413781, "grad_norm": 2.1499860286712646, "learning_rate": 0.00011028982688192959, "loss": 4.2766, "step": 4618 }, { "epoch": 1.3470652570178636, "grad_norm": 2.51751446723938, "learning_rate": 0.00011027037541334371, "loss": 4.0792, "step": 4619 }, { "epoch": 1.3473569084943493, "grad_norm": 3.1148641109466553, "learning_rate": 0.00011025092394475784, "loss": 4.1792, "step": 4620 }, { "epoch": 1.3476485599708348, "grad_norm": 3.6115684509277344, "learning_rate": 0.00011023147247617197, "loss": 3.7048, "step": 4621 }, { "epoch": 1.3479402114473205, "grad_norm": 4.216788291931152, "learning_rate": 0.00011021202100758607, "loss": 4.1218, "step": 4622 }, { "epoch": 1.348231862923806, "grad_norm": 4.446755409240723, "learning_rate": 0.0001101925695390002, "loss": 4.0549, "step": 4623 }, { "epoch": 1.3485235144002916, "grad_norm": 2.279402017593384, "learning_rate": 0.00011017311807041433, "loss": 4.1792, "step": 4624 }, { "epoch": 1.3488151658767773, "grad_norm": 2.3834731578826904, "learning_rate": 0.00011015366660182844, "loss": 4.1016, "step": 4625 }, { "epoch": 1.3491068173532628, "grad_norm": 1.9177041053771973, "learning_rate": 0.00011013421513324256, "loss": 3.9246, "step": 4626 }, { "epoch": 1.3493984688297485, "grad_norm": 2.284363031387329, "learning_rate": 0.0001101147636646567, "loss": 4.014, "step": 4627 }, { "epoch": 1.3496901203062341, "grad_norm": 2.9431190490722656, "learning_rate": 0.00011009531219607082, "loss": 4.1052, "step": 4628 }, { "epoch": 1.3499817717827196, "grad_norm": 2.3057780265808105, "learning_rate": 0.00011007586072748492, "loss": 4.0008, "step": 4629 }, { "epoch": 1.3502734232592053, "grad_norm": 2.0381147861480713, "learning_rate": 0.00011005640925889906, "loss": 4.0311, "step": 4630 }, { "epoch": 1.350565074735691, "grad_norm": 3.9126229286193848, "learning_rate": 0.00011003695779031318, "loss": 3.9628, "step": 4631 }, { "epoch": 1.3508567262121765, "grad_norm": 2.794229745864868, "learning_rate": 0.00011001750632172729, "loss": 4.1065, "step": 4632 }, { "epoch": 1.351148377688662, "grad_norm": 2.8710951805114746, "learning_rate": 0.00010999805485314141, "loss": 4.2445, "step": 4633 }, { "epoch": 1.3514400291651476, "grad_norm": 2.7248754501342773, "learning_rate": 0.00010997860338455555, "loss": 3.9162, "step": 4634 }, { "epoch": 1.3517316806416333, "grad_norm": 1.9225367307662964, "learning_rate": 0.00010995915191596965, "loss": 3.6909, "step": 4635 }, { "epoch": 1.3520233321181188, "grad_norm": 2.7660129070281982, "learning_rate": 0.00010993970044738378, "loss": 4.0223, "step": 4636 }, { "epoch": 1.3523149835946044, "grad_norm": 2.623474597930908, "learning_rate": 0.00010992024897879791, "loss": 4.2868, "step": 4637 }, { "epoch": 1.3526066350710901, "grad_norm": 3.3145108222961426, "learning_rate": 0.00010990079751021203, "loss": 3.9891, "step": 4638 }, { "epoch": 1.3528982865475756, "grad_norm": 2.5081005096435547, "learning_rate": 0.00010988134604162614, "loss": 3.933, "step": 4639 }, { "epoch": 1.3531899380240613, "grad_norm": 2.769207239151001, "learning_rate": 0.00010986189457304028, "loss": 3.9927, "step": 4640 }, { "epoch": 1.353481589500547, "grad_norm": 2.4552392959594727, "learning_rate": 0.0001098424431044544, "loss": 4.1996, "step": 4641 }, { "epoch": 1.3537732409770324, "grad_norm": 2.258594274520874, "learning_rate": 0.0001098229916358685, "loss": 3.8063, "step": 4642 }, { "epoch": 1.354064892453518, "grad_norm": 2.416273832321167, "learning_rate": 0.00010980354016728263, "loss": 3.9303, "step": 4643 }, { "epoch": 1.3543565439300036, "grad_norm": 2.3070261478424072, "learning_rate": 0.00010978408869869676, "loss": 4.0733, "step": 4644 }, { "epoch": 1.3546481954064893, "grad_norm": 2.215167760848999, "learning_rate": 0.00010976463723011088, "loss": 4.0257, "step": 4645 }, { "epoch": 1.3549398468829748, "grad_norm": 2.731255054473877, "learning_rate": 0.00010974518576152499, "loss": 4.1342, "step": 4646 }, { "epoch": 1.3552314983594604, "grad_norm": 2.2105274200439453, "learning_rate": 0.00010972573429293913, "loss": 4.2255, "step": 4647 }, { "epoch": 1.3555231498359461, "grad_norm": 2.0077617168426514, "learning_rate": 0.00010970628282435325, "loss": 4.0233, "step": 4648 }, { "epoch": 1.3558148013124316, "grad_norm": 2.1099700927734375, "learning_rate": 0.00010968683135576736, "loss": 4.0895, "step": 4649 }, { "epoch": 1.3561064527889173, "grad_norm": 2.0066909790039062, "learning_rate": 0.0001096673798871815, "loss": 3.8878, "step": 4650 }, { "epoch": 1.356398104265403, "grad_norm": 2.146730899810791, "learning_rate": 0.00010964792841859561, "loss": 4.032, "step": 4651 }, { "epoch": 1.3566897557418884, "grad_norm": 2.2426679134368896, "learning_rate": 0.00010962847695000973, "loss": 4.1349, "step": 4652 }, { "epoch": 1.3569814072183741, "grad_norm": 2.215208053588867, "learning_rate": 0.00010960902548142384, "loss": 3.7723, "step": 4653 }, { "epoch": 1.3572730586948596, "grad_norm": 3.4755938053131104, "learning_rate": 0.00010958957401283798, "loss": 4.0947, "step": 4654 }, { "epoch": 1.3575647101713453, "grad_norm": 2.152017831802368, "learning_rate": 0.0001095701225442521, "loss": 3.7947, "step": 4655 }, { "epoch": 1.3578563616478307, "grad_norm": 2.424325942993164, "learning_rate": 0.00010955067107566621, "loss": 3.9065, "step": 4656 }, { "epoch": 1.3581480131243164, "grad_norm": 2.3175740242004395, "learning_rate": 0.00010953121960708034, "loss": 4.0559, "step": 4657 }, { "epoch": 1.3584396646008021, "grad_norm": 3.6041147708892822, "learning_rate": 0.00010951176813849446, "loss": 3.9946, "step": 4658 }, { "epoch": 1.3587313160772876, "grad_norm": 2.4983086585998535, "learning_rate": 0.00010949231666990859, "loss": 4.2216, "step": 4659 }, { "epoch": 1.3590229675537733, "grad_norm": 2.530447483062744, "learning_rate": 0.0001094728652013227, "loss": 4.0382, "step": 4660 }, { "epoch": 1.359314619030259, "grad_norm": 2.8941268920898438, "learning_rate": 0.00010945341373273683, "loss": 4.1156, "step": 4661 }, { "epoch": 1.3596062705067444, "grad_norm": 2.480774164199829, "learning_rate": 0.00010943396226415095, "loss": 3.8294, "step": 4662 }, { "epoch": 1.3598979219832301, "grad_norm": 2.2483627796173096, "learning_rate": 0.00010941451079556506, "loss": 4.127, "step": 4663 }, { "epoch": 1.3601895734597156, "grad_norm": 2.294667959213257, "learning_rate": 0.0001093950593269792, "loss": 4.3273, "step": 4664 }, { "epoch": 1.3604812249362013, "grad_norm": 2.155634641647339, "learning_rate": 0.00010937560785839332, "loss": 4.2136, "step": 4665 }, { "epoch": 1.3607728764126867, "grad_norm": 2.547427177429199, "learning_rate": 0.00010935615638980742, "loss": 4.0066, "step": 4666 }, { "epoch": 1.3610645278891724, "grad_norm": 2.7592968940734863, "learning_rate": 0.00010933670492122157, "loss": 3.833, "step": 4667 }, { "epoch": 1.3613561793656581, "grad_norm": 2.550849437713623, "learning_rate": 0.00010931725345263568, "loss": 3.8461, "step": 4668 }, { "epoch": 1.3616478308421436, "grad_norm": 2.3367059230804443, "learning_rate": 0.0001092978019840498, "loss": 4.3363, "step": 4669 }, { "epoch": 1.3619394823186293, "grad_norm": 1.997554063796997, "learning_rate": 0.00010927835051546391, "loss": 4.1133, "step": 4670 }, { "epoch": 1.362231133795115, "grad_norm": 1.977299451828003, "learning_rate": 0.00010925889904687804, "loss": 4.0341, "step": 4671 }, { "epoch": 1.3625227852716004, "grad_norm": 2.029947519302368, "learning_rate": 0.00010923944757829217, "loss": 4.3518, "step": 4672 }, { "epoch": 1.362814436748086, "grad_norm": 1.9763047695159912, "learning_rate": 0.00010921999610970627, "loss": 4.1581, "step": 4673 }, { "epoch": 1.3631060882245716, "grad_norm": 2.1338305473327637, "learning_rate": 0.00010920054464112042, "loss": 4.1411, "step": 4674 }, { "epoch": 1.3633977397010573, "grad_norm": 2.60113263130188, "learning_rate": 0.00010918109317253453, "loss": 4.1997, "step": 4675 }, { "epoch": 1.3636893911775427, "grad_norm": 2.0810253620147705, "learning_rate": 0.00010916164170394865, "loss": 3.8567, "step": 4676 }, { "epoch": 1.3639810426540284, "grad_norm": 2.0667150020599365, "learning_rate": 0.00010914219023536279, "loss": 3.9578, "step": 4677 }, { "epoch": 1.364272694130514, "grad_norm": 2.722405195236206, "learning_rate": 0.0001091227387667769, "loss": 3.8007, "step": 4678 }, { "epoch": 1.3645643456069996, "grad_norm": 1.9273838996887207, "learning_rate": 0.00010910328729819102, "loss": 4.4979, "step": 4679 }, { "epoch": 1.3648559970834853, "grad_norm": 1.9370622634887695, "learning_rate": 0.00010908383582960513, "loss": 3.8038, "step": 4680 }, { "epoch": 1.365147648559971, "grad_norm": 2.798306941986084, "learning_rate": 0.00010906438436101926, "loss": 3.986, "step": 4681 }, { "epoch": 1.3654393000364564, "grad_norm": 3.14278244972229, "learning_rate": 0.00010904493289243338, "loss": 3.9804, "step": 4682 }, { "epoch": 1.365730951512942, "grad_norm": 1.9869518280029297, "learning_rate": 0.0001090254814238475, "loss": 3.985, "step": 4683 }, { "epoch": 1.3660226029894276, "grad_norm": 2.4726219177246094, "learning_rate": 0.00010900602995526164, "loss": 4.2351, "step": 4684 }, { "epoch": 1.3663142544659133, "grad_norm": 1.8635165691375732, "learning_rate": 0.00010898657848667575, "loss": 3.983, "step": 4685 }, { "epoch": 1.3666059059423987, "grad_norm": 2.801600456237793, "learning_rate": 0.00010896712701808987, "loss": 3.8213, "step": 4686 }, { "epoch": 1.3668975574188844, "grad_norm": 2.1181867122650146, "learning_rate": 0.000108947675549504, "loss": 4.095, "step": 4687 }, { "epoch": 1.36718920889537, "grad_norm": 2.395754337310791, "learning_rate": 0.00010892822408091811, "loss": 3.7054, "step": 4688 }, { "epoch": 1.3674808603718556, "grad_norm": 2.5241506099700928, "learning_rate": 0.00010890877261233223, "loss": 4.0786, "step": 4689 }, { "epoch": 1.3677725118483413, "grad_norm": 3.1358211040496826, "learning_rate": 0.00010888932114374634, "loss": 4.1705, "step": 4690 }, { "epoch": 1.368064163324827, "grad_norm": 2.1025338172912598, "learning_rate": 0.00010886986967516049, "loss": 4.3671, "step": 4691 }, { "epoch": 1.3683558148013124, "grad_norm": 2.3555986881256104, "learning_rate": 0.0001088504182065746, "loss": 4.0025, "step": 4692 }, { "epoch": 1.368647466277798, "grad_norm": 2.1251742839813232, "learning_rate": 0.00010883096673798872, "loss": 4.0873, "step": 4693 }, { "epoch": 1.3689391177542836, "grad_norm": 2.5940897464752197, "learning_rate": 0.00010881151526940285, "loss": 3.9766, "step": 4694 }, { "epoch": 1.3692307692307693, "grad_norm": 2.036111354827881, "learning_rate": 0.00010879206380081696, "loss": 3.9951, "step": 4695 }, { "epoch": 1.3695224207072547, "grad_norm": 2.3082406520843506, "learning_rate": 0.00010877261233223108, "loss": 3.9096, "step": 4696 }, { "epoch": 1.3698140721837404, "grad_norm": 2.5700738430023193, "learning_rate": 0.00010875316086364522, "loss": 3.7706, "step": 4697 }, { "epoch": 1.370105723660226, "grad_norm": 3.210986852645874, "learning_rate": 0.00010873370939505934, "loss": 3.9622, "step": 4698 }, { "epoch": 1.3703973751367116, "grad_norm": 1.9581995010375977, "learning_rate": 0.00010871425792647345, "loss": 3.9273, "step": 4699 }, { "epoch": 1.3706890266131972, "grad_norm": 2.352825880050659, "learning_rate": 0.00010869480645788757, "loss": 4.2299, "step": 4700 }, { "epoch": 1.370980678089683, "grad_norm": 1.884541630744934, "learning_rate": 0.0001086753549893017, "loss": 4.1627, "step": 4701 }, { "epoch": 1.3712723295661684, "grad_norm": 2.2876083850860596, "learning_rate": 0.00010865590352071581, "loss": 4.0689, "step": 4702 }, { "epoch": 1.371563981042654, "grad_norm": 3.00128436088562, "learning_rate": 0.00010863645205212994, "loss": 4.0238, "step": 4703 }, { "epoch": 1.3718556325191396, "grad_norm": 2.3745813369750977, "learning_rate": 0.00010861700058354407, "loss": 4.3463, "step": 4704 }, { "epoch": 1.3721472839956252, "grad_norm": 2.3538012504577637, "learning_rate": 0.00010859754911495819, "loss": 4.0817, "step": 4705 }, { "epoch": 1.3724389354721107, "grad_norm": 4.32119607925415, "learning_rate": 0.0001085780976463723, "loss": 4.2532, "step": 4706 }, { "epoch": 1.3727305869485964, "grad_norm": 2.6841065883636475, "learning_rate": 0.00010855864617778644, "loss": 3.7366, "step": 4707 }, { "epoch": 1.373022238425082, "grad_norm": 2.1959402561187744, "learning_rate": 0.00010853919470920056, "loss": 4.0408, "step": 4708 }, { "epoch": 1.3733138899015676, "grad_norm": 2.5825183391571045, "learning_rate": 0.00010851974324061467, "loss": 3.9901, "step": 4709 }, { "epoch": 1.3736055413780532, "grad_norm": 2.0190517902374268, "learning_rate": 0.00010850029177202879, "loss": 4.3243, "step": 4710 }, { "epoch": 1.373897192854539, "grad_norm": 2.065232515335083, "learning_rate": 0.00010848084030344292, "loss": 3.851, "step": 4711 }, { "epoch": 1.3741888443310244, "grad_norm": 2.58622670173645, "learning_rate": 0.00010846138883485703, "loss": 4.2281, "step": 4712 }, { "epoch": 1.37448049580751, "grad_norm": 2.149852991104126, "learning_rate": 0.00010844193736627115, "loss": 3.8059, "step": 4713 }, { "epoch": 1.3747721472839955, "grad_norm": 2.1801493167877197, "learning_rate": 0.00010842248589768529, "loss": 4.0214, "step": 4714 }, { "epoch": 1.3750637987604812, "grad_norm": 2.5488274097442627, "learning_rate": 0.00010840303442909941, "loss": 3.9397, "step": 4715 }, { "epoch": 1.3753554502369667, "grad_norm": 2.7041149139404297, "learning_rate": 0.00010838358296051352, "loss": 4.1958, "step": 4716 }, { "epoch": 1.3756471017134524, "grad_norm": 2.4224345684051514, "learning_rate": 0.00010836413149192765, "loss": 4.0579, "step": 4717 }, { "epoch": 1.375938753189938, "grad_norm": 1.8962262868881226, "learning_rate": 0.00010834468002334177, "loss": 3.6588, "step": 4718 }, { "epoch": 1.3762304046664235, "grad_norm": 2.361464738845825, "learning_rate": 0.00010832522855475588, "loss": 4.2466, "step": 4719 }, { "epoch": 1.3765220561429092, "grad_norm": 1.7406812906265259, "learning_rate": 0.00010830577708617, "loss": 3.7717, "step": 4720 }, { "epoch": 1.376813707619395, "grad_norm": 2.5460548400878906, "learning_rate": 0.00010828632561758414, "loss": 3.8803, "step": 4721 }, { "epoch": 1.3771053590958804, "grad_norm": 2.1964902877807617, "learning_rate": 0.00010826687414899826, "loss": 3.9207, "step": 4722 }, { "epoch": 1.377397010572366, "grad_norm": 2.9081461429595947, "learning_rate": 0.00010824742268041237, "loss": 4.3088, "step": 4723 }, { "epoch": 1.3776886620488515, "grad_norm": 2.436938762664795, "learning_rate": 0.0001082279712118265, "loss": 4.0888, "step": 4724 }, { "epoch": 1.3779803135253372, "grad_norm": 2.200960159301758, "learning_rate": 0.00010820851974324062, "loss": 4.024, "step": 4725 }, { "epoch": 1.3782719650018227, "grad_norm": 2.5153725147247314, "learning_rate": 0.00010818906827465473, "loss": 4.0477, "step": 4726 }, { "epoch": 1.3785636164783084, "grad_norm": 1.9952987432479858, "learning_rate": 0.00010816961680606887, "loss": 3.9806, "step": 4727 }, { "epoch": 1.378855267954794, "grad_norm": 1.9967377185821533, "learning_rate": 0.00010815016533748299, "loss": 3.5786, "step": 4728 }, { "epoch": 1.3791469194312795, "grad_norm": 2.282454490661621, "learning_rate": 0.00010813071386889711, "loss": 3.9442, "step": 4729 }, { "epoch": 1.3794385709077652, "grad_norm": 1.9153382778167725, "learning_rate": 0.00010811126240031122, "loss": 3.8746, "step": 4730 }, { "epoch": 1.379730222384251, "grad_norm": 2.426436424255371, "learning_rate": 0.00010809181093172535, "loss": 4.12, "step": 4731 }, { "epoch": 1.3800218738607364, "grad_norm": 2.174394369125366, "learning_rate": 0.00010807235946313948, "loss": 4.1943, "step": 4732 }, { "epoch": 1.380313525337222, "grad_norm": 1.9720299243927002, "learning_rate": 0.00010805290799455358, "loss": 4.2727, "step": 4733 }, { "epoch": 1.3806051768137078, "grad_norm": 2.0082004070281982, "learning_rate": 0.00010803345652596772, "loss": 4.2227, "step": 4734 }, { "epoch": 1.3808968282901932, "grad_norm": 2.596585750579834, "learning_rate": 0.00010801400505738184, "loss": 4.1583, "step": 4735 }, { "epoch": 1.3811884797666787, "grad_norm": 2.089992046356201, "learning_rate": 0.00010799455358879595, "loss": 3.7842, "step": 4736 }, { "epoch": 1.3814801312431644, "grad_norm": 2.569763422012329, "learning_rate": 0.0001079751021202101, "loss": 3.9814, "step": 4737 }, { "epoch": 1.38177178271965, "grad_norm": 2.6575193405151367, "learning_rate": 0.0001079556506516242, "loss": 4.158, "step": 4738 }, { "epoch": 1.3820634341961355, "grad_norm": 3.491305351257324, "learning_rate": 0.00010793619918303833, "loss": 3.8137, "step": 4739 }, { "epoch": 1.3823550856726212, "grad_norm": 2.153637170791626, "learning_rate": 0.00010791674771445243, "loss": 4.0099, "step": 4740 }, { "epoch": 1.382646737149107, "grad_norm": 2.385416269302368, "learning_rate": 0.00010789729624586657, "loss": 3.8986, "step": 4741 }, { "epoch": 1.3829383886255924, "grad_norm": 2.5199782848358154, "learning_rate": 0.00010787784477728069, "loss": 3.8168, "step": 4742 }, { "epoch": 1.383230040102078, "grad_norm": 2.6045873165130615, "learning_rate": 0.0001078583933086948, "loss": 3.6718, "step": 4743 }, { "epoch": 1.3835216915785638, "grad_norm": 2.857475996017456, "learning_rate": 0.00010783894184010895, "loss": 3.897, "step": 4744 }, { "epoch": 1.3838133430550492, "grad_norm": 2.4607651233673096, "learning_rate": 0.00010781949037152306, "loss": 4.2237, "step": 4745 }, { "epoch": 1.3841049945315347, "grad_norm": 1.9184008836746216, "learning_rate": 0.00010780003890293718, "loss": 4.2692, "step": 4746 }, { "epoch": 1.3843966460080204, "grad_norm": 2.358930826187134, "learning_rate": 0.00010778058743435129, "loss": 4.0691, "step": 4747 }, { "epoch": 1.384688297484506, "grad_norm": 2.5145232677459717, "learning_rate": 0.00010776113596576542, "loss": 4.1897, "step": 4748 }, { "epoch": 1.3849799489609915, "grad_norm": 2.759153366088867, "learning_rate": 0.00010774168449717954, "loss": 4.0546, "step": 4749 }, { "epoch": 1.3852716004374772, "grad_norm": 2.5654072761535645, "learning_rate": 0.00010772223302859365, "loss": 4.0384, "step": 4750 }, { "epoch": 1.385563251913963, "grad_norm": 2.4695839881896973, "learning_rate": 0.0001077027815600078, "loss": 4.1611, "step": 4751 }, { "epoch": 1.3858549033904484, "grad_norm": 1.703585147857666, "learning_rate": 0.00010768333009142191, "loss": 3.8197, "step": 4752 }, { "epoch": 1.386146554866934, "grad_norm": 2.623689889907837, "learning_rate": 0.00010766387862283603, "loss": 4.2257, "step": 4753 }, { "epoch": 1.3864382063434197, "grad_norm": 3.078813076019287, "learning_rate": 0.00010764442715425016, "loss": 3.978, "step": 4754 }, { "epoch": 1.3867298578199052, "grad_norm": 2.1969404220581055, "learning_rate": 0.00010762497568566427, "loss": 4.2094, "step": 4755 }, { "epoch": 1.3870215092963907, "grad_norm": 2.5480120182037354, "learning_rate": 0.0001076055242170784, "loss": 4.1212, "step": 4756 }, { "epoch": 1.3873131607728764, "grad_norm": 3.0961902141571045, "learning_rate": 0.0001075860727484925, "loss": 4.2821, "step": 4757 }, { "epoch": 1.387604812249362, "grad_norm": 2.052837371826172, "learning_rate": 0.00010756662127990664, "loss": 3.9495, "step": 4758 }, { "epoch": 1.3878964637258475, "grad_norm": 1.9619884490966797, "learning_rate": 0.00010754716981132076, "loss": 4.343, "step": 4759 }, { "epoch": 1.3881881152023332, "grad_norm": 2.365852117538452, "learning_rate": 0.00010752771834273487, "loss": 3.6985, "step": 4760 }, { "epoch": 1.388479766678819, "grad_norm": 2.2761948108673096, "learning_rate": 0.00010750826687414902, "loss": 4.1591, "step": 4761 }, { "epoch": 1.3887714181553044, "grad_norm": 2.563380479812622, "learning_rate": 0.00010748881540556312, "loss": 4.1622, "step": 4762 }, { "epoch": 1.38906306963179, "grad_norm": 2.1427104473114014, "learning_rate": 0.00010746936393697724, "loss": 4.0502, "step": 4763 }, { "epoch": 1.3893547211082757, "grad_norm": 3.4281303882598877, "learning_rate": 0.00010744991246839138, "loss": 4.077, "step": 4764 }, { "epoch": 1.3896463725847612, "grad_norm": 2.990236520767212, "learning_rate": 0.00010743046099980549, "loss": 3.9502, "step": 4765 }, { "epoch": 1.389938024061247, "grad_norm": 3.2659358978271484, "learning_rate": 0.00010741100953121961, "loss": 4.1225, "step": 4766 }, { "epoch": 1.3902296755377324, "grad_norm": 2.289283037185669, "learning_rate": 0.00010739155806263372, "loss": 4.0996, "step": 4767 }, { "epoch": 1.390521327014218, "grad_norm": 3.354525089263916, "learning_rate": 0.00010737210659404787, "loss": 4.3632, "step": 4768 }, { "epoch": 1.3908129784907035, "grad_norm": 2.036207914352417, "learning_rate": 0.00010735265512546197, "loss": 4.3309, "step": 4769 }, { "epoch": 1.3911046299671892, "grad_norm": 2.2380149364471436, "learning_rate": 0.0001073332036568761, "loss": 4.1393, "step": 4770 }, { "epoch": 1.3913962814436749, "grad_norm": 2.4933571815490723, "learning_rate": 0.00010731375218829023, "loss": 4.1505, "step": 4771 }, { "epoch": 1.3916879329201604, "grad_norm": 2.0486490726470947, "learning_rate": 0.00010729430071970434, "loss": 3.8808, "step": 4772 }, { "epoch": 1.391979584396646, "grad_norm": 2.48087477684021, "learning_rate": 0.00010727484925111846, "loss": 4.0064, "step": 4773 }, { "epoch": 1.3922712358731317, "grad_norm": 2.3648083209991455, "learning_rate": 0.0001072553977825326, "loss": 4.1795, "step": 4774 }, { "epoch": 1.3925628873496172, "grad_norm": 2.101156711578369, "learning_rate": 0.00010723594631394672, "loss": 4.1915, "step": 4775 }, { "epoch": 1.3928545388261029, "grad_norm": 2.86356258392334, "learning_rate": 0.00010721649484536083, "loss": 3.8954, "step": 4776 }, { "epoch": 1.3931461903025883, "grad_norm": 2.121237277984619, "learning_rate": 0.00010719704337677495, "loss": 3.9641, "step": 4777 }, { "epoch": 1.393437841779074, "grad_norm": 2.3758955001831055, "learning_rate": 0.00010717759190818908, "loss": 4.1764, "step": 4778 }, { "epoch": 1.3937294932555595, "grad_norm": 2.244812488555908, "learning_rate": 0.00010715814043960319, "loss": 4.1744, "step": 4779 }, { "epoch": 1.3940211447320452, "grad_norm": 2.0362236499786377, "learning_rate": 0.00010713868897101731, "loss": 3.984, "step": 4780 }, { "epoch": 1.3943127962085309, "grad_norm": 2.7257564067840576, "learning_rate": 0.00010711923750243145, "loss": 3.9566, "step": 4781 }, { "epoch": 1.3946044476850163, "grad_norm": 1.997162103652954, "learning_rate": 0.00010709978603384556, "loss": 4.0815, "step": 4782 }, { "epoch": 1.394896099161502, "grad_norm": 4.714803695678711, "learning_rate": 0.00010708033456525968, "loss": 4.3319, "step": 4783 }, { "epoch": 1.3951877506379877, "grad_norm": 2.101888418197632, "learning_rate": 0.00010706088309667381, "loss": 3.984, "step": 4784 }, { "epoch": 1.3954794021144732, "grad_norm": 2.588160991668701, "learning_rate": 0.00010704143162808793, "loss": 4.1191, "step": 4785 }, { "epoch": 1.3957710535909589, "grad_norm": 2.4912078380584717, "learning_rate": 0.00010702198015950204, "loss": 3.9783, "step": 4786 }, { "epoch": 1.3960627050674443, "grad_norm": 2.134765863418579, "learning_rate": 0.00010700252869091616, "loss": 4.0435, "step": 4787 }, { "epoch": 1.39635435654393, "grad_norm": 2.169950485229492, "learning_rate": 0.0001069830772223303, "loss": 3.9887, "step": 4788 }, { "epoch": 1.3966460080204155, "grad_norm": 2.761608600616455, "learning_rate": 0.0001069636257537444, "loss": 4.1754, "step": 4789 }, { "epoch": 1.3969376594969012, "grad_norm": 2.2176475524902344, "learning_rate": 0.00010694417428515853, "loss": 4.169, "step": 4790 }, { "epoch": 1.3972293109733869, "grad_norm": 2.5531280040740967, "learning_rate": 0.00010692472281657266, "loss": 4.1094, "step": 4791 }, { "epoch": 1.3975209624498723, "grad_norm": 2.7215445041656494, "learning_rate": 0.00010690527134798678, "loss": 4.0765, "step": 4792 }, { "epoch": 1.397812613926358, "grad_norm": 1.9444990158081055, "learning_rate": 0.00010688581987940089, "loss": 3.8117, "step": 4793 }, { "epoch": 1.3981042654028437, "grad_norm": 2.3888301849365234, "learning_rate": 0.00010686636841081503, "loss": 4.1142, "step": 4794 }, { "epoch": 1.3983959168793292, "grad_norm": 2.1514601707458496, "learning_rate": 0.00010684691694222915, "loss": 3.7692, "step": 4795 }, { "epoch": 1.3986875683558149, "grad_norm": 2.31659197807312, "learning_rate": 0.00010682746547364326, "loss": 3.9958, "step": 4796 }, { "epoch": 1.3989792198323003, "grad_norm": 1.8743373155593872, "learning_rate": 0.00010680801400505738, "loss": 4.166, "step": 4797 }, { "epoch": 1.399270871308786, "grad_norm": 2.1851789951324463, "learning_rate": 0.00010678856253647151, "loss": 3.9432, "step": 4798 }, { "epoch": 1.3995625227852715, "grad_norm": 1.8625614643096924, "learning_rate": 0.00010676911106788564, "loss": 4.0435, "step": 4799 }, { "epoch": 1.3998541742617572, "grad_norm": 2.1645138263702393, "learning_rate": 0.00010674965959929974, "loss": 4.0878, "step": 4800 }, { "epoch": 1.4001458257382429, "grad_norm": 2.403996467590332, "learning_rate": 0.00010673020813071388, "loss": 4.0204, "step": 4801 }, { "epoch": 1.4004374772147283, "grad_norm": 2.839763879776001, "learning_rate": 0.000106710756662128, "loss": 4.0642, "step": 4802 }, { "epoch": 1.400729128691214, "grad_norm": 2.226719856262207, "learning_rate": 0.00010669130519354211, "loss": 4.1231, "step": 4803 }, { "epoch": 1.4010207801676997, "grad_norm": 2.3118860721588135, "learning_rate": 0.00010667185372495624, "loss": 4.0059, "step": 4804 }, { "epoch": 1.4013124316441852, "grad_norm": 2.7093846797943115, "learning_rate": 0.00010665240225637037, "loss": 4.031, "step": 4805 }, { "epoch": 1.4016040831206709, "grad_norm": 2.417235851287842, "learning_rate": 0.00010663295078778447, "loss": 3.7933, "step": 4806 }, { "epoch": 1.4018957345971563, "grad_norm": 2.9261465072631836, "learning_rate": 0.0001066134993191986, "loss": 4.1018, "step": 4807 }, { "epoch": 1.402187386073642, "grad_norm": 2.8475029468536377, "learning_rate": 0.00010659404785061273, "loss": 4.3427, "step": 4808 }, { "epoch": 1.4024790375501275, "grad_norm": 2.3784711360931396, "learning_rate": 0.00010657459638202685, "loss": 4.3537, "step": 4809 }, { "epoch": 1.4027706890266132, "grad_norm": 3.3753466606140137, "learning_rate": 0.00010655514491344096, "loss": 4.2049, "step": 4810 }, { "epoch": 1.4030623405030989, "grad_norm": 2.089761972427368, "learning_rate": 0.0001065356934448551, "loss": 4.1478, "step": 4811 }, { "epoch": 1.4033539919795843, "grad_norm": 2.924191951751709, "learning_rate": 0.00010651624197626922, "loss": 3.9778, "step": 4812 }, { "epoch": 1.40364564345607, "grad_norm": 3.854801893234253, "learning_rate": 0.00010649679050768332, "loss": 4.3187, "step": 4813 }, { "epoch": 1.4039372949325557, "grad_norm": 2.2456510066986084, "learning_rate": 0.00010647733903909747, "loss": 4.1172, "step": 4814 }, { "epoch": 1.4042289464090412, "grad_norm": 1.8730639219284058, "learning_rate": 0.00010645788757051158, "loss": 4.0907, "step": 4815 }, { "epoch": 1.4045205978855269, "grad_norm": 2.469423294067383, "learning_rate": 0.0001064384361019257, "loss": 4.3467, "step": 4816 }, { "epoch": 1.4048122493620123, "grad_norm": 2.887603759765625, "learning_rate": 0.00010641898463333981, "loss": 3.9741, "step": 4817 }, { "epoch": 1.405103900838498, "grad_norm": 1.9837193489074707, "learning_rate": 0.00010639953316475395, "loss": 4.2348, "step": 4818 }, { "epoch": 1.4053955523149835, "grad_norm": 3.5294957160949707, "learning_rate": 0.00010638008169616807, "loss": 4.232, "step": 4819 }, { "epoch": 1.4056872037914692, "grad_norm": 2.1097512245178223, "learning_rate": 0.00010636063022758218, "loss": 3.9762, "step": 4820 }, { "epoch": 1.4059788552679549, "grad_norm": 2.866260290145874, "learning_rate": 0.00010634117875899632, "loss": 3.8784, "step": 4821 }, { "epoch": 1.4062705067444403, "grad_norm": 2.78682804107666, "learning_rate": 0.00010632172729041043, "loss": 4.1483, "step": 4822 }, { "epoch": 1.406562158220926, "grad_norm": 4.232974052429199, "learning_rate": 0.00010630227582182455, "loss": 4.0584, "step": 4823 }, { "epoch": 1.4068538096974117, "grad_norm": 3.2652573585510254, "learning_rate": 0.00010628282435323866, "loss": 4.1388, "step": 4824 }, { "epoch": 1.4071454611738972, "grad_norm": 2.5977087020874023, "learning_rate": 0.0001062633728846528, "loss": 4.2259, "step": 4825 }, { "epoch": 1.4074371126503828, "grad_norm": 1.9414585828781128, "learning_rate": 0.00010624392141606692, "loss": 4.4016, "step": 4826 }, { "epoch": 1.4077287641268683, "grad_norm": 2.306063652038574, "learning_rate": 0.00010622446994748103, "loss": 4.152, "step": 4827 }, { "epoch": 1.408020415603354, "grad_norm": 2.852405071258545, "learning_rate": 0.00010620501847889516, "loss": 4.2093, "step": 4828 }, { "epoch": 1.4083120670798395, "grad_norm": 2.3590641021728516, "learning_rate": 0.00010618556701030928, "loss": 4.3416, "step": 4829 }, { "epoch": 1.4086037185563252, "grad_norm": 2.7854387760162354, "learning_rate": 0.0001061661155417234, "loss": 3.9582, "step": 4830 }, { "epoch": 1.4088953700328108, "grad_norm": 2.4641294479370117, "learning_rate": 0.00010614666407313754, "loss": 3.9474, "step": 4831 }, { "epoch": 1.4091870215092963, "grad_norm": 3.09967041015625, "learning_rate": 0.00010612721260455165, "loss": 4.276, "step": 4832 }, { "epoch": 1.409478672985782, "grad_norm": 2.3670616149902344, "learning_rate": 0.00010610776113596577, "loss": 4.2226, "step": 4833 }, { "epoch": 1.4097703244622677, "grad_norm": 2.190106153488159, "learning_rate": 0.00010608830966737988, "loss": 3.975, "step": 4834 }, { "epoch": 1.4100619759387532, "grad_norm": 2.403287887573242, "learning_rate": 0.00010606885819879401, "loss": 4.012, "step": 4835 }, { "epoch": 1.4103536274152388, "grad_norm": 2.0343363285064697, "learning_rate": 0.00010604940673020813, "loss": 3.951, "step": 4836 }, { "epoch": 1.4106452788917243, "grad_norm": 2.2281410694122314, "learning_rate": 0.00010602995526162224, "loss": 4.1798, "step": 4837 }, { "epoch": 1.41093693036821, "grad_norm": 1.9793670177459717, "learning_rate": 0.00010601050379303639, "loss": 4.0494, "step": 4838 }, { "epoch": 1.4112285818446955, "grad_norm": 2.056490421295166, "learning_rate": 0.0001059910523244505, "loss": 3.8945, "step": 4839 }, { "epoch": 1.4115202333211812, "grad_norm": 2.975106954574585, "learning_rate": 0.00010597160085586462, "loss": 4.3212, "step": 4840 }, { "epoch": 1.4118118847976668, "grad_norm": 2.323573350906372, "learning_rate": 0.00010595214938727876, "loss": 4.0276, "step": 4841 }, { "epoch": 1.4121035362741523, "grad_norm": 2.154378652572632, "learning_rate": 0.00010593269791869286, "loss": 4.2324, "step": 4842 }, { "epoch": 1.412395187750638, "grad_norm": 2.343644618988037, "learning_rate": 0.00010591324645010699, "loss": 4.2946, "step": 4843 }, { "epoch": 1.4126868392271237, "grad_norm": 2.300693988800049, "learning_rate": 0.0001058937949815211, "loss": 4.0669, "step": 4844 }, { "epoch": 1.4129784907036091, "grad_norm": 1.9866400957107544, "learning_rate": 0.00010587434351293524, "loss": 3.8053, "step": 4845 }, { "epoch": 1.4132701421800948, "grad_norm": 2.398909330368042, "learning_rate": 0.00010585489204434935, "loss": 4.2357, "step": 4846 }, { "epoch": 1.4135617936565805, "grad_norm": 2.3551135063171387, "learning_rate": 0.00010583544057576347, "loss": 4.0788, "step": 4847 }, { "epoch": 1.413853445133066, "grad_norm": 3.3769664764404297, "learning_rate": 0.00010581598910717761, "loss": 4.3126, "step": 4848 }, { "epoch": 1.4141450966095515, "grad_norm": 2.0065338611602783, "learning_rate": 0.00010579653763859172, "loss": 3.9471, "step": 4849 }, { "epoch": 1.4144367480860371, "grad_norm": 2.33146595954895, "learning_rate": 0.00010577708617000584, "loss": 4.2794, "step": 4850 }, { "epoch": 1.4147283995625228, "grad_norm": 2.6082682609558105, "learning_rate": 0.00010575763470141997, "loss": 4.0074, "step": 4851 }, { "epoch": 1.4150200510390083, "grad_norm": 2.142646551132202, "learning_rate": 0.00010573818323283408, "loss": 4.2665, "step": 4852 }, { "epoch": 1.415311702515494, "grad_norm": 2.110158681869507, "learning_rate": 0.0001057187317642482, "loss": 4.1086, "step": 4853 }, { "epoch": 1.4156033539919797, "grad_norm": 1.9180253744125366, "learning_rate": 0.00010569928029566232, "loss": 3.7462, "step": 4854 }, { "epoch": 1.4158950054684651, "grad_norm": 2.497175931930542, "learning_rate": 0.00010567982882707646, "loss": 4.1036, "step": 4855 }, { "epoch": 1.4161866569449508, "grad_norm": 2.426915407180786, "learning_rate": 0.00010566037735849057, "loss": 3.9873, "step": 4856 }, { "epoch": 1.4164783084214365, "grad_norm": 2.255715847015381, "learning_rate": 0.00010564092588990469, "loss": 4.0271, "step": 4857 }, { "epoch": 1.416769959897922, "grad_norm": 2.682234525680542, "learning_rate": 0.00010562147442131882, "loss": 4.3836, "step": 4858 }, { "epoch": 1.4170616113744074, "grad_norm": 1.6097440719604492, "learning_rate": 0.00010560202295273293, "loss": 3.7884, "step": 4859 }, { "epoch": 1.4173532628508931, "grad_norm": 2.144697666168213, "learning_rate": 0.00010558257148414705, "loss": 4.131, "step": 4860 }, { "epoch": 1.4176449143273788, "grad_norm": 2.4461801052093506, "learning_rate": 0.00010556312001556119, "loss": 4.2349, "step": 4861 }, { "epoch": 1.4179365658038643, "grad_norm": 2.1551589965820312, "learning_rate": 0.00010554366854697531, "loss": 4.1857, "step": 4862 }, { "epoch": 1.41822821728035, "grad_norm": 2.1726949214935303, "learning_rate": 0.00010552421707838942, "loss": 4.29, "step": 4863 }, { "epoch": 1.4185198687568357, "grad_norm": 2.2374815940856934, "learning_rate": 0.00010550476560980354, "loss": 3.9415, "step": 4864 }, { "epoch": 1.4188115202333211, "grad_norm": 2.3065950870513916, "learning_rate": 0.00010548531414121767, "loss": 4.3727, "step": 4865 }, { "epoch": 1.4191031717098068, "grad_norm": 2.2623486518859863, "learning_rate": 0.00010546586267263178, "loss": 4.2314, "step": 4866 }, { "epoch": 1.4193948231862925, "grad_norm": 3.292681932449341, "learning_rate": 0.0001054464112040459, "loss": 4.0237, "step": 4867 }, { "epoch": 1.419686474662778, "grad_norm": 2.090996265411377, "learning_rate": 0.00010542695973546004, "loss": 3.9043, "step": 4868 }, { "epoch": 1.4199781261392634, "grad_norm": 2.1949691772460938, "learning_rate": 0.00010540750826687416, "loss": 4.0066, "step": 4869 }, { "epoch": 1.4202697776157491, "grad_norm": 2.571295738220215, "learning_rate": 0.00010538805679828827, "loss": 4.3864, "step": 4870 }, { "epoch": 1.4205614290922348, "grad_norm": 2.663003921508789, "learning_rate": 0.0001053686053297024, "loss": 4.058, "step": 4871 }, { "epoch": 1.4208530805687203, "grad_norm": 2.233776569366455, "learning_rate": 0.00010534915386111653, "loss": 4.0441, "step": 4872 }, { "epoch": 1.421144732045206, "grad_norm": 1.915183186531067, "learning_rate": 0.00010532970239253063, "loss": 3.9566, "step": 4873 }, { "epoch": 1.4214363835216917, "grad_norm": 2.152381420135498, "learning_rate": 0.00010531025092394476, "loss": 4.2466, "step": 4874 }, { "epoch": 1.4217280349981771, "grad_norm": 2.998387336730957, "learning_rate": 0.00010529079945535889, "loss": 4.0357, "step": 4875 }, { "epoch": 1.4220196864746628, "grad_norm": 2.6678645610809326, "learning_rate": 0.00010527134798677301, "loss": 4.0381, "step": 4876 }, { "epoch": 1.4223113379511485, "grad_norm": 2.0110435485839844, "learning_rate": 0.00010525189651818712, "loss": 3.8199, "step": 4877 }, { "epoch": 1.422602989427634, "grad_norm": 2.643810510635376, "learning_rate": 0.00010523244504960126, "loss": 3.9175, "step": 4878 }, { "epoch": 1.4228946409041197, "grad_norm": 1.9406009912490845, "learning_rate": 0.00010521299358101538, "loss": 3.8736, "step": 4879 }, { "epoch": 1.4231862923806051, "grad_norm": 2.613297462463379, "learning_rate": 0.00010519354211242949, "loss": 4.2762, "step": 4880 }, { "epoch": 1.4234779438570908, "grad_norm": 2.468045234680176, "learning_rate": 0.00010517409064384362, "loss": 3.9667, "step": 4881 }, { "epoch": 1.4237695953335763, "grad_norm": 2.2007877826690674, "learning_rate": 0.00010515463917525774, "loss": 3.7397, "step": 4882 }, { "epoch": 1.424061246810062, "grad_norm": 2.513380765914917, "learning_rate": 0.00010513518770667185, "loss": 4.1434, "step": 4883 }, { "epoch": 1.4243528982865477, "grad_norm": 2.1113836765289307, "learning_rate": 0.00010511573623808597, "loss": 4.0856, "step": 4884 }, { "epoch": 1.4246445497630331, "grad_norm": 2.115089178085327, "learning_rate": 0.0001050962847695001, "loss": 4.0053, "step": 4885 }, { "epoch": 1.4249362012395188, "grad_norm": 2.107685089111328, "learning_rate": 0.00010507683330091423, "loss": 3.9009, "step": 4886 }, { "epoch": 1.4252278527160045, "grad_norm": 3.2840609550476074, "learning_rate": 0.00010505738183232834, "loss": 4.0641, "step": 4887 }, { "epoch": 1.42551950419249, "grad_norm": 1.591413974761963, "learning_rate": 0.00010503793036374247, "loss": 3.8701, "step": 4888 }, { "epoch": 1.4258111556689756, "grad_norm": 2.0022950172424316, "learning_rate": 0.00010501847889515659, "loss": 4.122, "step": 4889 }, { "epoch": 1.4261028071454611, "grad_norm": 2.021486759185791, "learning_rate": 0.0001049990274265707, "loss": 4.1618, "step": 4890 }, { "epoch": 1.4263944586219468, "grad_norm": 2.498328685760498, "learning_rate": 0.00010497957595798485, "loss": 4.2096, "step": 4891 }, { "epoch": 1.4266861100984323, "grad_norm": 1.763560175895691, "learning_rate": 0.00010496012448939896, "loss": 4.0789, "step": 4892 }, { "epoch": 1.426977761574918, "grad_norm": 2.2125589847564697, "learning_rate": 0.00010494067302081308, "loss": 4.5189, "step": 4893 }, { "epoch": 1.4272694130514036, "grad_norm": 2.4079058170318604, "learning_rate": 0.00010492122155222719, "loss": 4.1637, "step": 4894 }, { "epoch": 1.4275610645278891, "grad_norm": 2.1644558906555176, "learning_rate": 0.00010490177008364132, "loss": 4.0671, "step": 4895 }, { "epoch": 1.4278527160043748, "grad_norm": 2.1905553340911865, "learning_rate": 0.00010488231861505544, "loss": 4.2123, "step": 4896 }, { "epoch": 1.4281443674808605, "grad_norm": 3.1796469688415527, "learning_rate": 0.00010486286714646955, "loss": 4.2767, "step": 4897 }, { "epoch": 1.428436018957346, "grad_norm": 3.153953790664673, "learning_rate": 0.00010484341567788369, "loss": 4.0597, "step": 4898 }, { "epoch": 1.4287276704338316, "grad_norm": 2.473392963409424, "learning_rate": 0.00010482396420929781, "loss": 3.8691, "step": 4899 }, { "epoch": 1.429019321910317, "grad_norm": 3.044034481048584, "learning_rate": 0.00010480451274071193, "loss": 4.0411, "step": 4900 }, { "epoch": 1.4293109733868028, "grad_norm": 3.0454654693603516, "learning_rate": 0.00010478506127212607, "loss": 3.9289, "step": 4901 }, { "epoch": 1.4296026248632883, "grad_norm": 2.1608052253723145, "learning_rate": 0.00010476560980354017, "loss": 4.2082, "step": 4902 }, { "epoch": 1.429894276339774, "grad_norm": 3.1886887550354004, "learning_rate": 0.0001047461583349543, "loss": 3.8705, "step": 4903 }, { "epoch": 1.4301859278162596, "grad_norm": 2.450186252593994, "learning_rate": 0.0001047267068663684, "loss": 4.1048, "step": 4904 }, { "epoch": 1.430477579292745, "grad_norm": 2.3156416416168213, "learning_rate": 0.00010470725539778254, "loss": 4.382, "step": 4905 }, { "epoch": 1.4307692307692308, "grad_norm": 1.8851361274719238, "learning_rate": 0.00010468780392919666, "loss": 3.8924, "step": 4906 }, { "epoch": 1.4310608822457165, "grad_norm": 2.3512468338012695, "learning_rate": 0.00010466835246061077, "loss": 4.1716, "step": 4907 }, { "epoch": 1.431352533722202, "grad_norm": 2.241018056869507, "learning_rate": 0.00010464890099202492, "loss": 3.7974, "step": 4908 }, { "epoch": 1.4316441851986876, "grad_norm": 24.950416564941406, "learning_rate": 0.00010462944952343902, "loss": 4.1934, "step": 4909 }, { "epoch": 1.431935836675173, "grad_norm": 2.9006738662719727, "learning_rate": 0.00010460999805485315, "loss": 4.1354, "step": 4910 }, { "epoch": 1.4322274881516588, "grad_norm": 2.87827205657959, "learning_rate": 0.00010459054658626725, "loss": 4.0464, "step": 4911 }, { "epoch": 1.4325191396281443, "grad_norm": 2.4782469272613525, "learning_rate": 0.00010457109511768139, "loss": 4.1779, "step": 4912 }, { "epoch": 1.43281079110463, "grad_norm": 2.807044506072998, "learning_rate": 0.00010455164364909551, "loss": 3.9456, "step": 4913 }, { "epoch": 1.4331024425811156, "grad_norm": 3.134514093399048, "learning_rate": 0.00010453219218050962, "loss": 4.1418, "step": 4914 }, { "epoch": 1.433394094057601, "grad_norm": 2.1027073860168457, "learning_rate": 0.00010451274071192377, "loss": 3.8751, "step": 4915 }, { "epoch": 1.4336857455340868, "grad_norm": 2.2385060787200928, "learning_rate": 0.00010449328924333788, "loss": 4.2014, "step": 4916 }, { "epoch": 1.4339773970105725, "grad_norm": 2.044647693634033, "learning_rate": 0.000104473837774752, "loss": 3.9688, "step": 4917 }, { "epoch": 1.434269048487058, "grad_norm": 2.2303802967071533, "learning_rate": 0.00010445438630616613, "loss": 4.076, "step": 4918 }, { "epoch": 1.4345606999635436, "grad_norm": 1.881978988647461, "learning_rate": 0.00010443493483758024, "loss": 4.2649, "step": 4919 }, { "epoch": 1.434852351440029, "grad_norm": 2.3446245193481445, "learning_rate": 0.00010441548336899436, "loss": 3.8882, "step": 4920 }, { "epoch": 1.4351440029165148, "grad_norm": 2.422363042831421, "learning_rate": 0.00010439603190040847, "loss": 3.8759, "step": 4921 }, { "epoch": 1.4354356543930002, "grad_norm": 2.686286449432373, "learning_rate": 0.00010437658043182262, "loss": 3.8899, "step": 4922 }, { "epoch": 1.435727305869486, "grad_norm": 2.331709384918213, "learning_rate": 0.00010435712896323673, "loss": 4.3906, "step": 4923 }, { "epoch": 1.4360189573459716, "grad_norm": 2.1483230590820312, "learning_rate": 0.00010433767749465085, "loss": 3.9951, "step": 4924 }, { "epoch": 1.436310608822457, "grad_norm": 2.2159509658813477, "learning_rate": 0.00010431822602606498, "loss": 4.1869, "step": 4925 }, { "epoch": 1.4366022602989428, "grad_norm": 2.531017303466797, "learning_rate": 0.00010429877455747909, "loss": 4.2769, "step": 4926 }, { "epoch": 1.4368939117754285, "grad_norm": 1.9343414306640625, "learning_rate": 0.00010427932308889321, "loss": 3.9533, "step": 4927 }, { "epoch": 1.437185563251914, "grad_norm": 2.1150054931640625, "learning_rate": 0.00010425987162030735, "loss": 3.8753, "step": 4928 }, { "epoch": 1.4374772147283996, "grad_norm": 2.2031023502349854, "learning_rate": 0.00010424042015172146, "loss": 4.2332, "step": 4929 }, { "epoch": 1.437768866204885, "grad_norm": 2.336226463317871, "learning_rate": 0.00010422096868313558, "loss": 3.7984, "step": 4930 }, { "epoch": 1.4380605176813708, "grad_norm": 2.279226779937744, "learning_rate": 0.0001042015172145497, "loss": 4.2798, "step": 4931 }, { "epoch": 1.4383521691578562, "grad_norm": 2.5496582984924316, "learning_rate": 0.00010418206574596383, "loss": 4.0586, "step": 4932 }, { "epoch": 1.438643820634342, "grad_norm": 3.119938850402832, "learning_rate": 0.00010416261427737794, "loss": 4.2991, "step": 4933 }, { "epoch": 1.4389354721108276, "grad_norm": 2.8473737239837646, "learning_rate": 0.00010414316280879206, "loss": 4.1912, "step": 4934 }, { "epoch": 1.439227123587313, "grad_norm": 2.455554723739624, "learning_rate": 0.0001041237113402062, "loss": 4.2396, "step": 4935 }, { "epoch": 1.4395187750637988, "grad_norm": 2.513540029525757, "learning_rate": 0.00010410425987162031, "loss": 3.9979, "step": 4936 }, { "epoch": 1.4398104265402845, "grad_norm": 1.8450813293457031, "learning_rate": 0.00010408480840303443, "loss": 3.9573, "step": 4937 }, { "epoch": 1.44010207801677, "grad_norm": 1.9989612102508545, "learning_rate": 0.00010406535693444856, "loss": 3.9942, "step": 4938 }, { "epoch": 1.4403937294932556, "grad_norm": 2.1416332721710205, "learning_rate": 0.00010404590546586269, "loss": 4.0603, "step": 4939 }, { "epoch": 1.440685380969741, "grad_norm": 1.898666501045227, "learning_rate": 0.0001040264539972768, "loss": 3.9415, "step": 4940 }, { "epoch": 1.4409770324462268, "grad_norm": 2.120546817779541, "learning_rate": 0.00010400700252869092, "loss": 4.0817, "step": 4941 }, { "epoch": 1.4412686839227122, "grad_norm": 2.11554217338562, "learning_rate": 0.00010398755106010505, "loss": 3.8021, "step": 4942 }, { "epoch": 1.441560335399198, "grad_norm": 2.490283966064453, "learning_rate": 0.00010396809959151916, "loss": 4.0958, "step": 4943 }, { "epoch": 1.4418519868756836, "grad_norm": 2.33284330368042, "learning_rate": 0.00010394864812293328, "loss": 3.8674, "step": 4944 }, { "epoch": 1.442143638352169, "grad_norm": 2.60591983795166, "learning_rate": 0.00010392919665434742, "loss": 4.0826, "step": 4945 }, { "epoch": 1.4424352898286548, "grad_norm": 2.392328977584839, "learning_rate": 0.00010390974518576154, "loss": 4.116, "step": 4946 }, { "epoch": 1.4427269413051405, "grad_norm": 1.9936401844024658, "learning_rate": 0.00010389029371717565, "loss": 4.0503, "step": 4947 }, { "epoch": 1.443018592781626, "grad_norm": 2.2157461643218994, "learning_rate": 0.00010387084224858978, "loss": 4.1104, "step": 4948 }, { "epoch": 1.4433102442581116, "grad_norm": 2.2657504081726074, "learning_rate": 0.0001038513907800039, "loss": 3.8179, "step": 4949 }, { "epoch": 1.443601895734597, "grad_norm": 2.0180883407592773, "learning_rate": 0.00010383193931141801, "loss": 3.9553, "step": 4950 }, { "epoch": 1.4438935472110828, "grad_norm": 2.2601466178894043, "learning_rate": 0.00010381248784283213, "loss": 3.8765, "step": 4951 }, { "epoch": 1.4441851986875682, "grad_norm": 2.1454763412475586, "learning_rate": 0.00010379303637424627, "loss": 4.2198, "step": 4952 }, { "epoch": 1.444476850164054, "grad_norm": 2.3365795612335205, "learning_rate": 0.00010377358490566037, "loss": 4.1693, "step": 4953 }, { "epoch": 1.4447685016405396, "grad_norm": 2.024151086807251, "learning_rate": 0.0001037541334370745, "loss": 4.0065, "step": 4954 }, { "epoch": 1.445060153117025, "grad_norm": 2.1027231216430664, "learning_rate": 0.00010373468196848863, "loss": 4.0999, "step": 4955 }, { "epoch": 1.4453518045935108, "grad_norm": 2.7591168880462646, "learning_rate": 0.00010371523049990275, "loss": 4.0985, "step": 4956 }, { "epoch": 1.4456434560699964, "grad_norm": 1.9930546283721924, "learning_rate": 0.00010369577903131686, "loss": 4.3037, "step": 4957 }, { "epoch": 1.445935107546482, "grad_norm": 2.339979887008667, "learning_rate": 0.000103676327562731, "loss": 4.1182, "step": 4958 }, { "epoch": 1.4462267590229676, "grad_norm": 2.7413907051086426, "learning_rate": 0.00010365687609414512, "loss": 4.2845, "step": 4959 }, { "epoch": 1.4465184104994533, "grad_norm": 2.5221426486968994, "learning_rate": 0.00010363742462555923, "loss": 4.0108, "step": 4960 }, { "epoch": 1.4468100619759388, "grad_norm": 2.2246341705322266, "learning_rate": 0.00010361797315697335, "loss": 4.0941, "step": 4961 }, { "epoch": 1.4471017134524242, "grad_norm": 2.0763258934020996, "learning_rate": 0.00010359852168838748, "loss": 3.9641, "step": 4962 }, { "epoch": 1.44739336492891, "grad_norm": 2.286005735397339, "learning_rate": 0.0001035790702198016, "loss": 4.3187, "step": 4963 }, { "epoch": 1.4476850164053956, "grad_norm": 2.430860996246338, "learning_rate": 0.00010355961875121571, "loss": 4.1601, "step": 4964 }, { "epoch": 1.447976667881881, "grad_norm": 2.795264720916748, "learning_rate": 0.00010354016728262985, "loss": 4.0142, "step": 4965 }, { "epoch": 1.4482683193583668, "grad_norm": 2.7921555042266846, "learning_rate": 0.00010352071581404397, "loss": 4.0526, "step": 4966 }, { "epoch": 1.4485599708348524, "grad_norm": 2.285865306854248, "learning_rate": 0.00010350126434545808, "loss": 4.3953, "step": 4967 }, { "epoch": 1.448851622311338, "grad_norm": 2.3354480266571045, "learning_rate": 0.00010348181287687223, "loss": 3.9385, "step": 4968 }, { "epoch": 1.4491432737878236, "grad_norm": 3.0218589305877686, "learning_rate": 0.00010346236140828633, "loss": 4.0419, "step": 4969 }, { "epoch": 1.4494349252643093, "grad_norm": 2.236956834793091, "learning_rate": 0.00010344290993970046, "loss": 3.9554, "step": 4970 }, { "epoch": 1.4497265767407947, "grad_norm": 3.042302131652832, "learning_rate": 0.00010342345847111456, "loss": 4.0482, "step": 4971 }, { "epoch": 1.4500182282172802, "grad_norm": 3.5011653900146484, "learning_rate": 0.0001034040070025287, "loss": 4.2299, "step": 4972 }, { "epoch": 1.450309879693766, "grad_norm": 2.556766986846924, "learning_rate": 0.00010338455553394282, "loss": 4.2379, "step": 4973 }, { "epoch": 1.4506015311702516, "grad_norm": 2.4563169479370117, "learning_rate": 0.00010336510406535693, "loss": 3.9202, "step": 4974 }, { "epoch": 1.450893182646737, "grad_norm": 2.292548179626465, "learning_rate": 0.00010334565259677106, "loss": 4.0769, "step": 4975 }, { "epoch": 1.4511848341232227, "grad_norm": 2.2241101264953613, "learning_rate": 0.00010332620112818519, "loss": 3.8362, "step": 4976 }, { "epoch": 1.4514764855997084, "grad_norm": 2.667907238006592, "learning_rate": 0.0001033067496595993, "loss": 4.2398, "step": 4977 }, { "epoch": 1.451768137076194, "grad_norm": 2.5437796115875244, "learning_rate": 0.00010328729819101344, "loss": 4.2222, "step": 4978 }, { "epoch": 1.4520597885526796, "grad_norm": 2.8823139667510986, "learning_rate": 0.00010326784672242755, "loss": 4.0817, "step": 4979 }, { "epoch": 1.4523514400291653, "grad_norm": 3.0122480392456055, "learning_rate": 0.00010324839525384167, "loss": 3.9475, "step": 4980 }, { "epoch": 1.4526430915056507, "grad_norm": 2.1264379024505615, "learning_rate": 0.00010322894378525578, "loss": 4.0608, "step": 4981 }, { "epoch": 1.4529347429821364, "grad_norm": 2.2372031211853027, "learning_rate": 0.00010320949231666991, "loss": 4.1082, "step": 4982 }, { "epoch": 1.453226394458622, "grad_norm": 2.518124580383301, "learning_rate": 0.00010319004084808404, "loss": 4.2769, "step": 4983 }, { "epoch": 1.4535180459351076, "grad_norm": 3.055410623550415, "learning_rate": 0.00010317058937949814, "loss": 4.0938, "step": 4984 }, { "epoch": 1.453809697411593, "grad_norm": 1.8353078365325928, "learning_rate": 0.00010315113791091229, "loss": 4.1599, "step": 4985 }, { "epoch": 1.4541013488880787, "grad_norm": 2.5489282608032227, "learning_rate": 0.0001031316864423264, "loss": 3.8364, "step": 4986 }, { "epoch": 1.4543930003645644, "grad_norm": 2.106959104537964, "learning_rate": 0.00010311223497374052, "loss": 3.944, "step": 4987 }, { "epoch": 1.45468465184105, "grad_norm": 3.110008478164673, "learning_rate": 0.00010309278350515463, "loss": 3.7658, "step": 4988 }, { "epoch": 1.4549763033175356, "grad_norm": 3.6720194816589355, "learning_rate": 0.00010307333203656877, "loss": 4.2348, "step": 4989 }, { "epoch": 1.4552679547940213, "grad_norm": 2.071568489074707, "learning_rate": 0.00010305388056798289, "loss": 4.0163, "step": 4990 }, { "epoch": 1.4555596062705067, "grad_norm": 2.5847864151000977, "learning_rate": 0.000103034429099397, "loss": 4.2579, "step": 4991 }, { "epoch": 1.4558512577469924, "grad_norm": 2.838953733444214, "learning_rate": 0.00010301497763081114, "loss": 4.3911, "step": 4992 }, { "epoch": 1.4561429092234779, "grad_norm": 2.919095516204834, "learning_rate": 0.00010299552616222525, "loss": 3.7411, "step": 4993 }, { "epoch": 1.4564345606999636, "grad_norm": 1.8425110578536987, "learning_rate": 0.00010297607469363937, "loss": 4.214, "step": 4994 }, { "epoch": 1.456726212176449, "grad_norm": 2.2545206546783447, "learning_rate": 0.00010295662322505351, "loss": 4.1294, "step": 4995 }, { "epoch": 1.4570178636529347, "grad_norm": 2.571777582168579, "learning_rate": 0.00010293717175646762, "loss": 4.389, "step": 4996 }, { "epoch": 1.4573095151294204, "grad_norm": 3.110309600830078, "learning_rate": 0.00010291772028788174, "loss": 3.9977, "step": 4997 }, { "epoch": 1.4576011666059059, "grad_norm": 2.048652410507202, "learning_rate": 0.00010289826881929585, "loss": 4.1098, "step": 4998 }, { "epoch": 1.4578928180823916, "grad_norm": 2.3701348304748535, "learning_rate": 0.00010287881735070998, "loss": 3.6261, "step": 4999 }, { "epoch": 1.4581844695588773, "grad_norm": 2.818082094192505, "learning_rate": 0.0001028593658821241, "loss": 4.208, "step": 5000 }, { "epoch": 1.4584761210353627, "grad_norm": 2.136021852493286, "learning_rate": 0.00010283991441353822, "loss": 4.2594, "step": 5001 }, { "epoch": 1.4587677725118484, "grad_norm": 2.979215383529663, "learning_rate": 0.00010282046294495236, "loss": 4.0415, "step": 5002 }, { "epoch": 1.4590594239883339, "grad_norm": 2.1247990131378174, "learning_rate": 0.00010280101147636647, "loss": 4.0167, "step": 5003 }, { "epoch": 1.4593510754648196, "grad_norm": 2.467054605484009, "learning_rate": 0.00010278156000778059, "loss": 4.1651, "step": 5004 }, { "epoch": 1.459642726941305, "grad_norm": 2.8591740131378174, "learning_rate": 0.00010276210853919472, "loss": 4.1661, "step": 5005 }, { "epoch": 1.4599343784177907, "grad_norm": 2.2073822021484375, "learning_rate": 0.00010274265707060883, "loss": 4.0025, "step": 5006 }, { "epoch": 1.4602260298942764, "grad_norm": 2.4478981494903564, "learning_rate": 0.00010272320560202295, "loss": 3.7804, "step": 5007 }, { "epoch": 1.4605176813707619, "grad_norm": 2.185276746749878, "learning_rate": 0.00010270375413343706, "loss": 3.8509, "step": 5008 }, { "epoch": 1.4608093328472476, "grad_norm": 2.484747886657715, "learning_rate": 0.00010268430266485121, "loss": 3.9973, "step": 5009 }, { "epoch": 1.4611009843237333, "grad_norm": 3.2754454612731934, "learning_rate": 0.00010266485119626532, "loss": 4.2036, "step": 5010 }, { "epoch": 1.4613926358002187, "grad_norm": 3.499357223510742, "learning_rate": 0.00010264539972767944, "loss": 3.8579, "step": 5011 }, { "epoch": 1.4616842872767044, "grad_norm": 2.878164529800415, "learning_rate": 0.00010262594825909358, "loss": 4.2339, "step": 5012 }, { "epoch": 1.4619759387531899, "grad_norm": 2.5813848972320557, "learning_rate": 0.00010260649679050768, "loss": 4.0087, "step": 5013 }, { "epoch": 1.4622675902296756, "grad_norm": 2.94626522064209, "learning_rate": 0.0001025870453219218, "loss": 4.2345, "step": 5014 }, { "epoch": 1.462559241706161, "grad_norm": 2.2302820682525635, "learning_rate": 0.00010256759385333594, "loss": 4.2247, "step": 5015 }, { "epoch": 1.4628508931826467, "grad_norm": 2.1528921127319336, "learning_rate": 0.00010254814238475006, "loss": 4.0927, "step": 5016 }, { "epoch": 1.4631425446591324, "grad_norm": 3.8750972747802734, "learning_rate": 0.00010252869091616417, "loss": 3.9632, "step": 5017 }, { "epoch": 1.4634341961356179, "grad_norm": 2.571650981903076, "learning_rate": 0.00010250923944757829, "loss": 3.9798, "step": 5018 }, { "epoch": 1.4637258476121036, "grad_norm": 3.5272536277770996, "learning_rate": 0.00010248978797899243, "loss": 4.0113, "step": 5019 }, { "epoch": 1.4640174990885892, "grad_norm": 2.4505646228790283, "learning_rate": 0.00010247033651040654, "loss": 4.0639, "step": 5020 }, { "epoch": 1.4643091505650747, "grad_norm": 2.286574363708496, "learning_rate": 0.00010245088504182066, "loss": 4.1618, "step": 5021 }, { "epoch": 1.4646008020415604, "grad_norm": 2.005272626876831, "learning_rate": 0.00010243143357323479, "loss": 4.0781, "step": 5022 }, { "epoch": 1.4648924535180459, "grad_norm": 1.860946536064148, "learning_rate": 0.00010241198210464891, "loss": 4.0993, "step": 5023 }, { "epoch": 1.4651841049945316, "grad_norm": 2.3478074073791504, "learning_rate": 0.00010239253063606302, "loss": 4.1461, "step": 5024 }, { "epoch": 1.465475756471017, "grad_norm": 2.4011740684509277, "learning_rate": 0.00010237307916747716, "loss": 3.8172, "step": 5025 }, { "epoch": 1.4657674079475027, "grad_norm": 2.109407424926758, "learning_rate": 0.00010235362769889128, "loss": 3.8907, "step": 5026 }, { "epoch": 1.4660590594239884, "grad_norm": 2.173271656036377, "learning_rate": 0.00010233417623030539, "loss": 4.2068, "step": 5027 }, { "epoch": 1.4663507109004739, "grad_norm": 2.4473764896392822, "learning_rate": 0.00010231472476171951, "loss": 4.3304, "step": 5028 }, { "epoch": 1.4666423623769596, "grad_norm": 3.6329238414764404, "learning_rate": 0.00010229527329313364, "loss": 4.2397, "step": 5029 }, { "epoch": 1.4669340138534452, "grad_norm": 3.6569154262542725, "learning_rate": 0.00010227582182454775, "loss": 4.0155, "step": 5030 }, { "epoch": 1.4672256653299307, "grad_norm": 2.1975784301757812, "learning_rate": 0.00010225637035596187, "loss": 3.7799, "step": 5031 }, { "epoch": 1.4675173168064164, "grad_norm": 2.778472900390625, "learning_rate": 0.00010223691888737601, "loss": 4.2366, "step": 5032 }, { "epoch": 1.4678089682829019, "grad_norm": 2.1879117488861084, "learning_rate": 0.00010221746741879013, "loss": 3.9135, "step": 5033 }, { "epoch": 1.4681006197593875, "grad_norm": 3.7819671630859375, "learning_rate": 0.00010219801595020424, "loss": 4.0842, "step": 5034 }, { "epoch": 1.468392271235873, "grad_norm": 3.4742631912231445, "learning_rate": 0.00010217856448161837, "loss": 4.1512, "step": 5035 }, { "epoch": 1.4686839227123587, "grad_norm": 1.8884732723236084, "learning_rate": 0.0001021591130130325, "loss": 4.0839, "step": 5036 }, { "epoch": 1.4689755741888444, "grad_norm": 2.58626651763916, "learning_rate": 0.0001021396615444466, "loss": 4.1564, "step": 5037 }, { "epoch": 1.4692672256653299, "grad_norm": 2.2536981105804443, "learning_rate": 0.00010212021007586072, "loss": 4.2255, "step": 5038 }, { "epoch": 1.4695588771418155, "grad_norm": 2.1208996772766113, "learning_rate": 0.00010210075860727486, "loss": 4.2406, "step": 5039 }, { "epoch": 1.4698505286183012, "grad_norm": 1.7638851404190063, "learning_rate": 0.00010208130713868898, "loss": 3.9063, "step": 5040 }, { "epoch": 1.4701421800947867, "grad_norm": 2.073687791824341, "learning_rate": 0.00010206185567010309, "loss": 4.1564, "step": 5041 }, { "epoch": 1.4704338315712724, "grad_norm": 2.0858654975891113, "learning_rate": 0.00010204240420151722, "loss": 4.3083, "step": 5042 }, { "epoch": 1.4707254830477579, "grad_norm": 3.185011386871338, "learning_rate": 0.00010202295273293135, "loss": 4.178, "step": 5043 }, { "epoch": 1.4710171345242435, "grad_norm": 2.2381982803344727, "learning_rate": 0.00010200350126434545, "loss": 3.8765, "step": 5044 }, { "epoch": 1.471308786000729, "grad_norm": 2.655834674835205, "learning_rate": 0.00010198404979575959, "loss": 4.1041, "step": 5045 }, { "epoch": 1.4716004374772147, "grad_norm": 2.344635248184204, "learning_rate": 0.00010196459832717371, "loss": 4.129, "step": 5046 }, { "epoch": 1.4718920889537004, "grad_norm": 2.9630579948425293, "learning_rate": 0.00010194514685858783, "loss": 4.3025, "step": 5047 }, { "epoch": 1.4721837404301858, "grad_norm": 2.129549980163574, "learning_rate": 0.00010192569539000194, "loss": 4.1466, "step": 5048 }, { "epoch": 1.4724753919066715, "grad_norm": 2.319087505340576, "learning_rate": 0.00010190624392141608, "loss": 3.9418, "step": 5049 }, { "epoch": 1.4727670433831572, "grad_norm": 2.1253180503845215, "learning_rate": 0.0001018867924528302, "loss": 4.2138, "step": 5050 }, { "epoch": 1.4730586948596427, "grad_norm": 2.1128182411193848, "learning_rate": 0.0001018673409842443, "loss": 3.8541, "step": 5051 }, { "epoch": 1.4733503463361284, "grad_norm": 2.2290520668029785, "learning_rate": 0.00010184788951565844, "loss": 3.801, "step": 5052 }, { "epoch": 1.4736419978126138, "grad_norm": 2.014460802078247, "learning_rate": 0.00010182843804707256, "loss": 4.07, "step": 5053 }, { "epoch": 1.4739336492890995, "grad_norm": 1.8627568483352661, "learning_rate": 0.00010180898657848667, "loss": 4.342, "step": 5054 }, { "epoch": 1.474225300765585, "grad_norm": 2.2596449851989746, "learning_rate": 0.00010178953510990082, "loss": 4.1029, "step": 5055 }, { "epoch": 1.4745169522420707, "grad_norm": 1.9247106313705444, "learning_rate": 0.00010177008364131493, "loss": 3.6727, "step": 5056 }, { "epoch": 1.4748086037185564, "grad_norm": 2.1980555057525635, "learning_rate": 0.00010175063217272905, "loss": 3.9727, "step": 5057 }, { "epoch": 1.4751002551950418, "grad_norm": 2.2817234992980957, "learning_rate": 0.00010173118070414316, "loss": 3.7489, "step": 5058 }, { "epoch": 1.4753919066715275, "grad_norm": 2.1724557876586914, "learning_rate": 0.00010171172923555729, "loss": 4.0373, "step": 5059 }, { "epoch": 1.4756835581480132, "grad_norm": 3.424089193344116, "learning_rate": 0.00010169227776697141, "loss": 4.284, "step": 5060 }, { "epoch": 1.4759752096244987, "grad_norm": 2.472588062286377, "learning_rate": 0.00010167282629838552, "loss": 4.1461, "step": 5061 }, { "epoch": 1.4762668611009844, "grad_norm": 2.2380404472351074, "learning_rate": 0.00010165337482979967, "loss": 3.9049, "step": 5062 }, { "epoch": 1.47655851257747, "grad_norm": 2.3276751041412354, "learning_rate": 0.00010163392336121378, "loss": 4.1783, "step": 5063 }, { "epoch": 1.4768501640539555, "grad_norm": 2.3946704864501953, "learning_rate": 0.0001016144718926279, "loss": 4.1529, "step": 5064 }, { "epoch": 1.477141815530441, "grad_norm": 2.2630207538604736, "learning_rate": 0.00010159502042404203, "loss": 4.4397, "step": 5065 }, { "epoch": 1.4774334670069267, "grad_norm": 2.7349791526794434, "learning_rate": 0.00010157556895545614, "loss": 4.2539, "step": 5066 }, { "epoch": 1.4777251184834124, "grad_norm": 2.1099393367767334, "learning_rate": 0.00010155611748687026, "loss": 4.1296, "step": 5067 }, { "epoch": 1.4780167699598978, "grad_norm": 1.8607817888259888, "learning_rate": 0.00010153666601828437, "loss": 4.1875, "step": 5068 }, { "epoch": 1.4783084214363835, "grad_norm": 2.424513816833496, "learning_rate": 0.00010151721454969852, "loss": 4.0457, "step": 5069 }, { "epoch": 1.4786000729128692, "grad_norm": 2.761518716812134, "learning_rate": 0.00010149776308111263, "loss": 4.2164, "step": 5070 }, { "epoch": 1.4788917243893547, "grad_norm": 2.0769307613372803, "learning_rate": 0.00010147831161252675, "loss": 3.9839, "step": 5071 }, { "epoch": 1.4791833758658404, "grad_norm": 2.475538969039917, "learning_rate": 0.00010145886014394089, "loss": 4.0276, "step": 5072 }, { "epoch": 1.479475027342326, "grad_norm": 2.032477617263794, "learning_rate": 0.000101439408675355, "loss": 4.0447, "step": 5073 }, { "epoch": 1.4797666788188115, "grad_norm": 2.445805549621582, "learning_rate": 0.00010141995720676911, "loss": 4.1297, "step": 5074 }, { "epoch": 1.480058330295297, "grad_norm": 2.470386028289795, "learning_rate": 0.00010140050573818322, "loss": 4.1134, "step": 5075 }, { "epoch": 1.4803499817717827, "grad_norm": 2.396980047225952, "learning_rate": 0.00010138105426959736, "loss": 4.1817, "step": 5076 }, { "epoch": 1.4806416332482684, "grad_norm": 2.7577426433563232, "learning_rate": 0.00010136160280101148, "loss": 4.2362, "step": 5077 }, { "epoch": 1.4809332847247538, "grad_norm": 2.2758169174194336, "learning_rate": 0.00010134215133242559, "loss": 4.3531, "step": 5078 }, { "epoch": 1.4812249362012395, "grad_norm": 3.9705915451049805, "learning_rate": 0.00010132269986383974, "loss": 4.2404, "step": 5079 }, { "epoch": 1.4815165876777252, "grad_norm": 2.54443621635437, "learning_rate": 0.00010130324839525384, "loss": 4.0197, "step": 5080 }, { "epoch": 1.4818082391542107, "grad_norm": 2.087415933609009, "learning_rate": 0.00010128379692666797, "loss": 4.0722, "step": 5081 }, { "epoch": 1.4820998906306964, "grad_norm": 2.225140333175659, "learning_rate": 0.0001012643454580821, "loss": 3.9656, "step": 5082 }, { "epoch": 1.482391542107182, "grad_norm": 2.1515731811523438, "learning_rate": 0.00010124489398949621, "loss": 4.1828, "step": 5083 }, { "epoch": 1.4826831935836675, "grad_norm": 2.2865092754364014, "learning_rate": 0.00010122544252091033, "loss": 4.0832, "step": 5084 }, { "epoch": 1.482974845060153, "grad_norm": 2.263523578643799, "learning_rate": 0.00010120599105232444, "loss": 4.2484, "step": 5085 }, { "epoch": 1.4832664965366387, "grad_norm": 2.248943567276001, "learning_rate": 0.00010118653958373859, "loss": 3.9614, "step": 5086 }, { "epoch": 1.4835581480131244, "grad_norm": 2.4255802631378174, "learning_rate": 0.0001011670881151527, "loss": 4.1811, "step": 5087 }, { "epoch": 1.4838497994896098, "grad_norm": 2.0135695934295654, "learning_rate": 0.00010114763664656682, "loss": 4.1381, "step": 5088 }, { "epoch": 1.4841414509660955, "grad_norm": 2.472942590713501, "learning_rate": 0.00010112818517798095, "loss": 4.1394, "step": 5089 }, { "epoch": 1.4844331024425812, "grad_norm": 1.9743469953536987, "learning_rate": 0.00010110873370939506, "loss": 4.279, "step": 5090 }, { "epoch": 1.4847247539190667, "grad_norm": 2.4970057010650635, "learning_rate": 0.00010108928224080918, "loss": 4.1803, "step": 5091 }, { "epoch": 1.4850164053955524, "grad_norm": 2.159280776977539, "learning_rate": 0.00010106983077222332, "loss": 3.8967, "step": 5092 }, { "epoch": 1.485308056872038, "grad_norm": 2.0776560306549072, "learning_rate": 0.00010105037930363744, "loss": 3.945, "step": 5093 }, { "epoch": 1.4855997083485235, "grad_norm": 1.9329397678375244, "learning_rate": 0.00010103092783505155, "loss": 4.0413, "step": 5094 }, { "epoch": 1.4858913598250092, "grad_norm": 2.1578598022460938, "learning_rate": 0.00010101147636646567, "loss": 4.1948, "step": 5095 }, { "epoch": 1.4861830113014947, "grad_norm": 1.7040983438491821, "learning_rate": 0.0001009920248978798, "loss": 4.1583, "step": 5096 }, { "epoch": 1.4864746627779803, "grad_norm": 2.074542760848999, "learning_rate": 0.00010097257342929391, "loss": 3.9133, "step": 5097 }, { "epoch": 1.4867663142544658, "grad_norm": 4.174943447113037, "learning_rate": 0.00010095312196070803, "loss": 4.0662, "step": 5098 }, { "epoch": 1.4870579657309515, "grad_norm": 1.9742785692214966, "learning_rate": 0.00010093367049212217, "loss": 4.1854, "step": 5099 }, { "epoch": 1.4873496172074372, "grad_norm": 2.545339584350586, "learning_rate": 0.00010091421902353628, "loss": 4.2389, "step": 5100 }, { "epoch": 1.4876412686839227, "grad_norm": 1.9915255308151245, "learning_rate": 0.0001008947675549504, "loss": 3.897, "step": 5101 }, { "epoch": 1.4879329201604083, "grad_norm": 2.369980573654175, "learning_rate": 0.00010087531608636453, "loss": 4.2255, "step": 5102 }, { "epoch": 1.488224571636894, "grad_norm": 2.2203705310821533, "learning_rate": 0.00010085586461777865, "loss": 4.1194, "step": 5103 }, { "epoch": 1.4885162231133795, "grad_norm": 2.551243543624878, "learning_rate": 0.00010083641314919276, "loss": 4.0737, "step": 5104 }, { "epoch": 1.4888078745898652, "grad_norm": 1.9973565340042114, "learning_rate": 0.00010081696168060688, "loss": 3.9068, "step": 5105 }, { "epoch": 1.4890995260663507, "grad_norm": 2.045454978942871, "learning_rate": 0.00010079751021202102, "loss": 4.0025, "step": 5106 }, { "epoch": 1.4893911775428363, "grad_norm": 2.121934413909912, "learning_rate": 0.00010077805874343513, "loss": 4.0889, "step": 5107 }, { "epoch": 1.4896828290193218, "grad_norm": 2.5940909385681152, "learning_rate": 0.00010075860727484925, "loss": 4.2015, "step": 5108 }, { "epoch": 1.4899744804958075, "grad_norm": 2.253100633621216, "learning_rate": 0.00010073915580626338, "loss": 3.9764, "step": 5109 }, { "epoch": 1.4902661319722932, "grad_norm": 2.3055877685546875, "learning_rate": 0.0001007197043376775, "loss": 4.1902, "step": 5110 }, { "epoch": 1.4905577834487787, "grad_norm": 2.680415630340576, "learning_rate": 0.00010070025286909161, "loss": 3.8607, "step": 5111 }, { "epoch": 1.4908494349252643, "grad_norm": 2.0191118717193604, "learning_rate": 0.00010068080140050575, "loss": 4.234, "step": 5112 }, { "epoch": 1.49114108640175, "grad_norm": 1.9978480339050293, "learning_rate": 0.00010066134993191987, "loss": 4.1577, "step": 5113 }, { "epoch": 1.4914327378782355, "grad_norm": 2.4144411087036133, "learning_rate": 0.00010064189846333398, "loss": 3.9753, "step": 5114 }, { "epoch": 1.4917243893547212, "grad_norm": 1.9133477210998535, "learning_rate": 0.0001006224469947481, "loss": 3.9778, "step": 5115 }, { "epoch": 1.4920160408312066, "grad_norm": 2.3000288009643555, "learning_rate": 0.00010060299552616224, "loss": 3.9771, "step": 5116 }, { "epoch": 1.4923076923076923, "grad_norm": 2.1360366344451904, "learning_rate": 0.00010058354405757636, "loss": 4.192, "step": 5117 }, { "epoch": 1.4925993437841778, "grad_norm": 2.3392205238342285, "learning_rate": 0.00010056409258899047, "loss": 4.1614, "step": 5118 }, { "epoch": 1.4928909952606635, "grad_norm": 2.764159917831421, "learning_rate": 0.0001005446411204046, "loss": 4.1516, "step": 5119 }, { "epoch": 1.4931826467371492, "grad_norm": 2.26825213432312, "learning_rate": 0.00010052518965181872, "loss": 4.117, "step": 5120 }, { "epoch": 1.4934742982136346, "grad_norm": 2.7120988368988037, "learning_rate": 0.00010050573818323283, "loss": 4.008, "step": 5121 }, { "epoch": 1.4937659496901203, "grad_norm": 3.293299913406372, "learning_rate": 0.00010048628671464696, "loss": 4.098, "step": 5122 }, { "epoch": 1.494057601166606, "grad_norm": 2.172234296798706, "learning_rate": 0.00010046683524606109, "loss": 4.5086, "step": 5123 }, { "epoch": 1.4943492526430915, "grad_norm": 2.2207584381103516, "learning_rate": 0.0001004473837774752, "loss": 4.0042, "step": 5124 }, { "epoch": 1.4946409041195772, "grad_norm": 2.6813008785247803, "learning_rate": 0.00010042793230888932, "loss": 4.336, "step": 5125 }, { "epoch": 1.4949325555960626, "grad_norm": 1.9591819047927856, "learning_rate": 0.00010040848084030345, "loss": 4.0318, "step": 5126 }, { "epoch": 1.4952242070725483, "grad_norm": 2.0599894523620605, "learning_rate": 0.00010038902937171757, "loss": 4.0463, "step": 5127 }, { "epoch": 1.4955158585490338, "grad_norm": 2.107222080230713, "learning_rate": 0.00010036957790313168, "loss": 4.2173, "step": 5128 }, { "epoch": 1.4958075100255195, "grad_norm": 3.4523489475250244, "learning_rate": 0.00010035012643454582, "loss": 3.9546, "step": 5129 }, { "epoch": 1.4960991615020052, "grad_norm": 2.4993879795074463, "learning_rate": 0.00010033067496595994, "loss": 4.2176, "step": 5130 }, { "epoch": 1.4963908129784906, "grad_norm": 1.9127072095870972, "learning_rate": 0.00010031122349737405, "loss": 4.3519, "step": 5131 }, { "epoch": 1.4966824644549763, "grad_norm": 2.6964192390441895, "learning_rate": 0.0001002917720287882, "loss": 3.8521, "step": 5132 }, { "epoch": 1.496974115931462, "grad_norm": 1.949597954750061, "learning_rate": 0.0001002723205602023, "loss": 3.8655, "step": 5133 }, { "epoch": 1.4972657674079475, "grad_norm": 2.422351360321045, "learning_rate": 0.00010025286909161642, "loss": 4.1591, "step": 5134 }, { "epoch": 1.4975574188844332, "grad_norm": 2.0310142040252686, "learning_rate": 0.00010023341762303053, "loss": 4.193, "step": 5135 }, { "epoch": 1.4978490703609186, "grad_norm": 2.0547521114349365, "learning_rate": 0.00010021396615444467, "loss": 4.0084, "step": 5136 }, { "epoch": 1.4981407218374043, "grad_norm": 3.8711090087890625, "learning_rate": 0.00010019451468585879, "loss": 4.0805, "step": 5137 }, { "epoch": 1.4984323733138898, "grad_norm": 1.9732773303985596, "learning_rate": 0.0001001750632172729, "loss": 4.0082, "step": 5138 }, { "epoch": 1.4987240247903755, "grad_norm": 2.0087101459503174, "learning_rate": 0.00010015561174868705, "loss": 4.1488, "step": 5139 }, { "epoch": 1.4990156762668612, "grad_norm": 3.355860948562622, "learning_rate": 0.00010013616028010115, "loss": 4.0793, "step": 5140 }, { "epoch": 1.4993073277433466, "grad_norm": 2.3401856422424316, "learning_rate": 0.00010011670881151528, "loss": 4.1717, "step": 5141 }, { "epoch": 1.4995989792198323, "grad_norm": 1.875048041343689, "learning_rate": 0.00010009725734292941, "loss": 3.8626, "step": 5142 }, { "epoch": 1.499890630696318, "grad_norm": 3.0151114463806152, "learning_rate": 0.00010007780587434352, "loss": 4.0221, "step": 5143 }, { "epoch": 1.5001822821728035, "grad_norm": 2.032310962677002, "learning_rate": 0.00010005835440575764, "loss": 4.3051, "step": 5144 }, { "epoch": 1.500473933649289, "grad_norm": 2.257793426513672, "learning_rate": 0.00010003890293717175, "loss": 3.9299, "step": 5145 }, { "epoch": 1.5007655851257748, "grad_norm": 2.3494462966918945, "learning_rate": 0.00010001945146858588, "loss": 4.06, "step": 5146 }, { "epoch": 1.5010572366022603, "grad_norm": 2.2556254863739014, "learning_rate": 0.0001, "loss": 4.0966, "step": 5147 }, { "epoch": 1.5013488880787458, "grad_norm": 2.483135223388672, "learning_rate": 9.998054853141413e-05, "loss": 4.322, "step": 5148 }, { "epoch": 1.5016405395552315, "grad_norm": 2.3229057788848877, "learning_rate": 9.996109706282825e-05, "loss": 3.9106, "step": 5149 }, { "epoch": 1.5019321910317172, "grad_norm": 2.054137945175171, "learning_rate": 9.994164559424237e-05, "loss": 3.6597, "step": 5150 }, { "epoch": 1.5022238425082026, "grad_norm": 2.3663876056671143, "learning_rate": 9.992219412565649e-05, "loss": 4.1569, "step": 5151 }, { "epoch": 1.5025154939846883, "grad_norm": 2.5812511444091797, "learning_rate": 9.990274265707061e-05, "loss": 4.0132, "step": 5152 }, { "epoch": 1.502807145461174, "grad_norm": 2.5467429161071777, "learning_rate": 9.988329118848473e-05, "loss": 3.9566, "step": 5153 }, { "epoch": 1.5030987969376595, "grad_norm": 1.9609078168869019, "learning_rate": 9.986383971989886e-05, "loss": 4.0922, "step": 5154 }, { "epoch": 1.503390448414145, "grad_norm": 2.3108036518096924, "learning_rate": 9.984438825131298e-05, "loss": 4.2622, "step": 5155 }, { "epoch": 1.5036820998906308, "grad_norm": 2.5904440879821777, "learning_rate": 9.98249367827271e-05, "loss": 4.2642, "step": 5156 }, { "epoch": 1.5039737513671163, "grad_norm": 2.609337329864502, "learning_rate": 9.980548531414122e-05, "loss": 4.2076, "step": 5157 }, { "epoch": 1.5042654028436018, "grad_norm": 2.139195203781128, "learning_rate": 9.978603384555534e-05, "loss": 4.3842, "step": 5158 }, { "epoch": 1.5045570543200875, "grad_norm": 2.0163841247558594, "learning_rate": 9.976658237696946e-05, "loss": 4.1236, "step": 5159 }, { "epoch": 1.5048487057965731, "grad_norm": 1.8723403215408325, "learning_rate": 9.974713090838359e-05, "loss": 3.8579, "step": 5160 }, { "epoch": 1.5051403572730586, "grad_norm": 2.1067802906036377, "learning_rate": 9.972767943979771e-05, "loss": 4.3349, "step": 5161 }, { "epoch": 1.5054320087495443, "grad_norm": 2.2946155071258545, "learning_rate": 9.970822797121183e-05, "loss": 4.3241, "step": 5162 }, { "epoch": 1.50572366022603, "grad_norm": 2.383403778076172, "learning_rate": 9.968877650262596e-05, "loss": 4.0628, "step": 5163 }, { "epoch": 1.5060153117025155, "grad_norm": 1.9816968441009521, "learning_rate": 9.966932503404007e-05, "loss": 4.0213, "step": 5164 }, { "epoch": 1.5063069631790011, "grad_norm": 4.014925003051758, "learning_rate": 9.96498735654542e-05, "loss": 3.5034, "step": 5165 }, { "epoch": 1.5065986146554868, "grad_norm": 2.6830227375030518, "learning_rate": 9.963042209686832e-05, "loss": 4.127, "step": 5166 }, { "epoch": 1.5068902661319723, "grad_norm": 5.233097553253174, "learning_rate": 9.961097062828244e-05, "loss": 4.3588, "step": 5167 }, { "epoch": 1.5071819176084578, "grad_norm": 2.0473954677581787, "learning_rate": 9.959151915969657e-05, "loss": 4.0183, "step": 5168 }, { "epoch": 1.5074735690849435, "grad_norm": 2.166318655014038, "learning_rate": 9.957206769111068e-05, "loss": 3.8389, "step": 5169 }, { "epoch": 1.5077652205614291, "grad_norm": 2.479034900665283, "learning_rate": 9.955261622252482e-05, "loss": 3.9449, "step": 5170 }, { "epoch": 1.5080568720379146, "grad_norm": 2.019718885421753, "learning_rate": 9.953316475393892e-05, "loss": 4.1706, "step": 5171 }, { "epoch": 1.5083485235144003, "grad_norm": 2.062828540802002, "learning_rate": 9.951371328535304e-05, "loss": 4.0961, "step": 5172 }, { "epoch": 1.508640174990886, "grad_norm": 1.822617769241333, "learning_rate": 9.949426181676718e-05, "loss": 4.1148, "step": 5173 }, { "epoch": 1.5089318264673715, "grad_norm": 2.59725022315979, "learning_rate": 9.947481034818129e-05, "loss": 4.1916, "step": 5174 }, { "epoch": 1.5092234779438571, "grad_norm": 2.6550960540771484, "learning_rate": 9.945535887959542e-05, "loss": 4.0469, "step": 5175 }, { "epoch": 1.5095151294203428, "grad_norm": 3.9756808280944824, "learning_rate": 9.943590741100953e-05, "loss": 3.9415, "step": 5176 }, { "epoch": 1.5098067808968283, "grad_norm": 2.6130316257476807, "learning_rate": 9.941645594242365e-05, "loss": 4.2228, "step": 5177 }, { "epoch": 1.5100984323733138, "grad_norm": 2.444380044937134, "learning_rate": 9.939700447383779e-05, "loss": 3.779, "step": 5178 }, { "epoch": 1.5103900838497994, "grad_norm": 2.033379554748535, "learning_rate": 9.93775530052519e-05, "loss": 3.8053, "step": 5179 }, { "epoch": 1.5106817353262851, "grad_norm": 1.7094979286193848, "learning_rate": 9.935810153666603e-05, "loss": 4.213, "step": 5180 }, { "epoch": 1.5109733868027706, "grad_norm": 2.3064587116241455, "learning_rate": 9.933865006808014e-05, "loss": 4.1785, "step": 5181 }, { "epoch": 1.5112650382792563, "grad_norm": 2.2197017669677734, "learning_rate": 9.931919859949427e-05, "loss": 4.0531, "step": 5182 }, { "epoch": 1.511556689755742, "grad_norm": 1.954352855682373, "learning_rate": 9.92997471309084e-05, "loss": 3.9932, "step": 5183 }, { "epoch": 1.5118483412322274, "grad_norm": 2.3014369010925293, "learning_rate": 9.92802956623225e-05, "loss": 4.0264, "step": 5184 }, { "epoch": 1.5121399927087131, "grad_norm": 2.2619667053222656, "learning_rate": 9.926084419373664e-05, "loss": 4.3081, "step": 5185 }, { "epoch": 1.5124316441851988, "grad_norm": 2.6588714122772217, "learning_rate": 9.924139272515075e-05, "loss": 3.9309, "step": 5186 }, { "epoch": 1.5127232956616843, "grad_norm": 2.876641273498535, "learning_rate": 9.922194125656488e-05, "loss": 4.1347, "step": 5187 }, { "epoch": 1.5130149471381698, "grad_norm": 2.157109260559082, "learning_rate": 9.920248978797899e-05, "loss": 3.8351, "step": 5188 }, { "epoch": 1.5133065986146554, "grad_norm": 2.5943145751953125, "learning_rate": 9.918303831939311e-05, "loss": 4.1851, "step": 5189 }, { "epoch": 1.5135982500911411, "grad_norm": 2.0908854007720947, "learning_rate": 9.916358685080725e-05, "loss": 3.9586, "step": 5190 }, { "epoch": 1.5138899015676266, "grad_norm": 2.546391487121582, "learning_rate": 9.914413538222135e-05, "loss": 4.1497, "step": 5191 }, { "epoch": 1.5141815530441123, "grad_norm": 2.6562905311584473, "learning_rate": 9.912468391363549e-05, "loss": 4.0868, "step": 5192 }, { "epoch": 1.514473204520598, "grad_norm": 2.5455563068389893, "learning_rate": 9.91052324450496e-05, "loss": 4.2536, "step": 5193 }, { "epoch": 1.5147648559970834, "grad_norm": 2.1267199516296387, "learning_rate": 9.908578097646373e-05, "loss": 3.7408, "step": 5194 }, { "epoch": 1.5150565074735691, "grad_norm": 2.4212539196014404, "learning_rate": 9.906632950787785e-05, "loss": 3.8467, "step": 5195 }, { "epoch": 1.5153481589500548, "grad_norm": 2.7267518043518066, "learning_rate": 9.904687803929196e-05, "loss": 3.943, "step": 5196 }, { "epoch": 1.5156398104265403, "grad_norm": 2.5045862197875977, "learning_rate": 9.90274265707061e-05, "loss": 4.2122, "step": 5197 }, { "epoch": 1.5159314619030257, "grad_norm": 3.0880393981933594, "learning_rate": 9.90079751021202e-05, "loss": 4.0996, "step": 5198 }, { "epoch": 1.5162231133795114, "grad_norm": 2.133605718612671, "learning_rate": 9.898852363353434e-05, "loss": 4.0002, "step": 5199 }, { "epoch": 1.5165147648559971, "grad_norm": 1.9050475358963013, "learning_rate": 9.896907216494846e-05, "loss": 3.9492, "step": 5200 }, { "epoch": 1.5168064163324826, "grad_norm": 2.1414668560028076, "learning_rate": 9.894962069636257e-05, "loss": 4.2359, "step": 5201 }, { "epoch": 1.5170980678089683, "grad_norm": 2.464442491531372, "learning_rate": 9.89301692277767e-05, "loss": 4.1862, "step": 5202 }, { "epoch": 1.517389719285454, "grad_norm": 3.06827974319458, "learning_rate": 9.891071775919081e-05, "loss": 4.0173, "step": 5203 }, { "epoch": 1.5176813707619394, "grad_norm": 2.162022590637207, "learning_rate": 9.889126629060495e-05, "loss": 3.9281, "step": 5204 }, { "epoch": 1.5179730222384251, "grad_norm": 2.6819636821746826, "learning_rate": 9.887181482201907e-05, "loss": 3.9604, "step": 5205 }, { "epoch": 1.5182646737149108, "grad_norm": 2.1173059940338135, "learning_rate": 9.885236335343319e-05, "loss": 4.1612, "step": 5206 }, { "epoch": 1.5185563251913963, "grad_norm": 2.5518746376037598, "learning_rate": 9.883291188484731e-05, "loss": 4.3147, "step": 5207 }, { "epoch": 1.5188479766678817, "grad_norm": 2.3608386516571045, "learning_rate": 9.881346041626142e-05, "loss": 4.3477, "step": 5208 }, { "epoch": 1.5191396281443674, "grad_norm": 2.1340785026550293, "learning_rate": 9.879400894767556e-05, "loss": 3.9671, "step": 5209 }, { "epoch": 1.5194312796208531, "grad_norm": 2.678535223007202, "learning_rate": 9.877455747908968e-05, "loss": 4.3063, "step": 5210 }, { "epoch": 1.5197229310973386, "grad_norm": 2.0838546752929688, "learning_rate": 9.87551060105038e-05, "loss": 3.9671, "step": 5211 }, { "epoch": 1.5200145825738243, "grad_norm": 1.9226014614105225, "learning_rate": 9.873565454191792e-05, "loss": 4.3955, "step": 5212 }, { "epoch": 1.52030623405031, "grad_norm": 2.227379083633423, "learning_rate": 9.871620307333203e-05, "loss": 3.5783, "step": 5213 }, { "epoch": 1.5205978855267954, "grad_norm": 3.658142566680908, "learning_rate": 9.869675160474617e-05, "loss": 4.0371, "step": 5214 }, { "epoch": 1.5208895370032811, "grad_norm": 3.0104455947875977, "learning_rate": 9.867730013616029e-05, "loss": 3.9603, "step": 5215 }, { "epoch": 1.5211811884797668, "grad_norm": 1.9639732837677002, "learning_rate": 9.865784866757441e-05, "loss": 3.5766, "step": 5216 }, { "epoch": 1.5214728399562523, "grad_norm": 2.5147457122802734, "learning_rate": 9.863839719898853e-05, "loss": 4.196, "step": 5217 }, { "epoch": 1.5217644914327377, "grad_norm": 1.9396213293075562, "learning_rate": 9.861894573040265e-05, "loss": 4.1224, "step": 5218 }, { "epoch": 1.5220561429092236, "grad_norm": 2.0918872356414795, "learning_rate": 9.859949426181677e-05, "loss": 3.9247, "step": 5219 }, { "epoch": 1.522347794385709, "grad_norm": 2.7648608684539795, "learning_rate": 9.85800427932309e-05, "loss": 4.0714, "step": 5220 }, { "epoch": 1.5226394458621946, "grad_norm": 3.2734627723693848, "learning_rate": 9.856059132464502e-05, "loss": 3.8372, "step": 5221 }, { "epoch": 1.5229310973386803, "grad_norm": 3.1919734477996826, "learning_rate": 9.854113985605914e-05, "loss": 4.0848, "step": 5222 }, { "epoch": 1.523222748815166, "grad_norm": 2.247864007949829, "learning_rate": 9.852168838747326e-05, "loss": 3.8672, "step": 5223 }, { "epoch": 1.5235144002916514, "grad_norm": 2.0486490726470947, "learning_rate": 9.850223691888738e-05, "loss": 4.3409, "step": 5224 }, { "epoch": 1.523806051768137, "grad_norm": 1.9752646684646606, "learning_rate": 9.84827854503015e-05, "loss": 4.3219, "step": 5225 }, { "epoch": 1.5240977032446228, "grad_norm": 2.183475971221924, "learning_rate": 9.846333398171562e-05, "loss": 3.8303, "step": 5226 }, { "epoch": 1.5243893547211083, "grad_norm": 2.645031213760376, "learning_rate": 9.844388251312975e-05, "loss": 4.1113, "step": 5227 }, { "epoch": 1.5246810061975937, "grad_norm": 1.8479207754135132, "learning_rate": 9.842443104454387e-05, "loss": 3.6818, "step": 5228 }, { "epoch": 1.5249726576740796, "grad_norm": 2.12823486328125, "learning_rate": 9.840497957595799e-05, "loss": 4.0348, "step": 5229 }, { "epoch": 1.525264309150565, "grad_norm": 2.364452838897705, "learning_rate": 9.838552810737211e-05, "loss": 3.9116, "step": 5230 }, { "epoch": 1.5255559606270506, "grad_norm": 3.1530344486236572, "learning_rate": 9.836607663878623e-05, "loss": 3.8329, "step": 5231 }, { "epoch": 1.5258476121035363, "grad_norm": 2.0052387714385986, "learning_rate": 9.834662517020035e-05, "loss": 3.9968, "step": 5232 }, { "epoch": 1.526139263580022, "grad_norm": 1.908132791519165, "learning_rate": 9.832717370161448e-05, "loss": 4.0871, "step": 5233 }, { "epoch": 1.5264309150565074, "grad_norm": 1.9400320053100586, "learning_rate": 9.83077222330286e-05, "loss": 3.9342, "step": 5234 }, { "epoch": 1.526722566532993, "grad_norm": 2.15242075920105, "learning_rate": 9.828827076444272e-05, "loss": 4.1886, "step": 5235 }, { "epoch": 1.5270142180094788, "grad_norm": 2.2049968242645264, "learning_rate": 9.826881929585684e-05, "loss": 4.0126, "step": 5236 }, { "epoch": 1.5273058694859643, "grad_norm": 2.6001391410827637, "learning_rate": 9.824936782727096e-05, "loss": 4.037, "step": 5237 }, { "epoch": 1.5275975209624497, "grad_norm": 2.4524433612823486, "learning_rate": 9.822991635868508e-05, "loss": 4.2069, "step": 5238 }, { "epoch": 1.5278891724389356, "grad_norm": 2.2924344539642334, "learning_rate": 9.82104648900992e-05, "loss": 4.2305, "step": 5239 }, { "epoch": 1.528180823915421, "grad_norm": 2.1533308029174805, "learning_rate": 9.819101342151334e-05, "loss": 3.6923, "step": 5240 }, { "epoch": 1.5284724753919066, "grad_norm": 2.129758834838867, "learning_rate": 9.817156195292745e-05, "loss": 4.3391, "step": 5241 }, { "epoch": 1.5287641268683922, "grad_norm": 2.0550174713134766, "learning_rate": 9.815211048434157e-05, "loss": 4.0266, "step": 5242 }, { "epoch": 1.529055778344878, "grad_norm": 2.0789663791656494, "learning_rate": 9.813265901575569e-05, "loss": 4.081, "step": 5243 }, { "epoch": 1.5293474298213634, "grad_norm": 3.158036947250366, "learning_rate": 9.811320754716981e-05, "loss": 4.1926, "step": 5244 }, { "epoch": 1.529639081297849, "grad_norm": 4.5127997398376465, "learning_rate": 9.809375607858395e-05, "loss": 4.2903, "step": 5245 }, { "epoch": 1.5299307327743348, "grad_norm": 3.125081777572632, "learning_rate": 9.807430460999806e-05, "loss": 4.0204, "step": 5246 }, { "epoch": 1.5302223842508202, "grad_norm": 2.247366428375244, "learning_rate": 9.805485314141218e-05, "loss": 3.997, "step": 5247 }, { "epoch": 1.5305140357273057, "grad_norm": 2.0655486583709717, "learning_rate": 9.80354016728263e-05, "loss": 3.8962, "step": 5248 }, { "epoch": 1.5308056872037916, "grad_norm": 2.42020583152771, "learning_rate": 9.801595020424042e-05, "loss": 4.2132, "step": 5249 }, { "epoch": 1.531097338680277, "grad_norm": 2.287687063217163, "learning_rate": 9.799649873565456e-05, "loss": 4.4355, "step": 5250 }, { "epoch": 1.5313889901567626, "grad_norm": 2.3990283012390137, "learning_rate": 9.797704726706866e-05, "loss": 3.849, "step": 5251 }, { "epoch": 1.5316806416332482, "grad_norm": 1.9587877988815308, "learning_rate": 9.79575957984828e-05, "loss": 4.1802, "step": 5252 }, { "epoch": 1.531972293109734, "grad_norm": 2.3752074241638184, "learning_rate": 9.793814432989691e-05, "loss": 4.0467, "step": 5253 }, { "epoch": 1.5322639445862194, "grad_norm": 2.3758556842803955, "learning_rate": 9.791869286131103e-05, "loss": 4.4186, "step": 5254 }, { "epoch": 1.532555596062705, "grad_norm": 1.9535070657730103, "learning_rate": 9.789924139272516e-05, "loss": 3.7582, "step": 5255 }, { "epoch": 1.5328472475391908, "grad_norm": 2.544140577316284, "learning_rate": 9.787978992413927e-05, "loss": 3.9837, "step": 5256 }, { "epoch": 1.5331388990156762, "grad_norm": 1.915948748588562, "learning_rate": 9.786033845555341e-05, "loss": 4.0126, "step": 5257 }, { "epoch": 1.5334305504921617, "grad_norm": 2.400775194168091, "learning_rate": 9.784088698696752e-05, "loss": 3.9518, "step": 5258 }, { "epoch": 1.5337222019686476, "grad_norm": 2.181217670440674, "learning_rate": 9.782143551838164e-05, "loss": 4.1001, "step": 5259 }, { "epoch": 1.534013853445133, "grad_norm": 1.9104729890823364, "learning_rate": 9.780198404979577e-05, "loss": 4.0886, "step": 5260 }, { "epoch": 1.5343055049216185, "grad_norm": 1.8716667890548706, "learning_rate": 9.778253258120988e-05, "loss": 3.8661, "step": 5261 }, { "epoch": 1.5345971563981042, "grad_norm": 3.1816980838775635, "learning_rate": 9.776308111262402e-05, "loss": 4.1895, "step": 5262 }, { "epoch": 1.53488880787459, "grad_norm": 2.427536964416504, "learning_rate": 9.774362964403812e-05, "loss": 4.054, "step": 5263 }, { "epoch": 1.5351804593510754, "grad_norm": 1.9783076047897339, "learning_rate": 9.772417817545226e-05, "loss": 4.2928, "step": 5264 }, { "epoch": 1.535472110827561, "grad_norm": 2.388223648071289, "learning_rate": 9.770472670686638e-05, "loss": 4.3288, "step": 5265 }, { "epoch": 1.5357637623040468, "grad_norm": 2.0964787006378174, "learning_rate": 9.768527523828049e-05, "loss": 3.8665, "step": 5266 }, { "epoch": 1.5360554137805322, "grad_norm": 2.107144355773926, "learning_rate": 9.766582376969462e-05, "loss": 4.2147, "step": 5267 }, { "epoch": 1.536347065257018, "grad_norm": 2.1124842166900635, "learning_rate": 9.764637230110873e-05, "loss": 4.3296, "step": 5268 }, { "epoch": 1.5366387167335036, "grad_norm": 1.8778754472732544, "learning_rate": 9.762692083252287e-05, "loss": 3.8849, "step": 5269 }, { "epoch": 1.536930368209989, "grad_norm": 2.370663642883301, "learning_rate": 9.760746936393697e-05, "loss": 4.0553, "step": 5270 }, { "epoch": 1.5372220196864745, "grad_norm": 2.061927556991577, "learning_rate": 9.75880178953511e-05, "loss": 3.8921, "step": 5271 }, { "epoch": 1.5375136711629602, "grad_norm": 2.216684579849243, "learning_rate": 9.756856642676523e-05, "loss": 4.0654, "step": 5272 }, { "epoch": 1.537805322639446, "grad_norm": 2.723867416381836, "learning_rate": 9.754911495817934e-05, "loss": 3.5837, "step": 5273 }, { "epoch": 1.5380969741159314, "grad_norm": 2.17452073097229, "learning_rate": 9.752966348959347e-05, "loss": 4.0093, "step": 5274 }, { "epoch": 1.538388625592417, "grad_norm": 2.4192683696746826, "learning_rate": 9.751021202100758e-05, "loss": 4.2656, "step": 5275 }, { "epoch": 1.5386802770689028, "grad_norm": 2.4012110233306885, "learning_rate": 9.749076055242172e-05, "loss": 4.1076, "step": 5276 }, { "epoch": 1.5389719285453882, "grad_norm": 1.8867398500442505, "learning_rate": 9.747130908383584e-05, "loss": 3.7662, "step": 5277 }, { "epoch": 1.539263580021874, "grad_norm": 2.4963114261627197, "learning_rate": 9.745185761524995e-05, "loss": 4.3065, "step": 5278 }, { "epoch": 1.5395552314983596, "grad_norm": 2.275452136993408, "learning_rate": 9.743240614666408e-05, "loss": 4.1597, "step": 5279 }, { "epoch": 1.539846882974845, "grad_norm": 2.507004499435425, "learning_rate": 9.741295467807819e-05, "loss": 4.1104, "step": 5280 }, { "epoch": 1.5401385344513305, "grad_norm": 3.0632665157318115, "learning_rate": 9.739350320949233e-05, "loss": 4.1744, "step": 5281 }, { "epoch": 1.5404301859278162, "grad_norm": 2.4337308406829834, "learning_rate": 9.737405174090645e-05, "loss": 3.9335, "step": 5282 }, { "epoch": 1.540721837404302, "grad_norm": 2.2414793968200684, "learning_rate": 9.735460027232057e-05, "loss": 3.9761, "step": 5283 }, { "epoch": 1.5410134888807874, "grad_norm": 2.072219133377075, "learning_rate": 9.733514880373469e-05, "loss": 4.0423, "step": 5284 }, { "epoch": 1.541305140357273, "grad_norm": 2.034970283508301, "learning_rate": 9.73156973351488e-05, "loss": 4.1054, "step": 5285 }, { "epoch": 1.5415967918337588, "grad_norm": 2.419018030166626, "learning_rate": 9.729624586656293e-05, "loss": 4.2891, "step": 5286 }, { "epoch": 1.5418884433102442, "grad_norm": 2.496490955352783, "learning_rate": 9.727679439797706e-05, "loss": 4.245, "step": 5287 }, { "epoch": 1.54218009478673, "grad_norm": 2.4409019947052, "learning_rate": 9.725734292939118e-05, "loss": 4.0216, "step": 5288 }, { "epoch": 1.5424717462632156, "grad_norm": 1.9064809083938599, "learning_rate": 9.72378914608053e-05, "loss": 4.1283, "step": 5289 }, { "epoch": 1.542763397739701, "grad_norm": 2.029512405395508, "learning_rate": 9.72184399922194e-05, "loss": 3.8937, "step": 5290 }, { "epoch": 1.5430550492161865, "grad_norm": 3.326136589050293, "learning_rate": 9.719898852363354e-05, "loss": 4.1152, "step": 5291 }, { "epoch": 1.5433467006926722, "grad_norm": 2.610039234161377, "learning_rate": 9.717953705504766e-05, "loss": 4.2073, "step": 5292 }, { "epoch": 1.543638352169158, "grad_norm": 2.147651433944702, "learning_rate": 9.716008558646178e-05, "loss": 4.2363, "step": 5293 }, { "epoch": 1.5439300036456434, "grad_norm": 1.8293275833129883, "learning_rate": 9.71406341178759e-05, "loss": 4.077, "step": 5294 }, { "epoch": 1.544221655122129, "grad_norm": 2.4355275630950928, "learning_rate": 9.712118264929003e-05, "loss": 3.9546, "step": 5295 }, { "epoch": 1.5445133065986147, "grad_norm": 2.911620616912842, "learning_rate": 9.710173118070415e-05, "loss": 4.0961, "step": 5296 }, { "epoch": 1.5448049580751002, "grad_norm": 2.3349111080169678, "learning_rate": 9.708227971211827e-05, "loss": 4.0761, "step": 5297 }, { "epoch": 1.545096609551586, "grad_norm": 1.900137186050415, "learning_rate": 9.706282824353239e-05, "loss": 4.1099, "step": 5298 }, { "epoch": 1.5453882610280716, "grad_norm": 2.0756993293762207, "learning_rate": 9.704337677494651e-05, "loss": 3.9796, "step": 5299 }, { "epoch": 1.545679912504557, "grad_norm": 2.0280563831329346, "learning_rate": 9.702392530636064e-05, "loss": 3.9478, "step": 5300 }, { "epoch": 1.5459715639810425, "grad_norm": 2.5042128562927246, "learning_rate": 9.700447383777476e-05, "loss": 4.2305, "step": 5301 }, { "epoch": 1.5462632154575282, "grad_norm": 2.583005428314209, "learning_rate": 9.698502236918888e-05, "loss": 4.1968, "step": 5302 }, { "epoch": 1.546554866934014, "grad_norm": 3.3127667903900146, "learning_rate": 9.6965570900603e-05, "loss": 4.1157, "step": 5303 }, { "epoch": 1.5468465184104994, "grad_norm": 2.4369609355926514, "learning_rate": 9.694611943201712e-05, "loss": 3.8033, "step": 5304 }, { "epoch": 1.547138169886985, "grad_norm": 2.6438658237457275, "learning_rate": 9.692666796343124e-05, "loss": 4.2863, "step": 5305 }, { "epoch": 1.5474298213634707, "grad_norm": 2.816558837890625, "learning_rate": 9.690721649484537e-05, "loss": 3.9536, "step": 5306 }, { "epoch": 1.5477214728399562, "grad_norm": 2.249560832977295, "learning_rate": 9.688776502625949e-05, "loss": 4.1985, "step": 5307 }, { "epoch": 1.548013124316442, "grad_norm": 2.4689583778381348, "learning_rate": 9.686831355767361e-05, "loss": 3.9745, "step": 5308 }, { "epoch": 1.5483047757929276, "grad_norm": 2.148805618286133, "learning_rate": 9.684886208908773e-05, "loss": 3.881, "step": 5309 }, { "epoch": 1.548596427269413, "grad_norm": 2.1000754833221436, "learning_rate": 9.682941062050185e-05, "loss": 3.8057, "step": 5310 }, { "epoch": 1.5488880787458985, "grad_norm": 1.9353095293045044, "learning_rate": 9.680995915191597e-05, "loss": 4.2731, "step": 5311 }, { "epoch": 1.5491797302223842, "grad_norm": 2.2304699420928955, "learning_rate": 9.67905076833301e-05, "loss": 4.104, "step": 5312 }, { "epoch": 1.5494713816988699, "grad_norm": 3.0134692192077637, "learning_rate": 9.677105621474422e-05, "loss": 4.0335, "step": 5313 }, { "epoch": 1.5497630331753554, "grad_norm": 2.279278039932251, "learning_rate": 9.675160474615834e-05, "loss": 4.0266, "step": 5314 }, { "epoch": 1.550054684651841, "grad_norm": 3.353891611099243, "learning_rate": 9.673215327757246e-05, "loss": 4.1873, "step": 5315 }, { "epoch": 1.5503463361283267, "grad_norm": 2.1529152393341064, "learning_rate": 9.671270180898658e-05, "loss": 3.9817, "step": 5316 }, { "epoch": 1.5506379876048122, "grad_norm": 2.9887049198150635, "learning_rate": 9.66932503404007e-05, "loss": 3.8427, "step": 5317 }, { "epoch": 1.5509296390812979, "grad_norm": 2.130126714706421, "learning_rate": 9.667379887181482e-05, "loss": 4.0024, "step": 5318 }, { "epoch": 1.5512212905577836, "grad_norm": 2.7128419876098633, "learning_rate": 9.665434740322895e-05, "loss": 4.3098, "step": 5319 }, { "epoch": 1.551512942034269, "grad_norm": 1.9902691841125488, "learning_rate": 9.663489593464307e-05, "loss": 4.1049, "step": 5320 }, { "epoch": 1.5518045935107545, "grad_norm": 2.047435998916626, "learning_rate": 9.661544446605719e-05, "loss": 4.2727, "step": 5321 }, { "epoch": 1.5520962449872404, "grad_norm": 2.2346270084381104, "learning_rate": 9.659599299747132e-05, "loss": 4.2317, "step": 5322 }, { "epoch": 1.5523878964637259, "grad_norm": 2.3975982666015625, "learning_rate": 9.657654152888543e-05, "loss": 4.3241, "step": 5323 }, { "epoch": 1.5526795479402113, "grad_norm": 2.1885643005371094, "learning_rate": 9.655709006029955e-05, "loss": 4.1101, "step": 5324 }, { "epoch": 1.552971199416697, "grad_norm": 2.3359742164611816, "learning_rate": 9.653763859171368e-05, "loss": 3.8354, "step": 5325 }, { "epoch": 1.5532628508931827, "grad_norm": 2.143730640411377, "learning_rate": 9.65181871231278e-05, "loss": 4.2466, "step": 5326 }, { "epoch": 1.5535545023696682, "grad_norm": 2.6062893867492676, "learning_rate": 9.649873565454193e-05, "loss": 4.1356, "step": 5327 }, { "epoch": 1.5538461538461539, "grad_norm": 2.2509729862213135, "learning_rate": 9.647928418595604e-05, "loss": 3.9593, "step": 5328 }, { "epoch": 1.5541378053226396, "grad_norm": 1.830323338508606, "learning_rate": 9.645983271737018e-05, "loss": 3.8432, "step": 5329 }, { "epoch": 1.554429456799125, "grad_norm": 2.2513840198516846, "learning_rate": 9.644038124878428e-05, "loss": 3.9632, "step": 5330 }, { "epoch": 1.5547211082756105, "grad_norm": 2.468806743621826, "learning_rate": 9.64209297801984e-05, "loss": 4.3621, "step": 5331 }, { "epoch": 1.5550127597520964, "grad_norm": 2.173149585723877, "learning_rate": 9.640147831161254e-05, "loss": 4.2716, "step": 5332 }, { "epoch": 1.5553044112285819, "grad_norm": 2.6863884925842285, "learning_rate": 9.638202684302665e-05, "loss": 4.0971, "step": 5333 }, { "epoch": 1.5555960627050673, "grad_norm": 2.3549201488494873, "learning_rate": 9.636257537444078e-05, "loss": 4.0948, "step": 5334 }, { "epoch": 1.555887714181553, "grad_norm": 2.222728729248047, "learning_rate": 9.634312390585489e-05, "loss": 4.0478, "step": 5335 }, { "epoch": 1.5561793656580387, "grad_norm": 2.7219507694244385, "learning_rate": 9.632367243726901e-05, "loss": 4.0864, "step": 5336 }, { "epoch": 1.5564710171345242, "grad_norm": 2.1417438983917236, "learning_rate": 9.630422096868315e-05, "loss": 3.9307, "step": 5337 }, { "epoch": 1.5567626686110099, "grad_norm": 2.3860015869140625, "learning_rate": 9.628476950009726e-05, "loss": 4.19, "step": 5338 }, { "epoch": 1.5570543200874956, "grad_norm": 2.4099762439727783, "learning_rate": 9.626531803151139e-05, "loss": 4.0028, "step": 5339 }, { "epoch": 1.557345971563981, "grad_norm": 2.2030727863311768, "learning_rate": 9.62458665629255e-05, "loss": 3.6423, "step": 5340 }, { "epoch": 1.5576376230404665, "grad_norm": 2.2528579235076904, "learning_rate": 9.622641509433963e-05, "loss": 4.2151, "step": 5341 }, { "epoch": 1.5579292745169524, "grad_norm": 3.401083469390869, "learning_rate": 9.620696362575376e-05, "loss": 3.9702, "step": 5342 }, { "epoch": 1.5582209259934379, "grad_norm": 2.195571184158325, "learning_rate": 9.618751215716786e-05, "loss": 4.041, "step": 5343 }, { "epoch": 1.5585125774699233, "grad_norm": 2.8752973079681396, "learning_rate": 9.6168060688582e-05, "loss": 4.1334, "step": 5344 }, { "epoch": 1.558804228946409, "grad_norm": 2.200632333755493, "learning_rate": 9.614860921999611e-05, "loss": 4.1984, "step": 5345 }, { "epoch": 1.5590958804228947, "grad_norm": 2.1349124908447266, "learning_rate": 9.612915775141024e-05, "loss": 4.2568, "step": 5346 }, { "epoch": 1.5593875318993802, "grad_norm": 2.499030351638794, "learning_rate": 9.610970628282436e-05, "loss": 3.6856, "step": 5347 }, { "epoch": 1.5596791833758659, "grad_norm": 1.9815558195114136, "learning_rate": 9.609025481423847e-05, "loss": 4.1009, "step": 5348 }, { "epoch": 1.5599708348523516, "grad_norm": 2.0877857208251953, "learning_rate": 9.607080334565261e-05, "loss": 4.3243, "step": 5349 }, { "epoch": 1.560262486328837, "grad_norm": 2.087242841720581, "learning_rate": 9.605135187706672e-05, "loss": 3.8531, "step": 5350 }, { "epoch": 1.5605541378053225, "grad_norm": 2.826127290725708, "learning_rate": 9.603190040848085e-05, "loss": 3.9344, "step": 5351 }, { "epoch": 1.5608457892818084, "grad_norm": 2.686180353164673, "learning_rate": 9.601244893989496e-05, "loss": 3.9788, "step": 5352 }, { "epoch": 1.5611374407582939, "grad_norm": 2.6490612030029297, "learning_rate": 9.59929974713091e-05, "loss": 4.2696, "step": 5353 }, { "epoch": 1.5614290922347793, "grad_norm": 2.901033639907837, "learning_rate": 9.597354600272322e-05, "loss": 4.0452, "step": 5354 }, { "epoch": 1.561720743711265, "grad_norm": 3.3567075729370117, "learning_rate": 9.595409453413732e-05, "loss": 3.8807, "step": 5355 }, { "epoch": 1.5620123951877507, "grad_norm": 2.3998405933380127, "learning_rate": 9.593464306555146e-05, "loss": 4.1602, "step": 5356 }, { "epoch": 1.5623040466642362, "grad_norm": 2.9674487113952637, "learning_rate": 9.591519159696557e-05, "loss": 4.2075, "step": 5357 }, { "epoch": 1.5625956981407219, "grad_norm": 2.98490047454834, "learning_rate": 9.58957401283797e-05, "loss": 3.921, "step": 5358 }, { "epoch": 1.5628873496172075, "grad_norm": 2.9575889110565186, "learning_rate": 9.587628865979382e-05, "loss": 4.2219, "step": 5359 }, { "epoch": 1.563179001093693, "grad_norm": 2.053194999694824, "learning_rate": 9.585683719120793e-05, "loss": 3.8741, "step": 5360 }, { "epoch": 1.5634706525701785, "grad_norm": 3.2550814151763916, "learning_rate": 9.583738572262207e-05, "loss": 3.8802, "step": 5361 }, { "epoch": 1.5637623040466644, "grad_norm": 2.457432270050049, "learning_rate": 9.581793425403617e-05, "loss": 4.308, "step": 5362 }, { "epoch": 1.5640539555231499, "grad_norm": 1.9955509901046753, "learning_rate": 9.579848278545031e-05, "loss": 4.0363, "step": 5363 }, { "epoch": 1.5643456069996353, "grad_norm": 2.1482558250427246, "learning_rate": 9.577903131686443e-05, "loss": 3.6738, "step": 5364 }, { "epoch": 1.564637258476121, "grad_norm": 2.270212411880493, "learning_rate": 9.575957984827855e-05, "loss": 4.0756, "step": 5365 }, { "epoch": 1.5649289099526067, "grad_norm": 1.8996878862380981, "learning_rate": 9.574012837969267e-05, "loss": 4.269, "step": 5366 }, { "epoch": 1.5652205614290922, "grad_norm": 2.212977886199951, "learning_rate": 9.572067691110678e-05, "loss": 4.0969, "step": 5367 }, { "epoch": 1.5655122129055778, "grad_norm": 2.2569713592529297, "learning_rate": 9.570122544252092e-05, "loss": 4.0412, "step": 5368 }, { "epoch": 1.5658038643820635, "grad_norm": 2.040757417678833, "learning_rate": 9.568177397393504e-05, "loss": 4.2061, "step": 5369 }, { "epoch": 1.566095515858549, "grad_norm": 2.0431368350982666, "learning_rate": 9.566232250534916e-05, "loss": 3.8244, "step": 5370 }, { "epoch": 1.5663871673350345, "grad_norm": 2.089156150817871, "learning_rate": 9.564287103676328e-05, "loss": 3.9688, "step": 5371 }, { "epoch": 1.5666788188115204, "grad_norm": 2.1369309425354004, "learning_rate": 9.562341956817739e-05, "loss": 4.1269, "step": 5372 }, { "epoch": 1.5669704702880058, "grad_norm": 2.167376756668091, "learning_rate": 9.560396809959153e-05, "loss": 4.0948, "step": 5373 }, { "epoch": 1.5672621217644913, "grad_norm": 2.0684752464294434, "learning_rate": 9.558451663100565e-05, "loss": 3.9346, "step": 5374 }, { "epoch": 1.567553773240977, "grad_norm": 2.568215847015381, "learning_rate": 9.556506516241977e-05, "loss": 4.1179, "step": 5375 }, { "epoch": 1.5678454247174627, "grad_norm": 2.700019598007202, "learning_rate": 9.554561369383389e-05, "loss": 4.0391, "step": 5376 }, { "epoch": 1.5681370761939482, "grad_norm": 2.1847174167633057, "learning_rate": 9.552616222524801e-05, "loss": 3.9466, "step": 5377 }, { "epoch": 1.5684287276704338, "grad_norm": 2.2003438472747803, "learning_rate": 9.550671075666213e-05, "loss": 3.8833, "step": 5378 }, { "epoch": 1.5687203791469195, "grad_norm": 3.108191728591919, "learning_rate": 9.548725928807626e-05, "loss": 4.2346, "step": 5379 }, { "epoch": 1.569012030623405, "grad_norm": 2.251655101776123, "learning_rate": 9.546780781949038e-05, "loss": 4.2911, "step": 5380 }, { "epoch": 1.5693036820998907, "grad_norm": 1.974539875984192, "learning_rate": 9.54483563509045e-05, "loss": 4.199, "step": 5381 }, { "epoch": 1.5695953335763764, "grad_norm": 2.392042636871338, "learning_rate": 9.542890488231862e-05, "loss": 3.9787, "step": 5382 }, { "epoch": 1.5698869850528618, "grad_norm": 2.3602864742279053, "learning_rate": 9.540945341373274e-05, "loss": 3.9718, "step": 5383 }, { "epoch": 1.5701786365293473, "grad_norm": 2.298562526702881, "learning_rate": 9.539000194514686e-05, "loss": 3.9965, "step": 5384 }, { "epoch": 1.570470288005833, "grad_norm": 2.186204671859741, "learning_rate": 9.537055047656098e-05, "loss": 3.9416, "step": 5385 }, { "epoch": 1.5707619394823187, "grad_norm": 2.3019051551818848, "learning_rate": 9.53510990079751e-05, "loss": 3.9992, "step": 5386 }, { "epoch": 1.5710535909588041, "grad_norm": 2.1650044918060303, "learning_rate": 9.533164753938923e-05, "loss": 4.0306, "step": 5387 }, { "epoch": 1.5713452424352898, "grad_norm": 2.040329694747925, "learning_rate": 9.531219607080335e-05, "loss": 3.9478, "step": 5388 }, { "epoch": 1.5716368939117755, "grad_norm": 2.404310464859009, "learning_rate": 9.529274460221747e-05, "loss": 3.8003, "step": 5389 }, { "epoch": 1.571928545388261, "grad_norm": 2.029453992843628, "learning_rate": 9.527329313363159e-05, "loss": 4.1058, "step": 5390 }, { "epoch": 1.5722201968647467, "grad_norm": 2.3557815551757812, "learning_rate": 9.525384166504571e-05, "loss": 4.213, "step": 5391 }, { "epoch": 1.5725118483412324, "grad_norm": 2.513867139816284, "learning_rate": 9.523439019645984e-05, "loss": 4.161, "step": 5392 }, { "epoch": 1.5728034998177178, "grad_norm": 2.0699453353881836, "learning_rate": 9.521493872787396e-05, "loss": 4.1305, "step": 5393 }, { "epoch": 1.5730951512942033, "grad_norm": 2.2347726821899414, "learning_rate": 9.519548725928808e-05, "loss": 4.0491, "step": 5394 }, { "epoch": 1.573386802770689, "grad_norm": 2.7903358936309814, "learning_rate": 9.51760357907022e-05, "loss": 3.8445, "step": 5395 }, { "epoch": 1.5736784542471747, "grad_norm": 2.385463237762451, "learning_rate": 9.515658432211632e-05, "loss": 4.2903, "step": 5396 }, { "epoch": 1.5739701057236601, "grad_norm": 2.5409934520721436, "learning_rate": 9.513713285353044e-05, "loss": 4.0778, "step": 5397 }, { "epoch": 1.5742617572001458, "grad_norm": 2.043140411376953, "learning_rate": 9.511768138494457e-05, "loss": 4.2744, "step": 5398 }, { "epoch": 1.5745534086766315, "grad_norm": 2.611567258834839, "learning_rate": 9.50982299163587e-05, "loss": 4.0969, "step": 5399 }, { "epoch": 1.574845060153117, "grad_norm": 2.02321720123291, "learning_rate": 9.507877844777281e-05, "loss": 4.2989, "step": 5400 }, { "epoch": 1.5751367116296027, "grad_norm": 2.1015701293945312, "learning_rate": 9.505932697918693e-05, "loss": 4.1006, "step": 5401 }, { "epoch": 1.5754283631060884, "grad_norm": 2.1442618370056152, "learning_rate": 9.503987551060105e-05, "loss": 3.9648, "step": 5402 }, { "epoch": 1.5757200145825738, "grad_norm": 2.21850848197937, "learning_rate": 9.502042404201517e-05, "loss": 4.0672, "step": 5403 }, { "epoch": 1.5760116660590593, "grad_norm": 2.348074197769165, "learning_rate": 9.500097257342931e-05, "loss": 4.3402, "step": 5404 }, { "epoch": 1.576303317535545, "grad_norm": 1.9548885822296143, "learning_rate": 9.498152110484342e-05, "loss": 3.9465, "step": 5405 }, { "epoch": 1.5765949690120307, "grad_norm": 2.057727098464966, "learning_rate": 9.496206963625754e-05, "loss": 3.9284, "step": 5406 }, { "epoch": 1.5768866204885161, "grad_norm": 2.4749631881713867, "learning_rate": 9.494261816767166e-05, "loss": 4.1899, "step": 5407 }, { "epoch": 1.5771782719650018, "grad_norm": 1.9945048093795776, "learning_rate": 9.492316669908578e-05, "loss": 4.2056, "step": 5408 }, { "epoch": 1.5774699234414875, "grad_norm": 2.2004852294921875, "learning_rate": 9.490371523049992e-05, "loss": 3.8704, "step": 5409 }, { "epoch": 1.577761574917973, "grad_norm": 2.446227550506592, "learning_rate": 9.488426376191402e-05, "loss": 4.1384, "step": 5410 }, { "epoch": 1.5780532263944587, "grad_norm": 2.293490171432495, "learning_rate": 9.486481229332816e-05, "loss": 4.0095, "step": 5411 }, { "epoch": 1.5783448778709444, "grad_norm": 1.8874162435531616, "learning_rate": 9.484536082474227e-05, "loss": 4.2136, "step": 5412 }, { "epoch": 1.5786365293474298, "grad_norm": 2.318190336227417, "learning_rate": 9.482590935615639e-05, "loss": 3.8778, "step": 5413 }, { "epoch": 1.5789281808239153, "grad_norm": 2.2222280502319336, "learning_rate": 9.480645788757052e-05, "loss": 4.0201, "step": 5414 }, { "epoch": 1.579219832300401, "grad_norm": 3.544241428375244, "learning_rate": 9.478700641898463e-05, "loss": 4.1095, "step": 5415 }, { "epoch": 1.5795114837768867, "grad_norm": 2.5576789379119873, "learning_rate": 9.476755495039877e-05, "loss": 4.1153, "step": 5416 }, { "epoch": 1.5798031352533721, "grad_norm": 1.9909435510635376, "learning_rate": 9.474810348181288e-05, "loss": 3.9966, "step": 5417 }, { "epoch": 1.5800947867298578, "grad_norm": 1.9477176666259766, "learning_rate": 9.4728652013227e-05, "loss": 4.059, "step": 5418 }, { "epoch": 1.5803864382063435, "grad_norm": 2.0638198852539062, "learning_rate": 9.470920054464113e-05, "loss": 4.2228, "step": 5419 }, { "epoch": 1.580678089682829, "grad_norm": 2.587754249572754, "learning_rate": 9.468974907605524e-05, "loss": 4.1664, "step": 5420 }, { "epoch": 1.5809697411593147, "grad_norm": 3.389195203781128, "learning_rate": 9.467029760746938e-05, "loss": 4.2204, "step": 5421 }, { "epoch": 1.5812613926358003, "grad_norm": 2.4050750732421875, "learning_rate": 9.465084613888348e-05, "loss": 4.116, "step": 5422 }, { "epoch": 1.5815530441122858, "grad_norm": 3.5892550945281982, "learning_rate": 9.463139467029762e-05, "loss": 4.1183, "step": 5423 }, { "epoch": 1.5818446955887713, "grad_norm": 1.997937560081482, "learning_rate": 9.461194320171174e-05, "loss": 4.0464, "step": 5424 }, { "epoch": 1.582136347065257, "grad_norm": 2.1191964149475098, "learning_rate": 9.459249173312585e-05, "loss": 4.2764, "step": 5425 }, { "epoch": 1.5824279985417427, "grad_norm": 2.9148762226104736, "learning_rate": 9.457304026453998e-05, "loss": 4.0418, "step": 5426 }, { "epoch": 1.5827196500182281, "grad_norm": 2.27844500541687, "learning_rate": 9.455358879595409e-05, "loss": 4.1364, "step": 5427 }, { "epoch": 1.5830113014947138, "grad_norm": 2.5581188201904297, "learning_rate": 9.453413732736823e-05, "loss": 4.1234, "step": 5428 }, { "epoch": 1.5833029529711995, "grad_norm": 2.497654914855957, "learning_rate": 9.451468585878235e-05, "loss": 4.0862, "step": 5429 }, { "epoch": 1.583594604447685, "grad_norm": 2.4263222217559814, "learning_rate": 9.449523439019646e-05, "loss": 3.9061, "step": 5430 }, { "epoch": 1.5838862559241706, "grad_norm": 2.320746660232544, "learning_rate": 9.447578292161059e-05, "loss": 3.927, "step": 5431 }, { "epoch": 1.5841779074006563, "grad_norm": 2.2554454803466797, "learning_rate": 9.44563314530247e-05, "loss": 4.026, "step": 5432 }, { "epoch": 1.5844695588771418, "grad_norm": 2.8590848445892334, "learning_rate": 9.443687998443883e-05, "loss": 4.1851, "step": 5433 }, { "epoch": 1.5847612103536273, "grad_norm": 2.436490774154663, "learning_rate": 9.441742851585294e-05, "loss": 4.2598, "step": 5434 }, { "epoch": 1.5850528618301132, "grad_norm": 2.1171088218688965, "learning_rate": 9.439797704726708e-05, "loss": 4.0729, "step": 5435 }, { "epoch": 1.5853445133065986, "grad_norm": 2.078496217727661, "learning_rate": 9.43785255786812e-05, "loss": 4.3236, "step": 5436 }, { "epoch": 1.5856361647830841, "grad_norm": 1.9879364967346191, "learning_rate": 9.435907411009531e-05, "loss": 4.0941, "step": 5437 }, { "epoch": 1.5859278162595698, "grad_norm": 2.705240488052368, "learning_rate": 9.433962264150944e-05, "loss": 4.0206, "step": 5438 }, { "epoch": 1.5862194677360555, "grad_norm": 1.7166132926940918, "learning_rate": 9.432017117292355e-05, "loss": 4.1491, "step": 5439 }, { "epoch": 1.586511119212541, "grad_norm": 2.529662847518921, "learning_rate": 9.430071970433769e-05, "loss": 3.8953, "step": 5440 }, { "epoch": 1.5868027706890266, "grad_norm": 1.879354476928711, "learning_rate": 9.428126823575181e-05, "loss": 4.1232, "step": 5441 }, { "epoch": 1.5870944221655123, "grad_norm": 2.1716833114624023, "learning_rate": 9.426181676716593e-05, "loss": 4.1883, "step": 5442 }, { "epoch": 1.5873860736419978, "grad_norm": 2.741389274597168, "learning_rate": 9.424236529858005e-05, "loss": 4.0815, "step": 5443 }, { "epoch": 1.5876777251184833, "grad_norm": 2.601963520050049, "learning_rate": 9.422291382999416e-05, "loss": 3.8636, "step": 5444 }, { "epoch": 1.5879693765949692, "grad_norm": 1.8400479555130005, "learning_rate": 9.42034623614083e-05, "loss": 3.992, "step": 5445 }, { "epoch": 1.5882610280714546, "grad_norm": 2.5949487686157227, "learning_rate": 9.418401089282242e-05, "loss": 4.0372, "step": 5446 }, { "epoch": 1.58855267954794, "grad_norm": 3.1707706451416016, "learning_rate": 9.416455942423654e-05, "loss": 4.0526, "step": 5447 }, { "epoch": 1.5888443310244258, "grad_norm": 2.69830584526062, "learning_rate": 9.414510795565066e-05, "loss": 3.717, "step": 5448 }, { "epoch": 1.5891359825009115, "grad_norm": 2.1163856983184814, "learning_rate": 9.412565648706477e-05, "loss": 3.9364, "step": 5449 }, { "epoch": 1.589427633977397, "grad_norm": 2.6804027557373047, "learning_rate": 9.41062050184789e-05, "loss": 4.0799, "step": 5450 }, { "epoch": 1.5897192854538826, "grad_norm": 2.0396180152893066, "learning_rate": 9.408675354989302e-05, "loss": 4.2588, "step": 5451 }, { "epoch": 1.5900109369303683, "grad_norm": 2.2175698280334473, "learning_rate": 9.406730208130715e-05, "loss": 3.8576, "step": 5452 }, { "epoch": 1.5903025884068538, "grad_norm": 2.0382814407348633, "learning_rate": 9.404785061272127e-05, "loss": 4.1327, "step": 5453 }, { "epoch": 1.5905942398833393, "grad_norm": 2.1509201526641846, "learning_rate": 9.402839914413539e-05, "loss": 4.0697, "step": 5454 }, { "epoch": 1.5908858913598252, "grad_norm": 2.054867744445801, "learning_rate": 9.400894767554951e-05, "loss": 3.9209, "step": 5455 }, { "epoch": 1.5911775428363106, "grad_norm": 2.86387300491333, "learning_rate": 9.398949620696363e-05, "loss": 4.2961, "step": 5456 }, { "epoch": 1.591469194312796, "grad_norm": 2.3506171703338623, "learning_rate": 9.397004473837775e-05, "loss": 3.9142, "step": 5457 }, { "epoch": 1.5917608457892818, "grad_norm": 3.7910122871398926, "learning_rate": 9.395059326979187e-05, "loss": 4.3267, "step": 5458 }, { "epoch": 1.5920524972657675, "grad_norm": 2.855358839035034, "learning_rate": 9.3931141801206e-05, "loss": 4.2453, "step": 5459 }, { "epoch": 1.592344148742253, "grad_norm": 1.9723403453826904, "learning_rate": 9.391169033262012e-05, "loss": 3.9099, "step": 5460 }, { "epoch": 1.5926358002187386, "grad_norm": 1.9100182056427002, "learning_rate": 9.389223886403424e-05, "loss": 3.6044, "step": 5461 }, { "epoch": 1.5929274516952243, "grad_norm": 2.3845059871673584, "learning_rate": 9.387278739544836e-05, "loss": 4.2429, "step": 5462 }, { "epoch": 1.5932191031717098, "grad_norm": 2.4658234119415283, "learning_rate": 9.385333592686248e-05, "loss": 4.0031, "step": 5463 }, { "epoch": 1.5935107546481952, "grad_norm": 2.5762860774993896, "learning_rate": 9.38338844582766e-05, "loss": 4.1339, "step": 5464 }, { "epoch": 1.5938024061246812, "grad_norm": 1.7503324747085571, "learning_rate": 9.381443298969073e-05, "loss": 4.1712, "step": 5465 }, { "epoch": 1.5940940576011666, "grad_norm": 2.7148637771606445, "learning_rate": 9.379498152110485e-05, "loss": 4.2258, "step": 5466 }, { "epoch": 1.594385709077652, "grad_norm": 1.834147334098816, "learning_rate": 9.377553005251897e-05, "loss": 4.2408, "step": 5467 }, { "epoch": 1.5946773605541378, "grad_norm": 2.5294179916381836, "learning_rate": 9.375607858393309e-05, "loss": 3.8335, "step": 5468 }, { "epoch": 1.5949690120306235, "grad_norm": 2.272287368774414, "learning_rate": 9.373662711534721e-05, "loss": 4.2936, "step": 5469 }, { "epoch": 1.595260663507109, "grad_norm": 2.8437864780426025, "learning_rate": 9.371717564676133e-05, "loss": 3.7497, "step": 5470 }, { "epoch": 1.5955523149835946, "grad_norm": 2.029113292694092, "learning_rate": 9.369772417817546e-05, "loss": 3.949, "step": 5471 }, { "epoch": 1.5958439664600803, "grad_norm": 2.6596152782440186, "learning_rate": 9.367827270958958e-05, "loss": 4.1114, "step": 5472 }, { "epoch": 1.5961356179365658, "grad_norm": 2.186157703399658, "learning_rate": 9.36588212410037e-05, "loss": 4.0484, "step": 5473 }, { "epoch": 1.5964272694130512, "grad_norm": 2.5647799968719482, "learning_rate": 9.363936977241782e-05, "loss": 3.9104, "step": 5474 }, { "epoch": 1.5967189208895372, "grad_norm": 2.3250539302825928, "learning_rate": 9.361991830383194e-05, "loss": 4.3313, "step": 5475 }, { "epoch": 1.5970105723660226, "grad_norm": 2.047574758529663, "learning_rate": 9.360046683524606e-05, "loss": 4.0493, "step": 5476 }, { "epoch": 1.597302223842508, "grad_norm": 1.9547240734100342, "learning_rate": 9.358101536666018e-05, "loss": 3.9848, "step": 5477 }, { "epoch": 1.5975938753189938, "grad_norm": 2.0237348079681396, "learning_rate": 9.35615638980743e-05, "loss": 4.1064, "step": 5478 }, { "epoch": 1.5978855267954795, "grad_norm": 3.1523091793060303, "learning_rate": 9.354211242948843e-05, "loss": 3.9963, "step": 5479 }, { "epoch": 1.598177178271965, "grad_norm": 2.4327008724212646, "learning_rate": 9.352266096090255e-05, "loss": 4.0284, "step": 5480 }, { "epoch": 1.5984688297484506, "grad_norm": 2.234128713607788, "learning_rate": 9.350320949231668e-05, "loss": 4.1306, "step": 5481 }, { "epoch": 1.5987604812249363, "grad_norm": 2.026268243789673, "learning_rate": 9.348375802373079e-05, "loss": 3.912, "step": 5482 }, { "epoch": 1.5990521327014218, "grad_norm": 2.1285672187805176, "learning_rate": 9.346430655514491e-05, "loss": 4.0043, "step": 5483 }, { "epoch": 1.5993437841779072, "grad_norm": 3.1536130905151367, "learning_rate": 9.344485508655904e-05, "loss": 4.0964, "step": 5484 }, { "epoch": 1.5996354356543931, "grad_norm": 1.9035567045211792, "learning_rate": 9.342540361797316e-05, "loss": 3.5377, "step": 5485 }, { "epoch": 1.5999270871308786, "grad_norm": 2.263566017150879, "learning_rate": 9.340595214938729e-05, "loss": 4.1454, "step": 5486 }, { "epoch": 1.600218738607364, "grad_norm": 2.1860930919647217, "learning_rate": 9.33865006808014e-05, "loss": 3.8981, "step": 5487 }, { "epoch": 1.6005103900838498, "grad_norm": 2.5272226333618164, "learning_rate": 9.336704921221554e-05, "loss": 3.9067, "step": 5488 }, { "epoch": 1.6008020415603355, "grad_norm": 2.1237666606903076, "learning_rate": 9.334759774362964e-05, "loss": 4.3045, "step": 5489 }, { "epoch": 1.601093693036821, "grad_norm": 2.466170072555542, "learning_rate": 9.332814627504377e-05, "loss": 4.3932, "step": 5490 }, { "epoch": 1.6013853445133066, "grad_norm": 2.8886983394622803, "learning_rate": 9.33086948064579e-05, "loss": 4.2308, "step": 5491 }, { "epoch": 1.6016769959897923, "grad_norm": 3.6685681343078613, "learning_rate": 9.328924333787201e-05, "loss": 4.2048, "step": 5492 }, { "epoch": 1.6019686474662778, "grad_norm": 2.452345609664917, "learning_rate": 9.326979186928614e-05, "loss": 3.9102, "step": 5493 }, { "epoch": 1.6022602989427635, "grad_norm": 2.12675142288208, "learning_rate": 9.325034040070025e-05, "loss": 3.9217, "step": 5494 }, { "epoch": 1.6025519504192491, "grad_norm": 2.0950849056243896, "learning_rate": 9.323088893211437e-05, "loss": 3.8173, "step": 5495 }, { "epoch": 1.6028436018957346, "grad_norm": 1.8608781099319458, "learning_rate": 9.321143746352851e-05, "loss": 3.8768, "step": 5496 }, { "epoch": 1.60313525337222, "grad_norm": 3.4483819007873535, "learning_rate": 9.319198599494262e-05, "loss": 4.09, "step": 5497 }, { "epoch": 1.6034269048487058, "grad_norm": 2.355695962905884, "learning_rate": 9.317253452635675e-05, "loss": 4.1005, "step": 5498 }, { "epoch": 1.6037185563251914, "grad_norm": 2.4240052700042725, "learning_rate": 9.315308305777086e-05, "loss": 4.0804, "step": 5499 }, { "epoch": 1.604010207801677, "grad_norm": 2.1615352630615234, "learning_rate": 9.3133631589185e-05, "loss": 3.9807, "step": 5500 } ], "logging_steps": 1, "max_steps": 10287, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.840601314496435e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }