{ "best_metric": 1.9663305282592773, "best_model_checkpoint": "miner_id_24/checkpoint-4950", "epoch": 0.3772949300993768, "eval_steps": 150, "global_step": 5040, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 7.486010517844778e-05, "grad_norm": 4.111408710479736, "learning_rate": 2e-05, "loss": 2.526, "step": 1 }, { "epoch": 7.486010517844778e-05, "eval_loss": 2.46103572845459, "eval_runtime": 178.8361, "eval_samples_per_second": 27.959, "eval_steps_per_second": 13.979, "step": 1 }, { "epoch": 0.00014972021035689555, "grad_norm": 1.8366540670394897, "learning_rate": 4e-05, "loss": 1.9493, "step": 2 }, { "epoch": 0.00022458031553534332, "grad_norm": 2.2284059524536133, "learning_rate": 6e-05, "loss": 1.9067, "step": 3 }, { "epoch": 0.0002994404207137911, "grad_norm": 1.5922659635543823, "learning_rate": 8e-05, "loss": 2.5633, "step": 4 }, { "epoch": 0.00037430052589223887, "grad_norm": 1.6796388626098633, "learning_rate": 0.0001, "loss": 2.469, "step": 5 }, { "epoch": 0.00044916063107068664, "grad_norm": 3.4307525157928467, "learning_rate": 0.00012, "loss": 2.5414, "step": 6 }, { "epoch": 0.0005240207362491344, "grad_norm": 1.490680456161499, "learning_rate": 0.00014, "loss": 2.3336, "step": 7 }, { "epoch": 0.0005988808414275822, "grad_norm": 2.054499387741089, "learning_rate": 0.00016, "loss": 2.1504, "step": 8 }, { "epoch": 0.00067374094660603, "grad_norm": 1.5282163619995117, "learning_rate": 0.00018, "loss": 2.3182, "step": 9 }, { "epoch": 0.0007486010517844777, "grad_norm": 1.6067579984664917, "learning_rate": 0.0002, "loss": 2.4376, "step": 10 }, { "epoch": 0.0008234611569629256, "grad_norm": 1.925720453262329, "learning_rate": 0.00019999998049554746, "loss": 2.1818, "step": 11 }, { "epoch": 0.0008983212621413733, "grad_norm": 1.6667739152908325, "learning_rate": 0.00019999992198219734, "loss": 2.1499, "step": 12 }, { "epoch": 0.0009731813673198211, "grad_norm": 1.7717599868774414, "learning_rate": 0.00019999982445997254, "loss": 2.2291, "step": 13 }, { "epoch": 0.0010480414724982688, "grad_norm": 1.7018389701843262, "learning_rate": 0.0001999996879289111, "loss": 2.1286, "step": 14 }, { "epoch": 0.0011229015776767167, "grad_norm": 1.7203824520111084, "learning_rate": 0.0001999995123890662, "loss": 2.8316, "step": 15 }, { "epoch": 0.0011977616828551644, "grad_norm": 1.2076104879379272, "learning_rate": 0.0001999992978405064, "loss": 2.2787, "step": 16 }, { "epoch": 0.0012726217880336121, "grad_norm": 1.3146586418151855, "learning_rate": 0.00019999904428331537, "loss": 2.578, "step": 17 }, { "epoch": 0.00134748189321206, "grad_norm": 1.670967698097229, "learning_rate": 0.00019999875171759201, "loss": 2.419, "step": 18 }, { "epoch": 0.0014223419983905078, "grad_norm": 1.4809753894805908, "learning_rate": 0.00019999842014345047, "loss": 2.8104, "step": 19 }, { "epoch": 0.0014972021035689555, "grad_norm": 1.1765881776809692, "learning_rate": 0.00019999804956102003, "loss": 2.3827, "step": 20 }, { "epoch": 0.0015720622087474032, "grad_norm": 1.7024389505386353, "learning_rate": 0.00019999763997044533, "loss": 2.4099, "step": 21 }, { "epoch": 0.0016469223139258511, "grad_norm": 1.6132036447525024, "learning_rate": 0.00019999719137188616, "loss": 2.2613, "step": 22 }, { "epoch": 0.0017217824191042988, "grad_norm": 1.5018013715744019, "learning_rate": 0.0001999967037655174, "loss": 2.4731, "step": 23 }, { "epoch": 0.0017966425242827465, "grad_norm": 1.6416369676589966, "learning_rate": 0.00019999617715152934, "loss": 2.0677, "step": 24 }, { "epoch": 0.0018715026294611945, "grad_norm": 1.5189576148986816, "learning_rate": 0.0001999956115301274, "loss": 2.7483, "step": 25 }, { "epoch": 0.0019463627346396422, "grad_norm": 1.5883592367172241, "learning_rate": 0.0001999950069015322, "loss": 2.2968, "step": 26 }, { "epoch": 0.00202122283981809, "grad_norm": 1.851583480834961, "learning_rate": 0.00019999436326597963, "loss": 2.369, "step": 27 }, { "epoch": 0.0020960829449965376, "grad_norm": 1.5210293531417847, "learning_rate": 0.00019999368062372075, "loss": 2.4894, "step": 28 }, { "epoch": 0.0021709430501749855, "grad_norm": 1.403621792793274, "learning_rate": 0.00019999295897502183, "loss": 2.4293, "step": 29 }, { "epoch": 0.0022458031553534335, "grad_norm": 1.2215828895568848, "learning_rate": 0.00019999219832016443, "loss": 2.6136, "step": 30 }, { "epoch": 0.002320663260531881, "grad_norm": 1.2971354722976685, "learning_rate": 0.00019999139865944522, "loss": 2.4146, "step": 31 }, { "epoch": 0.002395523365710329, "grad_norm": 1.9793179035186768, "learning_rate": 0.00019999055999317617, "loss": 2.4949, "step": 32 }, { "epoch": 0.002470383470888777, "grad_norm": 1.4731570482254028, "learning_rate": 0.00019998968232168442, "loss": 2.4522, "step": 33 }, { "epoch": 0.0025452435760672243, "grad_norm": 1.3595935106277466, "learning_rate": 0.00019998876564531235, "loss": 2.4243, "step": 34 }, { "epoch": 0.002620103681245672, "grad_norm": 1.5801923274993896, "learning_rate": 0.00019998780996441755, "loss": 2.6774, "step": 35 }, { "epoch": 0.00269496378642412, "grad_norm": 1.0533666610717773, "learning_rate": 0.00019998681527937277, "loss": 2.4476, "step": 36 }, { "epoch": 0.0027698238916025676, "grad_norm": 1.2822818756103516, "learning_rate": 0.0001999857815905661, "loss": 2.5505, "step": 37 }, { "epoch": 0.0028446839967810156, "grad_norm": 1.3648440837860107, "learning_rate": 0.00019998470889840073, "loss": 2.2226, "step": 38 }, { "epoch": 0.002919544101959463, "grad_norm": 1.3018823862075806, "learning_rate": 0.0001999835972032951, "loss": 2.6759, "step": 39 }, { "epoch": 0.002994404207137911, "grad_norm": 1.3233710527420044, "learning_rate": 0.0001999824465056829, "loss": 2.5621, "step": 40 }, { "epoch": 0.003069264312316359, "grad_norm": 1.5865068435668945, "learning_rate": 0.000199981256806013, "loss": 2.0641, "step": 41 }, { "epoch": 0.0031441244174948064, "grad_norm": 1.6318912506103516, "learning_rate": 0.00019998002810474945, "loss": 2.1011, "step": 42 }, { "epoch": 0.0032189845226732543, "grad_norm": 2.143345832824707, "learning_rate": 0.0001999787604023716, "loss": 2.1955, "step": 43 }, { "epoch": 0.0032938446278517023, "grad_norm": 1.3460800647735596, "learning_rate": 0.00019997745369937394, "loss": 2.5831, "step": 44 }, { "epoch": 0.0033687047330301497, "grad_norm": 2.0939102172851562, "learning_rate": 0.00019997610799626618, "loss": 2.9353, "step": 45 }, { "epoch": 0.0034435648382085977, "grad_norm": 1.5049679279327393, "learning_rate": 0.00019997472329357332, "loss": 2.3039, "step": 46 }, { "epoch": 0.0035184249433870456, "grad_norm": 1.4898723363876343, "learning_rate": 0.00019997329959183546, "loss": 2.0425, "step": 47 }, { "epoch": 0.003593285048565493, "grad_norm": 1.885844349861145, "learning_rate": 0.000199971836891608, "loss": 2.2498, "step": 48 }, { "epoch": 0.003668145153743941, "grad_norm": 1.2817769050598145, "learning_rate": 0.00019997033519346155, "loss": 2.6281, "step": 49 }, { "epoch": 0.003743005258922389, "grad_norm": 1.3435673713684082, "learning_rate": 0.00019996879449798188, "loss": 1.9339, "step": 50 }, { "epoch": 0.0038178653641008364, "grad_norm": 1.497510313987732, "learning_rate": 0.00019996721480576996, "loss": 2.2812, "step": 51 }, { "epoch": 0.0038927254692792844, "grad_norm": 1.3614548444747925, "learning_rate": 0.00019996559611744208, "loss": 2.2116, "step": 52 }, { "epoch": 0.003967585574457732, "grad_norm": 1.34121572971344, "learning_rate": 0.00019996393843362963, "loss": 2.4497, "step": 53 }, { "epoch": 0.00404244567963618, "grad_norm": 1.4011002779006958, "learning_rate": 0.00019996224175497924, "loss": 2.5577, "step": 54 }, { "epoch": 0.004117305784814627, "grad_norm": 1.2182843685150146, "learning_rate": 0.00019996050608215283, "loss": 2.2079, "step": 55 }, { "epoch": 0.004192165889993075, "grad_norm": 1.209659218788147, "learning_rate": 0.00019995873141582737, "loss": 2.4538, "step": 56 }, { "epoch": 0.004267025995171523, "grad_norm": 1.290211796760559, "learning_rate": 0.00019995691775669526, "loss": 2.3354, "step": 57 }, { "epoch": 0.004341886100349971, "grad_norm": 1.2280081510543823, "learning_rate": 0.00019995506510546386, "loss": 1.9813, "step": 58 }, { "epoch": 0.004416746205528419, "grad_norm": 1.452284574508667, "learning_rate": 0.00019995317346285595, "loss": 2.1192, "step": 59 }, { "epoch": 0.004491606310706867, "grad_norm": 1.411696195602417, "learning_rate": 0.00019995124282960944, "loss": 2.2233, "step": 60 }, { "epoch": 0.004566466415885314, "grad_norm": 1.1029645204544067, "learning_rate": 0.00019994927320647743, "loss": 2.2623, "step": 61 }, { "epoch": 0.004641326521063762, "grad_norm": 1.5756524801254272, "learning_rate": 0.00019994726459422824, "loss": 2.4578, "step": 62 }, { "epoch": 0.00471618662624221, "grad_norm": 1.6215068101882935, "learning_rate": 0.00019994521699364542, "loss": 2.2586, "step": 63 }, { "epoch": 0.004791046731420658, "grad_norm": 1.7358067035675049, "learning_rate": 0.0001999431304055277, "loss": 2.3994, "step": 64 }, { "epoch": 0.004865906836599106, "grad_norm": 1.5708988904953003, "learning_rate": 0.00019994100483068907, "loss": 2.2498, "step": 65 }, { "epoch": 0.004940766941777554, "grad_norm": 1.2799689769744873, "learning_rate": 0.00019993884026995866, "loss": 2.4157, "step": 66 }, { "epoch": 0.005015627046956001, "grad_norm": 1.5260800123214722, "learning_rate": 0.00019993663672418084, "loss": 1.6742, "step": 67 }, { "epoch": 0.005090487152134449, "grad_norm": 1.2685835361480713, "learning_rate": 0.0001999343941942152, "loss": 2.2055, "step": 68 }, { "epoch": 0.0051653472573128965, "grad_norm": 1.2635517120361328, "learning_rate": 0.00019993211268093654, "loss": 2.4069, "step": 69 }, { "epoch": 0.005240207362491344, "grad_norm": 1.2519937753677368, "learning_rate": 0.00019992979218523487, "loss": 2.0969, "step": 70 }, { "epoch": 0.005315067467669792, "grad_norm": 1.1644010543823242, "learning_rate": 0.0001999274327080153, "loss": 2.1958, "step": 71 }, { "epoch": 0.00538992757284824, "grad_norm": 1.4190188646316528, "learning_rate": 0.00019992503425019832, "loss": 1.9161, "step": 72 }, { "epoch": 0.005464787678026687, "grad_norm": 1.537219524383545, "learning_rate": 0.00019992259681271955, "loss": 2.3114, "step": 73 }, { "epoch": 0.005539647783205135, "grad_norm": 1.5972734689712524, "learning_rate": 0.00019992012039652974, "loss": 2.2067, "step": 74 }, { "epoch": 0.005614507888383583, "grad_norm": 1.5215686559677124, "learning_rate": 0.00019991760500259498, "loss": 2.4377, "step": 75 }, { "epoch": 0.005689367993562031, "grad_norm": 1.4397282600402832, "learning_rate": 0.00019991505063189643, "loss": 2.2903, "step": 76 }, { "epoch": 0.005764228098740479, "grad_norm": 1.3308610916137695, "learning_rate": 0.0001999124572854306, "loss": 2.3773, "step": 77 }, { "epoch": 0.005839088203918926, "grad_norm": 1.2203447818756104, "learning_rate": 0.00019990982496420907, "loss": 2.7311, "step": 78 }, { "epoch": 0.005913948309097374, "grad_norm": 1.74760103225708, "learning_rate": 0.0001999071536692587, "loss": 2.9273, "step": 79 }, { "epoch": 0.005988808414275822, "grad_norm": 1.2639446258544922, "learning_rate": 0.0001999044434016215, "loss": 2.6312, "step": 80 }, { "epoch": 0.00606366851945427, "grad_norm": 1.1075520515441895, "learning_rate": 0.0001999016941623548, "loss": 1.9335, "step": 81 }, { "epoch": 0.006138528624632718, "grad_norm": 1.1249256134033203, "learning_rate": 0.00019989890595253093, "loss": 2.5878, "step": 82 }, { "epoch": 0.006213388729811166, "grad_norm": 1.3388688564300537, "learning_rate": 0.00019989607877323763, "loss": 2.271, "step": 83 }, { "epoch": 0.006288248834989613, "grad_norm": 1.2046903371810913, "learning_rate": 0.00019989321262557774, "loss": 2.4837, "step": 84 }, { "epoch": 0.006363108940168061, "grad_norm": 1.1520739793777466, "learning_rate": 0.00019989030751066928, "loss": 2.6288, "step": 85 }, { "epoch": 0.006437969045346509, "grad_norm": 1.370134711265564, "learning_rate": 0.0001998873634296455, "loss": 2.6547, "step": 86 }, { "epoch": 0.006512829150524957, "grad_norm": 1.2836953401565552, "learning_rate": 0.00019988438038365488, "loss": 2.3136, "step": 87 }, { "epoch": 0.0065876892557034045, "grad_norm": 1.455772876739502, "learning_rate": 0.0001998813583738611, "loss": 2.294, "step": 88 }, { "epoch": 0.006662549360881852, "grad_norm": 1.8927704095840454, "learning_rate": 0.00019987829740144298, "loss": 1.9447, "step": 89 }, { "epoch": 0.0067374094660602995, "grad_norm": 1.7081835269927979, "learning_rate": 0.00019987519746759454, "loss": 2.373, "step": 90 }, { "epoch": 0.006812269571238747, "grad_norm": 1.1976017951965332, "learning_rate": 0.0001998720585735251, "loss": 2.2841, "step": 91 }, { "epoch": 0.006887129676417195, "grad_norm": 1.5324673652648926, "learning_rate": 0.00019986888072045904, "loss": 2.3231, "step": 92 }, { "epoch": 0.006961989781595643, "grad_norm": 1.226892352104187, "learning_rate": 0.00019986566390963603, "loss": 2.3087, "step": 93 }, { "epoch": 0.007036849886774091, "grad_norm": 1.1459636688232422, "learning_rate": 0.00019986240814231094, "loss": 2.3137, "step": 94 }, { "epoch": 0.007111709991952539, "grad_norm": 1.221402645111084, "learning_rate": 0.0001998591134197538, "loss": 2.2006, "step": 95 }, { "epoch": 0.007186570097130986, "grad_norm": 1.2592463493347168, "learning_rate": 0.00019985577974324982, "loss": 2.3276, "step": 96 }, { "epoch": 0.007261430202309434, "grad_norm": 1.49214506149292, "learning_rate": 0.00019985240711409941, "loss": 2.0785, "step": 97 }, { "epoch": 0.007336290307487882, "grad_norm": 1.3207143545150757, "learning_rate": 0.0001998489955336183, "loss": 2.5573, "step": 98 }, { "epoch": 0.00741115041266633, "grad_norm": 1.3176369667053223, "learning_rate": 0.00019984554500313717, "loss": 2.1599, "step": 99 }, { "epoch": 0.007486010517844778, "grad_norm": 1.393152117729187, "learning_rate": 0.00019984205552400216, "loss": 2.6455, "step": 100 }, { "epoch": 0.007560870623023225, "grad_norm": 1.7769023180007935, "learning_rate": 0.0001998385270975744, "loss": 2.4287, "step": 101 }, { "epoch": 0.007635730728201673, "grad_norm": 1.8088551759719849, "learning_rate": 0.0001998349597252303, "loss": 2.2463, "step": 102 }, { "epoch": 0.007710590833380121, "grad_norm": 1.1739704608917236, "learning_rate": 0.00019983135340836147, "loss": 2.4549, "step": 103 }, { "epoch": 0.007785450938558569, "grad_norm": 1.3278663158416748, "learning_rate": 0.0001998277081483747, "loss": 2.2327, "step": 104 }, { "epoch": 0.007860311043737016, "grad_norm": 1.3005081415176392, "learning_rate": 0.00019982402394669196, "loss": 2.21, "step": 105 }, { "epoch": 0.007935171148915465, "grad_norm": 1.328165888786316, "learning_rate": 0.0001998203008047504, "loss": 2.2558, "step": 106 }, { "epoch": 0.008010031254093912, "grad_norm": 1.1032073497772217, "learning_rate": 0.00019981653872400238, "loss": 2.4057, "step": 107 }, { "epoch": 0.00808489135927236, "grad_norm": 1.3643407821655273, "learning_rate": 0.00019981273770591548, "loss": 2.4368, "step": 108 }, { "epoch": 0.008159751464450807, "grad_norm": 1.1363730430603027, "learning_rate": 0.0001998088977519724, "loss": 2.0689, "step": 109 }, { "epoch": 0.008234611569629255, "grad_norm": 1.1341912746429443, "learning_rate": 0.0001998050188636711, "loss": 1.9087, "step": 110 }, { "epoch": 0.008309471674807703, "grad_norm": 1.2130628824234009, "learning_rate": 0.00019980110104252466, "loss": 2.1313, "step": 111 }, { "epoch": 0.00838433177998615, "grad_norm": 1.2280898094177246, "learning_rate": 0.00019979714429006136, "loss": 2.1906, "step": 112 }, { "epoch": 0.0084591918851646, "grad_norm": 1.5922375917434692, "learning_rate": 0.00019979314860782473, "loss": 2.5099, "step": 113 }, { "epoch": 0.008534051990343046, "grad_norm": 1.4625362157821655, "learning_rate": 0.0001997891139973734, "loss": 2.0231, "step": 114 }, { "epoch": 0.008608912095521495, "grad_norm": 1.1734627485275269, "learning_rate": 0.00019978504046028127, "loss": 2.3868, "step": 115 }, { "epoch": 0.008683772200699942, "grad_norm": 1.1917595863342285, "learning_rate": 0.0001997809279981374, "loss": 2.4125, "step": 116 }, { "epoch": 0.00875863230587839, "grad_norm": 1.155820608139038, "learning_rate": 0.00019977677661254593, "loss": 2.5883, "step": 117 }, { "epoch": 0.008833492411056838, "grad_norm": 1.069296956062317, "learning_rate": 0.00019977258630512635, "loss": 2.1509, "step": 118 }, { "epoch": 0.008908352516235285, "grad_norm": 1.3025665283203125, "learning_rate": 0.0001997683570775132, "loss": 2.189, "step": 119 }, { "epoch": 0.008983212621413734, "grad_norm": 1.7036775350570679, "learning_rate": 0.0001997640889313563, "loss": 2.4335, "step": 120 }, { "epoch": 0.00905807272659218, "grad_norm": 1.1977019309997559, "learning_rate": 0.00019975978186832056, "loss": 2.2233, "step": 121 }, { "epoch": 0.009132932831770628, "grad_norm": 1.44660484790802, "learning_rate": 0.00019975543589008615, "loss": 2.0269, "step": 122 }, { "epoch": 0.009207792936949077, "grad_norm": 1.5385639667510986, "learning_rate": 0.0001997510509983484, "loss": 2.6176, "step": 123 }, { "epoch": 0.009282653042127524, "grad_norm": 1.420533299446106, "learning_rate": 0.0001997466271948178, "loss": 2.3501, "step": 124 }, { "epoch": 0.009357513147305973, "grad_norm": 1.3611818552017212, "learning_rate": 0.00019974216448122003, "loss": 1.8353, "step": 125 }, { "epoch": 0.00943237325248442, "grad_norm": 1.4712271690368652, "learning_rate": 0.0001997376628592959, "loss": 2.3433, "step": 126 }, { "epoch": 0.009507233357662867, "grad_norm": 1.172667145729065, "learning_rate": 0.00019973312233080147, "loss": 2.4851, "step": 127 }, { "epoch": 0.009582093462841315, "grad_norm": 1.3826184272766113, "learning_rate": 0.00019972854289750796, "loss": 2.4067, "step": 128 }, { "epoch": 0.009656953568019763, "grad_norm": 1.4121184349060059, "learning_rate": 0.00019972392456120178, "loss": 2.5374, "step": 129 }, { "epoch": 0.009731813673198211, "grad_norm": 1.491615653038025, "learning_rate": 0.00019971926732368446, "loss": 2.8199, "step": 130 }, { "epoch": 0.009806673778376658, "grad_norm": 1.2035237550735474, "learning_rate": 0.00019971457118677277, "loss": 2.4083, "step": 131 }, { "epoch": 0.009881533883555107, "grad_norm": 1.0313023328781128, "learning_rate": 0.00019970983615229856, "loss": 2.5873, "step": 132 }, { "epoch": 0.009956393988733554, "grad_norm": 1.205027461051941, "learning_rate": 0.00019970506222210898, "loss": 2.6231, "step": 133 }, { "epoch": 0.010031254093912001, "grad_norm": 1.2781052589416504, "learning_rate": 0.0001997002493980662, "loss": 2.1413, "step": 134 }, { "epoch": 0.01010611419909045, "grad_norm": 1.3032124042510986, "learning_rate": 0.00019969539768204775, "loss": 1.7142, "step": 135 }, { "epoch": 0.010180974304268897, "grad_norm": 1.108437418937683, "learning_rate": 0.00019969050707594618, "loss": 2.3049, "step": 136 }, { "epoch": 0.010255834409447346, "grad_norm": 1.2756736278533936, "learning_rate": 0.0001996855775816693, "loss": 2.2543, "step": 137 }, { "epoch": 0.010330694514625793, "grad_norm": 1.2190327644348145, "learning_rate": 0.00019968060920114, "loss": 2.4235, "step": 138 }, { "epoch": 0.01040555461980424, "grad_norm": 1.4676586389541626, "learning_rate": 0.00019967560193629642, "loss": 1.9517, "step": 139 }, { "epoch": 0.010480414724982689, "grad_norm": 1.2959167957305908, "learning_rate": 0.00019967055578909185, "loss": 2.5308, "step": 140 }, { "epoch": 0.010555274830161136, "grad_norm": 1.0223097801208496, "learning_rate": 0.0001996654707614947, "loss": 2.5241, "step": 141 }, { "epoch": 0.010630134935339585, "grad_norm": 1.3780752420425415, "learning_rate": 0.0001996603468554886, "loss": 1.9017, "step": 142 }, { "epoch": 0.010704995040518032, "grad_norm": 1.1196829080581665, "learning_rate": 0.00019965518407307235, "loss": 2.7667, "step": 143 }, { "epoch": 0.01077985514569648, "grad_norm": 1.3053823709487915, "learning_rate": 0.00019964998241625987, "loss": 2.4578, "step": 144 }, { "epoch": 0.010854715250874928, "grad_norm": 1.089004635810852, "learning_rate": 0.0001996447418870803, "loss": 2.3825, "step": 145 }, { "epoch": 0.010929575356053375, "grad_norm": 0.9878839254379272, "learning_rate": 0.00019963946248757785, "loss": 2.4745, "step": 146 }, { "epoch": 0.011004435461231823, "grad_norm": 1.1300643682479858, "learning_rate": 0.000199634144219812, "loss": 2.5022, "step": 147 }, { "epoch": 0.01107929556641027, "grad_norm": 1.1882730722427368, "learning_rate": 0.0001996287870858574, "loss": 2.4492, "step": 148 }, { "epoch": 0.01115415567158872, "grad_norm": 1.0390851497650146, "learning_rate": 0.0001996233910878037, "loss": 2.4448, "step": 149 }, { "epoch": 0.011229015776767166, "grad_norm": 1.1092875003814697, "learning_rate": 0.0001996179562277559, "loss": 2.0932, "step": 150 }, { "epoch": 0.011229015776767166, "eval_loss": 2.324082136154175, "eval_runtime": 178.6776, "eval_samples_per_second": 27.983, "eval_steps_per_second": 13.992, "step": 150 }, { "epoch": 0.011303875881945613, "grad_norm": 1.3808497190475464, "learning_rate": 0.00019961248250783403, "loss": 2.2861, "step": 151 }, { "epoch": 0.011378735987124062, "grad_norm": 1.3875828981399536, "learning_rate": 0.00019960696993017338, "loss": 2.4596, "step": 152 }, { "epoch": 0.01145359609230251, "grad_norm": 1.576560378074646, "learning_rate": 0.00019960141849692433, "loss": 2.3572, "step": 153 }, { "epoch": 0.011528456197480958, "grad_norm": 1.7247194051742554, "learning_rate": 0.0001995958282102524, "loss": 2.2561, "step": 154 }, { "epoch": 0.011603316302659405, "grad_norm": 1.3386026620864868, "learning_rate": 0.00019959019907233832, "loss": 2.4301, "step": 155 }, { "epoch": 0.011678176407837852, "grad_norm": 1.1704305410385132, "learning_rate": 0.00019958453108537792, "loss": 2.4584, "step": 156 }, { "epoch": 0.011753036513016301, "grad_norm": 1.1202257871627808, "learning_rate": 0.00019957882425158233, "loss": 2.0056, "step": 157 }, { "epoch": 0.011827896618194748, "grad_norm": 1.2543144226074219, "learning_rate": 0.00019957307857317762, "loss": 2.1832, "step": 158 }, { "epoch": 0.011902756723373197, "grad_norm": 1.082338571548462, "learning_rate": 0.00019956729405240514, "loss": 2.2457, "step": 159 }, { "epoch": 0.011977616828551644, "grad_norm": 1.089671015739441, "learning_rate": 0.00019956147069152136, "loss": 2.4621, "step": 160 }, { "epoch": 0.012052476933730093, "grad_norm": 1.0606344938278198, "learning_rate": 0.00019955560849279795, "loss": 2.2228, "step": 161 }, { "epoch": 0.01212733703890854, "grad_norm": 1.1348421573638916, "learning_rate": 0.00019954970745852165, "loss": 2.2923, "step": 162 }, { "epoch": 0.012202197144086987, "grad_norm": 1.1470139026641846, "learning_rate": 0.0001995437675909944, "loss": 2.221, "step": 163 }, { "epoch": 0.012277057249265436, "grad_norm": 1.2084987163543701, "learning_rate": 0.00019953778889253326, "loss": 2.0238, "step": 164 }, { "epoch": 0.012351917354443883, "grad_norm": 1.4317717552185059, "learning_rate": 0.0001995317713654705, "loss": 2.4455, "step": 165 }, { "epoch": 0.012426777459622331, "grad_norm": 1.2501686811447144, "learning_rate": 0.00019952571501215345, "loss": 2.3682, "step": 166 }, { "epoch": 0.012501637564800779, "grad_norm": 1.171525239944458, "learning_rate": 0.00019951961983494468, "loss": 2.2018, "step": 167 }, { "epoch": 0.012576497669979226, "grad_norm": 1.103758454322815, "learning_rate": 0.00019951348583622177, "loss": 2.8412, "step": 168 }, { "epoch": 0.012651357775157674, "grad_norm": 1.287165641784668, "learning_rate": 0.0001995073130183776, "loss": 2.187, "step": 169 }, { "epoch": 0.012726217880336121, "grad_norm": 1.4740700721740723, "learning_rate": 0.00019950110138382005, "loss": 2.5244, "step": 170 }, { "epoch": 0.01280107798551457, "grad_norm": 1.5816642045974731, "learning_rate": 0.0001994948509349723, "loss": 2.4517, "step": 171 }, { "epoch": 0.012875938090693017, "grad_norm": 1.412573218345642, "learning_rate": 0.00019948856167427247, "loss": 2.6699, "step": 172 }, { "epoch": 0.012950798195871464, "grad_norm": 1.4143033027648926, "learning_rate": 0.00019948223360417405, "loss": 2.0702, "step": 173 }, { "epoch": 0.013025658301049913, "grad_norm": 1.0803649425506592, "learning_rate": 0.00019947586672714547, "loss": 2.5788, "step": 174 }, { "epoch": 0.01310051840622836, "grad_norm": 1.2203712463378906, "learning_rate": 0.0001994694610456704, "loss": 2.4045, "step": 175 }, { "epoch": 0.013175378511406809, "grad_norm": 1.1225805282592773, "learning_rate": 0.00019946301656224762, "loss": 2.3656, "step": 176 }, { "epoch": 0.013250238616585256, "grad_norm": 1.3239597082138062, "learning_rate": 0.00019945653327939106, "loss": 1.9277, "step": 177 }, { "epoch": 0.013325098721763705, "grad_norm": 1.2802566289901733, "learning_rate": 0.00019945001119962982, "loss": 2.4378, "step": 178 }, { "epoch": 0.013399958826942152, "grad_norm": 1.2047946453094482, "learning_rate": 0.000199443450325508, "loss": 2.3802, "step": 179 }, { "epoch": 0.013474818932120599, "grad_norm": 1.0469533205032349, "learning_rate": 0.000199436850659585, "loss": 1.9269, "step": 180 }, { "epoch": 0.013549679037299048, "grad_norm": 1.1473541259765625, "learning_rate": 0.00019943021220443526, "loss": 2.0261, "step": 181 }, { "epoch": 0.013624539142477495, "grad_norm": 1.3736423254013062, "learning_rate": 0.00019942353496264835, "loss": 2.2332, "step": 182 }, { "epoch": 0.013699399247655944, "grad_norm": 1.3694920539855957, "learning_rate": 0.00019941681893682897, "loss": 2.6537, "step": 183 }, { "epoch": 0.01377425935283439, "grad_norm": 1.200453519821167, "learning_rate": 0.00019941006412959708, "loss": 2.1539, "step": 184 }, { "epoch": 0.013849119458012838, "grad_norm": 1.3644521236419678, "learning_rate": 0.0001994032705435875, "loss": 2.2454, "step": 185 }, { "epoch": 0.013923979563191287, "grad_norm": 0.9955654740333557, "learning_rate": 0.00019939643818145045, "loss": 2.4046, "step": 186 }, { "epoch": 0.013998839668369734, "grad_norm": 1.180686593055725, "learning_rate": 0.00019938956704585107, "loss": 2.2324, "step": 187 }, { "epoch": 0.014073699773548182, "grad_norm": 1.1222738027572632, "learning_rate": 0.00019938265713946983, "loss": 2.4736, "step": 188 }, { "epoch": 0.01414855987872663, "grad_norm": 1.2576813697814941, "learning_rate": 0.00019937570846500209, "loss": 2.3047, "step": 189 }, { "epoch": 0.014223419983905078, "grad_norm": 1.087681531906128, "learning_rate": 0.00019936872102515853, "loss": 1.6368, "step": 190 }, { "epoch": 0.014298280089083525, "grad_norm": 1.5016229152679443, "learning_rate": 0.00019936169482266485, "loss": 2.3803, "step": 191 }, { "epoch": 0.014373140194261972, "grad_norm": 1.2790437936782837, "learning_rate": 0.00019935462986026188, "loss": 2.3014, "step": 192 }, { "epoch": 0.014448000299440421, "grad_norm": 1.0809519290924072, "learning_rate": 0.0001993475261407056, "loss": 2.3416, "step": 193 }, { "epoch": 0.014522860404618868, "grad_norm": 1.0467734336853027, "learning_rate": 0.00019934038366676708, "loss": 2.5147, "step": 194 }, { "epoch": 0.014597720509797317, "grad_norm": 1.2177330255508423, "learning_rate": 0.00019933320244123256, "loss": 2.5178, "step": 195 }, { "epoch": 0.014672580614975764, "grad_norm": 1.1958715915679932, "learning_rate": 0.00019932598246690328, "loss": 2.2128, "step": 196 }, { "epoch": 0.014747440720154211, "grad_norm": 1.141470193862915, "learning_rate": 0.00019931872374659578, "loss": 2.4288, "step": 197 }, { "epoch": 0.01482230082533266, "grad_norm": 1.0307271480560303, "learning_rate": 0.00019931142628314152, "loss": 2.709, "step": 198 }, { "epoch": 0.014897160930511107, "grad_norm": 1.1404526233673096, "learning_rate": 0.00019930409007938717, "loss": 2.0588, "step": 199 }, { "epoch": 0.014972021035689556, "grad_norm": 1.2497811317443848, "learning_rate": 0.00019929671513819455, "loss": 2.0546, "step": 200 }, { "epoch": 0.015046881140868003, "grad_norm": 1.248961091041565, "learning_rate": 0.0001992893014624405, "loss": 2.7761, "step": 201 }, { "epoch": 0.01512174124604645, "grad_norm": 1.5690079927444458, "learning_rate": 0.00019928184905501707, "loss": 2.2096, "step": 202 }, { "epoch": 0.015196601351224899, "grad_norm": 1.1712632179260254, "learning_rate": 0.00019927435791883125, "loss": 2.47, "step": 203 }, { "epoch": 0.015271461456403346, "grad_norm": 1.352821946144104, "learning_rate": 0.00019926682805680536, "loss": 1.8656, "step": 204 }, { "epoch": 0.015346321561581795, "grad_norm": 0.971763551235199, "learning_rate": 0.00019925925947187668, "loss": 2.1428, "step": 205 }, { "epoch": 0.015421181666760242, "grad_norm": 1.2209513187408447, "learning_rate": 0.00019925165216699763, "loss": 2.4244, "step": 206 }, { "epoch": 0.01549604177193869, "grad_norm": 1.1268686056137085, "learning_rate": 0.00019924400614513576, "loss": 2.3621, "step": 207 }, { "epoch": 0.015570901877117137, "grad_norm": 1.3346744775772095, "learning_rate": 0.00019923632140927364, "loss": 2.5614, "step": 208 }, { "epoch": 0.015645761982295586, "grad_norm": 1.327309250831604, "learning_rate": 0.00019922859796240908, "loss": 2.3508, "step": 209 }, { "epoch": 0.01572062208747403, "grad_norm": 1.7583903074264526, "learning_rate": 0.00019922083580755482, "loss": 2.3846, "step": 210 }, { "epoch": 0.01579548219265248, "grad_norm": 1.512117862701416, "learning_rate": 0.00019921303494773885, "loss": 1.9305, "step": 211 }, { "epoch": 0.01587034229783093, "grad_norm": 1.1355167627334595, "learning_rate": 0.0001992051953860042, "loss": 1.9502, "step": 212 }, { "epoch": 0.015945202403009378, "grad_norm": 1.4003798961639404, "learning_rate": 0.00019919731712540905, "loss": 2.0758, "step": 213 }, { "epoch": 0.016020062508187823, "grad_norm": 1.2111636400222778, "learning_rate": 0.00019918940016902648, "loss": 2.1418, "step": 214 }, { "epoch": 0.016094922613366272, "grad_norm": 1.0041537284851074, "learning_rate": 0.0001991814445199449, "loss": 2.3185, "step": 215 }, { "epoch": 0.01616978271854472, "grad_norm": 1.2643321752548218, "learning_rate": 0.00019917345018126775, "loss": 2.2867, "step": 216 }, { "epoch": 0.016244642823723166, "grad_norm": 1.0190606117248535, "learning_rate": 0.00019916541715611348, "loss": 2.523, "step": 217 }, { "epoch": 0.016319502928901615, "grad_norm": 1.3069123029708862, "learning_rate": 0.0001991573454476157, "loss": 2.1893, "step": 218 }, { "epoch": 0.016394363034080064, "grad_norm": 1.1870139837265015, "learning_rate": 0.00019914923505892312, "loss": 2.5025, "step": 219 }, { "epoch": 0.01646922313925851, "grad_norm": 1.3020622730255127, "learning_rate": 0.00019914108599319941, "loss": 2.2304, "step": 220 }, { "epoch": 0.016544083244436958, "grad_norm": 1.0825320482254028, "learning_rate": 0.0001991328982536236, "loss": 2.2669, "step": 221 }, { "epoch": 0.016618943349615407, "grad_norm": 1.2966530323028564, "learning_rate": 0.0001991246718433895, "loss": 2.3692, "step": 222 }, { "epoch": 0.016693803454793855, "grad_norm": 1.2605959177017212, "learning_rate": 0.0001991164067657062, "loss": 1.9629, "step": 223 }, { "epoch": 0.0167686635599723, "grad_norm": 1.447305679321289, "learning_rate": 0.00019910810302379783, "loss": 2.063, "step": 224 }, { "epoch": 0.01684352366515075, "grad_norm": 1.1776087284088135, "learning_rate": 0.00019909976062090355, "loss": 2.5456, "step": 225 }, { "epoch": 0.0169183837703292, "grad_norm": 1.118881106376648, "learning_rate": 0.00019909137956027767, "loss": 2.2069, "step": 226 }, { "epoch": 0.016993243875507644, "grad_norm": 1.4756065607070923, "learning_rate": 0.00019908295984518952, "loss": 2.24, "step": 227 }, { "epoch": 0.017068103980686092, "grad_norm": 1.1998573541641235, "learning_rate": 0.00019907450147892356, "loss": 2.3508, "step": 228 }, { "epoch": 0.01714296408586454, "grad_norm": 1.2587443590164185, "learning_rate": 0.0001990660044647793, "loss": 2.3493, "step": 229 }, { "epoch": 0.01721782419104299, "grad_norm": 1.3847157955169678, "learning_rate": 0.00019905746880607134, "loss": 2.257, "step": 230 }, { "epoch": 0.017292684296221435, "grad_norm": 0.8947596549987793, "learning_rate": 0.00019904889450612933, "loss": 2.6477, "step": 231 }, { "epoch": 0.017367544401399884, "grad_norm": 1.0639917850494385, "learning_rate": 0.00019904028156829805, "loss": 2.7866, "step": 232 }, { "epoch": 0.017442404506578333, "grad_norm": 1.2901989221572876, "learning_rate": 0.00019903162999593724, "loss": 2.5316, "step": 233 }, { "epoch": 0.01751726461175678, "grad_norm": 1.0971159934997559, "learning_rate": 0.00019902293979242184, "loss": 1.9414, "step": 234 }, { "epoch": 0.017592124716935227, "grad_norm": 1.4040886163711548, "learning_rate": 0.0001990142109611418, "loss": 1.4365, "step": 235 }, { "epoch": 0.017666984822113676, "grad_norm": 1.2521004676818848, "learning_rate": 0.0001990054435055021, "loss": 2.244, "step": 236 }, { "epoch": 0.01774184492729212, "grad_norm": 1.2059935331344604, "learning_rate": 0.00019899663742892294, "loss": 1.9577, "step": 237 }, { "epoch": 0.01781670503247057, "grad_norm": 1.23783540725708, "learning_rate": 0.00019898779273483933, "loss": 2.0878, "step": 238 }, { "epoch": 0.01789156513764902, "grad_norm": 1.1951799392700195, "learning_rate": 0.00019897890942670155, "loss": 2.5063, "step": 239 }, { "epoch": 0.017966425242827468, "grad_norm": 1.1414448022842407, "learning_rate": 0.00019896998750797493, "loss": 2.099, "step": 240 }, { "epoch": 0.018041285348005913, "grad_norm": 1.3273155689239502, "learning_rate": 0.00019896102698213974, "loss": 2.6114, "step": 241 }, { "epoch": 0.01811614545318436, "grad_norm": 1.2282570600509644, "learning_rate": 0.00019895202785269141, "loss": 1.9753, "step": 242 }, { "epoch": 0.01819100555836281, "grad_norm": 1.1261557340621948, "learning_rate": 0.00019894299012314043, "loss": 2.1442, "step": 243 }, { "epoch": 0.018265865663541256, "grad_norm": 1.5086970329284668, "learning_rate": 0.00019893391379701222, "loss": 2.1594, "step": 244 }, { "epoch": 0.018340725768719705, "grad_norm": 1.263375163078308, "learning_rate": 0.0001989247988778475, "loss": 2.1731, "step": 245 }, { "epoch": 0.018415585873898153, "grad_norm": 1.612851619720459, "learning_rate": 0.00019891564536920177, "loss": 2.1701, "step": 246 }, { "epoch": 0.018490445979076602, "grad_norm": 1.34824800491333, "learning_rate": 0.00019890645327464581, "loss": 2.1951, "step": 247 }, { "epoch": 0.018565306084255048, "grad_norm": 1.3140532970428467, "learning_rate": 0.0001988972225977653, "loss": 2.0705, "step": 248 }, { "epoch": 0.018640166189433496, "grad_norm": 1.0276424884796143, "learning_rate": 0.00019888795334216107, "loss": 2.2552, "step": 249 }, { "epoch": 0.018715026294611945, "grad_norm": 1.0264134407043457, "learning_rate": 0.00019887864551144892, "loss": 2.0389, "step": 250 }, { "epoch": 0.01878988639979039, "grad_norm": 1.5660361051559448, "learning_rate": 0.0001988692991092597, "loss": 1.9665, "step": 251 }, { "epoch": 0.01886474650496884, "grad_norm": 1.1472692489624023, "learning_rate": 0.00019885991413923942, "loss": 2.1773, "step": 252 }, { "epoch": 0.018939606610147288, "grad_norm": 1.493173360824585, "learning_rate": 0.00019885049060504897, "loss": 1.6517, "step": 253 }, { "epoch": 0.019014466715325733, "grad_norm": 1.283936858177185, "learning_rate": 0.00019884102851036443, "loss": 2.5211, "step": 254 }, { "epoch": 0.019089326820504182, "grad_norm": 1.5672898292541504, "learning_rate": 0.00019883152785887687, "loss": 2.645, "step": 255 }, { "epoch": 0.01916418692568263, "grad_norm": 1.1319791078567505, "learning_rate": 0.00019882198865429234, "loss": 1.9976, "step": 256 }, { "epoch": 0.01923904703086108, "grad_norm": 1.246286392211914, "learning_rate": 0.00019881241090033197, "loss": 2.4319, "step": 257 }, { "epoch": 0.019313907136039525, "grad_norm": 1.3815912008285522, "learning_rate": 0.000198802794600732, "loss": 2.7649, "step": 258 }, { "epoch": 0.019388767241217974, "grad_norm": 1.1471368074417114, "learning_rate": 0.0001987931397592436, "loss": 2.7328, "step": 259 }, { "epoch": 0.019463627346396423, "grad_norm": 1.4221285581588745, "learning_rate": 0.00019878344637963306, "loss": 1.9964, "step": 260 }, { "epoch": 0.019538487451574868, "grad_norm": 1.5814924240112305, "learning_rate": 0.0001987737144656816, "loss": 2.0768, "step": 261 }, { "epoch": 0.019613347556753317, "grad_norm": 1.1402735710144043, "learning_rate": 0.00019876394402118554, "loss": 1.6286, "step": 262 }, { "epoch": 0.019688207661931766, "grad_norm": 1.4268841743469238, "learning_rate": 0.00019875413504995629, "loss": 2.6223, "step": 263 }, { "epoch": 0.019763067767110214, "grad_norm": 1.5738192796707153, "learning_rate": 0.00019874428755582013, "loss": 2.1384, "step": 264 }, { "epoch": 0.01983792787228866, "grad_norm": 1.424912452697754, "learning_rate": 0.00019873440154261854, "loss": 2.4079, "step": 265 }, { "epoch": 0.01991278797746711, "grad_norm": 0.9901403188705444, "learning_rate": 0.00019872447701420792, "loss": 1.7325, "step": 266 }, { "epoch": 0.019987648082645557, "grad_norm": 1.3702083826065063, "learning_rate": 0.00019871451397445968, "loss": 2.4365, "step": 267 }, { "epoch": 0.020062508187824003, "grad_norm": 1.2399877309799194, "learning_rate": 0.00019870451242726036, "loss": 2.5689, "step": 268 }, { "epoch": 0.02013736829300245, "grad_norm": 1.4814860820770264, "learning_rate": 0.0001986944723765114, "loss": 2.4953, "step": 269 }, { "epoch": 0.0202122283981809, "grad_norm": 1.1851484775543213, "learning_rate": 0.00019868439382612934, "loss": 2.1241, "step": 270 }, { "epoch": 0.020287088503359346, "grad_norm": 1.8204325437545776, "learning_rate": 0.00019867427678004572, "loss": 1.7736, "step": 271 }, { "epoch": 0.020361948608537794, "grad_norm": 1.189347505569458, "learning_rate": 0.00019866412124220704, "loss": 2.3404, "step": 272 }, { "epoch": 0.020436808713716243, "grad_norm": 1.121734380722046, "learning_rate": 0.00019865392721657492, "loss": 2.0714, "step": 273 }, { "epoch": 0.020511668818894692, "grad_norm": 1.380563735961914, "learning_rate": 0.00019864369470712592, "loss": 2.5145, "step": 274 }, { "epoch": 0.020586528924073137, "grad_norm": 1.0412096977233887, "learning_rate": 0.00019863342371785158, "loss": 2.3274, "step": 275 }, { "epoch": 0.020661389029251586, "grad_norm": 1.4477009773254395, "learning_rate": 0.00019862311425275858, "loss": 2.5614, "step": 276 }, { "epoch": 0.020736249134430035, "grad_norm": 1.5369065999984741, "learning_rate": 0.0001986127663158685, "loss": 2.3393, "step": 277 }, { "epoch": 0.02081110923960848, "grad_norm": 1.3067257404327393, "learning_rate": 0.000198602379911218, "loss": 2.366, "step": 278 }, { "epoch": 0.02088596934478693, "grad_norm": 1.1250195503234863, "learning_rate": 0.0001985919550428586, "loss": 2.4906, "step": 279 }, { "epoch": 0.020960829449965378, "grad_norm": 1.159252643585205, "learning_rate": 0.00019858149171485698, "loss": 2.5549, "step": 280 }, { "epoch": 0.021035689555143827, "grad_norm": 1.2287441492080688, "learning_rate": 0.0001985709899312948, "loss": 2.2438, "step": 281 }, { "epoch": 0.021110549660322272, "grad_norm": 1.0458111763000488, "learning_rate": 0.00019856044969626866, "loss": 2.3704, "step": 282 }, { "epoch": 0.02118540976550072, "grad_norm": 1.1725354194641113, "learning_rate": 0.00019854987101389018, "loss": 2.1485, "step": 283 }, { "epoch": 0.02126026987067917, "grad_norm": 1.251326560974121, "learning_rate": 0.00019853925388828598, "loss": 2.1613, "step": 284 }, { "epoch": 0.021335129975857615, "grad_norm": 1.375732421875, "learning_rate": 0.00019852859832359773, "loss": 2.498, "step": 285 }, { "epoch": 0.021409990081036064, "grad_norm": 1.577691912651062, "learning_rate": 0.00019851790432398204, "loss": 2.6521, "step": 286 }, { "epoch": 0.021484850186214512, "grad_norm": 0.9801524877548218, "learning_rate": 0.0001985071718936105, "loss": 2.1548, "step": 287 }, { "epoch": 0.02155971029139296, "grad_norm": 1.1016772985458374, "learning_rate": 0.0001984964010366697, "loss": 2.1223, "step": 288 }, { "epoch": 0.021634570396571406, "grad_norm": 1.0707099437713623, "learning_rate": 0.00019848559175736127, "loss": 2.0871, "step": 289 }, { "epoch": 0.021709430501749855, "grad_norm": 1.5665844678878784, "learning_rate": 0.0001984747440599018, "loss": 2.6512, "step": 290 }, { "epoch": 0.021784290606928304, "grad_norm": 1.2453522682189941, "learning_rate": 0.00019846385794852279, "loss": 2.3489, "step": 291 }, { "epoch": 0.02185915071210675, "grad_norm": 1.2294588088989258, "learning_rate": 0.00019845293342747087, "loss": 2.6379, "step": 292 }, { "epoch": 0.021934010817285198, "grad_norm": 1.2780051231384277, "learning_rate": 0.0001984419705010075, "loss": 2.2715, "step": 293 }, { "epoch": 0.022008870922463647, "grad_norm": 1.0416451692581177, "learning_rate": 0.0001984309691734093, "loss": 2.4105, "step": 294 }, { "epoch": 0.022083731027642092, "grad_norm": 1.3154726028442383, "learning_rate": 0.00019841992944896766, "loss": 2.1009, "step": 295 }, { "epoch": 0.02215859113282054, "grad_norm": 1.470007300376892, "learning_rate": 0.00019840885133198913, "loss": 2.421, "step": 296 }, { "epoch": 0.02223345123799899, "grad_norm": 1.2675542831420898, "learning_rate": 0.00019839773482679515, "loss": 1.7007, "step": 297 }, { "epoch": 0.02230831134317744, "grad_norm": 1.3522319793701172, "learning_rate": 0.00019838657993772208, "loss": 2.2532, "step": 298 }, { "epoch": 0.022383171448355884, "grad_norm": 1.1142691373825073, "learning_rate": 0.0001983753866691214, "loss": 2.1964, "step": 299 }, { "epoch": 0.022458031553534333, "grad_norm": 1.1139146089553833, "learning_rate": 0.00019836415502535947, "loss": 1.9923, "step": 300 }, { "epoch": 0.022458031553534333, "eval_loss": 2.2711122035980225, "eval_runtime": 178.9086, "eval_samples_per_second": 27.947, "eval_steps_per_second": 13.974, "step": 300 }, { "epoch": 0.02253289165871278, "grad_norm": 1.3640286922454834, "learning_rate": 0.00019835288501081762, "loss": 2.2046, "step": 301 }, { "epoch": 0.022607751763891227, "grad_norm": 1.5397758483886719, "learning_rate": 0.00019834157662989213, "loss": 2.3757, "step": 302 }, { "epoch": 0.022682611869069676, "grad_norm": 1.3731625080108643, "learning_rate": 0.0001983302298869943, "loss": 1.7257, "step": 303 }, { "epoch": 0.022757471974248124, "grad_norm": 1.290584683418274, "learning_rate": 0.00019831884478655037, "loss": 2.3775, "step": 304 }, { "epoch": 0.022832332079426573, "grad_norm": 1.3346391916275024, "learning_rate": 0.00019830742133300155, "loss": 2.5969, "step": 305 }, { "epoch": 0.02290719218460502, "grad_norm": 1.4386450052261353, "learning_rate": 0.00019829595953080399, "loss": 1.8762, "step": 306 }, { "epoch": 0.022982052289783467, "grad_norm": 0.9988537430763245, "learning_rate": 0.00019828445938442887, "loss": 2.6632, "step": 307 }, { "epoch": 0.023056912394961916, "grad_norm": 1.1946516036987305, "learning_rate": 0.00019827292089836217, "loss": 2.5509, "step": 308 }, { "epoch": 0.02313177250014036, "grad_norm": 1.0687073469161987, "learning_rate": 0.000198261344077105, "loss": 2.6547, "step": 309 }, { "epoch": 0.02320663260531881, "grad_norm": 1.1438192129135132, "learning_rate": 0.00019824972892517332, "loss": 2.1007, "step": 310 }, { "epoch": 0.02328149271049726, "grad_norm": 1.3641997575759888, "learning_rate": 0.00019823807544709808, "loss": 1.7681, "step": 311 }, { "epoch": 0.023356352815675704, "grad_norm": 1.1564557552337646, "learning_rate": 0.00019822638364742518, "loss": 2.3661, "step": 312 }, { "epoch": 0.023431212920854153, "grad_norm": 1.4569306373596191, "learning_rate": 0.0001982146535307155, "loss": 2.3178, "step": 313 }, { "epoch": 0.023506073026032602, "grad_norm": 1.3295104503631592, "learning_rate": 0.00019820288510154473, "loss": 2.1653, "step": 314 }, { "epoch": 0.02358093313121105, "grad_norm": 1.1628754138946533, "learning_rate": 0.0001981910783645037, "loss": 2.4279, "step": 315 }, { "epoch": 0.023655793236389496, "grad_norm": 1.4543766975402832, "learning_rate": 0.00019817923332419808, "loss": 1.8007, "step": 316 }, { "epoch": 0.023730653341567945, "grad_norm": 1.4812188148498535, "learning_rate": 0.00019816734998524843, "loss": 2.4476, "step": 317 }, { "epoch": 0.023805513446746394, "grad_norm": 1.3124221563339233, "learning_rate": 0.00019815542835229034, "loss": 2.0443, "step": 318 }, { "epoch": 0.02388037355192484, "grad_norm": 1.1518163681030273, "learning_rate": 0.00019814346842997435, "loss": 1.9598, "step": 319 }, { "epoch": 0.023955233657103288, "grad_norm": 1.3004189729690552, "learning_rate": 0.00019813147022296583, "loss": 2.3457, "step": 320 }, { "epoch": 0.024030093762281737, "grad_norm": 1.0101702213287354, "learning_rate": 0.0001981194337359452, "loss": 2.2909, "step": 321 }, { "epoch": 0.024104953867460185, "grad_norm": 1.1289660930633545, "learning_rate": 0.00019810735897360775, "loss": 2.2389, "step": 322 }, { "epoch": 0.02417981397263863, "grad_norm": 1.4824570417404175, "learning_rate": 0.0001980952459406637, "loss": 2.0597, "step": 323 }, { "epoch": 0.02425467407781708, "grad_norm": 1.4315767288208008, "learning_rate": 0.0001980830946418382, "loss": 2.0133, "step": 324 }, { "epoch": 0.02432953418299553, "grad_norm": 1.1894255876541138, "learning_rate": 0.00019807090508187133, "loss": 2.2645, "step": 325 }, { "epoch": 0.024404394288173974, "grad_norm": 1.4118221998214722, "learning_rate": 0.0001980586772655182, "loss": 2.4879, "step": 326 }, { "epoch": 0.024479254393352422, "grad_norm": 1.1526402235031128, "learning_rate": 0.00019804641119754862, "loss": 2.4319, "step": 327 }, { "epoch": 0.02455411449853087, "grad_norm": 3.1304335594177246, "learning_rate": 0.0001980341068827475, "loss": 2.4163, "step": 328 }, { "epoch": 0.024628974603709317, "grad_norm": 1.0300992727279663, "learning_rate": 0.00019802176432591465, "loss": 2.5499, "step": 329 }, { "epoch": 0.024703834708887765, "grad_norm": 1.093098521232605, "learning_rate": 0.0001980093835318647, "loss": 2.06, "step": 330 }, { "epoch": 0.024778694814066214, "grad_norm": 1.2717130184173584, "learning_rate": 0.00019799696450542733, "loss": 2.5226, "step": 331 }, { "epoch": 0.024853554919244663, "grad_norm": 1.0919092893600464, "learning_rate": 0.00019798450725144707, "loss": 1.8345, "step": 332 }, { "epoch": 0.02492841502442311, "grad_norm": 1.2221540212631226, "learning_rate": 0.00019797201177478328, "loss": 2.4176, "step": 333 }, { "epoch": 0.025003275129601557, "grad_norm": 1.1521447896957397, "learning_rate": 0.00019795947808031036, "loss": 2.2811, "step": 334 }, { "epoch": 0.025078135234780006, "grad_norm": 1.4314345121383667, "learning_rate": 0.00019794690617291755, "loss": 2.2046, "step": 335 }, { "epoch": 0.02515299533995845, "grad_norm": 1.1658433675765991, "learning_rate": 0.00019793429605750905, "loss": 2.4605, "step": 336 }, { "epoch": 0.0252278554451369, "grad_norm": 0.9953664541244507, "learning_rate": 0.0001979216477390039, "loss": 1.911, "step": 337 }, { "epoch": 0.02530271555031535, "grad_norm": 1.1720056533813477, "learning_rate": 0.00019790896122233608, "loss": 2.5562, "step": 338 }, { "epoch": 0.025377575655493798, "grad_norm": 1.0917861461639404, "learning_rate": 0.0001978962365124544, "loss": 2.1808, "step": 339 }, { "epoch": 0.025452435760672243, "grad_norm": 1.1208796501159668, "learning_rate": 0.00019788347361432274, "loss": 2.2745, "step": 340 }, { "epoch": 0.02552729586585069, "grad_norm": 1.179793119430542, "learning_rate": 0.0001978706725329197, "loss": 2.1616, "step": 341 }, { "epoch": 0.02560215597102914, "grad_norm": 1.376680612564087, "learning_rate": 0.00019785783327323886, "loss": 2.0295, "step": 342 }, { "epoch": 0.025677016076207586, "grad_norm": 1.3046778440475464, "learning_rate": 0.00019784495584028867, "loss": 2.2518, "step": 343 }, { "epoch": 0.025751876181386035, "grad_norm": 1.1633862257003784, "learning_rate": 0.00019783204023909244, "loss": 2.2345, "step": 344 }, { "epoch": 0.025826736286564483, "grad_norm": 1.2735859155654907, "learning_rate": 0.00019781908647468845, "loss": 2.0678, "step": 345 }, { "epoch": 0.02590159639174293, "grad_norm": 1.293027400970459, "learning_rate": 0.00019780609455212985, "loss": 2.373, "step": 346 }, { "epoch": 0.025976456496921378, "grad_norm": 1.088502049446106, "learning_rate": 0.00019779306447648454, "loss": 2.2488, "step": 347 }, { "epoch": 0.026051316602099826, "grad_norm": 1.1114771366119385, "learning_rate": 0.00019777999625283552, "loss": 1.7832, "step": 348 }, { "epoch": 0.026126176707278275, "grad_norm": 1.1422191858291626, "learning_rate": 0.0001977668898862805, "loss": 2.393, "step": 349 }, { "epoch": 0.02620103681245672, "grad_norm": 1.1876436471939087, "learning_rate": 0.00019775374538193218, "loss": 2.5184, "step": 350 }, { "epoch": 0.02627589691763517, "grad_norm": 1.5131523609161377, "learning_rate": 0.00019774056274491798, "loss": 1.4388, "step": 351 }, { "epoch": 0.026350757022813618, "grad_norm": 1.2005611658096313, "learning_rate": 0.00019772734198038046, "loss": 2.2956, "step": 352 }, { "epoch": 0.026425617127992063, "grad_norm": 1.3154958486557007, "learning_rate": 0.00019771408309347677, "loss": 2.3377, "step": 353 }, { "epoch": 0.026500477233170512, "grad_norm": 1.0499690771102905, "learning_rate": 0.0001977007860893791, "loss": 2.472, "step": 354 }, { "epoch": 0.02657533733834896, "grad_norm": 0.9958057403564453, "learning_rate": 0.00019768745097327448, "loss": 2.4456, "step": 355 }, { "epoch": 0.02665019744352741, "grad_norm": 1.2506600618362427, "learning_rate": 0.00019767407775036477, "loss": 2.303, "step": 356 }, { "epoch": 0.026725057548705855, "grad_norm": 1.2738773822784424, "learning_rate": 0.00019766066642586672, "loss": 2.2514, "step": 357 }, { "epoch": 0.026799917653884304, "grad_norm": 1.4043455123901367, "learning_rate": 0.00019764721700501196, "loss": 1.8962, "step": 358 }, { "epoch": 0.026874777759062753, "grad_norm": 1.3950743675231934, "learning_rate": 0.00019763372949304693, "loss": 2.3423, "step": 359 }, { "epoch": 0.026949637864241198, "grad_norm": 1.264211893081665, "learning_rate": 0.00019762020389523297, "loss": 2.067, "step": 360 }, { "epoch": 0.027024497969419647, "grad_norm": 1.185969352722168, "learning_rate": 0.0001976066402168463, "loss": 1.9809, "step": 361 }, { "epoch": 0.027099358074598096, "grad_norm": 1.3258261680603027, "learning_rate": 0.00019759303846317793, "loss": 2.1005, "step": 362 }, { "epoch": 0.02717421817977654, "grad_norm": 1.0743958950042725, "learning_rate": 0.00019757939863953376, "loss": 1.7992, "step": 363 }, { "epoch": 0.02724907828495499, "grad_norm": 1.2121431827545166, "learning_rate": 0.00019756572075123457, "loss": 2.0482, "step": 364 }, { "epoch": 0.02732393839013344, "grad_norm": 1.0927667617797852, "learning_rate": 0.00019755200480361587, "loss": 2.0787, "step": 365 }, { "epoch": 0.027398798495311887, "grad_norm": 1.276828646659851, "learning_rate": 0.00019753825080202818, "loss": 2.5314, "step": 366 }, { "epoch": 0.027473658600490333, "grad_norm": 1.1382888555526733, "learning_rate": 0.00019752445875183679, "loss": 2.4753, "step": 367 }, { "epoch": 0.02754851870566878, "grad_norm": 1.4958301782608032, "learning_rate": 0.00019751062865842176, "loss": 2.5033, "step": 368 }, { "epoch": 0.02762337881084723, "grad_norm": 1.2672083377838135, "learning_rate": 0.00019749676052717812, "loss": 2.2803, "step": 369 }, { "epoch": 0.027698238916025675, "grad_norm": 1.2714293003082275, "learning_rate": 0.00019748285436351563, "loss": 1.807, "step": 370 }, { "epoch": 0.027773099021204124, "grad_norm": 1.1320674419403076, "learning_rate": 0.00019746891017285897, "loss": 2.0031, "step": 371 }, { "epoch": 0.027847959126382573, "grad_norm": 1.2905333042144775, "learning_rate": 0.0001974549279606476, "loss": 2.4345, "step": 372 }, { "epoch": 0.027922819231561022, "grad_norm": 1.126130223274231, "learning_rate": 0.0001974409077323358, "loss": 2.948, "step": 373 }, { "epoch": 0.027997679336739467, "grad_norm": 1.2408627271652222, "learning_rate": 0.0001974268494933928, "loss": 2.1315, "step": 374 }, { "epoch": 0.028072539441917916, "grad_norm": 1.1710187196731567, "learning_rate": 0.00019741275324930245, "loss": 2.4759, "step": 375 }, { "epoch": 0.028147399547096365, "grad_norm": 1.3984893560409546, "learning_rate": 0.0001973986190055636, "loss": 2.2751, "step": 376 }, { "epoch": 0.02822225965227481, "grad_norm": 1.0717449188232422, "learning_rate": 0.00019738444676768987, "loss": 2.4198, "step": 377 }, { "epoch": 0.02829711975745326, "grad_norm": 1.2411248683929443, "learning_rate": 0.00019737023654120965, "loss": 2.1807, "step": 378 }, { "epoch": 0.028371979862631708, "grad_norm": 1.2003836631774902, "learning_rate": 0.00019735598833166626, "loss": 1.956, "step": 379 }, { "epoch": 0.028446839967810156, "grad_norm": 1.489338994026184, "learning_rate": 0.00019734170214461772, "loss": 2.2534, "step": 380 }, { "epoch": 0.028521700072988602, "grad_norm": 1.236831784248352, "learning_rate": 0.00019732737798563692, "loss": 2.678, "step": 381 }, { "epoch": 0.02859656017816705, "grad_norm": 1.3069928884506226, "learning_rate": 0.00019731301586031159, "loss": 2.5213, "step": 382 }, { "epoch": 0.0286714202833455, "grad_norm": 1.3796393871307373, "learning_rate": 0.0001972986157742442, "loss": 2.1777, "step": 383 }, { "epoch": 0.028746280388523945, "grad_norm": 1.1422960758209229, "learning_rate": 0.0001972841777330521, "loss": 1.9844, "step": 384 }, { "epoch": 0.028821140493702394, "grad_norm": 1.2690047025680542, "learning_rate": 0.0001972697017423674, "loss": 2.6049, "step": 385 }, { "epoch": 0.028896000598880842, "grad_norm": 1.1681550741195679, "learning_rate": 0.00019725518780783698, "loss": 2.3392, "step": 386 }, { "epoch": 0.028970860704059288, "grad_norm": 1.1609700918197632, "learning_rate": 0.0001972406359351226, "loss": 2.3259, "step": 387 }, { "epoch": 0.029045720809237736, "grad_norm": 1.1603058576583862, "learning_rate": 0.00019722604612990084, "loss": 2.3608, "step": 388 }, { "epoch": 0.029120580914416185, "grad_norm": 1.0695078372955322, "learning_rate": 0.0001972114183978629, "loss": 1.6495, "step": 389 }, { "epoch": 0.029195441019594634, "grad_norm": 1.261734127998352, "learning_rate": 0.00019719675274471504, "loss": 2.1495, "step": 390 }, { "epoch": 0.02927030112477308, "grad_norm": 1.2448354959487915, "learning_rate": 0.0001971820491761781, "loss": 2.2017, "step": 391 }, { "epoch": 0.029345161229951528, "grad_norm": 1.278262972831726, "learning_rate": 0.00019716730769798777, "loss": 2.3534, "step": 392 }, { "epoch": 0.029420021335129977, "grad_norm": 1.3426772356033325, "learning_rate": 0.00019715252831589456, "loss": 1.9383, "step": 393 }, { "epoch": 0.029494881440308422, "grad_norm": 1.2473523616790771, "learning_rate": 0.00019713771103566373, "loss": 2.4362, "step": 394 }, { "epoch": 0.02956974154548687, "grad_norm": 1.3586368560791016, "learning_rate": 0.00019712285586307536, "loss": 2.817, "step": 395 }, { "epoch": 0.02964460165066532, "grad_norm": 1.1567448377609253, "learning_rate": 0.00019710796280392426, "loss": 2.101, "step": 396 }, { "epoch": 0.02971946175584377, "grad_norm": 1.2992753982543945, "learning_rate": 0.0001970930318640201, "loss": 2.4921, "step": 397 }, { "epoch": 0.029794321861022214, "grad_norm": 1.2298381328582764, "learning_rate": 0.0001970780630491872, "loss": 2.2297, "step": 398 }, { "epoch": 0.029869181966200663, "grad_norm": 1.24710214138031, "learning_rate": 0.0001970630563652648, "loss": 2.4025, "step": 399 }, { "epoch": 0.02994404207137911, "grad_norm": 1.5169633626937866, "learning_rate": 0.0001970480118181068, "loss": 2.4866, "step": 400 }, { "epoch": 0.030018902176557557, "grad_norm": 1.0674453973770142, "learning_rate": 0.000197032929413582, "loss": 2.266, "step": 401 }, { "epoch": 0.030093762281736006, "grad_norm": 1.0718282461166382, "learning_rate": 0.0001970178091575737, "loss": 2.1919, "step": 402 }, { "epoch": 0.030168622386914454, "grad_norm": 1.0722556114196777, "learning_rate": 0.00019700265105598034, "loss": 2.1797, "step": 403 }, { "epoch": 0.0302434824920929, "grad_norm": 1.126185655593872, "learning_rate": 0.00019698745511471482, "loss": 2.3346, "step": 404 }, { "epoch": 0.03031834259727135, "grad_norm": 0.9488750696182251, "learning_rate": 0.00019697222133970493, "loss": 1.8759, "step": 405 }, { "epoch": 0.030393202702449797, "grad_norm": 1.4856750965118408, "learning_rate": 0.00019695694973689322, "loss": 2.246, "step": 406 }, { "epoch": 0.030468062807628246, "grad_norm": 1.0015555620193481, "learning_rate": 0.00019694164031223697, "loss": 2.2543, "step": 407 }, { "epoch": 0.03054292291280669, "grad_norm": 1.253674030303955, "learning_rate": 0.00019692629307170817, "loss": 1.8748, "step": 408 }, { "epoch": 0.03061778301798514, "grad_norm": 1.2001224756240845, "learning_rate": 0.00019691090802129372, "loss": 2.0711, "step": 409 }, { "epoch": 0.03069264312316359, "grad_norm": 1.174880027770996, "learning_rate": 0.00019689548516699503, "loss": 2.3221, "step": 410 }, { "epoch": 0.030767503228342034, "grad_norm": 1.2869731187820435, "learning_rate": 0.00019688002451482848, "loss": 1.9998, "step": 411 }, { "epoch": 0.030842363333520483, "grad_norm": 1.0013141632080078, "learning_rate": 0.00019686452607082505, "loss": 2.2587, "step": 412 }, { "epoch": 0.030917223438698932, "grad_norm": 1.245320439338684, "learning_rate": 0.00019684898984103052, "loss": 2.3764, "step": 413 }, { "epoch": 0.03099208354387738, "grad_norm": 1.4215725660324097, "learning_rate": 0.00019683341583150545, "loss": 2.1753, "step": 414 }, { "epoch": 0.031066943649055826, "grad_norm": 1.5221104621887207, "learning_rate": 0.00019681780404832505, "loss": 1.7855, "step": 415 }, { "epoch": 0.031141803754234275, "grad_norm": 1.1683342456817627, "learning_rate": 0.00019680215449757927, "loss": 2.1504, "step": 416 }, { "epoch": 0.031216663859412724, "grad_norm": 1.345986008644104, "learning_rate": 0.00019678646718537292, "loss": 2.3225, "step": 417 }, { "epoch": 0.03129152396459117, "grad_norm": 1.2869813442230225, "learning_rate": 0.0001967707421178254, "loss": 2.5442, "step": 418 }, { "epoch": 0.03136638406976962, "grad_norm": 1.1568306684494019, "learning_rate": 0.00019675497930107084, "loss": 2.6385, "step": 419 }, { "epoch": 0.03144124417494806, "grad_norm": 1.1574580669403076, "learning_rate": 0.00019673917874125823, "loss": 2.4001, "step": 420 }, { "epoch": 0.031516104280126515, "grad_norm": 1.1471754312515259, "learning_rate": 0.00019672334044455112, "loss": 2.1168, "step": 421 }, { "epoch": 0.03159096438530496, "grad_norm": 1.2367980480194092, "learning_rate": 0.00019670746441712792, "loss": 1.8117, "step": 422 }, { "epoch": 0.031665824490483406, "grad_norm": 1.3841423988342285, "learning_rate": 0.00019669155066518164, "loss": 2.1431, "step": 423 }, { "epoch": 0.03174068459566186, "grad_norm": 1.309664011001587, "learning_rate": 0.0001966755991949201, "loss": 2.4081, "step": 424 }, { "epoch": 0.031815544700840304, "grad_norm": 1.0457936525344849, "learning_rate": 0.00019665961001256576, "loss": 2.4091, "step": 425 }, { "epoch": 0.031890404806018756, "grad_norm": 0.9885175824165344, "learning_rate": 0.00019664358312435586, "loss": 2.1315, "step": 426 }, { "epoch": 0.0319652649111972, "grad_norm": 1.3890403509140015, "learning_rate": 0.0001966275185365423, "loss": 2.1969, "step": 427 }, { "epoch": 0.032040125016375647, "grad_norm": 1.56326425075531, "learning_rate": 0.00019661141625539167, "loss": 1.4754, "step": 428 }, { "epoch": 0.0321149851215541, "grad_norm": 1.1545813083648682, "learning_rate": 0.00019659527628718536, "loss": 1.7438, "step": 429 }, { "epoch": 0.032189845226732544, "grad_norm": 1.117347240447998, "learning_rate": 0.0001965790986382193, "loss": 1.9316, "step": 430 }, { "epoch": 0.03226470533191099, "grad_norm": 1.5109646320343018, "learning_rate": 0.0001965628833148043, "loss": 2.9731, "step": 431 }, { "epoch": 0.03233956543708944, "grad_norm": 1.1995185613632202, "learning_rate": 0.00019654663032326575, "loss": 2.5027, "step": 432 }, { "epoch": 0.03241442554226789, "grad_norm": 0.9972115159034729, "learning_rate": 0.00019653033966994375, "loss": 2.4212, "step": 433 }, { "epoch": 0.03248928564744633, "grad_norm": 1.020841121673584, "learning_rate": 0.00019651401136119314, "loss": 1.8786, "step": 434 }, { "epoch": 0.032564145752624785, "grad_norm": 1.1053330898284912, "learning_rate": 0.0001964976454033834, "loss": 2.1937, "step": 435 }, { "epoch": 0.03263900585780323, "grad_norm": 1.2187232971191406, "learning_rate": 0.00019648124180289865, "loss": 2.3881, "step": 436 }, { "epoch": 0.032713865962981675, "grad_norm": 1.0465255975723267, "learning_rate": 0.00019646480056613786, "loss": 2.0041, "step": 437 }, { "epoch": 0.03278872606816013, "grad_norm": 1.0738645792007446, "learning_rate": 0.0001964483216995145, "loss": 2.1513, "step": 438 }, { "epoch": 0.03286358617333857, "grad_norm": 1.2187782526016235, "learning_rate": 0.0001964318052094568, "loss": 2.3593, "step": 439 }, { "epoch": 0.03293844627851702, "grad_norm": 1.2193257808685303, "learning_rate": 0.00019641525110240772, "loss": 2.5251, "step": 440 }, { "epoch": 0.03301330638369547, "grad_norm": 1.103697419166565, "learning_rate": 0.0001963986593848248, "loss": 2.126, "step": 441 }, { "epoch": 0.033088166488873916, "grad_norm": 1.0728291273117065, "learning_rate": 0.00019638203006318026, "loss": 1.9571, "step": 442 }, { "epoch": 0.03316302659405237, "grad_norm": 1.058306336402893, "learning_rate": 0.00019636536314396102, "loss": 1.955, "step": 443 }, { "epoch": 0.03323788669923081, "grad_norm": 1.0861690044403076, "learning_rate": 0.00019634865863366873, "loss": 2.0423, "step": 444 }, { "epoch": 0.03331274680440926, "grad_norm": 1.807481288909912, "learning_rate": 0.00019633191653881959, "loss": 2.2864, "step": 445 }, { "epoch": 0.03338760690958771, "grad_norm": 1.4101192951202393, "learning_rate": 0.00019631513686594451, "loss": 1.8145, "step": 446 }, { "epoch": 0.033462467014766156, "grad_norm": 1.157108187675476, "learning_rate": 0.000196298319621589, "loss": 2.1872, "step": 447 }, { "epoch": 0.0335373271199446, "grad_norm": 1.2355871200561523, "learning_rate": 0.00019628146481231342, "loss": 2.0164, "step": 448 }, { "epoch": 0.033612187225123054, "grad_norm": 1.2675013542175293, "learning_rate": 0.00019626457244469254, "loss": 2.4763, "step": 449 }, { "epoch": 0.0336870473303015, "grad_norm": 1.4018230438232422, "learning_rate": 0.0001962476425253159, "loss": 2.5518, "step": 450 }, { "epoch": 0.0336870473303015, "eval_loss": 2.2464044094085693, "eval_runtime": 178.9579, "eval_samples_per_second": 27.94, "eval_steps_per_second": 13.97, "step": 450 }, { "epoch": 0.033761907435479944, "grad_norm": 1.193307876586914, "learning_rate": 0.00019623067506078766, "loss": 1.8134, "step": 451 }, { "epoch": 0.0338367675406584, "grad_norm": 1.1936038732528687, "learning_rate": 0.0001962136700577267, "loss": 2.3312, "step": 452 }, { "epoch": 0.03391162764583684, "grad_norm": 0.9606512784957886, "learning_rate": 0.00019619662752276646, "loss": 2.2493, "step": 453 }, { "epoch": 0.03398648775101529, "grad_norm": 1.0140938758850098, "learning_rate": 0.00019617954746255505, "loss": 1.9791, "step": 454 }, { "epoch": 0.03406134785619374, "grad_norm": 1.1270745992660522, "learning_rate": 0.0001961624298837552, "loss": 2.014, "step": 455 }, { "epoch": 0.034136207961372185, "grad_norm": 1.22586190700531, "learning_rate": 0.0001961452747930443, "loss": 2.7084, "step": 456 }, { "epoch": 0.03421106806655063, "grad_norm": 1.0915371179580688, "learning_rate": 0.00019612808219711435, "loss": 2.6458, "step": 457 }, { "epoch": 0.03428592817172908, "grad_norm": 1.4069263935089111, "learning_rate": 0.000196110852102672, "loss": 2.2446, "step": 458 }, { "epoch": 0.03436078827690753, "grad_norm": 1.0926042795181274, "learning_rate": 0.00019609358451643853, "loss": 2.4467, "step": 459 }, { "epoch": 0.03443564838208598, "grad_norm": 1.191210389137268, "learning_rate": 0.00019607627944514982, "loss": 2.5822, "step": 460 }, { "epoch": 0.034510508487264425, "grad_norm": 1.0691332817077637, "learning_rate": 0.00019605893689555646, "loss": 1.9768, "step": 461 }, { "epoch": 0.03458536859244287, "grad_norm": 1.321379542350769, "learning_rate": 0.00019604155687442346, "loss": 1.9405, "step": 462 }, { "epoch": 0.03466022869762132, "grad_norm": 1.4471758604049683, "learning_rate": 0.00019602413938853068, "loss": 2.5657, "step": 463 }, { "epoch": 0.03473508880279977, "grad_norm": 1.2566453218460083, "learning_rate": 0.00019600668444467244, "loss": 2.8364, "step": 464 }, { "epoch": 0.034809948907978214, "grad_norm": 1.0668325424194336, "learning_rate": 0.00019598919204965776, "loss": 2.1425, "step": 465 }, { "epoch": 0.034884809013156666, "grad_norm": 1.3254109621047974, "learning_rate": 0.00019597166221031019, "loss": 2.0239, "step": 466 }, { "epoch": 0.03495966911833511, "grad_norm": 1.1214574575424194, "learning_rate": 0.00019595409493346793, "loss": 2.6919, "step": 467 }, { "epoch": 0.03503452922351356, "grad_norm": 1.3104528188705444, "learning_rate": 0.00019593649022598385, "loss": 2.1352, "step": 468 }, { "epoch": 0.03510938932869201, "grad_norm": 1.2205876111984253, "learning_rate": 0.00019591884809472528, "loss": 1.936, "step": 469 }, { "epoch": 0.035184249433870454, "grad_norm": 1.101623773574829, "learning_rate": 0.00019590116854657422, "loss": 2.4166, "step": 470 }, { "epoch": 0.0352591095390489, "grad_norm": 1.0948970317840576, "learning_rate": 0.00019588345158842735, "loss": 1.6687, "step": 471 }, { "epoch": 0.03533396964422735, "grad_norm": 1.5181125402450562, "learning_rate": 0.00019586569722719577, "loss": 2.1206, "step": 472 }, { "epoch": 0.0354088297494058, "grad_norm": 1.156446099281311, "learning_rate": 0.0001958479054698053, "loss": 2.3621, "step": 473 }, { "epoch": 0.03548368985458424, "grad_norm": 1.3297961950302124, "learning_rate": 0.0001958300763231963, "loss": 2.3485, "step": 474 }, { "epoch": 0.035558549959762695, "grad_norm": 1.2984087467193604, "learning_rate": 0.00019581220979432375, "loss": 2.4015, "step": 475 }, { "epoch": 0.03563341006494114, "grad_norm": 0.9927749037742615, "learning_rate": 0.00019579430589015715, "loss": 1.9521, "step": 476 }, { "epoch": 0.03570827017011959, "grad_norm": 1.1385101079940796, "learning_rate": 0.00019577636461768068, "loss": 2.1668, "step": 477 }, { "epoch": 0.03578313027529804, "grad_norm": 1.2376656532287598, "learning_rate": 0.00019575838598389296, "loss": 2.2977, "step": 478 }, { "epoch": 0.03585799038047648, "grad_norm": 0.9592360258102417, "learning_rate": 0.00019574036999580726, "loss": 2.4725, "step": 479 }, { "epoch": 0.035932850485654935, "grad_norm": 1.160081386566162, "learning_rate": 0.00019572231666045148, "loss": 2.1154, "step": 480 }, { "epoch": 0.03600771059083338, "grad_norm": 1.264600396156311, "learning_rate": 0.000195704225984868, "loss": 1.6916, "step": 481 }, { "epoch": 0.036082570696011826, "grad_norm": 1.2128384113311768, "learning_rate": 0.0001956860979761138, "loss": 2.055, "step": 482 }, { "epoch": 0.03615743080119028, "grad_norm": 1.39772629737854, "learning_rate": 0.0001956679326412604, "loss": 2.3728, "step": 483 }, { "epoch": 0.03623229090636872, "grad_norm": 1.3004955053329468, "learning_rate": 0.00019564972998739388, "loss": 2.2703, "step": 484 }, { "epoch": 0.03630715101154717, "grad_norm": 1.2912412881851196, "learning_rate": 0.00019563149002161496, "loss": 2.4689, "step": 485 }, { "epoch": 0.03638201111672562, "grad_norm": 1.1940809488296509, "learning_rate": 0.00019561321275103882, "loss": 1.8128, "step": 486 }, { "epoch": 0.036456871221904066, "grad_norm": 1.186443567276001, "learning_rate": 0.0001955948981827952, "loss": 2.3546, "step": 487 }, { "epoch": 0.03653173132708251, "grad_norm": 1.208213448524475, "learning_rate": 0.00019557654632402843, "loss": 2.1248, "step": 488 }, { "epoch": 0.036606591432260964, "grad_norm": 1.1440552473068237, "learning_rate": 0.00019555815718189735, "loss": 2.0597, "step": 489 }, { "epoch": 0.03668145153743941, "grad_norm": 1.2506946325302124, "learning_rate": 0.00019553973076357544, "loss": 1.9105, "step": 490 }, { "epoch": 0.036756311642617855, "grad_norm": 1.4684185981750488, "learning_rate": 0.00019552126707625053, "loss": 1.8086, "step": 491 }, { "epoch": 0.03683117174779631, "grad_norm": 1.1788640022277832, "learning_rate": 0.00019550276612712519, "loss": 1.8915, "step": 492 }, { "epoch": 0.03690603185297475, "grad_norm": 1.1474783420562744, "learning_rate": 0.0001954842279234164, "loss": 2.3048, "step": 493 }, { "epoch": 0.036980891958153204, "grad_norm": 1.4034552574157715, "learning_rate": 0.0001954656524723557, "loss": 2.0546, "step": 494 }, { "epoch": 0.03705575206333165, "grad_norm": 1.2917944192886353, "learning_rate": 0.0001954470397811892, "loss": 1.8743, "step": 495 }, { "epoch": 0.037130612168510095, "grad_norm": 1.0977483987808228, "learning_rate": 0.0001954283898571775, "loss": 2.6527, "step": 496 }, { "epoch": 0.03720547227368855, "grad_norm": 1.0162171125411987, "learning_rate": 0.0001954097027075957, "loss": 1.5224, "step": 497 }, { "epoch": 0.03728033237886699, "grad_norm": 1.4584932327270508, "learning_rate": 0.0001953909783397335, "loss": 2.697, "step": 498 }, { "epoch": 0.03735519248404544, "grad_norm": 1.077560305595398, "learning_rate": 0.00019537221676089507, "loss": 1.8911, "step": 499 }, { "epoch": 0.03743005258922389, "grad_norm": 1.2568804025650024, "learning_rate": 0.00019535341797839904, "loss": 2.3433, "step": 500 }, { "epoch": 0.037504912694402336, "grad_norm": 1.3340097665786743, "learning_rate": 0.00019533458199957865, "loss": 1.973, "step": 501 }, { "epoch": 0.03757977279958078, "grad_norm": 2.8452420234680176, "learning_rate": 0.0001953157088317816, "loss": 2.3081, "step": 502 }, { "epoch": 0.03765463290475923, "grad_norm": 1.1960886716842651, "learning_rate": 0.00019529679848237012, "loss": 2.4873, "step": 503 }, { "epoch": 0.03772949300993768, "grad_norm": 1.3851004838943481, "learning_rate": 0.00019527785095872094, "loss": 2.1594, "step": 504 }, { "epoch": 0.037804353115116124, "grad_norm": 1.2272148132324219, "learning_rate": 0.00019525886626822522, "loss": 2.2217, "step": 505 }, { "epoch": 0.037879213220294576, "grad_norm": 1.2876338958740234, "learning_rate": 0.00019523984441828876, "loss": 2.0507, "step": 506 }, { "epoch": 0.03795407332547302, "grad_norm": 1.2646394968032837, "learning_rate": 0.0001952207854163317, "loss": 2.3115, "step": 507 }, { "epoch": 0.03802893343065147, "grad_norm": 1.0257726907730103, "learning_rate": 0.0001952016892697888, "loss": 2.4534, "step": 508 }, { "epoch": 0.03810379353582992, "grad_norm": 1.200571894645691, "learning_rate": 0.00019518255598610925, "loss": 2.5095, "step": 509 }, { "epoch": 0.038178653641008364, "grad_norm": 1.1019976139068604, "learning_rate": 0.0001951633855727567, "loss": 2.3655, "step": 510 }, { "epoch": 0.03825351374618682, "grad_norm": 1.0596147775650024, "learning_rate": 0.00019514417803720937, "loss": 2.285, "step": 511 }, { "epoch": 0.03832837385136526, "grad_norm": 1.0460389852523804, "learning_rate": 0.0001951249333869599, "loss": 2.1451, "step": 512 }, { "epoch": 0.03840323395654371, "grad_norm": 0.9831485152244568, "learning_rate": 0.00019510565162951537, "loss": 2.4081, "step": 513 }, { "epoch": 0.03847809406172216, "grad_norm": 1.1139522790908813, "learning_rate": 0.00019508633277239744, "loss": 1.7343, "step": 514 }, { "epoch": 0.038552954166900605, "grad_norm": 1.1073594093322754, "learning_rate": 0.00019506697682314213, "loss": 2.1945, "step": 515 }, { "epoch": 0.03862781427207905, "grad_norm": 1.3383218050003052, "learning_rate": 0.00019504758378930003, "loss": 2.4581, "step": 516 }, { "epoch": 0.0387026743772575, "grad_norm": 1.2911996841430664, "learning_rate": 0.0001950281536784361, "loss": 2.124, "step": 517 }, { "epoch": 0.03877753448243595, "grad_norm": 1.769524335861206, "learning_rate": 0.0001950086864981299, "loss": 1.8233, "step": 518 }, { "epoch": 0.03885239458761439, "grad_norm": 1.2201745510101318, "learning_rate": 0.0001949891822559753, "loss": 2.4526, "step": 519 }, { "epoch": 0.038927254692792845, "grad_norm": 1.2932671308517456, "learning_rate": 0.00019496964095958065, "loss": 1.9003, "step": 520 }, { "epoch": 0.03900211479797129, "grad_norm": 1.1871076822280884, "learning_rate": 0.0001949500626165689, "loss": 2.3427, "step": 521 }, { "epoch": 0.039076974903149736, "grad_norm": 1.1381313800811768, "learning_rate": 0.00019493044723457729, "loss": 2.3143, "step": 522 }, { "epoch": 0.03915183500832819, "grad_norm": 1.116044521331787, "learning_rate": 0.00019491079482125755, "loss": 2.1942, "step": 523 }, { "epoch": 0.039226695113506634, "grad_norm": 1.3655458688735962, "learning_rate": 0.00019489110538427592, "loss": 2.2093, "step": 524 }, { "epoch": 0.03930155521868508, "grad_norm": 1.2578201293945312, "learning_rate": 0.00019487137893131298, "loss": 2.2464, "step": 525 }, { "epoch": 0.03937641532386353, "grad_norm": 1.0950615406036377, "learning_rate": 0.00019485161547006384, "loss": 1.938, "step": 526 }, { "epoch": 0.039451275429041976, "grad_norm": 1.8168329000473022, "learning_rate": 0.000194831815008238, "loss": 2.4114, "step": 527 }, { "epoch": 0.03952613553422043, "grad_norm": 1.0981578826904297, "learning_rate": 0.00019481197755355937, "loss": 2.4234, "step": 528 }, { "epoch": 0.039600995639398874, "grad_norm": 1.2216497659683228, "learning_rate": 0.00019479210311376641, "loss": 2.2314, "step": 529 }, { "epoch": 0.03967585574457732, "grad_norm": 1.2806556224822998, "learning_rate": 0.00019477219169661183, "loss": 1.7409, "step": 530 }, { "epoch": 0.03975071584975577, "grad_norm": 1.0465189218521118, "learning_rate": 0.00019475224330986292, "loss": 1.9866, "step": 531 }, { "epoch": 0.03982557595493422, "grad_norm": 1.3733617067337036, "learning_rate": 0.00019473225796130128, "loss": 2.4235, "step": 532 }, { "epoch": 0.03990043606011266, "grad_norm": 1.4676198959350586, "learning_rate": 0.000194712235658723, "loss": 2.322, "step": 533 }, { "epoch": 0.039975296165291115, "grad_norm": 1.2617816925048828, "learning_rate": 0.00019469217640993855, "loss": 2.1462, "step": 534 }, { "epoch": 0.04005015627046956, "grad_norm": 1.4387990236282349, "learning_rate": 0.00019467208022277282, "loss": 2.2956, "step": 535 }, { "epoch": 0.040125016375648005, "grad_norm": 1.019292950630188, "learning_rate": 0.00019465194710506517, "loss": 2.1285, "step": 536 }, { "epoch": 0.04019987648082646, "grad_norm": 1.19431734085083, "learning_rate": 0.00019463177706466924, "loss": 2.2327, "step": 537 }, { "epoch": 0.0402747365860049, "grad_norm": 1.2910435199737549, "learning_rate": 0.00019461157010945313, "loss": 2.5648, "step": 538 }, { "epoch": 0.04034959669118335, "grad_norm": 1.2621897459030151, "learning_rate": 0.00019459132624729942, "loss": 2.0828, "step": 539 }, { "epoch": 0.0404244567963618, "grad_norm": 1.1374369859695435, "learning_rate": 0.000194571045486105, "loss": 1.8306, "step": 540 }, { "epoch": 0.040499316901540246, "grad_norm": 1.1304407119750977, "learning_rate": 0.00019455072783378113, "loss": 2.2043, "step": 541 }, { "epoch": 0.04057417700671869, "grad_norm": 1.2598686218261719, "learning_rate": 0.00019453037329825352, "loss": 2.3581, "step": 542 }, { "epoch": 0.04064903711189714, "grad_norm": 1.1958965063095093, "learning_rate": 0.00019450998188746228, "loss": 2.3844, "step": 543 }, { "epoch": 0.04072389721707559, "grad_norm": 1.4034686088562012, "learning_rate": 0.00019448955360936189, "loss": 2.1809, "step": 544 }, { "epoch": 0.04079875732225404, "grad_norm": 1.416783094406128, "learning_rate": 0.0001944690884719211, "loss": 2.5238, "step": 545 }, { "epoch": 0.040873617427432486, "grad_norm": 1.0506033897399902, "learning_rate": 0.00019444858648312322, "loss": 2.139, "step": 546 }, { "epoch": 0.04094847753261093, "grad_norm": 1.197176456451416, "learning_rate": 0.00019442804765096587, "loss": 2.2918, "step": 547 }, { "epoch": 0.041023337637789384, "grad_norm": 1.335114598274231, "learning_rate": 0.00019440747198346095, "loss": 2.6727, "step": 548 }, { "epoch": 0.04109819774296783, "grad_norm": 1.3840413093566895, "learning_rate": 0.0001943868594886349, "loss": 2.229, "step": 549 }, { "epoch": 0.041173057848146274, "grad_norm": 1.3874130249023438, "learning_rate": 0.00019436621017452832, "loss": 2.2292, "step": 550 }, { "epoch": 0.04124791795332473, "grad_norm": 1.1993440389633179, "learning_rate": 0.0001943455240491963, "loss": 2.3552, "step": 551 }, { "epoch": 0.04132277805850317, "grad_norm": 1.0733462572097778, "learning_rate": 0.00019432480112070835, "loss": 2.4569, "step": 552 }, { "epoch": 0.04139763816368162, "grad_norm": 1.5089083909988403, "learning_rate": 0.0001943040413971482, "loss": 2.8098, "step": 553 }, { "epoch": 0.04147249826886007, "grad_norm": 1.2808305025100708, "learning_rate": 0.000194283244886614, "loss": 2.6334, "step": 554 }, { "epoch": 0.041547358374038515, "grad_norm": 1.1151355504989624, "learning_rate": 0.00019426241159721823, "loss": 2.0604, "step": 555 }, { "epoch": 0.04162221847921696, "grad_norm": 1.6245709657669067, "learning_rate": 0.00019424154153708774, "loss": 2.3948, "step": 556 }, { "epoch": 0.04169707858439541, "grad_norm": 1.3898793458938599, "learning_rate": 0.00019422063471436372, "loss": 1.9934, "step": 557 }, { "epoch": 0.04177193868957386, "grad_norm": 1.3242236375808716, "learning_rate": 0.00019419969113720166, "loss": 2.5958, "step": 558 }, { "epoch": 0.0418467987947523, "grad_norm": 1.1097846031188965, "learning_rate": 0.0001941787108137715, "loss": 2.3058, "step": 559 }, { "epoch": 0.041921658899930755, "grad_norm": 1.2919846773147583, "learning_rate": 0.0001941576937522573, "loss": 2.2777, "step": 560 }, { "epoch": 0.0419965190051092, "grad_norm": 1.7579779624938965, "learning_rate": 0.00019413663996085774, "loss": 2.0203, "step": 561 }, { "epoch": 0.04207137911028765, "grad_norm": 1.595299482345581, "learning_rate": 0.00019411554944778555, "loss": 2.0732, "step": 562 }, { "epoch": 0.0421462392154661, "grad_norm": 1.179313063621521, "learning_rate": 0.00019409442222126795, "loss": 2.2482, "step": 563 }, { "epoch": 0.042221099320644544, "grad_norm": 1.2170641422271729, "learning_rate": 0.00019407325828954645, "loss": 2.1767, "step": 564 }, { "epoch": 0.042295959425822996, "grad_norm": 1.5022906064987183, "learning_rate": 0.0001940520576608769, "loss": 2.5879, "step": 565 }, { "epoch": 0.04237081953100144, "grad_norm": 1.0574220418930054, "learning_rate": 0.00019403082034352936, "loss": 1.8228, "step": 566 }, { "epoch": 0.04244567963617989, "grad_norm": 1.2174720764160156, "learning_rate": 0.0001940095463457883, "loss": 2.3392, "step": 567 }, { "epoch": 0.04252053974135834, "grad_norm": 1.295279860496521, "learning_rate": 0.00019398823567595252, "loss": 2.5105, "step": 568 }, { "epoch": 0.042595399846536784, "grad_norm": 1.0793440341949463, "learning_rate": 0.00019396688834233503, "loss": 2.3174, "step": 569 }, { "epoch": 0.04267025995171523, "grad_norm": 1.1784274578094482, "learning_rate": 0.0001939455043532632, "loss": 2.1564, "step": 570 }, { "epoch": 0.04274512005689368, "grad_norm": 1.195628046989441, "learning_rate": 0.00019392408371707867, "loss": 2.5584, "step": 571 }, { "epoch": 0.04281998016207213, "grad_norm": 1.2121009826660156, "learning_rate": 0.00019390262644213748, "loss": 1.9066, "step": 572 }, { "epoch": 0.04289484026725057, "grad_norm": 1.2886301279067993, "learning_rate": 0.00019388113253680978, "loss": 2.6557, "step": 573 }, { "epoch": 0.042969700372429025, "grad_norm": 1.3391215801239014, "learning_rate": 0.00019385960200948014, "loss": 2.3919, "step": 574 }, { "epoch": 0.04304456047760747, "grad_norm": 0.9712406396865845, "learning_rate": 0.0001938380348685474, "loss": 2.0389, "step": 575 }, { "epoch": 0.04311942058278592, "grad_norm": 1.2265775203704834, "learning_rate": 0.00019381643112242465, "loss": 2.4543, "step": 576 }, { "epoch": 0.04319428068796437, "grad_norm": 1.3094369173049927, "learning_rate": 0.0001937947907795393, "loss": 2.1408, "step": 577 }, { "epoch": 0.04326914079314281, "grad_norm": 1.0604076385498047, "learning_rate": 0.00019377311384833297, "loss": 2.1863, "step": 578 }, { "epoch": 0.043344000898321265, "grad_norm": 1.236689567565918, "learning_rate": 0.00019375140033726161, "loss": 2.3901, "step": 579 }, { "epoch": 0.04341886100349971, "grad_norm": 1.3264620304107666, "learning_rate": 0.00019372965025479544, "loss": 1.7781, "step": 580 }, { "epoch": 0.043493721108678156, "grad_norm": 1.2160911560058594, "learning_rate": 0.00019370786360941892, "loss": 2.2775, "step": 581 }, { "epoch": 0.04356858121385661, "grad_norm": 1.3799935579299927, "learning_rate": 0.00019368604040963074, "loss": 2.25, "step": 582 }, { "epoch": 0.04364344131903505, "grad_norm": 1.7321348190307617, "learning_rate": 0.00019366418066394395, "loss": 2.1123, "step": 583 }, { "epoch": 0.0437183014242135, "grad_norm": 1.2517540454864502, "learning_rate": 0.00019364228438088578, "loss": 2.4024, "step": 584 }, { "epoch": 0.04379316152939195, "grad_norm": 1.236885666847229, "learning_rate": 0.00019362035156899774, "loss": 2.3532, "step": 585 }, { "epoch": 0.043868021634570396, "grad_norm": 0.9702836871147156, "learning_rate": 0.00019359838223683554, "loss": 2.335, "step": 586 }, { "epoch": 0.04394288173974884, "grad_norm": 1.3432369232177734, "learning_rate": 0.00019357637639296922, "loss": 2.349, "step": 587 }, { "epoch": 0.044017741844927294, "grad_norm": 1.29338538646698, "learning_rate": 0.00019355433404598298, "loss": 2.1593, "step": 588 }, { "epoch": 0.04409260195010574, "grad_norm": 1.3046464920043945, "learning_rate": 0.00019353225520447534, "loss": 2.3822, "step": 589 }, { "epoch": 0.044167462055284185, "grad_norm": 2.642551898956299, "learning_rate": 0.00019351013987705897, "loss": 1.9985, "step": 590 }, { "epoch": 0.04424232216046264, "grad_norm": 1.1544456481933594, "learning_rate": 0.00019348798807236085, "loss": 2.2172, "step": 591 }, { "epoch": 0.04431718226564108, "grad_norm": 1.3899751901626587, "learning_rate": 0.00019346579979902216, "loss": 2.5518, "step": 592 }, { "epoch": 0.044392042370819534, "grad_norm": 1.0831021070480347, "learning_rate": 0.00019344357506569828, "loss": 2.3708, "step": 593 }, { "epoch": 0.04446690247599798, "grad_norm": 1.131564736366272, "learning_rate": 0.00019342131388105883, "loss": 1.6545, "step": 594 }, { "epoch": 0.044541762581176425, "grad_norm": 1.1236361265182495, "learning_rate": 0.0001933990162537877, "loss": 2.4647, "step": 595 }, { "epoch": 0.04461662268635488, "grad_norm": 1.1483279466629028, "learning_rate": 0.0001933766821925829, "loss": 1.9848, "step": 596 }, { "epoch": 0.04469148279153332, "grad_norm": 1.265212059020996, "learning_rate": 0.00019335431170615673, "loss": 2.354, "step": 597 }, { "epoch": 0.04476634289671177, "grad_norm": 1.1076369285583496, "learning_rate": 0.00019333190480323565, "loss": 2.075, "step": 598 }, { "epoch": 0.04484120300189022, "grad_norm": 1.1264278888702393, "learning_rate": 0.00019330946149256035, "loss": 1.7579, "step": 599 }, { "epoch": 0.044916063107068666, "grad_norm": 1.3229577541351318, "learning_rate": 0.0001932869817828858, "loss": 2.2405, "step": 600 }, { "epoch": 0.044916063107068666, "eval_loss": 2.2287044525146484, "eval_runtime": 178.8945, "eval_samples_per_second": 27.949, "eval_steps_per_second": 13.975, "step": 600 }, { "epoch": 0.04499092321224711, "grad_norm": 1.1161249876022339, "learning_rate": 0.00019326446568298093, "loss": 2.3864, "step": 601 }, { "epoch": 0.04506578331742556, "grad_norm": 1.0321053266525269, "learning_rate": 0.00019324191320162917, "loss": 2.3342, "step": 602 }, { "epoch": 0.04514064342260401, "grad_norm": 1.2200509309768677, "learning_rate": 0.0001932193243476279, "loss": 2.7247, "step": 603 }, { "epoch": 0.045215503527782454, "grad_norm": 1.33545982837677, "learning_rate": 0.00019319669912978886, "loss": 2.5579, "step": 604 }, { "epoch": 0.045290363632960906, "grad_norm": 1.2229349613189697, "learning_rate": 0.00019317403755693784, "loss": 2.5626, "step": 605 }, { "epoch": 0.04536522373813935, "grad_norm": 1.0876628160476685, "learning_rate": 0.00019315133963791493, "loss": 2.5016, "step": 606 }, { "epoch": 0.0454400838433178, "grad_norm": 1.2245584726333618, "learning_rate": 0.00019312860538157425, "loss": 2.4374, "step": 607 }, { "epoch": 0.04551494394849625, "grad_norm": 1.3413887023925781, "learning_rate": 0.00019310583479678427, "loss": 2.4058, "step": 608 }, { "epoch": 0.045589804053674694, "grad_norm": 1.0523784160614014, "learning_rate": 0.00019308302789242752, "loss": 2.301, "step": 609 }, { "epoch": 0.04566466415885315, "grad_norm": 1.1470948457717896, "learning_rate": 0.00019306018467740073, "loss": 2.3058, "step": 610 }, { "epoch": 0.04573952426403159, "grad_norm": 1.1628730297088623, "learning_rate": 0.00019303730516061476, "loss": 2.2115, "step": 611 }, { "epoch": 0.04581438436921004, "grad_norm": 1.2365148067474365, "learning_rate": 0.00019301438935099467, "loss": 2.6889, "step": 612 }, { "epoch": 0.04588924447438849, "grad_norm": 1.2645968198776245, "learning_rate": 0.0001929914372574797, "loss": 2.1917, "step": 613 }, { "epoch": 0.045964104579566935, "grad_norm": 1.0277384519577026, "learning_rate": 0.00019296844888902313, "loss": 2.287, "step": 614 }, { "epoch": 0.04603896468474538, "grad_norm": 1.0861396789550781, "learning_rate": 0.0001929454242545926, "loss": 2.7054, "step": 615 }, { "epoch": 0.04611382478992383, "grad_norm": 1.4511088132858276, "learning_rate": 0.00019292236336316965, "loss": 1.8374, "step": 616 }, { "epoch": 0.04618868489510228, "grad_norm": 1.3309178352355957, "learning_rate": 0.0001928992662237501, "loss": 2.5041, "step": 617 }, { "epoch": 0.04626354500028072, "grad_norm": 1.525281548500061, "learning_rate": 0.00019287613284534394, "loss": 2.2715, "step": 618 }, { "epoch": 0.046338405105459175, "grad_norm": 1.2317334413528442, "learning_rate": 0.00019285296323697524, "loss": 1.967, "step": 619 }, { "epoch": 0.04641326521063762, "grad_norm": 1.215374231338501, "learning_rate": 0.00019282975740768214, "loss": 2.5807, "step": 620 }, { "epoch": 0.046488125315816066, "grad_norm": 1.1588009595870972, "learning_rate": 0.00019280651536651708, "loss": 2.2171, "step": 621 }, { "epoch": 0.04656298542099452, "grad_norm": 1.3116604089736938, "learning_rate": 0.00019278323712254648, "loss": 2.2854, "step": 622 }, { "epoch": 0.046637845526172964, "grad_norm": 1.2940829992294312, "learning_rate": 0.0001927599226848509, "loss": 2.4546, "step": 623 }, { "epoch": 0.04671270563135141, "grad_norm": 1.2130218744277954, "learning_rate": 0.00019273657206252506, "loss": 2.5763, "step": 624 }, { "epoch": 0.04678756573652986, "grad_norm": 1.2413314580917358, "learning_rate": 0.00019271318526467782, "loss": 2.0135, "step": 625 }, { "epoch": 0.046862425841708306, "grad_norm": 1.3470066785812378, "learning_rate": 0.0001926897623004321, "loss": 2.7776, "step": 626 }, { "epoch": 0.04693728594688676, "grad_norm": 1.2281748056411743, "learning_rate": 0.00019266630317892492, "loss": 2.1949, "step": 627 }, { "epoch": 0.047012146052065204, "grad_norm": 1.4457086324691772, "learning_rate": 0.0001926428079093074, "loss": 2.0626, "step": 628 }, { "epoch": 0.04708700615724365, "grad_norm": 1.3364099264144897, "learning_rate": 0.0001926192765007449, "loss": 2.3358, "step": 629 }, { "epoch": 0.0471618662624221, "grad_norm": 1.0741970539093018, "learning_rate": 0.00019259570896241663, "loss": 2.2931, "step": 630 }, { "epoch": 0.04723672636760055, "grad_norm": 1.0890361070632935, "learning_rate": 0.0001925721053035161, "loss": 2.3251, "step": 631 }, { "epoch": 0.04731158647277899, "grad_norm": 1.9497711658477783, "learning_rate": 0.00019254846553325082, "loss": 2.2486, "step": 632 }, { "epoch": 0.047386446577957445, "grad_norm": 1.1234779357910156, "learning_rate": 0.00019252478966084245, "loss": 2.116, "step": 633 }, { "epoch": 0.04746130668313589, "grad_norm": 1.2495081424713135, "learning_rate": 0.00019250107769552664, "loss": 2.2078, "step": 634 }, { "epoch": 0.047536166788314335, "grad_norm": 1.1056605577468872, "learning_rate": 0.00019247732964655314, "loss": 1.814, "step": 635 }, { "epoch": 0.04761102689349279, "grad_norm": 1.009751319885254, "learning_rate": 0.00019245354552318585, "loss": 2.3401, "step": 636 }, { "epoch": 0.04768588699867123, "grad_norm": 1.1922568082809448, "learning_rate": 0.00019242972533470274, "loss": 2.3062, "step": 637 }, { "epoch": 0.04776074710384968, "grad_norm": 1.1205613613128662, "learning_rate": 0.00019240586909039572, "loss": 2.3892, "step": 638 }, { "epoch": 0.04783560720902813, "grad_norm": 1.2349501848220825, "learning_rate": 0.00019238197679957088, "loss": 2.3263, "step": 639 }, { "epoch": 0.047910467314206576, "grad_norm": 1.2847625017166138, "learning_rate": 0.00019235804847154837, "loss": 2.1823, "step": 640 }, { "epoch": 0.04798532741938502, "grad_norm": 1.1732560396194458, "learning_rate": 0.0001923340841156623, "loss": 2.1296, "step": 641 }, { "epoch": 0.04806018752456347, "grad_norm": 1.226536750793457, "learning_rate": 0.00019231008374126098, "loss": 2.4033, "step": 642 }, { "epoch": 0.04813504762974192, "grad_norm": 1.1116877794265747, "learning_rate": 0.00019228604735770666, "loss": 1.9766, "step": 643 }, { "epoch": 0.04820990773492037, "grad_norm": 1.2602949142456055, "learning_rate": 0.00019226197497437567, "loss": 2.5391, "step": 644 }, { "epoch": 0.048284767840098816, "grad_norm": 1.1244988441467285, "learning_rate": 0.00019223786660065836, "loss": 2.1708, "step": 645 }, { "epoch": 0.04835962794527726, "grad_norm": 1.1428700685501099, "learning_rate": 0.0001922137222459592, "loss": 2.3643, "step": 646 }, { "epoch": 0.048434488050455714, "grad_norm": 1.3677639961242676, "learning_rate": 0.0001921895419196966, "loss": 2.1705, "step": 647 }, { "epoch": 0.04850934815563416, "grad_norm": 1.1862021684646606, "learning_rate": 0.000192165325631303, "loss": 2.0187, "step": 648 }, { "epoch": 0.048584208260812604, "grad_norm": 1.2418513298034668, "learning_rate": 0.000192141073390225, "loss": 1.9575, "step": 649 }, { "epoch": 0.04865906836599106, "grad_norm": 1.1723636388778687, "learning_rate": 0.00019211678520592306, "loss": 2.3148, "step": 650 }, { "epoch": 0.0487339284711695, "grad_norm": 1.259885549545288, "learning_rate": 0.00019209246108787179, "loss": 1.9878, "step": 651 }, { "epoch": 0.04880878857634795, "grad_norm": 1.5932230949401855, "learning_rate": 0.0001920681010455597, "loss": 2.3717, "step": 652 }, { "epoch": 0.0488836486815264, "grad_norm": 1.223408579826355, "learning_rate": 0.00019204370508848944, "loss": 2.4985, "step": 653 }, { "epoch": 0.048958508786704845, "grad_norm": 1.1636751890182495, "learning_rate": 0.00019201927322617756, "loss": 2.5376, "step": 654 }, { "epoch": 0.04903336889188329, "grad_norm": 1.3671400547027588, "learning_rate": 0.00019199480546815468, "loss": 2.2701, "step": 655 }, { "epoch": 0.04910822899706174, "grad_norm": 1.3577011823654175, "learning_rate": 0.0001919703018239654, "loss": 2.361, "step": 656 }, { "epoch": 0.04918308910224019, "grad_norm": 1.2391608953475952, "learning_rate": 0.00019194576230316832, "loss": 1.9841, "step": 657 }, { "epoch": 0.04925794920741863, "grad_norm": 1.073201060295105, "learning_rate": 0.00019192118691533608, "loss": 2.1896, "step": 658 }, { "epoch": 0.049332809312597085, "grad_norm": 1.130515694618225, "learning_rate": 0.00019189657567005518, "loss": 2.1411, "step": 659 }, { "epoch": 0.04940766941777553, "grad_norm": 1.1481834650039673, "learning_rate": 0.0001918719285769263, "loss": 2.4301, "step": 660 }, { "epoch": 0.04948252952295398, "grad_norm": 1.0671634674072266, "learning_rate": 0.0001918472456455639, "loss": 2.4197, "step": 661 }, { "epoch": 0.04955738962813243, "grad_norm": 1.4411333799362183, "learning_rate": 0.00019182252688559662, "loss": 2.0861, "step": 662 }, { "epoch": 0.049632249733310874, "grad_norm": 1.4298112392425537, "learning_rate": 0.00019179777230666692, "loss": 2.3618, "step": 663 }, { "epoch": 0.049707109838489326, "grad_norm": 1.138014793395996, "learning_rate": 0.0001917729819184313, "loss": 2.0588, "step": 664 }, { "epoch": 0.04978196994366777, "grad_norm": 1.2633472681045532, "learning_rate": 0.00019174815573056022, "loss": 2.2448, "step": 665 }, { "epoch": 0.04985683004884622, "grad_norm": 1.0747766494750977, "learning_rate": 0.0001917232937527381, "loss": 1.8884, "step": 666 }, { "epoch": 0.04993169015402467, "grad_norm": 1.3060377836227417, "learning_rate": 0.00019169839599466332, "loss": 2.3694, "step": 667 }, { "epoch": 0.050006550259203114, "grad_norm": 1.1866399049758911, "learning_rate": 0.00019167346246604824, "loss": 2.5393, "step": 668 }, { "epoch": 0.05008141036438156, "grad_norm": 1.2555480003356934, "learning_rate": 0.00019164849317661914, "loss": 1.7024, "step": 669 }, { "epoch": 0.05015627046956001, "grad_norm": 1.0586023330688477, "learning_rate": 0.0001916234881361163, "loss": 2.435, "step": 670 }, { "epoch": 0.05023113057473846, "grad_norm": 1.393521785736084, "learning_rate": 0.0001915984473542939, "loss": 1.8916, "step": 671 }, { "epoch": 0.0503059906799169, "grad_norm": 1.191524863243103, "learning_rate": 0.00019157337084092001, "loss": 2.4755, "step": 672 }, { "epoch": 0.050380850785095355, "grad_norm": 1.1360992193222046, "learning_rate": 0.0001915482586057768, "loss": 2.0405, "step": 673 }, { "epoch": 0.0504557108902738, "grad_norm": 1.1968960762023926, "learning_rate": 0.00019152311065866022, "loss": 2.418, "step": 674 }, { "epoch": 0.050530570995452245, "grad_norm": 1.1982566118240356, "learning_rate": 0.00019149792700938023, "loss": 2.475, "step": 675 }, { "epoch": 0.0506054311006307, "grad_norm": 1.2948143482208252, "learning_rate": 0.00019147270766776067, "loss": 2.4138, "step": 676 }, { "epoch": 0.05068029120580914, "grad_norm": 1.1352835893630981, "learning_rate": 0.00019144745264363937, "loss": 1.9054, "step": 677 }, { "epoch": 0.050755151310987595, "grad_norm": 1.280592441558838, "learning_rate": 0.00019142216194686798, "loss": 2.456, "step": 678 }, { "epoch": 0.05083001141616604, "grad_norm": 1.1994556188583374, "learning_rate": 0.00019139683558731224, "loss": 2.3359, "step": 679 }, { "epoch": 0.050904871521344486, "grad_norm": 1.4238048791885376, "learning_rate": 0.00019137147357485153, "loss": 2.0533, "step": 680 }, { "epoch": 0.05097973162652294, "grad_norm": 1.1828192472457886, "learning_rate": 0.00019134607591937938, "loss": 2.4304, "step": 681 }, { "epoch": 0.05105459173170138, "grad_norm": 1.1854288578033447, "learning_rate": 0.00019132064263080315, "loss": 1.7512, "step": 682 }, { "epoch": 0.05112945183687983, "grad_norm": 1.2808656692504883, "learning_rate": 0.00019129517371904405, "loss": 2.1759, "step": 683 }, { "epoch": 0.05120431194205828, "grad_norm": 1.3971152305603027, "learning_rate": 0.00019126966919403725, "loss": 1.9858, "step": 684 }, { "epoch": 0.051279172047236726, "grad_norm": 1.254228949546814, "learning_rate": 0.00019124412906573176, "loss": 2.4846, "step": 685 }, { "epoch": 0.05135403215241517, "grad_norm": 1.340484857559204, "learning_rate": 0.00019121855334409054, "loss": 2.2098, "step": 686 }, { "epoch": 0.051428892257593624, "grad_norm": 1.2410305738449097, "learning_rate": 0.00019119294203909034, "loss": 2.2877, "step": 687 }, { "epoch": 0.05150375236277207, "grad_norm": 1.1043133735656738, "learning_rate": 0.0001911672951607219, "loss": 2.4142, "step": 688 }, { "epoch": 0.051578612467950515, "grad_norm": 1.3445990085601807, "learning_rate": 0.00019114161271898978, "loss": 1.8344, "step": 689 }, { "epoch": 0.05165347257312897, "grad_norm": 1.3722882270812988, "learning_rate": 0.00019111589472391242, "loss": 2.1076, "step": 690 }, { "epoch": 0.05172833267830741, "grad_norm": 1.0229235887527466, "learning_rate": 0.0001910901411855221, "loss": 2.2264, "step": 691 }, { "epoch": 0.05180319278348586, "grad_norm": 1.2345408201217651, "learning_rate": 0.00019106435211386504, "loss": 1.8923, "step": 692 }, { "epoch": 0.05187805288866431, "grad_norm": 1.2056362628936768, "learning_rate": 0.00019103852751900126, "loss": 2.6309, "step": 693 }, { "epoch": 0.051952912993842755, "grad_norm": 1.1532694101333618, "learning_rate": 0.00019101266741100462, "loss": 2.2045, "step": 694 }, { "epoch": 0.05202777309902121, "grad_norm": 1.2102175951004028, "learning_rate": 0.00019098677179996284, "loss": 2.4444, "step": 695 }, { "epoch": 0.05210263320419965, "grad_norm": 1.249040961265564, "learning_rate": 0.00019096084069597762, "loss": 1.9686, "step": 696 }, { "epoch": 0.0521774933093781, "grad_norm": 1.4783918857574463, "learning_rate": 0.00019093487410916433, "loss": 1.7653, "step": 697 }, { "epoch": 0.05225235341455655, "grad_norm": 1.21443772315979, "learning_rate": 0.00019090887204965224, "loss": 2.4092, "step": 698 }, { "epoch": 0.052327213519734996, "grad_norm": 1.2620652914047241, "learning_rate": 0.0001908828345275845, "loss": 2.373, "step": 699 }, { "epoch": 0.05240207362491344, "grad_norm": 1.204424500465393, "learning_rate": 0.00019085676155311806, "loss": 2.1443, "step": 700 }, { "epoch": 0.05247693373009189, "grad_norm": 1.1467736959457397, "learning_rate": 0.00019083065313642368, "loss": 2.3748, "step": 701 }, { "epoch": 0.05255179383527034, "grad_norm": 1.043575644493103, "learning_rate": 0.00019080450928768598, "loss": 2.1149, "step": 702 }, { "epoch": 0.052626653940448784, "grad_norm": 1.3456263542175293, "learning_rate": 0.00019077833001710342, "loss": 2.2759, "step": 703 }, { "epoch": 0.052701514045627236, "grad_norm": 1.252518653869629, "learning_rate": 0.00019075211533488816, "loss": 2.2549, "step": 704 }, { "epoch": 0.05277637415080568, "grad_norm": 1.1905314922332764, "learning_rate": 0.00019072586525126637, "loss": 2.3075, "step": 705 }, { "epoch": 0.05285123425598413, "grad_norm": 1.0985368490219116, "learning_rate": 0.00019069957977647784, "loss": 1.6553, "step": 706 }, { "epoch": 0.05292609436116258, "grad_norm": 1.3894082307815552, "learning_rate": 0.0001906732589207763, "loss": 1.7666, "step": 707 }, { "epoch": 0.053000954466341024, "grad_norm": 1.2872264385223389, "learning_rate": 0.00019064690269442918, "loss": 2.6719, "step": 708 }, { "epoch": 0.05307581457151947, "grad_norm": 1.3485358953475952, "learning_rate": 0.00019062051110771777, "loss": 2.2778, "step": 709 }, { "epoch": 0.05315067467669792, "grad_norm": 1.1572600603103638, "learning_rate": 0.00019059408417093719, "loss": 2.401, "step": 710 }, { "epoch": 0.05322553478187637, "grad_norm": 1.1708123683929443, "learning_rate": 0.00019056762189439622, "loss": 2.2491, "step": 711 }, { "epoch": 0.05330039488705482, "grad_norm": 1.3132355213165283, "learning_rate": 0.00019054112428841752, "loss": 2.3946, "step": 712 }, { "epoch": 0.053375254992233265, "grad_norm": 1.2061188220977783, "learning_rate": 0.00019051459136333756, "loss": 2.479, "step": 713 }, { "epoch": 0.05345011509741171, "grad_norm": 1.4653446674346924, "learning_rate": 0.0001904880231295065, "loss": 2.1558, "step": 714 }, { "epoch": 0.05352497520259016, "grad_norm": 1.2892643213272095, "learning_rate": 0.00019046141959728837, "loss": 2.577, "step": 715 }, { "epoch": 0.05359983530776861, "grad_norm": 1.2307772636413574, "learning_rate": 0.00019043478077706086, "loss": 2.3617, "step": 716 }, { "epoch": 0.05367469541294705, "grad_norm": 1.142544150352478, "learning_rate": 0.00019040810667921552, "loss": 1.8229, "step": 717 }, { "epoch": 0.053749555518125505, "grad_norm": 1.1202781200408936, "learning_rate": 0.00019038139731415759, "loss": 2.267, "step": 718 }, { "epoch": 0.05382441562330395, "grad_norm": 1.1692383289337158, "learning_rate": 0.0001903546526923061, "loss": 2.2526, "step": 719 }, { "epoch": 0.053899275728482396, "grad_norm": 0.9985339045524597, "learning_rate": 0.00019032787282409388, "loss": 1.5727, "step": 720 }, { "epoch": 0.05397413583366085, "grad_norm": 1.309064507484436, "learning_rate": 0.00019030105771996746, "loss": 1.6863, "step": 721 }, { "epoch": 0.054048995938839293, "grad_norm": 1.2613368034362793, "learning_rate": 0.00019027420739038705, "loss": 1.97, "step": 722 }, { "epoch": 0.05412385604401774, "grad_norm": 1.3536473512649536, "learning_rate": 0.00019024732184582673, "loss": 2.3795, "step": 723 }, { "epoch": 0.05419871614919619, "grad_norm": 1.2078039646148682, "learning_rate": 0.0001902204010967742, "loss": 1.8142, "step": 724 }, { "epoch": 0.054273576254374636, "grad_norm": 1.3381015062332153, "learning_rate": 0.00019019344515373106, "loss": 1.883, "step": 725 }, { "epoch": 0.05434843635955308, "grad_norm": 1.2348848581314087, "learning_rate": 0.00019016645402721241, "loss": 2.0622, "step": 726 }, { "epoch": 0.054423296464731534, "grad_norm": 1.3992066383361816, "learning_rate": 0.0001901394277277473, "loss": 2.5124, "step": 727 }, { "epoch": 0.05449815656990998, "grad_norm": 1.2793419361114502, "learning_rate": 0.00019011236626587826, "loss": 2.2859, "step": 728 }, { "epoch": 0.05457301667508843, "grad_norm": 1.2026153802871704, "learning_rate": 0.0001900852696521618, "loss": 1.8554, "step": 729 }, { "epoch": 0.05464787678026688, "grad_norm": 1.5070152282714844, "learning_rate": 0.00019005813789716794, "loss": 2.2481, "step": 730 }, { "epoch": 0.05472273688544532, "grad_norm": 0.9764329195022583, "learning_rate": 0.00019003097101148047, "loss": 2.0695, "step": 731 }, { "epoch": 0.054797596990623774, "grad_norm": 1.2726186513900757, "learning_rate": 0.00019000376900569696, "loss": 2.0438, "step": 732 }, { "epoch": 0.05487245709580222, "grad_norm": 1.1451269388198853, "learning_rate": 0.00018997653189042858, "loss": 1.5855, "step": 733 }, { "epoch": 0.054947317200980665, "grad_norm": 1.0834863185882568, "learning_rate": 0.0001899492596763002, "loss": 2.2945, "step": 734 }, { "epoch": 0.05502217730615912, "grad_norm": 1.2465964555740356, "learning_rate": 0.00018992195237395047, "loss": 1.9284, "step": 735 }, { "epoch": 0.05509703741133756, "grad_norm": 1.2208069562911987, "learning_rate": 0.00018989460999403163, "loss": 2.5616, "step": 736 }, { "epoch": 0.05517189751651601, "grad_norm": 1.2082844972610474, "learning_rate": 0.00018986723254720963, "loss": 2.2985, "step": 737 }, { "epoch": 0.05524675762169446, "grad_norm": 1.1029062271118164, "learning_rate": 0.00018983982004416416, "loss": 2.0476, "step": 738 }, { "epoch": 0.055321617726872906, "grad_norm": 1.3719749450683594, "learning_rate": 0.00018981237249558852, "loss": 2.5596, "step": 739 }, { "epoch": 0.05539647783205135, "grad_norm": 1.0563138723373413, "learning_rate": 0.00018978488991218965, "loss": 2.2583, "step": 740 }, { "epoch": 0.0554713379372298, "grad_norm": 1.2933984994888306, "learning_rate": 0.00018975737230468828, "loss": 1.6619, "step": 741 }, { "epoch": 0.05554619804240825, "grad_norm": 1.2005354166030884, "learning_rate": 0.00018972981968381868, "loss": 2.6399, "step": 742 }, { "epoch": 0.0556210581475867, "grad_norm": 1.1265065670013428, "learning_rate": 0.00018970223206032883, "loss": 1.8363, "step": 743 }, { "epoch": 0.055695918252765146, "grad_norm": 1.4750006198883057, "learning_rate": 0.00018967460944498038, "loss": 1.8479, "step": 744 }, { "epoch": 0.05577077835794359, "grad_norm": 1.1596392393112183, "learning_rate": 0.0001896469518485486, "loss": 2.1225, "step": 745 }, { "epoch": 0.055845638463122044, "grad_norm": 1.1398597955703735, "learning_rate": 0.00018961925928182241, "loss": 2.0601, "step": 746 }, { "epoch": 0.05592049856830049, "grad_norm": 1.1837871074676514, "learning_rate": 0.0001895915317556044, "loss": 2.2047, "step": 747 }, { "epoch": 0.055995358673478934, "grad_norm": 1.2946690320968628, "learning_rate": 0.0001895637692807107, "loss": 2.5553, "step": 748 }, { "epoch": 0.05607021877865739, "grad_norm": 1.248770833015442, "learning_rate": 0.00018953597186797128, "loss": 2.4617, "step": 749 }, { "epoch": 0.05614507888383583, "grad_norm": 1.3935346603393555, "learning_rate": 0.00018950813952822948, "loss": 2.7935, "step": 750 }, { "epoch": 0.05614507888383583, "eval_loss": 2.1982643604278564, "eval_runtime": 178.818, "eval_samples_per_second": 27.961, "eval_steps_per_second": 13.981, "step": 750 }, { "epoch": 0.05621993898901428, "grad_norm": 1.1593523025512695, "learning_rate": 0.00018948027227234243, "loss": 1.9123, "step": 751 }, { "epoch": 0.05629479909419273, "grad_norm": 1.1477103233337402, "learning_rate": 0.00018945237011118085, "loss": 2.3035, "step": 752 }, { "epoch": 0.056369659199371175, "grad_norm": 1.0828063488006592, "learning_rate": 0.0001894244330556291, "loss": 1.6694, "step": 753 }, { "epoch": 0.05644451930454962, "grad_norm": 1.2220417261123657, "learning_rate": 0.0001893964611165851, "loss": 2.024, "step": 754 }, { "epoch": 0.05651937940972807, "grad_norm": 1.3867909908294678, "learning_rate": 0.00018936845430496036, "loss": 2.4139, "step": 755 }, { "epoch": 0.05659423951490652, "grad_norm": 1.3341413736343384, "learning_rate": 0.00018934041263168005, "loss": 1.9361, "step": 756 }, { "epoch": 0.05666909962008496, "grad_norm": 1.2657465934753418, "learning_rate": 0.00018931233610768294, "loss": 2.2532, "step": 757 }, { "epoch": 0.056743959725263415, "grad_norm": 1.155652642250061, "learning_rate": 0.00018928422474392135, "loss": 2.0088, "step": 758 }, { "epoch": 0.05681881983044186, "grad_norm": 1.1862109899520874, "learning_rate": 0.00018925607855136122, "loss": 2.348, "step": 759 }, { "epoch": 0.05689367993562031, "grad_norm": 1.1282507181167603, "learning_rate": 0.00018922789754098208, "loss": 2.2234, "step": 760 }, { "epoch": 0.05696854004079876, "grad_norm": 1.4048036336898804, "learning_rate": 0.00018919968172377704, "loss": 2.2845, "step": 761 }, { "epoch": 0.057043400145977204, "grad_norm": 1.2710504531860352, "learning_rate": 0.00018917143111075278, "loss": 2.2695, "step": 762 }, { "epoch": 0.057118260251155656, "grad_norm": 1.0588970184326172, "learning_rate": 0.00018914314571292953, "loss": 2.1009, "step": 763 }, { "epoch": 0.0571931203563341, "grad_norm": 1.3131258487701416, "learning_rate": 0.00018911482554134114, "loss": 2.2263, "step": 764 }, { "epoch": 0.057267980461512547, "grad_norm": 1.2187931537628174, "learning_rate": 0.000189086470607035, "loss": 2.1133, "step": 765 }, { "epoch": 0.057342840566691, "grad_norm": 1.121682047843933, "learning_rate": 0.000189058080921072, "loss": 1.8137, "step": 766 }, { "epoch": 0.057417700671869444, "grad_norm": 1.1759824752807617, "learning_rate": 0.00018902965649452677, "loss": 1.9868, "step": 767 }, { "epoch": 0.05749256077704789, "grad_norm": 1.2275068759918213, "learning_rate": 0.00018900119733848724, "loss": 2.6469, "step": 768 }, { "epoch": 0.05756742088222634, "grad_norm": 1.175705909729004, "learning_rate": 0.00018897270346405508, "loss": 2.2016, "step": 769 }, { "epoch": 0.05764228098740479, "grad_norm": 1.1492919921875, "learning_rate": 0.00018894417488234545, "loss": 2.2103, "step": 770 }, { "epoch": 0.05771714109258323, "grad_norm": 1.4248522520065308, "learning_rate": 0.00018891561160448697, "loss": 2.5743, "step": 771 }, { "epoch": 0.057792001197761685, "grad_norm": 1.112351894378662, "learning_rate": 0.00018888701364162195, "loss": 2.0211, "step": 772 }, { "epoch": 0.05786686130294013, "grad_norm": 1.399417757987976, "learning_rate": 0.00018885838100490606, "loss": 2.0125, "step": 773 }, { "epoch": 0.057941721408118575, "grad_norm": 1.1962887048721313, "learning_rate": 0.00018882971370550865, "loss": 1.7984, "step": 774 }, { "epoch": 0.05801658151329703, "grad_norm": 1.6665276288986206, "learning_rate": 0.00018880101175461247, "loss": 2.3814, "step": 775 }, { "epoch": 0.05809144161847547, "grad_norm": 1.2610100507736206, "learning_rate": 0.00018877227516341384, "loss": 2.1055, "step": 776 }, { "epoch": 0.058166301723653925, "grad_norm": 1.30479097366333, "learning_rate": 0.00018874350394312264, "loss": 2.2335, "step": 777 }, { "epoch": 0.05824116182883237, "grad_norm": 1.2403790950775146, "learning_rate": 0.0001887146981049621, "loss": 2.2158, "step": 778 }, { "epoch": 0.058316021934010816, "grad_norm": 1.5346125364303589, "learning_rate": 0.0001886858576601692, "loss": 2.2443, "step": 779 }, { "epoch": 0.05839088203918927, "grad_norm": 1.0900819301605225, "learning_rate": 0.00018865698261999418, "loss": 2.0418, "step": 780 }, { "epoch": 0.05846574214436771, "grad_norm": 1.1484243869781494, "learning_rate": 0.00018862807299570094, "loss": 2.2057, "step": 781 }, { "epoch": 0.05854060224954616, "grad_norm": 1.547903299331665, "learning_rate": 0.00018859912879856677, "loss": 2.0137, "step": 782 }, { "epoch": 0.05861546235472461, "grad_norm": 1.0916261672973633, "learning_rate": 0.00018857015003988245, "loss": 1.4612, "step": 783 }, { "epoch": 0.058690322459903056, "grad_norm": 1.2107443809509277, "learning_rate": 0.00018854113673095238, "loss": 2.0705, "step": 784 }, { "epoch": 0.0587651825650815, "grad_norm": 1.430538535118103, "learning_rate": 0.00018851208888309424, "loss": 2.374, "step": 785 }, { "epoch": 0.058840042670259954, "grad_norm": 1.1731773614883423, "learning_rate": 0.0001884830065076393, "loss": 2.0904, "step": 786 }, { "epoch": 0.0589149027754384, "grad_norm": 1.3263612985610962, "learning_rate": 0.00018845388961593232, "loss": 2.0207, "step": 787 }, { "epoch": 0.058989762880616844, "grad_norm": 1.1866116523742676, "learning_rate": 0.00018842473821933142, "loss": 2.084, "step": 788 }, { "epoch": 0.0590646229857953, "grad_norm": 1.029174566268921, "learning_rate": 0.0001883955523292083, "loss": 1.8752, "step": 789 }, { "epoch": 0.05913948309097374, "grad_norm": 1.089242696762085, "learning_rate": 0.00018836633195694797, "loss": 1.9765, "step": 790 }, { "epoch": 0.05921434319615219, "grad_norm": 1.3189738988876343, "learning_rate": 0.00018833707711394908, "loss": 2.4845, "step": 791 }, { "epoch": 0.05928920330133064, "grad_norm": 1.4870893955230713, "learning_rate": 0.00018830778781162358, "loss": 2.4273, "step": 792 }, { "epoch": 0.059364063406509085, "grad_norm": 1.258448839187622, "learning_rate": 0.00018827846406139687, "loss": 2.3256, "step": 793 }, { "epoch": 0.05943892351168754, "grad_norm": 1.0760866403579712, "learning_rate": 0.00018824910587470788, "loss": 1.8448, "step": 794 }, { "epoch": 0.05951378361686598, "grad_norm": 1.170649766921997, "learning_rate": 0.0001882197132630089, "loss": 2.3305, "step": 795 }, { "epoch": 0.05958864372204443, "grad_norm": 1.335707187652588, "learning_rate": 0.0001881902862377656, "loss": 2.1149, "step": 796 }, { "epoch": 0.05966350382722288, "grad_norm": 1.26485013961792, "learning_rate": 0.00018816082481045728, "loss": 2.4134, "step": 797 }, { "epoch": 0.059738363932401325, "grad_norm": 1.1241893768310547, "learning_rate": 0.0001881313289925764, "loss": 2.3829, "step": 798 }, { "epoch": 0.05981322403757977, "grad_norm": 1.1361337900161743, "learning_rate": 0.00018810179879562901, "loss": 2.1224, "step": 799 }, { "epoch": 0.05988808414275822, "grad_norm": 1.1942558288574219, "learning_rate": 0.0001880722342311345, "loss": 2.642, "step": 800 }, { "epoch": 0.05996294424793667, "grad_norm": 1.1899267435073853, "learning_rate": 0.00018804263531062567, "loss": 2.2625, "step": 801 }, { "epoch": 0.060037804353115114, "grad_norm": 1.2864280939102173, "learning_rate": 0.00018801300204564876, "loss": 2.2961, "step": 802 }, { "epoch": 0.060112664458293566, "grad_norm": 1.1968439817428589, "learning_rate": 0.00018798333444776337, "loss": 2.0296, "step": 803 }, { "epoch": 0.06018752456347201, "grad_norm": 1.750536561012268, "learning_rate": 0.00018795363252854252, "loss": 2.0112, "step": 804 }, { "epoch": 0.06026238466865046, "grad_norm": 1.155773639678955, "learning_rate": 0.0001879238962995726, "loss": 2.1658, "step": 805 }, { "epoch": 0.06033724477382891, "grad_norm": 1.1404871940612793, "learning_rate": 0.00018789412577245332, "loss": 2.0282, "step": 806 }, { "epoch": 0.060412104879007354, "grad_norm": 1.1371667385101318, "learning_rate": 0.00018786432095879794, "loss": 2.225, "step": 807 }, { "epoch": 0.0604869649841858, "grad_norm": 1.20431387424469, "learning_rate": 0.00018783448187023293, "loss": 2.4727, "step": 808 }, { "epoch": 0.06056182508936425, "grad_norm": 0.958070695400238, "learning_rate": 0.00018780460851839826, "loss": 2.1115, "step": 809 }, { "epoch": 0.0606366851945427, "grad_norm": 1.3852531909942627, "learning_rate": 0.00018777470091494708, "loss": 2.2048, "step": 810 }, { "epoch": 0.06071154529972115, "grad_norm": 1.0525598526000977, "learning_rate": 0.00018774475907154612, "loss": 2.2602, "step": 811 }, { "epoch": 0.060786405404899595, "grad_norm": 1.235201358795166, "learning_rate": 0.00018771478299987532, "loss": 2.2392, "step": 812 }, { "epoch": 0.06086126551007804, "grad_norm": 1.204785704612732, "learning_rate": 0.000187684772711628, "loss": 2.2966, "step": 813 }, { "epoch": 0.06093612561525649, "grad_norm": 1.1204441785812378, "learning_rate": 0.0001876547282185109, "loss": 2.0208, "step": 814 }, { "epoch": 0.06101098572043494, "grad_norm": 1.1778594255447388, "learning_rate": 0.00018762464953224402, "loss": 2.1342, "step": 815 }, { "epoch": 0.06108584582561338, "grad_norm": 1.047898292541504, "learning_rate": 0.0001875945366645607, "loss": 2.2846, "step": 816 }, { "epoch": 0.061160705930791835, "grad_norm": 1.4196580648422241, "learning_rate": 0.00018756438962720773, "loss": 2.5605, "step": 817 }, { "epoch": 0.06123556603597028, "grad_norm": 1.3268249034881592, "learning_rate": 0.000187534208431945, "loss": 2.6385, "step": 818 }, { "epoch": 0.061310426141148726, "grad_norm": 1.1722865104675293, "learning_rate": 0.000187503993090546, "loss": 2.35, "step": 819 }, { "epoch": 0.06138528624632718, "grad_norm": 1.1922276020050049, "learning_rate": 0.0001874737436147973, "loss": 2.4043, "step": 820 }, { "epoch": 0.06146014635150562, "grad_norm": 1.289080023765564, "learning_rate": 0.00018744346001649893, "loss": 2.2684, "step": 821 }, { "epoch": 0.06153500645668407, "grad_norm": 1.5033873319625854, "learning_rate": 0.00018741314230746422, "loss": 2.0636, "step": 822 }, { "epoch": 0.06160986656186252, "grad_norm": 1.2325069904327393, "learning_rate": 0.00018738279049951975, "loss": 1.9422, "step": 823 }, { "epoch": 0.061684726667040966, "grad_norm": 1.3350704908370972, "learning_rate": 0.0001873524046045054, "loss": 2.3814, "step": 824 }, { "epoch": 0.06175958677221941, "grad_norm": 1.3679190874099731, "learning_rate": 0.00018732198463427443, "loss": 2.3143, "step": 825 }, { "epoch": 0.061834446877397864, "grad_norm": 1.1092946529388428, "learning_rate": 0.00018729153060069326, "loss": 2.0319, "step": 826 }, { "epoch": 0.06190930698257631, "grad_norm": 1.1506335735321045, "learning_rate": 0.00018726104251564176, "loss": 1.8725, "step": 827 }, { "epoch": 0.06198416708775476, "grad_norm": 1.387829303741455, "learning_rate": 0.00018723052039101298, "loss": 2.0782, "step": 828 }, { "epoch": 0.06205902719293321, "grad_norm": 1.208820104598999, "learning_rate": 0.00018719996423871322, "loss": 2.0383, "step": 829 }, { "epoch": 0.06213388729811165, "grad_norm": 1.2511824369430542, "learning_rate": 0.00018716937407066214, "loss": 1.9442, "step": 830 }, { "epoch": 0.062208747403290104, "grad_norm": 1.1407896280288696, "learning_rate": 0.0001871387498987926, "loss": 2.554, "step": 831 }, { "epoch": 0.06228360750846855, "grad_norm": 1.1065419912338257, "learning_rate": 0.0001871080917350508, "loss": 1.9736, "step": 832 }, { "epoch": 0.062358467613646995, "grad_norm": 1.1847344636917114, "learning_rate": 0.0001870773995913961, "loss": 2.143, "step": 833 }, { "epoch": 0.06243332771882545, "grad_norm": 1.0570135116577148, "learning_rate": 0.00018704667347980124, "loss": 2.3263, "step": 834 }, { "epoch": 0.0625081878240039, "grad_norm": 1.2136785984039307, "learning_rate": 0.00018701591341225204, "loss": 1.9186, "step": 835 }, { "epoch": 0.06258304792918234, "grad_norm": 1.085980772972107, "learning_rate": 0.00018698511940074775, "loss": 2.1471, "step": 836 }, { "epoch": 0.06265790803436079, "grad_norm": 1.2652573585510254, "learning_rate": 0.00018695429145730076, "loss": 2.4356, "step": 837 }, { "epoch": 0.06273276813953924, "grad_norm": 1.2485424280166626, "learning_rate": 0.00018692342959393668, "loss": 2.361, "step": 838 }, { "epoch": 0.06280762824471768, "grad_norm": 1.1969283819198608, "learning_rate": 0.0001868925338226944, "loss": 2.4953, "step": 839 }, { "epoch": 0.06288248834989613, "grad_norm": 1.4092187881469727, "learning_rate": 0.00018686160415562605, "loss": 1.8672, "step": 840 }, { "epoch": 0.06295734845507459, "grad_norm": 1.2099794149398804, "learning_rate": 0.0001868306406047969, "loss": 2.3915, "step": 841 }, { "epoch": 0.06303220856025303, "grad_norm": 1.019537329673767, "learning_rate": 0.0001867996431822856, "loss": 2.1223, "step": 842 }, { "epoch": 0.06310706866543148, "grad_norm": 1.1257330179214478, "learning_rate": 0.00018676861190018376, "loss": 2.5125, "step": 843 }, { "epoch": 0.06318192877060992, "grad_norm": 1.0478541851043701, "learning_rate": 0.00018673754677059643, "loss": 2.2037, "step": 844 }, { "epoch": 0.06325678887578837, "grad_norm": 1.0717214345932007, "learning_rate": 0.00018670644780564177, "loss": 2.1757, "step": 845 }, { "epoch": 0.06333164898096681, "grad_norm": 1.274664282798767, "learning_rate": 0.00018667531501745111, "loss": 2.6257, "step": 846 }, { "epoch": 0.06340650908614527, "grad_norm": 1.1436635255813599, "learning_rate": 0.00018664414841816908, "loss": 1.9974, "step": 847 }, { "epoch": 0.06348136919132372, "grad_norm": 1.0967286825180054, "learning_rate": 0.00018661294801995335, "loss": 1.8491, "step": 848 }, { "epoch": 0.06355622929650216, "grad_norm": 1.3625829219818115, "learning_rate": 0.0001865817138349749, "loss": 2.3344, "step": 849 }, { "epoch": 0.06363108940168061, "grad_norm": 1.0455329418182373, "learning_rate": 0.00018655044587541784, "loss": 2.1761, "step": 850 }, { "epoch": 0.06370594950685905, "grad_norm": 1.4849685430526733, "learning_rate": 0.00018651914415347944, "loss": 1.9665, "step": 851 }, { "epoch": 0.06378080961203751, "grad_norm": 1.3808530569076538, "learning_rate": 0.00018648780868137018, "loss": 2.3054, "step": 852 }, { "epoch": 0.06385566971721596, "grad_norm": 1.378683090209961, "learning_rate": 0.00018645643947131366, "loss": 1.9816, "step": 853 }, { "epoch": 0.0639305298223944, "grad_norm": 1.2171995639801025, "learning_rate": 0.00018642503653554671, "loss": 2.1681, "step": 854 }, { "epoch": 0.06400538992757285, "grad_norm": 1.082524061203003, "learning_rate": 0.00018639359988631922, "loss": 2.1474, "step": 855 }, { "epoch": 0.06408025003275129, "grad_norm": 1.197901964187622, "learning_rate": 0.0001863621295358943, "loss": 1.6375, "step": 856 }, { "epoch": 0.06415511013792974, "grad_norm": 1.1498801708221436, "learning_rate": 0.00018633062549654818, "loss": 1.9129, "step": 857 }, { "epoch": 0.0642299702431082, "grad_norm": 1.351519227027893, "learning_rate": 0.00018629908778057031, "loss": 1.9342, "step": 858 }, { "epoch": 0.06430483034828664, "grad_norm": 1.3530468940734863, "learning_rate": 0.0001862675164002631, "loss": 2.2107, "step": 859 }, { "epoch": 0.06437969045346509, "grad_norm": 1.4251513481140137, "learning_rate": 0.00018623591136794228, "loss": 2.5275, "step": 860 }, { "epoch": 0.06445455055864353, "grad_norm": 1.190288782119751, "learning_rate": 0.0001862042726959366, "loss": 1.8804, "step": 861 }, { "epoch": 0.06452941066382198, "grad_norm": 1.5285230875015259, "learning_rate": 0.00018617260039658793, "loss": 2.3065, "step": 862 }, { "epoch": 0.06460427076900042, "grad_norm": 1.3687454462051392, "learning_rate": 0.00018614089448225132, "loss": 2.3249, "step": 863 }, { "epoch": 0.06467913087417888, "grad_norm": 1.2103374004364014, "learning_rate": 0.0001861091549652949, "loss": 1.9258, "step": 864 }, { "epoch": 0.06475399097935733, "grad_norm": 1.2658427953720093, "learning_rate": 0.00018607738185809992, "loss": 2.3899, "step": 865 }, { "epoch": 0.06482885108453577, "grad_norm": 1.4169390201568604, "learning_rate": 0.0001860455751730607, "loss": 2.6455, "step": 866 }, { "epoch": 0.06490371118971422, "grad_norm": 1.1437957286834717, "learning_rate": 0.00018601373492258469, "loss": 1.7896, "step": 867 }, { "epoch": 0.06497857129489266, "grad_norm": 1.3790591955184937, "learning_rate": 0.0001859818611190924, "loss": 2.2316, "step": 868 }, { "epoch": 0.06505343140007112, "grad_norm": 1.2310020923614502, "learning_rate": 0.00018594995377501747, "loss": 1.8661, "step": 869 }, { "epoch": 0.06512829150524957, "grad_norm": 1.518387794494629, "learning_rate": 0.00018591801290280665, "loss": 2.4408, "step": 870 }, { "epoch": 0.06520315161042801, "grad_norm": 1.3249173164367676, "learning_rate": 0.00018588603851491962, "loss": 2.4774, "step": 871 }, { "epoch": 0.06527801171560646, "grad_norm": 1.5034116506576538, "learning_rate": 0.00018585403062382932, "loss": 1.9053, "step": 872 }, { "epoch": 0.0653528718207849, "grad_norm": 1.285714864730835, "learning_rate": 0.00018582198924202165, "loss": 2.3669, "step": 873 }, { "epoch": 0.06542773192596335, "grad_norm": 1.2789112329483032, "learning_rate": 0.0001857899143819956, "loss": 2.522, "step": 874 }, { "epoch": 0.06550259203114181, "grad_norm": 1.2320350408554077, "learning_rate": 0.00018575780605626326, "loss": 1.9605, "step": 875 }, { "epoch": 0.06557745213632026, "grad_norm": 1.3942331075668335, "learning_rate": 0.0001857256642773497, "loss": 2.4424, "step": 876 }, { "epoch": 0.0656523122414987, "grad_norm": 1.268656849861145, "learning_rate": 0.00018569348905779308, "loss": 2.3636, "step": 877 }, { "epoch": 0.06572717234667715, "grad_norm": 1.2299535274505615, "learning_rate": 0.0001856612804101446, "loss": 2.2573, "step": 878 }, { "epoch": 0.06580203245185559, "grad_norm": 1.2484804391860962, "learning_rate": 0.00018562903834696848, "loss": 2.4244, "step": 879 }, { "epoch": 0.06587689255703404, "grad_norm": 1.1802738904953003, "learning_rate": 0.00018559676288084208, "loss": 2.5082, "step": 880 }, { "epoch": 0.0659517526622125, "grad_norm": 1.1920825242996216, "learning_rate": 0.0001855644540243556, "loss": 2.5453, "step": 881 }, { "epoch": 0.06602661276739094, "grad_norm": 1.3287335634231567, "learning_rate": 0.0001855321117901124, "loss": 2.0743, "step": 882 }, { "epoch": 0.06610147287256939, "grad_norm": 1.2227089405059814, "learning_rate": 0.00018549973619072887, "loss": 2.1053, "step": 883 }, { "epoch": 0.06617633297774783, "grad_norm": 1.2104471921920776, "learning_rate": 0.00018546732723883437, "loss": 2.1825, "step": 884 }, { "epoch": 0.06625119308292628, "grad_norm": 1.2355787754058838, "learning_rate": 0.00018543488494707124, "loss": 2.5263, "step": 885 }, { "epoch": 0.06632605318810474, "grad_norm": 1.1577972173690796, "learning_rate": 0.00018540240932809492, "loss": 1.9219, "step": 886 }, { "epoch": 0.06640091329328318, "grad_norm": 1.2734469175338745, "learning_rate": 0.00018536990039457374, "loss": 2.2034, "step": 887 }, { "epoch": 0.06647577339846163, "grad_norm": 1.4241886138916016, "learning_rate": 0.00018533735815918907, "loss": 1.7542, "step": 888 }, { "epoch": 0.06655063350364007, "grad_norm": 1.1562018394470215, "learning_rate": 0.00018530478263463534, "loss": 2.5518, "step": 889 }, { "epoch": 0.06662549360881852, "grad_norm": 1.1827027797698975, "learning_rate": 0.0001852721738336199, "loss": 2.4357, "step": 890 }, { "epoch": 0.06670035371399696, "grad_norm": 1.1920583248138428, "learning_rate": 0.00018523953176886299, "loss": 2.2037, "step": 891 }, { "epoch": 0.06677521381917542, "grad_norm": 1.0795623064041138, "learning_rate": 0.00018520685645309805, "loss": 2.6764, "step": 892 }, { "epoch": 0.06685007392435387, "grad_norm": 1.3171135187149048, "learning_rate": 0.00018517414789907127, "loss": 1.9301, "step": 893 }, { "epoch": 0.06692493402953231, "grad_norm": 1.2379522323608398, "learning_rate": 0.0001851414061195419, "loss": 1.9434, "step": 894 }, { "epoch": 0.06699979413471076, "grad_norm": 1.2157772779464722, "learning_rate": 0.00018510863112728226, "loss": 2.2342, "step": 895 }, { "epoch": 0.0670746542398892, "grad_norm": 1.2101303339004517, "learning_rate": 0.00018507582293507738, "loss": 2.1716, "step": 896 }, { "epoch": 0.06714951434506765, "grad_norm": 1.0535258054733276, "learning_rate": 0.0001850429815557255, "loss": 2.1582, "step": 897 }, { "epoch": 0.06722437445024611, "grad_norm": 1.189684510231018, "learning_rate": 0.0001850101070020376, "loss": 2.4851, "step": 898 }, { "epoch": 0.06729923455542455, "grad_norm": 1.1361318826675415, "learning_rate": 0.00018497719928683764, "loss": 2.6563, "step": 899 }, { "epoch": 0.067374094660603, "grad_norm": 1.0654137134552002, "learning_rate": 0.00018494425842296267, "loss": 2.1035, "step": 900 }, { "epoch": 0.067374094660603, "eval_loss": 2.186244249343872, "eval_runtime": 178.7621, "eval_samples_per_second": 27.97, "eval_steps_per_second": 13.985, "step": 900 }, { "epoch": 0.06744895476578144, "grad_norm": 1.3717975616455078, "learning_rate": 0.0001849112844232625, "loss": 2.4376, "step": 901 }, { "epoch": 0.06752381487095989, "grad_norm": 1.1404104232788086, "learning_rate": 0.00018487827730059994, "loss": 2.4924, "step": 902 }, { "epoch": 0.06759867497613835, "grad_norm": 1.1757146120071411, "learning_rate": 0.0001848452370678507, "loss": 2.1425, "step": 903 }, { "epoch": 0.0676735350813168, "grad_norm": 1.1435543298721313, "learning_rate": 0.0001848121637379034, "loss": 2.3571, "step": 904 }, { "epoch": 0.06774839518649524, "grad_norm": 1.1750966310501099, "learning_rate": 0.00018477905732365967, "loss": 2.6164, "step": 905 }, { "epoch": 0.06782325529167368, "grad_norm": 1.234067678451538, "learning_rate": 0.00018474591783803384, "loss": 2.0085, "step": 906 }, { "epoch": 0.06789811539685213, "grad_norm": 1.227264642715454, "learning_rate": 0.0001847127452939533, "loss": 2.51, "step": 907 }, { "epoch": 0.06797297550203057, "grad_norm": 1.5205940008163452, "learning_rate": 0.0001846795397043583, "loss": 2.2131, "step": 908 }, { "epoch": 0.06804783560720903, "grad_norm": 1.3490185737609863, "learning_rate": 0.000184646301082202, "loss": 2.2282, "step": 909 }, { "epoch": 0.06812269571238748, "grad_norm": 1.1118682622909546, "learning_rate": 0.00018461302944045046, "loss": 2.3033, "step": 910 }, { "epoch": 0.06819755581756592, "grad_norm": 1.3562675714492798, "learning_rate": 0.00018457972479208245, "loss": 2.1656, "step": 911 }, { "epoch": 0.06827241592274437, "grad_norm": 0.9638820290565491, "learning_rate": 0.00018454638715008988, "loss": 2.2423, "step": 912 }, { "epoch": 0.06834727602792282, "grad_norm": 1.5052152872085571, "learning_rate": 0.00018451301652747733, "loss": 2.7497, "step": 913 }, { "epoch": 0.06842213613310126, "grad_norm": 1.32514488697052, "learning_rate": 0.00018447961293726234, "loss": 2.2437, "step": 914 }, { "epoch": 0.06849699623827972, "grad_norm": 1.047245979309082, "learning_rate": 0.00018444617639247528, "loss": 2.1586, "step": 915 }, { "epoch": 0.06857185634345817, "grad_norm": 1.2893370389938354, "learning_rate": 0.00018441270690615938, "loss": 1.9714, "step": 916 }, { "epoch": 0.06864671644863661, "grad_norm": 1.3863189220428467, "learning_rate": 0.0001843792044913707, "loss": 2.0803, "step": 917 }, { "epoch": 0.06872157655381506, "grad_norm": 1.3875479698181152, "learning_rate": 0.0001843456691611782, "loss": 2.3664, "step": 918 }, { "epoch": 0.0687964366589935, "grad_norm": 1.1541324853897095, "learning_rate": 0.0001843121009286636, "loss": 1.8506, "step": 919 }, { "epoch": 0.06887129676417196, "grad_norm": 1.3047635555267334, "learning_rate": 0.00018427849980692156, "loss": 2.3397, "step": 920 }, { "epoch": 0.0689461568693504, "grad_norm": 1.6218945980072021, "learning_rate": 0.00018424486580905948, "loss": 2.5606, "step": 921 }, { "epoch": 0.06902101697452885, "grad_norm": 1.350576400756836, "learning_rate": 0.00018421119894819759, "loss": 2.1833, "step": 922 }, { "epoch": 0.0690958770797073, "grad_norm": 1.2078466415405273, "learning_rate": 0.000184177499237469, "loss": 2.2882, "step": 923 }, { "epoch": 0.06917073718488574, "grad_norm": 1.3808624744415283, "learning_rate": 0.00018414376669001955, "loss": 2.4157, "step": 924 }, { "epoch": 0.06924559729006419, "grad_norm": 1.2150079011917114, "learning_rate": 0.00018411000131900802, "loss": 2.1026, "step": 925 }, { "epoch": 0.06932045739524265, "grad_norm": 1.2063310146331787, "learning_rate": 0.00018407620313760586, "loss": 2.5928, "step": 926 }, { "epoch": 0.06939531750042109, "grad_norm": 1.2696150541305542, "learning_rate": 0.00018404237215899737, "loss": 2.4695, "step": 927 }, { "epoch": 0.06947017760559954, "grad_norm": 1.335753321647644, "learning_rate": 0.00018400850839637962, "loss": 2.0589, "step": 928 }, { "epoch": 0.06954503771077798, "grad_norm": 1.1516119241714478, "learning_rate": 0.00018397461186296253, "loss": 2.5417, "step": 929 }, { "epoch": 0.06961989781595643, "grad_norm": 1.1250559091567993, "learning_rate": 0.00018394068257196876, "loss": 2.3662, "step": 930 }, { "epoch": 0.06969475792113487, "grad_norm": 1.3411784172058105, "learning_rate": 0.00018390672053663376, "loss": 1.9465, "step": 931 }, { "epoch": 0.06976961802631333, "grad_norm": 1.061346173286438, "learning_rate": 0.00018387272577020572, "loss": 1.9858, "step": 932 }, { "epoch": 0.06984447813149178, "grad_norm": 1.2498948574066162, "learning_rate": 0.0001838386982859457, "loss": 2.5586, "step": 933 }, { "epoch": 0.06991933823667022, "grad_norm": 1.4810079336166382, "learning_rate": 0.00018380463809712736, "loss": 2.0685, "step": 934 }, { "epoch": 0.06999419834184867, "grad_norm": 1.2756901979446411, "learning_rate": 0.00018377054521703722, "loss": 2.0652, "step": 935 }, { "epoch": 0.07006905844702711, "grad_norm": 1.5456281900405884, "learning_rate": 0.00018373641965897458, "loss": 2.1139, "step": 936 }, { "epoch": 0.07014391855220557, "grad_norm": 1.5689592361450195, "learning_rate": 0.00018370226143625144, "loss": 2.448, "step": 937 }, { "epoch": 0.07021877865738402, "grad_norm": 1.445564866065979, "learning_rate": 0.00018366807056219254, "loss": 2.3749, "step": 938 }, { "epoch": 0.07029363876256246, "grad_norm": 1.281229019165039, "learning_rate": 0.00018363384705013535, "loss": 2.432, "step": 939 }, { "epoch": 0.07036849886774091, "grad_norm": 1.190098762512207, "learning_rate": 0.00018359959091343011, "loss": 2.4082, "step": 940 }, { "epoch": 0.07044335897291935, "grad_norm": 1.1866652965545654, "learning_rate": 0.00018356530216543976, "loss": 2.4427, "step": 941 }, { "epoch": 0.0705182190780978, "grad_norm": 1.338909387588501, "learning_rate": 0.00018353098081953995, "loss": 2.2169, "step": 942 }, { "epoch": 0.07059307918327626, "grad_norm": 1.2736167907714844, "learning_rate": 0.00018349662688911907, "loss": 1.8953, "step": 943 }, { "epoch": 0.0706679392884547, "grad_norm": 1.0369459390640259, "learning_rate": 0.0001834622403875782, "loss": 1.9191, "step": 944 }, { "epoch": 0.07074279939363315, "grad_norm": 1.2815704345703125, "learning_rate": 0.0001834278213283312, "loss": 2.4275, "step": 945 }, { "epoch": 0.0708176594988116, "grad_norm": 1.4540680646896362, "learning_rate": 0.00018339336972480447, "loss": 1.6981, "step": 946 }, { "epoch": 0.07089251960399004, "grad_norm": 1.2404223680496216, "learning_rate": 0.00018335888559043725, "loss": 1.9991, "step": 947 }, { "epoch": 0.07096737970916848, "grad_norm": 1.9970414638519287, "learning_rate": 0.00018332436893868143, "loss": 2.1209, "step": 948 }, { "epoch": 0.07104223981434694, "grad_norm": 1.3351224660873413, "learning_rate": 0.00018328981978300157, "loss": 1.6881, "step": 949 }, { "epoch": 0.07111709991952539, "grad_norm": 1.2341234683990479, "learning_rate": 0.00018325523813687494, "loss": 1.8769, "step": 950 }, { "epoch": 0.07119196002470383, "grad_norm": 1.1785449981689453, "learning_rate": 0.00018322062401379143, "loss": 2.6463, "step": 951 }, { "epoch": 0.07126682012988228, "grad_norm": 1.123913288116455, "learning_rate": 0.00018318597742725365, "loss": 2.2402, "step": 952 }, { "epoch": 0.07134168023506073, "grad_norm": 1.363873839378357, "learning_rate": 0.00018315129839077687, "loss": 2.335, "step": 953 }, { "epoch": 0.07141654034023918, "grad_norm": 1.4208567142486572, "learning_rate": 0.00018311658691788892, "loss": 2.2037, "step": 954 }, { "epoch": 0.07149140044541763, "grad_norm": 1.3694982528686523, "learning_rate": 0.00018308184302213046, "loss": 2.1403, "step": 955 }, { "epoch": 0.07156626055059608, "grad_norm": 1.384435772895813, "learning_rate": 0.0001830470667170547, "loss": 2.5758, "step": 956 }, { "epoch": 0.07164112065577452, "grad_norm": 1.0491409301757812, "learning_rate": 0.00018301225801622742, "loss": 2.1978, "step": 957 }, { "epoch": 0.07171598076095297, "grad_norm": 1.412184238433838, "learning_rate": 0.00018297741693322717, "loss": 2.7549, "step": 958 }, { "epoch": 0.07179084086613141, "grad_norm": 1.0899995565414429, "learning_rate": 0.00018294254348164506, "loss": 2.2546, "step": 959 }, { "epoch": 0.07186570097130987, "grad_norm": 1.1917152404785156, "learning_rate": 0.00018290763767508483, "loss": 2.1572, "step": 960 }, { "epoch": 0.07194056107648832, "grad_norm": 1.1625562906265259, "learning_rate": 0.0001828726995271629, "loss": 2.2277, "step": 961 }, { "epoch": 0.07201542118166676, "grad_norm": 1.1400433778762817, "learning_rate": 0.0001828377290515082, "loss": 2.2727, "step": 962 }, { "epoch": 0.0720902812868452, "grad_norm": 1.273107647895813, "learning_rate": 0.00018280272626176235, "loss": 1.9673, "step": 963 }, { "epoch": 0.07216514139202365, "grad_norm": 1.2809603214263916, "learning_rate": 0.00018276769117157957, "loss": 2.058, "step": 964 }, { "epoch": 0.0722400014972021, "grad_norm": 1.1379973888397217, "learning_rate": 0.00018273262379462667, "loss": 2.0722, "step": 965 }, { "epoch": 0.07231486160238056, "grad_norm": 1.8161274194717407, "learning_rate": 0.00018269752414458302, "loss": 2.0983, "step": 966 }, { "epoch": 0.072389721707559, "grad_norm": 1.3776799440383911, "learning_rate": 0.0001826623922351406, "loss": 2.1047, "step": 967 }, { "epoch": 0.07246458181273745, "grad_norm": 1.533652663230896, "learning_rate": 0.00018262722808000403, "loss": 2.3644, "step": 968 }, { "epoch": 0.07253944191791589, "grad_norm": 1.3355242013931274, "learning_rate": 0.00018259203169289045, "loss": 1.9691, "step": 969 }, { "epoch": 0.07261430202309434, "grad_norm": 1.7312206029891968, "learning_rate": 0.0001825568030875296, "loss": 2.2891, "step": 970 }, { "epoch": 0.0726891621282728, "grad_norm": 1.519719123840332, "learning_rate": 0.00018252154227766366, "loss": 2.3902, "step": 971 }, { "epoch": 0.07276402223345124, "grad_norm": 1.1804956197738647, "learning_rate": 0.00018248624927704764, "loss": 2.2691, "step": 972 }, { "epoch": 0.07283888233862969, "grad_norm": 1.112295389175415, "learning_rate": 0.0001824509240994489, "loss": 1.8435, "step": 973 }, { "epoch": 0.07291374244380813, "grad_norm": 1.152370572090149, "learning_rate": 0.00018241556675864735, "loss": 1.895, "step": 974 }, { "epoch": 0.07298860254898658, "grad_norm": 1.350017786026001, "learning_rate": 0.00018238017726843558, "loss": 2.1471, "step": 975 }, { "epoch": 0.07306346265416502, "grad_norm": 1.1799567937850952, "learning_rate": 0.00018234475564261857, "loss": 1.9245, "step": 976 }, { "epoch": 0.07313832275934348, "grad_norm": 1.2494592666625977, "learning_rate": 0.00018230930189501397, "loss": 1.6953, "step": 977 }, { "epoch": 0.07321318286452193, "grad_norm": 1.2443040609359741, "learning_rate": 0.00018227381603945183, "loss": 2.5471, "step": 978 }, { "epoch": 0.07328804296970037, "grad_norm": 1.1047500371932983, "learning_rate": 0.0001822382980897749, "loss": 2.2553, "step": 979 }, { "epoch": 0.07336290307487882, "grad_norm": 1.1502246856689453, "learning_rate": 0.00018220274805983826, "loss": 2.1124, "step": 980 }, { "epoch": 0.07343776318005726, "grad_norm": 1.1636180877685547, "learning_rate": 0.00018216716596350957, "loss": 2.3541, "step": 981 }, { "epoch": 0.07351262328523571, "grad_norm": 1.0437051057815552, "learning_rate": 0.0001821315518146691, "loss": 1.8362, "step": 982 }, { "epoch": 0.07358748339041417, "grad_norm": 1.2201869487762451, "learning_rate": 0.0001820959056272095, "loss": 1.9365, "step": 983 }, { "epoch": 0.07366234349559261, "grad_norm": 1.1721235513687134, "learning_rate": 0.0001820602274150359, "loss": 2.2395, "step": 984 }, { "epoch": 0.07373720360077106, "grad_norm": 1.170962929725647, "learning_rate": 0.0001820245171920661, "loss": 2.2407, "step": 985 }, { "epoch": 0.0738120637059495, "grad_norm": 1.2986100912094116, "learning_rate": 0.0001819887749722301, "loss": 2.828, "step": 986 }, { "epoch": 0.07388692381112795, "grad_norm": 1.0947149991989136, "learning_rate": 0.00018195300076947075, "loss": 2.2134, "step": 987 }, { "epoch": 0.07396178391630641, "grad_norm": 1.2755177021026611, "learning_rate": 0.000181917194597743, "loss": 2.0147, "step": 988 }, { "epoch": 0.07403664402148485, "grad_norm": 1.22310209274292, "learning_rate": 0.00018188135647101455, "loss": 2.308, "step": 989 }, { "epoch": 0.0741115041266633, "grad_norm": 1.3419992923736572, "learning_rate": 0.00018184548640326543, "loss": 2.37, "step": 990 }, { "epoch": 0.07418636423184174, "grad_norm": 1.2741196155548096, "learning_rate": 0.00018180958440848814, "loss": 1.9404, "step": 991 }, { "epoch": 0.07426122433702019, "grad_norm": 1.219087839126587, "learning_rate": 0.0001817736505006877, "loss": 2.0928, "step": 992 }, { "epoch": 0.07433608444219864, "grad_norm": 1.148200511932373, "learning_rate": 0.0001817376846938815, "loss": 2.36, "step": 993 }, { "epoch": 0.0744109445473771, "grad_norm": 1.2287012338638306, "learning_rate": 0.00018170168700209937, "loss": 2.3992, "step": 994 }, { "epoch": 0.07448580465255554, "grad_norm": 1.044374942779541, "learning_rate": 0.0001816656574393837, "loss": 2.0968, "step": 995 }, { "epoch": 0.07456066475773399, "grad_norm": 1.3500761985778809, "learning_rate": 0.0001816295960197892, "loss": 2.1954, "step": 996 }, { "epoch": 0.07463552486291243, "grad_norm": 1.4000588655471802, "learning_rate": 0.000181593502757383, "loss": 2.0877, "step": 997 }, { "epoch": 0.07471038496809088, "grad_norm": 1.2800953388214111, "learning_rate": 0.00018155737766624474, "loss": 2.7096, "step": 998 }, { "epoch": 0.07478524507326932, "grad_norm": 1.2500853538513184, "learning_rate": 0.00018152122076046636, "loss": 2.2558, "step": 999 }, { "epoch": 0.07486010517844778, "grad_norm": 1.126541256904602, "learning_rate": 0.0001814850320541523, "loss": 2.426, "step": 1000 }, { "epoch": 0.07493496528362623, "grad_norm": 1.5971488952636719, "learning_rate": 0.00018144881156141943, "loss": 2.3493, "step": 1001 }, { "epoch": 0.07500982538880467, "grad_norm": 1.2710895538330078, "learning_rate": 0.0001814125592963969, "loss": 2.0352, "step": 1002 }, { "epoch": 0.07508468549398312, "grad_norm": 1.3743606805801392, "learning_rate": 0.00018137627527322633, "loss": 2.4217, "step": 1003 }, { "epoch": 0.07515954559916156, "grad_norm": 1.160277247428894, "learning_rate": 0.00018133995950606174, "loss": 2.0678, "step": 1004 }, { "epoch": 0.07523440570434002, "grad_norm": 1.2592400312423706, "learning_rate": 0.0001813036120090695, "loss": 2.439, "step": 1005 }, { "epoch": 0.07530926580951847, "grad_norm": 1.2254111766815186, "learning_rate": 0.00018126723279642836, "loss": 2.0951, "step": 1006 }, { "epoch": 0.07538412591469691, "grad_norm": 1.2077906131744385, "learning_rate": 0.0001812308218823295, "loss": 2.3972, "step": 1007 }, { "epoch": 0.07545898601987536, "grad_norm": 1.1689410209655762, "learning_rate": 0.0001811943792809764, "loss": 1.9102, "step": 1008 }, { "epoch": 0.0755338461250538, "grad_norm": 1.1968590021133423, "learning_rate": 0.00018115790500658488, "loss": 2.4865, "step": 1009 }, { "epoch": 0.07560870623023225, "grad_norm": 1.263175368309021, "learning_rate": 0.0001811213990733832, "loss": 1.9659, "step": 1010 }, { "epoch": 0.0756835663354107, "grad_norm": 1.3422170877456665, "learning_rate": 0.00018108486149561188, "loss": 1.7586, "step": 1011 }, { "epoch": 0.07575842644058915, "grad_norm": 1.2155977487564087, "learning_rate": 0.0001810482922875239, "loss": 2.4931, "step": 1012 }, { "epoch": 0.0758332865457676, "grad_norm": 1.3148202896118164, "learning_rate": 0.00018101169146338443, "loss": 2.3104, "step": 1013 }, { "epoch": 0.07590814665094604, "grad_norm": 1.1733620166778564, "learning_rate": 0.00018097505903747106, "loss": 2.1848, "step": 1014 }, { "epoch": 0.07598300675612449, "grad_norm": 1.2134809494018555, "learning_rate": 0.0001809383950240738, "loss": 1.9469, "step": 1015 }, { "epoch": 0.07605786686130293, "grad_norm": 1.2752628326416016, "learning_rate": 0.00018090169943749476, "loss": 2.3054, "step": 1016 }, { "epoch": 0.07613272696648139, "grad_norm": 1.3403056859970093, "learning_rate": 0.00018086497229204853, "loss": 1.9849, "step": 1017 }, { "epoch": 0.07620758707165984, "grad_norm": 1.2387319803237915, "learning_rate": 0.00018082821360206197, "loss": 2.2639, "step": 1018 }, { "epoch": 0.07628244717683828, "grad_norm": 1.3904563188552856, "learning_rate": 0.00018079142338187423, "loss": 2.2377, "step": 1019 }, { "epoch": 0.07635730728201673, "grad_norm": 1.2257717847824097, "learning_rate": 0.00018075460164583679, "loss": 2.0261, "step": 1020 }, { "epoch": 0.07643216738719517, "grad_norm": 1.4777474403381348, "learning_rate": 0.00018071774840831343, "loss": 2.1668, "step": 1021 }, { "epoch": 0.07650702749237363, "grad_norm": 1.196178674697876, "learning_rate": 0.00018068086368368014, "loss": 2.3204, "step": 1022 }, { "epoch": 0.07658188759755208, "grad_norm": 1.3121552467346191, "learning_rate": 0.00018064394748632526, "loss": 2.3312, "step": 1023 }, { "epoch": 0.07665674770273052, "grad_norm": 1.2218302488327026, "learning_rate": 0.00018060699983064944, "loss": 2.1188, "step": 1024 }, { "epoch": 0.07673160780790897, "grad_norm": 1.3015470504760742, "learning_rate": 0.00018057002073106548, "loss": 2.2945, "step": 1025 }, { "epoch": 0.07680646791308741, "grad_norm": 1.2493606805801392, "learning_rate": 0.0001805330102019986, "loss": 1.8687, "step": 1026 }, { "epoch": 0.07688132801826586, "grad_norm": 1.3711400032043457, "learning_rate": 0.00018049596825788614, "loss": 2.4523, "step": 1027 }, { "epoch": 0.07695618812344432, "grad_norm": 1.429445743560791, "learning_rate": 0.00018045889491317778, "loss": 1.8115, "step": 1028 }, { "epoch": 0.07703104822862276, "grad_norm": 1.1778950691223145, "learning_rate": 0.00018042179018233542, "loss": 1.8991, "step": 1029 }, { "epoch": 0.07710590833380121, "grad_norm": 1.1916242837905884, "learning_rate": 0.00018038465407983325, "loss": 2.0204, "step": 1030 }, { "epoch": 0.07718076843897965, "grad_norm": 1.2285008430480957, "learning_rate": 0.00018034748662015758, "loss": 2.4307, "step": 1031 }, { "epoch": 0.0772556285441581, "grad_norm": 1.2217931747436523, "learning_rate": 0.00018031028781780712, "loss": 2.1643, "step": 1032 }, { "epoch": 0.07733048864933655, "grad_norm": 1.2528331279754639, "learning_rate": 0.00018027305768729263, "loss": 1.2741, "step": 1033 }, { "epoch": 0.077405348754515, "grad_norm": 1.2330291271209717, "learning_rate": 0.00018023579624313723, "loss": 1.9249, "step": 1034 }, { "epoch": 0.07748020885969345, "grad_norm": 1.3111963272094727, "learning_rate": 0.0001801985034998762, "loss": 2.4492, "step": 1035 }, { "epoch": 0.0775550689648719, "grad_norm": 1.4091006517410278, "learning_rate": 0.000180161179472057, "loss": 2.1665, "step": 1036 }, { "epoch": 0.07762992907005034, "grad_norm": 1.2484735250473022, "learning_rate": 0.00018012382417423935, "loss": 2.5784, "step": 1037 }, { "epoch": 0.07770478917522879, "grad_norm": 1.3590019941329956, "learning_rate": 0.00018008643762099512, "loss": 2.2124, "step": 1038 }, { "epoch": 0.07777964928040725, "grad_norm": 1.3044530153274536, "learning_rate": 0.00018004901982690842, "loss": 2.5255, "step": 1039 }, { "epoch": 0.07785450938558569, "grad_norm": 1.3547528982162476, "learning_rate": 0.0001800115708065755, "loss": 1.9372, "step": 1040 }, { "epoch": 0.07792936949076414, "grad_norm": 1.2361351251602173, "learning_rate": 0.00017997409057460485, "loss": 2.273, "step": 1041 }, { "epoch": 0.07800422959594258, "grad_norm": 1.1168638467788696, "learning_rate": 0.00017993657914561702, "loss": 2.2407, "step": 1042 }, { "epoch": 0.07807908970112103, "grad_norm": 1.0595859289169312, "learning_rate": 0.00017989903653424492, "loss": 1.9742, "step": 1043 }, { "epoch": 0.07815394980629947, "grad_norm": 1.1967504024505615, "learning_rate": 0.00017986146275513344, "loss": 2.1095, "step": 1044 }, { "epoch": 0.07822880991147793, "grad_norm": 1.0779035091400146, "learning_rate": 0.00017982385782293968, "loss": 1.7886, "step": 1045 }, { "epoch": 0.07830367001665638, "grad_norm": 1.1343623399734497, "learning_rate": 0.00017978622175233297, "loss": 2.0758, "step": 1046 }, { "epoch": 0.07837853012183482, "grad_norm": 1.1730152368545532, "learning_rate": 0.00017974855455799471, "loss": 1.8309, "step": 1047 }, { "epoch": 0.07845339022701327, "grad_norm": 1.4102110862731934, "learning_rate": 0.00017971085625461842, "loss": 2.0809, "step": 1048 }, { "epoch": 0.07852825033219171, "grad_norm": 1.2229026556015015, "learning_rate": 0.00017967312685690984, "loss": 2.0927, "step": 1049 }, { "epoch": 0.07860311043737016, "grad_norm": 1.4901825189590454, "learning_rate": 0.0001796353663795868, "loss": 2.3212, "step": 1050 }, { "epoch": 0.07860311043737016, "eval_loss": 2.1668286323547363, "eval_runtime": 178.8663, "eval_samples_per_second": 27.954, "eval_steps_per_second": 13.977, "step": 1050 }, { "epoch": 0.07867797054254862, "grad_norm": 1.1335030794143677, "learning_rate": 0.00017959757483737923, "loss": 1.9909, "step": 1051 }, { "epoch": 0.07875283064772706, "grad_norm": 1.1249163150787354, "learning_rate": 0.00017955975224502919, "loss": 1.6304, "step": 1052 }, { "epoch": 0.07882769075290551, "grad_norm": 1.3284449577331543, "learning_rate": 0.00017952189861729088, "loss": 2.2979, "step": 1053 }, { "epoch": 0.07890255085808395, "grad_norm": 1.2780580520629883, "learning_rate": 0.00017948401396893055, "loss": 1.9728, "step": 1054 }, { "epoch": 0.0789774109632624, "grad_norm": 1.317397952079773, "learning_rate": 0.00017944609831472663, "loss": 2.2103, "step": 1055 }, { "epoch": 0.07905227106844086, "grad_norm": 1.0973384380340576, "learning_rate": 0.0001794081516694696, "loss": 1.2083, "step": 1056 }, { "epoch": 0.0791271311736193, "grad_norm": 1.4204012155532837, "learning_rate": 0.00017937017404796195, "loss": 2.4087, "step": 1057 }, { "epoch": 0.07920199127879775, "grad_norm": 1.2547519207000732, "learning_rate": 0.00017933216546501846, "loss": 1.7095, "step": 1058 }, { "epoch": 0.0792768513839762, "grad_norm": 1.3065584897994995, "learning_rate": 0.00017929412593546579, "loss": 1.6478, "step": 1059 }, { "epoch": 0.07935171148915464, "grad_norm": 1.171447515487671, "learning_rate": 0.0001792560554741427, "loss": 2.1824, "step": 1060 }, { "epoch": 0.07942657159433308, "grad_norm": 1.267004132270813, "learning_rate": 0.00017921795409590017, "loss": 2.3961, "step": 1061 }, { "epoch": 0.07950143169951154, "grad_norm": 1.5481007099151611, "learning_rate": 0.00017917982181560104, "loss": 1.9098, "step": 1062 }, { "epoch": 0.07957629180468999, "grad_norm": 1.1206212043762207, "learning_rate": 0.00017914165864812035, "loss": 2.355, "step": 1063 }, { "epoch": 0.07965115190986843, "grad_norm": 1.1411067247390747, "learning_rate": 0.0001791034646083451, "loss": 2.1717, "step": 1064 }, { "epoch": 0.07972601201504688, "grad_norm": 1.1834534406661987, "learning_rate": 0.0001790652397111744, "loss": 2.4138, "step": 1065 }, { "epoch": 0.07980087212022532, "grad_norm": 1.1845048666000366, "learning_rate": 0.00017902698397151933, "loss": 2.0097, "step": 1066 }, { "epoch": 0.07987573222540377, "grad_norm": 1.219399333000183, "learning_rate": 0.00017898869740430305, "loss": 1.9431, "step": 1067 }, { "epoch": 0.07995059233058223, "grad_norm": 1.1284704208374023, "learning_rate": 0.00017895038002446072, "loss": 2.1752, "step": 1068 }, { "epoch": 0.08002545243576067, "grad_norm": 1.1546708345413208, "learning_rate": 0.00017891203184693954, "loss": 2.0997, "step": 1069 }, { "epoch": 0.08010031254093912, "grad_norm": 1.1339908838272095, "learning_rate": 0.00017887365288669872, "loss": 2.118, "step": 1070 }, { "epoch": 0.08017517264611757, "grad_norm": 1.2448478937149048, "learning_rate": 0.00017883524315870947, "loss": 2.0592, "step": 1071 }, { "epoch": 0.08025003275129601, "grad_norm": 1.3120628595352173, "learning_rate": 0.000178796802677955, "loss": 1.7083, "step": 1072 }, { "epoch": 0.08032489285647447, "grad_norm": 1.232805609703064, "learning_rate": 0.0001787583314594305, "loss": 2.3646, "step": 1073 }, { "epoch": 0.08039975296165291, "grad_norm": 5.674004554748535, "learning_rate": 0.00017871982951814323, "loss": 2.7122, "step": 1074 }, { "epoch": 0.08047461306683136, "grad_norm": 1.5659693479537964, "learning_rate": 0.00017868129686911233, "loss": 2.4505, "step": 1075 }, { "epoch": 0.0805494731720098, "grad_norm": 1.9109268188476562, "learning_rate": 0.00017864273352736896, "loss": 2.4885, "step": 1076 }, { "epoch": 0.08062433327718825, "grad_norm": 1.5327507257461548, "learning_rate": 0.00017860413950795625, "loss": 2.4435, "step": 1077 }, { "epoch": 0.0806991933823667, "grad_norm": 1.2586411237716675, "learning_rate": 0.00017856551482592936, "loss": 2.136, "step": 1078 }, { "epoch": 0.08077405348754516, "grad_norm": 1.4848982095718384, "learning_rate": 0.0001785268594963553, "loss": 2.6203, "step": 1079 }, { "epoch": 0.0808489135927236, "grad_norm": 1.3365272283554077, "learning_rate": 0.00017848817353431312, "loss": 2.0687, "step": 1080 }, { "epoch": 0.08092377369790205, "grad_norm": 1.1464790105819702, "learning_rate": 0.00017844945695489378, "loss": 2.0457, "step": 1081 }, { "epoch": 0.08099863380308049, "grad_norm": 1.291661024093628, "learning_rate": 0.00017841070977320017, "loss": 2.131, "step": 1082 }, { "epoch": 0.08107349390825894, "grad_norm": 1.4168891906738281, "learning_rate": 0.00017837193200434718, "loss": 2.6532, "step": 1083 }, { "epoch": 0.08114835401343738, "grad_norm": 1.3236547708511353, "learning_rate": 0.0001783331236634616, "loss": 2.4076, "step": 1084 }, { "epoch": 0.08122321411861584, "grad_norm": 1.0434134006500244, "learning_rate": 0.0001782942847656821, "loss": 2.1998, "step": 1085 }, { "epoch": 0.08129807422379429, "grad_norm": 1.1813428401947021, "learning_rate": 0.00017825541532615934, "loss": 2.5611, "step": 1086 }, { "epoch": 0.08137293432897273, "grad_norm": 1.2198381423950195, "learning_rate": 0.00017821651536005582, "loss": 1.8355, "step": 1087 }, { "epoch": 0.08144779443415118, "grad_norm": 0.8517663478851318, "learning_rate": 0.00017817758488254603, "loss": 2.2949, "step": 1088 }, { "epoch": 0.08152265453932962, "grad_norm": 1.2726380825042725, "learning_rate": 0.00017813862390881634, "loss": 2.3899, "step": 1089 }, { "epoch": 0.08159751464450808, "grad_norm": 0.9983457922935486, "learning_rate": 0.00017809963245406495, "loss": 2.2658, "step": 1090 }, { "epoch": 0.08167237474968653, "grad_norm": 1.4170000553131104, "learning_rate": 0.00017806061053350202, "loss": 2.0839, "step": 1091 }, { "epoch": 0.08174723485486497, "grad_norm": 1.3023183345794678, "learning_rate": 0.0001780215581623496, "loss": 2.5358, "step": 1092 }, { "epoch": 0.08182209496004342, "grad_norm": 1.3425196409225464, "learning_rate": 0.00017798247535584153, "loss": 2.4709, "step": 1093 }, { "epoch": 0.08189695506522186, "grad_norm": 1.4002048969268799, "learning_rate": 0.00017794336212922368, "loss": 2.1698, "step": 1094 }, { "epoch": 0.08197181517040031, "grad_norm": 1.7404706478118896, "learning_rate": 0.00017790421849775358, "loss": 2.4696, "step": 1095 }, { "epoch": 0.08204667527557877, "grad_norm": 1.4175392389297485, "learning_rate": 0.0001778650444767008, "loss": 1.9208, "step": 1096 }, { "epoch": 0.08212153538075721, "grad_norm": 1.1473445892333984, "learning_rate": 0.00017782584008134672, "loss": 1.9821, "step": 1097 }, { "epoch": 0.08219639548593566, "grad_norm": 1.5497795343399048, "learning_rate": 0.00017778660532698446, "loss": 2.5066, "step": 1098 }, { "epoch": 0.0822712555911141, "grad_norm": 1.1291656494140625, "learning_rate": 0.00017774734022891914, "loss": 1.7856, "step": 1099 }, { "epoch": 0.08234611569629255, "grad_norm": 1.2415539026260376, "learning_rate": 0.00017770804480246764, "loss": 2.4034, "step": 1100 }, { "epoch": 0.082420975801471, "grad_norm": 1.0913794040679932, "learning_rate": 0.00017766871906295864, "loss": 1.9124, "step": 1101 }, { "epoch": 0.08249583590664945, "grad_norm": 1.2200806140899658, "learning_rate": 0.0001776293630257327, "loss": 1.956, "step": 1102 }, { "epoch": 0.0825706960118279, "grad_norm": 1.2512478828430176, "learning_rate": 0.00017758997670614215, "loss": 2.3539, "step": 1103 }, { "epoch": 0.08264555611700634, "grad_norm": 1.1919282674789429, "learning_rate": 0.00017755056011955122, "loss": 1.9473, "step": 1104 }, { "epoch": 0.08272041622218479, "grad_norm": 1.1397578716278076, "learning_rate": 0.00017751111328133585, "loss": 1.8133, "step": 1105 }, { "epoch": 0.08279527632736323, "grad_norm": 1.4707269668579102, "learning_rate": 0.00017747163620688384, "loss": 2.039, "step": 1106 }, { "epoch": 0.0828701364325417, "grad_norm": 1.3370574712753296, "learning_rate": 0.00017743212891159474, "loss": 2.4693, "step": 1107 }, { "epoch": 0.08294499653772014, "grad_norm": 1.2410619258880615, "learning_rate": 0.00017739259141087992, "loss": 2.5934, "step": 1108 }, { "epoch": 0.08301985664289858, "grad_norm": 1.1280416250228882, "learning_rate": 0.00017735302372016255, "loss": 2.3309, "step": 1109 }, { "epoch": 0.08309471674807703, "grad_norm": 1.2431304454803467, "learning_rate": 0.0001773134258548775, "loss": 1.9239, "step": 1110 }, { "epoch": 0.08316957685325548, "grad_norm": 1.257811427116394, "learning_rate": 0.00017727379783047154, "loss": 2.1219, "step": 1111 }, { "epoch": 0.08324443695843392, "grad_norm": 1.3039681911468506, "learning_rate": 0.00017723413966240308, "loss": 2.3175, "step": 1112 }, { "epoch": 0.08331929706361238, "grad_norm": 1.4012370109558105, "learning_rate": 0.0001771944513661423, "loss": 2.3396, "step": 1113 }, { "epoch": 0.08339415716879083, "grad_norm": 1.3332321643829346, "learning_rate": 0.00017715473295717128, "loss": 2.6311, "step": 1114 }, { "epoch": 0.08346901727396927, "grad_norm": 1.1919529438018799, "learning_rate": 0.00017711498445098365, "loss": 1.8677, "step": 1115 }, { "epoch": 0.08354387737914772, "grad_norm": 1.3868426084518433, "learning_rate": 0.0001770752058630849, "loss": 1.8842, "step": 1116 }, { "epoch": 0.08361873748432616, "grad_norm": 1.293156385421753, "learning_rate": 0.00017703539720899215, "loss": 2.1451, "step": 1117 }, { "epoch": 0.0836935975895046, "grad_norm": 1.2677556276321411, "learning_rate": 0.00017699555850423445, "loss": 2.3035, "step": 1118 }, { "epoch": 0.08376845769468307, "grad_norm": 1.206300139427185, "learning_rate": 0.00017695568976435234, "loss": 2.1564, "step": 1119 }, { "epoch": 0.08384331779986151, "grad_norm": 1.3815343379974365, "learning_rate": 0.0001769157910048982, "loss": 2.1147, "step": 1120 }, { "epoch": 0.08391817790503996, "grad_norm": 1.0231244564056396, "learning_rate": 0.0001768758622414361, "loss": 2.1612, "step": 1121 }, { "epoch": 0.0839930380102184, "grad_norm": 1.1622685194015503, "learning_rate": 0.00017683590348954183, "loss": 2.0492, "step": 1122 }, { "epoch": 0.08406789811539685, "grad_norm": 1.3988982439041138, "learning_rate": 0.00017679591476480288, "loss": 2.5833, "step": 1123 }, { "epoch": 0.0841427582205753, "grad_norm": 1.2659207582473755, "learning_rate": 0.0001767558960828184, "loss": 2.1742, "step": 1124 }, { "epoch": 0.08421761832575375, "grad_norm": 1.2385661602020264, "learning_rate": 0.0001767158474591992, "loss": 1.8937, "step": 1125 }, { "epoch": 0.0842924784309322, "grad_norm": 1.3041611909866333, "learning_rate": 0.00017667576890956785, "loss": 1.7623, "step": 1126 }, { "epoch": 0.08436733853611064, "grad_norm": 1.238420844078064, "learning_rate": 0.00017663566044955853, "loss": 2.0903, "step": 1127 }, { "epoch": 0.08444219864128909, "grad_norm": 1.2149877548217773, "learning_rate": 0.00017659552209481713, "loss": 1.649, "step": 1128 }, { "epoch": 0.08451705874646753, "grad_norm": 1.1440823078155518, "learning_rate": 0.0001765553538610012, "loss": 2.3743, "step": 1129 }, { "epoch": 0.08459191885164599, "grad_norm": 1.088365077972412, "learning_rate": 0.00017651515576377986, "loss": 2.2725, "step": 1130 }, { "epoch": 0.08466677895682444, "grad_norm": 1.3781232833862305, "learning_rate": 0.000176474927818834, "loss": 2.8125, "step": 1131 }, { "epoch": 0.08474163906200288, "grad_norm": 1.1873775720596313, "learning_rate": 0.00017643467004185612, "loss": 1.8302, "step": 1132 }, { "epoch": 0.08481649916718133, "grad_norm": 1.6301945447921753, "learning_rate": 0.00017639438244855028, "loss": 1.9669, "step": 1133 }, { "epoch": 0.08489135927235977, "grad_norm": 1.5325994491577148, "learning_rate": 0.00017635406505463227, "loss": 2.089, "step": 1134 }, { "epoch": 0.08496621937753822, "grad_norm": 1.4200153350830078, "learning_rate": 0.00017631371787582946, "loss": 2.0859, "step": 1135 }, { "epoch": 0.08504107948271668, "grad_norm": 1.456496238708496, "learning_rate": 0.00017627334092788083, "loss": 2.0825, "step": 1136 }, { "epoch": 0.08511593958789512, "grad_norm": 1.320447325706482, "learning_rate": 0.000176232934226537, "loss": 2.1431, "step": 1137 }, { "epoch": 0.08519079969307357, "grad_norm": 1.213787317276001, "learning_rate": 0.00017619249778756015, "loss": 2.3498, "step": 1138 }, { "epoch": 0.08526565979825201, "grad_norm": 1.097078800201416, "learning_rate": 0.00017615203162672414, "loss": 1.7826, "step": 1139 }, { "epoch": 0.08534051990343046, "grad_norm": 1.1091054677963257, "learning_rate": 0.00017611153575981434, "loss": 1.846, "step": 1140 }, { "epoch": 0.08541538000860892, "grad_norm": 1.2655032873153687, "learning_rate": 0.00017607101020262778, "loss": 2.1837, "step": 1141 }, { "epoch": 0.08549024011378736, "grad_norm": 1.1432431936264038, "learning_rate": 0.00017603045497097299, "loss": 2.2325, "step": 1142 }, { "epoch": 0.08556510021896581, "grad_norm": 1.1332277059555054, "learning_rate": 0.00017598987008067015, "loss": 2.0935, "step": 1143 }, { "epoch": 0.08563996032414425, "grad_norm": 1.291705846786499, "learning_rate": 0.00017594925554755095, "loss": 2.4615, "step": 1144 }, { "epoch": 0.0857148204293227, "grad_norm": 1.1773737668991089, "learning_rate": 0.0001759086113874587, "loss": 2.2487, "step": 1145 }, { "epoch": 0.08578968053450114, "grad_norm": 1.1665387153625488, "learning_rate": 0.0001758679376162483, "loss": 2.2226, "step": 1146 }, { "epoch": 0.0858645406396796, "grad_norm": 1.6224819421768188, "learning_rate": 0.00017582723424978603, "loss": 2.6016, "step": 1147 }, { "epoch": 0.08593940074485805, "grad_norm": 1.2214092016220093, "learning_rate": 0.0001757865013039499, "loss": 2.1885, "step": 1148 }, { "epoch": 0.0860142608500365, "grad_norm": 1.3501379489898682, "learning_rate": 0.00017574573879462936, "loss": 2.2521, "step": 1149 }, { "epoch": 0.08608912095521494, "grad_norm": 1.0514832735061646, "learning_rate": 0.00017570494673772543, "loss": 1.8788, "step": 1150 }, { "epoch": 0.08616398106039339, "grad_norm": 1.3872100114822388, "learning_rate": 0.00017566412514915064, "loss": 2.4495, "step": 1151 }, { "epoch": 0.08623884116557184, "grad_norm": 1.677109718322754, "learning_rate": 0.00017562327404482906, "loss": 2.0657, "step": 1152 }, { "epoch": 0.08631370127075029, "grad_norm": 1.2448900938034058, "learning_rate": 0.00017558239344069626, "loss": 2.0203, "step": 1153 }, { "epoch": 0.08638856137592874, "grad_norm": 1.2644002437591553, "learning_rate": 0.00017554148335269923, "loss": 1.8664, "step": 1154 }, { "epoch": 0.08646342148110718, "grad_norm": 1.106068730354309, "learning_rate": 0.0001755005437967967, "loss": 2.3257, "step": 1155 }, { "epoch": 0.08653828158628563, "grad_norm": 1.2724332809448242, "learning_rate": 0.0001754595747889586, "loss": 2.2236, "step": 1156 }, { "epoch": 0.08661314169146407, "grad_norm": 1.2157081365585327, "learning_rate": 0.0001754185763451666, "loss": 2.4827, "step": 1157 }, { "epoch": 0.08668800179664253, "grad_norm": 1.3202322721481323, "learning_rate": 0.00017537754848141366, "loss": 2.1892, "step": 1158 }, { "epoch": 0.08676286190182098, "grad_norm": 1.3170231580734253, "learning_rate": 0.00017533649121370437, "loss": 1.9218, "step": 1159 }, { "epoch": 0.08683772200699942, "grad_norm": 1.2425464391708374, "learning_rate": 0.00017529540455805465, "loss": 1.8664, "step": 1160 }, { "epoch": 0.08691258211217787, "grad_norm": 1.6239116191864014, "learning_rate": 0.00017525428853049202, "loss": 1.9927, "step": 1161 }, { "epoch": 0.08698744221735631, "grad_norm": 1.3375805616378784, "learning_rate": 0.00017521314314705534, "loss": 2.0741, "step": 1162 }, { "epoch": 0.08706230232253476, "grad_norm": 1.1748435497283936, "learning_rate": 0.00017517196842379505, "loss": 2.1826, "step": 1163 }, { "epoch": 0.08713716242771322, "grad_norm": 1.1116843223571777, "learning_rate": 0.00017513076437677288, "loss": 2.52, "step": 1164 }, { "epoch": 0.08721202253289166, "grad_norm": 1.3166108131408691, "learning_rate": 0.00017508953102206206, "loss": 2.1195, "step": 1165 }, { "epoch": 0.0872868826380701, "grad_norm": 1.3875515460968018, "learning_rate": 0.00017504826837574737, "loss": 2.1483, "step": 1166 }, { "epoch": 0.08736174274324855, "grad_norm": 1.122794270515442, "learning_rate": 0.00017500697645392484, "loss": 2.193, "step": 1167 }, { "epoch": 0.087436602848427, "grad_norm": 1.1180616617202759, "learning_rate": 0.000174965655272702, "loss": 2.4521, "step": 1168 }, { "epoch": 0.08751146295360546, "grad_norm": 1.2084027528762817, "learning_rate": 0.00017492430484819784, "loss": 2.3877, "step": 1169 }, { "epoch": 0.0875863230587839, "grad_norm": 1.401640772819519, "learning_rate": 0.00017488292519654265, "loss": 2.1227, "step": 1170 }, { "epoch": 0.08766118316396235, "grad_norm": 1.028043270111084, "learning_rate": 0.00017484151633387823, "loss": 2.2063, "step": 1171 }, { "epoch": 0.08773604326914079, "grad_norm": 1.2598915100097656, "learning_rate": 0.0001748000782763577, "loss": 2.0273, "step": 1172 }, { "epoch": 0.08781090337431924, "grad_norm": 1.2322463989257812, "learning_rate": 0.00017475861104014557, "loss": 1.8695, "step": 1173 }, { "epoch": 0.08788576347949768, "grad_norm": 1.079811930656433, "learning_rate": 0.00017471711464141777, "loss": 2.1459, "step": 1174 }, { "epoch": 0.08796062358467614, "grad_norm": 1.2086703777313232, "learning_rate": 0.00017467558909636162, "loss": 1.9828, "step": 1175 }, { "epoch": 0.08803548368985459, "grad_norm": 1.251684308052063, "learning_rate": 0.00017463403442117574, "loss": 2.3591, "step": 1176 }, { "epoch": 0.08811034379503303, "grad_norm": 1.214540958404541, "learning_rate": 0.0001745924506320702, "loss": 2.4824, "step": 1177 }, { "epoch": 0.08818520390021148, "grad_norm": 1.2433278560638428, "learning_rate": 0.0001745508377452663, "loss": 1.9515, "step": 1178 }, { "epoch": 0.08826006400538992, "grad_norm": 1.2672390937805176, "learning_rate": 0.00017450919577699687, "loss": 2.0139, "step": 1179 }, { "epoch": 0.08833492411056837, "grad_norm": 1.337700605392456, "learning_rate": 0.00017446752474350593, "loss": 2.1516, "step": 1180 }, { "epoch": 0.08840978421574683, "grad_norm": 1.2376450300216675, "learning_rate": 0.0001744258246610489, "loss": 2.0201, "step": 1181 }, { "epoch": 0.08848464432092527, "grad_norm": 1.2506717443466187, "learning_rate": 0.00017438409554589254, "loss": 2.4035, "step": 1182 }, { "epoch": 0.08855950442610372, "grad_norm": 1.4963172674179077, "learning_rate": 0.0001743423374143149, "loss": 2.2391, "step": 1183 }, { "epoch": 0.08863436453128216, "grad_norm": 1.31190824508667, "learning_rate": 0.00017430055028260538, "loss": 2.0524, "step": 1184 }, { "epoch": 0.08870922463646061, "grad_norm": 1.1859015226364136, "learning_rate": 0.00017425873416706465, "loss": 1.6853, "step": 1185 }, { "epoch": 0.08878408474163907, "grad_norm": 1.2117524147033691, "learning_rate": 0.0001742168890840048, "loss": 2.2022, "step": 1186 }, { "epoch": 0.08885894484681751, "grad_norm": 1.074763536453247, "learning_rate": 0.00017417501504974906, "loss": 2.0574, "step": 1187 }, { "epoch": 0.08893380495199596, "grad_norm": 1.3986917734146118, "learning_rate": 0.0001741331120806321, "loss": 1.8333, "step": 1188 }, { "epoch": 0.0890086650571744, "grad_norm": 1.2676054239273071, "learning_rate": 0.00017409118019299973, "loss": 1.8714, "step": 1189 }, { "epoch": 0.08908352516235285, "grad_norm": 1.2313154935836792, "learning_rate": 0.00017404921940320918, "loss": 1.9152, "step": 1190 }, { "epoch": 0.0891583852675313, "grad_norm": 1.1091152429580688, "learning_rate": 0.00017400722972762888, "loss": 2.4104, "step": 1191 }, { "epoch": 0.08923324537270975, "grad_norm": 1.5932472944259644, "learning_rate": 0.00017396521118263856, "loss": 2.4212, "step": 1192 }, { "epoch": 0.0893081054778882, "grad_norm": 1.2801215648651123, "learning_rate": 0.00017392316378462912, "loss": 2.2827, "step": 1193 }, { "epoch": 0.08938296558306665, "grad_norm": 1.4843838214874268, "learning_rate": 0.0001738810875500029, "loss": 1.965, "step": 1194 }, { "epoch": 0.08945782568824509, "grad_norm": 1.2876660823822021, "learning_rate": 0.0001738389824951733, "loss": 2.0315, "step": 1195 }, { "epoch": 0.08953268579342354, "grad_norm": 1.4359833002090454, "learning_rate": 0.00017379684863656503, "loss": 2.1601, "step": 1196 }, { "epoch": 0.08960754589860198, "grad_norm": 1.5221644639968872, "learning_rate": 0.00017375468599061414, "loss": 1.9859, "step": 1197 }, { "epoch": 0.08968240600378044, "grad_norm": 1.397837519645691, "learning_rate": 0.00017371249457376773, "loss": 2.0471, "step": 1198 }, { "epoch": 0.08975726610895889, "grad_norm": 1.2167021036148071, "learning_rate": 0.00017367027440248422, "loss": 2.6858, "step": 1199 }, { "epoch": 0.08983212621413733, "grad_norm": 1.1611396074295044, "learning_rate": 0.00017362802549323326, "loss": 1.9817, "step": 1200 }, { "epoch": 0.08983212621413733, "eval_loss": 2.155243158340454, "eval_runtime": 178.9292, "eval_samples_per_second": 27.944, "eval_steps_per_second": 13.972, "step": 1200 }, { "epoch": 0.08990698631931578, "grad_norm": 1.6163415908813477, "learning_rate": 0.00017358574786249567, "loss": 2.0648, "step": 1201 }, { "epoch": 0.08998184642449422, "grad_norm": 1.0420089960098267, "learning_rate": 0.00017354344152676354, "loss": 2.3084, "step": 1202 }, { "epoch": 0.09005670652967268, "grad_norm": 1.0791280269622803, "learning_rate": 0.00017350110650254003, "loss": 2.0449, "step": 1203 }, { "epoch": 0.09013156663485113, "grad_norm": 1.2416902780532837, "learning_rate": 0.00017345874280633964, "loss": 2.4091, "step": 1204 }, { "epoch": 0.09020642674002957, "grad_norm": 1.3768773078918457, "learning_rate": 0.00017341635045468791, "loss": 2.4456, "step": 1205 }, { "epoch": 0.09028128684520802, "grad_norm": 1.620538592338562, "learning_rate": 0.0001733739294641217, "loss": 1.6817, "step": 1206 }, { "epoch": 0.09035614695038646, "grad_norm": 1.179621696472168, "learning_rate": 0.00017333147985118896, "loss": 2.6346, "step": 1207 }, { "epoch": 0.09043100705556491, "grad_norm": 1.1776443719863892, "learning_rate": 0.00017328900163244882, "loss": 1.8822, "step": 1208 }, { "epoch": 0.09050586716074337, "grad_norm": 1.3630313873291016, "learning_rate": 0.00017324649482447152, "loss": 2.218, "step": 1209 }, { "epoch": 0.09058072726592181, "grad_norm": 1.36945378780365, "learning_rate": 0.00017320395944383856, "loss": 1.5142, "step": 1210 }, { "epoch": 0.09065558737110026, "grad_norm": 1.2178399562835693, "learning_rate": 0.0001731613955071425, "loss": 1.7964, "step": 1211 }, { "epoch": 0.0907304474762787, "grad_norm": 1.3475048542022705, "learning_rate": 0.00017311880303098705, "loss": 1.9596, "step": 1212 }, { "epoch": 0.09080530758145715, "grad_norm": 1.1083323955535889, "learning_rate": 0.0001730761820319871, "loss": 1.7876, "step": 1213 }, { "epoch": 0.0908801676866356, "grad_norm": 1.4839143753051758, "learning_rate": 0.00017303353252676862, "loss": 1.5069, "step": 1214 }, { "epoch": 0.09095502779181405, "grad_norm": 1.2290488481521606, "learning_rate": 0.00017299085453196871, "loss": 2.5297, "step": 1215 }, { "epoch": 0.0910298878969925, "grad_norm": 1.2569103240966797, "learning_rate": 0.0001729481480642356, "loss": 2.6475, "step": 1216 }, { "epoch": 0.09110474800217094, "grad_norm": 1.2897191047668457, "learning_rate": 0.00017290541314022862, "loss": 1.8624, "step": 1217 }, { "epoch": 0.09117960810734939, "grad_norm": 1.2003446817398071, "learning_rate": 0.0001728626497766182, "loss": 2.4252, "step": 1218 }, { "epoch": 0.09125446821252783, "grad_norm": 1.0224945545196533, "learning_rate": 0.0001728198579900858, "loss": 2.3085, "step": 1219 }, { "epoch": 0.0913293283177063, "grad_norm": 1.1976522207260132, "learning_rate": 0.00017277703779732412, "loss": 1.9232, "step": 1220 }, { "epoch": 0.09140418842288474, "grad_norm": 1.1848504543304443, "learning_rate": 0.0001727341892150368, "loss": 2.3768, "step": 1221 }, { "epoch": 0.09147904852806318, "grad_norm": 1.3099452257156372, "learning_rate": 0.00017269131225993857, "loss": 2.1318, "step": 1222 }, { "epoch": 0.09155390863324163, "grad_norm": 1.154932975769043, "learning_rate": 0.00017264840694875534, "loss": 2.4721, "step": 1223 }, { "epoch": 0.09162876873842007, "grad_norm": 1.1244804859161377, "learning_rate": 0.00017260547329822393, "loss": 1.7488, "step": 1224 }, { "epoch": 0.09170362884359852, "grad_norm": 1.2615629434585571, "learning_rate": 0.0001725625113250923, "loss": 1.891, "step": 1225 }, { "epoch": 0.09177848894877698, "grad_norm": 1.283249020576477, "learning_rate": 0.0001725195210461195, "loss": 2.1095, "step": 1226 }, { "epoch": 0.09185334905395542, "grad_norm": 1.3241647481918335, "learning_rate": 0.0001724765024780755, "loss": 1.9416, "step": 1227 }, { "epoch": 0.09192820915913387, "grad_norm": 1.3222718238830566, "learning_rate": 0.00017243345563774142, "loss": 2.3817, "step": 1228 }, { "epoch": 0.09200306926431231, "grad_norm": 1.2114861011505127, "learning_rate": 0.0001723903805419093, "loss": 2.704, "step": 1229 }, { "epoch": 0.09207792936949076, "grad_norm": 1.0608569383621216, "learning_rate": 0.00017234727720738237, "loss": 1.8251, "step": 1230 }, { "epoch": 0.0921527894746692, "grad_norm": 1.5279498100280762, "learning_rate": 0.00017230414565097468, "loss": 2.0924, "step": 1231 }, { "epoch": 0.09222764957984766, "grad_norm": 1.337075114250183, "learning_rate": 0.0001722609858895114, "loss": 2.2932, "step": 1232 }, { "epoch": 0.09230250968502611, "grad_norm": 1.2050126791000366, "learning_rate": 0.00017221779793982864, "loss": 2.0516, "step": 1233 }, { "epoch": 0.09237736979020456, "grad_norm": 1.4143582582473755, "learning_rate": 0.00017217458181877367, "loss": 2.04, "step": 1234 }, { "epoch": 0.092452229895383, "grad_norm": 1.19197678565979, "learning_rate": 0.00017213133754320444, "loss": 2.2422, "step": 1235 }, { "epoch": 0.09252709000056145, "grad_norm": 1.4247612953186035, "learning_rate": 0.00017208806512999028, "loss": 2.5712, "step": 1236 }, { "epoch": 0.0926019501057399, "grad_norm": 1.3048101663589478, "learning_rate": 0.00017204476459601112, "loss": 2.5308, "step": 1237 }, { "epoch": 0.09267681021091835, "grad_norm": 1.2627767324447632, "learning_rate": 0.00017200143595815808, "loss": 2.6725, "step": 1238 }, { "epoch": 0.0927516703160968, "grad_norm": 1.5920464992523193, "learning_rate": 0.00017195807923333321, "loss": 2.1026, "step": 1239 }, { "epoch": 0.09282653042127524, "grad_norm": 1.1559562683105469, "learning_rate": 0.00017191469443844943, "loss": 2.3576, "step": 1240 }, { "epoch": 0.09290139052645369, "grad_norm": 1.1940432786941528, "learning_rate": 0.00017187128159043075, "loss": 2.1975, "step": 1241 }, { "epoch": 0.09297625063163213, "grad_norm": 1.2136863470077515, "learning_rate": 0.000171827840706212, "loss": 2.3224, "step": 1242 }, { "epoch": 0.09305111073681059, "grad_norm": 1.1816182136535645, "learning_rate": 0.00017178437180273904, "loss": 2.2696, "step": 1243 }, { "epoch": 0.09312597084198904, "grad_norm": 1.3242214918136597, "learning_rate": 0.00017174087489696852, "loss": 1.809, "step": 1244 }, { "epoch": 0.09320083094716748, "grad_norm": 1.1202278137207031, "learning_rate": 0.0001716973500058682, "loss": 2.1211, "step": 1245 }, { "epoch": 0.09327569105234593, "grad_norm": 1.2238733768463135, "learning_rate": 0.00017165379714641663, "loss": 1.7948, "step": 1246 }, { "epoch": 0.09335055115752437, "grad_norm": 1.2222380638122559, "learning_rate": 0.0001716102163356033, "loss": 2.1183, "step": 1247 }, { "epoch": 0.09342541126270282, "grad_norm": 1.122733235359192, "learning_rate": 0.00017156660759042863, "loss": 2.1823, "step": 1248 }, { "epoch": 0.09350027136788128, "grad_norm": 1.1838005781173706, "learning_rate": 0.00017152297092790385, "loss": 2.13, "step": 1249 }, { "epoch": 0.09357513147305972, "grad_norm": 1.2262781858444214, "learning_rate": 0.00017147930636505124, "loss": 2.3251, "step": 1250 }, { "epoch": 0.09364999157823817, "grad_norm": 1.3568865060806274, "learning_rate": 0.00017143561391890378, "loss": 2.5287, "step": 1251 }, { "epoch": 0.09372485168341661, "grad_norm": 1.3320591449737549, "learning_rate": 0.00017139189360650546, "loss": 2.0685, "step": 1252 }, { "epoch": 0.09379971178859506, "grad_norm": 1.13832426071167, "learning_rate": 0.00017134814544491107, "loss": 1.9673, "step": 1253 }, { "epoch": 0.09387457189377352, "grad_norm": 1.2287312746047974, "learning_rate": 0.00017130436945118633, "loss": 2.5105, "step": 1254 }, { "epoch": 0.09394943199895196, "grad_norm": 1.2175129652023315, "learning_rate": 0.00017126056564240773, "loss": 1.8506, "step": 1255 }, { "epoch": 0.09402429210413041, "grad_norm": 1.1834995746612549, "learning_rate": 0.0001712167340356627, "loss": 1.8135, "step": 1256 }, { "epoch": 0.09409915220930885, "grad_norm": 1.2922189235687256, "learning_rate": 0.00017117287464804944, "loss": 2.1139, "step": 1257 }, { "epoch": 0.0941740123144873, "grad_norm": 1.177619218826294, "learning_rate": 0.000171128987496677, "loss": 2.1473, "step": 1258 }, { "epoch": 0.09424887241966574, "grad_norm": 1.1676652431488037, "learning_rate": 0.00017108507259866533, "loss": 1.9876, "step": 1259 }, { "epoch": 0.0943237325248442, "grad_norm": 1.3100801706314087, "learning_rate": 0.00017104112997114512, "loss": 1.7259, "step": 1260 }, { "epoch": 0.09439859263002265, "grad_norm": 1.5738404989242554, "learning_rate": 0.00017099715963125788, "loss": 2.3072, "step": 1261 }, { "epoch": 0.0944734527352011, "grad_norm": 1.2821507453918457, "learning_rate": 0.00017095316159615603, "loss": 2.1311, "step": 1262 }, { "epoch": 0.09454831284037954, "grad_norm": 1.2842774391174316, "learning_rate": 0.00017090913588300267, "loss": 2.5451, "step": 1263 }, { "epoch": 0.09462317294555798, "grad_norm": 1.2686470746994019, "learning_rate": 0.00017086508250897174, "loss": 2.5079, "step": 1264 }, { "epoch": 0.09469803305073643, "grad_norm": 1.3063288927078247, "learning_rate": 0.000170821001491248, "loss": 2.4265, "step": 1265 }, { "epoch": 0.09477289315591489, "grad_norm": 1.6064400672912598, "learning_rate": 0.00017077689284702698, "loss": 2.1349, "step": 1266 }, { "epoch": 0.09484775326109333, "grad_norm": 1.2163150310516357, "learning_rate": 0.00017073275659351496, "loss": 2.2144, "step": 1267 }, { "epoch": 0.09492261336627178, "grad_norm": 1.2895407676696777, "learning_rate": 0.00017068859274792903, "loss": 2.1233, "step": 1268 }, { "epoch": 0.09499747347145023, "grad_norm": 1.3689519166946411, "learning_rate": 0.00017064440132749702, "loss": 1.7904, "step": 1269 }, { "epoch": 0.09507233357662867, "grad_norm": 1.2319437265396118, "learning_rate": 0.0001706001823494575, "loss": 1.7018, "step": 1270 }, { "epoch": 0.09514719368180713, "grad_norm": 1.2931547164916992, "learning_rate": 0.00017055593583105983, "loss": 2.1923, "step": 1271 }, { "epoch": 0.09522205378698557, "grad_norm": 1.0711431503295898, "learning_rate": 0.00017051166178956405, "loss": 2.2876, "step": 1272 }, { "epoch": 0.09529691389216402, "grad_norm": 1.0708810091018677, "learning_rate": 0.00017046736024224103, "loss": 2.5447, "step": 1273 }, { "epoch": 0.09537177399734247, "grad_norm": 1.112472653388977, "learning_rate": 0.0001704230312063723, "loss": 2.0175, "step": 1274 }, { "epoch": 0.09544663410252091, "grad_norm": 1.162406325340271, "learning_rate": 0.00017037867469925014, "loss": 2.269, "step": 1275 }, { "epoch": 0.09552149420769936, "grad_norm": 1.2713630199432373, "learning_rate": 0.0001703342907381775, "loss": 2.6093, "step": 1276 }, { "epoch": 0.09559635431287782, "grad_norm": 1.2598445415496826, "learning_rate": 0.00017028987934046815, "loss": 2.1506, "step": 1277 }, { "epoch": 0.09567121441805626, "grad_norm": 1.2818913459777832, "learning_rate": 0.0001702454405234464, "loss": 2.0641, "step": 1278 }, { "epoch": 0.0957460745232347, "grad_norm": 1.249788522720337, "learning_rate": 0.00017020097430444741, "loss": 2.2553, "step": 1279 }, { "epoch": 0.09582093462841315, "grad_norm": 1.391068935394287, "learning_rate": 0.00017015648070081696, "loss": 2.0276, "step": 1280 }, { "epoch": 0.0958957947335916, "grad_norm": 1.4376634359359741, "learning_rate": 0.00017011195972991148, "loss": 1.8347, "step": 1281 }, { "epoch": 0.09597065483877004, "grad_norm": 1.4593936204910278, "learning_rate": 0.00017006741140909815, "loss": 2.1562, "step": 1282 }, { "epoch": 0.0960455149439485, "grad_norm": 1.4945693016052246, "learning_rate": 0.00017002283575575478, "loss": 2.5279, "step": 1283 }, { "epoch": 0.09612037504912695, "grad_norm": 1.5007357597351074, "learning_rate": 0.0001699782327872698, "loss": 2.2079, "step": 1284 }, { "epoch": 0.09619523515430539, "grad_norm": 1.263342022895813, "learning_rate": 0.00016993360252104243, "loss": 2.1037, "step": 1285 }, { "epoch": 0.09627009525948384, "grad_norm": 1.260228157043457, "learning_rate": 0.00016988894497448236, "loss": 1.8644, "step": 1286 }, { "epoch": 0.09634495536466228, "grad_norm": 1.3372670412063599, "learning_rate": 0.00016984426016501005, "loss": 2.25, "step": 1287 }, { "epoch": 0.09641981546984074, "grad_norm": 1.1717355251312256, "learning_rate": 0.00016979954811005655, "loss": 1.9109, "step": 1288 }, { "epoch": 0.09649467557501919, "grad_norm": 1.308436393737793, "learning_rate": 0.00016975480882706353, "loss": 2.1369, "step": 1289 }, { "epoch": 0.09656953568019763, "grad_norm": 1.2624328136444092, "learning_rate": 0.0001697100423334833, "loss": 2.2584, "step": 1290 }, { "epoch": 0.09664439578537608, "grad_norm": 1.2965506315231323, "learning_rate": 0.00016966524864677878, "loss": 2.1798, "step": 1291 }, { "epoch": 0.09671925589055452, "grad_norm": 1.1903141736984253, "learning_rate": 0.00016962042778442352, "loss": 2.3308, "step": 1292 }, { "epoch": 0.09679411599573297, "grad_norm": 1.1928924322128296, "learning_rate": 0.00016957557976390162, "loss": 1.9318, "step": 1293 }, { "epoch": 0.09686897610091143, "grad_norm": 1.1392804384231567, "learning_rate": 0.00016953070460270782, "loss": 2.1668, "step": 1294 }, { "epoch": 0.09694383620608987, "grad_norm": 1.1643270254135132, "learning_rate": 0.00016948580231834737, "loss": 1.7131, "step": 1295 }, { "epoch": 0.09701869631126832, "grad_norm": 1.2784801721572876, "learning_rate": 0.00016944087292833628, "loss": 2.1143, "step": 1296 }, { "epoch": 0.09709355641644676, "grad_norm": 1.205541729927063, "learning_rate": 0.0001693959164502009, "loss": 2.2568, "step": 1297 }, { "epoch": 0.09716841652162521, "grad_norm": 1.3454887866973877, "learning_rate": 0.0001693509329014783, "loss": 2.4021, "step": 1298 }, { "epoch": 0.09724327662680365, "grad_norm": 1.283854603767395, "learning_rate": 0.0001693059222997161, "loss": 2.1975, "step": 1299 }, { "epoch": 0.09731813673198211, "grad_norm": 1.4375123977661133, "learning_rate": 0.00016926088466247242, "loss": 2.3896, "step": 1300 }, { "epoch": 0.09739299683716056, "grad_norm": 1.181265115737915, "learning_rate": 0.0001692158200073159, "loss": 2.0499, "step": 1301 }, { "epoch": 0.097467856942339, "grad_norm": 1.2245078086853027, "learning_rate": 0.00016917072835182583, "loss": 1.8507, "step": 1302 }, { "epoch": 0.09754271704751745, "grad_norm": 1.2939743995666504, "learning_rate": 0.00016912560971359198, "loss": 2.1159, "step": 1303 }, { "epoch": 0.0976175771526959, "grad_norm": 1.299012541770935, "learning_rate": 0.00016908046411021456, "loss": 1.7164, "step": 1304 }, { "epoch": 0.09769243725787435, "grad_norm": 1.281433343887329, "learning_rate": 0.00016903529155930447, "loss": 1.7115, "step": 1305 }, { "epoch": 0.0977672973630528, "grad_norm": 1.1772958040237427, "learning_rate": 0.00016899009207848296, "loss": 2.2404, "step": 1306 }, { "epoch": 0.09784215746823124, "grad_norm": 1.389107584953308, "learning_rate": 0.00016894486568538191, "loss": 2.1498, "step": 1307 }, { "epoch": 0.09791701757340969, "grad_norm": 1.1696873903274536, "learning_rate": 0.00016889961239764355, "loss": 2.2541, "step": 1308 }, { "epoch": 0.09799187767858814, "grad_norm": 1.227876901626587, "learning_rate": 0.0001688543322329208, "loss": 2.0861, "step": 1309 }, { "epoch": 0.09806673778376658, "grad_norm": 1.112677812576294, "learning_rate": 0.00016880902520887686, "loss": 2.2363, "step": 1310 }, { "epoch": 0.09814159788894504, "grad_norm": 1.4799894094467163, "learning_rate": 0.0001687636913431856, "loss": 2.4007, "step": 1311 }, { "epoch": 0.09821645799412348, "grad_norm": 1.293063998222351, "learning_rate": 0.0001687183306535312, "loss": 1.952, "step": 1312 }, { "epoch": 0.09829131809930193, "grad_norm": 1.1251343488693237, "learning_rate": 0.00016867294315760836, "loss": 2.466, "step": 1313 }, { "epoch": 0.09836617820448038, "grad_norm": 1.4368664026260376, "learning_rate": 0.00016862752887312224, "loss": 2.1603, "step": 1314 }, { "epoch": 0.09844103830965882, "grad_norm": 1.2873185873031616, "learning_rate": 0.00016858208781778854, "loss": 2.4049, "step": 1315 }, { "epoch": 0.09851589841483727, "grad_norm": 1.3190399408340454, "learning_rate": 0.00016853662000933324, "loss": 2.3097, "step": 1316 }, { "epoch": 0.09859075852001573, "grad_norm": 1.301393985748291, "learning_rate": 0.00016849112546549286, "loss": 2.3271, "step": 1317 }, { "epoch": 0.09866561862519417, "grad_norm": 1.1198275089263916, "learning_rate": 0.0001684456042040143, "loss": 1.8716, "step": 1318 }, { "epoch": 0.09874047873037262, "grad_norm": 1.2918214797973633, "learning_rate": 0.00016840005624265496, "loss": 2.2174, "step": 1319 }, { "epoch": 0.09881533883555106, "grad_norm": 1.2607537508010864, "learning_rate": 0.00016835448159918255, "loss": 2.1763, "step": 1320 }, { "epoch": 0.0988901989407295, "grad_norm": 1.1696572303771973, "learning_rate": 0.00016830888029137524, "loss": 2.1371, "step": 1321 }, { "epoch": 0.09896505904590797, "grad_norm": 1.5535697937011719, "learning_rate": 0.00016826325233702162, "loss": 1.9443, "step": 1322 }, { "epoch": 0.09903991915108641, "grad_norm": 1.613700270652771, "learning_rate": 0.00016821759775392068, "loss": 2.4372, "step": 1323 }, { "epoch": 0.09911477925626486, "grad_norm": 1.3026611804962158, "learning_rate": 0.0001681719165598817, "loss": 1.9988, "step": 1324 }, { "epoch": 0.0991896393614433, "grad_norm": 1.2683066129684448, "learning_rate": 0.0001681262087727245, "loss": 1.7268, "step": 1325 }, { "epoch": 0.09926449946662175, "grad_norm": 1.4297478199005127, "learning_rate": 0.00016808047441027913, "loss": 2.1635, "step": 1326 }, { "epoch": 0.09933935957180019, "grad_norm": 1.4923065900802612, "learning_rate": 0.00016803471349038606, "loss": 2.041, "step": 1327 }, { "epoch": 0.09941421967697865, "grad_norm": 1.2657222747802734, "learning_rate": 0.00016798892603089616, "loss": 2.3543, "step": 1328 }, { "epoch": 0.0994890797821571, "grad_norm": 1.3192569017410278, "learning_rate": 0.00016794311204967058, "loss": 2.0414, "step": 1329 }, { "epoch": 0.09956393988733554, "grad_norm": 1.2249101400375366, "learning_rate": 0.00016789727156458088, "loss": 2.0141, "step": 1330 }, { "epoch": 0.09963879999251399, "grad_norm": 1.2905266284942627, "learning_rate": 0.00016785140459350895, "loss": 2.0986, "step": 1331 }, { "epoch": 0.09971366009769243, "grad_norm": 1.251204490661621, "learning_rate": 0.00016780551115434697, "loss": 2.241, "step": 1332 }, { "epoch": 0.09978852020287088, "grad_norm": 1.1609666347503662, "learning_rate": 0.0001677595912649974, "loss": 2.3757, "step": 1333 }, { "epoch": 0.09986338030804934, "grad_norm": 1.3156108856201172, "learning_rate": 0.0001677136449433732, "loss": 2.079, "step": 1334 }, { "epoch": 0.09993824041322778, "grad_norm": 1.0391426086425781, "learning_rate": 0.0001676676722073975, "loss": 2.0125, "step": 1335 }, { "epoch": 0.10001310051840623, "grad_norm": 1.1106277704238892, "learning_rate": 0.0001676216730750037, "loss": 1.6842, "step": 1336 }, { "epoch": 0.10008796062358467, "grad_norm": 1.218356966972351, "learning_rate": 0.0001675756475641356, "loss": 2.0284, "step": 1337 }, { "epoch": 0.10016282072876312, "grad_norm": 1.1555066108703613, "learning_rate": 0.00016752959569274727, "loss": 2.4866, "step": 1338 }, { "epoch": 0.10023768083394158, "grad_norm": 1.273892879486084, "learning_rate": 0.000167483517478803, "loss": 1.9848, "step": 1339 }, { "epoch": 0.10031254093912002, "grad_norm": 1.2141733169555664, "learning_rate": 0.00016743741294027742, "loss": 2.141, "step": 1340 }, { "epoch": 0.10038740104429847, "grad_norm": 1.1448335647583008, "learning_rate": 0.0001673912820951554, "loss": 2.0254, "step": 1341 }, { "epoch": 0.10046226114947691, "grad_norm": 1.0731639862060547, "learning_rate": 0.00016734512496143205, "loss": 1.7711, "step": 1342 }, { "epoch": 0.10053712125465536, "grad_norm": 1.2731566429138184, "learning_rate": 0.00016729894155711282, "loss": 1.6752, "step": 1343 }, { "epoch": 0.1006119813598338, "grad_norm": 1.1804865598678589, "learning_rate": 0.0001672527319002133, "loss": 2.0012, "step": 1344 }, { "epoch": 0.10068684146501226, "grad_norm": 1.1807719469070435, "learning_rate": 0.00016720649600875936, "loss": 1.7433, "step": 1345 }, { "epoch": 0.10076170157019071, "grad_norm": 1.1872445344924927, "learning_rate": 0.0001671602339007872, "loss": 2.1598, "step": 1346 }, { "epoch": 0.10083656167536915, "grad_norm": 1.1488289833068848, "learning_rate": 0.00016711394559434307, "loss": 1.8963, "step": 1347 }, { "epoch": 0.1009114217805476, "grad_norm": 1.3485081195831299, "learning_rate": 0.0001670676311074836, "loss": 2.4473, "step": 1348 }, { "epoch": 0.10098628188572605, "grad_norm": 1.6222820281982422, "learning_rate": 0.0001670212904582755, "loss": 2.545, "step": 1349 }, { "epoch": 0.10106114199090449, "grad_norm": 1.3797476291656494, "learning_rate": 0.00016697492366479582, "loss": 2.1509, "step": 1350 }, { "epoch": 0.10106114199090449, "eval_loss": 2.14520263671875, "eval_runtime": 178.7669, "eval_samples_per_second": 27.969, "eval_steps_per_second": 13.985, "step": 1350 }, { "epoch": 0.10113600209608295, "grad_norm": 1.414994239807129, "learning_rate": 0.00016692853074513168, "loss": 1.9302, "step": 1351 }, { "epoch": 0.1012108622012614, "grad_norm": 1.467681884765625, "learning_rate": 0.00016688211171738044, "loss": 1.969, "step": 1352 }, { "epoch": 0.10128572230643984, "grad_norm": 1.4193848371505737, "learning_rate": 0.00016683566659964974, "loss": 1.9447, "step": 1353 }, { "epoch": 0.10136058241161829, "grad_norm": 1.1241486072540283, "learning_rate": 0.0001667891954100572, "loss": 1.7657, "step": 1354 }, { "epoch": 0.10143544251679673, "grad_norm": 1.3256449699401855, "learning_rate": 0.00016674269816673083, "loss": 2.2017, "step": 1355 }, { "epoch": 0.10151030262197519, "grad_norm": 1.2398390769958496, "learning_rate": 0.0001666961748878086, "loss": 2.1309, "step": 1356 }, { "epoch": 0.10158516272715364, "grad_norm": 1.346687912940979, "learning_rate": 0.00016664962559143884, "loss": 2.2566, "step": 1357 }, { "epoch": 0.10166002283233208, "grad_norm": 1.6762033700942993, "learning_rate": 0.0001666030502957798, "loss": 2.2217, "step": 1358 }, { "epoch": 0.10173488293751053, "grad_norm": 1.207349181175232, "learning_rate": 0.00016655644901900006, "loss": 2.0345, "step": 1359 }, { "epoch": 0.10180974304268897, "grad_norm": 1.6247551441192627, "learning_rate": 0.0001665098217792783, "loss": 2.3608, "step": 1360 }, { "epoch": 0.10188460314786742, "grad_norm": 1.4248582124710083, "learning_rate": 0.00016646316859480323, "loss": 2.6456, "step": 1361 }, { "epoch": 0.10195946325304588, "grad_norm": 1.0580540895462036, "learning_rate": 0.00016641648948377376, "loss": 1.8013, "step": 1362 }, { "epoch": 0.10203432335822432, "grad_norm": 1.2987760305404663, "learning_rate": 0.0001663697844643989, "loss": 2.0672, "step": 1363 }, { "epoch": 0.10210918346340277, "grad_norm": 1.2868109941482544, "learning_rate": 0.0001663230535548978, "loss": 2.0375, "step": 1364 }, { "epoch": 0.10218404356858121, "grad_norm": 1.0542927980422974, "learning_rate": 0.00016627629677349966, "loss": 2.078, "step": 1365 }, { "epoch": 0.10225890367375966, "grad_norm": 1.618461012840271, "learning_rate": 0.00016622951413844373, "loss": 2.4947, "step": 1366 }, { "epoch": 0.1023337637789381, "grad_norm": 1.3086234331130981, "learning_rate": 0.0001661827056679795, "loss": 2.3998, "step": 1367 }, { "epoch": 0.10240862388411656, "grad_norm": 1.4383751153945923, "learning_rate": 0.00016613587138036642, "loss": 2.2084, "step": 1368 }, { "epoch": 0.10248348398929501, "grad_norm": 1.3373417854309082, "learning_rate": 0.00016608901129387395, "loss": 2.6147, "step": 1369 }, { "epoch": 0.10255834409447345, "grad_norm": 1.3508145809173584, "learning_rate": 0.0001660421254267818, "loss": 2.4913, "step": 1370 }, { "epoch": 0.1026332041996519, "grad_norm": 1.2127130031585693, "learning_rate": 0.0001659952137973796, "loss": 1.9995, "step": 1371 }, { "epoch": 0.10270806430483034, "grad_norm": 1.3617942333221436, "learning_rate": 0.00016594827642396704, "loss": 2.5132, "step": 1372 }, { "epoch": 0.1027829244100088, "grad_norm": 1.063012719154358, "learning_rate": 0.0001659013133248539, "loss": 1.7398, "step": 1373 }, { "epoch": 0.10285778451518725, "grad_norm": 1.2978633642196655, "learning_rate": 0.0001658543245183599, "loss": 2.0811, "step": 1374 }, { "epoch": 0.1029326446203657, "grad_norm": 1.3186272382736206, "learning_rate": 0.00016580731002281498, "loss": 1.9617, "step": 1375 }, { "epoch": 0.10300750472554414, "grad_norm": 1.5570769309997559, "learning_rate": 0.00016576026985655888, "loss": 2.5364, "step": 1376 }, { "epoch": 0.10308236483072258, "grad_norm": 1.2345267534255981, "learning_rate": 0.00016571320403794152, "loss": 2.1808, "step": 1377 }, { "epoch": 0.10315722493590103, "grad_norm": 1.1077262163162231, "learning_rate": 0.0001656661125853227, "loss": 2.1605, "step": 1378 }, { "epoch": 0.10323208504107949, "grad_norm": 1.1092329025268555, "learning_rate": 0.00016561899551707237, "loss": 2.1482, "step": 1379 }, { "epoch": 0.10330694514625793, "grad_norm": 1.2542911767959595, "learning_rate": 0.0001655718528515703, "loss": 2.4911, "step": 1380 }, { "epoch": 0.10338180525143638, "grad_norm": 1.4352420568466187, "learning_rate": 0.00016552468460720636, "loss": 2.0798, "step": 1381 }, { "epoch": 0.10345666535661482, "grad_norm": 1.1715481281280518, "learning_rate": 0.00016547749080238032, "loss": 1.7918, "step": 1382 }, { "epoch": 0.10353152546179327, "grad_norm": 1.3264869451522827, "learning_rate": 0.00016543027145550206, "loss": 2.1241, "step": 1383 }, { "epoch": 0.10360638556697171, "grad_norm": 1.2857880592346191, "learning_rate": 0.00016538302658499125, "loss": 2.0609, "step": 1384 }, { "epoch": 0.10368124567215017, "grad_norm": 1.1811988353729248, "learning_rate": 0.0001653357562092776, "loss": 2.3593, "step": 1385 }, { "epoch": 0.10375610577732862, "grad_norm": 1.1397839784622192, "learning_rate": 0.00016528846034680083, "loss": 1.9513, "step": 1386 }, { "epoch": 0.10383096588250706, "grad_norm": 1.6603538990020752, "learning_rate": 0.00016524113901601045, "loss": 2.4549, "step": 1387 }, { "epoch": 0.10390582598768551, "grad_norm": 1.181579828262329, "learning_rate": 0.0001651937922353661, "loss": 2.4334, "step": 1388 }, { "epoch": 0.10398068609286396, "grad_norm": 1.1902142763137817, "learning_rate": 0.00016514642002333712, "loss": 2.394, "step": 1389 }, { "epoch": 0.10405554619804241, "grad_norm": 1.6488478183746338, "learning_rate": 0.00016509902239840297, "loss": 2.5797, "step": 1390 }, { "epoch": 0.10413040630322086, "grad_norm": 1.2363325357437134, "learning_rate": 0.00016505159937905294, "loss": 1.6119, "step": 1391 }, { "epoch": 0.1042052664083993, "grad_norm": 1.2036361694335938, "learning_rate": 0.00016500415098378625, "loss": 2.4476, "step": 1392 }, { "epoch": 0.10428012651357775, "grad_norm": 1.1309524774551392, "learning_rate": 0.0001649566772311119, "loss": 2.0075, "step": 1393 }, { "epoch": 0.1043549866187562, "grad_norm": 1.2295252084732056, "learning_rate": 0.000164909178139549, "loss": 2.3521, "step": 1394 }, { "epoch": 0.10442984672393464, "grad_norm": 1.260619044303894, "learning_rate": 0.00016486165372762635, "loss": 2.0086, "step": 1395 }, { "epoch": 0.1045047068291131, "grad_norm": 1.5090285539627075, "learning_rate": 0.00016481410401388274, "loss": 1.8876, "step": 1396 }, { "epoch": 0.10457956693429155, "grad_norm": 1.2978620529174805, "learning_rate": 0.00016476652901686674, "loss": 2.0788, "step": 1397 }, { "epoch": 0.10465442703946999, "grad_norm": 0.9798474907875061, "learning_rate": 0.0001647189287551369, "loss": 1.748, "step": 1398 }, { "epoch": 0.10472928714464844, "grad_norm": 1.240755319595337, "learning_rate": 0.00016467130324726155, "loss": 2.4854, "step": 1399 }, { "epoch": 0.10480414724982688, "grad_norm": 1.1424627304077148, "learning_rate": 0.00016462365251181885, "loss": 2.6172, "step": 1400 }, { "epoch": 0.10487900735500533, "grad_norm": 1.3274455070495605, "learning_rate": 0.00016457597656739683, "loss": 2.4682, "step": 1401 }, { "epoch": 0.10495386746018379, "grad_norm": 1.133979320526123, "learning_rate": 0.00016452827543259337, "loss": 1.7754, "step": 1402 }, { "epoch": 0.10502872756536223, "grad_norm": 1.445901870727539, "learning_rate": 0.0001644805491260162, "loss": 2.1608, "step": 1403 }, { "epoch": 0.10510358767054068, "grad_norm": 1.4545518159866333, "learning_rate": 0.00016443279766628274, "loss": 2.4818, "step": 1404 }, { "epoch": 0.10517844777571912, "grad_norm": 1.1807664632797241, "learning_rate": 0.0001643850210720204, "loss": 2.0293, "step": 1405 }, { "epoch": 0.10525330788089757, "grad_norm": 1.3189401626586914, "learning_rate": 0.00016433721936186623, "loss": 2.2809, "step": 1406 }, { "epoch": 0.10532816798607603, "grad_norm": 1.252563238143921, "learning_rate": 0.0001642893925544672, "loss": 2.4717, "step": 1407 }, { "epoch": 0.10540302809125447, "grad_norm": 1.2159745693206787, "learning_rate": 0.00016424154066848004, "loss": 2.5536, "step": 1408 }, { "epoch": 0.10547788819643292, "grad_norm": 1.0971202850341797, "learning_rate": 0.00016419366372257116, "loss": 1.9368, "step": 1409 }, { "epoch": 0.10555274830161136, "grad_norm": 1.360582947731018, "learning_rate": 0.00016414576173541697, "loss": 1.8244, "step": 1410 }, { "epoch": 0.10562760840678981, "grad_norm": 1.3573845624923706, "learning_rate": 0.0001640978347257034, "loss": 2.2657, "step": 1411 }, { "epoch": 0.10570246851196825, "grad_norm": 1.2953474521636963, "learning_rate": 0.00016404988271212628, "loss": 2.0228, "step": 1412 }, { "epoch": 0.10577732861714671, "grad_norm": 1.2372575998306274, "learning_rate": 0.00016400190571339116, "loss": 2.108, "step": 1413 }, { "epoch": 0.10585218872232516, "grad_norm": 1.3453383445739746, "learning_rate": 0.00016395390374821338, "loss": 2.277, "step": 1414 }, { "epoch": 0.1059270488275036, "grad_norm": 1.235211730003357, "learning_rate": 0.00016390587683531792, "loss": 2.1419, "step": 1415 }, { "epoch": 0.10600190893268205, "grad_norm": 1.3465505838394165, "learning_rate": 0.0001638578249934396, "loss": 2.1081, "step": 1416 }, { "epoch": 0.1060767690378605, "grad_norm": 1.374618411064148, "learning_rate": 0.00016380974824132296, "loss": 2.1936, "step": 1417 }, { "epoch": 0.10615162914303894, "grad_norm": 1.2044596672058105, "learning_rate": 0.0001637616465977221, "loss": 2.3106, "step": 1418 }, { "epoch": 0.1062264892482174, "grad_norm": 1.6745530366897583, "learning_rate": 0.00016371352008140102, "loss": 2.494, "step": 1419 }, { "epoch": 0.10630134935339584, "grad_norm": 1.3248798847198486, "learning_rate": 0.0001636653687111333, "loss": 1.9224, "step": 1420 }, { "epoch": 0.10637620945857429, "grad_norm": 1.5222692489624023, "learning_rate": 0.00016361719250570237, "loss": 2.1489, "step": 1421 }, { "epoch": 0.10645106956375273, "grad_norm": 1.2590042352676392, "learning_rate": 0.0001635689914839011, "loss": 2.3304, "step": 1422 }, { "epoch": 0.10652592966893118, "grad_norm": 1.2004003524780273, "learning_rate": 0.00016352076566453224, "loss": 2.0023, "step": 1423 }, { "epoch": 0.10660078977410964, "grad_norm": 1.1468009948730469, "learning_rate": 0.00016347251506640818, "loss": 2.019, "step": 1424 }, { "epoch": 0.10667564987928808, "grad_norm": 1.3046714067459106, "learning_rate": 0.00016342423970835092, "loss": 2.6055, "step": 1425 }, { "epoch": 0.10675050998446653, "grad_norm": 1.012610673904419, "learning_rate": 0.00016337593960919218, "loss": 2.1156, "step": 1426 }, { "epoch": 0.10682537008964497, "grad_norm": 1.1486746072769165, "learning_rate": 0.00016332761478777326, "loss": 2.1291, "step": 1427 }, { "epoch": 0.10690023019482342, "grad_norm": 1.4089386463165283, "learning_rate": 0.00016327926526294513, "loss": 2.3441, "step": 1428 }, { "epoch": 0.10697509030000187, "grad_norm": 1.0898189544677734, "learning_rate": 0.0001632308910535685, "loss": 1.4407, "step": 1429 }, { "epoch": 0.10704995040518032, "grad_norm": 1.28274405002594, "learning_rate": 0.00016318249217851355, "loss": 2.3777, "step": 1430 }, { "epoch": 0.10712481051035877, "grad_norm": 1.1939549446105957, "learning_rate": 0.0001631340686566601, "loss": 2.1846, "step": 1431 }, { "epoch": 0.10719967061553722, "grad_norm": 1.0241773128509521, "learning_rate": 0.00016308562050689775, "loss": 2.1035, "step": 1432 }, { "epoch": 0.10727453072071566, "grad_norm": 1.3949769735336304, "learning_rate": 0.00016303714774812557, "loss": 2.5187, "step": 1433 }, { "epoch": 0.1073493908258941, "grad_norm": 1.1736233234405518, "learning_rate": 0.00016298865039925215, "loss": 2.5545, "step": 1434 }, { "epoch": 0.10742425093107255, "grad_norm": 1.1999199390411377, "learning_rate": 0.0001629401284791959, "loss": 1.6012, "step": 1435 }, { "epoch": 0.10749911103625101, "grad_norm": 1.1697944402694702, "learning_rate": 0.0001628915820068846, "loss": 1.7268, "step": 1436 }, { "epoch": 0.10757397114142946, "grad_norm": 1.1482471227645874, "learning_rate": 0.0001628430110012558, "loss": 1.9965, "step": 1437 }, { "epoch": 0.1076488312466079, "grad_norm": 1.1486296653747559, "learning_rate": 0.0001627944154812564, "loss": 2.0748, "step": 1438 }, { "epoch": 0.10772369135178635, "grad_norm": 1.1992167234420776, "learning_rate": 0.00016274579546584304, "loss": 2.0305, "step": 1439 }, { "epoch": 0.10779855145696479, "grad_norm": 1.2968766689300537, "learning_rate": 0.00016269715097398183, "loss": 2.3267, "step": 1440 }, { "epoch": 0.10787341156214325, "grad_norm": 1.324224829673767, "learning_rate": 0.0001626484820246485, "loss": 2.3139, "step": 1441 }, { "epoch": 0.1079482716673217, "grad_norm": 1.2704652547836304, "learning_rate": 0.00016259978863682824, "loss": 2.0131, "step": 1442 }, { "epoch": 0.10802313177250014, "grad_norm": 1.6730550527572632, "learning_rate": 0.0001625510708295158, "loss": 1.8229, "step": 1443 }, { "epoch": 0.10809799187767859, "grad_norm": 1.441396713256836, "learning_rate": 0.00016250232862171548, "loss": 2.0619, "step": 1444 }, { "epoch": 0.10817285198285703, "grad_norm": 1.2217704057693481, "learning_rate": 0.00016245356203244108, "loss": 2.641, "step": 1445 }, { "epoch": 0.10824771208803548, "grad_norm": 1.219352126121521, "learning_rate": 0.00016240477108071593, "loss": 1.6454, "step": 1446 }, { "epoch": 0.10832257219321394, "grad_norm": 1.454992651939392, "learning_rate": 0.0001623559557855728, "loss": 2.6781, "step": 1447 }, { "epoch": 0.10839743229839238, "grad_norm": 1.36250901222229, "learning_rate": 0.00016230711616605406, "loss": 2.2777, "step": 1448 }, { "epoch": 0.10847229240357083, "grad_norm": 1.206434726715088, "learning_rate": 0.00016225825224121146, "loss": 2.3591, "step": 1449 }, { "epoch": 0.10854715250874927, "grad_norm": 1.5416185855865479, "learning_rate": 0.0001622093640301063, "loss": 2.3435, "step": 1450 }, { "epoch": 0.10862201261392772, "grad_norm": 1.317427158355713, "learning_rate": 0.00016216045155180932, "loss": 2.1631, "step": 1451 }, { "epoch": 0.10869687271910616, "grad_norm": 1.5101104974746704, "learning_rate": 0.00016211151482540078, "loss": 2.7705, "step": 1452 }, { "epoch": 0.10877173282428462, "grad_norm": 1.6048123836517334, "learning_rate": 0.00016206255386997035, "loss": 2.2543, "step": 1453 }, { "epoch": 0.10884659292946307, "grad_norm": 1.1443668603897095, "learning_rate": 0.00016201356870461717, "loss": 1.959, "step": 1454 }, { "epoch": 0.10892145303464151, "grad_norm": 1.2255264520645142, "learning_rate": 0.00016196455934844978, "loss": 2.5827, "step": 1455 }, { "epoch": 0.10899631313981996, "grad_norm": 1.594469666481018, "learning_rate": 0.0001619155258205862, "loss": 1.8588, "step": 1456 }, { "epoch": 0.1090711732449984, "grad_norm": 1.3072234392166138, "learning_rate": 0.0001618664681401539, "loss": 2.3228, "step": 1457 }, { "epoch": 0.10914603335017686, "grad_norm": 1.1683406829833984, "learning_rate": 0.00016181738632628972, "loss": 1.5614, "step": 1458 }, { "epoch": 0.10922089345535531, "grad_norm": 1.1836817264556885, "learning_rate": 0.00016176828039813997, "loss": 2.4262, "step": 1459 }, { "epoch": 0.10929575356053375, "grad_norm": 1.252601146697998, "learning_rate": 0.0001617191503748603, "loss": 2.0929, "step": 1460 }, { "epoch": 0.1093706136657122, "grad_norm": 1.3403968811035156, "learning_rate": 0.00016166999627561582, "loss": 2.2887, "step": 1461 }, { "epoch": 0.10944547377089064, "grad_norm": 1.4914844036102295, "learning_rate": 0.000161620818119581, "loss": 2.6627, "step": 1462 }, { "epoch": 0.10952033387606909, "grad_norm": 1.2858998775482178, "learning_rate": 0.00016157161592593967, "loss": 2.306, "step": 1463 }, { "epoch": 0.10959519398124755, "grad_norm": 1.0430054664611816, "learning_rate": 0.0001615223897138851, "loss": 2.391, "step": 1464 }, { "epoch": 0.109670054086426, "grad_norm": 1.3083268404006958, "learning_rate": 0.00016147313950261988, "loss": 2.0899, "step": 1465 }, { "epoch": 0.10974491419160444, "grad_norm": 1.1585360765457153, "learning_rate": 0.000161423865311356, "loss": 1.5079, "step": 1466 }, { "epoch": 0.10981977429678288, "grad_norm": 1.1752419471740723, "learning_rate": 0.00016137456715931476, "loss": 2.2066, "step": 1467 }, { "epoch": 0.10989463440196133, "grad_norm": 1.228247046470642, "learning_rate": 0.0001613252450657268, "loss": 2.0788, "step": 1468 }, { "epoch": 0.10996949450713979, "grad_norm": 1.1420124769210815, "learning_rate": 0.00016127589904983225, "loss": 2.0338, "step": 1469 }, { "epoch": 0.11004435461231823, "grad_norm": 1.1088730096817017, "learning_rate": 0.00016122652913088032, "loss": 1.8778, "step": 1470 }, { "epoch": 0.11011921471749668, "grad_norm": 1.2451066970825195, "learning_rate": 0.00016117713532812973, "loss": 2.1059, "step": 1471 }, { "epoch": 0.11019407482267513, "grad_norm": 1.1844027042388916, "learning_rate": 0.00016112771766084842, "loss": 1.6139, "step": 1472 }, { "epoch": 0.11026893492785357, "grad_norm": 1.1862640380859375, "learning_rate": 0.00016107827614831373, "loss": 2.1893, "step": 1473 }, { "epoch": 0.11034379503303202, "grad_norm": 1.4060194492340088, "learning_rate": 0.00016102881080981227, "loss": 1.9503, "step": 1474 }, { "epoch": 0.11041865513821048, "grad_norm": 1.1908422708511353, "learning_rate": 0.00016097932166463986, "loss": 1.7708, "step": 1475 }, { "epoch": 0.11049351524338892, "grad_norm": 1.2937484979629517, "learning_rate": 0.00016092980873210172, "loss": 2.3533, "step": 1476 }, { "epoch": 0.11056837534856737, "grad_norm": 1.2592964172363281, "learning_rate": 0.00016088027203151227, "loss": 2.2797, "step": 1477 }, { "epoch": 0.11064323545374581, "grad_norm": 1.2329058647155762, "learning_rate": 0.00016083071158219528, "loss": 2.1571, "step": 1478 }, { "epoch": 0.11071809555892426, "grad_norm": 1.398342490196228, "learning_rate": 0.0001607811274034837, "loss": 2.056, "step": 1479 }, { "epoch": 0.1107929556641027, "grad_norm": 1.238851547241211, "learning_rate": 0.00016073151951471982, "loss": 2.4985, "step": 1480 }, { "epoch": 0.11086781576928116, "grad_norm": 1.1297204494476318, "learning_rate": 0.00016068188793525508, "loss": 1.9222, "step": 1481 }, { "epoch": 0.1109426758744596, "grad_norm": 1.4329811334609985, "learning_rate": 0.00016063223268445026, "loss": 2.4292, "step": 1482 }, { "epoch": 0.11101753597963805, "grad_norm": 1.1774877309799194, "learning_rate": 0.00016058255378167526, "loss": 2.208, "step": 1483 }, { "epoch": 0.1110923960848165, "grad_norm": 1.1959445476531982, "learning_rate": 0.00016053285124630935, "loss": 2.3947, "step": 1484 }, { "epoch": 0.11116725618999494, "grad_norm": 1.195386528968811, "learning_rate": 0.0001604831250977409, "loss": 1.7335, "step": 1485 }, { "epoch": 0.1112421162951734, "grad_norm": 1.1710582971572876, "learning_rate": 0.0001604333753553676, "loss": 2.3415, "step": 1486 }, { "epoch": 0.11131697640035185, "grad_norm": 1.402022123336792, "learning_rate": 0.00016038360203859622, "loss": 2.1873, "step": 1487 }, { "epoch": 0.11139183650553029, "grad_norm": 1.4342947006225586, "learning_rate": 0.00016033380516684278, "loss": 2.3618, "step": 1488 }, { "epoch": 0.11146669661070874, "grad_norm": 1.2007660865783691, "learning_rate": 0.00016028398475953254, "loss": 1.9808, "step": 1489 }, { "epoch": 0.11154155671588718, "grad_norm": 1.2637619972229004, "learning_rate": 0.00016023414083609987, "loss": 2.2568, "step": 1490 }, { "epoch": 0.11161641682106563, "grad_norm": 1.307750940322876, "learning_rate": 0.00016018427341598832, "loss": 1.9082, "step": 1491 }, { "epoch": 0.11169127692624409, "grad_norm": 1.245421051979065, "learning_rate": 0.00016013438251865066, "loss": 2.3408, "step": 1492 }, { "epoch": 0.11176613703142253, "grad_norm": 1.1509568691253662, "learning_rate": 0.00016008446816354876, "loss": 2.1989, "step": 1493 }, { "epoch": 0.11184099713660098, "grad_norm": 1.3098514080047607, "learning_rate": 0.00016003453037015367, "loss": 2.3567, "step": 1494 }, { "epoch": 0.11191585724177942, "grad_norm": 1.2072499990463257, "learning_rate": 0.00015998456915794558, "loss": 1.8827, "step": 1495 }, { "epoch": 0.11199071734695787, "grad_norm": 1.1300392150878906, "learning_rate": 0.0001599345845464138, "loss": 2.0189, "step": 1496 }, { "epoch": 0.11206557745213631, "grad_norm": 1.5155521631240845, "learning_rate": 0.00015988457655505682, "loss": 1.7924, "step": 1497 }, { "epoch": 0.11214043755731477, "grad_norm": 1.0969644784927368, "learning_rate": 0.00015983454520338217, "loss": 2.4503, "step": 1498 }, { "epoch": 0.11221529766249322, "grad_norm": 1.4304834604263306, "learning_rate": 0.00015978449051090652, "loss": 2.1686, "step": 1499 }, { "epoch": 0.11229015776767166, "grad_norm": 1.229567527770996, "learning_rate": 0.00015973441249715568, "loss": 2.0084, "step": 1500 }, { "epoch": 0.11229015776767166, "eval_loss": 2.1315884590148926, "eval_runtime": 178.6584, "eval_samples_per_second": 27.986, "eval_steps_per_second": 13.993, "step": 1500 }, { "epoch": 0.11236501787285011, "grad_norm": 1.5305020809173584, "learning_rate": 0.00015968431118166453, "loss": 2.5684, "step": 1501 }, { "epoch": 0.11243987797802855, "grad_norm": 1.3768419027328491, "learning_rate": 0.00015963418658397708, "loss": 2.5733, "step": 1502 }, { "epoch": 0.11251473808320701, "grad_norm": 1.371457576751709, "learning_rate": 0.00015958403872364633, "loss": 2.1265, "step": 1503 }, { "epoch": 0.11258959818838546, "grad_norm": 1.1604242324829102, "learning_rate": 0.00015953386762023442, "loss": 2.0861, "step": 1504 }, { "epoch": 0.1126644582935639, "grad_norm": 1.2877912521362305, "learning_rate": 0.00015948367329331263, "loss": 2.1662, "step": 1505 }, { "epoch": 0.11273931839874235, "grad_norm": 1.2274935245513916, "learning_rate": 0.00015943345576246109, "loss": 1.9342, "step": 1506 }, { "epoch": 0.1128141785039208, "grad_norm": 1.2378355264663696, "learning_rate": 0.00015938321504726922, "loss": 2.4029, "step": 1507 }, { "epoch": 0.11288903860909924, "grad_norm": 1.1867324113845825, "learning_rate": 0.00015933295116733526, "loss": 1.7729, "step": 1508 }, { "epoch": 0.1129638987142777, "grad_norm": 1.2983561754226685, "learning_rate": 0.00015928266414226672, "loss": 2.2694, "step": 1509 }, { "epoch": 0.11303875881945614, "grad_norm": 1.2132484912872314, "learning_rate": 0.00015923235399167992, "loss": 1.9702, "step": 1510 }, { "epoch": 0.11311361892463459, "grad_norm": 1.1751494407653809, "learning_rate": 0.00015918202073520038, "loss": 2.1873, "step": 1511 }, { "epoch": 0.11318847902981304, "grad_norm": 1.475732684135437, "learning_rate": 0.00015913166439246248, "loss": 2.5524, "step": 1512 }, { "epoch": 0.11326333913499148, "grad_norm": 1.2347311973571777, "learning_rate": 0.0001590812849831097, "loss": 2.0673, "step": 1513 }, { "epoch": 0.11333819924016993, "grad_norm": 1.4176100492477417, "learning_rate": 0.00015903088252679454, "loss": 1.9693, "step": 1514 }, { "epoch": 0.11341305934534839, "grad_norm": 1.368050456047058, "learning_rate": 0.00015898045704317842, "loss": 2.1075, "step": 1515 }, { "epoch": 0.11348791945052683, "grad_norm": 1.153599500656128, "learning_rate": 0.00015893000855193173, "loss": 2.456, "step": 1516 }, { "epoch": 0.11356277955570528, "grad_norm": 1.217044472694397, "learning_rate": 0.0001588795370727339, "loss": 1.6267, "step": 1517 }, { "epoch": 0.11363763966088372, "grad_norm": 1.3977729082107544, "learning_rate": 0.00015882904262527333, "loss": 2.3203, "step": 1518 }, { "epoch": 0.11371249976606217, "grad_norm": 1.186957836151123, "learning_rate": 0.00015877852522924732, "loss": 2.2535, "step": 1519 }, { "epoch": 0.11378735987124063, "grad_norm": 1.3640968799591064, "learning_rate": 0.00015872798490436217, "loss": 2.4532, "step": 1520 }, { "epoch": 0.11386221997641907, "grad_norm": 1.387534737586975, "learning_rate": 0.0001586774216703331, "loss": 2.1568, "step": 1521 }, { "epoch": 0.11393708008159752, "grad_norm": 1.2614580392837524, "learning_rate": 0.00015862683554688424, "loss": 1.8193, "step": 1522 }, { "epoch": 0.11401194018677596, "grad_norm": 1.3751025199890137, "learning_rate": 0.00015857622655374875, "loss": 1.9759, "step": 1523 }, { "epoch": 0.11408680029195441, "grad_norm": 1.2049819231033325, "learning_rate": 0.00015852559471066861, "loss": 2.4674, "step": 1524 }, { "epoch": 0.11416166039713285, "grad_norm": 1.2412619590759277, "learning_rate": 0.00015847494003739474, "loss": 2.0931, "step": 1525 }, { "epoch": 0.11423652050231131, "grad_norm": 1.3276125192642212, "learning_rate": 0.00015842426255368696, "loss": 1.9284, "step": 1526 }, { "epoch": 0.11431138060748976, "grad_norm": 1.105554461479187, "learning_rate": 0.00015837356227931405, "loss": 2.1466, "step": 1527 }, { "epoch": 0.1143862407126682, "grad_norm": 1.318425178527832, "learning_rate": 0.0001583228392340536, "loss": 2.2307, "step": 1528 }, { "epoch": 0.11446110081784665, "grad_norm": 1.2088428735733032, "learning_rate": 0.0001582720934376921, "loss": 2.2104, "step": 1529 }, { "epoch": 0.11453596092302509, "grad_norm": 1.1680271625518799, "learning_rate": 0.00015822132491002497, "loss": 2.4187, "step": 1530 }, { "epoch": 0.11461082102820354, "grad_norm": 1.2665177583694458, "learning_rate": 0.00015817053367085643, "loss": 1.8447, "step": 1531 }, { "epoch": 0.114685681133382, "grad_norm": 1.278480052947998, "learning_rate": 0.00015811971973999958, "loss": 2.2408, "step": 1532 }, { "epoch": 0.11476054123856044, "grad_norm": 1.3042091131210327, "learning_rate": 0.0001580688831372764, "loss": 2.3011, "step": 1533 }, { "epoch": 0.11483540134373889, "grad_norm": 1.5212299823760986, "learning_rate": 0.00015801802388251769, "loss": 2.2183, "step": 1534 }, { "epoch": 0.11491026144891733, "grad_norm": 1.297083854675293, "learning_rate": 0.00015796714199556305, "loss": 2.3994, "step": 1535 }, { "epoch": 0.11498512155409578, "grad_norm": 1.1919662952423096, "learning_rate": 0.00015791623749626102, "loss": 2.0106, "step": 1536 }, { "epoch": 0.11505998165927424, "grad_norm": 1.4576109647750854, "learning_rate": 0.00015786531040446882, "loss": 2.0982, "step": 1537 }, { "epoch": 0.11513484176445268, "grad_norm": 1.2109098434448242, "learning_rate": 0.0001578143607400526, "loss": 1.9445, "step": 1538 }, { "epoch": 0.11520970186963113, "grad_norm": 1.0953627824783325, "learning_rate": 0.0001577633885228872, "loss": 1.9514, "step": 1539 }, { "epoch": 0.11528456197480957, "grad_norm": 1.2467195987701416, "learning_rate": 0.0001577123937728564, "loss": 2.2788, "step": 1540 }, { "epoch": 0.11535942207998802, "grad_norm": 1.4690958261489868, "learning_rate": 0.00015766137650985264, "loss": 2.4501, "step": 1541 }, { "epoch": 0.11543428218516646, "grad_norm": 1.2789924144744873, "learning_rate": 0.00015761033675377724, "loss": 2.1576, "step": 1542 }, { "epoch": 0.11550914229034492, "grad_norm": 1.2565027475357056, "learning_rate": 0.0001575592745245402, "loss": 2.2946, "step": 1543 }, { "epoch": 0.11558400239552337, "grad_norm": 1.3574968576431274, "learning_rate": 0.00015750818984206036, "loss": 2.4944, "step": 1544 }, { "epoch": 0.11565886250070181, "grad_norm": 1.3033030033111572, "learning_rate": 0.00015745708272626533, "loss": 2.0965, "step": 1545 }, { "epoch": 0.11573372260588026, "grad_norm": 1.1286591291427612, "learning_rate": 0.00015740595319709138, "loss": 1.6677, "step": 1546 }, { "epoch": 0.1158085827110587, "grad_norm": 1.3605116605758667, "learning_rate": 0.0001573548012744836, "loss": 2.4697, "step": 1547 }, { "epoch": 0.11588344281623715, "grad_norm": 1.142527461051941, "learning_rate": 0.0001573036269783958, "loss": 1.9551, "step": 1548 }, { "epoch": 0.11595830292141561, "grad_norm": 1.0613837242126465, "learning_rate": 0.0001572524303287905, "loss": 2.2453, "step": 1549 }, { "epoch": 0.11603316302659406, "grad_norm": 1.121192216873169, "learning_rate": 0.000157201211345639, "loss": 1.6973, "step": 1550 }, { "epoch": 0.1161080231317725, "grad_norm": 1.2366766929626465, "learning_rate": 0.00015714997004892122, "loss": 1.9274, "step": 1551 }, { "epoch": 0.11618288323695095, "grad_norm": 1.2095426321029663, "learning_rate": 0.0001570987064586258, "loss": 2.0799, "step": 1552 }, { "epoch": 0.11625774334212939, "grad_norm": 1.0360113382339478, "learning_rate": 0.00015704742059475017, "loss": 1.8594, "step": 1553 }, { "epoch": 0.11633260344730785, "grad_norm": 1.3302100896835327, "learning_rate": 0.00015699611247730038, "loss": 1.9518, "step": 1554 }, { "epoch": 0.1164074635524863, "grad_norm": 1.4807460308074951, "learning_rate": 0.0001569447821262911, "loss": 2.5992, "step": 1555 }, { "epoch": 0.11648232365766474, "grad_norm": 1.180059552192688, "learning_rate": 0.00015689342956174578, "loss": 2.5826, "step": 1556 }, { "epoch": 0.11655718376284319, "grad_norm": 1.4081298112869263, "learning_rate": 0.00015684205480369652, "loss": 2.0669, "step": 1557 }, { "epoch": 0.11663204386802163, "grad_norm": 1.1571223735809326, "learning_rate": 0.000156790657872184, "loss": 2.1739, "step": 1558 }, { "epoch": 0.11670690397320008, "grad_norm": 1.1743603944778442, "learning_rate": 0.00015673923878725763, "loss": 2.1028, "step": 1559 }, { "epoch": 0.11678176407837854, "grad_norm": 1.0989429950714111, "learning_rate": 0.00015668779756897544, "loss": 2.2119, "step": 1560 }, { "epoch": 0.11685662418355698, "grad_norm": 1.1469122171401978, "learning_rate": 0.000156636334237404, "loss": 2.1774, "step": 1561 }, { "epoch": 0.11693148428873543, "grad_norm": 1.5154860019683838, "learning_rate": 0.00015658484881261875, "loss": 1.9624, "step": 1562 }, { "epoch": 0.11700634439391387, "grad_norm": 1.2538763284683228, "learning_rate": 0.00015653334131470348, "loss": 1.7159, "step": 1563 }, { "epoch": 0.11708120449909232, "grad_norm": 1.0978866815567017, "learning_rate": 0.0001564818117637507, "loss": 2.0508, "step": 1564 }, { "epoch": 0.11715606460427076, "grad_norm": 1.573194146156311, "learning_rate": 0.0001564302601798616, "loss": 2.2081, "step": 1565 }, { "epoch": 0.11723092470944922, "grad_norm": 1.290280818939209, "learning_rate": 0.0001563786865831458, "loss": 2.1715, "step": 1566 }, { "epoch": 0.11730578481462767, "grad_norm": 1.2082433700561523, "learning_rate": 0.00015632709099372161, "loss": 2.1504, "step": 1567 }, { "epoch": 0.11738064491980611, "grad_norm": 1.4244985580444336, "learning_rate": 0.00015627547343171597, "loss": 2.6015, "step": 1568 }, { "epoch": 0.11745550502498456, "grad_norm": 1.2765387296676636, "learning_rate": 0.00015622383391726425, "loss": 2.1977, "step": 1569 }, { "epoch": 0.117530365130163, "grad_norm": 1.2902666330337524, "learning_rate": 0.00015617217247051052, "loss": 2.179, "step": 1570 }, { "epoch": 0.11760522523534146, "grad_norm": 1.3193457126617432, "learning_rate": 0.0001561204891116073, "loss": 2.0756, "step": 1571 }, { "epoch": 0.11768008534051991, "grad_norm": 1.2851290702819824, "learning_rate": 0.00015606878386071574, "loss": 2.3523, "step": 1572 }, { "epoch": 0.11775494544569835, "grad_norm": 1.1038391590118408, "learning_rate": 0.00015601705673800543, "loss": 2.1847, "step": 1573 }, { "epoch": 0.1178298055508768, "grad_norm": 1.3671183586120605, "learning_rate": 0.0001559653077636546, "loss": 1.9753, "step": 1574 }, { "epoch": 0.11790466565605524, "grad_norm": 1.3772914409637451, "learning_rate": 0.00015591353695785, "loss": 2.0506, "step": 1575 }, { "epoch": 0.11797952576123369, "grad_norm": 1.2335513830184937, "learning_rate": 0.00015586174434078676, "loss": 2.05, "step": 1576 }, { "epoch": 0.11805438586641215, "grad_norm": 1.2618029117584229, "learning_rate": 0.0001558099299326687, "loss": 2.2768, "step": 1577 }, { "epoch": 0.1181292459715906, "grad_norm": 1.2709025144577026, "learning_rate": 0.00015575809375370798, "loss": 2.2305, "step": 1578 }, { "epoch": 0.11820410607676904, "grad_norm": 1.323638677597046, "learning_rate": 0.0001557062358241254, "loss": 2.5505, "step": 1579 }, { "epoch": 0.11827896618194748, "grad_norm": 1.3125050067901611, "learning_rate": 0.0001556543561641501, "loss": 2.037, "step": 1580 }, { "epoch": 0.11835382628712593, "grad_norm": 1.0743082761764526, "learning_rate": 0.0001556024547940198, "loss": 2.0825, "step": 1581 }, { "epoch": 0.11842868639230437, "grad_norm": 1.5243746042251587, "learning_rate": 0.00015555053173398067, "loss": 2.5043, "step": 1582 }, { "epoch": 0.11850354649748283, "grad_norm": 1.3300241231918335, "learning_rate": 0.0001554985870042873, "loss": 2.624, "step": 1583 }, { "epoch": 0.11857840660266128, "grad_norm": 1.1617661714553833, "learning_rate": 0.00015544662062520276, "loss": 2.1202, "step": 1584 }, { "epoch": 0.11865326670783972, "grad_norm": 1.1661304235458374, "learning_rate": 0.0001553946326169986, "loss": 2.2464, "step": 1585 }, { "epoch": 0.11872812681301817, "grad_norm": 1.4300216436386108, "learning_rate": 0.00015534262299995476, "loss": 2.2747, "step": 1586 }, { "epoch": 0.11880298691819662, "grad_norm": 1.306745171546936, "learning_rate": 0.0001552905917943596, "loss": 2.3399, "step": 1587 }, { "epoch": 0.11887784702337507, "grad_norm": 1.204929232597351, "learning_rate": 0.0001552385390205099, "loss": 1.7787, "step": 1588 }, { "epoch": 0.11895270712855352, "grad_norm": 1.4004766941070557, "learning_rate": 0.00015518646469871096, "loss": 2.5462, "step": 1589 }, { "epoch": 0.11902756723373197, "grad_norm": 1.0851693153381348, "learning_rate": 0.00015513436884927634, "loss": 2.0175, "step": 1590 }, { "epoch": 0.11910242733891041, "grad_norm": 1.0728516578674316, "learning_rate": 0.00015508225149252807, "loss": 2.2007, "step": 1591 }, { "epoch": 0.11917728744408886, "grad_norm": 1.3015128374099731, "learning_rate": 0.0001550301126487966, "loss": 1.8371, "step": 1592 }, { "epoch": 0.1192521475492673, "grad_norm": 1.242832064628601, "learning_rate": 0.00015497795233842068, "loss": 2.304, "step": 1593 }, { "epoch": 0.11932700765444576, "grad_norm": 1.375105381011963, "learning_rate": 0.00015492577058174743, "loss": 2.3476, "step": 1594 }, { "epoch": 0.1194018677596242, "grad_norm": 1.3960721492767334, "learning_rate": 0.00015487356739913249, "loss": 2.4622, "step": 1595 }, { "epoch": 0.11947672786480265, "grad_norm": 1.2668826580047607, "learning_rate": 0.00015482134281093968, "loss": 2.0211, "step": 1596 }, { "epoch": 0.1195515879699811, "grad_norm": 1.272165060043335, "learning_rate": 0.00015476909683754126, "loss": 2.0536, "step": 1597 }, { "epoch": 0.11962644807515954, "grad_norm": 1.4260144233703613, "learning_rate": 0.0001547168294993178, "loss": 2.5106, "step": 1598 }, { "epoch": 0.11970130818033799, "grad_norm": 1.4087669849395752, "learning_rate": 0.00015466454081665824, "loss": 2.344, "step": 1599 }, { "epoch": 0.11977616828551645, "grad_norm": 1.1303629875183105, "learning_rate": 0.0001546122308099598, "loss": 1.6081, "step": 1600 }, { "epoch": 0.11985102839069489, "grad_norm": 1.2180038690567017, "learning_rate": 0.00015455989949962807, "loss": 2.211, "step": 1601 }, { "epoch": 0.11992588849587334, "grad_norm": 1.4002772569656372, "learning_rate": 0.00015450754690607687, "loss": 1.8053, "step": 1602 }, { "epoch": 0.12000074860105178, "grad_norm": 1.5316295623779297, "learning_rate": 0.00015445517304972842, "loss": 2.1336, "step": 1603 }, { "epoch": 0.12007560870623023, "grad_norm": 1.2090009450912476, "learning_rate": 0.0001544027779510132, "loss": 1.911, "step": 1604 }, { "epoch": 0.12015046881140869, "grad_norm": 1.2272241115570068, "learning_rate": 0.0001543503616303699, "loss": 2.1332, "step": 1605 }, { "epoch": 0.12022532891658713, "grad_norm": 1.1732720136642456, "learning_rate": 0.00015429792410824556, "loss": 2.1246, "step": 1606 }, { "epoch": 0.12030018902176558, "grad_norm": 1.2106130123138428, "learning_rate": 0.00015424546540509558, "loss": 2.2561, "step": 1607 }, { "epoch": 0.12037504912694402, "grad_norm": 1.3339710235595703, "learning_rate": 0.0001541929855413834, "loss": 2.1057, "step": 1608 }, { "epoch": 0.12044990923212247, "grad_norm": 1.148244023323059, "learning_rate": 0.00015414048453758094, "loss": 2.0693, "step": 1609 }, { "epoch": 0.12052476933730091, "grad_norm": 1.3082127571105957, "learning_rate": 0.0001540879624141682, "loss": 2.0268, "step": 1610 }, { "epoch": 0.12059962944247937, "grad_norm": 1.4446126222610474, "learning_rate": 0.0001540354191916335, "loss": 2.1207, "step": 1611 }, { "epoch": 0.12067448954765782, "grad_norm": 1.3395062685012817, "learning_rate": 0.00015398285489047342, "loss": 2.5092, "step": 1612 }, { "epoch": 0.12074934965283626, "grad_norm": 1.1893998384475708, "learning_rate": 0.0001539302695311927, "loss": 2.3393, "step": 1613 }, { "epoch": 0.12082420975801471, "grad_norm": 1.4130231142044067, "learning_rate": 0.00015387766313430424, "loss": 2.3885, "step": 1614 }, { "epoch": 0.12089906986319315, "grad_norm": 1.109057068824768, "learning_rate": 0.00015382503572032932, "loss": 1.7843, "step": 1615 }, { "epoch": 0.1209739299683716, "grad_norm": 1.2882617712020874, "learning_rate": 0.00015377238730979726, "loss": 2.1402, "step": 1616 }, { "epoch": 0.12104879007355006, "grad_norm": 1.2403464317321777, "learning_rate": 0.00015371971792324564, "loss": 2.2962, "step": 1617 }, { "epoch": 0.1211236501787285, "grad_norm": 1.340833067893982, "learning_rate": 0.0001536670275812202, "loss": 2.2195, "step": 1618 }, { "epoch": 0.12119851028390695, "grad_norm": 1.118361473083496, "learning_rate": 0.0001536143163042749, "loss": 1.4727, "step": 1619 }, { "epoch": 0.1212733703890854, "grad_norm": 1.3009833097457886, "learning_rate": 0.00015356158411297183, "loss": 2.3485, "step": 1620 }, { "epoch": 0.12134823049426384, "grad_norm": 1.0536458492279053, "learning_rate": 0.0001535088310278812, "loss": 1.8596, "step": 1621 }, { "epoch": 0.1214230905994423, "grad_norm": 1.3661812543869019, "learning_rate": 0.0001534560570695814, "loss": 2.0295, "step": 1622 }, { "epoch": 0.12149795070462074, "grad_norm": 1.481446385383606, "learning_rate": 0.00015340326225865905, "loss": 1.8093, "step": 1623 }, { "epoch": 0.12157281080979919, "grad_norm": 1.3617570400238037, "learning_rate": 0.0001533504466157088, "loss": 2.1746, "step": 1624 }, { "epoch": 0.12164767091497763, "grad_norm": 1.4018471240997314, "learning_rate": 0.0001532976101613334, "loss": 1.86, "step": 1625 }, { "epoch": 0.12172253102015608, "grad_norm": 1.3419103622436523, "learning_rate": 0.00015324475291614382, "loss": 2.0939, "step": 1626 }, { "epoch": 0.12179739112533453, "grad_norm": 1.0674574375152588, "learning_rate": 0.0001531918749007591, "loss": 1.6556, "step": 1627 }, { "epoch": 0.12187225123051298, "grad_norm": 1.1422722339630127, "learning_rate": 0.00015313897613580636, "loss": 1.5524, "step": 1628 }, { "epoch": 0.12194711133569143, "grad_norm": 1.1097332239151, "learning_rate": 0.0001530860566419208, "loss": 1.8439, "step": 1629 }, { "epoch": 0.12202197144086988, "grad_norm": 1.276288628578186, "learning_rate": 0.0001530331164397458, "loss": 1.7666, "step": 1630 }, { "epoch": 0.12209683154604832, "grad_norm": 1.2701642513275146, "learning_rate": 0.00015298015554993265, "loss": 1.7932, "step": 1631 }, { "epoch": 0.12217169165122677, "grad_norm": 1.173003911972046, "learning_rate": 0.00015292717399314096, "loss": 2.3289, "step": 1632 }, { "epoch": 0.12224655175640521, "grad_norm": 1.1957323551177979, "learning_rate": 0.00015287417179003814, "loss": 1.8387, "step": 1633 }, { "epoch": 0.12232141186158367, "grad_norm": 1.147047519683838, "learning_rate": 0.0001528211489612998, "loss": 1.7067, "step": 1634 }, { "epoch": 0.12239627196676212, "grad_norm": 1.5645887851715088, "learning_rate": 0.00015276810552760953, "loss": 2.2677, "step": 1635 }, { "epoch": 0.12247113207194056, "grad_norm": 1.376428484916687, "learning_rate": 0.00015271504150965912, "loss": 1.7709, "step": 1636 }, { "epoch": 0.122545992177119, "grad_norm": 1.2375415563583374, "learning_rate": 0.0001526619569281481, "loss": 1.7213, "step": 1637 }, { "epoch": 0.12262085228229745, "grad_norm": 1.3098688125610352, "learning_rate": 0.00015260885180378429, "loss": 1.5724, "step": 1638 }, { "epoch": 0.12269571238747591, "grad_norm": 1.6278733015060425, "learning_rate": 0.00015255572615728338, "loss": 1.9591, "step": 1639 }, { "epoch": 0.12277057249265436, "grad_norm": 1.2630109786987305, "learning_rate": 0.0001525025800093691, "loss": 1.9843, "step": 1640 }, { "epoch": 0.1228454325978328, "grad_norm": 1.1916438341140747, "learning_rate": 0.0001524494133807732, "loss": 1.6742, "step": 1641 }, { "epoch": 0.12292029270301125, "grad_norm": 1.4680110216140747, "learning_rate": 0.00015239622629223537, "loss": 2.366, "step": 1642 }, { "epoch": 0.12299515280818969, "grad_norm": 1.2704627513885498, "learning_rate": 0.00015234301876450334, "loss": 2.3117, "step": 1643 }, { "epoch": 0.12307001291336814, "grad_norm": 1.5764087438583374, "learning_rate": 0.00015228979081833278, "loss": 2.0774, "step": 1644 }, { "epoch": 0.1231448730185466, "grad_norm": 1.1682285070419312, "learning_rate": 0.00015223654247448732, "loss": 2.2049, "step": 1645 }, { "epoch": 0.12321973312372504, "grad_norm": 1.6811623573303223, "learning_rate": 0.00015218327375373855, "loss": 1.8852, "step": 1646 }, { "epoch": 0.12329459322890349, "grad_norm": 1.2056806087493896, "learning_rate": 0.00015212998467686607, "loss": 2.1365, "step": 1647 }, { "epoch": 0.12336945333408193, "grad_norm": 1.754183053970337, "learning_rate": 0.00015207667526465722, "loss": 2.3408, "step": 1648 }, { "epoch": 0.12344431343926038, "grad_norm": 1.1421213150024414, "learning_rate": 0.0001520233455379076, "loss": 2.0425, "step": 1649 }, { "epoch": 0.12351917354443882, "grad_norm": 1.1446012258529663, "learning_rate": 0.00015196999551742043, "loss": 1.9503, "step": 1650 }, { "epoch": 0.12351917354443882, "eval_loss": 2.1120011806488037, "eval_runtime": 179.0511, "eval_samples_per_second": 27.925, "eval_steps_per_second": 13.962, "step": 1650 }, { "epoch": 0.12359403364961728, "grad_norm": 1.5285367965698242, "learning_rate": 0.00015191662522400702, "loss": 2.1328, "step": 1651 }, { "epoch": 0.12366889375479573, "grad_norm": 1.8826360702514648, "learning_rate": 0.00015186323467848652, "loss": 1.9523, "step": 1652 }, { "epoch": 0.12374375385997417, "grad_norm": 1.1760352849960327, "learning_rate": 0.00015180982390168603, "loss": 2.1023, "step": 1653 }, { "epoch": 0.12381861396515262, "grad_norm": 1.2474899291992188, "learning_rate": 0.00015175639291444045, "loss": 2.376, "step": 1654 }, { "epoch": 0.12389347407033106, "grad_norm": 1.1392631530761719, "learning_rate": 0.00015170294173759267, "loss": 2.153, "step": 1655 }, { "epoch": 0.12396833417550952, "grad_norm": 1.2146198749542236, "learning_rate": 0.0001516494703919934, "loss": 2.2106, "step": 1656 }, { "epoch": 0.12404319428068797, "grad_norm": 1.249826192855835, "learning_rate": 0.0001515959788985012, "loss": 2.0551, "step": 1657 }, { "epoch": 0.12411805438586641, "grad_norm": 1.2728735208511353, "learning_rate": 0.00015154246727798256, "loss": 2.1483, "step": 1658 }, { "epoch": 0.12419291449104486, "grad_norm": 1.1851208209991455, "learning_rate": 0.00015148893555131174, "loss": 2.1977, "step": 1659 }, { "epoch": 0.1242677745962233, "grad_norm": 1.2094632387161255, "learning_rate": 0.00015143538373937088, "loss": 2.431, "step": 1660 }, { "epoch": 0.12434263470140175, "grad_norm": 1.301025152206421, "learning_rate": 0.00015138181186304998, "loss": 1.3115, "step": 1661 }, { "epoch": 0.12441749480658021, "grad_norm": 1.3868370056152344, "learning_rate": 0.00015132821994324683, "loss": 2.1809, "step": 1662 }, { "epoch": 0.12449235491175865, "grad_norm": 1.2767884731292725, "learning_rate": 0.00015127460800086706, "loss": 2.4781, "step": 1663 }, { "epoch": 0.1245672150169371, "grad_norm": 1.3260140419006348, "learning_rate": 0.00015122097605682407, "loss": 1.7609, "step": 1664 }, { "epoch": 0.12464207512211554, "grad_norm": 1.2146204710006714, "learning_rate": 0.00015116732413203917, "loss": 2.2171, "step": 1665 }, { "epoch": 0.12471693522729399, "grad_norm": 1.2543312311172485, "learning_rate": 0.0001511136522474413, "loss": 2.7237, "step": 1666 }, { "epoch": 0.12479179533247244, "grad_norm": 1.7669308185577393, "learning_rate": 0.00015105996042396732, "loss": 2.2114, "step": 1667 }, { "epoch": 0.1248666554376509, "grad_norm": 1.203289270401001, "learning_rate": 0.00015100624868256182, "loss": 2.0508, "step": 1668 }, { "epoch": 0.12494151554282934, "grad_norm": 1.1246908903121948, "learning_rate": 0.00015095251704417715, "loss": 2.5736, "step": 1669 }, { "epoch": 0.1250163756480078, "grad_norm": 1.2954673767089844, "learning_rate": 0.00015089876552977345, "loss": 1.9788, "step": 1670 }, { "epoch": 0.12509123575318623, "grad_norm": 1.393012523651123, "learning_rate": 0.00015084499416031857, "loss": 1.6808, "step": 1671 }, { "epoch": 0.1251660958583647, "grad_norm": 1.2587946653366089, "learning_rate": 0.00015079120295678812, "loss": 2.1196, "step": 1672 }, { "epoch": 0.12524095596354312, "grad_norm": 1.3821507692337036, "learning_rate": 0.00015073739194016556, "loss": 1.7963, "step": 1673 }, { "epoch": 0.12531581606872158, "grad_norm": 1.175564169883728, "learning_rate": 0.00015068356113144188, "loss": 2.3411, "step": 1674 }, { "epoch": 0.1253906761739, "grad_norm": 1.53485107421875, "learning_rate": 0.0001506297105516159, "loss": 1.9641, "step": 1675 }, { "epoch": 0.12546553627907847, "grad_norm": 1.1829657554626465, "learning_rate": 0.00015057584022169418, "loss": 2.187, "step": 1676 }, { "epoch": 0.12554039638425693, "grad_norm": 1.3381998538970947, "learning_rate": 0.00015052195016269094, "loss": 2.3341, "step": 1677 }, { "epoch": 0.12561525648943536, "grad_norm": 1.2915769815444946, "learning_rate": 0.00015046804039562804, "loss": 2.5947, "step": 1678 }, { "epoch": 0.12569011659461382, "grad_norm": 1.3186259269714355, "learning_rate": 0.00015041411094153517, "loss": 2.0692, "step": 1679 }, { "epoch": 0.12576497669979225, "grad_norm": 1.2076092958450317, "learning_rate": 0.00015036016182144957, "loss": 2.0978, "step": 1680 }, { "epoch": 0.1258398368049707, "grad_norm": 1.3691116571426392, "learning_rate": 0.00015030619305641624, "loss": 1.9061, "step": 1681 }, { "epoch": 0.12591469691014917, "grad_norm": 1.4071297645568848, "learning_rate": 0.00015025220466748776, "loss": 2.3753, "step": 1682 }, { "epoch": 0.1259895570153276, "grad_norm": 1.0705280303955078, "learning_rate": 0.00015019819667572444, "loss": 1.9264, "step": 1683 }, { "epoch": 0.12606441712050606, "grad_norm": 1.0392988920211792, "learning_rate": 0.00015014416910219417, "loss": 1.799, "step": 1684 }, { "epoch": 0.1261392772256845, "grad_norm": 1.2844994068145752, "learning_rate": 0.00015009012196797258, "loss": 2.0733, "step": 1685 }, { "epoch": 0.12621413733086295, "grad_norm": 1.2181769609451294, "learning_rate": 0.0001500360552941428, "loss": 2.1358, "step": 1686 }, { "epoch": 0.1262889974360414, "grad_norm": 1.30211341381073, "learning_rate": 0.0001499819691017957, "loss": 2.0887, "step": 1687 }, { "epoch": 0.12636385754121984, "grad_norm": 1.2238171100616455, "learning_rate": 0.00014992786341202967, "loss": 2.2727, "step": 1688 }, { "epoch": 0.1264387176463983, "grad_norm": 1.3321980237960815, "learning_rate": 0.0001498737382459508, "loss": 2.0178, "step": 1689 }, { "epoch": 0.12651357775157673, "grad_norm": 1.1045634746551514, "learning_rate": 0.00014981959362467265, "loss": 1.9011, "step": 1690 }, { "epoch": 0.1265884378567552, "grad_norm": 1.354565978050232, "learning_rate": 0.0001497654295693165, "loss": 2.2177, "step": 1691 }, { "epoch": 0.12666329796193362, "grad_norm": 1.2822046279907227, "learning_rate": 0.00014971124610101111, "loss": 2.3599, "step": 1692 }, { "epoch": 0.12673815806711208, "grad_norm": 1.2431048154830933, "learning_rate": 0.0001496570432408929, "loss": 1.7831, "step": 1693 }, { "epoch": 0.12681301817229054, "grad_norm": 1.229035496711731, "learning_rate": 0.0001496028210101058, "loss": 1.9734, "step": 1694 }, { "epoch": 0.12688787827746897, "grad_norm": 1.2570806741714478, "learning_rate": 0.0001495485794298013, "loss": 2.3082, "step": 1695 }, { "epoch": 0.12696273838264743, "grad_norm": 1.325649380683899, "learning_rate": 0.00014949431852113846, "loss": 2.1526, "step": 1696 }, { "epoch": 0.12703759848782586, "grad_norm": 1.3303685188293457, "learning_rate": 0.0001494400383052839, "loss": 2.0419, "step": 1697 }, { "epoch": 0.12711245859300432, "grad_norm": 1.549606204032898, "learning_rate": 0.00014938573880341163, "loss": 2.4993, "step": 1698 }, { "epoch": 0.12718731869818278, "grad_norm": 1.3367259502410889, "learning_rate": 0.0001493314200367034, "loss": 1.8122, "step": 1699 }, { "epoch": 0.12726217880336121, "grad_norm": 1.1958132982254028, "learning_rate": 0.00014927708202634827, "loss": 2.0824, "step": 1700 }, { "epoch": 0.12733703890853967, "grad_norm": 1.2125838994979858, "learning_rate": 0.00014922272479354302, "loss": 2.25, "step": 1701 }, { "epoch": 0.1274118990137181, "grad_norm": 1.2102372646331787, "learning_rate": 0.0001491683483594917, "loss": 1.7095, "step": 1702 }, { "epoch": 0.12748675911889656, "grad_norm": 1.2477245330810547, "learning_rate": 0.00014911395274540604, "loss": 2.2223, "step": 1703 }, { "epoch": 0.12756161922407502, "grad_norm": 1.2862021923065186, "learning_rate": 0.00014905953797250514, "loss": 2.1114, "step": 1704 }, { "epoch": 0.12763647932925345, "grad_norm": 1.179319977760315, "learning_rate": 0.00014900510406201564, "loss": 1.8423, "step": 1705 }, { "epoch": 0.12771133943443191, "grad_norm": 1.1669728755950928, "learning_rate": 0.00014895065103517153, "loss": 2.0971, "step": 1706 }, { "epoch": 0.12778619953961035, "grad_norm": 1.5127516984939575, "learning_rate": 0.0001488961789132144, "loss": 2.2859, "step": 1707 }, { "epoch": 0.1278610596447888, "grad_norm": 1.2437032461166382, "learning_rate": 0.00014884168771739324, "loss": 2.5611, "step": 1708 }, { "epoch": 0.12793591974996724, "grad_norm": 1.2339715957641602, "learning_rate": 0.00014878717746896443, "loss": 1.7684, "step": 1709 }, { "epoch": 0.1280107798551457, "grad_norm": 1.1835347414016724, "learning_rate": 0.00014873264818919188, "loss": 1.6462, "step": 1710 }, { "epoch": 0.12808563996032415, "grad_norm": 1.2870445251464844, "learning_rate": 0.0001486780998993468, "loss": 1.9308, "step": 1711 }, { "epoch": 0.12816050006550259, "grad_norm": 1.4452252388000488, "learning_rate": 0.0001486235326207079, "loss": 2.1361, "step": 1712 }, { "epoch": 0.12823536017068105, "grad_norm": 1.2638400793075562, "learning_rate": 0.0001485689463745613, "loss": 2.0293, "step": 1713 }, { "epoch": 0.12831022027585948, "grad_norm": 1.4039969444274902, "learning_rate": 0.0001485143411822005, "loss": 2.5257, "step": 1714 }, { "epoch": 0.12838508038103794, "grad_norm": 1.726402997970581, "learning_rate": 0.00014845971706492633, "loss": 2.3054, "step": 1715 }, { "epoch": 0.1284599404862164, "grad_norm": 1.198953628540039, "learning_rate": 0.00014840507404404712, "loss": 1.7641, "step": 1716 }, { "epoch": 0.12853480059139483, "grad_norm": 1.1575957536697388, "learning_rate": 0.00014835041214087847, "loss": 1.5143, "step": 1717 }, { "epoch": 0.12860966069657329, "grad_norm": 1.5104163885116577, "learning_rate": 0.00014829573137674346, "loss": 2.3668, "step": 1718 }, { "epoch": 0.12868452080175172, "grad_norm": 1.2903903722763062, "learning_rate": 0.0001482410317729724, "loss": 1.9985, "step": 1719 }, { "epoch": 0.12875938090693018, "grad_norm": 1.2244126796722412, "learning_rate": 0.00014818631335090297, "loss": 2.2945, "step": 1720 }, { "epoch": 0.12883424101210864, "grad_norm": 1.5271316766738892, "learning_rate": 0.00014813157613188032, "loss": 2.1088, "step": 1721 }, { "epoch": 0.12890910111728707, "grad_norm": 1.3260506391525269, "learning_rate": 0.0001480768201372568, "loss": 2.2321, "step": 1722 }, { "epoch": 0.12898396122246553, "grad_norm": 1.3813551664352417, "learning_rate": 0.0001480220453883921, "loss": 2.4425, "step": 1723 }, { "epoch": 0.12905882132764396, "grad_norm": 1.2803997993469238, "learning_rate": 0.00014796725190665324, "loss": 2.2576, "step": 1724 }, { "epoch": 0.12913368143282242, "grad_norm": 1.1201013326644897, "learning_rate": 0.00014791243971341462, "loss": 1.6227, "step": 1725 }, { "epoch": 0.12920854153800085, "grad_norm": 1.3893831968307495, "learning_rate": 0.00014785760883005787, "loss": 2.1544, "step": 1726 }, { "epoch": 0.1292834016431793, "grad_norm": 1.2077250480651855, "learning_rate": 0.00014780275927797181, "loss": 2.2986, "step": 1727 }, { "epoch": 0.12935826174835777, "grad_norm": 1.235998272895813, "learning_rate": 0.0001477478910785528, "loss": 2.3074, "step": 1728 }, { "epoch": 0.1294331218535362, "grad_norm": 1.2582149505615234, "learning_rate": 0.00014769300425320422, "loss": 1.7092, "step": 1729 }, { "epoch": 0.12950798195871466, "grad_norm": 1.1110175848007202, "learning_rate": 0.00014763809882333687, "loss": 1.8547, "step": 1730 }, { "epoch": 0.1295828420638931, "grad_norm": 1.227130651473999, "learning_rate": 0.00014758317481036875, "loss": 1.9331, "step": 1731 }, { "epoch": 0.12965770216907155, "grad_norm": 1.3814926147460938, "learning_rate": 0.00014752823223572508, "loss": 2.5039, "step": 1732 }, { "epoch": 0.12973256227425, "grad_norm": 1.3067970275878906, "learning_rate": 0.00014747327112083836, "loss": 2.1041, "step": 1733 }, { "epoch": 0.12980742237942844, "grad_norm": 1.3111072778701782, "learning_rate": 0.00014741829148714843, "loss": 2.0453, "step": 1734 }, { "epoch": 0.1298822824846069, "grad_norm": 1.4883724451065063, "learning_rate": 0.00014736329335610207, "loss": 2.2575, "step": 1735 }, { "epoch": 0.12995714258978533, "grad_norm": 1.2026609182357788, "learning_rate": 0.00014730827674915356, "loss": 2.3198, "step": 1736 }, { "epoch": 0.1300320026949638, "grad_norm": 1.4513899087905884, "learning_rate": 0.00014725324168776428, "loss": 2.1082, "step": 1737 }, { "epoch": 0.13010686280014225, "grad_norm": 1.2574878931045532, "learning_rate": 0.00014719818819340275, "loss": 2.4337, "step": 1738 }, { "epoch": 0.13018172290532068, "grad_norm": 1.2600208520889282, "learning_rate": 0.00014714311628754475, "loss": 2.1211, "step": 1739 }, { "epoch": 0.13025658301049914, "grad_norm": 1.606587529182434, "learning_rate": 0.00014708802599167325, "loss": 2.0422, "step": 1740 }, { "epoch": 0.13033144311567757, "grad_norm": 1.2642313241958618, "learning_rate": 0.00014703291732727836, "loss": 2.3317, "step": 1741 }, { "epoch": 0.13040630322085603, "grad_norm": 1.3313870429992676, "learning_rate": 0.00014697779031585737, "loss": 2.0215, "step": 1742 }, { "epoch": 0.13048116332603446, "grad_norm": 0.9812899231910706, "learning_rate": 0.00014692264497891472, "loss": 1.6129, "step": 1743 }, { "epoch": 0.13055602343121292, "grad_norm": 1.3263006210327148, "learning_rate": 0.000146867481337962, "loss": 2.1495, "step": 1744 }, { "epoch": 0.13063088353639138, "grad_norm": 1.2379307746887207, "learning_rate": 0.00014681229941451794, "loss": 1.8739, "step": 1745 }, { "epoch": 0.1307057436415698, "grad_norm": 1.1243404150009155, "learning_rate": 0.00014675709923010841, "loss": 1.9385, "step": 1746 }, { "epoch": 0.13078060374674827, "grad_norm": 1.2504841089248657, "learning_rate": 0.0001467018808062664, "loss": 2.225, "step": 1747 }, { "epoch": 0.1308554638519267, "grad_norm": 1.267438530921936, "learning_rate": 0.000146646644164532, "loss": 2.1551, "step": 1748 }, { "epoch": 0.13093032395710516, "grad_norm": 1.2320982217788696, "learning_rate": 0.0001465913893264524, "loss": 2.0659, "step": 1749 }, { "epoch": 0.13100518406228362, "grad_norm": 1.092518925666809, "learning_rate": 0.00014653611631358198, "loss": 2.5016, "step": 1750 }, { "epoch": 0.13108004416746205, "grad_norm": 1.3446053266525269, "learning_rate": 0.00014648082514748207, "loss": 2.074, "step": 1751 }, { "epoch": 0.1311549042726405, "grad_norm": 1.2421897649765015, "learning_rate": 0.00014642551584972117, "loss": 2.0871, "step": 1752 }, { "epoch": 0.13122976437781894, "grad_norm": 1.279618740081787, "learning_rate": 0.00014637018844187484, "loss": 2.2571, "step": 1753 }, { "epoch": 0.1313046244829974, "grad_norm": 1.3127774000167847, "learning_rate": 0.0001463148429455257, "loss": 2.0415, "step": 1754 }, { "epoch": 0.13137948458817586, "grad_norm": 1.0975751876831055, "learning_rate": 0.0001462594793822634, "loss": 2.5916, "step": 1755 }, { "epoch": 0.1314543446933543, "grad_norm": 1.2415026426315308, "learning_rate": 0.00014620409777368464, "loss": 1.5251, "step": 1756 }, { "epoch": 0.13152920479853275, "grad_norm": 1.3336435556411743, "learning_rate": 0.00014614869814139324, "loss": 2.6957, "step": 1757 }, { "epoch": 0.13160406490371118, "grad_norm": 1.4166295528411865, "learning_rate": 0.00014609328050699998, "loss": 2.3058, "step": 1758 }, { "epoch": 0.13167892500888964, "grad_norm": 1.2732607126235962, "learning_rate": 0.0001460378448921226, "loss": 2.0087, "step": 1759 }, { "epoch": 0.13175378511406807, "grad_norm": 1.3206690549850464, "learning_rate": 0.00014598239131838602, "loss": 2.3062, "step": 1760 }, { "epoch": 0.13182864521924653, "grad_norm": 1.2811468839645386, "learning_rate": 0.000145926919807422, "loss": 2.4846, "step": 1761 }, { "epoch": 0.131903505324425, "grad_norm": 1.1779218912124634, "learning_rate": 0.00014587143038086942, "loss": 1.2095, "step": 1762 }, { "epoch": 0.13197836542960342, "grad_norm": 1.0859185457229614, "learning_rate": 0.00014581592306037408, "loss": 1.7602, "step": 1763 }, { "epoch": 0.13205322553478188, "grad_norm": 1.3303217887878418, "learning_rate": 0.00014576039786758874, "loss": 1.6916, "step": 1764 }, { "epoch": 0.1321280856399603, "grad_norm": 1.3575633764266968, "learning_rate": 0.00014570485482417325, "loss": 2.5597, "step": 1765 }, { "epoch": 0.13220294574513877, "grad_norm": 1.2700060606002808, "learning_rate": 0.00014564929395179428, "loss": 2.3267, "step": 1766 }, { "epoch": 0.13227780585031723, "grad_norm": 1.113111138343811, "learning_rate": 0.00014559371527212553, "loss": 2.2522, "step": 1767 }, { "epoch": 0.13235266595549566, "grad_norm": 1.3092972040176392, "learning_rate": 0.00014553811880684765, "loss": 2.3113, "step": 1768 }, { "epoch": 0.13242752606067412, "grad_norm": 1.4693418741226196, "learning_rate": 0.0001454825045776482, "loss": 2.3862, "step": 1769 }, { "epoch": 0.13250238616585255, "grad_norm": 1.30520761013031, "learning_rate": 0.00014542687260622164, "loss": 2.2819, "step": 1770 }, { "epoch": 0.132577246271031, "grad_norm": 1.525801658630371, "learning_rate": 0.00014537122291426952, "loss": 2.4219, "step": 1771 }, { "epoch": 0.13265210637620947, "grad_norm": 1.183829665184021, "learning_rate": 0.00014531555552350006, "loss": 2.2331, "step": 1772 }, { "epoch": 0.1327269664813879, "grad_norm": 1.2187873125076294, "learning_rate": 0.0001452598704556285, "loss": 2.0588, "step": 1773 }, { "epoch": 0.13280182658656636, "grad_norm": 1.3338450193405151, "learning_rate": 0.00014520416773237705, "loss": 2.3396, "step": 1774 }, { "epoch": 0.1328766866917448, "grad_norm": 1.2261580228805542, "learning_rate": 0.0001451484473754747, "loss": 2.2144, "step": 1775 }, { "epoch": 0.13295154679692325, "grad_norm": 1.0935548543930054, "learning_rate": 0.0001450927094066573, "loss": 2.2888, "step": 1776 }, { "epoch": 0.13302640690210168, "grad_norm": 1.1387577056884766, "learning_rate": 0.00014503695384766768, "loss": 2.1825, "step": 1777 }, { "epoch": 0.13310126700728014, "grad_norm": 1.2473279237747192, "learning_rate": 0.0001449811807202554, "loss": 1.4847, "step": 1778 }, { "epoch": 0.1331761271124586, "grad_norm": 1.1446022987365723, "learning_rate": 0.0001449253900461771, "loss": 1.9436, "step": 1779 }, { "epoch": 0.13325098721763703, "grad_norm": 0.9986206293106079, "learning_rate": 0.00014486958184719596, "loss": 1.3185, "step": 1780 }, { "epoch": 0.1333258473228155, "grad_norm": 1.1014764308929443, "learning_rate": 0.00014481375614508221, "loss": 2.2495, "step": 1781 }, { "epoch": 0.13340070742799393, "grad_norm": 1.443421721458435, "learning_rate": 0.00014475791296161283, "loss": 2.3804, "step": 1782 }, { "epoch": 0.13347556753317238, "grad_norm": 1.4687126874923706, "learning_rate": 0.00014470205231857167, "loss": 2.2905, "step": 1783 }, { "epoch": 0.13355042763835084, "grad_norm": 1.2823964357376099, "learning_rate": 0.0001446461742377493, "loss": 2.3393, "step": 1784 }, { "epoch": 0.13362528774352928, "grad_norm": 1.2927583456039429, "learning_rate": 0.00014459027874094317, "loss": 2.335, "step": 1785 }, { "epoch": 0.13370014784870773, "grad_norm": 1.17427396774292, "learning_rate": 0.00014453436584995752, "loss": 1.9962, "step": 1786 }, { "epoch": 0.13377500795388617, "grad_norm": 1.1823734045028687, "learning_rate": 0.00014447843558660335, "loss": 1.8158, "step": 1787 }, { "epoch": 0.13384986805906463, "grad_norm": 1.2609316110610962, "learning_rate": 0.0001444224879726984, "loss": 1.8814, "step": 1788 }, { "epoch": 0.13392472816424308, "grad_norm": 1.2684667110443115, "learning_rate": 0.00014436652303006727, "loss": 2.1968, "step": 1789 }, { "epoch": 0.13399958826942152, "grad_norm": 1.4494051933288574, "learning_rate": 0.00014431054078054126, "loss": 2.6069, "step": 1790 }, { "epoch": 0.13407444837459997, "grad_norm": 1.2820991277694702, "learning_rate": 0.00014425454124595842, "loss": 2.2673, "step": 1791 }, { "epoch": 0.1341493084797784, "grad_norm": 1.2943484783172607, "learning_rate": 0.00014419852444816358, "loss": 1.7743, "step": 1792 }, { "epoch": 0.13422416858495687, "grad_norm": 1.2113275527954102, "learning_rate": 0.0001441424904090083, "loss": 2.3602, "step": 1793 }, { "epoch": 0.1342990286901353, "grad_norm": 1.4686064720153809, "learning_rate": 0.00014408643915035077, "loss": 2.3669, "step": 1794 }, { "epoch": 0.13437388879531376, "grad_norm": 1.379508137702942, "learning_rate": 0.000144030370694056, "loss": 2.0291, "step": 1795 }, { "epoch": 0.13444874890049222, "grad_norm": 1.2953338623046875, "learning_rate": 0.0001439742850619957, "loss": 1.7792, "step": 1796 }, { "epoch": 0.13452360900567065, "grad_norm": 1.0156526565551758, "learning_rate": 0.0001439181822760483, "loss": 1.2922, "step": 1797 }, { "epoch": 0.1345984691108491, "grad_norm": 1.4123769998550415, "learning_rate": 0.00014386206235809877, "loss": 2.2278, "step": 1798 }, { "epoch": 0.13467332921602754, "grad_norm": 1.2202821969985962, "learning_rate": 0.00014380592533003902, "loss": 1.8405, "step": 1799 }, { "epoch": 0.134748189321206, "grad_norm": 1.1181732416152954, "learning_rate": 0.00014374977121376736, "loss": 1.9088, "step": 1800 }, { "epoch": 0.134748189321206, "eval_loss": 2.099764585494995, "eval_runtime": 178.9637, "eval_samples_per_second": 27.939, "eval_steps_per_second": 13.969, "step": 1800 }, { "epoch": 0.13482304942638446, "grad_norm": 1.2117083072662354, "learning_rate": 0.00014369360003118896, "loss": 2.4402, "step": 1801 }, { "epoch": 0.1348979095315629, "grad_norm": 1.1570435762405396, "learning_rate": 0.00014363741180421555, "loss": 1.7287, "step": 1802 }, { "epoch": 0.13497276963674135, "grad_norm": 1.296866774559021, "learning_rate": 0.0001435812065547656, "loss": 1.9302, "step": 1803 }, { "epoch": 0.13504762974191978, "grad_norm": 1.2372689247131348, "learning_rate": 0.0001435249843047641, "loss": 2.0349, "step": 1804 }, { "epoch": 0.13512248984709824, "grad_norm": 1.3551175594329834, "learning_rate": 0.00014346874507614277, "loss": 1.959, "step": 1805 }, { "epoch": 0.1351973499522767, "grad_norm": 1.9621697664260864, "learning_rate": 0.0001434124888908399, "loss": 2.5044, "step": 1806 }, { "epoch": 0.13527221005745513, "grad_norm": 1.107782006263733, "learning_rate": 0.00014335621577080045, "loss": 1.2001, "step": 1807 }, { "epoch": 0.1353470701626336, "grad_norm": 1.3073853254318237, "learning_rate": 0.00014329992573797588, "loss": 2.6214, "step": 1808 }, { "epoch": 0.13542193026781202, "grad_norm": 1.4306055307388306, "learning_rate": 0.00014324361881432436, "loss": 1.9647, "step": 1809 }, { "epoch": 0.13549679037299048, "grad_norm": 1.662380576133728, "learning_rate": 0.00014318729502181063, "loss": 2.3438, "step": 1810 }, { "epoch": 0.1355716504781689, "grad_norm": 1.2980319261550903, "learning_rate": 0.00014313095438240594, "loss": 1.8007, "step": 1811 }, { "epoch": 0.13564651058334737, "grad_norm": 1.3368327617645264, "learning_rate": 0.00014307459691808818, "loss": 2.1224, "step": 1812 }, { "epoch": 0.13572137068852583, "grad_norm": 1.4532103538513184, "learning_rate": 0.00014301822265084174, "loss": 2.1369, "step": 1813 }, { "epoch": 0.13579623079370426, "grad_norm": 1.4198261499404907, "learning_rate": 0.00014296183160265766, "loss": 2.333, "step": 1814 }, { "epoch": 0.13587109089888272, "grad_norm": 1.1712688207626343, "learning_rate": 0.00014290542379553342, "loss": 1.9458, "step": 1815 }, { "epoch": 0.13594595100406115, "grad_norm": 1.3493659496307373, "learning_rate": 0.00014284899925147312, "loss": 1.8134, "step": 1816 }, { "epoch": 0.1360208111092396, "grad_norm": 1.3916547298431396, "learning_rate": 0.00014279255799248735, "loss": 2.3837, "step": 1817 }, { "epoch": 0.13609567121441807, "grad_norm": 1.3896576166152954, "learning_rate": 0.0001427361000405932, "loss": 2.1482, "step": 1818 }, { "epoch": 0.1361705313195965, "grad_norm": 1.2881159782409668, "learning_rate": 0.0001426796254178144, "loss": 1.9395, "step": 1819 }, { "epoch": 0.13624539142477496, "grad_norm": 1.306483507156372, "learning_rate": 0.00014262313414618096, "loss": 2.0709, "step": 1820 }, { "epoch": 0.1363202515299534, "grad_norm": 1.2413082122802734, "learning_rate": 0.00014256662624772954, "loss": 2.024, "step": 1821 }, { "epoch": 0.13639511163513185, "grad_norm": 1.1831843852996826, "learning_rate": 0.00014251010174450327, "loss": 1.9151, "step": 1822 }, { "epoch": 0.1364699717403103, "grad_norm": 1.3724371194839478, "learning_rate": 0.00014245356065855174, "loss": 2.3219, "step": 1823 }, { "epoch": 0.13654483184548874, "grad_norm": 1.4640085697174072, "learning_rate": 0.00014239700301193104, "loss": 2.2914, "step": 1824 }, { "epoch": 0.1366196919506672, "grad_norm": 1.2367684841156006, "learning_rate": 0.0001423404288267036, "loss": 1.9193, "step": 1825 }, { "epoch": 0.13669455205584563, "grad_norm": 1.2096275091171265, "learning_rate": 0.00014228383812493848, "loss": 1.9911, "step": 1826 }, { "epoch": 0.1367694121610241, "grad_norm": 1.2261931896209717, "learning_rate": 0.00014222723092871103, "loss": 2.0025, "step": 1827 }, { "epoch": 0.13684427226620252, "grad_norm": 1.4427851438522339, "learning_rate": 0.00014217060726010313, "loss": 2.2667, "step": 1828 }, { "epoch": 0.13691913237138098, "grad_norm": 1.3627735376358032, "learning_rate": 0.0001421139671412031, "loss": 2.0686, "step": 1829 }, { "epoch": 0.13699399247655944, "grad_norm": 1.4261817932128906, "learning_rate": 0.0001420573105941055, "loss": 1.9447, "step": 1830 }, { "epoch": 0.13706885258173787, "grad_norm": 1.5134145021438599, "learning_rate": 0.00014200063764091154, "loss": 1.6726, "step": 1831 }, { "epoch": 0.13714371268691633, "grad_norm": 1.3060468435287476, "learning_rate": 0.0001419439483037287, "loss": 2.1568, "step": 1832 }, { "epoch": 0.13721857279209476, "grad_norm": 1.1566110849380493, "learning_rate": 0.00014188724260467083, "loss": 1.8844, "step": 1833 }, { "epoch": 0.13729343289727322, "grad_norm": 1.2287389039993286, "learning_rate": 0.00014183052056585823, "loss": 2.3058, "step": 1834 }, { "epoch": 0.13736829300245168, "grad_norm": 1.207580327987671, "learning_rate": 0.00014177378220941759, "loss": 2.1765, "step": 1835 }, { "epoch": 0.1374431531076301, "grad_norm": 1.473514199256897, "learning_rate": 0.00014171702755748182, "loss": 1.9968, "step": 1836 }, { "epoch": 0.13751801321280857, "grad_norm": 1.165472149848938, "learning_rate": 0.00014166025663219035, "loss": 2.4973, "step": 1837 }, { "epoch": 0.137592873317987, "grad_norm": 1.48367440700531, "learning_rate": 0.0001416034694556889, "loss": 2.2661, "step": 1838 }, { "epoch": 0.13766773342316546, "grad_norm": 1.253893494606018, "learning_rate": 0.00014154666605012948, "loss": 1.867, "step": 1839 }, { "epoch": 0.13774259352834392, "grad_norm": 1.2920336723327637, "learning_rate": 0.0001414898464376706, "loss": 1.6541, "step": 1840 }, { "epoch": 0.13781745363352235, "grad_norm": 1.1883060932159424, "learning_rate": 0.00014143301064047683, "loss": 1.9211, "step": 1841 }, { "epoch": 0.1378923137387008, "grad_norm": 1.2208513021469116, "learning_rate": 0.00014137615868071922, "loss": 1.7072, "step": 1842 }, { "epoch": 0.13796717384387924, "grad_norm": 1.2498359680175781, "learning_rate": 0.00014131929058057514, "loss": 2.3731, "step": 1843 }, { "epoch": 0.1380420339490577, "grad_norm": 1.2707500457763672, "learning_rate": 0.0001412624063622282, "loss": 2.0133, "step": 1844 }, { "epoch": 0.13811689405423613, "grad_norm": 1.3027199506759644, "learning_rate": 0.0001412055060478683, "loss": 1.8784, "step": 1845 }, { "epoch": 0.1381917541594146, "grad_norm": 1.2853953838348389, "learning_rate": 0.00014114858965969164, "loss": 2.0969, "step": 1846 }, { "epoch": 0.13826661426459305, "grad_norm": 1.4335887432098389, "learning_rate": 0.00014109165721990066, "loss": 2.1669, "step": 1847 }, { "epoch": 0.13834147436977148, "grad_norm": 1.3296114206314087, "learning_rate": 0.0001410347087507041, "loss": 1.9031, "step": 1848 }, { "epoch": 0.13841633447494994, "grad_norm": 1.2634931802749634, "learning_rate": 0.00014097774427431693, "loss": 2.3769, "step": 1849 }, { "epoch": 0.13849119458012837, "grad_norm": 1.1392568349838257, "learning_rate": 0.0001409207638129604, "loss": 1.5395, "step": 1850 }, { "epoch": 0.13856605468530683, "grad_norm": 1.277309775352478, "learning_rate": 0.00014086376738886184, "loss": 1.6377, "step": 1851 }, { "epoch": 0.1386409147904853, "grad_norm": 1.2119747400283813, "learning_rate": 0.00014080675502425513, "loss": 2.1455, "step": 1852 }, { "epoch": 0.13871577489566372, "grad_norm": 1.4137977361679077, "learning_rate": 0.00014074972674138, "loss": 2.0064, "step": 1853 }, { "epoch": 0.13879063500084218, "grad_norm": 1.1854732036590576, "learning_rate": 0.00014069268256248262, "loss": 2.2801, "step": 1854 }, { "epoch": 0.13886549510602061, "grad_norm": 1.1197314262390137, "learning_rate": 0.00014063562250981535, "loss": 1.8878, "step": 1855 }, { "epoch": 0.13894035521119907, "grad_norm": 1.380180835723877, "learning_rate": 0.0001405785466056366, "loss": 2.0342, "step": 1856 }, { "epoch": 0.13901521531637753, "grad_norm": 1.317891240119934, "learning_rate": 0.0001405214548722111, "loss": 2.3369, "step": 1857 }, { "epoch": 0.13909007542155596, "grad_norm": 1.4403605461120605, "learning_rate": 0.00014046434733180972, "loss": 2.4418, "step": 1858 }, { "epoch": 0.13916493552673442, "grad_norm": 1.2728699445724487, "learning_rate": 0.00014040722400670948, "loss": 2.5357, "step": 1859 }, { "epoch": 0.13923979563191285, "grad_norm": 1.903470754623413, "learning_rate": 0.00014035008491919355, "loss": 2.2299, "step": 1860 }, { "epoch": 0.13931465573709131, "grad_norm": 1.2882949113845825, "learning_rate": 0.00014029293009155128, "loss": 1.8925, "step": 1861 }, { "epoch": 0.13938951584226975, "grad_norm": 1.4667880535125732, "learning_rate": 0.0001402357595460781, "loss": 1.8934, "step": 1862 }, { "epoch": 0.1394643759474482, "grad_norm": 1.1947779655456543, "learning_rate": 0.0001401785733050757, "loss": 2.076, "step": 1863 }, { "epoch": 0.13953923605262666, "grad_norm": 1.3013192415237427, "learning_rate": 0.00014012137139085172, "loss": 1.3672, "step": 1864 }, { "epoch": 0.1396140961578051, "grad_norm": 1.0421103239059448, "learning_rate": 0.00014006415382572003, "loss": 1.6713, "step": 1865 }, { "epoch": 0.13968895626298355, "grad_norm": 1.3531436920166016, "learning_rate": 0.0001400069206320006, "loss": 2.0457, "step": 1866 }, { "epoch": 0.13976381636816199, "grad_norm": 1.2105365991592407, "learning_rate": 0.0001399496718320194, "loss": 1.8943, "step": 1867 }, { "epoch": 0.13983867647334045, "grad_norm": 1.4422264099121094, "learning_rate": 0.00013989240744810867, "loss": 1.9912, "step": 1868 }, { "epoch": 0.1399135365785189, "grad_norm": 1.1056183576583862, "learning_rate": 0.00013983512750260655, "loss": 1.2408, "step": 1869 }, { "epoch": 0.13998839668369734, "grad_norm": 1.3718981742858887, "learning_rate": 0.00013977783201785733, "loss": 2.1413, "step": 1870 }, { "epoch": 0.1400632567888758, "grad_norm": 1.2405959367752075, "learning_rate": 0.00013972052101621132, "loss": 1.9921, "step": 1871 }, { "epoch": 0.14013811689405423, "grad_norm": 1.6171875, "learning_rate": 0.00013966319452002497, "loss": 2.4748, "step": 1872 }, { "epoch": 0.14021297699923269, "grad_norm": 1.2890247106552124, "learning_rate": 0.0001396058525516607, "loss": 2.4303, "step": 1873 }, { "epoch": 0.14028783710441114, "grad_norm": 1.2078078985214233, "learning_rate": 0.00013954849513348694, "loss": 2.2379, "step": 1874 }, { "epoch": 0.14036269720958958, "grad_norm": 1.4203547239303589, "learning_rate": 0.0001394911222878783, "loss": 1.8295, "step": 1875 }, { "epoch": 0.14043755731476804, "grad_norm": 1.3090572357177734, "learning_rate": 0.0001394337340372152, "loss": 1.9339, "step": 1876 }, { "epoch": 0.14051241741994647, "grad_norm": 1.6869395971298218, "learning_rate": 0.00013937633040388416, "loss": 2.1536, "step": 1877 }, { "epoch": 0.14058727752512493, "grad_norm": 1.15128493309021, "learning_rate": 0.0001393189114102778, "loss": 2.2325, "step": 1878 }, { "epoch": 0.14066213763030336, "grad_norm": 1.4626365900039673, "learning_rate": 0.00013926147707879453, "loss": 1.9826, "step": 1879 }, { "epoch": 0.14073699773548182, "grad_norm": 1.2524784803390503, "learning_rate": 0.00013920402743183895, "loss": 1.999, "step": 1880 }, { "epoch": 0.14081185784066028, "grad_norm": 1.174793004989624, "learning_rate": 0.00013914656249182153, "loss": 2.0563, "step": 1881 }, { "epoch": 0.1408867179458387, "grad_norm": 1.1178196668624878, "learning_rate": 0.0001390890822811586, "loss": 1.8408, "step": 1882 }, { "epoch": 0.14096157805101717, "grad_norm": 1.0131006240844727, "learning_rate": 0.00013903158682227268, "loss": 2.6425, "step": 1883 }, { "epoch": 0.1410364381561956, "grad_norm": 1.3321396112442017, "learning_rate": 0.00013897407613759207, "loss": 2.0549, "step": 1884 }, { "epoch": 0.14111129826137406, "grad_norm": 1.084257960319519, "learning_rate": 0.00013891655024955113, "loss": 1.9881, "step": 1885 }, { "epoch": 0.14118615836655252, "grad_norm": 1.3064546585083008, "learning_rate": 0.00013885900918058996, "loss": 2.4509, "step": 1886 }, { "epoch": 0.14126101847173095, "grad_norm": 1.2701197862625122, "learning_rate": 0.00013880145295315478, "loss": 1.818, "step": 1887 }, { "epoch": 0.1413358785769094, "grad_norm": 1.3899896144866943, "learning_rate": 0.00013874388158969763, "loss": 2.1051, "step": 1888 }, { "epoch": 0.14141073868208784, "grad_norm": 1.3075038194656372, "learning_rate": 0.00013868629511267644, "loss": 2.2626, "step": 1889 }, { "epoch": 0.1414855987872663, "grad_norm": 1.4200823307037354, "learning_rate": 0.00013862869354455513, "loss": 2.7599, "step": 1890 }, { "epoch": 0.14156045889244476, "grad_norm": 1.4246937036514282, "learning_rate": 0.00013857107690780335, "loss": 1.6488, "step": 1891 }, { "epoch": 0.1416353189976232, "grad_norm": 1.1869983673095703, "learning_rate": 0.0001385134452248968, "loss": 2.2505, "step": 1892 }, { "epoch": 0.14171017910280165, "grad_norm": 1.4029122591018677, "learning_rate": 0.00013845579851831693, "loss": 2.0296, "step": 1893 }, { "epoch": 0.14178503920798008, "grad_norm": 1.358401894569397, "learning_rate": 0.0001383981368105511, "loss": 2.2849, "step": 1894 }, { "epoch": 0.14185989931315854, "grad_norm": 1.2486733198165894, "learning_rate": 0.0001383404601240925, "loss": 2.0645, "step": 1895 }, { "epoch": 0.14193475941833697, "grad_norm": 1.1922391653060913, "learning_rate": 0.00013828276848144017, "loss": 2.1943, "step": 1896 }, { "epoch": 0.14200961952351543, "grad_norm": 1.4833083152770996, "learning_rate": 0.000138225061905099, "loss": 2.1818, "step": 1897 }, { "epoch": 0.1420844796286939, "grad_norm": 1.4014719724655151, "learning_rate": 0.00013816734041757973, "loss": 1.9872, "step": 1898 }, { "epoch": 0.14215933973387232, "grad_norm": 1.18297278881073, "learning_rate": 0.0001381096040413988, "loss": 1.9829, "step": 1899 }, { "epoch": 0.14223419983905078, "grad_norm": 1.3227430582046509, "learning_rate": 0.00013805185279907857, "loss": 1.9251, "step": 1900 }, { "epoch": 0.1423090599442292, "grad_norm": 1.3742176294326782, "learning_rate": 0.00013799408671314725, "loss": 2.3201, "step": 1901 }, { "epoch": 0.14238392004940767, "grad_norm": 1.2573095560073853, "learning_rate": 0.00013793630580613862, "loss": 2.3061, "step": 1902 }, { "epoch": 0.14245878015458613, "grad_norm": 1.3405253887176514, "learning_rate": 0.00013787851010059246, "loss": 2.2965, "step": 1903 }, { "epoch": 0.14253364025976456, "grad_norm": 1.3670345544815063, "learning_rate": 0.0001378206996190542, "loss": 2.1138, "step": 1904 }, { "epoch": 0.14260850036494302, "grad_norm": 1.2387927770614624, "learning_rate": 0.00013776287438407517, "loss": 2.2148, "step": 1905 }, { "epoch": 0.14268336047012145, "grad_norm": 1.2425695657730103, "learning_rate": 0.00013770503441821224, "loss": 2.1717, "step": 1906 }, { "epoch": 0.1427582205752999, "grad_norm": 1.312178373336792, "learning_rate": 0.00013764717974402822, "loss": 2.5665, "step": 1907 }, { "epoch": 0.14283308068047837, "grad_norm": 1.1174964904785156, "learning_rate": 0.00013758931038409156, "loss": 2.2624, "step": 1908 }, { "epoch": 0.1429079407856568, "grad_norm": 1.4196107387542725, "learning_rate": 0.00013753142636097648, "loss": 1.6823, "step": 1909 }, { "epoch": 0.14298280089083526, "grad_norm": 1.1385408639907837, "learning_rate": 0.0001374735276972629, "loss": 1.4022, "step": 1910 }, { "epoch": 0.1430576609960137, "grad_norm": 1.00640869140625, "learning_rate": 0.00013741561441553643, "loss": 2.2611, "step": 1911 }, { "epoch": 0.14313252110119215, "grad_norm": 1.31563401222229, "learning_rate": 0.00013735768653838843, "loss": 2.3759, "step": 1912 }, { "epoch": 0.14320738120637058, "grad_norm": 1.2412852048873901, "learning_rate": 0.00013729974408841595, "loss": 2.0288, "step": 1913 }, { "epoch": 0.14328224131154904, "grad_norm": 1.2289661169052124, "learning_rate": 0.00013724178708822163, "loss": 2.1903, "step": 1914 }, { "epoch": 0.1433571014167275, "grad_norm": 1.2658535242080688, "learning_rate": 0.00013718381556041397, "loss": 2.5028, "step": 1915 }, { "epoch": 0.14343196152190593, "grad_norm": 1.0692578554153442, "learning_rate": 0.00013712582952760694, "loss": 1.4208, "step": 1916 }, { "epoch": 0.1435068216270844, "grad_norm": 1.4496049880981445, "learning_rate": 0.00013706782901242028, "loss": 1.7751, "step": 1917 }, { "epoch": 0.14358168173226282, "grad_norm": 1.2621546983718872, "learning_rate": 0.00013700981403747938, "loss": 2.6067, "step": 1918 }, { "epoch": 0.14365654183744128, "grad_norm": 1.2873331308364868, "learning_rate": 0.00013695178462541521, "loss": 2.2392, "step": 1919 }, { "epoch": 0.14373140194261974, "grad_norm": 1.1268354654312134, "learning_rate": 0.00013689374079886444, "loss": 1.929, "step": 1920 }, { "epoch": 0.14380626204779817, "grad_norm": 1.3207907676696777, "learning_rate": 0.00013683568258046933, "loss": 2.0562, "step": 1921 }, { "epoch": 0.14388112215297663, "grad_norm": 1.1522400379180908, "learning_rate": 0.00013677760999287775, "loss": 1.896, "step": 1922 }, { "epoch": 0.14395598225815506, "grad_norm": 1.4892287254333496, "learning_rate": 0.00013671952305874317, "loss": 1.9034, "step": 1923 }, { "epoch": 0.14403084236333352, "grad_norm": 1.1947542428970337, "learning_rate": 0.00013666142180072466, "loss": 2.0772, "step": 1924 }, { "epoch": 0.14410570246851198, "grad_norm": 1.4527862071990967, "learning_rate": 0.00013660330624148693, "loss": 2.3346, "step": 1925 }, { "epoch": 0.1441805625736904, "grad_norm": 1.242788553237915, "learning_rate": 0.00013654517640370012, "loss": 2.0569, "step": 1926 }, { "epoch": 0.14425542267886887, "grad_norm": 1.2357585430145264, "learning_rate": 0.0001364870323100402, "loss": 2.0389, "step": 1927 }, { "epoch": 0.1443302827840473, "grad_norm": 1.1797411441802979, "learning_rate": 0.00013642887398318838, "loss": 2.4405, "step": 1928 }, { "epoch": 0.14440514288922576, "grad_norm": 1.3835309743881226, "learning_rate": 0.0001363707014458317, "loss": 2.236, "step": 1929 }, { "epoch": 0.1444800029944042, "grad_norm": 1.2609188556671143, "learning_rate": 0.0001363125147206626, "loss": 1.834, "step": 1930 }, { "epoch": 0.14455486309958265, "grad_norm": 1.263673186302185, "learning_rate": 0.00013625431383037907, "loss": 2.3784, "step": 1931 }, { "epoch": 0.1446297232047611, "grad_norm": 1.5047863721847534, "learning_rate": 0.00013619609879768466, "loss": 1.9925, "step": 1932 }, { "epoch": 0.14470458330993954, "grad_norm": 1.22649347782135, "learning_rate": 0.0001361378696452884, "loss": 2.0158, "step": 1933 }, { "epoch": 0.144779443415118, "grad_norm": 1.3190696239471436, "learning_rate": 0.00013607962639590487, "loss": 2.2288, "step": 1934 }, { "epoch": 0.14485430352029643, "grad_norm": 1.1828807592391968, "learning_rate": 0.00013602136907225405, "loss": 1.9572, "step": 1935 }, { "epoch": 0.1449291636254749, "grad_norm": 1.3119572401046753, "learning_rate": 0.00013596309769706158, "loss": 2.0135, "step": 1936 }, { "epoch": 0.14500402373065335, "grad_norm": 1.3091678619384766, "learning_rate": 0.00013590481229305845, "loss": 1.8761, "step": 1937 }, { "epoch": 0.14507888383583178, "grad_norm": 1.2656974792480469, "learning_rate": 0.00013584651288298113, "loss": 1.8251, "step": 1938 }, { "epoch": 0.14515374394101024, "grad_norm": 1.3680057525634766, "learning_rate": 0.0001357881994895716, "loss": 2.641, "step": 1939 }, { "epoch": 0.14522860404618868, "grad_norm": 1.1497386693954468, "learning_rate": 0.0001357298721355773, "loss": 2.0641, "step": 1940 }, { "epoch": 0.14530346415136713, "grad_norm": 1.6467336416244507, "learning_rate": 0.0001356715308437511, "loss": 1.8826, "step": 1941 }, { "epoch": 0.1453783242565456, "grad_norm": 1.4103337526321411, "learning_rate": 0.00013561317563685123, "loss": 1.8381, "step": 1942 }, { "epoch": 0.14545318436172402, "grad_norm": 1.1681605577468872, "learning_rate": 0.00013555480653764147, "loss": 1.7593, "step": 1943 }, { "epoch": 0.14552804446690248, "grad_norm": 1.2135992050170898, "learning_rate": 0.00013549642356889096, "loss": 1.5074, "step": 1944 }, { "epoch": 0.14560290457208092, "grad_norm": 1.1419379711151123, "learning_rate": 0.00013543802675337421, "loss": 2.1609, "step": 1945 }, { "epoch": 0.14567776467725937, "grad_norm": 1.1440839767456055, "learning_rate": 0.0001353796161138713, "loss": 2.071, "step": 1946 }, { "epoch": 0.1457526247824378, "grad_norm": 1.250229001045227, "learning_rate": 0.00013532119167316745, "loss": 1.8723, "step": 1947 }, { "epoch": 0.14582748488761627, "grad_norm": 1.3832837343215942, "learning_rate": 0.00013526275345405344, "loss": 1.6688, "step": 1948 }, { "epoch": 0.14590234499279472, "grad_norm": 1.3750972747802734, "learning_rate": 0.00013520430147932538, "loss": 2.3267, "step": 1949 }, { "epoch": 0.14597720509797316, "grad_norm": 1.4604889154434204, "learning_rate": 0.00013514583577178482, "loss": 2.4338, "step": 1950 }, { "epoch": 0.14597720509797316, "eval_loss": 2.0879149436950684, "eval_runtime": 178.8852, "eval_samples_per_second": 27.951, "eval_steps_per_second": 13.975, "step": 1950 }, { "epoch": 0.14605206520315162, "grad_norm": 1.5483856201171875, "learning_rate": 0.00013508735635423845, "loss": 2.4214, "step": 1951 }, { "epoch": 0.14612692530833005, "grad_norm": 1.2549041509628296, "learning_rate": 0.00013502886324949856, "loss": 2.3223, "step": 1952 }, { "epoch": 0.1462017854135085, "grad_norm": 1.3651773929595947, "learning_rate": 0.0001349703564803826, "loss": 1.9598, "step": 1953 }, { "epoch": 0.14627664551868697, "grad_norm": 1.2327897548675537, "learning_rate": 0.00013491183606971353, "loss": 2.2578, "step": 1954 }, { "epoch": 0.1463515056238654, "grad_norm": 1.1519122123718262, "learning_rate": 0.00013485330204031937, "loss": 1.9119, "step": 1955 }, { "epoch": 0.14642636572904386, "grad_norm": 1.136296033859253, "learning_rate": 0.00013479475441503368, "loss": 1.9023, "step": 1956 }, { "epoch": 0.1465012258342223, "grad_norm": 1.1533094644546509, "learning_rate": 0.00013473619321669527, "loss": 1.6108, "step": 1957 }, { "epoch": 0.14657608593940075, "grad_norm": 1.2725965976715088, "learning_rate": 0.00013467761846814818, "loss": 1.9682, "step": 1958 }, { "epoch": 0.1466509460445792, "grad_norm": 1.2931771278381348, "learning_rate": 0.00013461903019224185, "loss": 2.1164, "step": 1959 }, { "epoch": 0.14672580614975764, "grad_norm": 1.1966437101364136, "learning_rate": 0.00013456042841183082, "loss": 2.3015, "step": 1960 }, { "epoch": 0.1468006662549361, "grad_norm": 1.3288789987564087, "learning_rate": 0.00013450181314977509, "loss": 1.9651, "step": 1961 }, { "epoch": 0.14687552636011453, "grad_norm": 1.1518120765686035, "learning_rate": 0.0001344431844289398, "loss": 2.0988, "step": 1962 }, { "epoch": 0.146950386465293, "grad_norm": 1.3028076887130737, "learning_rate": 0.00013438454227219533, "loss": 2.274, "step": 1963 }, { "epoch": 0.14702524657047142, "grad_norm": 1.2810450792312622, "learning_rate": 0.00013432588670241738, "loss": 1.6097, "step": 1964 }, { "epoch": 0.14710010667564988, "grad_norm": 1.2976491451263428, "learning_rate": 0.0001342672177424869, "loss": 2.1922, "step": 1965 }, { "epoch": 0.14717496678082834, "grad_norm": 1.224637508392334, "learning_rate": 0.00013420853541528993, "loss": 1.8222, "step": 1966 }, { "epoch": 0.14724982688600677, "grad_norm": 1.3312517404556274, "learning_rate": 0.00013414983974371783, "loss": 1.8674, "step": 1967 }, { "epoch": 0.14732468699118523, "grad_norm": 1.2942736148834229, "learning_rate": 0.00013409113075066713, "loss": 2.0175, "step": 1968 }, { "epoch": 0.14739954709636366, "grad_norm": 1.38751220703125, "learning_rate": 0.00013403240845903956, "loss": 2.1503, "step": 1969 }, { "epoch": 0.14747440720154212, "grad_norm": 1.1226226091384888, "learning_rate": 0.0001339736728917421, "loss": 2.0862, "step": 1970 }, { "epoch": 0.14754926730672058, "grad_norm": 1.311326026916504, "learning_rate": 0.00013391492407168677, "loss": 2.5841, "step": 1971 }, { "epoch": 0.147624127411899, "grad_norm": 1.2292667627334595, "learning_rate": 0.0001338561620217909, "loss": 2.1293, "step": 1972 }, { "epoch": 0.14769898751707747, "grad_norm": 1.3615174293518066, "learning_rate": 0.00013379738676497693, "loss": 2.4408, "step": 1973 }, { "epoch": 0.1477738476222559, "grad_norm": 1.3595086336135864, "learning_rate": 0.0001337385983241724, "loss": 2.3347, "step": 1974 }, { "epoch": 0.14784870772743436, "grad_norm": 1.2500534057617188, "learning_rate": 0.00013367979672231003, "loss": 2.2067, "step": 1975 }, { "epoch": 0.14792356783261282, "grad_norm": 1.289874792098999, "learning_rate": 0.00013362098198232774, "loss": 1.9411, "step": 1976 }, { "epoch": 0.14799842793779125, "grad_norm": 1.332538366317749, "learning_rate": 0.00013356215412716843, "loss": 1.9586, "step": 1977 }, { "epoch": 0.1480732880429697, "grad_norm": 1.225311279296875, "learning_rate": 0.0001335033131797803, "loss": 2.1844, "step": 1978 }, { "epoch": 0.14814814814814814, "grad_norm": 1.1602637767791748, "learning_rate": 0.0001334444591631165, "loss": 1.9125, "step": 1979 }, { "epoch": 0.1482230082533266, "grad_norm": 1.271489143371582, "learning_rate": 0.00013338559210013536, "loss": 1.9518, "step": 1980 }, { "epoch": 0.14829786835850503, "grad_norm": 1.4271628856658936, "learning_rate": 0.00013332671201380026, "loss": 1.9807, "step": 1981 }, { "epoch": 0.1483727284636835, "grad_norm": 1.3669272661209106, "learning_rate": 0.0001332678189270797, "loss": 2.0825, "step": 1982 }, { "epoch": 0.14844758856886195, "grad_norm": 1.7350914478302002, "learning_rate": 0.00013320891286294716, "loss": 2.1648, "step": 1983 }, { "epoch": 0.14852244867404038, "grad_norm": 1.4435484409332275, "learning_rate": 0.00013314999384438133, "loss": 1.6396, "step": 1984 }, { "epoch": 0.14859730877921884, "grad_norm": 1.3031457662582397, "learning_rate": 0.00013309106189436588, "loss": 2.2051, "step": 1985 }, { "epoch": 0.14867216888439727, "grad_norm": 1.3711450099945068, "learning_rate": 0.00013303211703588947, "loss": 2.5269, "step": 1986 }, { "epoch": 0.14874702898957573, "grad_norm": 1.1134223937988281, "learning_rate": 0.00013297315929194586, "loss": 2.0677, "step": 1987 }, { "epoch": 0.1488218890947542, "grad_norm": 1.2459248304367065, "learning_rate": 0.0001329141886855338, "loss": 1.7621, "step": 1988 }, { "epoch": 0.14889674919993262, "grad_norm": 1.2299944162368774, "learning_rate": 0.0001328552052396571, "loss": 1.7812, "step": 1989 }, { "epoch": 0.14897160930511108, "grad_norm": 1.2815254926681519, "learning_rate": 0.0001327962089773246, "loss": 2.0144, "step": 1990 }, { "epoch": 0.1490464694102895, "grad_norm": 1.1389731168746948, "learning_rate": 0.00013273719992155, "loss": 1.9032, "step": 1991 }, { "epoch": 0.14912132951546797, "grad_norm": 1.5045911073684692, "learning_rate": 0.00013267817809535214, "loss": 2.3307, "step": 1992 }, { "epoch": 0.14919618962064643, "grad_norm": 1.1836259365081787, "learning_rate": 0.00013261914352175484, "loss": 2.1515, "step": 1993 }, { "epoch": 0.14927104972582486, "grad_norm": 1.4776681661605835, "learning_rate": 0.00013256009622378677, "loss": 2.4677, "step": 1994 }, { "epoch": 0.14934590983100332, "grad_norm": 1.2232685089111328, "learning_rate": 0.0001325010362244816, "loss": 1.9776, "step": 1995 }, { "epoch": 0.14942076993618175, "grad_norm": 1.4819302558898926, "learning_rate": 0.00013244196354687807, "loss": 1.8472, "step": 1996 }, { "epoch": 0.1494956300413602, "grad_norm": 1.2370500564575195, "learning_rate": 0.00013238287821401977, "loss": 1.8329, "step": 1997 }, { "epoch": 0.14957049014653864, "grad_norm": 1.4853178262710571, "learning_rate": 0.0001323237802489552, "loss": 2.2361, "step": 1998 }, { "epoch": 0.1496453502517171, "grad_norm": 1.5656044483184814, "learning_rate": 0.00013226466967473788, "loss": 2.1196, "step": 1999 }, { "epoch": 0.14972021035689556, "grad_norm": 1.481261134147644, "learning_rate": 0.00013220554651442615, "loss": 2.1669, "step": 2000 }, { "epoch": 0.149795070462074, "grad_norm": 1.3820347785949707, "learning_rate": 0.0001321464107910833, "loss": 1.8722, "step": 2001 }, { "epoch": 0.14986993056725245, "grad_norm": 1.3006460666656494, "learning_rate": 0.00013208726252777764, "loss": 2.0697, "step": 2002 }, { "epoch": 0.14994479067243088, "grad_norm": 1.2537890672683716, "learning_rate": 0.00013202810174758214, "loss": 1.8998, "step": 2003 }, { "epoch": 0.15001965077760934, "grad_norm": 1.2266128063201904, "learning_rate": 0.0001319689284735748, "loss": 1.8116, "step": 2004 }, { "epoch": 0.1500945108827878, "grad_norm": 1.3889985084533691, "learning_rate": 0.0001319097427288385, "loss": 2.3159, "step": 2005 }, { "epoch": 0.15016937098796623, "grad_norm": 1.3516956567764282, "learning_rate": 0.00013185054453646088, "loss": 2.4652, "step": 2006 }, { "epoch": 0.1502442310931447, "grad_norm": 1.320270299911499, "learning_rate": 0.00013179133391953463, "loss": 2.0234, "step": 2007 }, { "epoch": 0.15031909119832312, "grad_norm": 1.3116828203201294, "learning_rate": 0.00013173211090115704, "loss": 1.8108, "step": 2008 }, { "epoch": 0.15039395130350158, "grad_norm": 1.2329258918762207, "learning_rate": 0.00013167287550443042, "loss": 2.0011, "step": 2009 }, { "epoch": 0.15046881140868004, "grad_norm": 1.1901073455810547, "learning_rate": 0.00013161362775246184, "loss": 2.0266, "step": 2010 }, { "epoch": 0.15054367151385847, "grad_norm": 1.2578259706497192, "learning_rate": 0.0001315543676683632, "loss": 2.1923, "step": 2011 }, { "epoch": 0.15061853161903693, "grad_norm": 0.9770228266716003, "learning_rate": 0.0001314950952752512, "loss": 1.1313, "step": 2012 }, { "epoch": 0.15069339172421536, "grad_norm": 1.0720173120498657, "learning_rate": 0.00013143581059624738, "loss": 1.6376, "step": 2013 }, { "epoch": 0.15076825182939382, "grad_norm": 1.3290624618530273, "learning_rate": 0.000131376513654478, "loss": 1.9455, "step": 2014 }, { "epoch": 0.15084311193457225, "grad_norm": 1.316462755203247, "learning_rate": 0.0001313172044730742, "loss": 2.2286, "step": 2015 }, { "epoch": 0.15091797203975071, "grad_norm": 1.4389677047729492, "learning_rate": 0.00013125788307517182, "loss": 2.6256, "step": 2016 }, { "epoch": 0.15099283214492917, "grad_norm": 1.2901378870010376, "learning_rate": 0.00013119854948391146, "loss": 2.4111, "step": 2017 }, { "epoch": 0.1510676922501076, "grad_norm": 1.1807867288589478, "learning_rate": 0.0001311392037224385, "loss": 1.9117, "step": 2018 }, { "epoch": 0.15114255235528606, "grad_norm": 1.4384002685546875, "learning_rate": 0.00013107984581390317, "loss": 2.6798, "step": 2019 }, { "epoch": 0.1512174124604645, "grad_norm": 1.2251219749450684, "learning_rate": 0.00013102047578146024, "loss": 2.1318, "step": 2020 }, { "epoch": 0.15129227256564295, "grad_norm": 1.2066569328308105, "learning_rate": 0.00013096109364826928, "loss": 2.1664, "step": 2021 }, { "epoch": 0.1513671326708214, "grad_norm": 1.1218180656433105, "learning_rate": 0.00013090169943749476, "loss": 1.8291, "step": 2022 }, { "epoch": 0.15144199277599985, "grad_norm": 1.3998477458953857, "learning_rate": 0.0001308422931723056, "loss": 1.9748, "step": 2023 }, { "epoch": 0.1515168528811783, "grad_norm": 1.1416441202163696, "learning_rate": 0.00013078287487587552, "loss": 2.0853, "step": 2024 }, { "epoch": 0.15159171298635674, "grad_norm": 1.2652256488800049, "learning_rate": 0.000130723444571383, "loss": 2.1947, "step": 2025 }, { "epoch": 0.1516665730915352, "grad_norm": 1.2924758195877075, "learning_rate": 0.0001306640022820111, "loss": 1.9243, "step": 2026 }, { "epoch": 0.15174143319671365, "grad_norm": 1.4501020908355713, "learning_rate": 0.00013060454803094765, "loss": 2.1391, "step": 2027 }, { "epoch": 0.15181629330189209, "grad_norm": 1.2802395820617676, "learning_rate": 0.00013054508184138515, "loss": 1.3162, "step": 2028 }, { "epoch": 0.15189115340707054, "grad_norm": 1.1681339740753174, "learning_rate": 0.00013048560373652056, "loss": 2.0536, "step": 2029 }, { "epoch": 0.15196601351224898, "grad_norm": 1.1407694816589355, "learning_rate": 0.00013042611373955578, "loss": 1.7124, "step": 2030 }, { "epoch": 0.15204087361742744, "grad_norm": 1.2087925672531128, "learning_rate": 0.00013036661187369717, "loss": 2.1013, "step": 2031 }, { "epoch": 0.15211573372260587, "grad_norm": 1.4832388162612915, "learning_rate": 0.0001303070981621557, "loss": 1.6301, "step": 2032 }, { "epoch": 0.15219059382778433, "grad_norm": 1.1902295351028442, "learning_rate": 0.00013024757262814705, "loss": 1.9942, "step": 2033 }, { "epoch": 0.15226545393296279, "grad_norm": 1.684391736984253, "learning_rate": 0.00013018803529489152, "loss": 1.9321, "step": 2034 }, { "epoch": 0.15234031403814122, "grad_norm": 1.391819953918457, "learning_rate": 0.00013012848618561396, "loss": 1.9951, "step": 2035 }, { "epoch": 0.15241517414331968, "grad_norm": 1.1785832643508911, "learning_rate": 0.00013006892532354373, "loss": 1.7402, "step": 2036 }, { "epoch": 0.1524900342484981, "grad_norm": 1.5031028985977173, "learning_rate": 0.00013000935273191501, "loss": 1.9473, "step": 2037 }, { "epoch": 0.15256489435367657, "grad_norm": 1.233727216720581, "learning_rate": 0.0001299497684339663, "loss": 1.6486, "step": 2038 }, { "epoch": 0.15263975445885503, "grad_norm": 1.2574357986450195, "learning_rate": 0.0001298901724529409, "loss": 2.4087, "step": 2039 }, { "epoch": 0.15271461456403346, "grad_norm": 1.2610751390457153, "learning_rate": 0.00012983056481208642, "loss": 2.0197, "step": 2040 }, { "epoch": 0.15278947466921192, "grad_norm": 1.286991000175476, "learning_rate": 0.00012977094553465524, "loss": 2.3446, "step": 2041 }, { "epoch": 0.15286433477439035, "grad_norm": 1.3824872970581055, "learning_rate": 0.00012971131464390413, "loss": 2.2831, "step": 2042 }, { "epoch": 0.1529391948795688, "grad_norm": 1.9495009183883667, "learning_rate": 0.00012965167216309452, "loss": 1.7053, "step": 2043 }, { "epoch": 0.15301405498474727, "grad_norm": 1.265974998474121, "learning_rate": 0.0001295920181154922, "loss": 2.2543, "step": 2044 }, { "epoch": 0.1530889150899257, "grad_norm": 1.301795482635498, "learning_rate": 0.00012953235252436763, "loss": 2.0441, "step": 2045 }, { "epoch": 0.15316377519510416, "grad_norm": 1.2262858152389526, "learning_rate": 0.00012947267541299566, "loss": 2.398, "step": 2046 }, { "epoch": 0.1532386353002826, "grad_norm": 1.2986302375793457, "learning_rate": 0.00012941298680465573, "loss": 2.1241, "step": 2047 }, { "epoch": 0.15331349540546105, "grad_norm": 1.2745842933654785, "learning_rate": 0.00012935328672263166, "loss": 2.2574, "step": 2048 }, { "epoch": 0.15338835551063948, "grad_norm": 1.4538488388061523, "learning_rate": 0.00012929357519021178, "loss": 2.1524, "step": 2049 }, { "epoch": 0.15346321561581794, "grad_norm": 1.2173607349395752, "learning_rate": 0.000129233852230689, "loss": 2.019, "step": 2050 }, { "epoch": 0.1535380757209964, "grad_norm": 1.3313161134719849, "learning_rate": 0.0001291741178673605, "loss": 2.5538, "step": 2051 }, { "epoch": 0.15361293582617483, "grad_norm": 1.1812456846237183, "learning_rate": 0.00012911437212352803, "loss": 2.2466, "step": 2052 }, { "epoch": 0.1536877959313533, "grad_norm": 1.274715781211853, "learning_rate": 0.00012905461502249777, "loss": 2.0592, "step": 2053 }, { "epoch": 0.15376265603653172, "grad_norm": 1.3467298746109009, "learning_rate": 0.00012899484658758032, "loss": 1.4666, "step": 2054 }, { "epoch": 0.15383751614171018, "grad_norm": 1.1765761375427246, "learning_rate": 0.00012893506684209064, "loss": 2.0678, "step": 2055 }, { "epoch": 0.15391237624688864, "grad_norm": 1.1790051460266113, "learning_rate": 0.0001288752758093482, "loss": 1.8961, "step": 2056 }, { "epoch": 0.15398723635206707, "grad_norm": 1.285473108291626, "learning_rate": 0.0001288154735126768, "loss": 2.075, "step": 2057 }, { "epoch": 0.15406209645724553, "grad_norm": 1.3725836277008057, "learning_rate": 0.00012875565997540469, "loss": 1.7667, "step": 2058 }, { "epoch": 0.15413695656242396, "grad_norm": 1.4254236221313477, "learning_rate": 0.00012869583522086443, "loss": 2.236, "step": 2059 }, { "epoch": 0.15421181666760242, "grad_norm": 1.4711313247680664, "learning_rate": 0.00012863599927239307, "loss": 2.1416, "step": 2060 }, { "epoch": 0.15428667677278088, "grad_norm": 1.331192135810852, "learning_rate": 0.00012857615215333188, "loss": 2.2177, "step": 2061 }, { "epoch": 0.1543615368779593, "grad_norm": 2.9434714317321777, "learning_rate": 0.00012851629388702663, "loss": 2.2168, "step": 2062 }, { "epoch": 0.15443639698313777, "grad_norm": 1.2045172452926636, "learning_rate": 0.0001284564244968273, "loss": 1.8033, "step": 2063 }, { "epoch": 0.1545112570883162, "grad_norm": 1.363053321838379, "learning_rate": 0.0001283965440060884, "loss": 2.2106, "step": 2064 }, { "epoch": 0.15458611719349466, "grad_norm": 1.4112077951431274, "learning_rate": 0.00012833665243816854, "loss": 2.1757, "step": 2065 }, { "epoch": 0.1546609772986731, "grad_norm": 1.4309308528900146, "learning_rate": 0.00012827674981643082, "loss": 2.6369, "step": 2066 }, { "epoch": 0.15473583740385155, "grad_norm": 1.2835416793823242, "learning_rate": 0.0001282168361642426, "loss": 1.8111, "step": 2067 }, { "epoch": 0.15481069750903, "grad_norm": 1.4203721284866333, "learning_rate": 0.0001281569115049755, "loss": 1.939, "step": 2068 }, { "epoch": 0.15488555761420844, "grad_norm": 1.322770595550537, "learning_rate": 0.00012809697586200556, "loss": 1.9563, "step": 2069 }, { "epoch": 0.1549604177193869, "grad_norm": 1.4887861013412476, "learning_rate": 0.0001280370292587129, "loss": 2.1241, "step": 2070 }, { "epoch": 0.15503527782456533, "grad_norm": 1.6541773080825806, "learning_rate": 0.00012797707171848213, "loss": 2.4586, "step": 2071 }, { "epoch": 0.1551101379297438, "grad_norm": 1.178582787513733, "learning_rate": 0.00012791710326470195, "loss": 2.2632, "step": 2072 }, { "epoch": 0.15518499803492225, "grad_norm": 1.1139072179794312, "learning_rate": 0.0001278571239207655, "loss": 1.9891, "step": 2073 }, { "epoch": 0.15525985814010068, "grad_norm": 1.3479487895965576, "learning_rate": 0.00012779713371006995, "loss": 2.0242, "step": 2074 }, { "epoch": 0.15533471824527914, "grad_norm": 1.3578611612319946, "learning_rate": 0.00012773713265601687, "loss": 2.1795, "step": 2075 }, { "epoch": 0.15540957835045757, "grad_norm": 1.2403103113174438, "learning_rate": 0.00012767712078201208, "loss": 2.1536, "step": 2076 }, { "epoch": 0.15548443845563603, "grad_norm": 1.4227224588394165, "learning_rate": 0.0001276170981114655, "loss": 1.8735, "step": 2077 }, { "epoch": 0.1555592985608145, "grad_norm": 1.3214504718780518, "learning_rate": 0.00012755706466779126, "loss": 2.3033, "step": 2078 }, { "epoch": 0.15563415866599292, "grad_norm": 1.20405113697052, "learning_rate": 0.00012749702047440783, "loss": 1.9312, "step": 2079 }, { "epoch": 0.15570901877117138, "grad_norm": 1.132447361946106, "learning_rate": 0.00012743696555473779, "loss": 1.9213, "step": 2080 }, { "epoch": 0.1557838788763498, "grad_norm": 1.2295747995376587, "learning_rate": 0.00012737689993220784, "loss": 1.819, "step": 2081 }, { "epoch": 0.15585873898152827, "grad_norm": 1.1661092042922974, "learning_rate": 0.000127316823630249, "loss": 1.8179, "step": 2082 }, { "epoch": 0.1559335990867067, "grad_norm": 1.2375012636184692, "learning_rate": 0.00012725673667229634, "loss": 2.0427, "step": 2083 }, { "epoch": 0.15600845919188516, "grad_norm": 1.6048835515975952, "learning_rate": 0.00012719663908178914, "loss": 2.6462, "step": 2084 }, { "epoch": 0.15608331929706362, "grad_norm": 1.2260078191757202, "learning_rate": 0.00012713653088217077, "loss": 1.8653, "step": 2085 }, { "epoch": 0.15615817940224205, "grad_norm": 1.293408989906311, "learning_rate": 0.00012707641209688881, "loss": 2.0898, "step": 2086 }, { "epoch": 0.1562330395074205, "grad_norm": 1.213639497756958, "learning_rate": 0.00012701628274939495, "loss": 2.1596, "step": 2087 }, { "epoch": 0.15630789961259894, "grad_norm": 1.3240605592727661, "learning_rate": 0.000126956142863145, "loss": 2.1147, "step": 2088 }, { "epoch": 0.1563827597177774, "grad_norm": 1.3131322860717773, "learning_rate": 0.00012689599246159882, "loss": 1.7928, "step": 2089 }, { "epoch": 0.15645761982295586, "grad_norm": 1.1201212406158447, "learning_rate": 0.00012683583156822045, "loss": 1.7717, "step": 2090 }, { "epoch": 0.1565324799281343, "grad_norm": 1.1691396236419678, "learning_rate": 0.00012677566020647803, "loss": 1.8331, "step": 2091 }, { "epoch": 0.15660734003331275, "grad_norm": 1.2524068355560303, "learning_rate": 0.00012671547839984373, "loss": 2.4488, "step": 2092 }, { "epoch": 0.15668220013849118, "grad_norm": 1.3057796955108643, "learning_rate": 0.00012665528617179377, "loss": 1.6813, "step": 2093 }, { "epoch": 0.15675706024366964, "grad_norm": 1.2335078716278076, "learning_rate": 0.00012659508354580853, "loss": 2.3507, "step": 2094 }, { "epoch": 0.1568319203488481, "grad_norm": 1.5597755908966064, "learning_rate": 0.00012653487054537237, "loss": 2.1952, "step": 2095 }, { "epoch": 0.15690678045402653, "grad_norm": 1.5417437553405762, "learning_rate": 0.00012647464719397372, "loss": 2.4635, "step": 2096 }, { "epoch": 0.156981640559205, "grad_norm": 1.1794483661651611, "learning_rate": 0.0001264144135151051, "loss": 2.1228, "step": 2097 }, { "epoch": 0.15705650066438342, "grad_norm": 1.5024974346160889, "learning_rate": 0.00012635416953226295, "loss": 2.0346, "step": 2098 }, { "epoch": 0.15713136076956188, "grad_norm": 1.3209773302078247, "learning_rate": 0.00012629391526894777, "loss": 1.9083, "step": 2099 }, { "epoch": 0.15720622087474032, "grad_norm": 1.3858015537261963, "learning_rate": 0.00012623365074866417, "loss": 1.5782, "step": 2100 }, { "epoch": 0.15720622087474032, "eval_loss": 2.0774238109588623, "eval_runtime": 178.9002, "eval_samples_per_second": 27.949, "eval_steps_per_second": 13.974, "step": 2100 }, { "epoch": 0.15728108097991877, "grad_norm": 1.4441430568695068, "learning_rate": 0.0001261733759949206, "loss": 1.8452, "step": 2101 }, { "epoch": 0.15735594108509723, "grad_norm": 1.342297077178955, "learning_rate": 0.00012611309103122964, "loss": 2.0179, "step": 2102 }, { "epoch": 0.15743080119027567, "grad_norm": 1.4287586212158203, "learning_rate": 0.00012605279588110777, "loss": 2.2601, "step": 2103 }, { "epoch": 0.15750566129545412, "grad_norm": 1.2497203350067139, "learning_rate": 0.00012599249056807548, "loss": 1.8366, "step": 2104 }, { "epoch": 0.15758052140063256, "grad_norm": 1.2745568752288818, "learning_rate": 0.00012593217511565715, "loss": 2.2909, "step": 2105 }, { "epoch": 0.15765538150581102, "grad_norm": 1.1915743350982666, "learning_rate": 0.00012587184954738128, "loss": 1.6825, "step": 2106 }, { "epoch": 0.15773024161098947, "grad_norm": 1.245133638381958, "learning_rate": 0.00012581151388678014, "loss": 1.7224, "step": 2107 }, { "epoch": 0.1578051017161679, "grad_norm": 1.0321041345596313, "learning_rate": 0.00012575116815739004, "loss": 1.5784, "step": 2108 }, { "epoch": 0.15787996182134637, "grad_norm": 1.430556297302246, "learning_rate": 0.0001256908123827512, "loss": 1.8729, "step": 2109 }, { "epoch": 0.1579548219265248, "grad_norm": 1.3095771074295044, "learning_rate": 0.0001256304465864077, "loss": 2.4245, "step": 2110 }, { "epoch": 0.15802968203170326, "grad_norm": 1.287671685218811, "learning_rate": 0.00012557007079190763, "loss": 2.182, "step": 2111 }, { "epoch": 0.15810454213688171, "grad_norm": 1.071442723274231, "learning_rate": 0.00012550968502280288, "loss": 2.0233, "step": 2112 }, { "epoch": 0.15817940224206015, "grad_norm": 1.0569149255752563, "learning_rate": 0.0001254492893026493, "loss": 2.0798, "step": 2113 }, { "epoch": 0.1582542623472386, "grad_norm": 1.299725890159607, "learning_rate": 0.00012538888365500658, "loss": 2.1688, "step": 2114 }, { "epoch": 0.15832912245241704, "grad_norm": 1.2122199535369873, "learning_rate": 0.00012532846810343838, "loss": 1.8697, "step": 2115 }, { "epoch": 0.1584039825575955, "grad_norm": 1.2000454664230347, "learning_rate": 0.00012526804267151202, "loss": 1.991, "step": 2116 }, { "epoch": 0.15847884266277393, "grad_norm": 1.43906569480896, "learning_rate": 0.00012520760738279888, "loss": 1.878, "step": 2117 }, { "epoch": 0.1585537027679524, "grad_norm": 1.3021878004074097, "learning_rate": 0.0001251471622608741, "loss": 2.0596, "step": 2118 }, { "epoch": 0.15862856287313085, "grad_norm": 1.1307260990142822, "learning_rate": 0.00012508670732931665, "loss": 1.5749, "step": 2119 }, { "epoch": 0.15870342297830928, "grad_norm": 1.2409216165542603, "learning_rate": 0.00012502624261170935, "loss": 2.1104, "step": 2120 }, { "epoch": 0.15877828308348774, "grad_norm": 1.240818977355957, "learning_rate": 0.00012496576813163877, "loss": 1.931, "step": 2121 }, { "epoch": 0.15885314318866617, "grad_norm": 1.0667266845703125, "learning_rate": 0.0001249052839126954, "loss": 1.2428, "step": 2122 }, { "epoch": 0.15892800329384463, "grad_norm": 1.3176846504211426, "learning_rate": 0.0001248447899784735, "loss": 2.0335, "step": 2123 }, { "epoch": 0.1590028633990231, "grad_norm": 1.3453718423843384, "learning_rate": 0.00012478428635257096, "loss": 1.6715, "step": 2124 }, { "epoch": 0.15907772350420152, "grad_norm": 1.4226300716400146, "learning_rate": 0.00012472377305858972, "loss": 2.2758, "step": 2125 }, { "epoch": 0.15915258360937998, "grad_norm": 1.1700676679611206, "learning_rate": 0.0001246632501201353, "loss": 2.092, "step": 2126 }, { "epoch": 0.1592274437145584, "grad_norm": 1.3149932622909546, "learning_rate": 0.00012460271756081702, "loss": 2.1257, "step": 2127 }, { "epoch": 0.15930230381973687, "grad_norm": 1.1610989570617676, "learning_rate": 0.00012454217540424799, "loss": 2.133, "step": 2128 }, { "epoch": 0.15937716392491533, "grad_norm": 1.3001011610031128, "learning_rate": 0.00012448162367404507, "loss": 2.1821, "step": 2129 }, { "epoch": 0.15945202403009376, "grad_norm": 1.1417276859283447, "learning_rate": 0.00012442106239382873, "loss": 2.3316, "step": 2130 }, { "epoch": 0.15952688413527222, "grad_norm": 1.3337146043777466, "learning_rate": 0.00012436049158722336, "loss": 1.9296, "step": 2131 }, { "epoch": 0.15960174424045065, "grad_norm": 1.2813256978988647, "learning_rate": 0.00012429991127785694, "loss": 2.3999, "step": 2132 }, { "epoch": 0.1596766043456291, "grad_norm": 1.2754542827606201, "learning_rate": 0.0001242393214893612, "loss": 2.4524, "step": 2133 }, { "epoch": 0.15975146445080754, "grad_norm": 1.3703383207321167, "learning_rate": 0.0001241787222453715, "loss": 2.4074, "step": 2134 }, { "epoch": 0.159826324555986, "grad_norm": 1.4275147914886475, "learning_rate": 0.00012411811356952699, "loss": 2.3629, "step": 2135 }, { "epoch": 0.15990118466116446, "grad_norm": 1.0684361457824707, "learning_rate": 0.0001240574954854704, "loss": 2.13, "step": 2136 }, { "epoch": 0.1599760447663429, "grad_norm": 1.177189588546753, "learning_rate": 0.00012399686801684826, "loss": 1.8318, "step": 2137 }, { "epoch": 0.16005090487152135, "grad_norm": 1.2999993562698364, "learning_rate": 0.00012393623118731067, "loss": 2.0698, "step": 2138 }, { "epoch": 0.16012576497669978, "grad_norm": 1.1567637920379639, "learning_rate": 0.0001238755850205113, "loss": 2.034, "step": 2139 }, { "epoch": 0.16020062508187824, "grad_norm": 1.1592249870300293, "learning_rate": 0.00012381492954010765, "loss": 1.815, "step": 2140 }, { "epoch": 0.1602754851870567, "grad_norm": 1.3324074745178223, "learning_rate": 0.00012375426476976073, "loss": 2.1355, "step": 2141 }, { "epoch": 0.16035034529223513, "grad_norm": 1.2764090299606323, "learning_rate": 0.00012369359073313515, "loss": 2.2807, "step": 2142 }, { "epoch": 0.1604252053974136, "grad_norm": 1.2990005016326904, "learning_rate": 0.0001236329074538993, "loss": 1.9849, "step": 2143 }, { "epoch": 0.16050006550259202, "grad_norm": 1.1976213455200195, "learning_rate": 0.00012357221495572495, "loss": 2.1239, "step": 2144 }, { "epoch": 0.16057492560777048, "grad_norm": 1.16190505027771, "learning_rate": 0.00012351151326228768, "loss": 1.7332, "step": 2145 }, { "epoch": 0.16064978571294894, "grad_norm": 1.2652626037597656, "learning_rate": 0.00012345080239726646, "loss": 1.894, "step": 2146 }, { "epoch": 0.16072464581812737, "grad_norm": 1.5426594018936157, "learning_rate": 0.000123390082384344, "loss": 1.6456, "step": 2147 }, { "epoch": 0.16079950592330583, "grad_norm": 1.387577772140503, "learning_rate": 0.00012332935324720648, "loss": 2.0737, "step": 2148 }, { "epoch": 0.16087436602848426, "grad_norm": 1.2376543283462524, "learning_rate": 0.00012326861500954373, "loss": 2.1613, "step": 2149 }, { "epoch": 0.16094922613366272, "grad_norm": 1.3883267641067505, "learning_rate": 0.00012320786769504894, "loss": 1.9625, "step": 2150 }, { "epoch": 0.16102408623884115, "grad_norm": 1.407016396522522, "learning_rate": 0.0001231471113274191, "loss": 2.1152, "step": 2151 }, { "epoch": 0.1610989463440196, "grad_norm": 1.1066489219665527, "learning_rate": 0.00012308634593035457, "loss": 1.8443, "step": 2152 }, { "epoch": 0.16117380644919807, "grad_norm": 1.2245831489562988, "learning_rate": 0.00012302557152755924, "loss": 1.989, "step": 2153 }, { "epoch": 0.1612486665543765, "grad_norm": 1.3112536668777466, "learning_rate": 0.00012296478814274056, "loss": 2.1505, "step": 2154 }, { "epoch": 0.16132352665955496, "grad_norm": 1.294480800628662, "learning_rate": 0.00012290399579960946, "loss": 2.451, "step": 2155 }, { "epoch": 0.1613983867647334, "grad_norm": 1.3611459732055664, "learning_rate": 0.00012284319452188035, "loss": 1.8247, "step": 2156 }, { "epoch": 0.16147324686991185, "grad_norm": 1.2655017375946045, "learning_rate": 0.00012278238433327117, "loss": 2.216, "step": 2157 }, { "epoch": 0.1615481069750903, "grad_norm": 0.9941394329071045, "learning_rate": 0.0001227215652575033, "loss": 1.6497, "step": 2158 }, { "epoch": 0.16162296708026874, "grad_norm": 1.3416845798492432, "learning_rate": 0.00012266073731830162, "loss": 2.3406, "step": 2159 }, { "epoch": 0.1616978271854472, "grad_norm": 1.135642409324646, "learning_rate": 0.0001225999005393944, "loss": 1.5893, "step": 2160 }, { "epoch": 0.16177268729062563, "grad_norm": 1.300038456916809, "learning_rate": 0.0001225390549445134, "loss": 2.1936, "step": 2161 }, { "epoch": 0.1618475473958041, "grad_norm": 1.3603010177612305, "learning_rate": 0.0001224782005573938, "loss": 2.3293, "step": 2162 }, { "epoch": 0.16192240750098255, "grad_norm": 1.384914517402649, "learning_rate": 0.00012241733740177432, "loss": 2.5704, "step": 2163 }, { "epoch": 0.16199726760616098, "grad_norm": 1.3033777475357056, "learning_rate": 0.00012235646550139696, "loss": 2.6262, "step": 2164 }, { "epoch": 0.16207212771133944, "grad_norm": 1.3913774490356445, "learning_rate": 0.00012229558488000716, "loss": 2.2701, "step": 2165 }, { "epoch": 0.16214698781651787, "grad_norm": 1.1821950674057007, "learning_rate": 0.0001222346955613538, "loss": 2.354, "step": 2166 }, { "epoch": 0.16222184792169633, "grad_norm": 1.4781286716461182, "learning_rate": 0.00012217379756918916, "loss": 2.6775, "step": 2167 }, { "epoch": 0.16229670802687476, "grad_norm": 1.295506477355957, "learning_rate": 0.00012211289092726885, "loss": 2.0811, "step": 2168 }, { "epoch": 0.16237156813205322, "grad_norm": 1.2173187732696533, "learning_rate": 0.00012205197565935191, "loss": 2.1411, "step": 2169 }, { "epoch": 0.16244642823723168, "grad_norm": 1.2789169549942017, "learning_rate": 0.0001219910517892007, "loss": 2.151, "step": 2170 }, { "epoch": 0.16252128834241011, "grad_norm": 1.2260403633117676, "learning_rate": 0.00012193011934058094, "loss": 2.0408, "step": 2171 }, { "epoch": 0.16259614844758857, "grad_norm": 1.1075538396835327, "learning_rate": 0.00012186917833726174, "loss": 2.1616, "step": 2172 }, { "epoch": 0.162671008552767, "grad_norm": 1.2965052127838135, "learning_rate": 0.00012180822880301553, "loss": 2.0776, "step": 2173 }, { "epoch": 0.16274586865794546, "grad_norm": 1.3882721662521362, "learning_rate": 0.00012174727076161799, "loss": 2.0012, "step": 2174 }, { "epoch": 0.16282072876312392, "grad_norm": 2.938873052597046, "learning_rate": 0.00012168630423684825, "loss": 2.239, "step": 2175 }, { "epoch": 0.16289558886830235, "grad_norm": 1.2418581247329712, "learning_rate": 0.00012162532925248866, "loss": 2.0828, "step": 2176 }, { "epoch": 0.1629704489734808, "grad_norm": 1.5656582117080688, "learning_rate": 0.00012156434583232487, "loss": 2.0428, "step": 2177 }, { "epoch": 0.16304530907865925, "grad_norm": 1.4660829305648804, "learning_rate": 0.00012150335400014595, "loss": 2.1411, "step": 2178 }, { "epoch": 0.1631201691838377, "grad_norm": 1.2174826860427856, "learning_rate": 0.000121442353779744, "loss": 2.0739, "step": 2179 }, { "epoch": 0.16319502928901616, "grad_norm": 1.2597781419754028, "learning_rate": 0.0001213813451949146, "loss": 2.1041, "step": 2180 }, { "epoch": 0.1632698893941946, "grad_norm": 1.3076114654541016, "learning_rate": 0.00012132032826945656, "loss": 1.9478, "step": 2181 }, { "epoch": 0.16334474949937305, "grad_norm": 1.2301552295684814, "learning_rate": 0.0001212593030271719, "loss": 2.2315, "step": 2182 }, { "epoch": 0.16341960960455149, "grad_norm": 1.1922115087509155, "learning_rate": 0.00012119826949186586, "loss": 1.9342, "step": 2183 }, { "epoch": 0.16349446970972994, "grad_norm": 1.1283454895019531, "learning_rate": 0.000121137227687347, "loss": 2.0992, "step": 2184 }, { "epoch": 0.16356932981490838, "grad_norm": 1.2100927829742432, "learning_rate": 0.00012107617763742703, "loss": 2.3504, "step": 2185 }, { "epoch": 0.16364418992008684, "grad_norm": 1.6526916027069092, "learning_rate": 0.00012101511936592094, "loss": 2.3714, "step": 2186 }, { "epoch": 0.1637190500252653, "grad_norm": 1.2855767011642456, "learning_rate": 0.00012095405289664687, "loss": 1.9793, "step": 2187 }, { "epoch": 0.16379391013044373, "grad_norm": 1.4450534582138062, "learning_rate": 0.00012089297825342616, "loss": 2.0124, "step": 2188 }, { "epoch": 0.16386877023562219, "grad_norm": 1.1805438995361328, "learning_rate": 0.0001208318954600834, "loss": 1.9999, "step": 2189 }, { "epoch": 0.16394363034080062, "grad_norm": 1.1808512210845947, "learning_rate": 0.00012077080454044632, "loss": 1.893, "step": 2190 }, { "epoch": 0.16401849044597908, "grad_norm": 1.2486885786056519, "learning_rate": 0.00012070970551834576, "loss": 2.2751, "step": 2191 }, { "epoch": 0.16409335055115754, "grad_norm": 1.5185185670852661, "learning_rate": 0.00012064859841761585, "loss": 2.3002, "step": 2192 }, { "epoch": 0.16416821065633597, "grad_norm": 1.2780312299728394, "learning_rate": 0.00012058748326209373, "loss": 2.3032, "step": 2193 }, { "epoch": 0.16424307076151443, "grad_norm": 1.3084888458251953, "learning_rate": 0.00012052636007561984, "loss": 1.8472, "step": 2194 }, { "epoch": 0.16431793086669286, "grad_norm": 1.4186512231826782, "learning_rate": 0.0001204652288820376, "loss": 2.264, "step": 2195 }, { "epoch": 0.16439279097187132, "grad_norm": 1.480563998222351, "learning_rate": 0.00012040408970519364, "loss": 2.3575, "step": 2196 }, { "epoch": 0.16446765107704978, "grad_norm": 1.37344491481781, "learning_rate": 0.00012034294256893765, "loss": 2.0426, "step": 2197 }, { "epoch": 0.1645425111822282, "grad_norm": 1.4977604150772095, "learning_rate": 0.00012028178749712256, "loss": 2.0998, "step": 2198 }, { "epoch": 0.16461737128740667, "grad_norm": 1.3283594846725464, "learning_rate": 0.00012022062451360416, "loss": 1.948, "step": 2199 }, { "epoch": 0.1646922313925851, "grad_norm": 1.3358502388000488, "learning_rate": 0.00012015945364224155, "loss": 1.96, "step": 2200 }, { "epoch": 0.16476709149776356, "grad_norm": 1.6591647863388062, "learning_rate": 0.00012009827490689681, "loss": 1.9425, "step": 2201 }, { "epoch": 0.164841951602942, "grad_norm": 1.1178611516952515, "learning_rate": 0.00012003708833143505, "loss": 2.2675, "step": 2202 }, { "epoch": 0.16491681170812045, "grad_norm": 1.2166961431503296, "learning_rate": 0.00011997589393972452, "loss": 2.0675, "step": 2203 }, { "epoch": 0.1649916718132989, "grad_norm": 1.2354830503463745, "learning_rate": 0.0001199146917556365, "loss": 2.2526, "step": 2204 }, { "epoch": 0.16506653191847734, "grad_norm": 1.1480355262756348, "learning_rate": 0.0001198534818030452, "loss": 2.1838, "step": 2205 }, { "epoch": 0.1651413920236558, "grad_norm": 1.3325893878936768, "learning_rate": 0.00011979226410582809, "loss": 2.394, "step": 2206 }, { "epoch": 0.16521625212883423, "grad_norm": 1.205649971961975, "learning_rate": 0.00011973103868786542, "loss": 1.8542, "step": 2207 }, { "epoch": 0.1652911122340127, "grad_norm": 1.3049358129501343, "learning_rate": 0.00011966980557304058, "loss": 2.1783, "step": 2208 }, { "epoch": 0.16536597233919115, "grad_norm": 1.1471256017684937, "learning_rate": 0.00011960856478523994, "loss": 2.0381, "step": 2209 }, { "epoch": 0.16544083244436958, "grad_norm": 1.3880606889724731, "learning_rate": 0.00011954731634835288, "loss": 1.8015, "step": 2210 }, { "epoch": 0.16551569254954804, "grad_norm": 1.1626561880111694, "learning_rate": 0.00011948606028627169, "loss": 2.0689, "step": 2211 }, { "epoch": 0.16559055265472647, "grad_norm": 1.158015489578247, "learning_rate": 0.00011942479662289174, "loss": 2.3575, "step": 2212 }, { "epoch": 0.16566541275990493, "grad_norm": 1.2480782270431519, "learning_rate": 0.00011936352538211133, "loss": 2.0772, "step": 2213 }, { "epoch": 0.1657402728650834, "grad_norm": 1.4321715831756592, "learning_rate": 0.00011930224658783167, "loss": 2.3784, "step": 2214 }, { "epoch": 0.16581513297026182, "grad_norm": 1.4906904697418213, "learning_rate": 0.00011924096026395692, "loss": 1.7091, "step": 2215 }, { "epoch": 0.16588999307544028, "grad_norm": 1.1760426759719849, "learning_rate": 0.00011917966643439426, "loss": 1.8356, "step": 2216 }, { "epoch": 0.1659648531806187, "grad_norm": 1.3176946640014648, "learning_rate": 0.00011911836512305369, "loss": 2.6755, "step": 2217 }, { "epoch": 0.16603971328579717, "grad_norm": 1.2035272121429443, "learning_rate": 0.00011905705635384823, "loss": 1.5937, "step": 2218 }, { "epoch": 0.1661145733909756, "grad_norm": 1.1849749088287354, "learning_rate": 0.00011899574015069374, "loss": 1.7905, "step": 2219 }, { "epoch": 0.16618943349615406, "grad_norm": 1.058397650718689, "learning_rate": 0.00011893441653750896, "loss": 1.6516, "step": 2220 }, { "epoch": 0.16626429360133252, "grad_norm": 1.233577847480774, "learning_rate": 0.00011887308553821563, "loss": 2.1001, "step": 2221 }, { "epoch": 0.16633915370651095, "grad_norm": 1.1346230506896973, "learning_rate": 0.00011881174717673826, "loss": 1.8549, "step": 2222 }, { "epoch": 0.1664140138116894, "grad_norm": 1.1156986951828003, "learning_rate": 0.00011875040147700424, "loss": 2.0716, "step": 2223 }, { "epoch": 0.16648887391686784, "grad_norm": 1.1600013971328735, "learning_rate": 0.00011868904846294393, "loss": 1.6129, "step": 2224 }, { "epoch": 0.1665637340220463, "grad_norm": 1.2858619689941406, "learning_rate": 0.00011862768815849041, "loss": 2.1347, "step": 2225 }, { "epoch": 0.16663859412722476, "grad_norm": 1.6625896692276, "learning_rate": 0.00011856632058757973, "loss": 1.9453, "step": 2226 }, { "epoch": 0.1667134542324032, "grad_norm": 1.2215588092803955, "learning_rate": 0.00011850494577415064, "loss": 2.0852, "step": 2227 }, { "epoch": 0.16678831433758165, "grad_norm": 1.333836317062378, "learning_rate": 0.00011844356374214478, "loss": 2.0767, "step": 2228 }, { "epoch": 0.16686317444276008, "grad_norm": 1.2141852378845215, "learning_rate": 0.00011838217451550667, "loss": 2.1721, "step": 2229 }, { "epoch": 0.16693803454793854, "grad_norm": 1.229097843170166, "learning_rate": 0.00011832077811818357, "loss": 2.5217, "step": 2230 }, { "epoch": 0.167012894653117, "grad_norm": 1.40141761302948, "learning_rate": 0.00011825937457412544, "loss": 1.94, "step": 2231 }, { "epoch": 0.16708775475829543, "grad_norm": 1.2333474159240723, "learning_rate": 0.00011819796390728521, "loss": 2.2765, "step": 2232 }, { "epoch": 0.1671626148634739, "grad_norm": 1.4086828231811523, "learning_rate": 0.00011813654614161855, "loss": 2.1309, "step": 2233 }, { "epoch": 0.16723747496865232, "grad_norm": 1.2487986087799072, "learning_rate": 0.00011807512130108378, "loss": 1.8759, "step": 2234 }, { "epoch": 0.16731233507383078, "grad_norm": 1.1670846939086914, "learning_rate": 0.00011801368940964211, "loss": 2.2858, "step": 2235 }, { "epoch": 0.1673871951790092, "grad_norm": 1.4624829292297363, "learning_rate": 0.00011795225049125743, "loss": 2.2837, "step": 2236 }, { "epoch": 0.16746205528418767, "grad_norm": 1.2919384241104126, "learning_rate": 0.00011789080456989634, "loss": 2.2436, "step": 2237 }, { "epoch": 0.16753691538936613, "grad_norm": 1.3696972131729126, "learning_rate": 0.00011782935166952831, "loss": 2.5276, "step": 2238 }, { "epoch": 0.16761177549454456, "grad_norm": 1.4682286977767944, "learning_rate": 0.0001177678918141254, "loss": 2.3554, "step": 2239 }, { "epoch": 0.16768663559972302, "grad_norm": 1.1757866144180298, "learning_rate": 0.0001177064250276624, "loss": 1.8244, "step": 2240 }, { "epoch": 0.16776149570490145, "grad_norm": 1.1996434926986694, "learning_rate": 0.00011764495133411688, "loss": 1.9795, "step": 2241 }, { "epoch": 0.1678363558100799, "grad_norm": 1.3483864068984985, "learning_rate": 0.000117583470757469, "loss": 1.8462, "step": 2242 }, { "epoch": 0.16791121591525837, "grad_norm": 1.299032211303711, "learning_rate": 0.00011752198332170172, "loss": 2.2246, "step": 2243 }, { "epoch": 0.1679860760204368, "grad_norm": 1.30570387840271, "learning_rate": 0.0001174604890508006, "loss": 2.0642, "step": 2244 }, { "epoch": 0.16806093612561526, "grad_norm": 1.3535066843032837, "learning_rate": 0.00011739898796875383, "loss": 2.5198, "step": 2245 }, { "epoch": 0.1681357962307937, "grad_norm": 1.129560112953186, "learning_rate": 0.00011733748009955236, "loss": 1.7357, "step": 2246 }, { "epoch": 0.16821065633597215, "grad_norm": 1.2655411958694458, "learning_rate": 0.00011727596546718977, "loss": 1.855, "step": 2247 }, { "epoch": 0.1682855164411506, "grad_norm": 1.034576416015625, "learning_rate": 0.00011721444409566216, "loss": 2.054, "step": 2248 }, { "epoch": 0.16836037654632904, "grad_norm": 1.2866272926330566, "learning_rate": 0.00011715291600896836, "loss": 1.979, "step": 2249 }, { "epoch": 0.1684352366515075, "grad_norm": 1.2498602867126465, "learning_rate": 0.00011709138123110988, "loss": 2.451, "step": 2250 }, { "epoch": 0.1684352366515075, "eval_loss": 2.067962169647217, "eval_runtime": 178.8459, "eval_samples_per_second": 27.957, "eval_steps_per_second": 13.979, "step": 2250 }, { "epoch": 0.16851009675668593, "grad_norm": 1.5259897708892822, "learning_rate": 0.00011702983978609068, "loss": 2.643, "step": 2251 }, { "epoch": 0.1685849568618644, "grad_norm": 0.9506269693374634, "learning_rate": 0.00011696829169791743, "loss": 1.6317, "step": 2252 }, { "epoch": 0.16865981696704282, "grad_norm": 1.390620470046997, "learning_rate": 0.00011690673699059938, "loss": 2.3553, "step": 2253 }, { "epoch": 0.16873467707222128, "grad_norm": 1.6457849740982056, "learning_rate": 0.0001168451756881483, "loss": 2.1309, "step": 2254 }, { "epoch": 0.16880953717739974, "grad_norm": 1.25615394115448, "learning_rate": 0.00011678360781457867, "loss": 2.0193, "step": 2255 }, { "epoch": 0.16888439728257817, "grad_norm": 1.2213398218154907, "learning_rate": 0.00011672203339390735, "loss": 2.0923, "step": 2256 }, { "epoch": 0.16895925738775663, "grad_norm": 1.2302803993225098, "learning_rate": 0.00011666045245015389, "loss": 1.8675, "step": 2257 }, { "epoch": 0.16903411749293507, "grad_norm": 1.264060616493225, "learning_rate": 0.00011659886500734036, "loss": 1.9748, "step": 2258 }, { "epoch": 0.16910897759811352, "grad_norm": 1.2429448366165161, "learning_rate": 0.00011653727108949131, "loss": 2.2185, "step": 2259 }, { "epoch": 0.16918383770329198, "grad_norm": 1.1332855224609375, "learning_rate": 0.00011647567072063386, "loss": 1.6441, "step": 2260 }, { "epoch": 0.16925869780847042, "grad_norm": 1.3919029235839844, "learning_rate": 0.00011641406392479764, "loss": 2.3172, "step": 2261 }, { "epoch": 0.16933355791364887, "grad_norm": 1.3474171161651611, "learning_rate": 0.00011635245072601482, "loss": 2.007, "step": 2262 }, { "epoch": 0.1694084180188273, "grad_norm": 1.2584680318832397, "learning_rate": 0.00011629083114832, "loss": 1.6188, "step": 2263 }, { "epoch": 0.16948327812400577, "grad_norm": 1.3254450559616089, "learning_rate": 0.00011622920521575029, "loss": 1.9038, "step": 2264 }, { "epoch": 0.16955813822918422, "grad_norm": 1.535305142402649, "learning_rate": 0.00011616757295234535, "loss": 1.8712, "step": 2265 }, { "epoch": 0.16963299833436266, "grad_norm": 1.5504261255264282, "learning_rate": 0.00011610593438214719, "loss": 2.1383, "step": 2266 }, { "epoch": 0.16970785843954111, "grad_norm": 1.4000147581100464, "learning_rate": 0.00011604428952920037, "loss": 2.3232, "step": 2267 }, { "epoch": 0.16978271854471955, "grad_norm": 1.4624311923980713, "learning_rate": 0.00011598263841755188, "loss": 2.277, "step": 2268 }, { "epoch": 0.169857578649898, "grad_norm": 1.344508409500122, "learning_rate": 0.00011592098107125111, "loss": 2.0785, "step": 2269 }, { "epoch": 0.16993243875507644, "grad_norm": 1.264648675918579, "learning_rate": 0.00011585931751434995, "loss": 1.6995, "step": 2270 }, { "epoch": 0.1700072988602549, "grad_norm": 1.2020275592803955, "learning_rate": 0.00011579764777090268, "loss": 2.1118, "step": 2271 }, { "epoch": 0.17008215896543336, "grad_norm": 1.3616939783096313, "learning_rate": 0.00011573597186496595, "loss": 2.3416, "step": 2272 }, { "epoch": 0.1701570190706118, "grad_norm": 1.3635051250457764, "learning_rate": 0.0001156742898205989, "loss": 2.0229, "step": 2273 }, { "epoch": 0.17023187917579025, "grad_norm": 1.1165735721588135, "learning_rate": 0.00011561260166186297, "loss": 2.1121, "step": 2274 }, { "epoch": 0.17030673928096868, "grad_norm": 1.1176053285598755, "learning_rate": 0.0001155509074128221, "loss": 1.5006, "step": 2275 }, { "epoch": 0.17038159938614714, "grad_norm": 1.159234642982483, "learning_rate": 0.00011548920709754252, "loss": 2.4628, "step": 2276 }, { "epoch": 0.1704564594913256, "grad_norm": 1.2299526929855347, "learning_rate": 0.00011542750074009279, "loss": 1.9594, "step": 2277 }, { "epoch": 0.17053131959650403, "grad_norm": 1.2496463060379028, "learning_rate": 0.00011536578836454395, "loss": 2.2802, "step": 2278 }, { "epoch": 0.1706061797016825, "grad_norm": 1.4684141874313354, "learning_rate": 0.00011530406999496934, "loss": 1.953, "step": 2279 }, { "epoch": 0.17068103980686092, "grad_norm": 1.2728630304336548, "learning_rate": 0.00011524234565544454, "loss": 2.4483, "step": 2280 }, { "epoch": 0.17075589991203938, "grad_norm": 1.543243408203125, "learning_rate": 0.00011518061537004758, "loss": 2.4278, "step": 2281 }, { "epoch": 0.17083076001721784, "grad_norm": 1.2439000606536865, "learning_rate": 0.0001151188791628588, "loss": 1.9016, "step": 2282 }, { "epoch": 0.17090562012239627, "grad_norm": 2.713040351867676, "learning_rate": 0.0001150571370579608, "loss": 2.6099, "step": 2283 }, { "epoch": 0.17098048022757473, "grad_norm": 1.4703322649002075, "learning_rate": 0.00011499538907943843, "loss": 2.3038, "step": 2284 }, { "epoch": 0.17105534033275316, "grad_norm": 1.2370264530181885, "learning_rate": 0.00011493363525137901, "loss": 2.3918, "step": 2285 }, { "epoch": 0.17113020043793162, "grad_norm": 1.3753705024719238, "learning_rate": 0.00011487187559787196, "loss": 2.2009, "step": 2286 }, { "epoch": 0.17120506054311008, "grad_norm": 1.1948810815811157, "learning_rate": 0.00011481011014300906, "loss": 2.0148, "step": 2287 }, { "epoch": 0.1712799206482885, "grad_norm": 1.2074726819992065, "learning_rate": 0.00011474833891088436, "loss": 2.283, "step": 2288 }, { "epoch": 0.17135478075346697, "grad_norm": 1.7041045427322388, "learning_rate": 0.00011468656192559413, "loss": 2.4143, "step": 2289 }, { "epoch": 0.1714296408586454, "grad_norm": 1.2438205480575562, "learning_rate": 0.00011462477921123688, "loss": 1.7727, "step": 2290 }, { "epoch": 0.17150450096382386, "grad_norm": 1.2229326963424683, "learning_rate": 0.00011456299079191339, "loss": 2.2182, "step": 2291 }, { "epoch": 0.1715793610690023, "grad_norm": 1.1533833742141724, "learning_rate": 0.0001145011966917266, "loss": 2.1337, "step": 2292 }, { "epoch": 0.17165422117418075, "grad_norm": 1.364248275756836, "learning_rate": 0.0001144393969347818, "loss": 2.5262, "step": 2293 }, { "epoch": 0.1717290812793592, "grad_norm": 1.1518845558166504, "learning_rate": 0.00011437759154518631, "loss": 2.0633, "step": 2294 }, { "epoch": 0.17180394138453764, "grad_norm": 1.2758569717407227, "learning_rate": 0.00011431578054704977, "loss": 2.1037, "step": 2295 }, { "epoch": 0.1718788014897161, "grad_norm": 1.1912164688110352, "learning_rate": 0.00011425396396448402, "loss": 1.6308, "step": 2296 }, { "epoch": 0.17195366159489453, "grad_norm": 1.0728579759597778, "learning_rate": 0.00011419214182160294, "loss": 1.5112, "step": 2297 }, { "epoch": 0.172028521700073, "grad_norm": 1.2010711431503296, "learning_rate": 0.00011413031414252274, "loss": 1.9694, "step": 2298 }, { "epoch": 0.17210338180525145, "grad_norm": 1.4362419843673706, "learning_rate": 0.00011406848095136171, "loss": 2.3859, "step": 2299 }, { "epoch": 0.17217824191042988, "grad_norm": 1.3255558013916016, "learning_rate": 0.0001140066422722403, "loss": 1.6367, "step": 2300 }, { "epoch": 0.17225310201560834, "grad_norm": 1.2761473655700684, "learning_rate": 0.00011394479812928105, "loss": 2.229, "step": 2301 }, { "epoch": 0.17232796212078677, "grad_norm": 1.1839773654937744, "learning_rate": 0.00011388294854660875, "loss": 2.0761, "step": 2302 }, { "epoch": 0.17240282222596523, "grad_norm": 1.4345037937164307, "learning_rate": 0.00011382109354835023, "loss": 2.199, "step": 2303 }, { "epoch": 0.1724776823311437, "grad_norm": 1.3220645189285278, "learning_rate": 0.00011375923315863441, "loss": 2.008, "step": 2304 }, { "epoch": 0.17255254243632212, "grad_norm": 1.225868582725525, "learning_rate": 0.00011369736740159243, "loss": 2.7527, "step": 2305 }, { "epoch": 0.17262740254150058, "grad_norm": 1.3919926881790161, "learning_rate": 0.00011363549630135734, "loss": 2.3726, "step": 2306 }, { "epoch": 0.172702262646679, "grad_norm": 1.0289695262908936, "learning_rate": 0.00011357361988206448, "loss": 2.1837, "step": 2307 }, { "epoch": 0.17277712275185747, "grad_norm": 1.1593310832977295, "learning_rate": 0.00011351173816785108, "loss": 2.1553, "step": 2308 }, { "epoch": 0.1728519828570359, "grad_norm": 1.219590425491333, "learning_rate": 0.00011344985118285656, "loss": 1.6903, "step": 2309 }, { "epoch": 0.17292684296221436, "grad_norm": 1.1601866483688354, "learning_rate": 0.00011338795895122233, "loss": 2.332, "step": 2310 }, { "epoch": 0.17300170306739282, "grad_norm": 0.9273516535758972, "learning_rate": 0.00011332606149709196, "loss": 1.2429, "step": 2311 }, { "epoch": 0.17307656317257125, "grad_norm": 1.0498138666152954, "learning_rate": 0.00011326415884461085, "loss": 1.837, "step": 2312 }, { "epoch": 0.1731514232777497, "grad_norm": 1.2966227531433105, "learning_rate": 0.00011320225101792661, "loss": 2.2013, "step": 2313 }, { "epoch": 0.17322628338292814, "grad_norm": 1.1482752561569214, "learning_rate": 0.0001131403380411888, "loss": 1.9289, "step": 2314 }, { "epoch": 0.1733011434881066, "grad_norm": 1.14757239818573, "learning_rate": 0.00011307841993854897, "loss": 1.7554, "step": 2315 }, { "epoch": 0.17337600359328506, "grad_norm": 1.1772164106369019, "learning_rate": 0.00011301649673416075, "loss": 2.2828, "step": 2316 }, { "epoch": 0.1734508636984635, "grad_norm": 1.178999900817871, "learning_rate": 0.00011295456845217965, "loss": 1.6048, "step": 2317 }, { "epoch": 0.17352572380364195, "grad_norm": 1.3706114292144775, "learning_rate": 0.00011289263511676321, "loss": 2.3883, "step": 2318 }, { "epoch": 0.17360058390882038, "grad_norm": 1.264062762260437, "learning_rate": 0.00011283069675207099, "loss": 2.1744, "step": 2319 }, { "epoch": 0.17367544401399884, "grad_norm": 1.269591212272644, "learning_rate": 0.00011276875338226445, "loss": 2.2441, "step": 2320 }, { "epoch": 0.1737503041191773, "grad_norm": 1.1747082471847534, "learning_rate": 0.00011270680503150699, "loss": 2.0481, "step": 2321 }, { "epoch": 0.17382516422435573, "grad_norm": 1.307837963104248, "learning_rate": 0.00011264485172396402, "loss": 1.9925, "step": 2322 }, { "epoch": 0.1739000243295342, "grad_norm": 1.2463394403457642, "learning_rate": 0.00011258289348380281, "loss": 1.9193, "step": 2323 }, { "epoch": 0.17397488443471262, "grad_norm": 1.6332260370254517, "learning_rate": 0.00011252093033519266, "loss": 2.3171, "step": 2324 }, { "epoch": 0.17404974453989108, "grad_norm": 1.5242129564285278, "learning_rate": 0.00011245896230230468, "loss": 2.069, "step": 2325 }, { "epoch": 0.17412460464506951, "grad_norm": 1.3385379314422607, "learning_rate": 0.0001123969894093119, "loss": 2.2813, "step": 2326 }, { "epoch": 0.17419946475024797, "grad_norm": 1.4326709508895874, "learning_rate": 0.00011233501168038924, "loss": 2.3865, "step": 2327 }, { "epoch": 0.17427432485542643, "grad_norm": 1.2185444831848145, "learning_rate": 0.00011227302913971365, "loss": 2.4434, "step": 2328 }, { "epoch": 0.17434918496060486, "grad_norm": 1.3494240045547485, "learning_rate": 0.00011221104181146372, "loss": 2.0097, "step": 2329 }, { "epoch": 0.17442404506578332, "grad_norm": 1.3978830575942993, "learning_rate": 0.00011214904971982006, "loss": 2.3561, "step": 2330 }, { "epoch": 0.17449890517096175, "grad_norm": 1.3170719146728516, "learning_rate": 0.00011208705288896513, "loss": 2.6729, "step": 2331 }, { "epoch": 0.1745737652761402, "grad_norm": 1.401938557624817, "learning_rate": 0.00011202505134308323, "loss": 2.5401, "step": 2332 }, { "epoch": 0.17464862538131867, "grad_norm": 1.5648539066314697, "learning_rate": 0.00011196304510636042, "loss": 2.056, "step": 2333 }, { "epoch": 0.1747234854864971, "grad_norm": 1.1789642572402954, "learning_rate": 0.00011190103420298472, "loss": 1.4802, "step": 2334 }, { "epoch": 0.17479834559167556, "grad_norm": 1.2506096363067627, "learning_rate": 0.00011183901865714585, "loss": 2.1005, "step": 2335 }, { "epoch": 0.174873205696854, "grad_norm": 1.2457517385482788, "learning_rate": 0.00011177699849303547, "loss": 1.7883, "step": 2336 }, { "epoch": 0.17494806580203245, "grad_norm": 1.312867283821106, "learning_rate": 0.0001117149737348469, "loss": 2.3093, "step": 2337 }, { "epoch": 0.1750229259072109, "grad_norm": 1.280268907546997, "learning_rate": 0.00011165294440677531, "loss": 1.8013, "step": 2338 }, { "epoch": 0.17509778601238934, "grad_norm": 1.3248569965362549, "learning_rate": 0.00011159091053301774, "loss": 1.9957, "step": 2339 }, { "epoch": 0.1751726461175678, "grad_norm": 1.6372660398483276, "learning_rate": 0.00011152887213777283, "loss": 1.8866, "step": 2340 }, { "epoch": 0.17524750622274624, "grad_norm": 1.348328709602356, "learning_rate": 0.00011146682924524114, "loss": 2.457, "step": 2341 }, { "epoch": 0.1753223663279247, "grad_norm": 1.50859797000885, "learning_rate": 0.00011140478187962491, "loss": 2.4812, "step": 2342 }, { "epoch": 0.17539722643310313, "grad_norm": 1.3298579454421997, "learning_rate": 0.00011134273006512812, "loss": 2.4575, "step": 2343 }, { "epoch": 0.17547208653828159, "grad_norm": 1.3396886587142944, "learning_rate": 0.00011128067382595654, "loss": 1.9918, "step": 2344 }, { "epoch": 0.17554694664346004, "grad_norm": 1.2807761430740356, "learning_rate": 0.00011121861318631762, "loss": 2.1051, "step": 2345 }, { "epoch": 0.17562180674863848, "grad_norm": 1.4370964765548706, "learning_rate": 0.00011115654817042049, "loss": 2.2818, "step": 2346 }, { "epoch": 0.17569666685381694, "grad_norm": 1.5739094018936157, "learning_rate": 0.00011109447880247608, "loss": 2.3961, "step": 2347 }, { "epoch": 0.17577152695899537, "grad_norm": 1.3286712169647217, "learning_rate": 0.00011103240510669698, "loss": 2.278, "step": 2348 }, { "epoch": 0.17584638706417383, "grad_norm": 1.2167518138885498, "learning_rate": 0.0001109703271072974, "loss": 1.9497, "step": 2349 }, { "epoch": 0.17592124716935229, "grad_norm": 1.241768717765808, "learning_rate": 0.00011090824482849334, "loss": 2.0241, "step": 2350 }, { "epoch": 0.17599610727453072, "grad_norm": 1.4641040563583374, "learning_rate": 0.00011084615829450243, "loss": 1.7578, "step": 2351 }, { "epoch": 0.17607096737970918, "grad_norm": 1.2402361631393433, "learning_rate": 0.00011078406752954386, "loss": 2.3772, "step": 2352 }, { "epoch": 0.1761458274848876, "grad_norm": 1.2461415529251099, "learning_rate": 0.00011072197255783867, "loss": 2.0935, "step": 2353 }, { "epoch": 0.17622068759006607, "grad_norm": 1.2635853290557861, "learning_rate": 0.00011065987340360938, "loss": 2.0758, "step": 2354 }, { "epoch": 0.17629554769524453, "grad_norm": 1.3527950048446655, "learning_rate": 0.00011059777009108015, "loss": 2.5058, "step": 2355 }, { "epoch": 0.17637040780042296, "grad_norm": 1.2692421674728394, "learning_rate": 0.00011053566264447686, "loss": 2.0903, "step": 2356 }, { "epoch": 0.17644526790560142, "grad_norm": 1.1695441007614136, "learning_rate": 0.00011047355108802691, "loss": 1.6628, "step": 2357 }, { "epoch": 0.17652012801077985, "grad_norm": 1.2058587074279785, "learning_rate": 0.00011041143544595936, "loss": 1.7479, "step": 2358 }, { "epoch": 0.1765949881159583, "grad_norm": 1.249408483505249, "learning_rate": 0.0001103493157425048, "loss": 1.8605, "step": 2359 }, { "epoch": 0.17666984822113674, "grad_norm": 1.214215636253357, "learning_rate": 0.00011028719200189553, "loss": 2.1061, "step": 2360 }, { "epoch": 0.1767447083263152, "grad_norm": 1.3873701095581055, "learning_rate": 0.00011022506424836528, "loss": 2.0897, "step": 2361 }, { "epoch": 0.17681956843149366, "grad_norm": 1.5466455221176147, "learning_rate": 0.0001101629325061494, "loss": 1.9844, "step": 2362 }, { "epoch": 0.1768944285366721, "grad_norm": 1.1032969951629639, "learning_rate": 0.00011010079679948484, "loss": 1.9109, "step": 2363 }, { "epoch": 0.17696928864185055, "grad_norm": 1.2063508033752441, "learning_rate": 0.00011003865715261002, "loss": 2.0723, "step": 2364 }, { "epoch": 0.17704414874702898, "grad_norm": 1.2291463613510132, "learning_rate": 0.00010997651358976495, "loss": 1.9402, "step": 2365 }, { "epoch": 0.17711900885220744, "grad_norm": 1.2666687965393066, "learning_rate": 0.00010991436613519117, "loss": 2.0403, "step": 2366 }, { "epoch": 0.1771938689573859, "grad_norm": 1.2941019535064697, "learning_rate": 0.00010985221481313171, "loss": 2.1529, "step": 2367 }, { "epoch": 0.17726872906256433, "grad_norm": 1.423780918121338, "learning_rate": 0.00010979005964783114, "loss": 2.0674, "step": 2368 }, { "epoch": 0.1773435891677428, "grad_norm": 1.4584380388259888, "learning_rate": 0.00010972790066353547, "loss": 2.1056, "step": 2369 }, { "epoch": 0.17741844927292122, "grad_norm": 1.1654645204544067, "learning_rate": 0.00010966573788449224, "loss": 1.7274, "step": 2370 }, { "epoch": 0.17749330937809968, "grad_norm": 1.5468419790267944, "learning_rate": 0.00010960357133495049, "loss": 2.0936, "step": 2371 }, { "epoch": 0.17756816948327814, "grad_norm": 1.2872729301452637, "learning_rate": 0.0001095414010391607, "loss": 1.8362, "step": 2372 }, { "epoch": 0.17764302958845657, "grad_norm": 1.2397655248641968, "learning_rate": 0.00010947922702137487, "loss": 1.9746, "step": 2373 }, { "epoch": 0.17771788969363503, "grad_norm": 1.3410279750823975, "learning_rate": 0.00010941704930584635, "loss": 2.1558, "step": 2374 }, { "epoch": 0.17779274979881346, "grad_norm": 1.1884403228759766, "learning_rate": 0.00010935486791682998, "loss": 1.8736, "step": 2375 }, { "epoch": 0.17786760990399192, "grad_norm": 1.1722791194915771, "learning_rate": 0.00010929268287858205, "loss": 2.1435, "step": 2376 }, { "epoch": 0.17794247000917035, "grad_norm": 1.0521049499511719, "learning_rate": 0.00010923049421536034, "loss": 1.9362, "step": 2377 }, { "epoch": 0.1780173301143488, "grad_norm": 1.3412193059921265, "learning_rate": 0.00010916830195142382, "loss": 2.1107, "step": 2378 }, { "epoch": 0.17809219021952727, "grad_norm": 1.328353762626648, "learning_rate": 0.00010910610611103311, "loss": 1.9713, "step": 2379 }, { "epoch": 0.1781670503247057, "grad_norm": 1.475583791732788, "learning_rate": 0.00010904390671845012, "loss": 1.9591, "step": 2380 }, { "epoch": 0.17824191042988416, "grad_norm": 1.180235743522644, "learning_rate": 0.00010898170379793815, "loss": 2.3474, "step": 2381 }, { "epoch": 0.1783167705350626, "grad_norm": 1.2291929721832275, "learning_rate": 0.00010891949737376184, "loss": 2.1932, "step": 2382 }, { "epoch": 0.17839163064024105, "grad_norm": 1.2757991552352905, "learning_rate": 0.00010885728747018727, "loss": 2.0629, "step": 2383 }, { "epoch": 0.1784664907454195, "grad_norm": 1.2438286542892456, "learning_rate": 0.00010879507411148182, "loss": 2.1444, "step": 2384 }, { "epoch": 0.17854135085059794, "grad_norm": 1.4225753545761108, "learning_rate": 0.00010873285732191427, "loss": 2.0357, "step": 2385 }, { "epoch": 0.1786162109557764, "grad_norm": 1.4112006425857544, "learning_rate": 0.00010867063712575469, "loss": 2.0317, "step": 2386 }, { "epoch": 0.17869107106095483, "grad_norm": 1.2070868015289307, "learning_rate": 0.00010860841354727447, "loss": 2.011, "step": 2387 }, { "epoch": 0.1787659311661333, "grad_norm": 1.3921008110046387, "learning_rate": 0.0001085461866107464, "loss": 2.1481, "step": 2388 }, { "epoch": 0.17884079127131175, "grad_norm": 1.3689260482788086, "learning_rate": 0.00010848395634044452, "loss": 2.2961, "step": 2389 }, { "epoch": 0.17891565137649018, "grad_norm": 1.113157033920288, "learning_rate": 0.00010842172276064412, "loss": 2.3461, "step": 2390 }, { "epoch": 0.17899051148166864, "grad_norm": 1.1919639110565186, "learning_rate": 0.00010835948589562193, "loss": 1.4853, "step": 2391 }, { "epoch": 0.17906537158684707, "grad_norm": 1.1495624780654907, "learning_rate": 0.00010829724576965576, "loss": 1.8916, "step": 2392 }, { "epoch": 0.17914023169202553, "grad_norm": 1.4038195610046387, "learning_rate": 0.00010823500240702489, "loss": 2.543, "step": 2393 }, { "epoch": 0.17921509179720396, "grad_norm": 1.5629621744155884, "learning_rate": 0.00010817275583200974, "loss": 1.627, "step": 2394 }, { "epoch": 0.17928995190238242, "grad_norm": 1.1845427751541138, "learning_rate": 0.00010811050606889199, "loss": 2.0589, "step": 2395 }, { "epoch": 0.17936481200756088, "grad_norm": 1.453160285949707, "learning_rate": 0.00010804825314195464, "loss": 2.3908, "step": 2396 }, { "epoch": 0.1794396721127393, "grad_norm": 1.351306438446045, "learning_rate": 0.00010798599707548189, "loss": 2.4394, "step": 2397 }, { "epoch": 0.17951453221791777, "grad_norm": 1.115201473236084, "learning_rate": 0.00010792373789375909, "loss": 2.0219, "step": 2398 }, { "epoch": 0.1795893923230962, "grad_norm": 1.3785163164138794, "learning_rate": 0.00010786147562107287, "loss": 2.0575, "step": 2399 }, { "epoch": 0.17966425242827466, "grad_norm": 1.2896595001220703, "learning_rate": 0.00010779921028171111, "loss": 2.1608, "step": 2400 }, { "epoch": 0.17966425242827466, "eval_loss": 2.0557806491851807, "eval_runtime": 178.9407, "eval_samples_per_second": 27.942, "eval_steps_per_second": 13.971, "step": 2400 }, { "epoch": 0.17973911253345312, "grad_norm": 1.3137820959091187, "learning_rate": 0.00010773694189996282, "loss": 2.4861, "step": 2401 }, { "epoch": 0.17981397263863155, "grad_norm": 1.37283194065094, "learning_rate": 0.00010767467050011817, "loss": 2.1261, "step": 2402 }, { "epoch": 0.17988883274381, "grad_norm": 1.49021577835083, "learning_rate": 0.0001076123961064686, "loss": 2.0225, "step": 2403 }, { "epoch": 0.17996369284898844, "grad_norm": 1.3323060274124146, "learning_rate": 0.00010755011874330665, "loss": 1.9601, "step": 2404 }, { "epoch": 0.1800385529541669, "grad_norm": 1.4081263542175293, "learning_rate": 0.00010748783843492607, "loss": 1.5529, "step": 2405 }, { "epoch": 0.18011341305934536, "grad_norm": 1.1446870565414429, "learning_rate": 0.00010742555520562168, "loss": 1.6727, "step": 2406 }, { "epoch": 0.1801882731645238, "grad_norm": 1.484165906906128, "learning_rate": 0.00010736326907968947, "loss": 2.453, "step": 2407 }, { "epoch": 0.18026313326970225, "grad_norm": 1.3552354574203491, "learning_rate": 0.00010730098008142664, "loss": 2.2935, "step": 2408 }, { "epoch": 0.18033799337488068, "grad_norm": 1.3620710372924805, "learning_rate": 0.00010723868823513141, "loss": 1.6136, "step": 2409 }, { "epoch": 0.18041285348005914, "grad_norm": 1.2073098421096802, "learning_rate": 0.00010717639356510319, "loss": 2.0579, "step": 2410 }, { "epoch": 0.18048771358523757, "grad_norm": 1.2852795124053955, "learning_rate": 0.00010711409609564235, "loss": 1.9617, "step": 2411 }, { "epoch": 0.18056257369041603, "grad_norm": 1.1954832077026367, "learning_rate": 0.00010705179585105056, "loss": 2.2481, "step": 2412 }, { "epoch": 0.1806374337955945, "grad_norm": 1.355555534362793, "learning_rate": 0.00010698949285563037, "loss": 2.0261, "step": 2413 }, { "epoch": 0.18071229390077292, "grad_norm": 1.1475334167480469, "learning_rate": 0.0001069271871336856, "loss": 2.0217, "step": 2414 }, { "epoch": 0.18078715400595138, "grad_norm": 1.3151557445526123, "learning_rate": 0.00010686487870952093, "loss": 2.2612, "step": 2415 }, { "epoch": 0.18086201411112982, "grad_norm": 1.2128931283950806, "learning_rate": 0.00010680256760744225, "loss": 2.1068, "step": 2416 }, { "epoch": 0.18093687421630827, "grad_norm": 1.1355140209197998, "learning_rate": 0.00010674025385175643, "loss": 1.5051, "step": 2417 }, { "epoch": 0.18101173432148673, "grad_norm": 1.3119032382965088, "learning_rate": 0.00010667793746677138, "loss": 2.4596, "step": 2418 }, { "epoch": 0.18108659442666517, "grad_norm": 1.368741750717163, "learning_rate": 0.00010661561847679602, "loss": 1.8017, "step": 2419 }, { "epoch": 0.18116145453184362, "grad_norm": 1.4040298461914062, "learning_rate": 0.00010655329690614033, "loss": 2.3189, "step": 2420 }, { "epoch": 0.18123631463702206, "grad_norm": 1.2550078630447388, "learning_rate": 0.00010649097277911528, "loss": 1.8657, "step": 2421 }, { "epoch": 0.18131117474220051, "grad_norm": 1.2536036968231201, "learning_rate": 0.00010642864612003281, "loss": 2.0802, "step": 2422 }, { "epoch": 0.18138603484737897, "grad_norm": 1.2293274402618408, "learning_rate": 0.00010636631695320587, "loss": 2.1329, "step": 2423 }, { "epoch": 0.1814608949525574, "grad_norm": 1.357686161994934, "learning_rate": 0.00010630398530294838, "loss": 1.8015, "step": 2424 }, { "epoch": 0.18153575505773586, "grad_norm": 1.088990569114685, "learning_rate": 0.00010624165119357525, "loss": 1.9643, "step": 2425 }, { "epoch": 0.1816106151629143, "grad_norm": 1.403548002243042, "learning_rate": 0.00010617931464940236, "loss": 2.1084, "step": 2426 }, { "epoch": 0.18168547526809276, "grad_norm": 1.1516048908233643, "learning_rate": 0.00010611697569474643, "loss": 1.9105, "step": 2427 }, { "epoch": 0.1817603353732712, "grad_norm": 1.3192538022994995, "learning_rate": 0.00010605463435392526, "loss": 2.2061, "step": 2428 }, { "epoch": 0.18183519547844965, "grad_norm": 1.436417579650879, "learning_rate": 0.00010599229065125755, "loss": 2.1287, "step": 2429 }, { "epoch": 0.1819100555836281, "grad_norm": 1.2463489770889282, "learning_rate": 0.00010592994461106285, "loss": 2.1197, "step": 2430 }, { "epoch": 0.18198491568880654, "grad_norm": 1.3548386096954346, "learning_rate": 0.00010586759625766167, "loss": 2.09, "step": 2431 }, { "epoch": 0.182059775793985, "grad_norm": 1.48103666305542, "learning_rate": 0.00010580524561537543, "loss": 2.3395, "step": 2432 }, { "epoch": 0.18213463589916343, "grad_norm": 1.4086198806762695, "learning_rate": 0.00010574289270852644, "loss": 2.2417, "step": 2433 }, { "epoch": 0.1822094960043419, "grad_norm": 1.5813055038452148, "learning_rate": 0.0001056805375614379, "loss": 2.4506, "step": 2434 }, { "epoch": 0.18228435610952035, "grad_norm": 1.3066786527633667, "learning_rate": 0.00010561818019843384, "loss": 1.9471, "step": 2435 }, { "epoch": 0.18235921621469878, "grad_norm": 1.1837128400802612, "learning_rate": 0.00010555582064383918, "loss": 1.8197, "step": 2436 }, { "epoch": 0.18243407631987724, "grad_norm": 1.24669349193573, "learning_rate": 0.00010549345892197975, "loss": 1.7455, "step": 2437 }, { "epoch": 0.18250893642505567, "grad_norm": 1.167884349822998, "learning_rate": 0.00010543109505718211, "loss": 1.9162, "step": 2438 }, { "epoch": 0.18258379653023413, "grad_norm": 1.2997640371322632, "learning_rate": 0.00010536872907377375, "loss": 1.8617, "step": 2439 }, { "epoch": 0.1826586566354126, "grad_norm": 1.1053754091262817, "learning_rate": 0.00010530636099608297, "loss": 1.8749, "step": 2440 }, { "epoch": 0.18273351674059102, "grad_norm": 1.1864959001541138, "learning_rate": 0.00010524399084843886, "loss": 1.6015, "step": 2441 }, { "epoch": 0.18280837684576948, "grad_norm": 1.3350273370742798, "learning_rate": 0.00010518161865517136, "loss": 1.9075, "step": 2442 }, { "epoch": 0.1828832369509479, "grad_norm": 1.102724552154541, "learning_rate": 0.00010511924444061112, "loss": 1.8243, "step": 2443 }, { "epoch": 0.18295809705612637, "grad_norm": 1.324800968170166, "learning_rate": 0.00010505686822908966, "loss": 2.1291, "step": 2444 }, { "epoch": 0.1830329571613048, "grad_norm": 1.3160202503204346, "learning_rate": 0.00010499449004493928, "loss": 1.9623, "step": 2445 }, { "epoch": 0.18310781726648326, "grad_norm": 1.4666906595230103, "learning_rate": 0.00010493210991249307, "loss": 2.3973, "step": 2446 }, { "epoch": 0.18318267737166172, "grad_norm": 1.564254641532898, "learning_rate": 0.00010486972785608473, "loss": 2.0401, "step": 2447 }, { "epoch": 0.18325753747684015, "grad_norm": 1.1947875022888184, "learning_rate": 0.00010480734390004887, "loss": 1.8636, "step": 2448 }, { "epoch": 0.1833323975820186, "grad_norm": 1.298666000366211, "learning_rate": 0.0001047449580687208, "loss": 2.4241, "step": 2449 }, { "epoch": 0.18340725768719704, "grad_norm": 1.7609535455703735, "learning_rate": 0.00010468257038643657, "loss": 2.0261, "step": 2450 }, { "epoch": 0.1834821177923755, "grad_norm": 1.4583077430725098, "learning_rate": 0.00010462018087753285, "loss": 2.3966, "step": 2451 }, { "epoch": 0.18355697789755396, "grad_norm": 1.3493492603302002, "learning_rate": 0.00010455778956634717, "loss": 2.17, "step": 2452 }, { "epoch": 0.1836318380027324, "grad_norm": 1.3574457168579102, "learning_rate": 0.00010449539647721767, "loss": 2.3975, "step": 2453 }, { "epoch": 0.18370669810791085, "grad_norm": 1.511276125907898, "learning_rate": 0.00010443300163448325, "loss": 1.8347, "step": 2454 }, { "epoch": 0.18378155821308928, "grad_norm": 1.2114421129226685, "learning_rate": 0.00010437060506248341, "loss": 2.0608, "step": 2455 }, { "epoch": 0.18385641831826774, "grad_norm": 1.2141995429992676, "learning_rate": 0.0001043082067855584, "loss": 1.9914, "step": 2456 }, { "epoch": 0.1839312784234462, "grad_norm": 1.3581897020339966, "learning_rate": 0.00010424580682804905, "loss": 2.4897, "step": 2457 }, { "epoch": 0.18400613852862463, "grad_norm": 1.4975969791412354, "learning_rate": 0.00010418340521429701, "loss": 2.1569, "step": 2458 }, { "epoch": 0.1840809986338031, "grad_norm": 1.2859264612197876, "learning_rate": 0.00010412100196864434, "loss": 1.8581, "step": 2459 }, { "epoch": 0.18415585873898152, "grad_norm": 1.3845643997192383, "learning_rate": 0.00010405859711543393, "loss": 2.1674, "step": 2460 }, { "epoch": 0.18423071884415998, "grad_norm": 1.100691795349121, "learning_rate": 0.00010399619067900926, "loss": 2.0676, "step": 2461 }, { "epoch": 0.1843055789493384, "grad_norm": 1.356379747390747, "learning_rate": 0.00010393378268371434, "loss": 2.4956, "step": 2462 }, { "epoch": 0.18438043905451687, "grad_norm": 1.0112205743789673, "learning_rate": 0.00010387137315389383, "loss": 1.561, "step": 2463 }, { "epoch": 0.18445529915969533, "grad_norm": 1.4151655435562134, "learning_rate": 0.00010380896211389308, "loss": 2.0854, "step": 2464 }, { "epoch": 0.18453015926487376, "grad_norm": 1.134068489074707, "learning_rate": 0.00010374654958805789, "loss": 1.9162, "step": 2465 }, { "epoch": 0.18460501937005222, "grad_norm": 1.40127432346344, "learning_rate": 0.00010368413560073476, "loss": 2.3816, "step": 2466 }, { "epoch": 0.18467987947523065, "grad_norm": 1.2609999179840088, "learning_rate": 0.00010362172017627063, "loss": 2.3819, "step": 2467 }, { "epoch": 0.1847547395804091, "grad_norm": 1.1887705326080322, "learning_rate": 0.00010355930333901312, "loss": 1.6311, "step": 2468 }, { "epoch": 0.18482959968558757, "grad_norm": 1.297627329826355, "learning_rate": 0.00010349688511331038, "loss": 2.1821, "step": 2469 }, { "epoch": 0.184904459790766, "grad_norm": 1.2233471870422363, "learning_rate": 0.00010343446552351099, "loss": 1.5421, "step": 2470 }, { "epoch": 0.18497931989594446, "grad_norm": 1.4901320934295654, "learning_rate": 0.00010337204459396424, "loss": 2.0329, "step": 2471 }, { "epoch": 0.1850541800011229, "grad_norm": 1.348394751548767, "learning_rate": 0.0001033096223490198, "loss": 2.04, "step": 2472 }, { "epoch": 0.18512904010630135, "grad_norm": 1.1147196292877197, "learning_rate": 0.00010324719881302791, "loss": 1.8528, "step": 2473 }, { "epoch": 0.1852039002114798, "grad_norm": 1.3876211643218994, "learning_rate": 0.00010318477401033931, "loss": 2.044, "step": 2474 }, { "epoch": 0.18527876031665824, "grad_norm": 1.2794345617294312, "learning_rate": 0.00010312234796530528, "loss": 1.6003, "step": 2475 }, { "epoch": 0.1853536204218367, "grad_norm": 1.2758734226226807, "learning_rate": 0.00010305992070227746, "loss": 1.7788, "step": 2476 }, { "epoch": 0.18542848052701513, "grad_norm": 1.32688307762146, "learning_rate": 0.00010299749224560806, "loss": 1.9935, "step": 2477 }, { "epoch": 0.1855033406321936, "grad_norm": 1.3136181831359863, "learning_rate": 0.00010293506261964978, "loss": 2.2356, "step": 2478 }, { "epoch": 0.18557820073737202, "grad_norm": 1.2628141641616821, "learning_rate": 0.0001028726318487557, "loss": 1.9514, "step": 2479 }, { "epoch": 0.18565306084255048, "grad_norm": 1.3098877668380737, "learning_rate": 0.00010281019995727939, "loss": 2.6702, "step": 2480 }, { "epoch": 0.18572792094772894, "grad_norm": 1.2685630321502686, "learning_rate": 0.00010274776696957484, "loss": 2.0655, "step": 2481 }, { "epoch": 0.18580278105290737, "grad_norm": 1.1386290788650513, "learning_rate": 0.00010268533290999647, "loss": 2.2039, "step": 2482 }, { "epoch": 0.18587764115808583, "grad_norm": 1.3735054731369019, "learning_rate": 0.00010262289780289916, "loss": 2.5297, "step": 2483 }, { "epoch": 0.18595250126326426, "grad_norm": 1.3487844467163086, "learning_rate": 0.00010256046167263813, "loss": 2.2154, "step": 2484 }, { "epoch": 0.18602736136844272, "grad_norm": 1.3308765888214111, "learning_rate": 0.00010249802454356901, "loss": 2.217, "step": 2485 }, { "epoch": 0.18610222147362118, "grad_norm": 1.372965693473816, "learning_rate": 0.0001024355864400479, "loss": 2.0356, "step": 2486 }, { "epoch": 0.1861770815787996, "grad_norm": 1.0832070112228394, "learning_rate": 0.00010237314738643116, "loss": 2.0416, "step": 2487 }, { "epoch": 0.18625194168397807, "grad_norm": 1.414351463317871, "learning_rate": 0.0001023107074070756, "loss": 1.9992, "step": 2488 }, { "epoch": 0.1863268017891565, "grad_norm": 1.3068805932998657, "learning_rate": 0.00010224826652633841, "loss": 2.0056, "step": 2489 }, { "epoch": 0.18640166189433496, "grad_norm": 1.2403135299682617, "learning_rate": 0.00010218582476857703, "loss": 2.0356, "step": 2490 }, { "epoch": 0.18647652199951342, "grad_norm": 1.2719004154205322, "learning_rate": 0.00010212338215814936, "loss": 1.9976, "step": 2491 }, { "epoch": 0.18655138210469185, "grad_norm": 1.3200323581695557, "learning_rate": 0.00010206093871941356, "loss": 2.2906, "step": 2492 }, { "epoch": 0.1866262422098703, "grad_norm": 1.1146520376205444, "learning_rate": 0.00010199849447672811, "loss": 1.992, "step": 2493 }, { "epoch": 0.18670110231504874, "grad_norm": 1.1886343955993652, "learning_rate": 0.00010193604945445185, "loss": 2.0317, "step": 2494 }, { "epoch": 0.1867759624202272, "grad_norm": 1.2934683561325073, "learning_rate": 0.00010187360367694394, "loss": 1.9232, "step": 2495 }, { "epoch": 0.18685082252540564, "grad_norm": 1.325984001159668, "learning_rate": 0.00010181115716856369, "loss": 2.3969, "step": 2496 }, { "epoch": 0.1869256826305841, "grad_norm": 1.1523348093032837, "learning_rate": 0.00010174870995367087, "loss": 1.9248, "step": 2497 }, { "epoch": 0.18700054273576255, "grad_norm": 1.4561411142349243, "learning_rate": 0.00010168626205662545, "loss": 2.2986, "step": 2498 }, { "epoch": 0.18707540284094099, "grad_norm": 1.1501573324203491, "learning_rate": 0.00010162381350178769, "loss": 1.8985, "step": 2499 }, { "epoch": 0.18715026294611944, "grad_norm": 1.122711181640625, "learning_rate": 0.00010156136431351802, "loss": 2.1982, "step": 2500 }, { "epoch": 0.18722512305129788, "grad_norm": 1.2091299295425415, "learning_rate": 0.00010149891451617726, "loss": 1.8915, "step": 2501 }, { "epoch": 0.18729998315647634, "grad_norm": 1.2061444520950317, "learning_rate": 0.00010143646413412632, "loss": 1.4485, "step": 2502 }, { "epoch": 0.1873748432616548, "grad_norm": 1.2439600229263306, "learning_rate": 0.00010137401319172647, "loss": 1.9378, "step": 2503 }, { "epoch": 0.18744970336683323, "grad_norm": 1.3543941974639893, "learning_rate": 0.00010131156171333915, "loss": 2.1594, "step": 2504 }, { "epoch": 0.18752456347201168, "grad_norm": 1.6424062252044678, "learning_rate": 0.00010124910972332593, "loss": 2.3871, "step": 2505 }, { "epoch": 0.18759942357719012, "grad_norm": 1.4541740417480469, "learning_rate": 0.00010118665724604866, "loss": 1.7151, "step": 2506 }, { "epoch": 0.18767428368236858, "grad_norm": 1.3079863786697388, "learning_rate": 0.00010112420430586947, "loss": 2.2146, "step": 2507 }, { "epoch": 0.18774914378754703, "grad_norm": 1.167786955833435, "learning_rate": 0.0001010617509271504, "loss": 1.2594, "step": 2508 }, { "epoch": 0.18782400389272547, "grad_norm": 1.4556748867034912, "learning_rate": 0.00010099929713425396, "loss": 2.2697, "step": 2509 }, { "epoch": 0.18789886399790393, "grad_norm": 1.2253141403198242, "learning_rate": 0.00010093684295154264, "loss": 2.1751, "step": 2510 }, { "epoch": 0.18797372410308236, "grad_norm": 1.428909182548523, "learning_rate": 0.00010087438840337914, "loss": 2.1831, "step": 2511 }, { "epoch": 0.18804858420826082, "grad_norm": 1.5624293088912964, "learning_rate": 0.0001008119335141263, "loss": 2.2066, "step": 2512 }, { "epoch": 0.18812344431343925, "grad_norm": 1.1954678297042847, "learning_rate": 0.0001007494783081471, "loss": 2.1618, "step": 2513 }, { "epoch": 0.1881983044186177, "grad_norm": 1.2217872142791748, "learning_rate": 0.0001006870228098046, "loss": 2.0982, "step": 2514 }, { "epoch": 0.18827316452379617, "grad_norm": 1.5336390733718872, "learning_rate": 0.00010062456704346203, "loss": 2.2942, "step": 2515 }, { "epoch": 0.1883480246289746, "grad_norm": 1.0276116132736206, "learning_rate": 0.00010056211103348272, "loss": 1.4028, "step": 2516 }, { "epoch": 0.18842288473415306, "grad_norm": 1.2130041122436523, "learning_rate": 0.00010049965480423, "loss": 1.9901, "step": 2517 }, { "epoch": 0.1884977448393315, "grad_norm": 1.3399722576141357, "learning_rate": 0.00010043719838006745, "loss": 2.2362, "step": 2518 }, { "epoch": 0.18857260494450995, "grad_norm": 1.391645908355713, "learning_rate": 0.0001003747417853586, "loss": 1.8577, "step": 2519 }, { "epoch": 0.1886474650496884, "grad_norm": 1.7531449794769287, "learning_rate": 0.00010031228504446703, "loss": 2.4766, "step": 2520 }, { "epoch": 0.18872232515486684, "grad_norm": 1.3224066495895386, "learning_rate": 0.00010024982818175654, "loss": 2.0797, "step": 2521 }, { "epoch": 0.1887971852600453, "grad_norm": 1.3309358358383179, "learning_rate": 0.00010018737122159079, "loss": 1.499, "step": 2522 }, { "epoch": 0.18887204536522373, "grad_norm": 1.4112776517868042, "learning_rate": 0.00010012491418833359, "loss": 2.2738, "step": 2523 }, { "epoch": 0.1889469054704022, "grad_norm": 1.2399197816848755, "learning_rate": 0.00010006245710634872, "loss": 2.2903, "step": 2524 }, { "epoch": 0.18902176557558065, "grad_norm": 1.064683198928833, "learning_rate": 0.0001, "loss": 2.0448, "step": 2525 }, { "epoch": 0.18909662568075908, "grad_norm": 1.2560454607009888, "learning_rate": 9.993754289365129e-05, "loss": 2.0454, "step": 2526 }, { "epoch": 0.18917148578593754, "grad_norm": 1.1411855220794678, "learning_rate": 9.987508581166644e-05, "loss": 1.6111, "step": 2527 }, { "epoch": 0.18924634589111597, "grad_norm": 1.1737639904022217, "learning_rate": 9.981262877840921e-05, "loss": 2.3924, "step": 2528 }, { "epoch": 0.18932120599629443, "grad_norm": 1.352904200553894, "learning_rate": 9.975017181824348e-05, "loss": 2.3237, "step": 2529 }, { "epoch": 0.18939606610147286, "grad_norm": 1.1236881017684937, "learning_rate": 9.968771495553299e-05, "loss": 1.9478, "step": 2530 }, { "epoch": 0.18947092620665132, "grad_norm": 1.5267400741577148, "learning_rate": 9.962525821464145e-05, "loss": 2.3087, "step": 2531 }, { "epoch": 0.18954578631182978, "grad_norm": 1.3090686798095703, "learning_rate": 9.956280161993258e-05, "loss": 2.2377, "step": 2532 }, { "epoch": 0.1896206464170082, "grad_norm": 1.3999888896942139, "learning_rate": 9.950034519577002e-05, "loss": 1.8998, "step": 2533 }, { "epoch": 0.18969550652218667, "grad_norm": 1.292399525642395, "learning_rate": 9.943788896651732e-05, "loss": 1.9579, "step": 2534 }, { "epoch": 0.1897703666273651, "grad_norm": 1.2785112857818604, "learning_rate": 9.937543295653799e-05, "loss": 1.8744, "step": 2535 }, { "epoch": 0.18984522673254356, "grad_norm": 1.1730414628982544, "learning_rate": 9.93129771901954e-05, "loss": 1.7431, "step": 2536 }, { "epoch": 0.18992008683772202, "grad_norm": 1.5021699666976929, "learning_rate": 9.92505216918529e-05, "loss": 1.8466, "step": 2537 }, { "epoch": 0.18999494694290045, "grad_norm": 1.2555149793624878, "learning_rate": 9.91880664858737e-05, "loss": 2.1243, "step": 2538 }, { "epoch": 0.1900698070480789, "grad_norm": 1.3292269706726074, "learning_rate": 9.912561159662088e-05, "loss": 2.3852, "step": 2539 }, { "epoch": 0.19014466715325734, "grad_norm": 1.1523269414901733, "learning_rate": 9.906315704845738e-05, "loss": 2.025, "step": 2540 }, { "epoch": 0.1902195272584358, "grad_norm": 1.5344829559326172, "learning_rate": 9.900070286574608e-05, "loss": 2.2133, "step": 2541 }, { "epoch": 0.19029438736361426, "grad_norm": 1.516180157661438, "learning_rate": 9.89382490728496e-05, "loss": 2.0733, "step": 2542 }, { "epoch": 0.1903692474687927, "grad_norm": 1.2107470035552979, "learning_rate": 9.887579569413057e-05, "loss": 2.0975, "step": 2543 }, { "epoch": 0.19044410757397115, "grad_norm": 1.1909098625183105, "learning_rate": 9.881334275395134e-05, "loss": 1.8794, "step": 2544 }, { "epoch": 0.19051896767914958, "grad_norm": 1.4270952939987183, "learning_rate": 9.875089027667408e-05, "loss": 2.1947, "step": 2545 }, { "epoch": 0.19059382778432804, "grad_norm": 1.3174028396606445, "learning_rate": 9.86884382866609e-05, "loss": 1.8053, "step": 2546 }, { "epoch": 0.19066868788950647, "grad_norm": 1.4433616399765015, "learning_rate": 9.862598680827355e-05, "loss": 1.9609, "step": 2547 }, { "epoch": 0.19074354799468493, "grad_norm": 1.829928994178772, "learning_rate": 9.856353586587369e-05, "loss": 2.4705, "step": 2548 }, { "epoch": 0.1908184080998634, "grad_norm": 1.2038333415985107, "learning_rate": 9.850108548382276e-05, "loss": 2.034, "step": 2549 }, { "epoch": 0.19089326820504182, "grad_norm": 1.3259354829788208, "learning_rate": 9.843863568648199e-05, "loss": 1.9455, "step": 2550 }, { "epoch": 0.19089326820504182, "eval_loss": 2.0447895526885986, "eval_runtime": 179.3599, "eval_samples_per_second": 27.877, "eval_steps_per_second": 13.938, "step": 2550 }, { "epoch": 0.19096812831022028, "grad_norm": 1.2748475074768066, "learning_rate": 9.837618649821234e-05, "loss": 2.0375, "step": 2551 }, { "epoch": 0.1910429884153987, "grad_norm": 1.2304805517196655, "learning_rate": 9.831373794337454e-05, "loss": 2.3999, "step": 2552 }, { "epoch": 0.19111784852057717, "grad_norm": 1.277798056602478, "learning_rate": 9.825129004632914e-05, "loss": 2.0635, "step": 2553 }, { "epoch": 0.19119270862575563, "grad_norm": 1.1768804788589478, "learning_rate": 9.818884283143636e-05, "loss": 2.0627, "step": 2554 }, { "epoch": 0.19126756873093406, "grad_norm": 1.3320322036743164, "learning_rate": 9.812639632305611e-05, "loss": 2.2844, "step": 2555 }, { "epoch": 0.19134242883611252, "grad_norm": 1.4217394590377808, "learning_rate": 9.806395054554818e-05, "loss": 2.1251, "step": 2556 }, { "epoch": 0.19141728894129095, "grad_norm": 1.403164267539978, "learning_rate": 9.800150552327191e-05, "loss": 2.0264, "step": 2557 }, { "epoch": 0.1914921490464694, "grad_norm": 1.191450834274292, "learning_rate": 9.793906128058647e-05, "loss": 1.7131, "step": 2558 }, { "epoch": 0.19156700915164787, "grad_norm": 1.2296690940856934, "learning_rate": 9.787661784185066e-05, "loss": 2.2389, "step": 2559 }, { "epoch": 0.1916418692568263, "grad_norm": 1.4053499698638916, "learning_rate": 9.781417523142298e-05, "loss": 1.8555, "step": 2560 }, { "epoch": 0.19171672936200476, "grad_norm": 1.067259669303894, "learning_rate": 9.775173347366163e-05, "loss": 1.3924, "step": 2561 }, { "epoch": 0.1917915894671832, "grad_norm": 1.2865835428237915, "learning_rate": 9.768929259292444e-05, "loss": 2.0701, "step": 2562 }, { "epoch": 0.19186644957236165, "grad_norm": 1.2973881959915161, "learning_rate": 9.762685261356888e-05, "loss": 2.3991, "step": 2563 }, { "epoch": 0.19194130967754008, "grad_norm": 1.4384677410125732, "learning_rate": 9.756441355995213e-05, "loss": 2.6625, "step": 2564 }, { "epoch": 0.19201616978271854, "grad_norm": 1.4569586515426636, "learning_rate": 9.750197545643102e-05, "loss": 1.9014, "step": 2565 }, { "epoch": 0.192091029887897, "grad_norm": 1.2391289472579956, "learning_rate": 9.74395383273619e-05, "loss": 2.3223, "step": 2566 }, { "epoch": 0.19216588999307543, "grad_norm": 1.2528749704360962, "learning_rate": 9.737710219710085e-05, "loss": 1.8836, "step": 2567 }, { "epoch": 0.1922407500982539, "grad_norm": 1.2254638671875, "learning_rate": 9.731466709000351e-05, "loss": 1.6744, "step": 2568 }, { "epoch": 0.19231561020343232, "grad_norm": 1.3066940307617188, "learning_rate": 9.725223303042515e-05, "loss": 1.7766, "step": 2569 }, { "epoch": 0.19239047030861078, "grad_norm": 1.6331300735473633, "learning_rate": 9.718980004272064e-05, "loss": 2.0017, "step": 2570 }, { "epoch": 0.19246533041378924, "grad_norm": 1.1962822675704956, "learning_rate": 9.712736815124433e-05, "loss": 1.9648, "step": 2571 }, { "epoch": 0.19254019051896767, "grad_norm": 1.1961963176727295, "learning_rate": 9.706493738035024e-05, "loss": 1.8906, "step": 2572 }, { "epoch": 0.19261505062414613, "grad_norm": 1.1377671957015991, "learning_rate": 9.700250775439196e-05, "loss": 1.8653, "step": 2573 }, { "epoch": 0.19268991072932456, "grad_norm": 1.2898489236831665, "learning_rate": 9.694007929772258e-05, "loss": 1.9553, "step": 2574 }, { "epoch": 0.19276477083450302, "grad_norm": 1.1606167554855347, "learning_rate": 9.687765203469474e-05, "loss": 2.1542, "step": 2575 }, { "epoch": 0.19283963093968148, "grad_norm": 1.2841007709503174, "learning_rate": 9.68152259896607e-05, "loss": 2.2014, "step": 2576 }, { "epoch": 0.19291449104485991, "grad_norm": 1.2134219408035278, "learning_rate": 9.67528011869721e-05, "loss": 1.738, "step": 2577 }, { "epoch": 0.19298935115003837, "grad_norm": 1.2720497846603394, "learning_rate": 9.669037765098024e-05, "loss": 1.9444, "step": 2578 }, { "epoch": 0.1930642112552168, "grad_norm": 1.3088527917861938, "learning_rate": 9.662795540603581e-05, "loss": 1.9609, "step": 2579 }, { "epoch": 0.19313907136039526, "grad_norm": 1.5262404680252075, "learning_rate": 9.656553447648904e-05, "loss": 2.045, "step": 2580 }, { "epoch": 0.1932139314655737, "grad_norm": 1.2783342599868774, "learning_rate": 9.650311488668966e-05, "loss": 2.2273, "step": 2581 }, { "epoch": 0.19328879157075216, "grad_norm": 1.3930296897888184, "learning_rate": 9.644069666098689e-05, "loss": 2.2291, "step": 2582 }, { "epoch": 0.19336365167593061, "grad_norm": 1.2442337274551392, "learning_rate": 9.637827982372938e-05, "loss": 2.0103, "step": 2583 }, { "epoch": 0.19343851178110905, "grad_norm": 1.1706619262695312, "learning_rate": 9.631586439926528e-05, "loss": 2.2112, "step": 2584 }, { "epoch": 0.1935133718862875, "grad_norm": 1.4910132884979248, "learning_rate": 9.62534504119421e-05, "loss": 2.0676, "step": 2585 }, { "epoch": 0.19358823199146594, "grad_norm": 1.2126480340957642, "learning_rate": 9.619103788610692e-05, "loss": 2.0065, "step": 2586 }, { "epoch": 0.1936630920966444, "grad_norm": 1.1813138723373413, "learning_rate": 9.612862684610619e-05, "loss": 2.1951, "step": 2587 }, { "epoch": 0.19373795220182286, "grad_norm": 1.3619667291641235, "learning_rate": 9.60662173162857e-05, "loss": 2.4459, "step": 2588 }, { "epoch": 0.1938128123070013, "grad_norm": 1.179897427558899, "learning_rate": 9.600380932099077e-05, "loss": 2.273, "step": 2589 }, { "epoch": 0.19388767241217975, "grad_norm": 1.0527675151824951, "learning_rate": 9.594140288456608e-05, "loss": 2.0409, "step": 2590 }, { "epoch": 0.19396253251735818, "grad_norm": 1.3777822256088257, "learning_rate": 9.587899803135567e-05, "loss": 2.4602, "step": 2591 }, { "epoch": 0.19403739262253664, "grad_norm": 1.3661878108978271, "learning_rate": 9.581659478570302e-05, "loss": 2.2341, "step": 2592 }, { "epoch": 0.1941122527277151, "grad_norm": 1.0854320526123047, "learning_rate": 9.575419317195095e-05, "loss": 1.9141, "step": 2593 }, { "epoch": 0.19418711283289353, "grad_norm": 1.587121844291687, "learning_rate": 9.569179321444162e-05, "loss": 2.3462, "step": 2594 }, { "epoch": 0.194261972938072, "grad_norm": 1.3064324855804443, "learning_rate": 9.562939493751663e-05, "loss": 2.0004, "step": 2595 }, { "epoch": 0.19433683304325042, "grad_norm": 1.3270378112792969, "learning_rate": 9.556699836551678e-05, "loss": 1.8218, "step": 2596 }, { "epoch": 0.19441169314842888, "grad_norm": 1.3404897451400757, "learning_rate": 9.550460352278235e-05, "loss": 1.623, "step": 2597 }, { "epoch": 0.1944865532536073, "grad_norm": 1.3143974542617798, "learning_rate": 9.544221043365286e-05, "loss": 1.8836, "step": 2598 }, { "epoch": 0.19456141335878577, "grad_norm": 1.0080770254135132, "learning_rate": 9.537981912246718e-05, "loss": 2.5711, "step": 2599 }, { "epoch": 0.19463627346396423, "grad_norm": 1.216559648513794, "learning_rate": 9.531742961356347e-05, "loss": 2.0685, "step": 2600 }, { "epoch": 0.19471113356914266, "grad_norm": 1.258325219154358, "learning_rate": 9.525504193127919e-05, "loss": 1.8008, "step": 2601 }, { "epoch": 0.19478599367432112, "grad_norm": 1.3967820405960083, "learning_rate": 9.519265609995112e-05, "loss": 2.2713, "step": 2602 }, { "epoch": 0.19486085377949955, "grad_norm": 1.1186140775680542, "learning_rate": 9.51302721439153e-05, "loss": 1.7944, "step": 2603 }, { "epoch": 0.194935713884678, "grad_norm": 1.238155722618103, "learning_rate": 9.506789008750697e-05, "loss": 2.636, "step": 2604 }, { "epoch": 0.19501057398985647, "grad_norm": 1.421769380569458, "learning_rate": 9.500550995506073e-05, "loss": 2.4742, "step": 2605 }, { "epoch": 0.1950854340950349, "grad_norm": 1.267804741859436, "learning_rate": 9.494313177091035e-05, "loss": 2.3813, "step": 2606 }, { "epoch": 0.19516029420021336, "grad_norm": 1.1974612474441528, "learning_rate": 9.488075555938888e-05, "loss": 1.9419, "step": 2607 }, { "epoch": 0.1952351543053918, "grad_norm": 1.321756362915039, "learning_rate": 9.481838134482868e-05, "loss": 2.1001, "step": 2608 }, { "epoch": 0.19531001441057025, "grad_norm": 1.393060564994812, "learning_rate": 9.475600915156113e-05, "loss": 2.0565, "step": 2609 }, { "epoch": 0.1953848745157487, "grad_norm": 1.1704076528549194, "learning_rate": 9.469363900391704e-05, "loss": 1.924, "step": 2610 }, { "epoch": 0.19545973462092714, "grad_norm": 1.2706787586212158, "learning_rate": 9.463127092622629e-05, "loss": 2.2598, "step": 2611 }, { "epoch": 0.1955345947261056, "grad_norm": 1.357430100440979, "learning_rate": 9.456890494281793e-05, "loss": 2.3271, "step": 2612 }, { "epoch": 0.19560945483128403, "grad_norm": 1.1339412927627563, "learning_rate": 9.450654107802029e-05, "loss": 1.7816, "step": 2613 }, { "epoch": 0.1956843149364625, "grad_norm": 1.2230123281478882, "learning_rate": 9.444417935616083e-05, "loss": 1.7063, "step": 2614 }, { "epoch": 0.19575917504164092, "grad_norm": 1.2752526998519897, "learning_rate": 9.438181980156617e-05, "loss": 2.2671, "step": 2615 }, { "epoch": 0.19583403514681938, "grad_norm": 1.2532445192337036, "learning_rate": 9.431946243856212e-05, "loss": 2.1276, "step": 2616 }, { "epoch": 0.19590889525199784, "grad_norm": 1.3699102401733398, "learning_rate": 9.425710729147356e-05, "loss": 2.5681, "step": 2617 }, { "epoch": 0.19598375535717627, "grad_norm": 1.1688004732131958, "learning_rate": 9.419475438462457e-05, "loss": 1.9628, "step": 2618 }, { "epoch": 0.19605861546235473, "grad_norm": 1.615485429763794, "learning_rate": 9.413240374233836e-05, "loss": 2.2384, "step": 2619 }, { "epoch": 0.19613347556753316, "grad_norm": 1.3283374309539795, "learning_rate": 9.40700553889372e-05, "loss": 1.7043, "step": 2620 }, { "epoch": 0.19620833567271162, "grad_norm": 1.1778398752212524, "learning_rate": 9.400770934874247e-05, "loss": 2.0012, "step": 2621 }, { "epoch": 0.19628319577789008, "grad_norm": 1.2636797428131104, "learning_rate": 9.394536564607476e-05, "loss": 1.8021, "step": 2622 }, { "epoch": 0.1963580558830685, "grad_norm": 1.295097827911377, "learning_rate": 9.388302430525359e-05, "loss": 2.3041, "step": 2623 }, { "epoch": 0.19643291598824697, "grad_norm": 1.1565889120101929, "learning_rate": 9.382068535059766e-05, "loss": 1.6363, "step": 2624 }, { "epoch": 0.1965077760934254, "grad_norm": 1.0980713367462158, "learning_rate": 9.375834880642476e-05, "loss": 1.4092, "step": 2625 }, { "epoch": 0.19658263619860386, "grad_norm": 1.4848331212997437, "learning_rate": 9.369601469705162e-05, "loss": 2.6126, "step": 2626 }, { "epoch": 0.19665749630378232, "grad_norm": 1.5267930030822754, "learning_rate": 9.363368304679416e-05, "loss": 2.2044, "step": 2627 }, { "epoch": 0.19673235640896075, "grad_norm": 1.2536474466323853, "learning_rate": 9.357135387996724e-05, "loss": 1.7811, "step": 2628 }, { "epoch": 0.1968072165141392, "grad_norm": 1.0817437171936035, "learning_rate": 9.350902722088476e-05, "loss": 1.586, "step": 2629 }, { "epoch": 0.19688207661931764, "grad_norm": 1.1986533403396606, "learning_rate": 9.344670309385968e-05, "loss": 2.0332, "step": 2630 }, { "epoch": 0.1969569367244961, "grad_norm": 1.2967923879623413, "learning_rate": 9.3384381523204e-05, "loss": 2.1843, "step": 2631 }, { "epoch": 0.19703179682967453, "grad_norm": 1.5663295984268188, "learning_rate": 9.332206253322863e-05, "loss": 2.1046, "step": 2632 }, { "epoch": 0.197106656934853, "grad_norm": 1.2708942890167236, "learning_rate": 9.325974614824358e-05, "loss": 2.1125, "step": 2633 }, { "epoch": 0.19718151704003145, "grad_norm": 1.2190303802490234, "learning_rate": 9.319743239255775e-05, "loss": 2.177, "step": 2634 }, { "epoch": 0.19725637714520988, "grad_norm": 1.2229925394058228, "learning_rate": 9.313512129047909e-05, "loss": 1.9406, "step": 2635 }, { "epoch": 0.19733123725038834, "grad_norm": 1.2887563705444336, "learning_rate": 9.307281286631443e-05, "loss": 2.2949, "step": 2636 }, { "epoch": 0.19740609735556677, "grad_norm": 1.252303957939148, "learning_rate": 9.301050714436964e-05, "loss": 2.2135, "step": 2637 }, { "epoch": 0.19748095746074523, "grad_norm": 1.1509042978286743, "learning_rate": 9.294820414894947e-05, "loss": 1.738, "step": 2638 }, { "epoch": 0.1975558175659237, "grad_norm": 1.0888903141021729, "learning_rate": 9.288590390435766e-05, "loss": 1.7644, "step": 2639 }, { "epoch": 0.19763067767110212, "grad_norm": 1.563669204711914, "learning_rate": 9.282360643489685e-05, "loss": 2.1901, "step": 2640 }, { "epoch": 0.19770553777628058, "grad_norm": 1.172290325164795, "learning_rate": 9.276131176486858e-05, "loss": 2.2096, "step": 2641 }, { "epoch": 0.197780397881459, "grad_norm": 1.1230710744857788, "learning_rate": 9.269901991857336e-05, "loss": 2.0292, "step": 2642 }, { "epoch": 0.19785525798663747, "grad_norm": 1.3380013704299927, "learning_rate": 9.263673092031051e-05, "loss": 2.5252, "step": 2643 }, { "epoch": 0.19793011809181593, "grad_norm": 1.3382676839828491, "learning_rate": 9.257444479437837e-05, "loss": 1.6793, "step": 2644 }, { "epoch": 0.19800497819699436, "grad_norm": 1.2105766534805298, "learning_rate": 9.251216156507397e-05, "loss": 1.649, "step": 2645 }, { "epoch": 0.19807983830217282, "grad_norm": 1.1747983694076538, "learning_rate": 9.244988125669336e-05, "loss": 1.8673, "step": 2646 }, { "epoch": 0.19815469840735125, "grad_norm": 1.3249467611312866, "learning_rate": 9.238760389353141e-05, "loss": 2.173, "step": 2647 }, { "epoch": 0.1982295585125297, "grad_norm": 1.362375259399414, "learning_rate": 9.232532949988186e-05, "loss": 2.3916, "step": 2648 }, { "epoch": 0.19830441861770814, "grad_norm": 1.1283209323883057, "learning_rate": 9.22630581000372e-05, "loss": 2.07, "step": 2649 }, { "epoch": 0.1983792787228866, "grad_norm": 1.2182743549346924, "learning_rate": 9.220078971828888e-05, "loss": 1.9178, "step": 2650 }, { "epoch": 0.19845413882806506, "grad_norm": 1.4832134246826172, "learning_rate": 9.213852437892713e-05, "loss": 2.0625, "step": 2651 }, { "epoch": 0.1985289989332435, "grad_norm": 1.2311491966247559, "learning_rate": 9.207626210624096e-05, "loss": 2.1318, "step": 2652 }, { "epoch": 0.19860385903842195, "grad_norm": 1.2842941284179688, "learning_rate": 9.201400292451813e-05, "loss": 1.6853, "step": 2653 }, { "epoch": 0.19867871914360039, "grad_norm": 1.428164005279541, "learning_rate": 9.195174685804537e-05, "loss": 2.8641, "step": 2654 }, { "epoch": 0.19875357924877884, "grad_norm": 1.3392467498779297, "learning_rate": 9.188949393110802e-05, "loss": 1.9506, "step": 2655 }, { "epoch": 0.1988284393539573, "grad_norm": 1.4439001083374023, "learning_rate": 9.182724416799028e-05, "loss": 2.5173, "step": 2656 }, { "epoch": 0.19890329945913574, "grad_norm": 1.211437702178955, "learning_rate": 9.176499759297513e-05, "loss": 1.7791, "step": 2657 }, { "epoch": 0.1989781595643142, "grad_norm": 1.4455188512802124, "learning_rate": 9.170275423034425e-05, "loss": 2.1825, "step": 2658 }, { "epoch": 0.19905301966949263, "grad_norm": 1.2227174043655396, "learning_rate": 9.164051410437811e-05, "loss": 1.901, "step": 2659 }, { "epoch": 0.19912787977467108, "grad_norm": 1.3928200006484985, "learning_rate": 9.157827723935591e-05, "loss": 2.588, "step": 2660 }, { "epoch": 0.19920273987984954, "grad_norm": 1.4013200998306274, "learning_rate": 9.151604365955551e-05, "loss": 2.3951, "step": 2661 }, { "epoch": 0.19927759998502798, "grad_norm": 1.1936302185058594, "learning_rate": 9.145381338925361e-05, "loss": 1.8529, "step": 2662 }, { "epoch": 0.19935246009020643, "grad_norm": 1.3172322511672974, "learning_rate": 9.139158645272554e-05, "loss": 2.1716, "step": 2663 }, { "epoch": 0.19942732019538487, "grad_norm": 1.295825719833374, "learning_rate": 9.132936287424533e-05, "loss": 2.2162, "step": 2664 }, { "epoch": 0.19950218030056333, "grad_norm": 1.2856205701828003, "learning_rate": 9.126714267808576e-05, "loss": 2.2203, "step": 2665 }, { "epoch": 0.19957704040574176, "grad_norm": 1.3693369626998901, "learning_rate": 9.120492588851819e-05, "loss": 2.0758, "step": 2666 }, { "epoch": 0.19965190051092022, "grad_norm": 1.1753121614456177, "learning_rate": 9.114271252981273e-05, "loss": 2.1973, "step": 2667 }, { "epoch": 0.19972676061609868, "grad_norm": 1.2585920095443726, "learning_rate": 9.10805026262382e-05, "loss": 2.014, "step": 2668 }, { "epoch": 0.1998016207212771, "grad_norm": 1.2083808183670044, "learning_rate": 9.10182962020619e-05, "loss": 2.2238, "step": 2669 }, { "epoch": 0.19987648082645557, "grad_norm": 1.5279886722564697, "learning_rate": 9.09560932815499e-05, "loss": 1.7144, "step": 2670 }, { "epoch": 0.199951340931634, "grad_norm": 1.011759638786316, "learning_rate": 9.089389388896691e-05, "loss": 2.1524, "step": 2671 }, { "epoch": 0.20002620103681246, "grad_norm": 1.4613842964172363, "learning_rate": 9.08316980485762e-05, "loss": 2.0124, "step": 2672 }, { "epoch": 0.20010106114199092, "grad_norm": 1.425389289855957, "learning_rate": 9.076950578463969e-05, "loss": 2.0464, "step": 2673 }, { "epoch": 0.20017592124716935, "grad_norm": 1.0942742824554443, "learning_rate": 9.070731712141794e-05, "loss": 1.5924, "step": 2674 }, { "epoch": 0.2002507813523478, "grad_norm": 1.317574381828308, "learning_rate": 9.064513208317001e-05, "loss": 2.1259, "step": 2675 }, { "epoch": 0.20032564145752624, "grad_norm": 1.305481195449829, "learning_rate": 9.058295069415368e-05, "loss": 1.8804, "step": 2676 }, { "epoch": 0.2004005015627047, "grad_norm": 1.1263463497161865, "learning_rate": 9.052077297862517e-05, "loss": 1.6444, "step": 2677 }, { "epoch": 0.20047536166788316, "grad_norm": 1.2791147232055664, "learning_rate": 9.045859896083931e-05, "loss": 2.1612, "step": 2678 }, { "epoch": 0.2005502217730616, "grad_norm": 1.3777872323989868, "learning_rate": 9.039642866504952e-05, "loss": 2.3804, "step": 2679 }, { "epoch": 0.20062508187824005, "grad_norm": 1.0962227582931519, "learning_rate": 9.03342621155078e-05, "loss": 1.9784, "step": 2680 }, { "epoch": 0.20069994198341848, "grad_norm": 1.3184579610824585, "learning_rate": 9.027209933646457e-05, "loss": 1.8206, "step": 2681 }, { "epoch": 0.20077480208859694, "grad_norm": 1.3490742444992065, "learning_rate": 9.02099403521689e-05, "loss": 2.3604, "step": 2682 }, { "epoch": 0.20084966219377537, "grad_norm": 1.2904856204986572, "learning_rate": 9.01477851868683e-05, "loss": 1.7476, "step": 2683 }, { "epoch": 0.20092452229895383, "grad_norm": 1.3607404232025146, "learning_rate": 9.008563386480886e-05, "loss": 2.1175, "step": 2684 }, { "epoch": 0.2009993824041323, "grad_norm": 1.1896295547485352, "learning_rate": 9.002348641023506e-05, "loss": 1.9205, "step": 2685 }, { "epoch": 0.20107424250931072, "grad_norm": 1.4022303819656372, "learning_rate": 8.996134284739002e-05, "loss": 2.7401, "step": 2686 }, { "epoch": 0.20114910261448918, "grad_norm": 1.2265658378601074, "learning_rate": 8.989920320051519e-05, "loss": 1.7294, "step": 2687 }, { "epoch": 0.2012239627196676, "grad_norm": 1.5422414541244507, "learning_rate": 8.983706749385062e-05, "loss": 2.3679, "step": 2688 }, { "epoch": 0.20129882282484607, "grad_norm": 1.2299209833145142, "learning_rate": 8.977493575163473e-05, "loss": 2.0157, "step": 2689 }, { "epoch": 0.20137368293002453, "grad_norm": 1.3028496503829956, "learning_rate": 8.971280799810447e-05, "loss": 2.0111, "step": 2690 }, { "epoch": 0.20144854303520296, "grad_norm": 1.1158802509307861, "learning_rate": 8.965068425749519e-05, "loss": 1.669, "step": 2691 }, { "epoch": 0.20152340314038142, "grad_norm": 1.3870044946670532, "learning_rate": 8.958856455404069e-05, "loss": 2.1207, "step": 2692 }, { "epoch": 0.20159826324555985, "grad_norm": 1.1217799186706543, "learning_rate": 8.952644891197313e-05, "loss": 1.7545, "step": 2693 }, { "epoch": 0.2016731233507383, "grad_norm": 1.3743736743927002, "learning_rate": 8.946433735552318e-05, "loss": 2.2035, "step": 2694 }, { "epoch": 0.20174798345591677, "grad_norm": 1.3552076816558838, "learning_rate": 8.940222990891989e-05, "loss": 2.5483, "step": 2695 }, { "epoch": 0.2018228435610952, "grad_norm": 1.3407281637191772, "learning_rate": 8.934012659639066e-05, "loss": 2.3369, "step": 2696 }, { "epoch": 0.20189770366627366, "grad_norm": 1.3817660808563232, "learning_rate": 8.927802744216134e-05, "loss": 1.9119, "step": 2697 }, { "epoch": 0.2019725637714521, "grad_norm": 1.4322881698608398, "learning_rate": 8.921593247045613e-05, "loss": 1.9345, "step": 2698 }, { "epoch": 0.20204742387663055, "grad_norm": 1.2854143381118774, "learning_rate": 8.915384170549758e-05, "loss": 1.5745, "step": 2699 }, { "epoch": 0.20212228398180898, "grad_norm": 1.2690396308898926, "learning_rate": 8.909175517150669e-05, "loss": 1.9849, "step": 2700 }, { "epoch": 0.20212228398180898, "eval_loss": 2.0359911918640137, "eval_runtime": 178.9117, "eval_samples_per_second": 27.947, "eval_steps_per_second": 13.973, "step": 2700 }, { "epoch": 0.20219714408698744, "grad_norm": 1.5054725408554077, "learning_rate": 8.902967289270264e-05, "loss": 1.6142, "step": 2701 }, { "epoch": 0.2022720041921659, "grad_norm": 1.1076580286026, "learning_rate": 8.896759489330307e-05, "loss": 1.6748, "step": 2702 }, { "epoch": 0.20234686429734433, "grad_norm": 1.3114571571350098, "learning_rate": 8.890552119752396e-05, "loss": 1.9965, "step": 2703 }, { "epoch": 0.2024217244025228, "grad_norm": 1.3032405376434326, "learning_rate": 8.884345182957953e-05, "loss": 2.4391, "step": 2704 }, { "epoch": 0.20249658450770122, "grad_norm": 1.099833607673645, "learning_rate": 8.878138681368239e-05, "loss": 1.5957, "step": 2705 }, { "epoch": 0.20257144461287968, "grad_norm": 1.4137766361236572, "learning_rate": 8.871932617404347e-05, "loss": 2.3449, "step": 2706 }, { "epoch": 0.20264630471805814, "grad_norm": 1.2119399309158325, "learning_rate": 8.865726993487187e-05, "loss": 2.1643, "step": 2707 }, { "epoch": 0.20272116482323657, "grad_norm": 1.2164069414138794, "learning_rate": 8.85952181203751e-05, "loss": 2.1622, "step": 2708 }, { "epoch": 0.20279602492841503, "grad_norm": 1.1175076961517334, "learning_rate": 8.853317075475891e-05, "loss": 1.596, "step": 2709 }, { "epoch": 0.20287088503359346, "grad_norm": 1.2384850978851318, "learning_rate": 8.84711278622272e-05, "loss": 2.2415, "step": 2710 }, { "epoch": 0.20294574513877192, "grad_norm": 1.1191554069519043, "learning_rate": 8.84090894669823e-05, "loss": 1.5237, "step": 2711 }, { "epoch": 0.20302060524395038, "grad_norm": 1.4109838008880615, "learning_rate": 8.834705559322471e-05, "loss": 2.1011, "step": 2712 }, { "epoch": 0.2030954653491288, "grad_norm": 1.4362866878509521, "learning_rate": 8.828502626515312e-05, "loss": 2.0553, "step": 2713 }, { "epoch": 0.20317032545430727, "grad_norm": 1.2915695905685425, "learning_rate": 8.822300150696457e-05, "loss": 2.1381, "step": 2714 }, { "epoch": 0.2032451855594857, "grad_norm": 1.3734196424484253, "learning_rate": 8.816098134285414e-05, "loss": 2.5409, "step": 2715 }, { "epoch": 0.20332004566466416, "grad_norm": 1.321742296218872, "learning_rate": 8.809896579701527e-05, "loss": 2.2512, "step": 2716 }, { "epoch": 0.2033949057698426, "grad_norm": 1.4432071447372437, "learning_rate": 8.80369548936396e-05, "loss": 2.0397, "step": 2717 }, { "epoch": 0.20346976587502105, "grad_norm": 1.2250922918319702, "learning_rate": 8.797494865691682e-05, "loss": 2.071, "step": 2718 }, { "epoch": 0.2035446259801995, "grad_norm": 1.2964376211166382, "learning_rate": 8.791294711103488e-05, "loss": 2.2972, "step": 2719 }, { "epoch": 0.20361948608537794, "grad_norm": 1.3138844966888428, "learning_rate": 8.785095028017998e-05, "loss": 2.4092, "step": 2720 }, { "epoch": 0.2036943461905564, "grad_norm": 1.1755006313323975, "learning_rate": 8.778895818853632e-05, "loss": 2.174, "step": 2721 }, { "epoch": 0.20376920629573483, "grad_norm": 1.2082746028900146, "learning_rate": 8.772697086028639e-05, "loss": 1.9172, "step": 2722 }, { "epoch": 0.2038440664009133, "grad_norm": 1.1128599643707275, "learning_rate": 8.766498831961077e-05, "loss": 1.5557, "step": 2723 }, { "epoch": 0.20391892650609175, "grad_norm": 1.2773730754852295, "learning_rate": 8.760301059068811e-05, "loss": 1.8363, "step": 2724 }, { "epoch": 0.20399378661127018, "grad_norm": 1.352532148361206, "learning_rate": 8.754103769769536e-05, "loss": 1.9695, "step": 2725 }, { "epoch": 0.20406864671644864, "grad_norm": 1.4901468753814697, "learning_rate": 8.747906966480736e-05, "loss": 2.1048, "step": 2726 }, { "epoch": 0.20414350682162707, "grad_norm": 1.3935467004776, "learning_rate": 8.74171065161972e-05, "loss": 2.5932, "step": 2727 }, { "epoch": 0.20421836692680553, "grad_norm": 1.2123428583145142, "learning_rate": 8.735514827603599e-05, "loss": 2.2904, "step": 2728 }, { "epoch": 0.204293227031984, "grad_norm": 1.2142397165298462, "learning_rate": 8.729319496849304e-05, "loss": 1.9523, "step": 2729 }, { "epoch": 0.20436808713716242, "grad_norm": 1.1701178550720215, "learning_rate": 8.723124661773558e-05, "loss": 1.9385, "step": 2730 }, { "epoch": 0.20444294724234088, "grad_norm": 3.191516160964966, "learning_rate": 8.716930324792904e-05, "loss": 1.9992, "step": 2731 }, { "epoch": 0.20451780734751931, "grad_norm": 1.5473004579544067, "learning_rate": 8.71073648832368e-05, "loss": 2.7813, "step": 2732 }, { "epoch": 0.20459266745269777, "grad_norm": 1.229217767715454, "learning_rate": 8.70454315478204e-05, "loss": 1.6195, "step": 2733 }, { "epoch": 0.2046675275578762, "grad_norm": 1.3654612302780151, "learning_rate": 8.698350326583928e-05, "loss": 1.6865, "step": 2734 }, { "epoch": 0.20474238766305466, "grad_norm": 1.5048997402191162, "learning_rate": 8.692158006145105e-05, "loss": 2.2759, "step": 2735 }, { "epoch": 0.20481724776823312, "grad_norm": 1.36582612991333, "learning_rate": 8.685966195881123e-05, "loss": 2.2089, "step": 2736 }, { "epoch": 0.20489210787341156, "grad_norm": 1.4912790060043335, "learning_rate": 8.679774898207341e-05, "loss": 2.0161, "step": 2737 }, { "epoch": 0.20496696797859001, "grad_norm": 1.2326253652572632, "learning_rate": 8.673584115538916e-05, "loss": 1.7712, "step": 2738 }, { "epoch": 0.20504182808376845, "grad_norm": 1.3694868087768555, "learning_rate": 8.667393850290805e-05, "loss": 1.9567, "step": 2739 }, { "epoch": 0.2051166881889469, "grad_norm": 1.277735948562622, "learning_rate": 8.661204104877765e-05, "loss": 2.3738, "step": 2740 }, { "epoch": 0.20519154829412536, "grad_norm": 1.4377446174621582, "learning_rate": 8.655014881714348e-05, "loss": 2.3162, "step": 2741 }, { "epoch": 0.2052664083993038, "grad_norm": 1.3087705373764038, "learning_rate": 8.648826183214896e-05, "loss": 1.8952, "step": 2742 }, { "epoch": 0.20534126850448225, "grad_norm": 1.3787949085235596, "learning_rate": 8.642638011793556e-05, "loss": 2.8119, "step": 2743 }, { "epoch": 0.2054161286096607, "grad_norm": 1.4011765718460083, "learning_rate": 8.636450369864268e-05, "loss": 2.446, "step": 2744 }, { "epoch": 0.20549098871483915, "grad_norm": 1.3557651042938232, "learning_rate": 8.63026325984076e-05, "loss": 2.1559, "step": 2745 }, { "epoch": 0.2055658488200176, "grad_norm": 1.5108283758163452, "learning_rate": 8.62407668413656e-05, "loss": 1.9192, "step": 2746 }, { "epoch": 0.20564070892519604, "grad_norm": 1.26442289352417, "learning_rate": 8.617890645164978e-05, "loss": 1.8187, "step": 2747 }, { "epoch": 0.2057155690303745, "grad_norm": 1.2985153198242188, "learning_rate": 8.611705145339125e-05, "loss": 2.1158, "step": 2748 }, { "epoch": 0.20579042913555293, "grad_norm": 1.0844366550445557, "learning_rate": 8.605520187071897e-05, "loss": 2.0736, "step": 2749 }, { "epoch": 0.2058652892407314, "grad_norm": 1.1493586301803589, "learning_rate": 8.599335772775976e-05, "loss": 1.8086, "step": 2750 }, { "epoch": 0.20594014934590982, "grad_norm": 1.1414307355880737, "learning_rate": 8.593151904863833e-05, "loss": 1.9132, "step": 2751 }, { "epoch": 0.20601500945108828, "grad_norm": 1.3293867111206055, "learning_rate": 8.586968585747728e-05, "loss": 2.2928, "step": 2752 }, { "epoch": 0.20608986955626674, "grad_norm": 1.198611855506897, "learning_rate": 8.580785817839707e-05, "loss": 2.3776, "step": 2753 }, { "epoch": 0.20616472966144517, "grad_norm": 1.3419872522354126, "learning_rate": 8.5746036035516e-05, "loss": 1.8615, "step": 2754 }, { "epoch": 0.20623958976662363, "grad_norm": 1.2200703620910645, "learning_rate": 8.568421945295023e-05, "loss": 1.9647, "step": 2755 }, { "epoch": 0.20631444987180206, "grad_norm": 1.1118066310882568, "learning_rate": 8.562240845481369e-05, "loss": 1.9539, "step": 2756 }, { "epoch": 0.20638930997698052, "grad_norm": 1.4046443700790405, "learning_rate": 8.556060306521825e-05, "loss": 2.1673, "step": 2757 }, { "epoch": 0.20646417008215898, "grad_norm": 1.4271684885025024, "learning_rate": 8.549880330827342e-05, "loss": 2.1678, "step": 2758 }, { "epoch": 0.2065390301873374, "grad_norm": 1.3021272420883179, "learning_rate": 8.543700920808665e-05, "loss": 2.6695, "step": 2759 }, { "epoch": 0.20661389029251587, "grad_norm": 1.2937668561935425, "learning_rate": 8.537522078876314e-05, "loss": 2.2211, "step": 2760 }, { "epoch": 0.2066887503976943, "grad_norm": 1.313153862953186, "learning_rate": 8.531343807440591e-05, "loss": 2.0695, "step": 2761 }, { "epoch": 0.20676361050287276, "grad_norm": 1.1108145713806152, "learning_rate": 8.525166108911565e-05, "loss": 2.182, "step": 2762 }, { "epoch": 0.20683847060805122, "grad_norm": 1.3020979166030884, "learning_rate": 8.518988985699095e-05, "loss": 2.8573, "step": 2763 }, { "epoch": 0.20691333071322965, "grad_norm": 1.342994213104248, "learning_rate": 8.512812440212805e-05, "loss": 1.9959, "step": 2764 }, { "epoch": 0.2069881908184081, "grad_norm": 1.4580572843551636, "learning_rate": 8.506636474862098e-05, "loss": 1.943, "step": 2765 }, { "epoch": 0.20706305092358654, "grad_norm": 1.2398215532302856, "learning_rate": 8.500461092056158e-05, "loss": 2.0618, "step": 2766 }, { "epoch": 0.207137911028765, "grad_norm": 1.175234079360962, "learning_rate": 8.494286294203927e-05, "loss": 2.0189, "step": 2767 }, { "epoch": 0.20721277113394343, "grad_norm": 1.4234296083450317, "learning_rate": 8.488112083714121e-05, "loss": 2.2347, "step": 2768 }, { "epoch": 0.2072876312391219, "grad_norm": 1.566101312637329, "learning_rate": 8.481938462995244e-05, "loss": 2.2112, "step": 2769 }, { "epoch": 0.20736249134430035, "grad_norm": 1.215427279472351, "learning_rate": 8.475765434455548e-05, "loss": 1.7482, "step": 2770 }, { "epoch": 0.20743735144947878, "grad_norm": 1.3302955627441406, "learning_rate": 8.469593000503068e-05, "loss": 1.8269, "step": 2771 }, { "epoch": 0.20751221155465724, "grad_norm": 1.5141065120697021, "learning_rate": 8.463421163545606e-05, "loss": 2.2688, "step": 2772 }, { "epoch": 0.20758707165983567, "grad_norm": 1.3993855714797974, "learning_rate": 8.45724992599072e-05, "loss": 2.3758, "step": 2773 }, { "epoch": 0.20766193176501413, "grad_norm": 1.1682900190353394, "learning_rate": 8.451079290245753e-05, "loss": 1.9901, "step": 2774 }, { "epoch": 0.2077367918701926, "grad_norm": 1.366036295890808, "learning_rate": 8.444909258717795e-05, "loss": 1.6652, "step": 2775 }, { "epoch": 0.20781165197537102, "grad_norm": 1.356450080871582, "learning_rate": 8.438739833813704e-05, "loss": 2.0808, "step": 2776 }, { "epoch": 0.20788651208054948, "grad_norm": 1.6121494770050049, "learning_rate": 8.432571017940113e-05, "loss": 2.1146, "step": 2777 }, { "epoch": 0.2079613721857279, "grad_norm": 1.199275016784668, "learning_rate": 8.426402813503409e-05, "loss": 2.0236, "step": 2778 }, { "epoch": 0.20803623229090637, "grad_norm": 1.4350848197937012, "learning_rate": 8.420235222909735e-05, "loss": 1.9612, "step": 2779 }, { "epoch": 0.20811109239608483, "grad_norm": 1.2785420417785645, "learning_rate": 8.414068248565007e-05, "loss": 1.8803, "step": 2780 }, { "epoch": 0.20818595250126326, "grad_norm": 1.3801606893539429, "learning_rate": 8.40790189287489e-05, "loss": 2.0213, "step": 2781 }, { "epoch": 0.20826081260644172, "grad_norm": 1.2904820442199707, "learning_rate": 8.401736158244817e-05, "loss": 1.8051, "step": 2782 }, { "epoch": 0.20833567271162015, "grad_norm": 1.173324465751648, "learning_rate": 8.395571047079964e-05, "loss": 1.8646, "step": 2783 }, { "epoch": 0.2084105328167986, "grad_norm": 1.2112430334091187, "learning_rate": 8.389406561785283e-05, "loss": 1.581, "step": 2784 }, { "epoch": 0.20848539292197704, "grad_norm": 1.446946144104004, "learning_rate": 8.383242704765468e-05, "loss": 2.3335, "step": 2785 }, { "epoch": 0.2085602530271555, "grad_norm": 1.232229232788086, "learning_rate": 8.377079478424972e-05, "loss": 2.0053, "step": 2786 }, { "epoch": 0.20863511313233396, "grad_norm": 1.3162829875946045, "learning_rate": 8.370916885168001e-05, "loss": 2.1621, "step": 2787 }, { "epoch": 0.2087099732375124, "grad_norm": 1.2785738706588745, "learning_rate": 8.364754927398518e-05, "loss": 2.4562, "step": 2788 }, { "epoch": 0.20878483334269085, "grad_norm": 1.3278274536132812, "learning_rate": 8.358593607520237e-05, "loss": 2.13, "step": 2789 }, { "epoch": 0.20885969344786928, "grad_norm": 1.1959290504455566, "learning_rate": 8.352432927936619e-05, "loss": 1.9199, "step": 2790 }, { "epoch": 0.20893455355304774, "grad_norm": 1.4112823009490967, "learning_rate": 8.346272891050874e-05, "loss": 1.9954, "step": 2791 }, { "epoch": 0.2090094136582262, "grad_norm": 1.432204246520996, "learning_rate": 8.340113499265967e-05, "loss": 2.4359, "step": 2792 }, { "epoch": 0.20908427376340463, "grad_norm": 1.410042405128479, "learning_rate": 8.333954754984613e-05, "loss": 2.1792, "step": 2793 }, { "epoch": 0.2091591338685831, "grad_norm": 1.1657357215881348, "learning_rate": 8.327796660609266e-05, "loss": 1.6879, "step": 2794 }, { "epoch": 0.20923399397376152, "grad_norm": 1.4659231901168823, "learning_rate": 8.321639218542137e-05, "loss": 1.9171, "step": 2795 }, { "epoch": 0.20930885407893998, "grad_norm": 1.2671570777893066, "learning_rate": 8.31548243118517e-05, "loss": 2.4637, "step": 2796 }, { "epoch": 0.20938371418411844, "grad_norm": 1.263134479522705, "learning_rate": 8.309326300940063e-05, "loss": 2.129, "step": 2797 }, { "epoch": 0.20945857428929687, "grad_norm": 1.189886212348938, "learning_rate": 8.30317083020826e-05, "loss": 1.8062, "step": 2798 }, { "epoch": 0.20953343439447533, "grad_norm": 1.3179715871810913, "learning_rate": 8.297016021390936e-05, "loss": 2.0403, "step": 2799 }, { "epoch": 0.20960829449965376, "grad_norm": 1.3183176517486572, "learning_rate": 8.290861876889016e-05, "loss": 2.2076, "step": 2800 }, { "epoch": 0.20968315460483222, "grad_norm": 1.31160569190979, "learning_rate": 8.284708399103165e-05, "loss": 2.1552, "step": 2801 }, { "epoch": 0.20975801471001065, "grad_norm": 1.1805959939956665, "learning_rate": 8.278555590433786e-05, "loss": 2.2058, "step": 2802 }, { "epoch": 0.2098328748151891, "grad_norm": 1.33713698387146, "learning_rate": 8.272403453281025e-05, "loss": 2.3535, "step": 2803 }, { "epoch": 0.20990773492036757, "grad_norm": 1.5686932802200317, "learning_rate": 8.266251990044763e-05, "loss": 2.1836, "step": 2804 }, { "epoch": 0.209982595025546, "grad_norm": 1.2271898984909058, "learning_rate": 8.260101203124616e-05, "loss": 2.0933, "step": 2805 }, { "epoch": 0.21005745513072446, "grad_norm": 1.2838937044143677, "learning_rate": 8.253951094919944e-05, "loss": 2.0959, "step": 2806 }, { "epoch": 0.2101323152359029, "grad_norm": 1.1953611373901367, "learning_rate": 8.247801667829832e-05, "loss": 1.6942, "step": 2807 }, { "epoch": 0.21020717534108135, "grad_norm": 1.1873667240142822, "learning_rate": 8.241652924253102e-05, "loss": 2.0941, "step": 2808 }, { "epoch": 0.2102820354462598, "grad_norm": 1.5984896421432495, "learning_rate": 8.235504866588315e-05, "loss": 1.552, "step": 2809 }, { "epoch": 0.21035689555143824, "grad_norm": 1.312623143196106, "learning_rate": 8.229357497233762e-05, "loss": 2.5812, "step": 2810 }, { "epoch": 0.2104317556566167, "grad_norm": 1.2441725730895996, "learning_rate": 8.223210818587463e-05, "loss": 2.3495, "step": 2811 }, { "epoch": 0.21050661576179514, "grad_norm": 1.5059030055999756, "learning_rate": 8.217064833047171e-05, "loss": 2.0494, "step": 2812 }, { "epoch": 0.2105814758669736, "grad_norm": 1.283226728439331, "learning_rate": 8.210919543010366e-05, "loss": 1.9637, "step": 2813 }, { "epoch": 0.21065633597215205, "grad_norm": 1.1743619441986084, "learning_rate": 8.204774950874262e-05, "loss": 1.6117, "step": 2814 }, { "epoch": 0.21073119607733048, "grad_norm": 1.3811976909637451, "learning_rate": 8.198631059035791e-05, "loss": 2.2315, "step": 2815 }, { "epoch": 0.21080605618250894, "grad_norm": 1.156548023223877, "learning_rate": 8.192487869891624e-05, "loss": 1.8936, "step": 2816 }, { "epoch": 0.21088091628768738, "grad_norm": 1.5043896436691284, "learning_rate": 8.186345385838147e-05, "loss": 1.9812, "step": 2817 }, { "epoch": 0.21095577639286583, "grad_norm": 1.3084745407104492, "learning_rate": 8.18020360927148e-05, "loss": 1.8118, "step": 2818 }, { "epoch": 0.21103063649804427, "grad_norm": 1.3003332614898682, "learning_rate": 8.174062542587458e-05, "loss": 1.6785, "step": 2819 }, { "epoch": 0.21110549660322273, "grad_norm": 1.5183632373809814, "learning_rate": 8.167922188181647e-05, "loss": 2.6455, "step": 2820 }, { "epoch": 0.21118035670840118, "grad_norm": 1.4714878797531128, "learning_rate": 8.161782548449334e-05, "loss": 2.4046, "step": 2821 }, { "epoch": 0.21125521681357962, "grad_norm": 1.255902886390686, "learning_rate": 8.15564362578552e-05, "loss": 1.8991, "step": 2822 }, { "epoch": 0.21133007691875808, "grad_norm": 1.5493518114089966, "learning_rate": 8.14950542258494e-05, "loss": 2.1579, "step": 2823 }, { "epoch": 0.2114049370239365, "grad_norm": 1.5306426286697388, "learning_rate": 8.143367941242032e-05, "loss": 2.1442, "step": 2824 }, { "epoch": 0.21147979712911497, "grad_norm": 1.2761075496673584, "learning_rate": 8.13723118415096e-05, "loss": 2.1354, "step": 2825 }, { "epoch": 0.21155465723429343, "grad_norm": 1.1372085809707642, "learning_rate": 8.131095153705608e-05, "loss": 1.5905, "step": 2826 }, { "epoch": 0.21162951733947186, "grad_norm": 1.0045173168182373, "learning_rate": 8.124959852299577e-05, "loss": 1.8946, "step": 2827 }, { "epoch": 0.21170437744465032, "grad_norm": 1.2088665962219238, "learning_rate": 8.118825282326177e-05, "loss": 2.0191, "step": 2828 }, { "epoch": 0.21177923754982875, "grad_norm": 1.3823282718658447, "learning_rate": 8.11269144617844e-05, "loss": 1.7497, "step": 2829 }, { "epoch": 0.2118540976550072, "grad_norm": 1.2239453792572021, "learning_rate": 8.106558346249103e-05, "loss": 2.1137, "step": 2830 }, { "epoch": 0.21192895776018567, "grad_norm": 1.2667239904403687, "learning_rate": 8.100425984930631e-05, "loss": 2.3732, "step": 2831 }, { "epoch": 0.2120038178653641, "grad_norm": 1.1695929765701294, "learning_rate": 8.094294364615178e-05, "loss": 1.5676, "step": 2832 }, { "epoch": 0.21207867797054256, "grad_norm": 1.3283430337905884, "learning_rate": 8.088163487694632e-05, "loss": 2.0109, "step": 2833 }, { "epoch": 0.212153538075721, "grad_norm": 1.4625393152236938, "learning_rate": 8.082033356560575e-05, "loss": 2.3851, "step": 2834 }, { "epoch": 0.21222839818089945, "grad_norm": 1.1489344835281372, "learning_rate": 8.07590397360431e-05, "loss": 1.9663, "step": 2835 }, { "epoch": 0.21230325828607788, "grad_norm": 1.1615794897079468, "learning_rate": 8.069775341216835e-05, "loss": 1.978, "step": 2836 }, { "epoch": 0.21237811839125634, "grad_norm": 1.199622392654419, "learning_rate": 8.063647461788867e-05, "loss": 2.0291, "step": 2837 }, { "epoch": 0.2124529784964348, "grad_norm": 1.1148089170455933, "learning_rate": 8.057520337710825e-05, "loss": 2.3997, "step": 2838 }, { "epoch": 0.21252783860161323, "grad_norm": 1.155385136604309, "learning_rate": 8.051393971372835e-05, "loss": 2.0205, "step": 2839 }, { "epoch": 0.2126026987067917, "grad_norm": 1.280254602432251, "learning_rate": 8.045268365164717e-05, "loss": 1.8341, "step": 2840 }, { "epoch": 0.21267755881197012, "grad_norm": 1.4626336097717285, "learning_rate": 8.039143521476009e-05, "loss": 1.4046, "step": 2841 }, { "epoch": 0.21275241891714858, "grad_norm": 1.2735047340393066, "learning_rate": 8.033019442695946e-05, "loss": 2.1698, "step": 2842 }, { "epoch": 0.21282727902232704, "grad_norm": 2.4070489406585693, "learning_rate": 8.02689613121346e-05, "loss": 2.5174, "step": 2843 }, { "epoch": 0.21290213912750547, "grad_norm": 1.30704927444458, "learning_rate": 8.020773589417195e-05, "loss": 1.7709, "step": 2844 }, { "epoch": 0.21297699923268393, "grad_norm": 1.3706165552139282, "learning_rate": 8.014651819695478e-05, "loss": 1.7696, "step": 2845 }, { "epoch": 0.21305185933786236, "grad_norm": 1.269521713256836, "learning_rate": 8.008530824436351e-05, "loss": 1.9097, "step": 2846 }, { "epoch": 0.21312671944304082, "grad_norm": 1.215846300125122, "learning_rate": 8.002410606027549e-05, "loss": 1.618, "step": 2847 }, { "epoch": 0.21320157954821928, "grad_norm": 1.6526447534561157, "learning_rate": 7.996291166856497e-05, "loss": 1.8127, "step": 2848 }, { "epoch": 0.2132764396533977, "grad_norm": 1.3681057691574097, "learning_rate": 7.990172509310321e-05, "loss": 2.3141, "step": 2849 }, { "epoch": 0.21335129975857617, "grad_norm": 1.322493553161621, "learning_rate": 7.984054635775847e-05, "loss": 2.1229, "step": 2850 }, { "epoch": 0.21335129975857617, "eval_loss": 2.0261898040771484, "eval_runtime": 178.9561, "eval_samples_per_second": 27.94, "eval_steps_per_second": 13.97, "step": 2850 }, { "epoch": 0.2134261598637546, "grad_norm": 1.5573264360427856, "learning_rate": 7.977937548639585e-05, "loss": 2.092, "step": 2851 }, { "epoch": 0.21350101996893306, "grad_norm": 1.4672706127166748, "learning_rate": 7.971821250287746e-05, "loss": 2.0662, "step": 2852 }, { "epoch": 0.2135758800741115, "grad_norm": 1.4294664859771729, "learning_rate": 7.965705743106235e-05, "loss": 2.2041, "step": 2853 }, { "epoch": 0.21365074017928995, "grad_norm": 1.3817275762557983, "learning_rate": 7.959591029480637e-05, "loss": 2.1684, "step": 2854 }, { "epoch": 0.2137256002844684, "grad_norm": 1.4162713289260864, "learning_rate": 7.953477111796245e-05, "loss": 2.4045, "step": 2855 }, { "epoch": 0.21380046038964684, "grad_norm": 1.1877853870391846, "learning_rate": 7.947363992438021e-05, "loss": 1.7544, "step": 2856 }, { "epoch": 0.2138753204948253, "grad_norm": 1.713792324066162, "learning_rate": 7.94125167379063e-05, "loss": 2.2404, "step": 2857 }, { "epoch": 0.21395018060000373, "grad_norm": 1.2175538539886475, "learning_rate": 7.935140158238419e-05, "loss": 1.9157, "step": 2858 }, { "epoch": 0.2140250407051822, "grad_norm": 1.2929905652999878, "learning_rate": 7.929029448165427e-05, "loss": 2.2641, "step": 2859 }, { "epoch": 0.21409990081036065, "grad_norm": 1.2515925168991089, "learning_rate": 7.922919545955372e-05, "loss": 1.6149, "step": 2860 }, { "epoch": 0.21417476091553908, "grad_norm": 1.3374863862991333, "learning_rate": 7.916810453991662e-05, "loss": 1.8271, "step": 2861 }, { "epoch": 0.21424962102071754, "grad_norm": 1.2569639682769775, "learning_rate": 7.910702174657383e-05, "loss": 2.4504, "step": 2862 }, { "epoch": 0.21432448112589597, "grad_norm": 1.327976942062378, "learning_rate": 7.904594710335318e-05, "loss": 2.004, "step": 2863 }, { "epoch": 0.21439934123107443, "grad_norm": 1.1943280696868896, "learning_rate": 7.898488063407909e-05, "loss": 1.9135, "step": 2864 }, { "epoch": 0.2144742013362529, "grad_norm": 1.2409827709197998, "learning_rate": 7.892382236257298e-05, "loss": 1.9711, "step": 2865 }, { "epoch": 0.21454906144143132, "grad_norm": 1.200775384902954, "learning_rate": 7.886277231265302e-05, "loss": 1.8781, "step": 2866 }, { "epoch": 0.21462392154660978, "grad_norm": 1.4921456575393677, "learning_rate": 7.880173050813416e-05, "loss": 2.4941, "step": 2867 }, { "epoch": 0.2146987816517882, "grad_norm": 1.3333945274353027, "learning_rate": 7.874069697282812e-05, "loss": 2.0794, "step": 2868 }, { "epoch": 0.21477364175696667, "grad_norm": 1.2566499710083008, "learning_rate": 7.867967173054343e-05, "loss": 2.2646, "step": 2869 }, { "epoch": 0.2148485018621451, "grad_norm": 1.4196735620498657, "learning_rate": 7.86186548050854e-05, "loss": 2.2213, "step": 2870 }, { "epoch": 0.21492336196732356, "grad_norm": 1.4032801389694214, "learning_rate": 7.855764622025606e-05, "loss": 2.0306, "step": 2871 }, { "epoch": 0.21499822207250202, "grad_norm": 1.251185655593872, "learning_rate": 7.84966459998541e-05, "loss": 2.1841, "step": 2872 }, { "epoch": 0.21507308217768045, "grad_norm": 1.2970917224884033, "learning_rate": 7.843565416767514e-05, "loss": 2.3835, "step": 2873 }, { "epoch": 0.2151479422828589, "grad_norm": 1.3145643472671509, "learning_rate": 7.837467074751137e-05, "loss": 1.6663, "step": 2874 }, { "epoch": 0.21522280238803734, "grad_norm": 1.1643731594085693, "learning_rate": 7.831369576315176e-05, "loss": 1.4695, "step": 2875 }, { "epoch": 0.2152976624932158, "grad_norm": 1.3958686590194702, "learning_rate": 7.825272923838202e-05, "loss": 2.1042, "step": 2876 }, { "epoch": 0.21537252259839426, "grad_norm": 1.2984638214111328, "learning_rate": 7.81917711969845e-05, "loss": 2.3704, "step": 2877 }, { "epoch": 0.2154473827035727, "grad_norm": 1.2790745496749878, "learning_rate": 7.813082166273827e-05, "loss": 1.807, "step": 2878 }, { "epoch": 0.21552224280875115, "grad_norm": 1.3561424016952515, "learning_rate": 7.806988065941907e-05, "loss": 1.5299, "step": 2879 }, { "epoch": 0.21559710291392958, "grad_norm": 1.4649983644485474, "learning_rate": 7.800894821079935e-05, "loss": 1.5708, "step": 2880 }, { "epoch": 0.21567196301910804, "grad_norm": 1.3220348358154297, "learning_rate": 7.794802434064812e-05, "loss": 2.2899, "step": 2881 }, { "epoch": 0.2157468231242865, "grad_norm": 1.1246227025985718, "learning_rate": 7.788710907273116e-05, "loss": 1.7145, "step": 2882 }, { "epoch": 0.21582168322946493, "grad_norm": 1.3213832378387451, "learning_rate": 7.782620243081085e-05, "loss": 2.1923, "step": 2883 }, { "epoch": 0.2158965433346434, "grad_norm": 1.1738059520721436, "learning_rate": 7.776530443864622e-05, "loss": 1.6318, "step": 2884 }, { "epoch": 0.21597140343982182, "grad_norm": 1.1549359560012817, "learning_rate": 7.770441511999285e-05, "loss": 2.0498, "step": 2885 }, { "epoch": 0.21604626354500028, "grad_norm": 1.3939112424850464, "learning_rate": 7.764353449860304e-05, "loss": 2.3036, "step": 2886 }, { "epoch": 0.21612112365017871, "grad_norm": 1.355386734008789, "learning_rate": 7.758266259822568e-05, "loss": 2.2734, "step": 2887 }, { "epoch": 0.21619598375535717, "grad_norm": 1.1362019777297974, "learning_rate": 7.752179944260622e-05, "loss": 1.8581, "step": 2888 }, { "epoch": 0.21627084386053563, "grad_norm": 1.2159126996994019, "learning_rate": 7.746094505548666e-05, "loss": 1.9561, "step": 2889 }, { "epoch": 0.21634570396571406, "grad_norm": 1.14363694190979, "learning_rate": 7.740009946060564e-05, "loss": 2.2078, "step": 2890 }, { "epoch": 0.21642056407089252, "grad_norm": 1.4642022848129272, "learning_rate": 7.733926268169843e-05, "loss": 1.8897, "step": 2891 }, { "epoch": 0.21649542417607096, "grad_norm": 1.5700538158416748, "learning_rate": 7.72784347424967e-05, "loss": 1.9457, "step": 2892 }, { "epoch": 0.21657028428124941, "grad_norm": 1.312231183052063, "learning_rate": 7.721761566672884e-05, "loss": 2.3426, "step": 2893 }, { "epoch": 0.21664514438642787, "grad_norm": 1.372665286064148, "learning_rate": 7.715680547811965e-05, "loss": 1.8009, "step": 2894 }, { "epoch": 0.2167200044916063, "grad_norm": 1.2845001220703125, "learning_rate": 7.709600420039053e-05, "loss": 2.4564, "step": 2895 }, { "epoch": 0.21679486459678476, "grad_norm": 1.2745720148086548, "learning_rate": 7.703521185725946e-05, "loss": 2.1371, "step": 2896 }, { "epoch": 0.2168697247019632, "grad_norm": 1.3379429578781128, "learning_rate": 7.69744284724408e-05, "loss": 2.0932, "step": 2897 }, { "epoch": 0.21694458480714165, "grad_norm": 1.243205189704895, "learning_rate": 7.691365406964546e-05, "loss": 2.5631, "step": 2898 }, { "epoch": 0.21701944491232011, "grad_norm": 1.3347123861312866, "learning_rate": 7.685288867258092e-05, "loss": 2.4863, "step": 2899 }, { "epoch": 0.21709430501749855, "grad_norm": 1.11233389377594, "learning_rate": 7.679213230495107e-05, "loss": 1.5358, "step": 2900 }, { "epoch": 0.217169165122677, "grad_norm": 1.2692970037460327, "learning_rate": 7.673138499045631e-05, "loss": 1.4583, "step": 2901 }, { "epoch": 0.21724402522785544, "grad_norm": 1.1000231504440308, "learning_rate": 7.667064675279353e-05, "loss": 1.7299, "step": 2902 }, { "epoch": 0.2173188853330339, "grad_norm": 1.2453359365463257, "learning_rate": 7.6609917615656e-05, "loss": 2.1917, "step": 2903 }, { "epoch": 0.21739374543821233, "grad_norm": 1.0542539358139038, "learning_rate": 7.654919760273358e-05, "loss": 1.1897, "step": 2904 }, { "epoch": 0.21746860554339079, "grad_norm": 1.3035470247268677, "learning_rate": 7.648848673771237e-05, "loss": 1.8765, "step": 2905 }, { "epoch": 0.21754346564856925, "grad_norm": 1.210856556892395, "learning_rate": 7.642778504427506e-05, "loss": 2.0462, "step": 2906 }, { "epoch": 0.21761832575374768, "grad_norm": 1.108260989189148, "learning_rate": 7.636709254610073e-05, "loss": 2.0181, "step": 2907 }, { "epoch": 0.21769318585892614, "grad_norm": 1.173779010772705, "learning_rate": 7.630640926686485e-05, "loss": 1.2713, "step": 2908 }, { "epoch": 0.21776804596410457, "grad_norm": 1.2965070009231567, "learning_rate": 7.62457352302393e-05, "loss": 2.1748, "step": 2909 }, { "epoch": 0.21784290606928303, "grad_norm": 1.536515712738037, "learning_rate": 7.618507045989239e-05, "loss": 2.2117, "step": 2910 }, { "epoch": 0.21791776617446149, "grad_norm": 1.2173134088516235, "learning_rate": 7.61244149794887e-05, "loss": 2.0319, "step": 2911 }, { "epoch": 0.21799262627963992, "grad_norm": 1.5086474418640137, "learning_rate": 7.606376881268938e-05, "loss": 2.4808, "step": 2912 }, { "epoch": 0.21806748638481838, "grad_norm": 1.3591833114624023, "learning_rate": 7.600313198315174e-05, "loss": 1.6527, "step": 2913 }, { "epoch": 0.2181423464899968, "grad_norm": 1.2544853687286377, "learning_rate": 7.59425045145296e-05, "loss": 1.63, "step": 2914 }, { "epoch": 0.21821720659517527, "grad_norm": 1.0770962238311768, "learning_rate": 7.588188643047304e-05, "loss": 1.5395, "step": 2915 }, { "epoch": 0.21829206670035373, "grad_norm": 1.4068634510040283, "learning_rate": 7.582127775462853e-05, "loss": 2.0512, "step": 2916 }, { "epoch": 0.21836692680553216, "grad_norm": 1.3699356317520142, "learning_rate": 7.576067851063882e-05, "loss": 1.5394, "step": 2917 }, { "epoch": 0.21844178691071062, "grad_norm": 1.24029541015625, "learning_rate": 7.570008872214305e-05, "loss": 2.1826, "step": 2918 }, { "epoch": 0.21851664701588905, "grad_norm": 1.2130515575408936, "learning_rate": 7.563950841277664e-05, "loss": 2.1019, "step": 2919 }, { "epoch": 0.2185915071210675, "grad_norm": 1.3956819772720337, "learning_rate": 7.557893760617129e-05, "loss": 2.2831, "step": 2920 }, { "epoch": 0.21866636722624594, "grad_norm": 1.305478572845459, "learning_rate": 7.551837632595498e-05, "loss": 2.5015, "step": 2921 }, { "epoch": 0.2187412273314244, "grad_norm": 2.00286602973938, "learning_rate": 7.545782459575205e-05, "loss": 2.0377, "step": 2922 }, { "epoch": 0.21881608743660286, "grad_norm": 1.1167653799057007, "learning_rate": 7.5397282439183e-05, "loss": 1.6543, "step": 2923 }, { "epoch": 0.2188909475417813, "grad_norm": 1.7724477052688599, "learning_rate": 7.533674987986472e-05, "loss": 2.3445, "step": 2924 }, { "epoch": 0.21896580764695975, "grad_norm": 1.2965705394744873, "learning_rate": 7.52762269414103e-05, "loss": 2.1718, "step": 2925 }, { "epoch": 0.21904066775213818, "grad_norm": 1.3048453330993652, "learning_rate": 7.521571364742904e-05, "loss": 1.9033, "step": 2926 }, { "epoch": 0.21911552785731664, "grad_norm": 1.6700605154037476, "learning_rate": 7.515521002152655e-05, "loss": 2.2268, "step": 2927 }, { "epoch": 0.2191903879624951, "grad_norm": 1.2200087308883667, "learning_rate": 7.509471608730463e-05, "loss": 1.874, "step": 2928 }, { "epoch": 0.21926524806767353, "grad_norm": 1.456010103225708, "learning_rate": 7.503423186836125e-05, "loss": 2.1833, "step": 2929 }, { "epoch": 0.219340108172852, "grad_norm": 1.4217966794967651, "learning_rate": 7.497375738829069e-05, "loss": 2.1993, "step": 2930 }, { "epoch": 0.21941496827803042, "grad_norm": 1.3116620779037476, "learning_rate": 7.491329267068336e-05, "loss": 2.1318, "step": 2931 }, { "epoch": 0.21948982838320888, "grad_norm": 1.4451459646224976, "learning_rate": 7.485283773912591e-05, "loss": 2.0725, "step": 2932 }, { "epoch": 0.21956468848838734, "grad_norm": 1.3852113485336304, "learning_rate": 7.479239261720112e-05, "loss": 2.046, "step": 2933 }, { "epoch": 0.21963954859356577, "grad_norm": 1.1873122453689575, "learning_rate": 7.4731957328488e-05, "loss": 1.996, "step": 2934 }, { "epoch": 0.21971440869874423, "grad_norm": 1.3227158784866333, "learning_rate": 7.467153189656164e-05, "loss": 2.0625, "step": 2935 }, { "epoch": 0.21978926880392266, "grad_norm": 1.2525242567062378, "learning_rate": 7.461111634499341e-05, "loss": 1.9977, "step": 2936 }, { "epoch": 0.21986412890910112, "grad_norm": 1.0690242052078247, "learning_rate": 7.455071069735074e-05, "loss": 1.8182, "step": 2937 }, { "epoch": 0.21993898901427958, "grad_norm": 1.1022753715515137, "learning_rate": 7.449031497719716e-05, "loss": 1.7291, "step": 2938 }, { "epoch": 0.220013849119458, "grad_norm": 1.4931910037994385, "learning_rate": 7.44299292080924e-05, "loss": 1.8302, "step": 2939 }, { "epoch": 0.22008870922463647, "grad_norm": 1.7428377866744995, "learning_rate": 7.436955341359232e-05, "loss": 1.6589, "step": 2940 }, { "epoch": 0.2201635693298149, "grad_norm": 1.5203936100006104, "learning_rate": 7.430918761724881e-05, "loss": 2.0266, "step": 2941 }, { "epoch": 0.22023842943499336, "grad_norm": 1.207956314086914, "learning_rate": 7.424883184260997e-05, "loss": 2.3004, "step": 2942 }, { "epoch": 0.2203132895401718, "grad_norm": 1.3414666652679443, "learning_rate": 7.418848611321984e-05, "loss": 1.6767, "step": 2943 }, { "epoch": 0.22038814964535025, "grad_norm": 1.49296236038208, "learning_rate": 7.412815045261871e-05, "loss": 1.8364, "step": 2944 }, { "epoch": 0.2204630097505287, "grad_norm": 1.2224171161651611, "learning_rate": 7.406782488434285e-05, "loss": 2.0237, "step": 2945 }, { "epoch": 0.22053786985570714, "grad_norm": 1.063212513923645, "learning_rate": 7.400750943192457e-05, "loss": 1.7177, "step": 2946 }, { "epoch": 0.2206127299608856, "grad_norm": 1.115703821182251, "learning_rate": 7.394720411889225e-05, "loss": 1.3061, "step": 2947 }, { "epoch": 0.22068759006606403, "grad_norm": 1.4976978302001953, "learning_rate": 7.38869089687704e-05, "loss": 2.2921, "step": 2948 }, { "epoch": 0.2207624501712425, "grad_norm": 1.2641584873199463, "learning_rate": 7.38266240050794e-05, "loss": 1.5518, "step": 2949 }, { "epoch": 0.22083731027642095, "grad_norm": 1.293373465538025, "learning_rate": 7.376634925133586e-05, "loss": 2.1942, "step": 2950 }, { "epoch": 0.22091217038159938, "grad_norm": 1.222956895828247, "learning_rate": 7.370608473105224e-05, "loss": 2.0032, "step": 2951 }, { "epoch": 0.22098703048677784, "grad_norm": 1.3754801750183105, "learning_rate": 7.364583046773708e-05, "loss": 1.8186, "step": 2952 }, { "epoch": 0.22106189059195627, "grad_norm": 1.2105534076690674, "learning_rate": 7.358558648489496e-05, "loss": 2.1782, "step": 2953 }, { "epoch": 0.22113675069713473, "grad_norm": 1.2679569721221924, "learning_rate": 7.352535280602631e-05, "loss": 2.3085, "step": 2954 }, { "epoch": 0.2212116108023132, "grad_norm": 1.3291687965393066, "learning_rate": 7.346512945462767e-05, "loss": 2.1072, "step": 2955 }, { "epoch": 0.22128647090749162, "grad_norm": 1.373301386833191, "learning_rate": 7.34049164541915e-05, "loss": 2.0262, "step": 2956 }, { "epoch": 0.22136133101267008, "grad_norm": 1.3150500059127808, "learning_rate": 7.334471382820624e-05, "loss": 2.0009, "step": 2957 }, { "epoch": 0.2214361911178485, "grad_norm": 1.3875162601470947, "learning_rate": 7.328452160015628e-05, "loss": 2.2757, "step": 2958 }, { "epoch": 0.22151105122302697, "grad_norm": 1.5704519748687744, "learning_rate": 7.322433979352198e-05, "loss": 2.2065, "step": 2959 }, { "epoch": 0.2215859113282054, "grad_norm": 1.441763162612915, "learning_rate": 7.316416843177953e-05, "loss": 2.1484, "step": 2960 }, { "epoch": 0.22166077143338386, "grad_norm": 1.1905670166015625, "learning_rate": 7.310400753840122e-05, "loss": 2.3914, "step": 2961 }, { "epoch": 0.22173563153856232, "grad_norm": 1.108154296875, "learning_rate": 7.304385713685503e-05, "loss": 1.6925, "step": 2962 }, { "epoch": 0.22181049164374075, "grad_norm": 1.2776976823806763, "learning_rate": 7.298371725060506e-05, "loss": 2.2157, "step": 2963 }, { "epoch": 0.2218853517489192, "grad_norm": 1.1069761514663696, "learning_rate": 7.29235879031112e-05, "loss": 1.8885, "step": 2964 }, { "epoch": 0.22196021185409764, "grad_norm": 1.1077167987823486, "learning_rate": 7.286346911782927e-05, "loss": 2.007, "step": 2965 }, { "epoch": 0.2220350719592761, "grad_norm": 1.1509569883346558, "learning_rate": 7.28033609182109e-05, "loss": 1.2441, "step": 2966 }, { "epoch": 0.22210993206445456, "grad_norm": 1.3734452724456787, "learning_rate": 7.274326332770365e-05, "loss": 1.6326, "step": 2967 }, { "epoch": 0.222184792169633, "grad_norm": 1.3159713745117188, "learning_rate": 7.2683176369751e-05, "loss": 2.1827, "step": 2968 }, { "epoch": 0.22225965227481145, "grad_norm": 1.5505081415176392, "learning_rate": 7.262310006779219e-05, "loss": 2.0865, "step": 2969 }, { "epoch": 0.22233451237998988, "grad_norm": 1.2972732782363892, "learning_rate": 7.256303444526225e-05, "loss": 1.8786, "step": 2970 }, { "epoch": 0.22240937248516834, "grad_norm": 1.1268508434295654, "learning_rate": 7.25029795255922e-05, "loss": 2.1789, "step": 2971 }, { "epoch": 0.2224842325903468, "grad_norm": 1.535367488861084, "learning_rate": 7.244293533220876e-05, "loss": 2.566, "step": 2972 }, { "epoch": 0.22255909269552523, "grad_norm": 1.3607510328292847, "learning_rate": 7.238290188853454e-05, "loss": 2.036, "step": 2973 }, { "epoch": 0.2226339528007037, "grad_norm": 1.2089307308197021, "learning_rate": 7.232287921798794e-05, "loss": 2.1402, "step": 2974 }, { "epoch": 0.22270881290588213, "grad_norm": 1.2299387454986572, "learning_rate": 7.226286734398311e-05, "loss": 2.1493, "step": 2975 }, { "epoch": 0.22278367301106058, "grad_norm": 1.3255257606506348, "learning_rate": 7.220286628993007e-05, "loss": 2.1207, "step": 2976 }, { "epoch": 0.22285853311623902, "grad_norm": 1.3046870231628418, "learning_rate": 7.214287607923458e-05, "loss": 2.5119, "step": 2977 }, { "epoch": 0.22293339322141748, "grad_norm": 1.1704648733139038, "learning_rate": 7.208289673529807e-05, "loss": 2.0959, "step": 2978 }, { "epoch": 0.22300825332659593, "grad_norm": 1.189989447593689, "learning_rate": 7.20229282815179e-05, "loss": 1.8034, "step": 2979 }, { "epoch": 0.22308311343177437, "grad_norm": 1.3503079414367676, "learning_rate": 7.196297074128713e-05, "loss": 1.6056, "step": 2980 }, { "epoch": 0.22315797353695283, "grad_norm": 1.3348757028579712, "learning_rate": 7.190302413799448e-05, "loss": 2.3977, "step": 2981 }, { "epoch": 0.22323283364213126, "grad_norm": 1.2533862590789795, "learning_rate": 7.18430884950245e-05, "loss": 1.6882, "step": 2982 }, { "epoch": 0.22330769374730972, "grad_norm": 1.3490689992904663, "learning_rate": 7.178316383575742e-05, "loss": 2.1765, "step": 2983 }, { "epoch": 0.22338255385248817, "grad_norm": 1.244064450263977, "learning_rate": 7.172325018356918e-05, "loss": 1.7082, "step": 2984 }, { "epoch": 0.2234574139576666, "grad_norm": 1.4562369585037231, "learning_rate": 7.166334756183148e-05, "loss": 1.964, "step": 2985 }, { "epoch": 0.22353227406284507, "grad_norm": 1.1647534370422363, "learning_rate": 7.160345599391166e-05, "loss": 2.2406, "step": 2986 }, { "epoch": 0.2236071341680235, "grad_norm": 1.2876760959625244, "learning_rate": 7.15435755031727e-05, "loss": 1.4858, "step": 2987 }, { "epoch": 0.22368199427320196, "grad_norm": 1.3423960208892822, "learning_rate": 7.14837061129734e-05, "loss": 1.9824, "step": 2988 }, { "epoch": 0.22375685437838042, "grad_norm": 1.1998112201690674, "learning_rate": 7.142384784666814e-05, "loss": 1.906, "step": 2989 }, { "epoch": 0.22383171448355885, "grad_norm": 1.1030464172363281, "learning_rate": 7.136400072760696e-05, "loss": 1.7599, "step": 2990 }, { "epoch": 0.2239065745887373, "grad_norm": 1.6558773517608643, "learning_rate": 7.130416477913557e-05, "loss": 2.057, "step": 2991 }, { "epoch": 0.22398143469391574, "grad_norm": 1.3254019021987915, "learning_rate": 7.124434002459532e-05, "loss": 2.0816, "step": 2992 }, { "epoch": 0.2240562947990942, "grad_norm": 1.2396985292434692, "learning_rate": 7.11845264873232e-05, "loss": 1.7496, "step": 2993 }, { "epoch": 0.22413115490427263, "grad_norm": 1.3495776653289795, "learning_rate": 7.112472419065181e-05, "loss": 1.7566, "step": 2994 }, { "epoch": 0.2242060150094511, "grad_norm": 1.2550621032714844, "learning_rate": 7.106493315790938e-05, "loss": 2.5019, "step": 2995 }, { "epoch": 0.22428087511462955, "grad_norm": 1.373352289199829, "learning_rate": 7.10051534124197e-05, "loss": 2.2463, "step": 2996 }, { "epoch": 0.22435573521980798, "grad_norm": 1.4927558898925781, "learning_rate": 7.094538497750223e-05, "loss": 2.5038, "step": 2997 }, { "epoch": 0.22443059532498644, "grad_norm": 1.0653525590896606, "learning_rate": 7.088562787647198e-05, "loss": 1.2333, "step": 2998 }, { "epoch": 0.22450545543016487, "grad_norm": 1.3117351531982422, "learning_rate": 7.08258821326395e-05, "loss": 2.495, "step": 2999 }, { "epoch": 0.22458031553534333, "grad_norm": 1.2528645992279053, "learning_rate": 7.076614776931104e-05, "loss": 2.0596, "step": 3000 }, { "epoch": 0.22458031553534333, "eval_loss": 2.0165741443634033, "eval_runtime": 178.8885, "eval_samples_per_second": 27.95, "eval_steps_per_second": 13.975, "step": 3000 }, { "epoch": 0.2246551756405218, "grad_norm": 1.4404287338256836, "learning_rate": 7.070642480978821e-05, "loss": 1.9066, "step": 3001 }, { "epoch": 0.22473003574570022, "grad_norm": 1.5530503988265991, "learning_rate": 7.064671327736839e-05, "loss": 2.1274, "step": 3002 }, { "epoch": 0.22480489585087868, "grad_norm": 1.3341058492660522, "learning_rate": 7.058701319534433e-05, "loss": 2.1132, "step": 3003 }, { "epoch": 0.2248797559560571, "grad_norm": 1.354543685913086, "learning_rate": 7.052732458700437e-05, "loss": 2.1795, "step": 3004 }, { "epoch": 0.22495461606123557, "grad_norm": 1.1242326498031616, "learning_rate": 7.04676474756324e-05, "loss": 1.2963, "step": 3005 }, { "epoch": 0.22502947616641403, "grad_norm": 1.458835482597351, "learning_rate": 7.040798188450781e-05, "loss": 1.98, "step": 3006 }, { "epoch": 0.22510433627159246, "grad_norm": 1.1928789615631104, "learning_rate": 7.03483278369055e-05, "loss": 1.8492, "step": 3007 }, { "epoch": 0.22517919637677092, "grad_norm": 1.233738660812378, "learning_rate": 7.028868535609588e-05, "loss": 1.9784, "step": 3008 }, { "epoch": 0.22525405648194935, "grad_norm": 1.2399852275848389, "learning_rate": 7.022905446534479e-05, "loss": 2.0928, "step": 3009 }, { "epoch": 0.2253289165871278, "grad_norm": 1.2258734703063965, "learning_rate": 7.016943518791362e-05, "loss": 1.7003, "step": 3010 }, { "epoch": 0.22540377669230624, "grad_norm": 1.2817747592926025, "learning_rate": 7.010982754705915e-05, "loss": 1.938, "step": 3011 }, { "epoch": 0.2254786367974847, "grad_norm": 1.662971019744873, "learning_rate": 7.00502315660337e-05, "loss": 2.4288, "step": 3012 }, { "epoch": 0.22555349690266316, "grad_norm": 1.3368669748306274, "learning_rate": 6.999064726808502e-05, "loss": 2.2328, "step": 3013 }, { "epoch": 0.2256283570078416, "grad_norm": 1.1013509035110474, "learning_rate": 6.993107467645628e-05, "loss": 2.1923, "step": 3014 }, { "epoch": 0.22570321711302005, "grad_norm": 1.537875771522522, "learning_rate": 6.987151381438608e-05, "loss": 1.7678, "step": 3015 }, { "epoch": 0.22577807721819848, "grad_norm": 1.3733537197113037, "learning_rate": 6.981196470510847e-05, "loss": 1.9171, "step": 3016 }, { "epoch": 0.22585293732337694, "grad_norm": 2.1233999729156494, "learning_rate": 6.975242737185295e-05, "loss": 2.2291, "step": 3017 }, { "epoch": 0.2259277974285554, "grad_norm": 1.2396315336227417, "learning_rate": 6.969290183784433e-05, "loss": 1.9727, "step": 3018 }, { "epoch": 0.22600265753373383, "grad_norm": 1.2047637701034546, "learning_rate": 6.963338812630288e-05, "loss": 1.8573, "step": 3019 }, { "epoch": 0.2260775176389123, "grad_norm": 1.16303288936615, "learning_rate": 6.957388626044426e-05, "loss": 2.0895, "step": 3020 }, { "epoch": 0.22615237774409072, "grad_norm": 1.3047083616256714, "learning_rate": 6.951439626347944e-05, "loss": 2.1702, "step": 3021 }, { "epoch": 0.22622723784926918, "grad_norm": 1.1759214401245117, "learning_rate": 6.94549181586149e-05, "loss": 1.7895, "step": 3022 }, { "epoch": 0.22630209795444764, "grad_norm": 1.2919453382492065, "learning_rate": 6.939545196905235e-05, "loss": 2.7011, "step": 3023 }, { "epoch": 0.22637695805962607, "grad_norm": 1.1896443367004395, "learning_rate": 6.933599771798891e-05, "loss": 2.0044, "step": 3024 }, { "epoch": 0.22645181816480453, "grad_norm": 1.1643190383911133, "learning_rate": 6.927655542861705e-05, "loss": 2.0032, "step": 3025 }, { "epoch": 0.22652667826998296, "grad_norm": 1.8030990362167358, "learning_rate": 6.921712512412455e-05, "loss": 2.2319, "step": 3026 }, { "epoch": 0.22660153837516142, "grad_norm": 1.3477381467819214, "learning_rate": 6.915770682769447e-05, "loss": 1.8185, "step": 3027 }, { "epoch": 0.22667639848033985, "grad_norm": 1.1348663568496704, "learning_rate": 6.909830056250527e-05, "loss": 2.4027, "step": 3028 }, { "epoch": 0.2267512585855183, "grad_norm": 1.1328011751174927, "learning_rate": 6.903890635173071e-05, "loss": 2.0645, "step": 3029 }, { "epoch": 0.22682611869069677, "grad_norm": 1.1286251544952393, "learning_rate": 6.89795242185398e-05, "loss": 1.8423, "step": 3030 }, { "epoch": 0.2269009787958752, "grad_norm": 1.1225780248641968, "learning_rate": 6.892015418609686e-05, "loss": 2.1839, "step": 3031 }, { "epoch": 0.22697583890105366, "grad_norm": 1.2260732650756836, "learning_rate": 6.886079627756148e-05, "loss": 2.4571, "step": 3032 }, { "epoch": 0.2270506990062321, "grad_norm": 1.1776577234268188, "learning_rate": 6.880145051608855e-05, "loss": 2.1143, "step": 3033 }, { "epoch": 0.22712555911141055, "grad_norm": 1.2941601276397705, "learning_rate": 6.874211692482822e-05, "loss": 2.1712, "step": 3034 }, { "epoch": 0.227200419216589, "grad_norm": 1.5702803134918213, "learning_rate": 6.868279552692582e-05, "loss": 2.5821, "step": 3035 }, { "epoch": 0.22727527932176744, "grad_norm": 1.4146573543548584, "learning_rate": 6.8623486345522e-05, "loss": 2.0917, "step": 3036 }, { "epoch": 0.2273501394269459, "grad_norm": 1.2838493585586548, "learning_rate": 6.856418940375263e-05, "loss": 2.1877, "step": 3037 }, { "epoch": 0.22742499953212433, "grad_norm": 1.2730355262756348, "learning_rate": 6.85049047247488e-05, "loss": 1.899, "step": 3038 }, { "epoch": 0.2274998596373028, "grad_norm": 1.3108822107315063, "learning_rate": 6.844563233163681e-05, "loss": 1.9055, "step": 3039 }, { "epoch": 0.22757471974248125, "grad_norm": 1.191353678703308, "learning_rate": 6.838637224753817e-05, "loss": 2.0137, "step": 3040 }, { "epoch": 0.22764957984765968, "grad_norm": 1.1189868450164795, "learning_rate": 6.832712449556959e-05, "loss": 2.1838, "step": 3041 }, { "epoch": 0.22772443995283814, "grad_norm": 1.2986522912979126, "learning_rate": 6.8267889098843e-05, "loss": 2.1041, "step": 3042 }, { "epoch": 0.22779930005801657, "grad_norm": 1.2633647918701172, "learning_rate": 6.820866608046541e-05, "loss": 2.2515, "step": 3043 }, { "epoch": 0.22787416016319503, "grad_norm": 1.212093710899353, "learning_rate": 6.814945546353912e-05, "loss": 1.9235, "step": 3044 }, { "epoch": 0.22794902026837346, "grad_norm": 1.3307304382324219, "learning_rate": 6.809025727116153e-05, "loss": 2.5008, "step": 3045 }, { "epoch": 0.22802388037355192, "grad_norm": 1.244494080543518, "learning_rate": 6.803107152642523e-05, "loss": 2.5622, "step": 3046 }, { "epoch": 0.22809874047873038, "grad_norm": 1.4250775575637817, "learning_rate": 6.797189825241789e-05, "loss": 2.1881, "step": 3047 }, { "epoch": 0.22817360058390881, "grad_norm": 1.7752324342727661, "learning_rate": 6.791273747222238e-05, "loss": 2.034, "step": 3048 }, { "epoch": 0.22824846068908727, "grad_norm": 1.230432152748108, "learning_rate": 6.785358920891669e-05, "loss": 2.4342, "step": 3049 }, { "epoch": 0.2283233207942657, "grad_norm": 1.1812105178833008, "learning_rate": 6.779445348557389e-05, "loss": 1.9458, "step": 3050 }, { "epoch": 0.22839818089944416, "grad_norm": 1.2170411348342896, "learning_rate": 6.773533032526217e-05, "loss": 2.3278, "step": 3051 }, { "epoch": 0.22847304100462262, "grad_norm": 1.1337867975234985, "learning_rate": 6.767621975104484e-05, "loss": 1.655, "step": 3052 }, { "epoch": 0.22854790110980105, "grad_norm": 1.5802146196365356, "learning_rate": 6.761712178598028e-05, "loss": 2.2441, "step": 3053 }, { "epoch": 0.22862276121497951, "grad_norm": 1.1827062368392944, "learning_rate": 6.755803645312195e-05, "loss": 1.7745, "step": 3054 }, { "epoch": 0.22869762132015795, "grad_norm": 1.2335597276687622, "learning_rate": 6.749896377551843e-05, "loss": 2.2979, "step": 3055 }, { "epoch": 0.2287724814253364, "grad_norm": 1.308151125907898, "learning_rate": 6.743990377621328e-05, "loss": 1.9358, "step": 3056 }, { "epoch": 0.22884734153051486, "grad_norm": 1.5063706636428833, "learning_rate": 6.738085647824519e-05, "loss": 1.8776, "step": 3057 }, { "epoch": 0.2289222016356933, "grad_norm": 1.2106083631515503, "learning_rate": 6.732182190464784e-05, "loss": 1.6162, "step": 3058 }, { "epoch": 0.22899706174087175, "grad_norm": 1.5185240507125854, "learning_rate": 6.726280007845003e-05, "loss": 1.6317, "step": 3059 }, { "epoch": 0.22907192184605019, "grad_norm": 1.4851731061935425, "learning_rate": 6.720379102267544e-05, "loss": 1.9453, "step": 3060 }, { "epoch": 0.22914678195122865, "grad_norm": 1.34398353099823, "learning_rate": 6.714479476034291e-05, "loss": 1.7042, "step": 3061 }, { "epoch": 0.22922164205640708, "grad_norm": 1.0116623640060425, "learning_rate": 6.708581131446621e-05, "loss": 1.008, "step": 3062 }, { "epoch": 0.22929650216158554, "grad_norm": 1.4680171012878418, "learning_rate": 6.702684070805419e-05, "loss": 2.2667, "step": 3063 }, { "epoch": 0.229371362266764, "grad_norm": 1.10616934299469, "learning_rate": 6.696788296411056e-05, "loss": 2.0804, "step": 3064 }, { "epoch": 0.22944622237194243, "grad_norm": 1.3728981018066406, "learning_rate": 6.690893810563413e-05, "loss": 1.9647, "step": 3065 }, { "epoch": 0.22952108247712089, "grad_norm": 1.3725581169128418, "learning_rate": 6.685000615561866e-05, "loss": 1.8625, "step": 3066 }, { "epoch": 0.22959594258229932, "grad_norm": 1.2922804355621338, "learning_rate": 6.679108713705287e-05, "loss": 1.9046, "step": 3067 }, { "epoch": 0.22967080268747778, "grad_norm": 1.2921885251998901, "learning_rate": 6.673218107292035e-05, "loss": 2.108, "step": 3068 }, { "epoch": 0.22974566279265624, "grad_norm": 1.0724422931671143, "learning_rate": 6.667328798619978e-05, "loss": 2.1322, "step": 3069 }, { "epoch": 0.22982052289783467, "grad_norm": 1.3000832796096802, "learning_rate": 6.661440789986468e-05, "loss": 2.526, "step": 3070 }, { "epoch": 0.22989538300301313, "grad_norm": 1.2692246437072754, "learning_rate": 6.655554083688351e-05, "loss": 2.01, "step": 3071 }, { "epoch": 0.22997024310819156, "grad_norm": 1.204856514930725, "learning_rate": 6.649668682021972e-05, "loss": 1.7832, "step": 3072 }, { "epoch": 0.23004510321337002, "grad_norm": 1.291298747062683, "learning_rate": 6.643784587283158e-05, "loss": 2.2034, "step": 3073 }, { "epoch": 0.23011996331854848, "grad_norm": 1.2317590713500977, "learning_rate": 6.63790180176723e-05, "loss": 1.8608, "step": 3074 }, { "epoch": 0.2301948234237269, "grad_norm": 1.4971141815185547, "learning_rate": 6.632020327769003e-05, "loss": 2.4753, "step": 3075 }, { "epoch": 0.23026968352890537, "grad_norm": 1.2101675271987915, "learning_rate": 6.626140167582766e-05, "loss": 2.2941, "step": 3076 }, { "epoch": 0.2303445436340838, "grad_norm": 1.2945398092269897, "learning_rate": 6.62026132350231e-05, "loss": 1.5727, "step": 3077 }, { "epoch": 0.23041940373926226, "grad_norm": 1.1598494052886963, "learning_rate": 6.614383797820911e-05, "loss": 1.897, "step": 3078 }, { "epoch": 0.2304942638444407, "grad_norm": 1.329874873161316, "learning_rate": 6.608507592831324e-05, "loss": 1.9315, "step": 3079 }, { "epoch": 0.23056912394961915, "grad_norm": 1.1919814348220825, "learning_rate": 6.602632710825794e-05, "loss": 1.3176, "step": 3080 }, { "epoch": 0.2306439840547976, "grad_norm": 1.5556837320327759, "learning_rate": 6.596759154096043e-05, "loss": 1.9928, "step": 3081 }, { "epoch": 0.23071884415997604, "grad_norm": 1.1920573711395264, "learning_rate": 6.590886924933288e-05, "loss": 1.8596, "step": 3082 }, { "epoch": 0.2307937042651545, "grad_norm": 1.30616295337677, "learning_rate": 6.585016025628222e-05, "loss": 2.278, "step": 3083 }, { "epoch": 0.23086856437033293, "grad_norm": 1.3040179014205933, "learning_rate": 6.579146458471013e-05, "loss": 1.9334, "step": 3084 }, { "epoch": 0.2309434244755114, "grad_norm": 1.5079635381698608, "learning_rate": 6.573278225751313e-05, "loss": 1.629, "step": 3085 }, { "epoch": 0.23101828458068985, "grad_norm": 1.4360100030899048, "learning_rate": 6.567411329758261e-05, "loss": 2.2139, "step": 3086 }, { "epoch": 0.23109314468586828, "grad_norm": 1.162381887435913, "learning_rate": 6.56154577278047e-05, "loss": 1.8399, "step": 3087 }, { "epoch": 0.23116800479104674, "grad_norm": 1.3782131671905518, "learning_rate": 6.555681557106024e-05, "loss": 2.2324, "step": 3088 }, { "epoch": 0.23124286489622517, "grad_norm": 1.2847659587860107, "learning_rate": 6.549818685022492e-05, "loss": 2.1959, "step": 3089 }, { "epoch": 0.23131772500140363, "grad_norm": 1.6216719150543213, "learning_rate": 6.543957158816918e-05, "loss": 2.281, "step": 3090 }, { "epoch": 0.2313925851065821, "grad_norm": 1.1616488695144653, "learning_rate": 6.538096980775819e-05, "loss": 2.1286, "step": 3091 }, { "epoch": 0.23146744521176052, "grad_norm": 1.4162760972976685, "learning_rate": 6.532238153185182e-05, "loss": 2.0623, "step": 3092 }, { "epoch": 0.23154230531693898, "grad_norm": 1.0028706789016724, "learning_rate": 6.526380678330474e-05, "loss": 1.4619, "step": 3093 }, { "epoch": 0.2316171654221174, "grad_norm": 1.3184969425201416, "learning_rate": 6.520524558496633e-05, "loss": 1.9962, "step": 3094 }, { "epoch": 0.23169202552729587, "grad_norm": 1.1954115629196167, "learning_rate": 6.514669795968067e-05, "loss": 1.8516, "step": 3095 }, { "epoch": 0.2317668856324743, "grad_norm": 1.1370069980621338, "learning_rate": 6.508816393028651e-05, "loss": 1.9975, "step": 3096 }, { "epoch": 0.23184174573765276, "grad_norm": 1.213193655014038, "learning_rate": 6.502964351961737e-05, "loss": 2.1672, "step": 3097 }, { "epoch": 0.23191660584283122, "grad_norm": 1.185849666595459, "learning_rate": 6.497113675050146e-05, "loss": 1.9974, "step": 3098 }, { "epoch": 0.23199146594800965, "grad_norm": 1.2072176933288574, "learning_rate": 6.491264364576158e-05, "loss": 1.8154, "step": 3099 }, { "epoch": 0.2320663260531881, "grad_norm": 1.3286216259002686, "learning_rate": 6.485416422821521e-05, "loss": 1.741, "step": 3100 }, { "epoch": 0.23214118615836654, "grad_norm": 1.156723976135254, "learning_rate": 6.479569852067463e-05, "loss": 2.2914, "step": 3101 }, { "epoch": 0.232216046263545, "grad_norm": 1.0183074474334717, "learning_rate": 6.473724654594657e-05, "loss": 1.602, "step": 3102 }, { "epoch": 0.23229090636872346, "grad_norm": 1.2273211479187012, "learning_rate": 6.467880832683257e-05, "loss": 1.6219, "step": 3103 }, { "epoch": 0.2323657664739019, "grad_norm": 1.3742965459823608, "learning_rate": 6.462038388612874e-05, "loss": 1.98, "step": 3104 }, { "epoch": 0.23244062657908035, "grad_norm": 1.2535052299499512, "learning_rate": 6.456197324662577e-05, "loss": 1.8272, "step": 3105 }, { "epoch": 0.23251548668425878, "grad_norm": 1.2948116064071655, "learning_rate": 6.450357643110906e-05, "loss": 1.9192, "step": 3106 }, { "epoch": 0.23259034678943724, "grad_norm": 1.4577168226242065, "learning_rate": 6.444519346235858e-05, "loss": 2.1344, "step": 3107 }, { "epoch": 0.2326652068946157, "grad_norm": 1.3160547018051147, "learning_rate": 6.43868243631488e-05, "loss": 1.5132, "step": 3108 }, { "epoch": 0.23274006699979413, "grad_norm": 1.3019521236419678, "learning_rate": 6.432846915624892e-05, "loss": 2.7587, "step": 3109 }, { "epoch": 0.2328149271049726, "grad_norm": 1.1410762071609497, "learning_rate": 6.42701278644227e-05, "loss": 2.3377, "step": 3110 }, { "epoch": 0.23288978721015102, "grad_norm": 1.1049286127090454, "learning_rate": 6.421180051042838e-05, "loss": 1.9957, "step": 3111 }, { "epoch": 0.23296464731532948, "grad_norm": 1.2471596002578735, "learning_rate": 6.415348711701889e-05, "loss": 2.052, "step": 3112 }, { "epoch": 0.2330395074205079, "grad_norm": 1.2579057216644287, "learning_rate": 6.409518770694156e-05, "loss": 2.057, "step": 3113 }, { "epoch": 0.23311436752568637, "grad_norm": 1.4291948080062866, "learning_rate": 6.403690230293842e-05, "loss": 2.0088, "step": 3114 }, { "epoch": 0.23318922763086483, "grad_norm": 1.3260120153427124, "learning_rate": 6.397863092774594e-05, "loss": 2.1324, "step": 3115 }, { "epoch": 0.23326408773604326, "grad_norm": 1.0977643728256226, "learning_rate": 6.39203736040952e-05, "loss": 2.1616, "step": 3116 }, { "epoch": 0.23333894784122172, "grad_norm": 2.4645659923553467, "learning_rate": 6.386213035471162e-05, "loss": 2.2489, "step": 3117 }, { "epoch": 0.23341380794640015, "grad_norm": 1.2545013427734375, "learning_rate": 6.380390120231538e-05, "loss": 2.2661, "step": 3118 }, { "epoch": 0.2334886680515786, "grad_norm": 1.2972941398620605, "learning_rate": 6.374568616962095e-05, "loss": 1.9081, "step": 3119 }, { "epoch": 0.23356352815675707, "grad_norm": 1.4747353792190552, "learning_rate": 6.368748527933741e-05, "loss": 2.6007, "step": 3120 }, { "epoch": 0.2336383882619355, "grad_norm": 1.1941606998443604, "learning_rate": 6.362929855416831e-05, "loss": 2.3225, "step": 3121 }, { "epoch": 0.23371324836711396, "grad_norm": 1.4440853595733643, "learning_rate": 6.357112601681163e-05, "loss": 2.258, "step": 3122 }, { "epoch": 0.2337881084722924, "grad_norm": 1.216381311416626, "learning_rate": 6.351296768995984e-05, "loss": 2.4438, "step": 3123 }, { "epoch": 0.23386296857747085, "grad_norm": 1.226015329360962, "learning_rate": 6.34548235962999e-05, "loss": 2.1389, "step": 3124 }, { "epoch": 0.2339378286826493, "grad_norm": 1.3447883129119873, "learning_rate": 6.339669375851314e-05, "loss": 2.0401, "step": 3125 }, { "epoch": 0.23401268878782774, "grad_norm": 1.244720697402954, "learning_rate": 6.333857819927537e-05, "loss": 1.7607, "step": 3126 }, { "epoch": 0.2340875488930062, "grad_norm": 1.2716984748840332, "learning_rate": 6.328047694125688e-05, "loss": 1.9202, "step": 3127 }, { "epoch": 0.23416240899818463, "grad_norm": 1.20671546459198, "learning_rate": 6.322239000712227e-05, "loss": 2.0318, "step": 3128 }, { "epoch": 0.2342372691033631, "grad_norm": 1.1880805492401123, "learning_rate": 6.316431741953069e-05, "loss": 2.1266, "step": 3129 }, { "epoch": 0.23431212920854153, "grad_norm": 1.2184994220733643, "learning_rate": 6.310625920113556e-05, "loss": 2.4504, "step": 3130 }, { "epoch": 0.23438698931371998, "grad_norm": 1.1238553524017334, "learning_rate": 6.304821537458478e-05, "loss": 1.976, "step": 3131 }, { "epoch": 0.23446184941889844, "grad_norm": 1.2202082872390747, "learning_rate": 6.299018596252066e-05, "loss": 1.5358, "step": 3132 }, { "epoch": 0.23453670952407688, "grad_norm": 1.4962273836135864, "learning_rate": 6.293217098757975e-05, "loss": 2.4291, "step": 3133 }, { "epoch": 0.23461156962925533, "grad_norm": 1.1973357200622559, "learning_rate": 6.28741704723931e-05, "loss": 2.3163, "step": 3134 }, { "epoch": 0.23468642973443377, "grad_norm": 1.4609187841415405, "learning_rate": 6.281618443958606e-05, "loss": 1.9247, "step": 3135 }, { "epoch": 0.23476128983961222, "grad_norm": 1.2050145864486694, "learning_rate": 6.275821291177837e-05, "loss": 2.3901, "step": 3136 }, { "epoch": 0.23483614994479068, "grad_norm": 1.2308610677719116, "learning_rate": 6.270025591158409e-05, "loss": 1.8537, "step": 3137 }, { "epoch": 0.23491101004996912, "grad_norm": 1.2254767417907715, "learning_rate": 6.264231346161158e-05, "loss": 1.5755, "step": 3138 }, { "epoch": 0.23498587015514757, "grad_norm": 1.4054858684539795, "learning_rate": 6.258438558446358e-05, "loss": 2.2535, "step": 3139 }, { "epoch": 0.235060730260326, "grad_norm": 1.0767781734466553, "learning_rate": 6.252647230273714e-05, "loss": 1.7219, "step": 3140 }, { "epoch": 0.23513559036550447, "grad_norm": 1.0943024158477783, "learning_rate": 6.246857363902354e-05, "loss": 1.7817, "step": 3141 }, { "epoch": 0.23521045047068292, "grad_norm": 1.4047867059707642, "learning_rate": 6.241068961590845e-05, "loss": 1.4876, "step": 3142 }, { "epoch": 0.23528531057586136, "grad_norm": 0.9368618726730347, "learning_rate": 6.235282025597179e-05, "loss": 1.6392, "step": 3143 }, { "epoch": 0.23536017068103982, "grad_norm": 1.4061073064804077, "learning_rate": 6.229496558178778e-05, "loss": 1.5589, "step": 3144 }, { "epoch": 0.23543503078621825, "grad_norm": 1.2102389335632324, "learning_rate": 6.223712561592486e-05, "loss": 1.8149, "step": 3145 }, { "epoch": 0.2355098908913967, "grad_norm": 1.3655747175216675, "learning_rate": 6.217930038094577e-05, "loss": 2.4338, "step": 3146 }, { "epoch": 0.23558475099657514, "grad_norm": 1.3353931903839111, "learning_rate": 6.212148989940756e-05, "loss": 2.1525, "step": 3147 }, { "epoch": 0.2356596111017536, "grad_norm": 1.3801790475845337, "learning_rate": 6.206369419386142e-05, "loss": 1.9872, "step": 3148 }, { "epoch": 0.23573447120693206, "grad_norm": 1.3120461702346802, "learning_rate": 6.20059132868528e-05, "loss": 2.0376, "step": 3149 }, { "epoch": 0.2358093313121105, "grad_norm": 1.1618291139602661, "learning_rate": 6.194814720092144e-05, "loss": 2.0005, "step": 3150 }, { "epoch": 0.2358093313121105, "eval_loss": 2.0094358921051025, "eval_runtime": 178.9443, "eval_samples_per_second": 27.942, "eval_steps_per_second": 13.971, "step": 3150 }, { "epoch": 0.23588419141728895, "grad_norm": 1.139782428741455, "learning_rate": 6.189039595860123e-05, "loss": 2.1169, "step": 3151 }, { "epoch": 0.23595905152246738, "grad_norm": 1.2362688779830933, "learning_rate": 6.18326595824203e-05, "loss": 2.0219, "step": 3152 }, { "epoch": 0.23603391162764584, "grad_norm": 1.3213380575180054, "learning_rate": 6.1774938094901e-05, "loss": 2.3657, "step": 3153 }, { "epoch": 0.2361087717328243, "grad_norm": 1.4174034595489502, "learning_rate": 6.171723151855984e-05, "loss": 2.2734, "step": 3154 }, { "epoch": 0.23618363183800273, "grad_norm": 1.1825603246688843, "learning_rate": 6.165953987590753e-05, "loss": 2.357, "step": 3155 }, { "epoch": 0.2362584919431812, "grad_norm": 1.1966662406921387, "learning_rate": 6.160186318944895e-05, "loss": 1.9445, "step": 3156 }, { "epoch": 0.23633335204835962, "grad_norm": 1.1162891387939453, "learning_rate": 6.154420148168311e-05, "loss": 1.8769, "step": 3157 }, { "epoch": 0.23640821215353808, "grad_norm": 1.1850206851959229, "learning_rate": 6.148655477510322e-05, "loss": 2.2263, "step": 3158 }, { "epoch": 0.23648307225871654, "grad_norm": 1.42572021484375, "learning_rate": 6.142892309219668e-05, "loss": 2.1058, "step": 3159 }, { "epoch": 0.23655793236389497, "grad_norm": 1.513978362083435, "learning_rate": 6.13713064554449e-05, "loss": 2.318, "step": 3160 }, { "epoch": 0.23663279246907343, "grad_norm": 1.2592381238937378, "learning_rate": 6.131370488732357e-05, "loss": 1.8314, "step": 3161 }, { "epoch": 0.23670765257425186, "grad_norm": 1.1455496549606323, "learning_rate": 6.12561184103024e-05, "loss": 1.6568, "step": 3162 }, { "epoch": 0.23678251267943032, "grad_norm": 1.2599461078643799, "learning_rate": 6.119854704684522e-05, "loss": 1.8008, "step": 3163 }, { "epoch": 0.23685737278460875, "grad_norm": 1.3524178266525269, "learning_rate": 6.114099081941007e-05, "loss": 1.8063, "step": 3164 }, { "epoch": 0.2369322328897872, "grad_norm": 1.0276775360107422, "learning_rate": 6.108344975044893e-05, "loss": 1.7533, "step": 3165 }, { "epoch": 0.23700709299496567, "grad_norm": 1.1788076162338257, "learning_rate": 6.102592386240793e-05, "loss": 1.5716, "step": 3166 }, { "epoch": 0.2370819531001441, "grad_norm": 1.1917275190353394, "learning_rate": 6.0968413177727346e-05, "loss": 2.4975, "step": 3167 }, { "epoch": 0.23715681320532256, "grad_norm": 1.4926486015319824, "learning_rate": 6.091091771884141e-05, "loss": 1.8374, "step": 3168 }, { "epoch": 0.237231673310501, "grad_norm": 1.131089687347412, "learning_rate": 6.085343750817851e-05, "loss": 2.2634, "step": 3169 }, { "epoch": 0.23730653341567945, "grad_norm": 1.3783355951309204, "learning_rate": 6.079597256816107e-05, "loss": 2.1034, "step": 3170 }, { "epoch": 0.2373813935208579, "grad_norm": 1.2427281141281128, "learning_rate": 6.0738522921205456e-05, "loss": 2.3016, "step": 3171 }, { "epoch": 0.23745625362603634, "grad_norm": 1.1005496978759766, "learning_rate": 6.068108858972223e-05, "loss": 2.0916, "step": 3172 }, { "epoch": 0.2375311137312148, "grad_norm": 1.2752374410629272, "learning_rate": 6.062366959611587e-05, "loss": 1.8265, "step": 3173 }, { "epoch": 0.23760597383639323, "grad_norm": 1.6435338258743286, "learning_rate": 6.056626596278485e-05, "loss": 2.2332, "step": 3174 }, { "epoch": 0.2376808339415717, "grad_norm": 1.2263280153274536, "learning_rate": 6.0508877712121723e-05, "loss": 1.8747, "step": 3175 }, { "epoch": 0.23775569404675015, "grad_norm": 1.1826199293136597, "learning_rate": 6.045150486651305e-05, "loss": 1.7956, "step": 3176 }, { "epoch": 0.23783055415192858, "grad_norm": 1.1301007270812988, "learning_rate": 6.039414744833931e-05, "loss": 1.1023, "step": 3177 }, { "epoch": 0.23790541425710704, "grad_norm": 1.3945963382720947, "learning_rate": 6.033680547997504e-05, "loss": 2.2929, "step": 3178 }, { "epoch": 0.23798027436228547, "grad_norm": 1.271061897277832, "learning_rate": 6.02794789837887e-05, "loss": 1.9183, "step": 3179 }, { "epoch": 0.23805513446746393, "grad_norm": 1.3433088064193726, "learning_rate": 6.022216798214269e-05, "loss": 2.14, "step": 3180 }, { "epoch": 0.23812999457264236, "grad_norm": 1.4010835886001587, "learning_rate": 6.016487249739349e-05, "loss": 2.3197, "step": 3181 }, { "epoch": 0.23820485467782082, "grad_norm": 0.9978559017181396, "learning_rate": 6.010759255189137e-05, "loss": 1.7078, "step": 3182 }, { "epoch": 0.23827971478299928, "grad_norm": 1.2770293951034546, "learning_rate": 6.0050328167980606e-05, "loss": 2.1277, "step": 3183 }, { "epoch": 0.2383545748881777, "grad_norm": 1.2412047386169434, "learning_rate": 5.999307936799943e-05, "loss": 2.1017, "step": 3184 }, { "epoch": 0.23842943499335617, "grad_norm": 1.1361815929412842, "learning_rate": 5.993584617428e-05, "loss": 1.6914, "step": 3185 }, { "epoch": 0.2385042950985346, "grad_norm": 1.151660442352295, "learning_rate": 5.98786286091483e-05, "loss": 2.012, "step": 3186 }, { "epoch": 0.23857915520371306, "grad_norm": 1.20125150680542, "learning_rate": 5.982142669492433e-05, "loss": 1.8672, "step": 3187 }, { "epoch": 0.23865401530889152, "grad_norm": 1.2033027410507202, "learning_rate": 5.9764240453921896e-05, "loss": 2.4233, "step": 3188 }, { "epoch": 0.23872887541406995, "grad_norm": 1.5129797458648682, "learning_rate": 5.970706990844876e-05, "loss": 2.175, "step": 3189 }, { "epoch": 0.2388037355192484, "grad_norm": 1.2048046588897705, "learning_rate": 5.9649915080806476e-05, "loss": 1.5601, "step": 3190 }, { "epoch": 0.23887859562442684, "grad_norm": 1.2213735580444336, "learning_rate": 5.959277599329055e-05, "loss": 1.9609, "step": 3191 }, { "epoch": 0.2389534557296053, "grad_norm": 1.2295429706573486, "learning_rate": 5.953565266819029e-05, "loss": 1.6943, "step": 3192 }, { "epoch": 0.23902831583478376, "grad_norm": 1.346901774406433, "learning_rate": 5.947854512778892e-05, "loss": 2.0848, "step": 3193 }, { "epoch": 0.2391031759399622, "grad_norm": 1.1858491897583008, "learning_rate": 5.9421453394363414e-05, "loss": 1.7424, "step": 3194 }, { "epoch": 0.23917803604514065, "grad_norm": 1.2487839460372925, "learning_rate": 5.936437749018466e-05, "loss": 2.0165, "step": 3195 }, { "epoch": 0.23925289615031908, "grad_norm": 1.2125927209854126, "learning_rate": 5.930731743751736e-05, "loss": 1.4654, "step": 3196 }, { "epoch": 0.23932775625549754, "grad_norm": 1.225906491279602, "learning_rate": 5.925027325862004e-05, "loss": 1.3415, "step": 3197 }, { "epoch": 0.23940261636067597, "grad_norm": 1.1777185201644897, "learning_rate": 5.919324497574491e-05, "loss": 2.1088, "step": 3198 }, { "epoch": 0.23947747646585443, "grad_norm": 1.4698373079299927, "learning_rate": 5.913623261113817e-05, "loss": 2.0946, "step": 3199 }, { "epoch": 0.2395523365710329, "grad_norm": 1.25827157497406, "learning_rate": 5.9079236187039654e-05, "loss": 2.2298, "step": 3200 }, { "epoch": 0.23962719667621132, "grad_norm": 1.161327600479126, "learning_rate": 5.902225572568307e-05, "loss": 2.2419, "step": 3201 }, { "epoch": 0.23970205678138978, "grad_norm": 1.3536425828933716, "learning_rate": 5.8965291249295916e-05, "loss": 2.0606, "step": 3202 }, { "epoch": 0.23977691688656821, "grad_norm": 1.4027286767959595, "learning_rate": 5.890834278009935e-05, "loss": 2.4045, "step": 3203 }, { "epoch": 0.23985177699174667, "grad_norm": 1.3372899293899536, "learning_rate": 5.8851410340308386e-05, "loss": 1.7935, "step": 3204 }, { "epoch": 0.23992663709692513, "grad_norm": 1.3869376182556152, "learning_rate": 5.879449395213175e-05, "loss": 1.7269, "step": 3205 }, { "epoch": 0.24000149720210356, "grad_norm": 1.3523284196853638, "learning_rate": 5.8737593637771824e-05, "loss": 1.8396, "step": 3206 }, { "epoch": 0.24007635730728202, "grad_norm": 1.185771107673645, "learning_rate": 5.868070941942487e-05, "loss": 2.1854, "step": 3207 }, { "epoch": 0.24015121741246045, "grad_norm": 1.1263337135314941, "learning_rate": 5.86238413192808e-05, "loss": 1.9622, "step": 3208 }, { "epoch": 0.24022607751763891, "grad_norm": 1.1725854873657227, "learning_rate": 5.85669893595232e-05, "loss": 2.0812, "step": 3209 }, { "epoch": 0.24030093762281737, "grad_norm": 1.177746057510376, "learning_rate": 5.851015356232944e-05, "loss": 1.9386, "step": 3210 }, { "epoch": 0.2403757977279958, "grad_norm": 1.1677191257476807, "learning_rate": 5.8453333949870494e-05, "loss": 2.1217, "step": 3211 }, { "epoch": 0.24045065783317426, "grad_norm": 1.276090383529663, "learning_rate": 5.839653054431109e-05, "loss": 1.8314, "step": 3212 }, { "epoch": 0.2405255179383527, "grad_norm": 1.3698394298553467, "learning_rate": 5.8339743367809676e-05, "loss": 2.2576, "step": 3213 }, { "epoch": 0.24060037804353115, "grad_norm": 1.3873666524887085, "learning_rate": 5.82829724425182e-05, "loss": 2.4111, "step": 3214 }, { "epoch": 0.24067523814870959, "grad_norm": 1.3023219108581543, "learning_rate": 5.8226217790582485e-05, "loss": 2.3092, "step": 3215 }, { "epoch": 0.24075009825388805, "grad_norm": 1.4034538269042969, "learning_rate": 5.816947943414179e-05, "loss": 1.8441, "step": 3216 }, { "epoch": 0.2408249583590665, "grad_norm": 1.2940821647644043, "learning_rate": 5.8112757395329175e-05, "loss": 1.5239, "step": 3217 }, { "epoch": 0.24089981846424494, "grad_norm": 1.4161341190338135, "learning_rate": 5.805605169627131e-05, "loss": 1.8614, "step": 3218 }, { "epoch": 0.2409746785694234, "grad_norm": 1.3709160089492798, "learning_rate": 5.799936235908845e-05, "loss": 2.2582, "step": 3219 }, { "epoch": 0.24104953867460183, "grad_norm": 1.3754370212554932, "learning_rate": 5.7942689405894516e-05, "loss": 2.4539, "step": 3220 }, { "epoch": 0.24112439877978029, "grad_norm": 1.1568009853363037, "learning_rate": 5.7886032858796976e-05, "loss": 1.7033, "step": 3221 }, { "epoch": 0.24119925888495874, "grad_norm": 1.2387012243270874, "learning_rate": 5.782939273989689e-05, "loss": 1.7968, "step": 3222 }, { "epoch": 0.24127411899013718, "grad_norm": 1.355279564857483, "learning_rate": 5.777276907128899e-05, "loss": 1.9671, "step": 3223 }, { "epoch": 0.24134897909531564, "grad_norm": 1.4519834518432617, "learning_rate": 5.771616187506155e-05, "loss": 2.3679, "step": 3224 }, { "epoch": 0.24142383920049407, "grad_norm": 1.2032239437103271, "learning_rate": 5.76595711732964e-05, "loss": 1.7297, "step": 3225 }, { "epoch": 0.24149869930567253, "grad_norm": 1.3105597496032715, "learning_rate": 5.7602996988069015e-05, "loss": 2.1737, "step": 3226 }, { "epoch": 0.24157355941085099, "grad_norm": 1.274610161781311, "learning_rate": 5.754643934144828e-05, "loss": 1.6164, "step": 3227 }, { "epoch": 0.24164841951602942, "grad_norm": 1.239538311958313, "learning_rate": 5.7489898255496736e-05, "loss": 2.2211, "step": 3228 }, { "epoch": 0.24172327962120788, "grad_norm": 1.083094596862793, "learning_rate": 5.743337375227047e-05, "loss": 2.2315, "step": 3229 }, { "epoch": 0.2417981397263863, "grad_norm": 1.2495890855789185, "learning_rate": 5.73768658538191e-05, "loss": 2.0532, "step": 3230 }, { "epoch": 0.24187299983156477, "grad_norm": 1.2145074605941772, "learning_rate": 5.732037458218564e-05, "loss": 1.5884, "step": 3231 }, { "epoch": 0.2419478599367432, "grad_norm": 1.172079086303711, "learning_rate": 5.726389995940682e-05, "loss": 1.853, "step": 3232 }, { "epoch": 0.24202272004192166, "grad_norm": 1.399514079093933, "learning_rate": 5.720744200751269e-05, "loss": 2.3617, "step": 3233 }, { "epoch": 0.24209758014710012, "grad_norm": 1.1066193580627441, "learning_rate": 5.715100074852691e-05, "loss": 1.6022, "step": 3234 }, { "epoch": 0.24217244025227855, "grad_norm": 1.4184784889221191, "learning_rate": 5.7094576204466586e-05, "loss": 2.3546, "step": 3235 }, { "epoch": 0.242247300357457, "grad_norm": 1.483188271522522, "learning_rate": 5.7038168397342354e-05, "loss": 2.5138, "step": 3236 }, { "epoch": 0.24232216046263544, "grad_norm": 1.5037165880203247, "learning_rate": 5.6981777349158285e-05, "loss": 2.3374, "step": 3237 }, { "epoch": 0.2423970205678139, "grad_norm": 1.3214603662490845, "learning_rate": 5.6925403081911884e-05, "loss": 1.5422, "step": 3238 }, { "epoch": 0.24247188067299236, "grad_norm": 1.0573269128799438, "learning_rate": 5.6869045617594094e-05, "loss": 1.3795, "step": 3239 }, { "epoch": 0.2425467407781708, "grad_norm": 1.4848601818084717, "learning_rate": 5.681270497818939e-05, "loss": 2.4653, "step": 3240 }, { "epoch": 0.24262160088334925, "grad_norm": 1.4296783208847046, "learning_rate": 5.675638118567563e-05, "loss": 2.2775, "step": 3241 }, { "epoch": 0.24269646098852768, "grad_norm": 1.233920931816101, "learning_rate": 5.6700074262024104e-05, "loss": 1.942, "step": 3242 }, { "epoch": 0.24277132109370614, "grad_norm": 1.313254714012146, "learning_rate": 5.6643784229199585e-05, "loss": 2.1244, "step": 3243 }, { "epoch": 0.2428461811988846, "grad_norm": 1.190923810005188, "learning_rate": 5.658751110916011e-05, "loss": 1.7211, "step": 3244 }, { "epoch": 0.24292104130406303, "grad_norm": 1.4920499324798584, "learning_rate": 5.6531254923857236e-05, "loss": 2.3635, "step": 3245 }, { "epoch": 0.2429959014092415, "grad_norm": 1.336164116859436, "learning_rate": 5.647501569523595e-05, "loss": 2.0721, "step": 3246 }, { "epoch": 0.24307076151441992, "grad_norm": 1.2150614261627197, "learning_rate": 5.641879344523444e-05, "loss": 2.2728, "step": 3247 }, { "epoch": 0.24314562161959838, "grad_norm": 1.410369873046875, "learning_rate": 5.636258819578446e-05, "loss": 1.9641, "step": 3248 }, { "epoch": 0.2432204817247768, "grad_norm": 1.4360374212265015, "learning_rate": 5.6306399968811095e-05, "loss": 1.8446, "step": 3249 }, { "epoch": 0.24329534182995527, "grad_norm": 1.2789438962936401, "learning_rate": 5.625022878623268e-05, "loss": 2.211, "step": 3250 }, { "epoch": 0.24337020193513373, "grad_norm": 1.311806321144104, "learning_rate": 5.619407466996103e-05, "loss": 2.2302, "step": 3251 }, { "epoch": 0.24344506204031216, "grad_norm": 1.395463228225708, "learning_rate": 5.6137937641901216e-05, "loss": 2.2959, "step": 3252 }, { "epoch": 0.24351992214549062, "grad_norm": 1.2403547763824463, "learning_rate": 5.608181772395171e-05, "loss": 1.958, "step": 3253 }, { "epoch": 0.24359478225066905, "grad_norm": 1.1049106121063232, "learning_rate": 5.6025714938004306e-05, "loss": 1.6365, "step": 3254 }, { "epoch": 0.2436696423558475, "grad_norm": 1.3190135955810547, "learning_rate": 5.5969629305944004e-05, "loss": 1.737, "step": 3255 }, { "epoch": 0.24374450246102597, "grad_norm": 1.0758733749389648, "learning_rate": 5.591356084964927e-05, "loss": 1.8905, "step": 3256 }, { "epoch": 0.2438193625662044, "grad_norm": 1.3572602272033691, "learning_rate": 5.585750959099174e-05, "loss": 2.8672, "step": 3257 }, { "epoch": 0.24389422267138286, "grad_norm": 1.2867034673690796, "learning_rate": 5.580147555183641e-05, "loss": 2.4857, "step": 3258 }, { "epoch": 0.2439690827765613, "grad_norm": 1.5425955057144165, "learning_rate": 5.574545875404156e-05, "loss": 1.9494, "step": 3259 }, { "epoch": 0.24404394288173975, "grad_norm": 1.1354247331619263, "learning_rate": 5.568945921945872e-05, "loss": 2.2396, "step": 3260 }, { "epoch": 0.2441188029869182, "grad_norm": 1.2108561992645264, "learning_rate": 5.563347696993275e-05, "loss": 2.0644, "step": 3261 }, { "epoch": 0.24419366309209664, "grad_norm": 1.2035534381866455, "learning_rate": 5.557751202730165e-05, "loss": 2.3992, "step": 3262 }, { "epoch": 0.2442685231972751, "grad_norm": 1.220754623413086, "learning_rate": 5.552156441339671e-05, "loss": 2.1592, "step": 3263 }, { "epoch": 0.24434338330245353, "grad_norm": 1.2377091646194458, "learning_rate": 5.5465634150042514e-05, "loss": 1.8135, "step": 3264 }, { "epoch": 0.244418243407632, "grad_norm": 1.1918827295303345, "learning_rate": 5.540972125905684e-05, "loss": 2.0068, "step": 3265 }, { "epoch": 0.24449310351281042, "grad_norm": 1.1512640714645386, "learning_rate": 5.53538257622507e-05, "loss": 1.8591, "step": 3266 }, { "epoch": 0.24456796361798888, "grad_norm": 1.3095085620880127, "learning_rate": 5.529794768142838e-05, "loss": 2.1315, "step": 3267 }, { "epoch": 0.24464282372316734, "grad_norm": 1.2311172485351562, "learning_rate": 5.524208703838718e-05, "loss": 1.7704, "step": 3268 }, { "epoch": 0.24471768382834577, "grad_norm": 1.1050364971160889, "learning_rate": 5.5186243854917795e-05, "loss": 2.6094, "step": 3269 }, { "epoch": 0.24479254393352423, "grad_norm": 1.0761051177978516, "learning_rate": 5.5130418152804065e-05, "loss": 1.4611, "step": 3270 }, { "epoch": 0.24486740403870266, "grad_norm": 1.2498531341552734, "learning_rate": 5.507460995382293e-05, "loss": 2.252, "step": 3271 }, { "epoch": 0.24494226414388112, "grad_norm": 1.1842716932296753, "learning_rate": 5.501881927974457e-05, "loss": 2.0672, "step": 3272 }, { "epoch": 0.24501712424905958, "grad_norm": 1.2399311065673828, "learning_rate": 5.496304615233238e-05, "loss": 1.8612, "step": 3273 }, { "epoch": 0.245091984354238, "grad_norm": 1.1359643936157227, "learning_rate": 5.490729059334274e-05, "loss": 1.8269, "step": 3274 }, { "epoch": 0.24516684445941647, "grad_norm": 1.1600710153579712, "learning_rate": 5.485155262452535e-05, "loss": 1.5048, "step": 3275 }, { "epoch": 0.2452417045645949, "grad_norm": 1.2155944108963013, "learning_rate": 5.479583226762296e-05, "loss": 2.0597, "step": 3276 }, { "epoch": 0.24531656466977336, "grad_norm": 1.0865952968597412, "learning_rate": 5.474012954437149e-05, "loss": 1.8919, "step": 3277 }, { "epoch": 0.24539142477495182, "grad_norm": 1.1913416385650635, "learning_rate": 5.468444447649997e-05, "loss": 1.7318, "step": 3278 }, { "epoch": 0.24546628488013025, "grad_norm": 1.2359098196029663, "learning_rate": 5.4628777085730534e-05, "loss": 2.0497, "step": 3279 }, { "epoch": 0.2455411449853087, "grad_norm": 1.2607675790786743, "learning_rate": 5.4573127393778355e-05, "loss": 2.0007, "step": 3280 }, { "epoch": 0.24561600509048714, "grad_norm": 1.0366878509521484, "learning_rate": 5.4517495422351824e-05, "loss": 2.1748, "step": 3281 }, { "epoch": 0.2456908651956656, "grad_norm": 1.3247227668762207, "learning_rate": 5.446188119315236e-05, "loss": 2.3364, "step": 3282 }, { "epoch": 0.24576572530084403, "grad_norm": 1.1922886371612549, "learning_rate": 5.440628472787447e-05, "loss": 1.664, "step": 3283 }, { "epoch": 0.2458405854060225, "grad_norm": 1.1495803594589233, "learning_rate": 5.4350706048205754e-05, "loss": 2.2282, "step": 3284 }, { "epoch": 0.24591544551120095, "grad_norm": 1.2374387979507446, "learning_rate": 5.4295145175826765e-05, "loss": 2.2236, "step": 3285 }, { "epoch": 0.24599030561637938, "grad_norm": 1.2635934352874756, "learning_rate": 5.423960213241124e-05, "loss": 1.8442, "step": 3286 }, { "epoch": 0.24606516572155784, "grad_norm": 1.2699580192565918, "learning_rate": 5.4184076939625964e-05, "loss": 2.4461, "step": 3287 }, { "epoch": 0.24614002582673628, "grad_norm": 1.332206130027771, "learning_rate": 5.41285696191306e-05, "loss": 2.1291, "step": 3288 }, { "epoch": 0.24621488593191473, "grad_norm": 1.4730064868927002, "learning_rate": 5.4073080192578e-05, "loss": 2.1927, "step": 3289 }, { "epoch": 0.2462897460370932, "grad_norm": 1.3042746782302856, "learning_rate": 5.401760868161403e-05, "loss": 1.5735, "step": 3290 }, { "epoch": 0.24636460614227162, "grad_norm": 1.4032536745071411, "learning_rate": 5.396215510787741e-05, "loss": 2.1204, "step": 3291 }, { "epoch": 0.24643946624745008, "grad_norm": 1.3084875345230103, "learning_rate": 5.3906719493000045e-05, "loss": 2.0752, "step": 3292 }, { "epoch": 0.24651432635262852, "grad_norm": 1.2919384241104126, "learning_rate": 5.385130185860675e-05, "loss": 1.516, "step": 3293 }, { "epoch": 0.24658918645780697, "grad_norm": 1.356183409690857, "learning_rate": 5.379590222631534e-05, "loss": 1.7518, "step": 3294 }, { "epoch": 0.24666404656298543, "grad_norm": 2.3826329708099365, "learning_rate": 5.374052061773662e-05, "loss": 1.7016, "step": 3295 }, { "epoch": 0.24673890666816387, "grad_norm": 1.4093071222305298, "learning_rate": 5.368515705447436e-05, "loss": 1.8434, "step": 3296 }, { "epoch": 0.24681376677334232, "grad_norm": 1.6158283948898315, "learning_rate": 5.3629811558125184e-05, "loss": 2.1243, "step": 3297 }, { "epoch": 0.24688862687852076, "grad_norm": 1.1998611688613892, "learning_rate": 5.3574484150278834e-05, "loss": 1.928, "step": 3298 }, { "epoch": 0.24696348698369922, "grad_norm": 1.567852258682251, "learning_rate": 5.351917485251794e-05, "loss": 2.5858, "step": 3299 }, { "epoch": 0.24703834708887765, "grad_norm": 1.4819259643554688, "learning_rate": 5.346388368641801e-05, "loss": 2.5008, "step": 3300 }, { "epoch": 0.24703834708887765, "eval_loss": 2.0030405521392822, "eval_runtime": 178.9213, "eval_samples_per_second": 27.945, "eval_steps_per_second": 13.973, "step": 3300 }, { "epoch": 0.2471132071940561, "grad_norm": 1.1435844898223877, "learning_rate": 5.340861067354761e-05, "loss": 2.0656, "step": 3301 }, { "epoch": 0.24718806729923457, "grad_norm": 1.1702338457107544, "learning_rate": 5.3353355835468034e-05, "loss": 1.731, "step": 3302 }, { "epoch": 0.247262927404413, "grad_norm": 1.3106426000595093, "learning_rate": 5.329811919373366e-05, "loss": 2.3256, "step": 3303 }, { "epoch": 0.24733778750959146, "grad_norm": 1.282607913017273, "learning_rate": 5.324290076989162e-05, "loss": 1.8973, "step": 3304 }, { "epoch": 0.2474126476147699, "grad_norm": 1.3083600997924805, "learning_rate": 5.318770058548208e-05, "loss": 2.2379, "step": 3305 }, { "epoch": 0.24748750771994835, "grad_norm": 1.1797435283660889, "learning_rate": 5.313251866203801e-05, "loss": 1.6054, "step": 3306 }, { "epoch": 0.2475623678251268, "grad_norm": 1.2190746068954468, "learning_rate": 5.307735502108532e-05, "loss": 1.7001, "step": 3307 }, { "epoch": 0.24763722793030524, "grad_norm": 1.4420807361602783, "learning_rate": 5.302220968414264e-05, "loss": 2.3588, "step": 3308 }, { "epoch": 0.2477120880354837, "grad_norm": 1.4293659925460815, "learning_rate": 5.296708267272164e-05, "loss": 2.3292, "step": 3309 }, { "epoch": 0.24778694814066213, "grad_norm": 1.4928079843521118, "learning_rate": 5.2911974008326733e-05, "loss": 2.2411, "step": 3310 }, { "epoch": 0.2478618082458406, "grad_norm": 1.224805474281311, "learning_rate": 5.285688371245526e-05, "loss": 2.1616, "step": 3311 }, { "epoch": 0.24793666835101905, "grad_norm": 1.3057329654693604, "learning_rate": 5.2801811806597265e-05, "loss": 2.4613, "step": 3312 }, { "epoch": 0.24801152845619748, "grad_norm": 1.8031359910964966, "learning_rate": 5.2746758312235765e-05, "loss": 2.202, "step": 3313 }, { "epoch": 0.24808638856137594, "grad_norm": 1.4099674224853516, "learning_rate": 5.269172325084645e-05, "loss": 1.7747, "step": 3314 }, { "epoch": 0.24816124866655437, "grad_norm": 1.3078298568725586, "learning_rate": 5.2636706643897926e-05, "loss": 1.7685, "step": 3315 }, { "epoch": 0.24823610877173283, "grad_norm": 1.8797658681869507, "learning_rate": 5.258170851285159e-05, "loss": 2.4196, "step": 3316 }, { "epoch": 0.24831096887691126, "grad_norm": 1.3224971294403076, "learning_rate": 5.252672887916159e-05, "loss": 2.3173, "step": 3317 }, { "epoch": 0.24838582898208972, "grad_norm": 1.2483420372009277, "learning_rate": 5.247176776427494e-05, "loss": 1.9658, "step": 3318 }, { "epoch": 0.24846068908726818, "grad_norm": 1.4740668535232544, "learning_rate": 5.2416825189631315e-05, "loss": 1.8271, "step": 3319 }, { "epoch": 0.2485355491924466, "grad_norm": 1.2113944292068481, "learning_rate": 5.2361901176663155e-05, "loss": 1.9462, "step": 3320 }, { "epoch": 0.24861040929762507, "grad_norm": 1.5274577140808105, "learning_rate": 5.2306995746795785e-05, "loss": 2.0116, "step": 3321 }, { "epoch": 0.2486852694028035, "grad_norm": 1.275278091430664, "learning_rate": 5.225210892144721e-05, "loss": 1.8497, "step": 3322 }, { "epoch": 0.24876012950798196, "grad_norm": 1.1307909488677979, "learning_rate": 5.219724072202816e-05, "loss": 1.9635, "step": 3323 }, { "epoch": 0.24883498961316042, "grad_norm": 1.340247392654419, "learning_rate": 5.2142391169942185e-05, "loss": 2.0044, "step": 3324 }, { "epoch": 0.24890984971833885, "grad_norm": 1.1432641744613647, "learning_rate": 5.208756028658539e-05, "loss": 1.4339, "step": 3325 }, { "epoch": 0.2489847098235173, "grad_norm": 1.2371799945831299, "learning_rate": 5.203274809334675e-05, "loss": 2.1992, "step": 3326 }, { "epoch": 0.24905956992869574, "grad_norm": 1.1941542625427246, "learning_rate": 5.197795461160795e-05, "loss": 1.9612, "step": 3327 }, { "epoch": 0.2491344300338742, "grad_norm": 1.1484013795852661, "learning_rate": 5.192317986274324e-05, "loss": 1.7502, "step": 3328 }, { "epoch": 0.24920929013905266, "grad_norm": 1.2903733253479004, "learning_rate": 5.18684238681197e-05, "loss": 1.7247, "step": 3329 }, { "epoch": 0.2492841502442311, "grad_norm": 1.3514255285263062, "learning_rate": 5.181368664909706e-05, "loss": 2.2673, "step": 3330 }, { "epoch": 0.24935901034940955, "grad_norm": 1.321053385734558, "learning_rate": 5.175896822702766e-05, "loss": 1.8607, "step": 3331 }, { "epoch": 0.24943387045458798, "grad_norm": 1.3131487369537354, "learning_rate": 5.170426862325657e-05, "loss": 2.2466, "step": 3332 }, { "epoch": 0.24950873055976644, "grad_norm": 1.1229381561279297, "learning_rate": 5.1649587859121516e-05, "loss": 1.9085, "step": 3333 }, { "epoch": 0.24958359066494487, "grad_norm": 1.1773191690444946, "learning_rate": 5.159492595595288e-05, "loss": 2.1688, "step": 3334 }, { "epoch": 0.24965845077012333, "grad_norm": 1.3850032091140747, "learning_rate": 5.154028293507369e-05, "loss": 1.8171, "step": 3335 }, { "epoch": 0.2497333108753018, "grad_norm": 1.411083698272705, "learning_rate": 5.148565881779956e-05, "loss": 1.9846, "step": 3336 }, { "epoch": 0.24980817098048022, "grad_norm": 1.3648829460144043, "learning_rate": 5.143105362543873e-05, "loss": 1.9441, "step": 3337 }, { "epoch": 0.24988303108565868, "grad_norm": 1.1158488988876343, "learning_rate": 5.137646737929211e-05, "loss": 1.6618, "step": 3338 }, { "epoch": 0.2499578911908371, "grad_norm": 1.26804518699646, "learning_rate": 5.132190010065321e-05, "loss": 2.1431, "step": 3339 }, { "epoch": 0.2500327512960156, "grad_norm": 1.2445231676101685, "learning_rate": 5.126735181080813e-05, "loss": 1.8072, "step": 3340 }, { "epoch": 0.25010761140119403, "grad_norm": 1.283729910850525, "learning_rate": 5.1212822531035575e-05, "loss": 1.8181, "step": 3341 }, { "epoch": 0.25018247150637246, "grad_norm": 1.2643470764160156, "learning_rate": 5.115831228260677e-05, "loss": 1.6122, "step": 3342 }, { "epoch": 0.2502573316115509, "grad_norm": 1.153548002243042, "learning_rate": 5.110382108678564e-05, "loss": 1.8361, "step": 3343 }, { "epoch": 0.2503321917167294, "grad_norm": 1.3523515462875366, "learning_rate": 5.1049348964828515e-05, "loss": 2.0065, "step": 3344 }, { "epoch": 0.2504070518219078, "grad_norm": 1.2420202493667603, "learning_rate": 5.099489593798441e-05, "loss": 1.4332, "step": 3345 }, { "epoch": 0.25048191192708624, "grad_norm": 1.312312126159668, "learning_rate": 5.094046202749487e-05, "loss": 2.1197, "step": 3346 }, { "epoch": 0.25055677203226473, "grad_norm": 1.1544833183288574, "learning_rate": 5.088604725459398e-05, "loss": 2.2345, "step": 3347 }, { "epoch": 0.25063163213744316, "grad_norm": 1.1927703619003296, "learning_rate": 5.083165164050831e-05, "loss": 1.8904, "step": 3348 }, { "epoch": 0.2507064922426216, "grad_norm": 1.2268503904342651, "learning_rate": 5.077727520645701e-05, "loss": 1.7137, "step": 3349 }, { "epoch": 0.2507813523478, "grad_norm": 1.0494173765182495, "learning_rate": 5.0722917973651717e-05, "loss": 1.5949, "step": 3350 }, { "epoch": 0.2508562124529785, "grad_norm": 1.142045259475708, "learning_rate": 5.0668579963296616e-05, "loss": 2.1651, "step": 3351 }, { "epoch": 0.25093107255815694, "grad_norm": 1.3284367322921753, "learning_rate": 5.06142611965884e-05, "loss": 1.9854, "step": 3352 }, { "epoch": 0.2510059326633354, "grad_norm": 1.4666593074798584, "learning_rate": 5.055996169471614e-05, "loss": 2.2777, "step": 3353 }, { "epoch": 0.25108079276851386, "grad_norm": 1.2252858877182007, "learning_rate": 5.050568147886157e-05, "loss": 2.1458, "step": 3354 }, { "epoch": 0.2511556528736923, "grad_norm": 1.5595791339874268, "learning_rate": 5.045142057019872e-05, "loss": 2.2647, "step": 3355 }, { "epoch": 0.2512305129788707, "grad_norm": 1.1637955904006958, "learning_rate": 5.03971789898942e-05, "loss": 1.7586, "step": 3356 }, { "epoch": 0.2513053730840492, "grad_norm": 1.298531174659729, "learning_rate": 5.0342956759107094e-05, "loss": 1.6318, "step": 3357 }, { "epoch": 0.25138023318922764, "grad_norm": 1.1305813789367676, "learning_rate": 5.028875389898888e-05, "loss": 2.0528, "step": 3358 }, { "epoch": 0.2514550932944061, "grad_norm": 1.2717630863189697, "learning_rate": 5.023457043068354e-05, "loss": 2.3072, "step": 3359 }, { "epoch": 0.2515299533995845, "grad_norm": 1.454604148864746, "learning_rate": 5.01804063753274e-05, "loss": 1.839, "step": 3360 }, { "epoch": 0.251604813504763, "grad_norm": 1.3892890214920044, "learning_rate": 5.012626175404925e-05, "loss": 2.3111, "step": 3361 }, { "epoch": 0.2516796736099414, "grad_norm": 1.3928793668746948, "learning_rate": 5.007213658797034e-05, "loss": 2.2392, "step": 3362 }, { "epoch": 0.25175453371511985, "grad_norm": 1.601630687713623, "learning_rate": 5.0018030898204305e-05, "loss": 2.6528, "step": 3363 }, { "epoch": 0.25182939382029834, "grad_norm": 1.2111974954605103, "learning_rate": 4.996394470585718e-05, "loss": 2.0986, "step": 3364 }, { "epoch": 0.2519042539254768, "grad_norm": 1.254831075668335, "learning_rate": 4.990987803202745e-05, "loss": 2.2256, "step": 3365 }, { "epoch": 0.2519791140306552, "grad_norm": 1.255791187286377, "learning_rate": 4.985583089780583e-05, "loss": 1.7466, "step": 3366 }, { "epoch": 0.25205397413583364, "grad_norm": 1.0323251485824585, "learning_rate": 4.9801803324275576e-05, "loss": 1.5032, "step": 3367 }, { "epoch": 0.2521288342410121, "grad_norm": 1.210353136062622, "learning_rate": 4.974779533251228e-05, "loss": 2.1499, "step": 3368 }, { "epoch": 0.25220369434619055, "grad_norm": 1.3347010612487793, "learning_rate": 4.9693806943583785e-05, "loss": 2.1496, "step": 3369 }, { "epoch": 0.252278554451369, "grad_norm": 1.5291328430175781, "learning_rate": 4.9639838178550435e-05, "loss": 2.2494, "step": 3370 }, { "epoch": 0.2523534145565475, "grad_norm": 1.2639496326446533, "learning_rate": 4.9585889058464876e-05, "loss": 1.8297, "step": 3371 }, { "epoch": 0.2524282746617259, "grad_norm": 1.2731884717941284, "learning_rate": 4.953195960437198e-05, "loss": 2.1681, "step": 3372 }, { "epoch": 0.25250313476690434, "grad_norm": 1.4544697999954224, "learning_rate": 4.9478049837309094e-05, "loss": 2.104, "step": 3373 }, { "epoch": 0.2525779948720828, "grad_norm": 1.0339953899383545, "learning_rate": 4.9424159778305813e-05, "loss": 1.4914, "step": 3374 }, { "epoch": 0.25265285497726125, "grad_norm": 1.2894554138183594, "learning_rate": 4.9370289448384074e-05, "loss": 1.908, "step": 3375 }, { "epoch": 0.2527277150824397, "grad_norm": 1.2550723552703857, "learning_rate": 4.931643886855813e-05, "loss": 1.8524, "step": 3376 }, { "epoch": 0.2528025751876181, "grad_norm": 1.337499976158142, "learning_rate": 4.926260805983448e-05, "loss": 2.5007, "step": 3377 }, { "epoch": 0.2528774352927966, "grad_norm": 1.3133500814437866, "learning_rate": 4.9208797043211874e-05, "loss": 1.8863, "step": 3378 }, { "epoch": 0.25295229539797504, "grad_norm": 1.1739966869354248, "learning_rate": 4.915500583968145e-05, "loss": 1.6915, "step": 3379 }, { "epoch": 0.25302715550315347, "grad_norm": 1.182839274406433, "learning_rate": 4.9101234470226576e-05, "loss": 1.5375, "step": 3380 }, { "epoch": 0.25310201560833195, "grad_norm": 1.2018024921417236, "learning_rate": 4.9047482955822855e-05, "loss": 2.3022, "step": 3381 }, { "epoch": 0.2531768757135104, "grad_norm": 1.4958717823028564, "learning_rate": 4.8993751317438205e-05, "loss": 2.072, "step": 3382 }, { "epoch": 0.2532517358186888, "grad_norm": 1.1086511611938477, "learning_rate": 4.894003957603269e-05, "loss": 1.3508, "step": 3383 }, { "epoch": 0.25332659592386725, "grad_norm": 1.3213860988616943, "learning_rate": 4.8886347752558736e-05, "loss": 2.0383, "step": 3384 }, { "epoch": 0.25340145602904574, "grad_norm": 1.5444637537002563, "learning_rate": 4.8832675867960864e-05, "loss": 2.3861, "step": 3385 }, { "epoch": 0.25347631613422417, "grad_norm": 1.2708609104156494, "learning_rate": 4.877902394317593e-05, "loss": 2.4483, "step": 3386 }, { "epoch": 0.2535511762394026, "grad_norm": 1.3669521808624268, "learning_rate": 4.872539199913294e-05, "loss": 2.3225, "step": 3387 }, { "epoch": 0.2536260363445811, "grad_norm": 1.1963971853256226, "learning_rate": 4.867178005675319e-05, "loss": 1.8752, "step": 3388 }, { "epoch": 0.2537008964497595, "grad_norm": 1.3176287412643433, "learning_rate": 4.861818813695004e-05, "loss": 2.1936, "step": 3389 }, { "epoch": 0.25377575655493795, "grad_norm": 1.1512418985366821, "learning_rate": 4.856461626062913e-05, "loss": 1.6023, "step": 3390 }, { "epoch": 0.25385061666011643, "grad_norm": 1.2445197105407715, "learning_rate": 4.851106444868827e-05, "loss": 2.1404, "step": 3391 }, { "epoch": 0.25392547676529487, "grad_norm": 1.2478110790252686, "learning_rate": 4.845753272201747e-05, "loss": 1.9486, "step": 3392 }, { "epoch": 0.2540003368704733, "grad_norm": 1.1765928268432617, "learning_rate": 4.840402110149881e-05, "loss": 1.7576, "step": 3393 }, { "epoch": 0.25407519697565173, "grad_norm": 1.2770254611968994, "learning_rate": 4.835052960800664e-05, "loss": 1.9956, "step": 3394 }, { "epoch": 0.2541500570808302, "grad_norm": 1.5161778926849365, "learning_rate": 4.829705826240736e-05, "loss": 2.6491, "step": 3395 }, { "epoch": 0.25422491718600865, "grad_norm": 1.1939209699630737, "learning_rate": 4.824360708555956e-05, "loss": 1.9586, "step": 3396 }, { "epoch": 0.2542997772911871, "grad_norm": 1.3228507041931152, "learning_rate": 4.8190176098313975e-05, "loss": 1.5873, "step": 3397 }, { "epoch": 0.25437463739636557, "grad_norm": 1.2809228897094727, "learning_rate": 4.813676532151345e-05, "loss": 1.8345, "step": 3398 }, { "epoch": 0.254449497501544, "grad_norm": 1.4623078107833862, "learning_rate": 4.808337477599298e-05, "loss": 1.8301, "step": 3399 }, { "epoch": 0.25452435760672243, "grad_norm": 1.277687668800354, "learning_rate": 4.8030004482579604e-05, "loss": 1.9203, "step": 3400 }, { "epoch": 0.25459921771190086, "grad_norm": 1.3790115118026733, "learning_rate": 4.797665446209244e-05, "loss": 1.8658, "step": 3401 }, { "epoch": 0.25467407781707935, "grad_norm": 1.0526114702224731, "learning_rate": 4.792332473534278e-05, "loss": 1.7896, "step": 3402 }, { "epoch": 0.2547489379222578, "grad_norm": 1.1152446269989014, "learning_rate": 4.787001532313398e-05, "loss": 2.3894, "step": 3403 }, { "epoch": 0.2548237980274362, "grad_norm": 1.4244356155395508, "learning_rate": 4.781672624626144e-05, "loss": 2.103, "step": 3404 }, { "epoch": 0.2548986581326147, "grad_norm": 1.1895828247070312, "learning_rate": 4.776345752551271e-05, "loss": 1.7248, "step": 3405 }, { "epoch": 0.25497351823779313, "grad_norm": 1.0407320261001587, "learning_rate": 4.771020918166724e-05, "loss": 2.1542, "step": 3406 }, { "epoch": 0.25504837834297156, "grad_norm": 1.2808423042297363, "learning_rate": 4.765698123549667e-05, "loss": 2.1352, "step": 3407 }, { "epoch": 0.25512323844815005, "grad_norm": 1.4247184991836548, "learning_rate": 4.760377370776462e-05, "loss": 1.9425, "step": 3408 }, { "epoch": 0.2551980985533285, "grad_norm": 1.3678498268127441, "learning_rate": 4.755058661922683e-05, "loss": 2.4338, "step": 3409 }, { "epoch": 0.2552729586585069, "grad_norm": 1.3958884477615356, "learning_rate": 4.749741999063092e-05, "loss": 1.9177, "step": 3410 }, { "epoch": 0.25534781876368534, "grad_norm": 1.1777302026748657, "learning_rate": 4.7444273842716666e-05, "loss": 2.1632, "step": 3411 }, { "epoch": 0.25542267886886383, "grad_norm": 1.4515377283096313, "learning_rate": 4.739114819621574e-05, "loss": 2.4106, "step": 3412 }, { "epoch": 0.25549753897404226, "grad_norm": 1.6259158849716187, "learning_rate": 4.733804307185191e-05, "loss": 2.1838, "step": 3413 }, { "epoch": 0.2555723990792207, "grad_norm": 1.2710402011871338, "learning_rate": 4.728495849034091e-05, "loss": 1.9947, "step": 3414 }, { "epoch": 0.2556472591843992, "grad_norm": 1.4076367616653442, "learning_rate": 4.723189447239044e-05, "loss": 1.9812, "step": 3415 }, { "epoch": 0.2557221192895776, "grad_norm": 1.1075284481048584, "learning_rate": 4.717885103870023e-05, "loss": 1.874, "step": 3416 }, { "epoch": 0.25579697939475604, "grad_norm": 1.2900906801223755, "learning_rate": 4.712582820996192e-05, "loss": 2.146, "step": 3417 }, { "epoch": 0.2558718394999345, "grad_norm": 1.5141178369522095, "learning_rate": 4.707282600685908e-05, "loss": 2.0318, "step": 3418 }, { "epoch": 0.25594669960511296, "grad_norm": 1.3559762239456177, "learning_rate": 4.701984445006735e-05, "loss": 2.3686, "step": 3419 }, { "epoch": 0.2560215597102914, "grad_norm": 1.1005747318267822, "learning_rate": 4.6966883560254235e-05, "loss": 1.778, "step": 3420 }, { "epoch": 0.2560964198154698, "grad_norm": 1.4521865844726562, "learning_rate": 4.69139433580792e-05, "loss": 2.1905, "step": 3421 }, { "epoch": 0.2561712799206483, "grad_norm": 1.0705018043518066, "learning_rate": 4.6861023864193686e-05, "loss": 1.7124, "step": 3422 }, { "epoch": 0.25624614002582674, "grad_norm": 1.3510873317718506, "learning_rate": 4.680812509924092e-05, "loss": 2.4381, "step": 3423 }, { "epoch": 0.25632100013100517, "grad_norm": 1.2234299182891846, "learning_rate": 4.675524708385618e-05, "loss": 1.5418, "step": 3424 }, { "epoch": 0.25639586023618366, "grad_norm": 1.317345380783081, "learning_rate": 4.6702389838666625e-05, "loss": 1.9777, "step": 3425 }, { "epoch": 0.2564707203413621, "grad_norm": 1.3589493036270142, "learning_rate": 4.664955338429123e-05, "loss": 2.0219, "step": 3426 }, { "epoch": 0.2565455804465405, "grad_norm": 1.363197684288025, "learning_rate": 4.6596737741340944e-05, "loss": 1.9296, "step": 3427 }, { "epoch": 0.25662044055171895, "grad_norm": 1.3155146837234497, "learning_rate": 4.654394293041861e-05, "loss": 1.8904, "step": 3428 }, { "epoch": 0.25669530065689744, "grad_norm": 1.1432486772537231, "learning_rate": 4.6491168972118835e-05, "loss": 1.5058, "step": 3429 }, { "epoch": 0.25677016076207587, "grad_norm": 1.5388914346694946, "learning_rate": 4.6438415887028197e-05, "loss": 1.7361, "step": 3430 }, { "epoch": 0.2568450208672543, "grad_norm": 1.3936161994934082, "learning_rate": 4.63856836957251e-05, "loss": 1.8345, "step": 3431 }, { "epoch": 0.2569198809724328, "grad_norm": 1.1518778800964355, "learning_rate": 4.6332972418779786e-05, "loss": 1.7428, "step": 3432 }, { "epoch": 0.2569947410776112, "grad_norm": 1.1024096012115479, "learning_rate": 4.628028207675439e-05, "loss": 2.2212, "step": 3433 }, { "epoch": 0.25706960118278965, "grad_norm": 1.2036229372024536, "learning_rate": 4.6227612690202795e-05, "loss": 1.7938, "step": 3434 }, { "epoch": 0.2571444612879681, "grad_norm": 1.007718801498413, "learning_rate": 4.617496427967073e-05, "loss": 1.296, "step": 3435 }, { "epoch": 0.25721932139314657, "grad_norm": 1.1321418285369873, "learning_rate": 4.612233686569578e-05, "loss": 2.1913, "step": 3436 }, { "epoch": 0.257294181498325, "grad_norm": 1.565243124961853, "learning_rate": 4.606973046880734e-05, "loss": 2.3315, "step": 3437 }, { "epoch": 0.25736904160350343, "grad_norm": 1.51026451587677, "learning_rate": 4.601714510952657e-05, "loss": 2.3515, "step": 3438 }, { "epoch": 0.2574439017086819, "grad_norm": 1.02693510055542, "learning_rate": 4.5964580808366505e-05, "loss": 1.4434, "step": 3439 }, { "epoch": 0.25751876181386035, "grad_norm": 1.2880215644836426, "learning_rate": 4.591203758583181e-05, "loss": 1.9772, "step": 3440 }, { "epoch": 0.2575936219190388, "grad_norm": 1.4540033340454102, "learning_rate": 4.585951546241911e-05, "loss": 2.3012, "step": 3441 }, { "epoch": 0.25766848202421727, "grad_norm": 1.225653886795044, "learning_rate": 4.580701445861661e-05, "loss": 1.8116, "step": 3442 }, { "epoch": 0.2577433421293957, "grad_norm": 1.2877635955810547, "learning_rate": 4.575453459490445e-05, "loss": 2.027, "step": 3443 }, { "epoch": 0.25781820223457413, "grad_norm": 1.442991852760315, "learning_rate": 4.5702075891754426e-05, "loss": 2.0043, "step": 3444 }, { "epoch": 0.25789306233975257, "grad_norm": 1.4756706953048706, "learning_rate": 4.564963836963011e-05, "loss": 2.0392, "step": 3445 }, { "epoch": 0.25796792244493105, "grad_norm": 1.5145765542984009, "learning_rate": 4.559722204898684e-05, "loss": 1.8992, "step": 3446 }, { "epoch": 0.2580427825501095, "grad_norm": 1.7502492666244507, "learning_rate": 4.554482695027158e-05, "loss": 2.001, "step": 3447 }, { "epoch": 0.2581176426552879, "grad_norm": 1.45966374874115, "learning_rate": 4.549245309392312e-05, "loss": 2.0622, "step": 3448 }, { "epoch": 0.2581925027604664, "grad_norm": 1.1168608665466309, "learning_rate": 4.544010050037196e-05, "loss": 1.8139, "step": 3449 }, { "epoch": 0.25826736286564483, "grad_norm": 1.1136834621429443, "learning_rate": 4.5387769190040205e-05, "loss": 1.951, "step": 3450 }, { "epoch": 0.25826736286564483, "eval_loss": 1.9946517944335938, "eval_runtime": 178.8801, "eval_samples_per_second": 27.952, "eval_steps_per_second": 13.976, "step": 3450 }, { "epoch": 0.25834222297082327, "grad_norm": 1.3444892168045044, "learning_rate": 4.533545918334175e-05, "loss": 2.2774, "step": 3451 }, { "epoch": 0.2584170830760017, "grad_norm": 1.253953456878662, "learning_rate": 4.528317050068222e-05, "loss": 1.9823, "step": 3452 }, { "epoch": 0.2584919431811802, "grad_norm": 1.0317198038101196, "learning_rate": 4.523090316245876e-05, "loss": 1.9079, "step": 3453 }, { "epoch": 0.2585668032863586, "grad_norm": 1.1406723260879517, "learning_rate": 4.5178657189060326e-05, "loss": 1.6578, "step": 3454 }, { "epoch": 0.25864166339153705, "grad_norm": 1.280587911605835, "learning_rate": 4.512643260086751e-05, "loss": 2.1886, "step": 3455 }, { "epoch": 0.25871652349671553, "grad_norm": 1.380959153175354, "learning_rate": 4.5074229418252546e-05, "loss": 1.7294, "step": 3456 }, { "epoch": 0.25879138360189397, "grad_norm": 1.1507941484451294, "learning_rate": 4.502204766157936e-05, "loss": 2.3555, "step": 3457 }, { "epoch": 0.2588662437070724, "grad_norm": 1.5744465589523315, "learning_rate": 4.496988735120346e-05, "loss": 1.9649, "step": 3458 }, { "epoch": 0.2589411038122509, "grad_norm": 1.5127986669540405, "learning_rate": 4.491774850747194e-05, "loss": 2.258, "step": 3459 }, { "epoch": 0.2590159639174293, "grad_norm": 1.4164901971817017, "learning_rate": 4.4865631150723676e-05, "loss": 2.0221, "step": 3460 }, { "epoch": 0.25909082402260775, "grad_norm": 1.1150977611541748, "learning_rate": 4.4813535301289046e-05, "loss": 1.9149, "step": 3461 }, { "epoch": 0.2591656841277862, "grad_norm": 1.1708787679672241, "learning_rate": 4.4761460979490075e-05, "loss": 2.1167, "step": 3462 }, { "epoch": 0.25924054423296466, "grad_norm": 1.1806700229644775, "learning_rate": 4.4709408205640434e-05, "loss": 1.7259, "step": 3463 }, { "epoch": 0.2593154043381431, "grad_norm": 1.4895094633102417, "learning_rate": 4.4657377000045266e-05, "loss": 2.2662, "step": 3464 }, { "epoch": 0.2593902644433215, "grad_norm": 1.1883647441864014, "learning_rate": 4.46053673830014e-05, "loss": 1.776, "step": 3465 }, { "epoch": 0.2594651245485, "grad_norm": 1.2828632593154907, "learning_rate": 4.4553379374797255e-05, "loss": 1.9898, "step": 3466 }, { "epoch": 0.25953998465367845, "grad_norm": 1.1958879232406616, "learning_rate": 4.450141299571272e-05, "loss": 1.7328, "step": 3467 }, { "epoch": 0.2596148447588569, "grad_norm": 1.4935420751571655, "learning_rate": 4.4449468266019345e-05, "loss": 2.4168, "step": 3468 }, { "epoch": 0.2596897048640353, "grad_norm": 1.2173084020614624, "learning_rate": 4.439754520598023e-05, "loss": 2.0005, "step": 3469 }, { "epoch": 0.2597645649692138, "grad_norm": 1.6210618019104004, "learning_rate": 4.4345643835849926e-05, "loss": 1.8899, "step": 3470 }, { "epoch": 0.2598394250743922, "grad_norm": 1.2748149633407593, "learning_rate": 4.429376417587462e-05, "loss": 2.4442, "step": 3471 }, { "epoch": 0.25991428517957066, "grad_norm": 1.1839276552200317, "learning_rate": 4.424190624629201e-05, "loss": 2.0715, "step": 3472 }, { "epoch": 0.25998914528474915, "grad_norm": 1.348148226737976, "learning_rate": 4.419007006733129e-05, "loss": 1.7917, "step": 3473 }, { "epoch": 0.2600640053899276, "grad_norm": 1.303303599357605, "learning_rate": 4.4138255659213245e-05, "loss": 2.0473, "step": 3474 }, { "epoch": 0.260138865495106, "grad_norm": 1.370660662651062, "learning_rate": 4.408646304215005e-05, "loss": 2.3392, "step": 3475 }, { "epoch": 0.2602137256002845, "grad_norm": 1.1959431171417236, "learning_rate": 4.4034692236345406e-05, "loss": 2.013, "step": 3476 }, { "epoch": 0.2602885857054629, "grad_norm": 1.4204901456832886, "learning_rate": 4.398294326199459e-05, "loss": 2.0688, "step": 3477 }, { "epoch": 0.26036344581064136, "grad_norm": 1.4799500703811646, "learning_rate": 4.393121613928429e-05, "loss": 2.486, "step": 3478 }, { "epoch": 0.2604383059158198, "grad_norm": 1.2557923793792725, "learning_rate": 4.3879510888392705e-05, "loss": 1.3534, "step": 3479 }, { "epoch": 0.2605131660209983, "grad_norm": 1.1247121095657349, "learning_rate": 4.3827827529489506e-05, "loss": 1.7858, "step": 3480 }, { "epoch": 0.2605880261261767, "grad_norm": 1.2144724130630493, "learning_rate": 4.377616608273576e-05, "loss": 2.2646, "step": 3481 }, { "epoch": 0.26066288623135514, "grad_norm": 1.381126880645752, "learning_rate": 4.3724526568284076e-05, "loss": 2.218, "step": 3482 }, { "epoch": 0.2607377463365336, "grad_norm": 1.250625491142273, "learning_rate": 4.367290900627841e-05, "loss": 1.9454, "step": 3483 }, { "epoch": 0.26081260644171206, "grad_norm": 1.353980541229248, "learning_rate": 4.362131341685424e-05, "loss": 2.002, "step": 3484 }, { "epoch": 0.2608874665468905, "grad_norm": 1.502170443534851, "learning_rate": 4.356973982013842e-05, "loss": 2.0845, "step": 3485 }, { "epoch": 0.2609623266520689, "grad_norm": 1.255301833152771, "learning_rate": 4.3518188236249314e-05, "loss": 2.1251, "step": 3486 }, { "epoch": 0.2610371867572474, "grad_norm": 1.2067056894302368, "learning_rate": 4.3466658685296546e-05, "loss": 1.5695, "step": 3487 }, { "epoch": 0.26111204686242584, "grad_norm": 1.3116868734359741, "learning_rate": 4.3415151187381255e-05, "loss": 2.3631, "step": 3488 }, { "epoch": 0.26118690696760427, "grad_norm": 1.3123323917388916, "learning_rate": 4.3363665762595964e-05, "loss": 1.9275, "step": 3489 }, { "epoch": 0.26126176707278276, "grad_norm": 1.020263910293579, "learning_rate": 4.331220243102461e-05, "loss": 1.2471, "step": 3490 }, { "epoch": 0.2613366271779612, "grad_norm": 1.113088607788086, "learning_rate": 4.3260761212742385e-05, "loss": 2.1426, "step": 3491 }, { "epoch": 0.2614114872831396, "grad_norm": 1.552274227142334, "learning_rate": 4.320934212781605e-05, "loss": 2.0093, "step": 3492 }, { "epoch": 0.2614863473883181, "grad_norm": 1.1524330377578735, "learning_rate": 4.3157945196303516e-05, "loss": 1.4081, "step": 3493 }, { "epoch": 0.26156120749349654, "grad_norm": 1.3470449447631836, "learning_rate": 4.310657043825423e-05, "loss": 1.9633, "step": 3494 }, { "epoch": 0.26163606759867497, "grad_norm": 1.2323765754699707, "learning_rate": 4.305521787370891e-05, "loss": 1.8969, "step": 3495 }, { "epoch": 0.2617109277038534, "grad_norm": 1.2676854133605957, "learning_rate": 4.3003887522699635e-05, "loss": 1.8812, "step": 3496 }, { "epoch": 0.2617857878090319, "grad_norm": 1.4218566417694092, "learning_rate": 4.295257940524984e-05, "loss": 2.4154, "step": 3497 }, { "epoch": 0.2618606479142103, "grad_norm": 1.3449474573135376, "learning_rate": 4.290129354137423e-05, "loss": 2.0854, "step": 3498 }, { "epoch": 0.26193550801938875, "grad_norm": 1.1401857137680054, "learning_rate": 4.2850029951078826e-05, "loss": 2.0833, "step": 3499 }, { "epoch": 0.26201036812456724, "grad_norm": 1.343851923942566, "learning_rate": 4.279878865436102e-05, "loss": 2.56, "step": 3500 }, { "epoch": 0.26208522822974567, "grad_norm": 1.2265610694885254, "learning_rate": 4.2747569671209486e-05, "loss": 1.5639, "step": 3501 }, { "epoch": 0.2621600883349241, "grad_norm": 1.1725025177001953, "learning_rate": 4.26963730216042e-05, "loss": 1.6676, "step": 3502 }, { "epoch": 0.26223494844010253, "grad_norm": 1.3643269538879395, "learning_rate": 4.264519872551642e-05, "loss": 2.1136, "step": 3503 }, { "epoch": 0.262309808545281, "grad_norm": 1.3441693782806396, "learning_rate": 4.259404680290864e-05, "loss": 2.024, "step": 3504 }, { "epoch": 0.26238466865045945, "grad_norm": 1.5697351694107056, "learning_rate": 4.2542917273734684e-05, "loss": 1.5715, "step": 3505 }, { "epoch": 0.2624595287556379, "grad_norm": 1.3419634103775024, "learning_rate": 4.249181015793966e-05, "loss": 2.1148, "step": 3506 }, { "epoch": 0.26253438886081637, "grad_norm": 1.0906691551208496, "learning_rate": 4.244072547545982e-05, "loss": 1.7148, "step": 3507 }, { "epoch": 0.2626092489659948, "grad_norm": 1.2566791772842407, "learning_rate": 4.238966324622278e-05, "loss": 2.2239, "step": 3508 }, { "epoch": 0.26268410907117323, "grad_norm": 1.6119168996810913, "learning_rate": 4.233862349014739e-05, "loss": 2.3113, "step": 3509 }, { "epoch": 0.2627589691763517, "grad_norm": 1.4447964429855347, "learning_rate": 4.228760622714363e-05, "loss": 2.0667, "step": 3510 }, { "epoch": 0.26283382928153015, "grad_norm": 1.2911735773086548, "learning_rate": 4.223661147711281e-05, "loss": 1.9775, "step": 3511 }, { "epoch": 0.2629086893867086, "grad_norm": 1.6586129665374756, "learning_rate": 4.2185639259947416e-05, "loss": 1.9848, "step": 3512 }, { "epoch": 0.262983549491887, "grad_norm": 1.3210426568984985, "learning_rate": 4.2134689595531164e-05, "loss": 2.0193, "step": 3513 }, { "epoch": 0.2630584095970655, "grad_norm": 1.324910044670105, "learning_rate": 4.2083762503739e-05, "loss": 2.4187, "step": 3514 }, { "epoch": 0.26313326970224393, "grad_norm": 1.3694475889205933, "learning_rate": 4.203285800443697e-05, "loss": 2.5156, "step": 3515 }, { "epoch": 0.26320812980742236, "grad_norm": 1.1987301111221313, "learning_rate": 4.1981976117482345e-05, "loss": 2.7263, "step": 3516 }, { "epoch": 0.26328298991260085, "grad_norm": 1.1968696117401123, "learning_rate": 4.193111686272361e-05, "loss": 1.8542, "step": 3517 }, { "epoch": 0.2633578500177793, "grad_norm": 1.2312966585159302, "learning_rate": 4.188028026000043e-05, "loss": 1.6691, "step": 3518 }, { "epoch": 0.2634327101229577, "grad_norm": 1.1676862239837646, "learning_rate": 4.1829466329143576e-05, "loss": 1.8467, "step": 3519 }, { "epoch": 0.26350757022813615, "grad_norm": 1.689098834991455, "learning_rate": 4.177867508997505e-05, "loss": 2.1325, "step": 3520 }, { "epoch": 0.26358243033331463, "grad_norm": 1.1357771158218384, "learning_rate": 4.1727906562307905e-05, "loss": 1.8723, "step": 3521 }, { "epoch": 0.26365729043849306, "grad_norm": 1.145292043685913, "learning_rate": 4.167716076594641e-05, "loss": 1.9718, "step": 3522 }, { "epoch": 0.2637321505436715, "grad_norm": 1.3766591548919678, "learning_rate": 4.162643772068598e-05, "loss": 2.0078, "step": 3523 }, { "epoch": 0.26380701064885, "grad_norm": 1.120312213897705, "learning_rate": 4.157573744631305e-05, "loss": 1.7239, "step": 3524 }, { "epoch": 0.2638818707540284, "grad_norm": 1.3257248401641846, "learning_rate": 4.152505996260528e-05, "loss": 1.8619, "step": 3525 }, { "epoch": 0.26395673085920685, "grad_norm": 1.306697130203247, "learning_rate": 4.147440528933143e-05, "loss": 2.214, "step": 3526 }, { "epoch": 0.26403159096438533, "grad_norm": 1.1765097379684448, "learning_rate": 4.1423773446251266e-05, "loss": 2.1, "step": 3527 }, { "epoch": 0.26410645106956376, "grad_norm": 1.1140875816345215, "learning_rate": 4.137316445311576e-05, "loss": 1.4998, "step": 3528 }, { "epoch": 0.2641813111747422, "grad_norm": 1.4084118604660034, "learning_rate": 4.132257832966691e-05, "loss": 2.3452, "step": 3529 }, { "epoch": 0.2642561712799206, "grad_norm": 1.1152507066726685, "learning_rate": 4.127201509563783e-05, "loss": 1.7664, "step": 3530 }, { "epoch": 0.2643310313850991, "grad_norm": 1.0885183811187744, "learning_rate": 4.12214747707527e-05, "loss": 1.4685, "step": 3531 }, { "epoch": 0.26440589149027754, "grad_norm": 1.2826919555664062, "learning_rate": 4.117095737472672e-05, "loss": 2.0191, "step": 3532 }, { "epoch": 0.264480751595456, "grad_norm": 1.0601887702941895, "learning_rate": 4.112046292726612e-05, "loss": 1.8402, "step": 3533 }, { "epoch": 0.26455561170063446, "grad_norm": 1.2844061851501465, "learning_rate": 4.10699914480683e-05, "loss": 2.2958, "step": 3534 }, { "epoch": 0.2646304718058129, "grad_norm": 1.2363065481185913, "learning_rate": 4.101954295682161e-05, "loss": 1.8864, "step": 3535 }, { "epoch": 0.2647053319109913, "grad_norm": 1.4195659160614014, "learning_rate": 4.0969117473205466e-05, "loss": 2.1267, "step": 3536 }, { "epoch": 0.26478019201616976, "grad_norm": 1.2080254554748535, "learning_rate": 4.091871501689032e-05, "loss": 2.0183, "step": 3537 }, { "epoch": 0.26485505212134824, "grad_norm": 1.3037010431289673, "learning_rate": 4.0868335607537545e-05, "loss": 2.2499, "step": 3538 }, { "epoch": 0.2649299122265267, "grad_norm": 1.166123390197754, "learning_rate": 4.0817979264799674e-05, "loss": 2.0306, "step": 3539 }, { "epoch": 0.2650047723317051, "grad_norm": 1.3875656127929688, "learning_rate": 4.0767646008320105e-05, "loss": 2.161, "step": 3540 }, { "epoch": 0.2650796324368836, "grad_norm": 1.3465032577514648, "learning_rate": 4.071733585773332e-05, "loss": 1.8913, "step": 3541 }, { "epoch": 0.265154492542062, "grad_norm": 1.5010851621627808, "learning_rate": 4.066704883266475e-05, "loss": 2.0595, "step": 3542 }, { "epoch": 0.26522935264724046, "grad_norm": 1.3488107919692993, "learning_rate": 4.0616784952730804e-05, "loss": 2.0417, "step": 3543 }, { "epoch": 0.26530421275241894, "grad_norm": 1.3426992893218994, "learning_rate": 4.056654423753893e-05, "loss": 2.0578, "step": 3544 }, { "epoch": 0.2653790728575974, "grad_norm": 1.1908326148986816, "learning_rate": 4.05163267066874e-05, "loss": 1.9818, "step": 3545 }, { "epoch": 0.2654539329627758, "grad_norm": 1.2556883096694946, "learning_rate": 4.046613237976555e-05, "loss": 1.8451, "step": 3546 }, { "epoch": 0.26552879306795424, "grad_norm": 1.1166797876358032, "learning_rate": 4.0415961276353695e-05, "loss": 2.0886, "step": 3547 }, { "epoch": 0.2656036531731327, "grad_norm": 1.2591995000839233, "learning_rate": 4.036581341602294e-05, "loss": 1.7324, "step": 3548 }, { "epoch": 0.26567851327831116, "grad_norm": 1.1776820421218872, "learning_rate": 4.031568881833546e-05, "loss": 1.6007, "step": 3549 }, { "epoch": 0.2657533733834896, "grad_norm": 1.456089735031128, "learning_rate": 4.026558750284435e-05, "loss": 1.9357, "step": 3550 }, { "epoch": 0.2658282334886681, "grad_norm": 1.378693699836731, "learning_rate": 4.0215509489093506e-05, "loss": 2.0744, "step": 3551 }, { "epoch": 0.2659030935938465, "grad_norm": 1.2904287576675415, "learning_rate": 4.016545479661785e-05, "loss": 2.1567, "step": 3552 }, { "epoch": 0.26597795369902494, "grad_norm": 1.2386677265167236, "learning_rate": 4.011542344494319e-05, "loss": 1.9103, "step": 3553 }, { "epoch": 0.26605281380420337, "grad_norm": 1.3962070941925049, "learning_rate": 4.0065415453586176e-05, "loss": 1.9966, "step": 3554 }, { "epoch": 0.26612767390938186, "grad_norm": 1.2792255878448486, "learning_rate": 4.001543084205444e-05, "loss": 2.3169, "step": 3555 }, { "epoch": 0.2662025340145603, "grad_norm": 1.2216852903366089, "learning_rate": 3.9965469629846365e-05, "loss": 1.6893, "step": 3556 }, { "epoch": 0.2662773941197387, "grad_norm": 1.3071799278259277, "learning_rate": 3.991553183645127e-05, "loss": 1.4868, "step": 3557 }, { "epoch": 0.2663522542249172, "grad_norm": 1.2988708019256592, "learning_rate": 3.986561748134936e-05, "loss": 1.9669, "step": 3558 }, { "epoch": 0.26642711433009564, "grad_norm": 1.1445127725601196, "learning_rate": 3.981572658401169e-05, "loss": 1.8821, "step": 3559 }, { "epoch": 0.26650197443527407, "grad_norm": 1.0859088897705078, "learning_rate": 3.976585916390014e-05, "loss": 1.8067, "step": 3560 }, { "epoch": 0.26657683454045256, "grad_norm": 1.4403103590011597, "learning_rate": 3.971601524046749e-05, "loss": 2.2352, "step": 3561 }, { "epoch": 0.266651694645631, "grad_norm": 1.209730625152588, "learning_rate": 3.966619483315722e-05, "loss": 1.7222, "step": 3562 }, { "epoch": 0.2667265547508094, "grad_norm": 1.3124589920043945, "learning_rate": 3.961639796140383e-05, "loss": 2.0959, "step": 3563 }, { "epoch": 0.26680141485598785, "grad_norm": 1.3061461448669434, "learning_rate": 3.956662464463242e-05, "loss": 1.9612, "step": 3564 }, { "epoch": 0.26687627496116634, "grad_norm": 1.130108118057251, "learning_rate": 3.951687490225909e-05, "loss": 2.2834, "step": 3565 }, { "epoch": 0.26695113506634477, "grad_norm": 1.0444188117980957, "learning_rate": 3.946714875369065e-05, "loss": 2.1664, "step": 3566 }, { "epoch": 0.2670259951715232, "grad_norm": 1.2440394163131714, "learning_rate": 3.9417446218324774e-05, "loss": 1.7268, "step": 3567 }, { "epoch": 0.2671008552767017, "grad_norm": 1.131906270980835, "learning_rate": 3.936776731554979e-05, "loss": 2.0529, "step": 3568 }, { "epoch": 0.2671757153818801, "grad_norm": 1.354885458946228, "learning_rate": 3.931811206474494e-05, "loss": 1.6805, "step": 3569 }, { "epoch": 0.26725057548705855, "grad_norm": 1.2794641256332397, "learning_rate": 3.926848048528018e-05, "loss": 1.8887, "step": 3570 }, { "epoch": 0.267325435592237, "grad_norm": 1.3499798774719238, "learning_rate": 3.92188725965163e-05, "loss": 1.9585, "step": 3571 }, { "epoch": 0.26740029569741547, "grad_norm": 1.1670323610305786, "learning_rate": 3.916928841780472e-05, "loss": 1.604, "step": 3572 }, { "epoch": 0.2674751558025939, "grad_norm": 1.3768935203552246, "learning_rate": 3.911972796848775e-05, "loss": 1.7748, "step": 3573 }, { "epoch": 0.26755001590777233, "grad_norm": 1.3561466932296753, "learning_rate": 3.9070191267898306e-05, "loss": 1.848, "step": 3574 }, { "epoch": 0.2676248760129508, "grad_norm": 2.094216823577881, "learning_rate": 3.902067833536015e-05, "loss": 2.0141, "step": 3575 }, { "epoch": 0.26769973611812925, "grad_norm": 1.3528594970703125, "learning_rate": 3.897118919018775e-05, "loss": 1.9244, "step": 3576 }, { "epoch": 0.2677745962233077, "grad_norm": 1.520224928855896, "learning_rate": 3.8921723851686255e-05, "loss": 1.837, "step": 3577 }, { "epoch": 0.26784945632848617, "grad_norm": 1.4357924461364746, "learning_rate": 3.88722823391516e-05, "loss": 1.7467, "step": 3578 }, { "epoch": 0.2679243164336646, "grad_norm": 1.4240448474884033, "learning_rate": 3.882286467187031e-05, "loss": 2.2103, "step": 3579 }, { "epoch": 0.26799917653884303, "grad_norm": 1.1811350584030151, "learning_rate": 3.877347086911973e-05, "loss": 2.1883, "step": 3580 }, { "epoch": 0.26807403664402146, "grad_norm": 1.3024163246154785, "learning_rate": 3.8724100950167785e-05, "loss": 2.2651, "step": 3581 }, { "epoch": 0.26814889674919995, "grad_norm": 1.475986123085022, "learning_rate": 3.8674754934273186e-05, "loss": 1.7375, "step": 3582 }, { "epoch": 0.2682237568543784, "grad_norm": 1.2460542917251587, "learning_rate": 3.862543284068525e-05, "loss": 2.0172, "step": 3583 }, { "epoch": 0.2682986169595568, "grad_norm": 1.1706708669662476, "learning_rate": 3.857613468864404e-05, "loss": 1.6769, "step": 3584 }, { "epoch": 0.2683734770647353, "grad_norm": 1.3117034435272217, "learning_rate": 3.8526860497380135e-05, "loss": 2.1029, "step": 3585 }, { "epoch": 0.26844833716991373, "grad_norm": 1.3964478969573975, "learning_rate": 3.8477610286114915e-05, "loss": 2.2556, "step": 3586 }, { "epoch": 0.26852319727509216, "grad_norm": 1.2470375299453735, "learning_rate": 3.842838407406033e-05, "loss": 1.9441, "step": 3587 }, { "epoch": 0.2685980573802706, "grad_norm": 1.075679063796997, "learning_rate": 3.837918188041904e-05, "loss": 2.074, "step": 3588 }, { "epoch": 0.2686729174854491, "grad_norm": 1.232170820236206, "learning_rate": 3.833000372438419e-05, "loss": 1.7787, "step": 3589 }, { "epoch": 0.2687477775906275, "grad_norm": 1.2465778589248657, "learning_rate": 3.8280849625139726e-05, "loss": 2.2898, "step": 3590 }, { "epoch": 0.26882263769580594, "grad_norm": 1.3823808431625366, "learning_rate": 3.823171960186005e-05, "loss": 2.5301, "step": 3591 }, { "epoch": 0.26889749780098443, "grad_norm": 1.314964771270752, "learning_rate": 3.818261367371028e-05, "loss": 2.2686, "step": 3592 }, { "epoch": 0.26897235790616286, "grad_norm": 1.3065847158432007, "learning_rate": 3.81335318598461e-05, "loss": 2.0856, "step": 3593 }, { "epoch": 0.2690472180113413, "grad_norm": 1.2500271797180176, "learning_rate": 3.808447417941379e-05, "loss": 2.1715, "step": 3594 }, { "epoch": 0.2691220781165198, "grad_norm": 1.3757578134536743, "learning_rate": 3.8035440651550246e-05, "loss": 1.9983, "step": 3595 }, { "epoch": 0.2691969382216982, "grad_norm": 1.1818174123764038, "learning_rate": 3.7986431295382877e-05, "loss": 1.6645, "step": 3596 }, { "epoch": 0.26927179832687664, "grad_norm": 1.2160753011703491, "learning_rate": 3.793744613002966e-05, "loss": 1.9526, "step": 3597 }, { "epoch": 0.2693466584320551, "grad_norm": 1.526711106300354, "learning_rate": 3.788848517459922e-05, "loss": 2.2431, "step": 3598 }, { "epoch": 0.26942151853723356, "grad_norm": 1.2936984300613403, "learning_rate": 3.783954844819067e-05, "loss": 2.2462, "step": 3599 }, { "epoch": 0.269496378642412, "grad_norm": 1.27053964138031, "learning_rate": 3.779063596989371e-05, "loss": 2.2078, "step": 3600 }, { "epoch": 0.269496378642412, "eval_loss": 1.9885979890823364, "eval_runtime": 178.9083, "eval_samples_per_second": 27.947, "eval_steps_per_second": 13.974, "step": 3600 }, { "epoch": 0.2695712387475904, "grad_norm": 1.1623233556747437, "learning_rate": 3.7741747758788574e-05, "loss": 2.3109, "step": 3601 }, { "epoch": 0.2696460988527689, "grad_norm": 1.3491543531417847, "learning_rate": 3.769288383394597e-05, "loss": 1.9318, "step": 3602 }, { "epoch": 0.26972095895794734, "grad_norm": 1.1400843858718872, "learning_rate": 3.76440442144272e-05, "loss": 1.7643, "step": 3603 }, { "epoch": 0.2697958190631258, "grad_norm": 1.2297710180282593, "learning_rate": 3.759522891928411e-05, "loss": 1.6931, "step": 3604 }, { "epoch": 0.2698706791683042, "grad_norm": 1.3276809453964233, "learning_rate": 3.754643796755894e-05, "loss": 2.1092, "step": 3605 }, { "epoch": 0.2699455392734827, "grad_norm": 1.25497305393219, "learning_rate": 3.749767137828453e-05, "loss": 2.3432, "step": 3606 }, { "epoch": 0.2700203993786611, "grad_norm": 1.2203545570373535, "learning_rate": 3.744892917048424e-05, "loss": 2.0445, "step": 3607 }, { "epoch": 0.27009525948383956, "grad_norm": 1.1583970785140991, "learning_rate": 3.74002113631718e-05, "loss": 1.527, "step": 3608 }, { "epoch": 0.27017011958901804, "grad_norm": 1.2245904207229614, "learning_rate": 3.735151797535152e-05, "loss": 1.9618, "step": 3609 }, { "epoch": 0.2702449796941965, "grad_norm": 1.3370026350021362, "learning_rate": 3.7302849026018174e-05, "loss": 1.8191, "step": 3610 }, { "epoch": 0.2703198397993749, "grad_norm": 1.194138765335083, "learning_rate": 3.7254204534156965e-05, "loss": 2.3897, "step": 3611 }, { "epoch": 0.2703946999045534, "grad_norm": 1.4035539627075195, "learning_rate": 3.720558451874363e-05, "loss": 2.1657, "step": 3612 }, { "epoch": 0.2704695600097318, "grad_norm": 1.377695083618164, "learning_rate": 3.715698899874427e-05, "loss": 2.1623, "step": 3613 }, { "epoch": 0.27054442011491026, "grad_norm": 1.1201077699661255, "learning_rate": 3.7108417993115406e-05, "loss": 2.365, "step": 3614 }, { "epoch": 0.2706192802200887, "grad_norm": 1.1337802410125732, "learning_rate": 3.7059871520804125e-05, "loss": 1.6822, "step": 3615 }, { "epoch": 0.2706941403252672, "grad_norm": 1.145977258682251, "learning_rate": 3.701134960074785e-05, "loss": 2.1049, "step": 3616 }, { "epoch": 0.2707690004304456, "grad_norm": 1.2072327136993408, "learning_rate": 3.696285225187445e-05, "loss": 2.0351, "step": 3617 }, { "epoch": 0.27084386053562404, "grad_norm": 1.1643778085708618, "learning_rate": 3.691437949310226e-05, "loss": 1.7434, "step": 3618 }, { "epoch": 0.2709187206408025, "grad_norm": 1.279704213142395, "learning_rate": 3.6865931343339886e-05, "loss": 2.0517, "step": 3619 }, { "epoch": 0.27099358074598096, "grad_norm": 1.5341826677322388, "learning_rate": 3.6817507821486505e-05, "loss": 2.1182, "step": 3620 }, { "epoch": 0.2710684408511594, "grad_norm": 1.1970291137695312, "learning_rate": 3.676910894643153e-05, "loss": 2.0564, "step": 3621 }, { "epoch": 0.2711433009563378, "grad_norm": 1.0516990423202515, "learning_rate": 3.672073473705486e-05, "loss": 1.4151, "step": 3622 }, { "epoch": 0.2712181610615163, "grad_norm": 1.1447782516479492, "learning_rate": 3.667238521222676e-05, "loss": 2.1018, "step": 3623 }, { "epoch": 0.27129302116669474, "grad_norm": 1.2192944288253784, "learning_rate": 3.662406039080786e-05, "loss": 1.9768, "step": 3624 }, { "epoch": 0.27136788127187317, "grad_norm": 1.2195444107055664, "learning_rate": 3.65757602916491e-05, "loss": 2.1754, "step": 3625 }, { "epoch": 0.27144274137705166, "grad_norm": 1.2319406270980835, "learning_rate": 3.652748493359183e-05, "loss": 1.9333, "step": 3626 }, { "epoch": 0.2715176014822301, "grad_norm": 1.2944533824920654, "learning_rate": 3.647923433546776e-05, "loss": 2.0097, "step": 3627 }, { "epoch": 0.2715924615874085, "grad_norm": 1.4053471088409424, "learning_rate": 3.643100851609894e-05, "loss": 2.2724, "step": 3628 }, { "epoch": 0.271667321692587, "grad_norm": 1.107938289642334, "learning_rate": 3.6382807494297676e-05, "loss": 2.2664, "step": 3629 }, { "epoch": 0.27174218179776544, "grad_norm": 1.4958117008209229, "learning_rate": 3.633463128886673e-05, "loss": 2.1662, "step": 3630 }, { "epoch": 0.27181704190294387, "grad_norm": 1.0348858833312988, "learning_rate": 3.628647991859903e-05, "loss": 1.2674, "step": 3631 }, { "epoch": 0.2718919020081223, "grad_norm": 1.3962452411651611, "learning_rate": 3.623835340227792e-05, "loss": 1.7473, "step": 3632 }, { "epoch": 0.2719667621133008, "grad_norm": 1.4056390523910522, "learning_rate": 3.6190251758677074e-05, "loss": 1.6925, "step": 3633 }, { "epoch": 0.2720416222184792, "grad_norm": 1.5788779258728027, "learning_rate": 3.614217500656038e-05, "loss": 2.0759, "step": 3634 }, { "epoch": 0.27211648232365765, "grad_norm": 1.3622477054595947, "learning_rate": 3.609412316468209e-05, "loss": 2.7627, "step": 3635 }, { "epoch": 0.27219134242883614, "grad_norm": 1.198473572731018, "learning_rate": 3.6046096251786645e-05, "loss": 2.092, "step": 3636 }, { "epoch": 0.27226620253401457, "grad_norm": 1.2867705821990967, "learning_rate": 3.599809428660887e-05, "loss": 2.3965, "step": 3637 }, { "epoch": 0.272341062639193, "grad_norm": 1.48613703250885, "learning_rate": 3.595011728787376e-05, "loss": 1.924, "step": 3638 }, { "epoch": 0.27241592274437143, "grad_norm": 1.4577990770339966, "learning_rate": 3.590216527429664e-05, "loss": 2.1451, "step": 3639 }, { "epoch": 0.2724907828495499, "grad_norm": 1.3098697662353516, "learning_rate": 3.585423826458305e-05, "loss": 2.0568, "step": 3640 }, { "epoch": 0.27256564295472835, "grad_norm": 1.1683754920959473, "learning_rate": 3.580633627742882e-05, "loss": 1.2372, "step": 3641 }, { "epoch": 0.2726405030599068, "grad_norm": 1.2954245805740356, "learning_rate": 3.575845933152e-05, "loss": 2.1585, "step": 3642 }, { "epoch": 0.27271536316508527, "grad_norm": 1.3171230554580688, "learning_rate": 3.571060744553282e-05, "loss": 2.2385, "step": 3643 }, { "epoch": 0.2727902232702637, "grad_norm": 1.2265230417251587, "learning_rate": 3.566278063813378e-05, "loss": 2.2811, "step": 3644 }, { "epoch": 0.27286508337544213, "grad_norm": 1.4847770929336548, "learning_rate": 3.561497892797965e-05, "loss": 2.0061, "step": 3645 }, { "epoch": 0.2729399434806206, "grad_norm": 1.3508427143096924, "learning_rate": 3.556720233371727e-05, "loss": 2.237, "step": 3646 }, { "epoch": 0.27301480358579905, "grad_norm": 1.8456686735153198, "learning_rate": 3.551945087398383e-05, "loss": 2.2047, "step": 3647 }, { "epoch": 0.2730896636909775, "grad_norm": 1.2590117454528809, "learning_rate": 3.547172456740665e-05, "loss": 1.7274, "step": 3648 }, { "epoch": 0.2731645237961559, "grad_norm": 1.1660341024398804, "learning_rate": 3.5424023432603194e-05, "loss": 1.7525, "step": 3649 }, { "epoch": 0.2732393839013344, "grad_norm": 1.2405661344528198, "learning_rate": 3.5376347488181175e-05, "loss": 2.3804, "step": 3650 }, { "epoch": 0.27331424400651283, "grad_norm": 1.3416826725006104, "learning_rate": 3.532869675273846e-05, "loss": 1.7481, "step": 3651 }, { "epoch": 0.27338910411169126, "grad_norm": 1.1902174949645996, "learning_rate": 3.528107124486309e-05, "loss": 1.7816, "step": 3652 }, { "epoch": 0.27346396421686975, "grad_norm": 1.2742209434509277, "learning_rate": 3.5233470983133265e-05, "loss": 2.5991, "step": 3653 }, { "epoch": 0.2735388243220482, "grad_norm": 1.2751320600509644, "learning_rate": 3.518589598611732e-05, "loss": 2.4139, "step": 3654 }, { "epoch": 0.2736136844272266, "grad_norm": 1.1735732555389404, "learning_rate": 3.513834627237369e-05, "loss": 1.9962, "step": 3655 }, { "epoch": 0.27368854453240504, "grad_norm": 1.3955276012420654, "learning_rate": 3.509082186045103e-05, "loss": 2.3597, "step": 3656 }, { "epoch": 0.27376340463758353, "grad_norm": 1.0580183267593384, "learning_rate": 3.504332276888809e-05, "loss": 1.9453, "step": 3657 }, { "epoch": 0.27383826474276196, "grad_norm": 1.473850131034851, "learning_rate": 3.4995849016213764e-05, "loss": 2.2379, "step": 3658 }, { "epoch": 0.2739131248479404, "grad_norm": 1.4154881238937378, "learning_rate": 3.4948400620947066e-05, "loss": 2.3454, "step": 3659 }, { "epoch": 0.2739879849531189, "grad_norm": 1.110482931137085, "learning_rate": 3.490097760159702e-05, "loss": 1.5292, "step": 3660 }, { "epoch": 0.2740628450582973, "grad_norm": 1.190510869026184, "learning_rate": 3.48535799766629e-05, "loss": 1.8008, "step": 3661 }, { "epoch": 0.27413770516347574, "grad_norm": 1.357673168182373, "learning_rate": 3.480620776463393e-05, "loss": 1.6796, "step": 3662 }, { "epoch": 0.27421256526865423, "grad_norm": 1.2468235492706299, "learning_rate": 3.475886098398955e-05, "loss": 1.9074, "step": 3663 }, { "epoch": 0.27428742537383266, "grad_norm": 1.079415202140808, "learning_rate": 3.471153965319919e-05, "loss": 1.5924, "step": 3664 }, { "epoch": 0.2743622854790111, "grad_norm": 1.3118798732757568, "learning_rate": 3.466424379072242e-05, "loss": 2.4251, "step": 3665 }, { "epoch": 0.2744371455841895, "grad_norm": 1.4137887954711914, "learning_rate": 3.461697341500878e-05, "loss": 1.9661, "step": 3666 }, { "epoch": 0.274512005689368, "grad_norm": 1.1400495767593384, "learning_rate": 3.456972854449796e-05, "loss": 1.6698, "step": 3667 }, { "epoch": 0.27458686579454644, "grad_norm": 1.2482852935791016, "learning_rate": 3.452250919761967e-05, "loss": 1.5083, "step": 3668 }, { "epoch": 0.2746617258997249, "grad_norm": 1.1569100618362427, "learning_rate": 3.447531539279367e-05, "loss": 1.7045, "step": 3669 }, { "epoch": 0.27473658600490336, "grad_norm": 1.1992650032043457, "learning_rate": 3.4428147148429715e-05, "loss": 2.0206, "step": 3670 }, { "epoch": 0.2748114461100818, "grad_norm": 1.3336853981018066, "learning_rate": 3.438100448292766e-05, "loss": 1.6485, "step": 3671 }, { "epoch": 0.2748863062152602, "grad_norm": 1.4234938621520996, "learning_rate": 3.433388741467729e-05, "loss": 2.6367, "step": 3672 }, { "epoch": 0.27496116632043865, "grad_norm": 1.3389924764633179, "learning_rate": 3.4286795962058495e-05, "loss": 2.0709, "step": 3673 }, { "epoch": 0.27503602642561714, "grad_norm": 1.1362086534500122, "learning_rate": 3.423973014344112e-05, "loss": 1.7949, "step": 3674 }, { "epoch": 0.2751108865307956, "grad_norm": 1.6052814722061157, "learning_rate": 3.419268997718502e-05, "loss": 2.0848, "step": 3675 }, { "epoch": 0.275185746635974, "grad_norm": 1.2573515176773071, "learning_rate": 3.4145675481640104e-05, "loss": 1.9837, "step": 3676 }, { "epoch": 0.2752606067411525, "grad_norm": 1.3349645137786865, "learning_rate": 3.409868667514617e-05, "loss": 1.5753, "step": 3677 }, { "epoch": 0.2753354668463309, "grad_norm": 1.15304434299469, "learning_rate": 3.405172357603301e-05, "loss": 1.9487, "step": 3678 }, { "epoch": 0.27541032695150935, "grad_norm": 1.4454526901245117, "learning_rate": 3.400478620262043e-05, "loss": 2.3073, "step": 3679 }, { "epoch": 0.27548518705668784, "grad_norm": 1.2526516914367676, "learning_rate": 3.395787457321821e-05, "loss": 2.2945, "step": 3680 }, { "epoch": 0.2755600471618663, "grad_norm": 1.1207069158554077, "learning_rate": 3.3910988706126037e-05, "loss": 2.0658, "step": 3681 }, { "epoch": 0.2756349072670447, "grad_norm": 1.3669694662094116, "learning_rate": 3.386412861963362e-05, "loss": 2.1255, "step": 3682 }, { "epoch": 0.27570976737222314, "grad_norm": 1.5304166078567505, "learning_rate": 3.38172943320205e-05, "loss": 2.1498, "step": 3683 }, { "epoch": 0.2757846274774016, "grad_norm": 1.2092289924621582, "learning_rate": 3.377048586155626e-05, "loss": 1.9035, "step": 3684 }, { "epoch": 0.27585948758258005, "grad_norm": 1.0932542085647583, "learning_rate": 3.372370322650039e-05, "loss": 1.7759, "step": 3685 }, { "epoch": 0.2759343476877585, "grad_norm": 1.1611695289611816, "learning_rate": 3.367694644510223e-05, "loss": 2.1571, "step": 3686 }, { "epoch": 0.276009207792937, "grad_norm": 1.2870222330093384, "learning_rate": 3.363021553560111e-05, "loss": 2.3086, "step": 3687 }, { "epoch": 0.2760840678981154, "grad_norm": 1.1605165004730225, "learning_rate": 3.358351051622629e-05, "loss": 1.8902, "step": 3688 }, { "epoch": 0.27615892800329384, "grad_norm": 1.1800134181976318, "learning_rate": 3.353683140519681e-05, "loss": 1.7219, "step": 3689 }, { "epoch": 0.27623378810847227, "grad_norm": 1.1893481016159058, "learning_rate": 3.349017822072172e-05, "loss": 1.9093, "step": 3690 }, { "epoch": 0.27630864821365075, "grad_norm": 1.0842167139053345, "learning_rate": 3.3443550980999925e-05, "loss": 1.8851, "step": 3691 }, { "epoch": 0.2763835083188292, "grad_norm": 1.2017210721969604, "learning_rate": 3.339694970422019e-05, "loss": 1.8048, "step": 3692 }, { "epoch": 0.2764583684240076, "grad_norm": 1.1787925958633423, "learning_rate": 3.335037440856119e-05, "loss": 2.2447, "step": 3693 }, { "epoch": 0.2765332285291861, "grad_norm": 1.2351973056793213, "learning_rate": 3.3303825112191424e-05, "loss": 1.1729, "step": 3694 }, { "epoch": 0.27660808863436454, "grad_norm": 1.3471297025680542, "learning_rate": 3.3257301833269204e-05, "loss": 1.9705, "step": 3695 }, { "epoch": 0.27668294873954297, "grad_norm": 1.0663827657699585, "learning_rate": 3.32108045899428e-05, "loss": 2.1075, "step": 3696 }, { "epoch": 0.27675780884472145, "grad_norm": 1.185662865638733, "learning_rate": 3.316433340035029e-05, "loss": 1.7743, "step": 3697 }, { "epoch": 0.2768326689498999, "grad_norm": 1.2430531978607178, "learning_rate": 3.311788828261955e-05, "loss": 1.5352, "step": 3698 }, { "epoch": 0.2769075290550783, "grad_norm": 1.3029327392578125, "learning_rate": 3.307146925486836e-05, "loss": 1.9173, "step": 3699 }, { "epoch": 0.27698238916025675, "grad_norm": 1.1752313375473022, "learning_rate": 3.3025076335204206e-05, "loss": 1.5305, "step": 3700 }, { "epoch": 0.27705724926543523, "grad_norm": 1.1588367223739624, "learning_rate": 3.2978709541724497e-05, "loss": 1.9922, "step": 3701 }, { "epoch": 0.27713210937061367, "grad_norm": 1.3807158470153809, "learning_rate": 3.2932368892516444e-05, "loss": 2.3437, "step": 3702 }, { "epoch": 0.2772069694757921, "grad_norm": 1.5050561428070068, "learning_rate": 3.2886054405656944e-05, "loss": 2.3629, "step": 3703 }, { "epoch": 0.2772818295809706, "grad_norm": 1.2329421043395996, "learning_rate": 3.283976609921281e-05, "loss": 2.4735, "step": 3704 }, { "epoch": 0.277356689686149, "grad_norm": 1.258728265762329, "learning_rate": 3.279350399124066e-05, "loss": 2.1701, "step": 3705 }, { "epoch": 0.27743154979132745, "grad_norm": 1.2790207862854004, "learning_rate": 3.274726809978673e-05, "loss": 2.2005, "step": 3706 }, { "epoch": 0.2775064098965059, "grad_norm": 1.3147201538085938, "learning_rate": 3.270105844288721e-05, "loss": 1.3958, "step": 3707 }, { "epoch": 0.27758127000168437, "grad_norm": 1.2691043615341187, "learning_rate": 3.2654875038567954e-05, "loss": 2.0876, "step": 3708 }, { "epoch": 0.2776561301068628, "grad_norm": 1.2658696174621582, "learning_rate": 3.260871790484461e-05, "loss": 1.691, "step": 3709 }, { "epoch": 0.27773099021204123, "grad_norm": 1.226161241531372, "learning_rate": 3.25625870597226e-05, "loss": 2.0919, "step": 3710 }, { "epoch": 0.2778058503172197, "grad_norm": 1.276658535003662, "learning_rate": 3.251648252119704e-05, "loss": 1.9528, "step": 3711 }, { "epoch": 0.27788071042239815, "grad_norm": 1.2434160709381104, "learning_rate": 3.2470404307252756e-05, "loss": 1.887, "step": 3712 }, { "epoch": 0.2779555705275766, "grad_norm": 1.1952719688415527, "learning_rate": 3.24243524358644e-05, "loss": 1.7656, "step": 3713 }, { "epoch": 0.27803043063275507, "grad_norm": 1.1772487163543701, "learning_rate": 3.2378326924996305e-05, "loss": 1.9917, "step": 3714 }, { "epoch": 0.2781052907379335, "grad_norm": 1.2649497985839844, "learning_rate": 3.2332327792602504e-05, "loss": 1.7954, "step": 3715 }, { "epoch": 0.27818015084311193, "grad_norm": 1.3276275396347046, "learning_rate": 3.22863550566268e-05, "loss": 2.1842, "step": 3716 }, { "epoch": 0.27825501094829036, "grad_norm": 1.3530197143554688, "learning_rate": 3.224040873500259e-05, "loss": 2.2962, "step": 3717 }, { "epoch": 0.27832987105346885, "grad_norm": 1.3955507278442383, "learning_rate": 3.219448884565308e-05, "loss": 1.8122, "step": 3718 }, { "epoch": 0.2784047311586473, "grad_norm": 1.336397409439087, "learning_rate": 3.2148595406491076e-05, "loss": 2.149, "step": 3719 }, { "epoch": 0.2784795912638257, "grad_norm": 1.04210364818573, "learning_rate": 3.210272843541911e-05, "loss": 1.9881, "step": 3720 }, { "epoch": 0.2785544513690042, "grad_norm": 1.2580152750015259, "learning_rate": 3.205688795032942e-05, "loss": 2.1728, "step": 3721 }, { "epoch": 0.27862931147418263, "grad_norm": 1.209764838218689, "learning_rate": 3.2011073969103875e-05, "loss": 2.103, "step": 3722 }, { "epoch": 0.27870417157936106, "grad_norm": 1.1044540405273438, "learning_rate": 3.196528650961397e-05, "loss": 1.7157, "step": 3723 }, { "epoch": 0.2787790316845395, "grad_norm": 1.3951267004013062, "learning_rate": 3.19195255897209e-05, "loss": 2.3731, "step": 3724 }, { "epoch": 0.278853891789718, "grad_norm": 1.0687568187713623, "learning_rate": 3.1873791227275516e-05, "loss": 1.8036, "step": 3725 }, { "epoch": 0.2789287518948964, "grad_norm": 1.1896275281906128, "learning_rate": 3.1828083440118314e-05, "loss": 2.1496, "step": 3726 }, { "epoch": 0.27900361200007484, "grad_norm": 1.296722412109375, "learning_rate": 3.178240224607935e-05, "loss": 2.0396, "step": 3727 }, { "epoch": 0.27907847210525333, "grad_norm": 1.3040552139282227, "learning_rate": 3.1736747662978406e-05, "loss": 2.1786, "step": 3728 }, { "epoch": 0.27915333221043176, "grad_norm": 1.2086619138717651, "learning_rate": 3.1691119708624786e-05, "loss": 2.2381, "step": 3729 }, { "epoch": 0.2792281923156102, "grad_norm": 1.2334345579147339, "learning_rate": 3.164551840081748e-05, "loss": 1.6066, "step": 3730 }, { "epoch": 0.2793030524207887, "grad_norm": 0.9677254557609558, "learning_rate": 3.159994375734505e-05, "loss": 1.3692, "step": 3731 }, { "epoch": 0.2793779125259671, "grad_norm": 1.5042107105255127, "learning_rate": 3.1554395795985684e-05, "loss": 2.1305, "step": 3732 }, { "epoch": 0.27945277263114554, "grad_norm": 1.1663800477981567, "learning_rate": 3.150887453450716e-05, "loss": 1.9145, "step": 3733 }, { "epoch": 0.27952763273632397, "grad_norm": 1.3978912830352783, "learning_rate": 3.146337999066676e-05, "loss": 2.0441, "step": 3734 }, { "epoch": 0.27960249284150246, "grad_norm": 1.3679187297821045, "learning_rate": 3.1417912182211494e-05, "loss": 2.2325, "step": 3735 }, { "epoch": 0.2796773529466809, "grad_norm": 1.2664083242416382, "learning_rate": 3.137247112687777e-05, "loss": 2.283, "step": 3736 }, { "epoch": 0.2797522130518593, "grad_norm": 1.1854172945022583, "learning_rate": 3.132705684239168e-05, "loss": 1.6917, "step": 3737 }, { "epoch": 0.2798270731570378, "grad_norm": 1.2262346744537354, "learning_rate": 3.1281669346468836e-05, "loss": 1.9595, "step": 3738 }, { "epoch": 0.27990193326221624, "grad_norm": 1.2855292558670044, "learning_rate": 3.1236308656814406e-05, "loss": 1.8628, "step": 3739 }, { "epoch": 0.27997679336739467, "grad_norm": 1.3022968769073486, "learning_rate": 3.119097479112315e-05, "loss": 2.2594, "step": 3740 }, { "epoch": 0.2800516534725731, "grad_norm": 1.2705274820327759, "learning_rate": 3.114566776707922e-05, "loss": 2.3726, "step": 3741 }, { "epoch": 0.2801265135777516, "grad_norm": 1.1676372289657593, "learning_rate": 3.110038760235647e-05, "loss": 1.5686, "step": 3742 }, { "epoch": 0.28020137368293, "grad_norm": 1.3534809350967407, "learning_rate": 3.105513431461814e-05, "loss": 1.9778, "step": 3743 }, { "epoch": 0.28027623378810845, "grad_norm": 1.20972740650177, "learning_rate": 3.100990792151704e-05, "loss": 1.5468, "step": 3744 }, { "epoch": 0.28035109389328694, "grad_norm": 1.1352863311767578, "learning_rate": 3.0964708440695535e-05, "loss": 1.7282, "step": 3745 }, { "epoch": 0.28042595399846537, "grad_norm": 1.1724121570587158, "learning_rate": 3.0919535889785455e-05, "loss": 1.628, "step": 3746 }, { "epoch": 0.2805008141036438, "grad_norm": 1.6344754695892334, "learning_rate": 3.087439028640805e-05, "loss": 1.8243, "step": 3747 }, { "epoch": 0.2805756742088223, "grad_norm": 1.408743977546692, "learning_rate": 3.0829271648174176e-05, "loss": 1.769, "step": 3748 }, { "epoch": 0.2806505343140007, "grad_norm": 1.7425427436828613, "learning_rate": 3.078417999268409e-05, "loss": 2.0765, "step": 3749 }, { "epoch": 0.28072539441917915, "grad_norm": 1.2188204526901245, "learning_rate": 3.0739115337527626e-05, "loss": 2.0641, "step": 3750 }, { "epoch": 0.28072539441917915, "eval_loss": 1.9838340282440186, "eval_runtime": 178.9949, "eval_samples_per_second": 27.934, "eval_steps_per_second": 13.967, "step": 3750 }, { "epoch": 0.2808002545243576, "grad_norm": 1.207412600517273, "learning_rate": 3.0694077700283905e-05, "loss": 1.5137, "step": 3751 }, { "epoch": 0.28087511462953607, "grad_norm": 1.4043002128601074, "learning_rate": 3.064906709852172e-05, "loss": 1.8774, "step": 3752 }, { "epoch": 0.2809499747347145, "grad_norm": 1.354300856590271, "learning_rate": 3.0604083549799126e-05, "loss": 2.5454, "step": 3753 }, { "epoch": 0.28102483483989293, "grad_norm": 1.2400425672531128, "learning_rate": 3.055912707166374e-05, "loss": 2.0635, "step": 3754 }, { "epoch": 0.2810996949450714, "grad_norm": 1.1845600605010986, "learning_rate": 3.0514197681652612e-05, "loss": 1.9901, "step": 3755 }, { "epoch": 0.28117455505024985, "grad_norm": 1.270832896232605, "learning_rate": 3.0469295397292198e-05, "loss": 1.8688, "step": 3756 }, { "epoch": 0.2812494151554283, "grad_norm": 1.3874880075454712, "learning_rate": 3.0424420236098404e-05, "loss": 2.1929, "step": 3757 }, { "epoch": 0.2813242752606067, "grad_norm": 1.1305326223373413, "learning_rate": 3.0379572215576502e-05, "loss": 1.6898, "step": 3758 }, { "epoch": 0.2813991353657852, "grad_norm": 1.2193983793258667, "learning_rate": 3.033475135322126e-05, "loss": 1.7322, "step": 3759 }, { "epoch": 0.28147399547096363, "grad_norm": 1.5692068338394165, "learning_rate": 3.0289957666516733e-05, "loss": 2.1399, "step": 3760 }, { "epoch": 0.28154885557614207, "grad_norm": 1.5422791242599487, "learning_rate": 3.0245191172936506e-05, "loss": 1.9824, "step": 3761 }, { "epoch": 0.28162371568132055, "grad_norm": 1.2794523239135742, "learning_rate": 3.0200451889943483e-05, "loss": 2.0921, "step": 3762 }, { "epoch": 0.281698575786499, "grad_norm": 1.1849145889282227, "learning_rate": 3.0155739834989992e-05, "loss": 1.6286, "step": 3763 }, { "epoch": 0.2817734358916774, "grad_norm": 1.5417006015777588, "learning_rate": 3.0111055025517664e-05, "loss": 2.0103, "step": 3764 }, { "epoch": 0.2818482959968559, "grad_norm": 1.1270604133605957, "learning_rate": 3.0066397478957588e-05, "loss": 1.916, "step": 3765 }, { "epoch": 0.28192315610203433, "grad_norm": 1.491173505783081, "learning_rate": 3.0021767212730177e-05, "loss": 2.0702, "step": 3766 }, { "epoch": 0.28199801620721276, "grad_norm": 1.2720727920532227, "learning_rate": 2.9977164244245247e-05, "loss": 2.2804, "step": 3767 }, { "epoch": 0.2820728763123912, "grad_norm": 1.283512830734253, "learning_rate": 2.9932588590901865e-05, "loss": 1.7359, "step": 3768 }, { "epoch": 0.2821477364175697, "grad_norm": 1.4128354787826538, "learning_rate": 2.9888040270088556e-05, "loss": 1.9504, "step": 3769 }, { "epoch": 0.2822225965227481, "grad_norm": 1.4890402555465698, "learning_rate": 2.9843519299183077e-05, "loss": 2.0908, "step": 3770 }, { "epoch": 0.28229745662792655, "grad_norm": 1.404641032218933, "learning_rate": 2.9799025695552606e-05, "loss": 2.0238, "step": 3771 }, { "epoch": 0.28237231673310503, "grad_norm": 1.175040364265442, "learning_rate": 2.975455947655361e-05, "loss": 2.3378, "step": 3772 }, { "epoch": 0.28244717683828346, "grad_norm": 1.328039526939392, "learning_rate": 2.971012065953187e-05, "loss": 2.2092, "step": 3773 }, { "epoch": 0.2825220369434619, "grad_norm": 1.22098708152771, "learning_rate": 2.9665709261822516e-05, "loss": 1.8155, "step": 3774 }, { "epoch": 0.2825968970486403, "grad_norm": 1.3758236169815063, "learning_rate": 2.9621325300749912e-05, "loss": 2.2831, "step": 3775 }, { "epoch": 0.2826717571538188, "grad_norm": 1.2801865339279175, "learning_rate": 2.9576968793627734e-05, "loss": 1.8186, "step": 3776 }, { "epoch": 0.28274661725899725, "grad_norm": 1.1967885494232178, "learning_rate": 2.9532639757758994e-05, "loss": 1.6329, "step": 3777 }, { "epoch": 0.2828214773641757, "grad_norm": 1.2403509616851807, "learning_rate": 2.948833821043596e-05, "loss": 2.1247, "step": 3778 }, { "epoch": 0.28289633746935416, "grad_norm": 1.2723476886749268, "learning_rate": 2.944406416894019e-05, "loss": 1.8553, "step": 3779 }, { "epoch": 0.2829711975745326, "grad_norm": 1.2781561613082886, "learning_rate": 2.9399817650542526e-05, "loss": 2.2408, "step": 3780 }, { "epoch": 0.283046057679711, "grad_norm": 1.0300153493881226, "learning_rate": 2.9355598672502993e-05, "loss": 1.7825, "step": 3781 }, { "epoch": 0.2831209177848895, "grad_norm": 1.1118957996368408, "learning_rate": 2.9311407252070965e-05, "loss": 1.7311, "step": 3782 }, { "epoch": 0.28319577789006795, "grad_norm": 1.2643485069274902, "learning_rate": 2.9267243406485058e-05, "loss": 1.743, "step": 3783 }, { "epoch": 0.2832706379952464, "grad_norm": 1.30633544921875, "learning_rate": 2.922310715297303e-05, "loss": 1.8252, "step": 3784 }, { "epoch": 0.2833454981004248, "grad_norm": 1.3449517488479614, "learning_rate": 2.9178998508751997e-05, "loss": 1.5536, "step": 3785 }, { "epoch": 0.2834203582056033, "grad_norm": 1.3068872690200806, "learning_rate": 2.913491749102829e-05, "loss": 1.9503, "step": 3786 }, { "epoch": 0.2834952183107817, "grad_norm": 1.6194566488265991, "learning_rate": 2.9090864116997362e-05, "loss": 1.7285, "step": 3787 }, { "epoch": 0.28357007841596016, "grad_norm": 1.3881391286849976, "learning_rate": 2.904683840384398e-05, "loss": 2.4538, "step": 3788 }, { "epoch": 0.28364493852113865, "grad_norm": 1.2729352712631226, "learning_rate": 2.90028403687421e-05, "loss": 2.0169, "step": 3789 }, { "epoch": 0.2837197986263171, "grad_norm": 1.3883562088012695, "learning_rate": 2.8958870028854878e-05, "loss": 1.6042, "step": 3790 }, { "epoch": 0.2837946587314955, "grad_norm": 1.3669692277908325, "learning_rate": 2.891492740133468e-05, "loss": 2.3127, "step": 3791 }, { "epoch": 0.28386951883667394, "grad_norm": 1.3271369934082031, "learning_rate": 2.8871012503323025e-05, "loss": 2.2466, "step": 3792 }, { "epoch": 0.2839443789418524, "grad_norm": 1.4040052890777588, "learning_rate": 2.88271253519506e-05, "loss": 1.8344, "step": 3793 }, { "epoch": 0.28401923904703086, "grad_norm": 1.199990153312683, "learning_rate": 2.8783265964337324e-05, "loss": 2.1366, "step": 3794 }, { "epoch": 0.2840940991522093, "grad_norm": 1.1696723699569702, "learning_rate": 2.8739434357592276e-05, "loss": 1.6939, "step": 3795 }, { "epoch": 0.2841689592573878, "grad_norm": 1.2926928997039795, "learning_rate": 2.869563054881368e-05, "loss": 1.624, "step": 3796 }, { "epoch": 0.2842438193625662, "grad_norm": 1.1222649812698364, "learning_rate": 2.8651854555088954e-05, "loss": 1.4962, "step": 3797 }, { "epoch": 0.28431867946774464, "grad_norm": 1.1762524843215942, "learning_rate": 2.860810639349456e-05, "loss": 1.5146, "step": 3798 }, { "epoch": 0.2843935395729231, "grad_norm": 1.1004385948181152, "learning_rate": 2.856438608109626e-05, "loss": 2.2298, "step": 3799 }, { "epoch": 0.28446839967810156, "grad_norm": 1.3233323097229004, "learning_rate": 2.8520693634948803e-05, "loss": 2.1355, "step": 3800 }, { "epoch": 0.28454325978328, "grad_norm": 1.1481561660766602, "learning_rate": 2.847702907209615e-05, "loss": 1.5756, "step": 3801 }, { "epoch": 0.2846181198884584, "grad_norm": 1.297219157218933, "learning_rate": 2.843339240957139e-05, "loss": 1.97, "step": 3802 }, { "epoch": 0.2846929799936369, "grad_norm": 1.4713846445083618, "learning_rate": 2.8389783664396717e-05, "loss": 2.0447, "step": 3803 }, { "epoch": 0.28476784009881534, "grad_norm": 1.184596300125122, "learning_rate": 2.834620285358338e-05, "loss": 2.0419, "step": 3804 }, { "epoch": 0.28484270020399377, "grad_norm": 1.2929202318191528, "learning_rate": 2.8302649994131802e-05, "loss": 1.7204, "step": 3805 }, { "epoch": 0.28491756030917226, "grad_norm": 1.2498064041137695, "learning_rate": 2.825912510303147e-05, "loss": 2.2631, "step": 3806 }, { "epoch": 0.2849924204143507, "grad_norm": 1.121712327003479, "learning_rate": 2.8215628197261e-05, "loss": 2.0473, "step": 3807 }, { "epoch": 0.2850672805195291, "grad_norm": 1.2431342601776123, "learning_rate": 2.8172159293788003e-05, "loss": 1.7352, "step": 3808 }, { "epoch": 0.28514214062470755, "grad_norm": 1.3346837759017944, "learning_rate": 2.8128718409569287e-05, "loss": 1.7899, "step": 3809 }, { "epoch": 0.28521700072988604, "grad_norm": 1.4943522214889526, "learning_rate": 2.808530556155059e-05, "loss": 2.2383, "step": 3810 }, { "epoch": 0.28529186083506447, "grad_norm": 1.1788746118545532, "learning_rate": 2.804192076666683e-05, "loss": 1.7101, "step": 3811 }, { "epoch": 0.2853667209402429, "grad_norm": 1.0839083194732666, "learning_rate": 2.7998564041841935e-05, "loss": 1.9712, "step": 3812 }, { "epoch": 0.2854415810454214, "grad_norm": 1.5021429061889648, "learning_rate": 2.79552354039889e-05, "loss": 2.294, "step": 3813 }, { "epoch": 0.2855164411505998, "grad_norm": 1.3215985298156738, "learning_rate": 2.791193487000977e-05, "loss": 1.3718, "step": 3814 }, { "epoch": 0.28559130125577825, "grad_norm": 1.183494210243225, "learning_rate": 2.7868662456795547e-05, "loss": 2.443, "step": 3815 }, { "epoch": 0.28566616136095674, "grad_norm": 1.2543389797210693, "learning_rate": 2.782541818122639e-05, "loss": 2.274, "step": 3816 }, { "epoch": 0.28574102146613517, "grad_norm": 1.1998188495635986, "learning_rate": 2.778220206017137e-05, "loss": 2.285, "step": 3817 }, { "epoch": 0.2858158815713136, "grad_norm": 1.0411428213119507, "learning_rate": 2.7739014110488636e-05, "loss": 1.8445, "step": 3818 }, { "epoch": 0.28589074167649203, "grad_norm": 1.279973030090332, "learning_rate": 2.769585434902534e-05, "loss": 1.8415, "step": 3819 }, { "epoch": 0.2859656017816705, "grad_norm": 1.2691515684127808, "learning_rate": 2.7652722792617657e-05, "loss": 2.137, "step": 3820 }, { "epoch": 0.28604046188684895, "grad_norm": 1.3299516439437866, "learning_rate": 2.7609619458090698e-05, "loss": 2.3958, "step": 3821 }, { "epoch": 0.2861153219920274, "grad_norm": 1.193332552909851, "learning_rate": 2.7566544362258595e-05, "loss": 1.7505, "step": 3822 }, { "epoch": 0.28619018209720587, "grad_norm": 1.263997197151184, "learning_rate": 2.75234975219245e-05, "loss": 2.3475, "step": 3823 }, { "epoch": 0.2862650422023843, "grad_norm": 1.2018673419952393, "learning_rate": 2.7480478953880528e-05, "loss": 1.5956, "step": 3824 }, { "epoch": 0.28633990230756273, "grad_norm": 1.3443028926849365, "learning_rate": 2.7437488674907707e-05, "loss": 2.1856, "step": 3825 }, { "epoch": 0.28641476241274116, "grad_norm": 1.2892485857009888, "learning_rate": 2.7394526701776112e-05, "loss": 1.6756, "step": 3826 }, { "epoch": 0.28648962251791965, "grad_norm": 1.324503779411316, "learning_rate": 2.7351593051244706e-05, "loss": 1.9087, "step": 3827 }, { "epoch": 0.2865644826230981, "grad_norm": 1.4397438764572144, "learning_rate": 2.7308687740061444e-05, "loss": 2.1285, "step": 3828 }, { "epoch": 0.2866393427282765, "grad_norm": 1.4441887140274048, "learning_rate": 2.726581078496323e-05, "loss": 1.8802, "step": 3829 }, { "epoch": 0.286714202833455, "grad_norm": 1.3619921207427979, "learning_rate": 2.7222962202675885e-05, "loss": 1.4232, "step": 3830 }, { "epoch": 0.28678906293863343, "grad_norm": 1.1683720350265503, "learning_rate": 2.718014200991421e-05, "loss": 1.6594, "step": 3831 }, { "epoch": 0.28686392304381186, "grad_norm": 1.1547281742095947, "learning_rate": 2.713735022338183e-05, "loss": 2.295, "step": 3832 }, { "epoch": 0.28693878314899035, "grad_norm": 1.365252137184143, "learning_rate": 2.709458685977141e-05, "loss": 1.7618, "step": 3833 }, { "epoch": 0.2870136432541688, "grad_norm": 1.2595072984695435, "learning_rate": 2.7051851935764416e-05, "loss": 1.6897, "step": 3834 }, { "epoch": 0.2870885033593472, "grad_norm": 1.60390043258667, "learning_rate": 2.7009145468031306e-05, "loss": 2.4944, "step": 3835 }, { "epoch": 0.28716336346452564, "grad_norm": 1.312190294265747, "learning_rate": 2.6966467473231395e-05, "loss": 1.8372, "step": 3836 }, { "epoch": 0.28723822356970413, "grad_norm": 1.3880176544189453, "learning_rate": 2.69238179680129e-05, "loss": 1.8575, "step": 3837 }, { "epoch": 0.28731308367488256, "grad_norm": 1.351420283317566, "learning_rate": 2.6881196969012968e-05, "loss": 2.4426, "step": 3838 }, { "epoch": 0.287387943780061, "grad_norm": 1.270731806755066, "learning_rate": 2.6838604492857523e-05, "loss": 1.823, "step": 3839 }, { "epoch": 0.2874628038852395, "grad_norm": 1.6293742656707764, "learning_rate": 2.6796040556161483e-05, "loss": 2.2879, "step": 3840 }, { "epoch": 0.2875376639904179, "grad_norm": 1.590905785560608, "learning_rate": 2.6753505175528504e-05, "loss": 2.5961, "step": 3841 }, { "epoch": 0.28761252409559634, "grad_norm": 1.1352043151855469, "learning_rate": 2.671099836755121e-05, "loss": 1.7162, "step": 3842 }, { "epoch": 0.2876873842007748, "grad_norm": 1.3536630868911743, "learning_rate": 2.666852014881104e-05, "loss": 2.3501, "step": 3843 }, { "epoch": 0.28776224430595326, "grad_norm": 1.0722090005874634, "learning_rate": 2.66260705358783e-05, "loss": 2.1718, "step": 3844 }, { "epoch": 0.2878371044111317, "grad_norm": 1.6106789112091064, "learning_rate": 2.6583649545312085e-05, "loss": 1.9588, "step": 3845 }, { "epoch": 0.2879119645163101, "grad_norm": 1.2883594036102295, "learning_rate": 2.6541257193660374e-05, "loss": 2.3625, "step": 3846 }, { "epoch": 0.2879868246214886, "grad_norm": 1.3388254642486572, "learning_rate": 2.6498893497459955e-05, "loss": 1.9431, "step": 3847 }, { "epoch": 0.28806168472666704, "grad_norm": 1.3169176578521729, "learning_rate": 2.6456558473236483e-05, "loss": 1.9479, "step": 3848 }, { "epoch": 0.2881365448318455, "grad_norm": 1.3699671030044556, "learning_rate": 2.6414252137504324e-05, "loss": 1.8837, "step": 3849 }, { "epoch": 0.28821140493702396, "grad_norm": 1.1597939729690552, "learning_rate": 2.637197450676677e-05, "loss": 1.7481, "step": 3850 }, { "epoch": 0.2882862650422024, "grad_norm": 1.1889296770095825, "learning_rate": 2.632972559751581e-05, "loss": 2.3349, "step": 3851 }, { "epoch": 0.2883611251473808, "grad_norm": 1.5291746854782104, "learning_rate": 2.6287505426232294e-05, "loss": 1.8609, "step": 3852 }, { "epoch": 0.28843598525255926, "grad_norm": 1.4949208498001099, "learning_rate": 2.624531400938588e-05, "loss": 2.0463, "step": 3853 }, { "epoch": 0.28851084535773774, "grad_norm": 1.0496788024902344, "learning_rate": 2.6203151363434952e-05, "loss": 2.6265, "step": 3854 }, { "epoch": 0.2885857054629162, "grad_norm": 1.2499306201934814, "learning_rate": 2.6161017504826736e-05, "loss": 1.8998, "step": 3855 }, { "epoch": 0.2886605655680946, "grad_norm": 1.2711821794509888, "learning_rate": 2.6118912449997147e-05, "loss": 2.193, "step": 3856 }, { "epoch": 0.2887354256732731, "grad_norm": 1.3939212560653687, "learning_rate": 2.6076836215370893e-05, "loss": 1.9669, "step": 3857 }, { "epoch": 0.2888102857784515, "grad_norm": 1.1545202732086182, "learning_rate": 2.603478881736148e-05, "loss": 2.0835, "step": 3858 }, { "epoch": 0.28888514588362996, "grad_norm": 1.378939151763916, "learning_rate": 2.5992770272371126e-05, "loss": 2.6535, "step": 3859 }, { "epoch": 0.2889600059888084, "grad_norm": 1.2740567922592163, "learning_rate": 2.5950780596790813e-05, "loss": 2.0284, "step": 3860 }, { "epoch": 0.2890348660939869, "grad_norm": 1.1165826320648193, "learning_rate": 2.5908819807000283e-05, "loss": 2.0333, "step": 3861 }, { "epoch": 0.2891097261991653, "grad_norm": 1.2753795385360718, "learning_rate": 2.586688791936792e-05, "loss": 2.0572, "step": 3862 }, { "epoch": 0.28918458630434374, "grad_norm": 1.466369867324829, "learning_rate": 2.582498495025093e-05, "loss": 1.7328, "step": 3863 }, { "epoch": 0.2892594464095222, "grad_norm": 1.3811403512954712, "learning_rate": 2.5783110915995223e-05, "loss": 2.5983, "step": 3864 }, { "epoch": 0.28933430651470066, "grad_norm": 1.4060243368148804, "learning_rate": 2.5741265832935346e-05, "loss": 1.5054, "step": 3865 }, { "epoch": 0.2894091666198791, "grad_norm": 1.3294570446014404, "learning_rate": 2.569944971739464e-05, "loss": 2.5684, "step": 3866 }, { "epoch": 0.2894840267250576, "grad_norm": 1.1702747344970703, "learning_rate": 2.565766258568514e-05, "loss": 1.8872, "step": 3867 }, { "epoch": 0.289558886830236, "grad_norm": 1.4963657855987549, "learning_rate": 2.5615904454107496e-05, "loss": 1.897, "step": 3868 }, { "epoch": 0.28963374693541444, "grad_norm": 1.4837406873703003, "learning_rate": 2.557417533895111e-05, "loss": 1.7879, "step": 3869 }, { "epoch": 0.28970860704059287, "grad_norm": 1.1382025480270386, "learning_rate": 2.553247525649407e-05, "loss": 1.7544, "step": 3870 }, { "epoch": 0.28978346714577136, "grad_norm": 1.1270805597305298, "learning_rate": 2.549080422300312e-05, "loss": 1.9009, "step": 3871 }, { "epoch": 0.2898583272509498, "grad_norm": 1.2043201923370361, "learning_rate": 2.5449162254733693e-05, "loss": 1.972, "step": 3872 }, { "epoch": 0.2899331873561282, "grad_norm": 1.2430338859558105, "learning_rate": 2.540754936792985e-05, "loss": 1.3916, "step": 3873 }, { "epoch": 0.2900080474613067, "grad_norm": 1.4161804914474487, "learning_rate": 2.536596557882428e-05, "loss": 2.2737, "step": 3874 }, { "epoch": 0.29008290756648514, "grad_norm": 1.2108774185180664, "learning_rate": 2.5324410903638406e-05, "loss": 2.2077, "step": 3875 }, { "epoch": 0.29015776767166357, "grad_norm": 1.3517228364944458, "learning_rate": 2.528288535858223e-05, "loss": 2.3026, "step": 3876 }, { "epoch": 0.290232627776842, "grad_norm": 1.3244082927703857, "learning_rate": 2.5241388959854438e-05, "loss": 2.0008, "step": 3877 }, { "epoch": 0.2903074878820205, "grad_norm": 1.2876060009002686, "learning_rate": 2.519992172364234e-05, "loss": 1.8389, "step": 3878 }, { "epoch": 0.2903823479871989, "grad_norm": 1.237341284751892, "learning_rate": 2.515848366612179e-05, "loss": 1.9232, "step": 3879 }, { "epoch": 0.29045720809237735, "grad_norm": 1.620824933052063, "learning_rate": 2.511707480345734e-05, "loss": 2.1003, "step": 3880 }, { "epoch": 0.29053206819755584, "grad_norm": 1.4315983057022095, "learning_rate": 2.5075695151802193e-05, "loss": 2.2431, "step": 3881 }, { "epoch": 0.29060692830273427, "grad_norm": 1.0967376232147217, "learning_rate": 2.503434472729801e-05, "loss": 1.6695, "step": 3882 }, { "epoch": 0.2906817884079127, "grad_norm": 1.364526391029358, "learning_rate": 2.4993023546075177e-05, "loss": 1.9768, "step": 3883 }, { "epoch": 0.2907566485130912, "grad_norm": 1.2855855226516724, "learning_rate": 2.4951731624252662e-05, "loss": 2.2199, "step": 3884 }, { "epoch": 0.2908315086182696, "grad_norm": 1.3406977653503418, "learning_rate": 2.491046897793795e-05, "loss": 2.5473, "step": 3885 }, { "epoch": 0.29090636872344805, "grad_norm": 1.3207018375396729, "learning_rate": 2.486923562322715e-05, "loss": 2.2578, "step": 3886 }, { "epoch": 0.2909812288286265, "grad_norm": 1.2394272089004517, "learning_rate": 2.482803157620497e-05, "loss": 1.6384, "step": 3887 }, { "epoch": 0.29105608893380497, "grad_norm": 1.184557557106018, "learning_rate": 2.478685685294463e-05, "loss": 2.0516, "step": 3888 }, { "epoch": 0.2911309490389834, "grad_norm": 1.4188936948776245, "learning_rate": 2.4745711469507994e-05, "loss": 2.2441, "step": 3889 }, { "epoch": 0.29120580914416183, "grad_norm": 1.2565886974334717, "learning_rate": 2.4704595441945376e-05, "loss": 2.0275, "step": 3890 }, { "epoch": 0.2912806692493403, "grad_norm": 1.3128793239593506, "learning_rate": 2.4663508786295664e-05, "loss": 1.8932, "step": 3891 }, { "epoch": 0.29135552935451875, "grad_norm": 1.1746132373809814, "learning_rate": 2.4622451518586353e-05, "loss": 2.0613, "step": 3892 }, { "epoch": 0.2914303894596972, "grad_norm": 1.2960000038146973, "learning_rate": 2.4581423654833414e-05, "loss": 1.6676, "step": 3893 }, { "epoch": 0.2915052495648756, "grad_norm": 1.4135433435440063, "learning_rate": 2.4540425211041384e-05, "loss": 1.6098, "step": 3894 }, { "epoch": 0.2915801096700541, "grad_norm": 1.46435546875, "learning_rate": 2.449945620320333e-05, "loss": 2.1194, "step": 3895 }, { "epoch": 0.29165496977523253, "grad_norm": 1.172973394393921, "learning_rate": 2.4458516647300766e-05, "loss": 1.7492, "step": 3896 }, { "epoch": 0.29172982988041096, "grad_norm": 1.2454965114593506, "learning_rate": 2.4417606559303795e-05, "loss": 2.3048, "step": 3897 }, { "epoch": 0.29180468998558945, "grad_norm": 1.321365475654602, "learning_rate": 2.4376725955170965e-05, "loss": 2.2254, "step": 3898 }, { "epoch": 0.2918795500907679, "grad_norm": 1.2571951150894165, "learning_rate": 2.433587485084937e-05, "loss": 2.1635, "step": 3899 }, { "epoch": 0.2919544101959463, "grad_norm": 1.2208398580551147, "learning_rate": 2.429505326227457e-05, "loss": 2.1594, "step": 3900 }, { "epoch": 0.2919544101959463, "eval_loss": 1.9793667793273926, "eval_runtime": 179.0136, "eval_samples_per_second": 27.931, "eval_steps_per_second": 13.965, "step": 3900 }, { "epoch": 0.2920292703011248, "grad_norm": 1.4668182134628296, "learning_rate": 2.4254261205370666e-05, "loss": 1.93, "step": 3901 }, { "epoch": 0.29210413040630323, "grad_norm": 1.3168736696243286, "learning_rate": 2.421349869605013e-05, "loss": 2.1536, "step": 3902 }, { "epoch": 0.29217899051148166, "grad_norm": 1.3177366256713867, "learning_rate": 2.417276575021399e-05, "loss": 1.9181, "step": 3903 }, { "epoch": 0.2922538506166601, "grad_norm": 1.129365086555481, "learning_rate": 2.4132062383751717e-05, "loss": 1.911, "step": 3904 }, { "epoch": 0.2923287107218386, "grad_norm": 1.244715929031372, "learning_rate": 2.40913886125413e-05, "loss": 1.9495, "step": 3905 }, { "epoch": 0.292403570827017, "grad_norm": 1.4900572299957275, "learning_rate": 2.405074445244906e-05, "loss": 2.2217, "step": 3906 }, { "epoch": 0.29247843093219544, "grad_norm": 1.1483628749847412, "learning_rate": 2.4010129919329905e-05, "loss": 1.8573, "step": 3907 }, { "epoch": 0.29255329103737393, "grad_norm": 1.3607122898101807, "learning_rate": 2.396954502902705e-05, "loss": 2.1321, "step": 3908 }, { "epoch": 0.29262815114255236, "grad_norm": 1.3134105205535889, "learning_rate": 2.3928989797372247e-05, "loss": 1.7242, "step": 3909 }, { "epoch": 0.2927030112477308, "grad_norm": 1.2184776067733765, "learning_rate": 2.388846424018566e-05, "loss": 1.7699, "step": 3910 }, { "epoch": 0.2927778713529092, "grad_norm": 1.1946921348571777, "learning_rate": 2.3847968373275855e-05, "loss": 1.6261, "step": 3911 }, { "epoch": 0.2928527314580877, "grad_norm": 1.4854552745819092, "learning_rate": 2.380750221243986e-05, "loss": 1.381, "step": 3912 }, { "epoch": 0.29292759156326614, "grad_norm": 1.144334077835083, "learning_rate": 2.376706577346304e-05, "loss": 1.927, "step": 3913 }, { "epoch": 0.2930024516684446, "grad_norm": 1.344338297843933, "learning_rate": 2.3726659072119196e-05, "loss": 1.9027, "step": 3914 }, { "epoch": 0.29307731177362306, "grad_norm": 1.7256338596343994, "learning_rate": 2.3686282124170556e-05, "loss": 2.1617, "step": 3915 }, { "epoch": 0.2931521718788015, "grad_norm": 1.284041166305542, "learning_rate": 2.3645934945367733e-05, "loss": 1.9349, "step": 3916 }, { "epoch": 0.2932270319839799, "grad_norm": 1.3531914949417114, "learning_rate": 2.360561755144972e-05, "loss": 1.8634, "step": 3917 }, { "epoch": 0.2933018920891584, "grad_norm": 1.1813284158706665, "learning_rate": 2.3565329958143913e-05, "loss": 1.7908, "step": 3918 }, { "epoch": 0.29337675219433684, "grad_norm": 1.4031023979187012, "learning_rate": 2.352507218116601e-05, "loss": 2.0208, "step": 3919 }, { "epoch": 0.2934516122995153, "grad_norm": 1.3516638278961182, "learning_rate": 2.3484844236220148e-05, "loss": 1.9745, "step": 3920 }, { "epoch": 0.2935264724046937, "grad_norm": 1.2264529466629028, "learning_rate": 2.344464613899885e-05, "loss": 1.7497, "step": 3921 }, { "epoch": 0.2936013325098722, "grad_norm": 1.0892685651779175, "learning_rate": 2.3404477905182887e-05, "loss": 2.2278, "step": 3922 }, { "epoch": 0.2936761926150506, "grad_norm": 1.180606484413147, "learning_rate": 2.3364339550441473e-05, "loss": 1.8234, "step": 3923 }, { "epoch": 0.29375105272022906, "grad_norm": 1.2137155532836914, "learning_rate": 2.332423109043218e-05, "loss": 2.1253, "step": 3924 }, { "epoch": 0.29382591282540754, "grad_norm": 1.5196802616119385, "learning_rate": 2.3284152540800818e-05, "loss": 2.4126, "step": 3925 }, { "epoch": 0.293900772930586, "grad_norm": 1.2818958759307861, "learning_rate": 2.3244103917181626e-05, "loss": 2.25, "step": 3926 }, { "epoch": 0.2939756330357644, "grad_norm": 1.2143261432647705, "learning_rate": 2.3204085235197115e-05, "loss": 2.2989, "step": 3927 }, { "epoch": 0.29405049314094284, "grad_norm": 1.35483717918396, "learning_rate": 2.3164096510458144e-05, "loss": 1.858, "step": 3928 }, { "epoch": 0.2941253532461213, "grad_norm": 1.4319674968719482, "learning_rate": 2.3124137758563902e-05, "loss": 1.8885, "step": 3929 }, { "epoch": 0.29420021335129976, "grad_norm": 1.217708706855774, "learning_rate": 2.3084208995101818e-05, "loss": 1.8481, "step": 3930 }, { "epoch": 0.2942750734564782, "grad_norm": 1.5711474418640137, "learning_rate": 2.3044310235647713e-05, "loss": 2.4268, "step": 3931 }, { "epoch": 0.2943499335616567, "grad_norm": 1.2367194890975952, "learning_rate": 2.3004441495765594e-05, "loss": 1.6657, "step": 3932 }, { "epoch": 0.2944247936668351, "grad_norm": 1.273148536682129, "learning_rate": 2.296460279100785e-05, "loss": 2.0576, "step": 3933 }, { "epoch": 0.29449965377201354, "grad_norm": 1.3977408409118652, "learning_rate": 2.292479413691513e-05, "loss": 1.9091, "step": 3934 }, { "epoch": 0.294574513877192, "grad_norm": 1.3213720321655273, "learning_rate": 2.2885015549016354e-05, "loss": 2.145, "step": 3935 }, { "epoch": 0.29464937398237045, "grad_norm": 1.151888132095337, "learning_rate": 2.2845267042828733e-05, "loss": 1.8677, "step": 3936 }, { "epoch": 0.2947242340875489, "grad_norm": 1.0984301567077637, "learning_rate": 2.2805548633857677e-05, "loss": 2.1509, "step": 3937 }, { "epoch": 0.2947990941927273, "grad_norm": 1.2692869901657104, "learning_rate": 2.2765860337596957e-05, "loss": 2.2343, "step": 3938 }, { "epoch": 0.2948739542979058, "grad_norm": 1.2791962623596191, "learning_rate": 2.2726202169528476e-05, "loss": 2.1334, "step": 3939 }, { "epoch": 0.29494881440308424, "grad_norm": 1.3698170185089111, "learning_rate": 2.2686574145122497e-05, "loss": 2.5123, "step": 3940 }, { "epoch": 0.29502367450826267, "grad_norm": 1.146588921546936, "learning_rate": 2.2646976279837463e-05, "loss": 1.8351, "step": 3941 }, { "epoch": 0.29509853461344115, "grad_norm": 1.2131917476654053, "learning_rate": 2.2607408589120104e-05, "loss": 1.715, "step": 3942 }, { "epoch": 0.2951733947186196, "grad_norm": 1.2357946634292603, "learning_rate": 2.2567871088405278e-05, "loss": 1.4484, "step": 3943 }, { "epoch": 0.295248254823798, "grad_norm": 1.3810207843780518, "learning_rate": 2.252836379311618e-05, "loss": 2.2864, "step": 3944 }, { "epoch": 0.29532311492897645, "grad_norm": 1.1768088340759277, "learning_rate": 2.2488886718664148e-05, "loss": 1.6061, "step": 3945 }, { "epoch": 0.29539797503415494, "grad_norm": 1.233165979385376, "learning_rate": 2.2449439880448796e-05, "loss": 2.0962, "step": 3946 }, { "epoch": 0.29547283513933337, "grad_norm": 1.4975526332855225, "learning_rate": 2.241002329385785e-05, "loss": 2.3547, "step": 3947 }, { "epoch": 0.2955476952445118, "grad_norm": 1.2563834190368652, "learning_rate": 2.2370636974267346e-05, "loss": 1.8109, "step": 3948 }, { "epoch": 0.2956225553496903, "grad_norm": 1.3371756076812744, "learning_rate": 2.23312809370414e-05, "loss": 2.4661, "step": 3949 }, { "epoch": 0.2956974154548687, "grad_norm": 1.2190178632736206, "learning_rate": 2.229195519753239e-05, "loss": 1.9709, "step": 3950 }, { "epoch": 0.29577227556004715, "grad_norm": 1.3788176774978638, "learning_rate": 2.2252659771080864e-05, "loss": 1.9803, "step": 3951 }, { "epoch": 0.29584713566522564, "grad_norm": 1.316563367843628, "learning_rate": 2.221339467301553e-05, "loss": 1.9629, "step": 3952 }, { "epoch": 0.29592199577040407, "grad_norm": 1.111270785331726, "learning_rate": 2.2174159918653313e-05, "loss": 1.2139, "step": 3953 }, { "epoch": 0.2959968558755825, "grad_norm": 1.4404094219207764, "learning_rate": 2.213495552329923e-05, "loss": 2.0969, "step": 3954 }, { "epoch": 0.29607171598076093, "grad_norm": 1.2391482591629028, "learning_rate": 2.209578150224645e-05, "loss": 1.5772, "step": 3955 }, { "epoch": 0.2961465760859394, "grad_norm": 1.3997584581375122, "learning_rate": 2.2056637870776354e-05, "loss": 2.0066, "step": 3956 }, { "epoch": 0.29622143619111785, "grad_norm": 1.4698848724365234, "learning_rate": 2.201752464415846e-05, "loss": 1.8869, "step": 3957 }, { "epoch": 0.2962962962962963, "grad_norm": 1.315360426902771, "learning_rate": 2.197844183765041e-05, "loss": 2.0634, "step": 3958 }, { "epoch": 0.29637115640147477, "grad_norm": 1.3423080444335938, "learning_rate": 2.1939389466497994e-05, "loss": 2.0696, "step": 3959 }, { "epoch": 0.2964460165066532, "grad_norm": 1.145530343055725, "learning_rate": 2.1900367545935063e-05, "loss": 1.6263, "step": 3960 }, { "epoch": 0.29652087661183163, "grad_norm": 1.2749954462051392, "learning_rate": 2.186137609118367e-05, "loss": 2.27, "step": 3961 }, { "epoch": 0.29659573671701006, "grad_norm": 1.2191940546035767, "learning_rate": 2.1822415117453987e-05, "loss": 1.5782, "step": 3962 }, { "epoch": 0.29667059682218855, "grad_norm": 1.309130072593689, "learning_rate": 2.1783484639944195e-05, "loss": 1.5729, "step": 3963 }, { "epoch": 0.296745456927367, "grad_norm": 1.11361563205719, "learning_rate": 2.1744584673840684e-05, "loss": 1.8517, "step": 3964 }, { "epoch": 0.2968203170325454, "grad_norm": 1.1047282218933105, "learning_rate": 2.1705715234317935e-05, "loss": 1.8375, "step": 3965 }, { "epoch": 0.2968951771377239, "grad_norm": 1.182600975036621, "learning_rate": 2.1666876336538433e-05, "loss": 1.6632, "step": 3966 }, { "epoch": 0.29697003724290233, "grad_norm": 1.3028936386108398, "learning_rate": 2.1628067995652823e-05, "loss": 1.9735, "step": 3967 }, { "epoch": 0.29704489734808076, "grad_norm": 1.4734143018722534, "learning_rate": 2.158929022679983e-05, "loss": 2.0658, "step": 3968 }, { "epoch": 0.29711975745325925, "grad_norm": 1.6597189903259277, "learning_rate": 2.155054304510623e-05, "loss": 2.0485, "step": 3969 }, { "epoch": 0.2971946175584377, "grad_norm": 1.329876184463501, "learning_rate": 2.1511826465686914e-05, "loss": 1.8855, "step": 3970 }, { "epoch": 0.2972694776636161, "grad_norm": 1.265824317932129, "learning_rate": 2.147314050364474e-05, "loss": 1.7649, "step": 3971 }, { "epoch": 0.29734433776879454, "grad_norm": 1.5167194604873657, "learning_rate": 2.1434485174070683e-05, "loss": 2.1565, "step": 3972 }, { "epoch": 0.29741919787397303, "grad_norm": 1.3801146745681763, "learning_rate": 2.1395860492043763e-05, "loss": 2.0057, "step": 3973 }, { "epoch": 0.29749405797915146, "grad_norm": 1.248510479927063, "learning_rate": 2.1357266472631066e-05, "loss": 1.668, "step": 3974 }, { "epoch": 0.2975689180843299, "grad_norm": 1.1799479722976685, "learning_rate": 2.1318703130887686e-05, "loss": 1.9234, "step": 3975 }, { "epoch": 0.2976437781895084, "grad_norm": 1.2091654539108276, "learning_rate": 2.1280170481856797e-05, "loss": 2.2018, "step": 3976 }, { "epoch": 0.2977186382946868, "grad_norm": 1.367374062538147, "learning_rate": 2.1241668540569494e-05, "loss": 1.9326, "step": 3977 }, { "epoch": 0.29779349839986524, "grad_norm": 1.3511207103729248, "learning_rate": 2.1203197322045032e-05, "loss": 2.149, "step": 3978 }, { "epoch": 0.2978683585050437, "grad_norm": 1.2766677141189575, "learning_rate": 2.1164756841290556e-05, "loss": 2.1218, "step": 3979 }, { "epoch": 0.29794321861022216, "grad_norm": 1.1204276084899902, "learning_rate": 2.1126347113301292e-05, "loss": 1.692, "step": 3980 }, { "epoch": 0.2980180787154006, "grad_norm": 1.3864918947219849, "learning_rate": 2.1087968153060467e-05, "loss": 1.9604, "step": 3981 }, { "epoch": 0.298092938820579, "grad_norm": 1.2093075513839722, "learning_rate": 2.1049619975539315e-05, "loss": 1.5394, "step": 3982 }, { "epoch": 0.2981677989257575, "grad_norm": 1.262485146522522, "learning_rate": 2.101130259569698e-05, "loss": 2.1278, "step": 3983 }, { "epoch": 0.29824265903093594, "grad_norm": 1.4482489824295044, "learning_rate": 2.0973016028480684e-05, "loss": 2.4419, "step": 3984 }, { "epoch": 0.2983175191361144, "grad_norm": 1.3206839561462402, "learning_rate": 2.0934760288825616e-05, "loss": 1.987, "step": 3985 }, { "epoch": 0.29839237924129286, "grad_norm": 1.3289496898651123, "learning_rate": 2.0896535391654914e-05, "loss": 2.4664, "step": 3986 }, { "epoch": 0.2984672393464713, "grad_norm": 1.1732226610183716, "learning_rate": 2.0858341351879664e-05, "loss": 1.7513, "step": 3987 }, { "epoch": 0.2985420994516497, "grad_norm": 1.2676868438720703, "learning_rate": 2.0820178184398985e-05, "loss": 2.103, "step": 3988 }, { "epoch": 0.29861695955682815, "grad_norm": 1.315244197845459, "learning_rate": 2.0782045904099866e-05, "loss": 2.6582, "step": 3989 }, { "epoch": 0.29869181966200664, "grad_norm": 1.2877997159957886, "learning_rate": 2.0743944525857305e-05, "loss": 2.251, "step": 3990 }, { "epoch": 0.2987666797671851, "grad_norm": 1.411036491394043, "learning_rate": 2.0705874064534248e-05, "loss": 1.7806, "step": 3991 }, { "epoch": 0.2988415398723635, "grad_norm": 1.1227136850357056, "learning_rate": 2.0667834534981556e-05, "loss": 1.4497, "step": 3992 }, { "epoch": 0.298916399977542, "grad_norm": 1.2739559412002563, "learning_rate": 2.0629825952038062e-05, "loss": 2.0728, "step": 3993 }, { "epoch": 0.2989912600827204, "grad_norm": 1.4167178869247437, "learning_rate": 2.059184833053044e-05, "loss": 1.8699, "step": 3994 }, { "epoch": 0.29906612018789885, "grad_norm": 1.127899408340454, "learning_rate": 2.05539016852734e-05, "loss": 2.2993, "step": 3995 }, { "epoch": 0.2991409802930773, "grad_norm": 1.2067402601242065, "learning_rate": 2.0515986031069466e-05, "loss": 2.021, "step": 3996 }, { "epoch": 0.2992158403982558, "grad_norm": 1.2495208978652954, "learning_rate": 2.0478101382709146e-05, "loss": 2.0169, "step": 3997 }, { "epoch": 0.2992907005034342, "grad_norm": 1.5436145067214966, "learning_rate": 2.044024775497082e-05, "loss": 2.4031, "step": 3998 }, { "epoch": 0.29936556060861264, "grad_norm": 1.337142825126648, "learning_rate": 2.0402425162620796e-05, "loss": 2.3839, "step": 3999 }, { "epoch": 0.2994404207137911, "grad_norm": 1.1541532278060913, "learning_rate": 2.036463362041321e-05, "loss": 2.2192, "step": 4000 }, { "epoch": 0.29951528081896955, "grad_norm": 1.2649480104446411, "learning_rate": 2.0326873143090153e-05, "loss": 2.1061, "step": 4001 }, { "epoch": 0.299590140924148, "grad_norm": 1.814571499824524, "learning_rate": 2.0289143745381577e-05, "loss": 2.1504, "step": 4002 }, { "epoch": 0.29966500102932647, "grad_norm": 1.0770083665847778, "learning_rate": 2.0251445442005323e-05, "loss": 1.3163, "step": 4003 }, { "epoch": 0.2997398611345049, "grad_norm": 1.3618335723876953, "learning_rate": 2.0213778247667036e-05, "loss": 2.0475, "step": 4004 }, { "epoch": 0.29981472123968333, "grad_norm": 1.2065316438674927, "learning_rate": 2.017614217706034e-05, "loss": 1.3929, "step": 4005 }, { "epoch": 0.29988958134486177, "grad_norm": 1.350765347480774, "learning_rate": 2.0138537244866595e-05, "loss": 2.1823, "step": 4006 }, { "epoch": 0.29996444145004025, "grad_norm": 1.3820773363113403, "learning_rate": 2.0100963465755095e-05, "loss": 2.0223, "step": 4007 }, { "epoch": 0.3000393015552187, "grad_norm": 1.2295290231704712, "learning_rate": 2.0063420854382964e-05, "loss": 1.9434, "step": 4008 }, { "epoch": 0.3001141616603971, "grad_norm": 1.2466357946395874, "learning_rate": 2.002590942539516e-05, "loss": 1.8788, "step": 4009 }, { "epoch": 0.3001890217655756, "grad_norm": 1.161490797996521, "learning_rate": 1.9988429193424506e-05, "loss": 1.8346, "step": 4010 }, { "epoch": 0.30026388187075403, "grad_norm": 1.2626473903656006, "learning_rate": 1.9950980173091614e-05, "loss": 1.7598, "step": 4011 }, { "epoch": 0.30033874197593247, "grad_norm": 1.180898666381836, "learning_rate": 1.9913562379004914e-05, "loss": 1.735, "step": 4012 }, { "epoch": 0.3004136020811109, "grad_norm": 1.4839529991149902, "learning_rate": 1.987617582576068e-05, "loss": 1.7939, "step": 4013 }, { "epoch": 0.3004884621862894, "grad_norm": 1.457495927810669, "learning_rate": 1.9838820527943024e-05, "loss": 1.7505, "step": 4014 }, { "epoch": 0.3005633222914678, "grad_norm": 1.257240653038025, "learning_rate": 1.9801496500123816e-05, "loss": 2.0504, "step": 4015 }, { "epoch": 0.30063818239664625, "grad_norm": 1.2636433839797974, "learning_rate": 1.9764203756862788e-05, "loss": 2.1075, "step": 4016 }, { "epoch": 0.30071304250182473, "grad_norm": 1.3001116514205933, "learning_rate": 1.9726942312707387e-05, "loss": 1.8914, "step": 4017 }, { "epoch": 0.30078790260700317, "grad_norm": 1.3088630437850952, "learning_rate": 1.96897121821929e-05, "loss": 1.4737, "step": 4018 }, { "epoch": 0.3008627627121816, "grad_norm": 1.2373766899108887, "learning_rate": 1.9652513379842443e-05, "loss": 1.7795, "step": 4019 }, { "epoch": 0.3009376228173601, "grad_norm": 1.4271975755691528, "learning_rate": 1.9615345920166783e-05, "loss": 1.853, "step": 4020 }, { "epoch": 0.3010124829225385, "grad_norm": 1.3381285667419434, "learning_rate": 1.9578209817664583e-05, "loss": 1.4662, "step": 4021 }, { "epoch": 0.30108734302771695, "grad_norm": 1.3777472972869873, "learning_rate": 1.9541105086822255e-05, "loss": 1.961, "step": 4022 }, { "epoch": 0.3011622031328954, "grad_norm": 1.2023438215255737, "learning_rate": 1.9504031742113893e-05, "loss": 2.0259, "step": 4023 }, { "epoch": 0.30123706323807387, "grad_norm": 1.3820981979370117, "learning_rate": 1.9466989798001434e-05, "loss": 2.2355, "step": 4024 }, { "epoch": 0.3013119233432523, "grad_norm": 1.3389943838119507, "learning_rate": 1.9429979268934517e-05, "loss": 1.8868, "step": 4025 }, { "epoch": 0.30138678344843073, "grad_norm": 1.2242356538772583, "learning_rate": 1.9393000169350572e-05, "loss": 2.3564, "step": 4026 }, { "epoch": 0.3014616435536092, "grad_norm": 1.2871242761611938, "learning_rate": 1.935605251367475e-05, "loss": 2.3593, "step": 4027 }, { "epoch": 0.30153650365878765, "grad_norm": 1.1905903816223145, "learning_rate": 1.9319136316319864e-05, "loss": 1.7604, "step": 4028 }, { "epoch": 0.3016113637639661, "grad_norm": 1.4285056591033936, "learning_rate": 1.9282251591686594e-05, "loss": 1.7632, "step": 4029 }, { "epoch": 0.3016862238691445, "grad_norm": 1.186838984489441, "learning_rate": 1.9245398354163203e-05, "loss": 2.2211, "step": 4030 }, { "epoch": 0.301761083974323, "grad_norm": 1.2330384254455566, "learning_rate": 1.920857661812576e-05, "loss": 1.8937, "step": 4031 }, { "epoch": 0.30183594407950143, "grad_norm": 1.6869494915008545, "learning_rate": 1.917178639793803e-05, "loss": 2.3147, "step": 4032 }, { "epoch": 0.30191080418467986, "grad_norm": 1.0925287008285522, "learning_rate": 1.9135027707951468e-05, "loss": 1.6979, "step": 4033 }, { "epoch": 0.30198566428985835, "grad_norm": 1.118889570236206, "learning_rate": 1.9098300562505266e-05, "loss": 2.0201, "step": 4034 }, { "epoch": 0.3020605243950368, "grad_norm": 1.3168593645095825, "learning_rate": 1.906160497592625e-05, "loss": 2.0211, "step": 4035 }, { "epoch": 0.3021353845002152, "grad_norm": 1.253433346748352, "learning_rate": 1.9024940962528947e-05, "loss": 1.7705, "step": 4036 }, { "epoch": 0.3022102446053937, "grad_norm": 1.2419159412384033, "learning_rate": 1.8988308536615607e-05, "loss": 2.083, "step": 4037 }, { "epoch": 0.30228510471057213, "grad_norm": 1.166131615638733, "learning_rate": 1.8951707712476142e-05, "loss": 2.3374, "step": 4038 }, { "epoch": 0.30235996481575056, "grad_norm": 1.201485276222229, "learning_rate": 1.891513850438813e-05, "loss": 1.6982, "step": 4039 }, { "epoch": 0.302434824920929, "grad_norm": 1.3317996263504028, "learning_rate": 1.8878600926616842e-05, "loss": 1.9714, "step": 4040 }, { "epoch": 0.3025096850261075, "grad_norm": 1.1165748834609985, "learning_rate": 1.884209499341515e-05, "loss": 1.7682, "step": 4041 }, { "epoch": 0.3025845451312859, "grad_norm": 1.3141906261444092, "learning_rate": 1.8805620719023633e-05, "loss": 1.8969, "step": 4042 }, { "epoch": 0.30265940523646434, "grad_norm": 1.3917417526245117, "learning_rate": 1.8769178117670528e-05, "loss": 1.8949, "step": 4043 }, { "epoch": 0.3027342653416428, "grad_norm": 1.3563727140426636, "learning_rate": 1.8732767203571644e-05, "loss": 1.8934, "step": 4044 }, { "epoch": 0.30280912544682126, "grad_norm": 1.1375949382781982, "learning_rate": 1.869638799093052e-05, "loss": 2.3021, "step": 4045 }, { "epoch": 0.3028839855519997, "grad_norm": 1.3237961530685425, "learning_rate": 1.8660040493938302e-05, "loss": 2.0617, "step": 4046 }, { "epoch": 0.3029588456571781, "grad_norm": 1.4968429803848267, "learning_rate": 1.8623724726773705e-05, "loss": 2.1839, "step": 4047 }, { "epoch": 0.3030337057623566, "grad_norm": 1.125922679901123, "learning_rate": 1.858744070360313e-05, "loss": 1.8824, "step": 4048 }, { "epoch": 0.30310856586753504, "grad_norm": 1.514498233795166, "learning_rate": 1.8551188438580592e-05, "loss": 2.266, "step": 4049 }, { "epoch": 0.30318342597271347, "grad_norm": 1.4843404293060303, "learning_rate": 1.8514967945847683e-05, "loss": 2.405, "step": 4050 }, { "epoch": 0.30318342597271347, "eval_loss": 1.975447654724121, "eval_runtime": 179.009, "eval_samples_per_second": 27.932, "eval_steps_per_second": 13.966, "step": 4050 }, { "epoch": 0.30325828607789196, "grad_norm": 1.2375472784042358, "learning_rate": 1.8478779239533663e-05, "loss": 2.084, "step": 4051 }, { "epoch": 0.3033331461830704, "grad_norm": 1.3809995651245117, "learning_rate": 1.844262233375531e-05, "loss": 2.3242, "step": 4052 }, { "epoch": 0.3034080062882488, "grad_norm": 1.2033385038375854, "learning_rate": 1.8406497242617015e-05, "loss": 2.2075, "step": 4053 }, { "epoch": 0.3034828663934273, "grad_norm": 1.3567943572998047, "learning_rate": 1.8370403980210814e-05, "loss": 1.8792, "step": 4054 }, { "epoch": 0.30355772649860574, "grad_norm": 1.2904313802719116, "learning_rate": 1.833434256061629e-05, "loss": 1.5604, "step": 4055 }, { "epoch": 0.30363258660378417, "grad_norm": 1.241664171218872, "learning_rate": 1.829831299790061e-05, "loss": 2.0435, "step": 4056 }, { "epoch": 0.3037074467089626, "grad_norm": 1.207828164100647, "learning_rate": 1.8262315306118538e-05, "loss": 2.0905, "step": 4057 }, { "epoch": 0.3037823068141411, "grad_norm": 1.355797529220581, "learning_rate": 1.8226349499312322e-05, "loss": 2.3153, "step": 4058 }, { "epoch": 0.3038571669193195, "grad_norm": 1.4054418802261353, "learning_rate": 1.8190415591511854e-05, "loss": 2.1058, "step": 4059 }, { "epoch": 0.30393202702449795, "grad_norm": 1.3853784799575806, "learning_rate": 1.8154513596734602e-05, "loss": 2.3567, "step": 4060 }, { "epoch": 0.30400688712967644, "grad_norm": 1.1983472108840942, "learning_rate": 1.8118643528985456e-05, "loss": 2.2024, "step": 4061 }, { "epoch": 0.30408174723485487, "grad_norm": 1.1993993520736694, "learning_rate": 1.8082805402256996e-05, "loss": 2.0189, "step": 4062 }, { "epoch": 0.3041566073400333, "grad_norm": 1.112529993057251, "learning_rate": 1.8046999230529292e-05, "loss": 2.0666, "step": 4063 }, { "epoch": 0.30423146744521173, "grad_norm": 1.256941556930542, "learning_rate": 1.8011225027769883e-05, "loss": 1.6108, "step": 4064 }, { "epoch": 0.3043063275503902, "grad_norm": 1.2825404405593872, "learning_rate": 1.7975482807933942e-05, "loss": 1.854, "step": 4065 }, { "epoch": 0.30438118765556865, "grad_norm": 1.2060844898223877, "learning_rate": 1.7939772584964088e-05, "loss": 2.1824, "step": 4066 }, { "epoch": 0.3044560477607471, "grad_norm": 1.3294241428375244, "learning_rate": 1.7904094372790514e-05, "loss": 2.0232, "step": 4067 }, { "epoch": 0.30453090786592557, "grad_norm": 1.369537115097046, "learning_rate": 1.7868448185330912e-05, "loss": 1.351, "step": 4068 }, { "epoch": 0.304605767971104, "grad_norm": 1.4236432313919067, "learning_rate": 1.7832834036490443e-05, "loss": 2.4995, "step": 4069 }, { "epoch": 0.30468062807628243, "grad_norm": 1.1932224035263062, "learning_rate": 1.7797251940161773e-05, "loss": 2.2037, "step": 4070 }, { "epoch": 0.3047554881814609, "grad_norm": 1.367132306098938, "learning_rate": 1.7761701910225124e-05, "loss": 2.1232, "step": 4071 }, { "epoch": 0.30483034828663935, "grad_norm": 1.4908311367034912, "learning_rate": 1.772618396054816e-05, "loss": 2.2996, "step": 4072 }, { "epoch": 0.3049052083918178, "grad_norm": 1.2963372468948364, "learning_rate": 1.769069810498605e-05, "loss": 1.9373, "step": 4073 }, { "epoch": 0.3049800684969962, "grad_norm": 1.2931585311889648, "learning_rate": 1.765524435738145e-05, "loss": 1.6983, "step": 4074 }, { "epoch": 0.3050549286021747, "grad_norm": 1.2115219831466675, "learning_rate": 1.761982273156445e-05, "loss": 1.7978, "step": 4075 }, { "epoch": 0.30512978870735313, "grad_norm": 1.2973915338516235, "learning_rate": 1.758443324135268e-05, "loss": 1.584, "step": 4076 }, { "epoch": 0.30520464881253156, "grad_norm": 1.1812747716903687, "learning_rate": 1.7549075900551138e-05, "loss": 1.1159, "step": 4077 }, { "epoch": 0.30527950891771005, "grad_norm": 1.132139801979065, "learning_rate": 1.751375072295237e-05, "loss": 1.9777, "step": 4078 }, { "epoch": 0.3053543690228885, "grad_norm": 1.330201506614685, "learning_rate": 1.747845772233633e-05, "loss": 2.2232, "step": 4079 }, { "epoch": 0.3054292291280669, "grad_norm": 1.2512439489364624, "learning_rate": 1.7443196912470462e-05, "loss": 1.9862, "step": 4080 }, { "epoch": 0.30550408923324535, "grad_norm": 1.2438949346542358, "learning_rate": 1.740796830710957e-05, "loss": 2.0577, "step": 4081 }, { "epoch": 0.30557894933842383, "grad_norm": 1.1295206546783447, "learning_rate": 1.7372771919995977e-05, "loss": 1.7723, "step": 4082 }, { "epoch": 0.30565380944360226, "grad_norm": 1.3180714845657349, "learning_rate": 1.73376077648594e-05, "loss": 1.9786, "step": 4083 }, { "epoch": 0.3057286695487807, "grad_norm": 1.6048989295959473, "learning_rate": 1.7302475855417022e-05, "loss": 2.107, "step": 4084 }, { "epoch": 0.3058035296539592, "grad_norm": 1.2396585941314697, "learning_rate": 1.7267376205373365e-05, "loss": 1.7827, "step": 4085 }, { "epoch": 0.3058783897591376, "grad_norm": 1.4661520719528198, "learning_rate": 1.723230882842046e-05, "loss": 2.6513, "step": 4086 }, { "epoch": 0.30595324986431605, "grad_norm": 1.2255977392196655, "learning_rate": 1.7197273738237672e-05, "loss": 2.0294, "step": 4087 }, { "epoch": 0.30602810996949453, "grad_norm": 1.1964830160140991, "learning_rate": 1.7162270948491832e-05, "loss": 1.92, "step": 4088 }, { "epoch": 0.30610297007467296, "grad_norm": 1.096755027770996, "learning_rate": 1.7127300472837126e-05, "loss": 1.8973, "step": 4089 }, { "epoch": 0.3061778301798514, "grad_norm": 1.0533169507980347, "learning_rate": 1.709236232491517e-05, "loss": 2.1012, "step": 4090 }, { "epoch": 0.3062526902850298, "grad_norm": 1.107009768486023, "learning_rate": 1.7057456518354965e-05, "loss": 1.6729, "step": 4091 }, { "epoch": 0.3063275503902083, "grad_norm": 1.3943463563919067, "learning_rate": 1.702258306677288e-05, "loss": 2.4317, "step": 4092 }, { "epoch": 0.30640241049538675, "grad_norm": 1.2476930618286133, "learning_rate": 1.698774198377262e-05, "loss": 1.8821, "step": 4093 }, { "epoch": 0.3064772706005652, "grad_norm": 1.3115483522415161, "learning_rate": 1.6952933282945337e-05, "loss": 1.7603, "step": 4094 }, { "epoch": 0.30655213070574366, "grad_norm": 1.029050350189209, "learning_rate": 1.6918156977869535e-05, "loss": 1.4612, "step": 4095 }, { "epoch": 0.3066269908109221, "grad_norm": 1.4106920957565308, "learning_rate": 1.6883413082111066e-05, "loss": 2.0946, "step": 4096 }, { "epoch": 0.3067018509161005, "grad_norm": 1.1665525436401367, "learning_rate": 1.684870160922317e-05, "loss": 2.2601, "step": 4097 }, { "epoch": 0.30677671102127896, "grad_norm": 1.0961947441101074, "learning_rate": 1.6814022572746358e-05, "loss": 1.8229, "step": 4098 }, { "epoch": 0.30685157112645745, "grad_norm": 1.2001997232437134, "learning_rate": 1.6779375986208567e-05, "loss": 2.0785, "step": 4099 }, { "epoch": 0.3069264312316359, "grad_norm": 1.2410109043121338, "learning_rate": 1.6744761863125082e-05, "loss": 1.8366, "step": 4100 }, { "epoch": 0.3070012913368143, "grad_norm": 1.066138505935669, "learning_rate": 1.671018021699844e-05, "loss": 1.7288, "step": 4101 }, { "epoch": 0.3070761514419928, "grad_norm": 1.1424481868743896, "learning_rate": 1.6675631061318577e-05, "loss": 1.776, "step": 4102 }, { "epoch": 0.3071510115471712, "grad_norm": 1.3031971454620361, "learning_rate": 1.6641114409562785e-05, "loss": 1.9804, "step": 4103 }, { "epoch": 0.30722587165234966, "grad_norm": 1.1968178749084473, "learning_rate": 1.6606630275195577e-05, "loss": 1.7019, "step": 4104 }, { "epoch": 0.30730073175752814, "grad_norm": 1.3047308921813965, "learning_rate": 1.6572178671668847e-05, "loss": 1.756, "step": 4105 }, { "epoch": 0.3073755918627066, "grad_norm": 1.2981348037719727, "learning_rate": 1.65377596124218e-05, "loss": 2.2261, "step": 4106 }, { "epoch": 0.307450451967885, "grad_norm": 1.3344699144363403, "learning_rate": 1.6503373110880937e-05, "loss": 2.1687, "step": 4107 }, { "epoch": 0.30752531207306344, "grad_norm": 1.4665021896362305, "learning_rate": 1.6469019180460078e-05, "loss": 1.4367, "step": 4108 }, { "epoch": 0.3076001721782419, "grad_norm": 1.2521626949310303, "learning_rate": 1.6434697834560288e-05, "loss": 2.4785, "step": 4109 }, { "epoch": 0.30767503228342036, "grad_norm": 1.2441452741622925, "learning_rate": 1.6400409086569912e-05, "loss": 2.0607, "step": 4110 }, { "epoch": 0.3077498923885988, "grad_norm": 1.384574294090271, "learning_rate": 1.6366152949864666e-05, "loss": 1.5333, "step": 4111 }, { "epoch": 0.3078247524937773, "grad_norm": 1.2359360456466675, "learning_rate": 1.633192943780747e-05, "loss": 1.5971, "step": 4112 }, { "epoch": 0.3078996125989557, "grad_norm": 1.5133596658706665, "learning_rate": 1.6297738563748554e-05, "loss": 2.1403, "step": 4113 }, { "epoch": 0.30797447270413414, "grad_norm": 1.1594699621200562, "learning_rate": 1.626358034102543e-05, "loss": 2.3156, "step": 4114 }, { "epoch": 0.30804933280931257, "grad_norm": 1.3752086162567139, "learning_rate": 1.6229454782962794e-05, "loss": 2.1326, "step": 4115 }, { "epoch": 0.30812419291449106, "grad_norm": 1.2432183027267456, "learning_rate": 1.6195361902872663e-05, "loss": 2.0908, "step": 4116 }, { "epoch": 0.3081990530196695, "grad_norm": 1.3134894371032715, "learning_rate": 1.6161301714054345e-05, "loss": 1.8659, "step": 4117 }, { "epoch": 0.3082739131248479, "grad_norm": 1.438181757926941, "learning_rate": 1.612727422979429e-05, "loss": 2.2441, "step": 4118 }, { "epoch": 0.3083487732300264, "grad_norm": 1.338538408279419, "learning_rate": 1.609327946336625e-05, "loss": 2.4267, "step": 4119 }, { "epoch": 0.30842363333520484, "grad_norm": 1.3462523221969604, "learning_rate": 1.605931742803127e-05, "loss": 1.5538, "step": 4120 }, { "epoch": 0.30849849344038327, "grad_norm": 1.3484785556793213, "learning_rate": 1.60253881370375e-05, "loss": 1.934, "step": 4121 }, { "epoch": 0.30857335354556176, "grad_norm": 1.3878412246704102, "learning_rate": 1.599149160362041e-05, "loss": 2.3092, "step": 4122 }, { "epoch": 0.3086482136507402, "grad_norm": 1.3290033340454102, "learning_rate": 1.5957627841002664e-05, "loss": 2.0252, "step": 4123 }, { "epoch": 0.3087230737559186, "grad_norm": 1.2164829969406128, "learning_rate": 1.5923796862394158e-05, "loss": 1.6984, "step": 4124 }, { "epoch": 0.30879793386109705, "grad_norm": 1.3966617584228516, "learning_rate": 1.5889998680992002e-05, "loss": 1.8572, "step": 4125 }, { "epoch": 0.30887279396627554, "grad_norm": 1.2370803356170654, "learning_rate": 1.585623330998044e-05, "loss": 1.7755, "step": 4126 }, { "epoch": 0.30894765407145397, "grad_norm": 1.334602952003479, "learning_rate": 1.5822500762531046e-05, "loss": 1.7591, "step": 4127 }, { "epoch": 0.3090225141766324, "grad_norm": 1.0683672428131104, "learning_rate": 1.5788801051802438e-05, "loss": 1.4727, "step": 4128 }, { "epoch": 0.3090973742818109, "grad_norm": 1.4616260528564453, "learning_rate": 1.5755134190940546e-05, "loss": 2.0929, "step": 4129 }, { "epoch": 0.3091722343869893, "grad_norm": 1.3632547855377197, "learning_rate": 1.572150019307845e-05, "loss": 1.7149, "step": 4130 }, { "epoch": 0.30924709449216775, "grad_norm": 1.353914499282837, "learning_rate": 1.5687899071336386e-05, "loss": 2.2765, "step": 4131 }, { "epoch": 0.3093219545973462, "grad_norm": 1.3182027339935303, "learning_rate": 1.565433083882183e-05, "loss": 1.6951, "step": 4132 }, { "epoch": 0.30939681470252467, "grad_norm": 1.290814995765686, "learning_rate": 1.5620795508629337e-05, "loss": 2.0187, "step": 4133 }, { "epoch": 0.3094716748077031, "grad_norm": 1.0754402875900269, "learning_rate": 1.558729309384066e-05, "loss": 1.2031, "step": 4134 }, { "epoch": 0.30954653491288153, "grad_norm": 1.3762871026992798, "learning_rate": 1.5553823607524742e-05, "loss": 2.0831, "step": 4135 }, { "epoch": 0.30962139501806, "grad_norm": 1.5534108877182007, "learning_rate": 1.5520387062737673e-05, "loss": 2.2212, "step": 4136 }, { "epoch": 0.30969625512323845, "grad_norm": 1.367431402206421, "learning_rate": 1.5486983472522676e-05, "loss": 2.327, "step": 4137 }, { "epoch": 0.3097711152284169, "grad_norm": 1.4046030044555664, "learning_rate": 1.545361284991014e-05, "loss": 2.5039, "step": 4138 }, { "epoch": 0.30984597533359537, "grad_norm": 1.5137723684310913, "learning_rate": 1.5420275207917546e-05, "loss": 2.1898, "step": 4139 }, { "epoch": 0.3099208354387738, "grad_norm": 1.3537466526031494, "learning_rate": 1.5386970559549563e-05, "loss": 1.5327, "step": 4140 }, { "epoch": 0.30999569554395223, "grad_norm": 1.2620223760604858, "learning_rate": 1.5353698917798e-05, "loss": 1.861, "step": 4141 }, { "epoch": 0.31007055564913066, "grad_norm": 1.3101414442062378, "learning_rate": 1.5320460295641703e-05, "loss": 2.0428, "step": 4142 }, { "epoch": 0.31014541575430915, "grad_norm": 1.3442658185958862, "learning_rate": 1.5287254706046718e-05, "loss": 2.0686, "step": 4143 }, { "epoch": 0.3102202758594876, "grad_norm": 1.1794476509094238, "learning_rate": 1.525408216196621e-05, "loss": 2.1155, "step": 4144 }, { "epoch": 0.310295135964666, "grad_norm": 1.2577784061431885, "learning_rate": 1.5220942676340365e-05, "loss": 1.8068, "step": 4145 }, { "epoch": 0.3103699960698445, "grad_norm": 1.3938839435577393, "learning_rate": 1.5187836262096589e-05, "loss": 2.2035, "step": 4146 }, { "epoch": 0.31044485617502293, "grad_norm": 1.3575022220611572, "learning_rate": 1.5154762932149303e-05, "loss": 1.6518, "step": 4147 }, { "epoch": 0.31051971628020136, "grad_norm": 1.1770210266113281, "learning_rate": 1.5121722699400054e-05, "loss": 1.8802, "step": 4148 }, { "epoch": 0.3105945763853798, "grad_norm": 1.2927170991897583, "learning_rate": 1.5088715576737511e-05, "loss": 2.0345, "step": 4149 }, { "epoch": 0.3106694364905583, "grad_norm": 1.3028998374938965, "learning_rate": 1.5055741577037363e-05, "loss": 2.2159, "step": 4150 }, { "epoch": 0.3107442965957367, "grad_norm": 1.3838856220245361, "learning_rate": 1.5022800713162378e-05, "loss": 2.1063, "step": 4151 }, { "epoch": 0.31081915670091514, "grad_norm": 1.3562817573547363, "learning_rate": 1.4989892997962452e-05, "loss": 2.2189, "step": 4152 }, { "epoch": 0.31089401680609363, "grad_norm": 1.0580971240997314, "learning_rate": 1.4957018444274517e-05, "loss": 1.8228, "step": 4153 }, { "epoch": 0.31096887691127206, "grad_norm": 1.238795280456543, "learning_rate": 1.4924177064922596e-05, "loss": 1.7391, "step": 4154 }, { "epoch": 0.3110437370164505, "grad_norm": 1.3416134119033813, "learning_rate": 1.489136887271776e-05, "loss": 2.0269, "step": 4155 }, { "epoch": 0.311118597121629, "grad_norm": 1.2789497375488281, "learning_rate": 1.4858593880458083e-05, "loss": 1.8586, "step": 4156 }, { "epoch": 0.3111934572268074, "grad_norm": 1.4079089164733887, "learning_rate": 1.4825852100928772e-05, "loss": 2.0929, "step": 4157 }, { "epoch": 0.31126831733198584, "grad_norm": 1.0542490482330322, "learning_rate": 1.479314354690199e-05, "loss": 1.4598, "step": 4158 }, { "epoch": 0.3113431774371643, "grad_norm": 1.3467721939086914, "learning_rate": 1.4760468231137025e-05, "loss": 2.3606, "step": 4159 }, { "epoch": 0.31141803754234276, "grad_norm": 1.3539036512374878, "learning_rate": 1.4727826166380143e-05, "loss": 2.0371, "step": 4160 }, { "epoch": 0.3114928976475212, "grad_norm": 1.2002004384994507, "learning_rate": 1.4695217365364678e-05, "loss": 2.6941, "step": 4161 }, { "epoch": 0.3115677577526996, "grad_norm": 1.4936364889144897, "learning_rate": 1.4662641840810943e-05, "loss": 2.1169, "step": 4162 }, { "epoch": 0.3116426178578781, "grad_norm": 1.2853271961212158, "learning_rate": 1.4630099605426284e-05, "loss": 1.9103, "step": 4163 }, { "epoch": 0.31171747796305654, "grad_norm": 1.2770893573760986, "learning_rate": 1.4597590671905092e-05, "loss": 2.1517, "step": 4164 }, { "epoch": 0.311792338068235, "grad_norm": 2.467751979827881, "learning_rate": 1.4565115052928746e-05, "loss": 2.2871, "step": 4165 }, { "epoch": 0.3118671981734134, "grad_norm": 1.1206268072128296, "learning_rate": 1.4532672761165644e-05, "loss": 2.0844, "step": 4166 }, { "epoch": 0.3119420582785919, "grad_norm": 1.447290301322937, "learning_rate": 1.4500263809271152e-05, "loss": 2.1031, "step": 4167 }, { "epoch": 0.3120169183837703, "grad_norm": 1.3242137432098389, "learning_rate": 1.4467888209887615e-05, "loss": 2.0292, "step": 4168 }, { "epoch": 0.31209177848894876, "grad_norm": 1.3341171741485596, "learning_rate": 1.4435545975644438e-05, "loss": 2.0296, "step": 4169 }, { "epoch": 0.31216663859412724, "grad_norm": 1.547074794769287, "learning_rate": 1.4403237119157953e-05, "loss": 2.345, "step": 4170 }, { "epoch": 0.3122414986993057, "grad_norm": 1.3654197454452515, "learning_rate": 1.4370961653031511e-05, "loss": 1.6751, "step": 4171 }, { "epoch": 0.3123163588044841, "grad_norm": 1.1811145544052124, "learning_rate": 1.4338719589855432e-05, "loss": 1.6844, "step": 4172 }, { "epoch": 0.3123912189096626, "grad_norm": 1.215173363685608, "learning_rate": 1.4306510942206941e-05, "loss": 1.5567, "step": 4173 }, { "epoch": 0.312466079014841, "grad_norm": 1.1414226293563843, "learning_rate": 1.4274335722650334e-05, "loss": 1.7216, "step": 4174 }, { "epoch": 0.31254093912001946, "grad_norm": 1.2330782413482666, "learning_rate": 1.4242193943736759e-05, "loss": 1.4695, "step": 4175 }, { "epoch": 0.3126157992251979, "grad_norm": 1.843246340751648, "learning_rate": 1.4210085618004398e-05, "loss": 2.1048, "step": 4176 }, { "epoch": 0.3126906593303764, "grad_norm": 1.3115407228469849, "learning_rate": 1.4178010757978356e-05, "loss": 2.4087, "step": 4177 }, { "epoch": 0.3127655194355548, "grad_norm": 1.2785003185272217, "learning_rate": 1.414596937617071e-05, "loss": 2.0839, "step": 4178 }, { "epoch": 0.31284037954073324, "grad_norm": 1.3134210109710693, "learning_rate": 1.4113961485080406e-05, "loss": 1.8374, "step": 4179 }, { "epoch": 0.3129152396459117, "grad_norm": 1.0752772092819214, "learning_rate": 1.4081987097193383e-05, "loss": 2.0352, "step": 4180 }, { "epoch": 0.31299009975109016, "grad_norm": 1.1392195224761963, "learning_rate": 1.4050046224982527e-05, "loss": 1.7224, "step": 4181 }, { "epoch": 0.3130649598562686, "grad_norm": 2.0944085121154785, "learning_rate": 1.4018138880907617e-05, "loss": 1.9977, "step": 4182 }, { "epoch": 0.313139819961447, "grad_norm": 1.2687186002731323, "learning_rate": 1.3986265077415328e-05, "loss": 1.9831, "step": 4183 }, { "epoch": 0.3132146800666255, "grad_norm": 1.3837085962295532, "learning_rate": 1.3954424826939327e-05, "loss": 1.6477, "step": 4184 }, { "epoch": 0.31328954017180394, "grad_norm": 1.1525795459747314, "learning_rate": 1.3922618141900101e-05, "loss": 1.3427, "step": 4185 }, { "epoch": 0.31336440027698237, "grad_norm": 1.3248560428619385, "learning_rate": 1.3890845034705102e-05, "loss": 1.8167, "step": 4186 }, { "epoch": 0.31343926038216086, "grad_norm": 1.0211544036865234, "learning_rate": 1.3859105517748684e-05, "loss": 1.936, "step": 4187 }, { "epoch": 0.3135141204873393, "grad_norm": 1.2533379793167114, "learning_rate": 1.3827399603412072e-05, "loss": 1.9772, "step": 4188 }, { "epoch": 0.3135889805925177, "grad_norm": 1.4336566925048828, "learning_rate": 1.3795727304063433e-05, "loss": 2.4198, "step": 4189 }, { "epoch": 0.3136638406976962, "grad_norm": 1.5366734266281128, "learning_rate": 1.3764088632057759e-05, "loss": 2.049, "step": 4190 }, { "epoch": 0.31373870080287464, "grad_norm": 1.4055321216583252, "learning_rate": 1.3732483599736923e-05, "loss": 1.9928, "step": 4191 }, { "epoch": 0.31381356090805307, "grad_norm": 1.2820324897766113, "learning_rate": 1.3700912219429718e-05, "loss": 2.0272, "step": 4192 }, { "epoch": 0.3138884210132315, "grad_norm": 1.426094889640808, "learning_rate": 1.3669374503451815e-05, "loss": 1.898, "step": 4193 }, { "epoch": 0.31396328111841, "grad_norm": 1.257351040840149, "learning_rate": 1.3637870464105706e-05, "loss": 1.9222, "step": 4194 }, { "epoch": 0.3140381412235884, "grad_norm": 1.163941502571106, "learning_rate": 1.3606400113680806e-05, "loss": 1.958, "step": 4195 }, { "epoch": 0.31411300132876685, "grad_norm": 1.4299142360687256, "learning_rate": 1.3574963464453317e-05, "loss": 1.9792, "step": 4196 }, { "epoch": 0.31418786143394534, "grad_norm": 1.4303052425384521, "learning_rate": 1.3543560528686338e-05, "loss": 2.0043, "step": 4197 }, { "epoch": 0.31426272153912377, "grad_norm": 1.0838043689727783, "learning_rate": 1.351219131862984e-05, "loss": 2.0258, "step": 4198 }, { "epoch": 0.3143375816443022, "grad_norm": 1.2281017303466797, "learning_rate": 1.3480855846520569e-05, "loss": 1.9452, "step": 4199 }, { "epoch": 0.31441244174948063, "grad_norm": 1.1211081743240356, "learning_rate": 1.3449554124582175e-05, "loss": 1.5995, "step": 4200 }, { "epoch": 0.31441244174948063, "eval_loss": 1.9722527265548706, "eval_runtime": 178.9512, "eval_samples_per_second": 27.941, "eval_steps_per_second": 13.97, "step": 4200 }, { "epoch": 0.3144873018546591, "grad_norm": 1.2430413961410522, "learning_rate": 1.3418286165025118e-05, "loss": 1.6363, "step": 4201 }, { "epoch": 0.31456216195983755, "grad_norm": 1.2864850759506226, "learning_rate": 1.3387051980046661e-05, "loss": 2.0841, "step": 4202 }, { "epoch": 0.314637022065016, "grad_norm": 1.3296749591827393, "learning_rate": 1.3355851581830936e-05, "loss": 2.0585, "step": 4203 }, { "epoch": 0.31471188217019447, "grad_norm": 1.4967375993728638, "learning_rate": 1.3324684982548885e-05, "loss": 2.2413, "step": 4204 }, { "epoch": 0.3147867422753729, "grad_norm": 1.1289546489715576, "learning_rate": 1.3293552194358238e-05, "loss": 1.6154, "step": 4205 }, { "epoch": 0.31486160238055133, "grad_norm": 1.3189398050308228, "learning_rate": 1.3262453229403582e-05, "loss": 1.7616, "step": 4206 }, { "epoch": 0.3149364624857298, "grad_norm": 1.2080447673797607, "learning_rate": 1.3231388099816277e-05, "loss": 1.7799, "step": 4207 }, { "epoch": 0.31501132259090825, "grad_norm": 1.1236724853515625, "learning_rate": 1.3200356817714443e-05, "loss": 1.8307, "step": 4208 }, { "epoch": 0.3150861826960867, "grad_norm": 1.307536005973816, "learning_rate": 1.3169359395203095e-05, "loss": 1.8861, "step": 4209 }, { "epoch": 0.3151610428012651, "grad_norm": 1.2413207292556763, "learning_rate": 1.3138395844373963e-05, "loss": 1.7444, "step": 4210 }, { "epoch": 0.3152359029064436, "grad_norm": 1.2621508836746216, "learning_rate": 1.3107466177305593e-05, "loss": 2.2027, "step": 4211 }, { "epoch": 0.31531076301162203, "grad_norm": 1.3719115257263184, "learning_rate": 1.3076570406063349e-05, "loss": 2.3726, "step": 4212 }, { "epoch": 0.31538562311680046, "grad_norm": 1.3190993070602417, "learning_rate": 1.3045708542699264e-05, "loss": 2.2419, "step": 4213 }, { "epoch": 0.31546048322197895, "grad_norm": 1.3265511989593506, "learning_rate": 1.3014880599252278e-05, "loss": 2.2512, "step": 4214 }, { "epoch": 0.3155353433271574, "grad_norm": 1.1280676126480103, "learning_rate": 1.298408658774798e-05, "loss": 2.3213, "step": 4215 }, { "epoch": 0.3156102034323358, "grad_norm": 1.26838219165802, "learning_rate": 1.2953326520198794e-05, "loss": 1.7414, "step": 4216 }, { "epoch": 0.31568506353751424, "grad_norm": 1.1598235368728638, "learning_rate": 1.2922600408603902e-05, "loss": 1.662, "step": 4217 }, { "epoch": 0.31575992364269273, "grad_norm": 1.1660648584365845, "learning_rate": 1.2891908264949215e-05, "loss": 1.6961, "step": 4218 }, { "epoch": 0.31583478374787116, "grad_norm": 1.1905189752578735, "learning_rate": 1.2861250101207411e-05, "loss": 1.7872, "step": 4219 }, { "epoch": 0.3159096438530496, "grad_norm": 1.1770448684692383, "learning_rate": 1.283062592933788e-05, "loss": 1.6366, "step": 4220 }, { "epoch": 0.3159845039582281, "grad_norm": 1.398833155632019, "learning_rate": 1.2800035761286787e-05, "loss": 2.5052, "step": 4221 }, { "epoch": 0.3160593640634065, "grad_norm": 1.349840760231018, "learning_rate": 1.2769479608987034e-05, "loss": 1.6673, "step": 4222 }, { "epoch": 0.31613422416858494, "grad_norm": 1.2817665338516235, "learning_rate": 1.2738957484358249e-05, "loss": 1.5891, "step": 4223 }, { "epoch": 0.31620908427376343, "grad_norm": 1.131892442703247, "learning_rate": 1.2708469399306733e-05, "loss": 2.132, "step": 4224 }, { "epoch": 0.31628394437894186, "grad_norm": 1.0842331647872925, "learning_rate": 1.2678015365725615e-05, "loss": 1.766, "step": 4225 }, { "epoch": 0.3163588044841203, "grad_norm": 1.1961909532546997, "learning_rate": 1.2647595395494627e-05, "loss": 1.4948, "step": 4226 }, { "epoch": 0.3164336645892987, "grad_norm": 1.185808777809143, "learning_rate": 1.2617209500480276e-05, "loss": 2.0668, "step": 4227 }, { "epoch": 0.3165085246944772, "grad_norm": 1.1728310585021973, "learning_rate": 1.2586857692535792e-05, "loss": 2.3815, "step": 4228 }, { "epoch": 0.31658338479965564, "grad_norm": 1.3092294931411743, "learning_rate": 1.2556539983501059e-05, "loss": 1.9627, "step": 4229 }, { "epoch": 0.3166582449048341, "grad_norm": 1.0717939138412476, "learning_rate": 1.2526256385202717e-05, "loss": 1.4853, "step": 4230 }, { "epoch": 0.31673310501001256, "grad_norm": 1.0487746000289917, "learning_rate": 1.249600690945405e-05, "loss": 2.3029, "step": 4231 }, { "epoch": 0.316807965115191, "grad_norm": 1.1825230121612549, "learning_rate": 1.2465791568055018e-05, "loss": 2.0643, "step": 4232 }, { "epoch": 0.3168828252203694, "grad_norm": 1.4030083417892456, "learning_rate": 1.2435610372792306e-05, "loss": 1.8907, "step": 4233 }, { "epoch": 0.31695768532554786, "grad_norm": 1.3982510566711426, "learning_rate": 1.2405463335439293e-05, "loss": 1.7806, "step": 4234 }, { "epoch": 0.31703254543072634, "grad_norm": 1.2343229055404663, "learning_rate": 1.2375350467755986e-05, "loss": 1.9991, "step": 4235 }, { "epoch": 0.3171074055359048, "grad_norm": 1.1671169996261597, "learning_rate": 1.234527178148912e-05, "loss": 2.138, "step": 4236 }, { "epoch": 0.3171822656410832, "grad_norm": 1.051686406135559, "learning_rate": 1.2315227288372e-05, "loss": 1.7932, "step": 4237 }, { "epoch": 0.3172571257462617, "grad_norm": 1.2837929725646973, "learning_rate": 1.2285217000124704e-05, "loss": 2.3026, "step": 4238 }, { "epoch": 0.3173319858514401, "grad_norm": 1.0531237125396729, "learning_rate": 1.2255240928453915e-05, "loss": 1.755, "step": 4239 }, { "epoch": 0.31740684595661856, "grad_norm": 1.558048963546753, "learning_rate": 1.2225299085052933e-05, "loss": 1.8248, "step": 4240 }, { "epoch": 0.31748170606179704, "grad_norm": 1.3072763681411743, "learning_rate": 1.219539148160177e-05, "loss": 1.2878, "step": 4241 }, { "epoch": 0.3175565661669755, "grad_norm": 1.3946715593338013, "learning_rate": 1.2165518129767072e-05, "loss": 2.4251, "step": 4242 }, { "epoch": 0.3176314262721539, "grad_norm": 1.3483070135116577, "learning_rate": 1.2135679041202075e-05, "loss": 2.215, "step": 4243 }, { "epoch": 0.31770628637733234, "grad_norm": 1.5368834733963013, "learning_rate": 1.2105874227546676e-05, "loss": 1.732, "step": 4244 }, { "epoch": 0.3177811464825108, "grad_norm": 1.128314733505249, "learning_rate": 1.2076103700427432e-05, "loss": 1.9792, "step": 4245 }, { "epoch": 0.31785600658768925, "grad_norm": 1.1726202964782715, "learning_rate": 1.2046367471457487e-05, "loss": 2.0682, "step": 4246 }, { "epoch": 0.3179308666928677, "grad_norm": 1.3298349380493164, "learning_rate": 1.2016665552236639e-05, "loss": 1.857, "step": 4247 }, { "epoch": 0.3180057267980462, "grad_norm": 1.0984100103378296, "learning_rate": 1.1986997954351265e-05, "loss": 1.5534, "step": 4248 }, { "epoch": 0.3180805869032246, "grad_norm": 1.1997307538986206, "learning_rate": 1.195736468937435e-05, "loss": 2.1331, "step": 4249 }, { "epoch": 0.31815544700840304, "grad_norm": 1.2463687658309937, "learning_rate": 1.1927765768865517e-05, "loss": 2.039, "step": 4250 }, { "epoch": 0.31823030711358147, "grad_norm": 1.247537612915039, "learning_rate": 1.1898201204370995e-05, "loss": 2.1008, "step": 4251 }, { "epoch": 0.31830516721875995, "grad_norm": 1.273748517036438, "learning_rate": 1.1868671007423593e-05, "loss": 1.686, "step": 4252 }, { "epoch": 0.3183800273239384, "grad_norm": 1.1115397214889526, "learning_rate": 1.183917518954274e-05, "loss": 1.7288, "step": 4253 }, { "epoch": 0.3184548874291168, "grad_norm": 1.1542783975601196, "learning_rate": 1.1809713762234375e-05, "loss": 2.1513, "step": 4254 }, { "epoch": 0.3185297475342953, "grad_norm": 1.2255346775054932, "learning_rate": 1.1780286736991141e-05, "loss": 2.4397, "step": 4255 }, { "epoch": 0.31860460763947374, "grad_norm": 1.421760082244873, "learning_rate": 1.1750894125292144e-05, "loss": 2.1511, "step": 4256 }, { "epoch": 0.31867946774465217, "grad_norm": 1.492655634880066, "learning_rate": 1.1721535938603135e-05, "loss": 2.3054, "step": 4257 }, { "epoch": 0.31875432784983065, "grad_norm": 1.3507533073425293, "learning_rate": 1.1692212188376439e-05, "loss": 2.1703, "step": 4258 }, { "epoch": 0.3188291879550091, "grad_norm": 1.4702476263046265, "learning_rate": 1.1662922886050942e-05, "loss": 2.5515, "step": 4259 }, { "epoch": 0.3189040480601875, "grad_norm": 1.2616642713546753, "learning_rate": 1.1633668043052026e-05, "loss": 2.4595, "step": 4260 }, { "epoch": 0.31897890816536595, "grad_norm": 1.3287217617034912, "learning_rate": 1.1604447670791729e-05, "loss": 2.1589, "step": 4261 }, { "epoch": 0.31905376827054444, "grad_norm": 1.091387152671814, "learning_rate": 1.157526178066859e-05, "loss": 1.8494, "step": 4262 }, { "epoch": 0.31912862837572287, "grad_norm": 1.559254765510559, "learning_rate": 1.154611038406771e-05, "loss": 2.148, "step": 4263 }, { "epoch": 0.3192034884809013, "grad_norm": 1.2237756252288818, "learning_rate": 1.1516993492360705e-05, "loss": 1.9358, "step": 4264 }, { "epoch": 0.3192783485860798, "grad_norm": 1.1708077192306519, "learning_rate": 1.1487911116905802e-05, "loss": 1.9387, "step": 4265 }, { "epoch": 0.3193532086912582, "grad_norm": 1.4867584705352783, "learning_rate": 1.145886326904766e-05, "loss": 2.2669, "step": 4266 }, { "epoch": 0.31942806879643665, "grad_norm": 1.3053001165390015, "learning_rate": 1.1429849960117556e-05, "loss": 2.1416, "step": 4267 }, { "epoch": 0.3195029289016151, "grad_norm": 1.200826644897461, "learning_rate": 1.1400871201433261e-05, "loss": 2.1413, "step": 4268 }, { "epoch": 0.31957778900679357, "grad_norm": 1.10733163356781, "learning_rate": 1.1371927004299076e-05, "loss": 2.0621, "step": 4269 }, { "epoch": 0.319652649111972, "grad_norm": 1.169324517250061, "learning_rate": 1.1343017380005827e-05, "loss": 1.9922, "step": 4270 }, { "epoch": 0.31972750921715043, "grad_norm": 1.1384263038635254, "learning_rate": 1.1314142339830835e-05, "loss": 1.9093, "step": 4271 }, { "epoch": 0.3198023693223289, "grad_norm": 0.9797958135604858, "learning_rate": 1.1285301895037904e-05, "loss": 1.7391, "step": 4272 }, { "epoch": 0.31987722942750735, "grad_norm": 1.309282660484314, "learning_rate": 1.1256496056877397e-05, "loss": 1.7957, "step": 4273 }, { "epoch": 0.3199520895326858, "grad_norm": 1.000567078590393, "learning_rate": 1.1227724836586163e-05, "loss": 1.2836, "step": 4274 }, { "epoch": 0.32002694963786427, "grad_norm": 1.301513910293579, "learning_rate": 1.1198988245387543e-05, "loss": 1.9535, "step": 4275 }, { "epoch": 0.3201018097430427, "grad_norm": 1.0438960790634155, "learning_rate": 1.1170286294491372e-05, "loss": 1.9521, "step": 4276 }, { "epoch": 0.32017666984822113, "grad_norm": 1.4060007333755493, "learning_rate": 1.1141618995093938e-05, "loss": 2.1708, "step": 4277 }, { "epoch": 0.32025152995339956, "grad_norm": 1.0469266176223755, "learning_rate": 1.1112986358378063e-05, "loss": 2.0823, "step": 4278 }, { "epoch": 0.32032639005857805, "grad_norm": 1.231764316558838, "learning_rate": 1.1084388395513035e-05, "loss": 1.6766, "step": 4279 }, { "epoch": 0.3204012501637565, "grad_norm": 1.5473371744155884, "learning_rate": 1.1055825117654572e-05, "loss": 1.7436, "step": 4280 }, { "epoch": 0.3204761102689349, "grad_norm": 1.1543024778366089, "learning_rate": 1.102729653594492e-05, "loss": 2.1059, "step": 4281 }, { "epoch": 0.3205509703741134, "grad_norm": 1.283323049545288, "learning_rate": 1.0998802661512775e-05, "loss": 2.2421, "step": 4282 }, { "epoch": 0.32062583047929183, "grad_norm": 1.374492883682251, "learning_rate": 1.0970343505473257e-05, "loss": 2.1562, "step": 4283 }, { "epoch": 0.32070069058447026, "grad_norm": 1.276918888092041, "learning_rate": 1.0941919078927987e-05, "loss": 1.7669, "step": 4284 }, { "epoch": 0.3207755506896487, "grad_norm": 1.2332602739334106, "learning_rate": 1.0913529392965016e-05, "loss": 2.2145, "step": 4285 }, { "epoch": 0.3208504107948272, "grad_norm": 1.1612800359725952, "learning_rate": 1.0885174458658853e-05, "loss": 1.7278, "step": 4286 }, { "epoch": 0.3209252709000056, "grad_norm": 1.3488943576812744, "learning_rate": 1.0856854287070484e-05, "loss": 1.9821, "step": 4287 }, { "epoch": 0.32100013100518404, "grad_norm": 1.8883134126663208, "learning_rate": 1.0828568889247247e-05, "loss": 1.5593, "step": 4288 }, { "epoch": 0.32107499111036253, "grad_norm": 1.2456871271133423, "learning_rate": 1.0800318276222977e-05, "loss": 2.1175, "step": 4289 }, { "epoch": 0.32114985121554096, "grad_norm": 1.4785337448120117, "learning_rate": 1.077210245901793e-05, "loss": 2.0587, "step": 4290 }, { "epoch": 0.3212247113207194, "grad_norm": 1.3994067907333374, "learning_rate": 1.0743921448638795e-05, "loss": 2.0042, "step": 4291 }, { "epoch": 0.3212995714258979, "grad_norm": 1.512692928314209, "learning_rate": 1.0715775256078665e-05, "loss": 2.1934, "step": 4292 }, { "epoch": 0.3213744315310763, "grad_norm": 1.5207679271697998, "learning_rate": 1.068766389231709e-05, "loss": 1.8997, "step": 4293 }, { "epoch": 0.32144929163625474, "grad_norm": 1.2716584205627441, "learning_rate": 1.0659587368319968e-05, "loss": 2.0687, "step": 4294 }, { "epoch": 0.3215241517414332, "grad_norm": 1.2716985940933228, "learning_rate": 1.0631545695039657e-05, "loss": 1.9754, "step": 4295 }, { "epoch": 0.32159901184661166, "grad_norm": 1.4693506956100464, "learning_rate": 1.0603538883414944e-05, "loss": 2.3568, "step": 4296 }, { "epoch": 0.3216738719517901, "grad_norm": 1.3230175971984863, "learning_rate": 1.0575566944370907e-05, "loss": 2.1313, "step": 4297 }, { "epoch": 0.3217487320569685, "grad_norm": 1.3708618879318237, "learning_rate": 1.0547629888819144e-05, "loss": 2.0376, "step": 4298 }, { "epoch": 0.321823592162147, "grad_norm": 1.4978593587875366, "learning_rate": 1.0519727727657591e-05, "loss": 1.962, "step": 4299 }, { "epoch": 0.32189845226732544, "grad_norm": 1.143630027770996, "learning_rate": 1.0491860471770554e-05, "loss": 1.9942, "step": 4300 }, { "epoch": 0.3219733123725039, "grad_norm": 1.1761173009872437, "learning_rate": 1.0464028132028759e-05, "loss": 1.3112, "step": 4301 }, { "epoch": 0.3220481724776823, "grad_norm": 1.1893281936645508, "learning_rate": 1.0436230719289287e-05, "loss": 2.1874, "step": 4302 }, { "epoch": 0.3221230325828608, "grad_norm": 1.2789543867111206, "learning_rate": 1.040846824439562e-05, "loss": 1.8348, "step": 4303 }, { "epoch": 0.3221978926880392, "grad_norm": 1.4697990417480469, "learning_rate": 1.0380740718177606e-05, "loss": 1.8879, "step": 4304 }, { "epoch": 0.32227275279321765, "grad_norm": 1.1386090517044067, "learning_rate": 1.0353048151451428e-05, "loss": 2.0337, "step": 4305 }, { "epoch": 0.32234761289839614, "grad_norm": 1.2646231651306152, "learning_rate": 1.0325390555019632e-05, "loss": 2.0434, "step": 4306 }, { "epoch": 0.3224224730035746, "grad_norm": 1.261138916015625, "learning_rate": 1.0297767939671177e-05, "loss": 2.1893, "step": 4307 }, { "epoch": 0.322497333108753, "grad_norm": 1.3194853067398071, "learning_rate": 1.0270180316181333e-05, "loss": 2.2775, "step": 4308 }, { "epoch": 0.3225721932139315, "grad_norm": 1.2645589113235474, "learning_rate": 1.0242627695311724e-05, "loss": 2.5407, "step": 4309 }, { "epoch": 0.3226470533191099, "grad_norm": 1.5182836055755615, "learning_rate": 1.0215110087810365e-05, "loss": 1.7635, "step": 4310 }, { "epoch": 0.32272191342428835, "grad_norm": 1.379013180732727, "learning_rate": 1.0187627504411513e-05, "loss": 2.3139, "step": 4311 }, { "epoch": 0.3227967735294668, "grad_norm": 1.2215808629989624, "learning_rate": 1.0160179955835868e-05, "loss": 2.1813, "step": 4312 }, { "epoch": 0.32287163363464527, "grad_norm": 1.2249337434768677, "learning_rate": 1.0132767452790382e-05, "loss": 2.1529, "step": 4313 }, { "epoch": 0.3229464937398237, "grad_norm": 1.170960783958435, "learning_rate": 1.0105390005968397e-05, "loss": 2.2203, "step": 4314 }, { "epoch": 0.32302135384500213, "grad_norm": 1.2596862316131592, "learning_rate": 1.0078047626049537e-05, "loss": 1.6348, "step": 4315 }, { "epoch": 0.3230962139501806, "grad_norm": 1.2516705989837646, "learning_rate": 1.0050740323699792e-05, "loss": 1.6107, "step": 4316 }, { "epoch": 0.32317107405535905, "grad_norm": 1.3642774820327759, "learning_rate": 1.0023468109571432e-05, "loss": 2.1139, "step": 4317 }, { "epoch": 0.3232459341605375, "grad_norm": 1.2518163919448853, "learning_rate": 9.996230994303035e-06, "loss": 2.041, "step": 4318 }, { "epoch": 0.3233207942657159, "grad_norm": 1.491049885749817, "learning_rate": 9.969028988519514e-06, "loss": 1.7066, "step": 4319 }, { "epoch": 0.3233956543708944, "grad_norm": 1.3978567123413086, "learning_rate": 9.941862102832078e-06, "loss": 2.1174, "step": 4320 }, { "epoch": 0.32347051447607283, "grad_norm": 1.2534300088882446, "learning_rate": 9.914730347838219e-06, "loss": 2.1594, "step": 4321 }, { "epoch": 0.32354537458125127, "grad_norm": 1.2443972826004028, "learning_rate": 9.887633734121726e-06, "loss": 2.3362, "step": 4322 }, { "epoch": 0.32362023468642975, "grad_norm": 1.3144580125808716, "learning_rate": 9.860572272252744e-06, "loss": 2.1023, "step": 4323 }, { "epoch": 0.3236950947916082, "grad_norm": 1.2565767765045166, "learning_rate": 9.833545972787583e-06, "loss": 2.1545, "step": 4324 }, { "epoch": 0.3237699548967866, "grad_norm": 1.4609133005142212, "learning_rate": 9.806554846268945e-06, "loss": 2.4756, "step": 4325 }, { "epoch": 0.3238448150019651, "grad_norm": 1.3058511018753052, "learning_rate": 9.779598903225774e-06, "loss": 1.9528, "step": 4326 }, { "epoch": 0.32391967510714353, "grad_norm": 1.2262194156646729, "learning_rate": 9.752678154173278e-06, "loss": 2.073, "step": 4327 }, { "epoch": 0.32399453521232197, "grad_norm": 1.3606054782867432, "learning_rate": 9.725792609612972e-06, "loss": 1.9707, "step": 4328 }, { "epoch": 0.3240693953175004, "grad_norm": 1.0933979749679565, "learning_rate": 9.698942280032597e-06, "loss": 1.5065, "step": 4329 }, { "epoch": 0.3241442554226789, "grad_norm": 1.3731929063796997, "learning_rate": 9.672127175906143e-06, "loss": 1.993, "step": 4330 }, { "epoch": 0.3242191155278573, "grad_norm": 1.354134202003479, "learning_rate": 9.645347307693908e-06, "loss": 2.2719, "step": 4331 }, { "epoch": 0.32429397563303575, "grad_norm": 1.233121395111084, "learning_rate": 9.618602685842437e-06, "loss": 1.7967, "step": 4332 }, { "epoch": 0.32436883573821423, "grad_norm": 1.126228928565979, "learning_rate": 9.591893320784506e-06, "loss": 1.8928, "step": 4333 }, { "epoch": 0.32444369584339267, "grad_norm": 1.1630918979644775, "learning_rate": 9.56521922293916e-06, "loss": 1.9527, "step": 4334 }, { "epoch": 0.3245185559485711, "grad_norm": 1.22890305519104, "learning_rate": 9.53858040271165e-06, "loss": 1.7972, "step": 4335 }, { "epoch": 0.32459341605374953, "grad_norm": 1.3265979290008545, "learning_rate": 9.511976870493511e-06, "loss": 2.3747, "step": 4336 }, { "epoch": 0.324668276158928, "grad_norm": 1.2737456560134888, "learning_rate": 9.485408636662463e-06, "loss": 1.6842, "step": 4337 }, { "epoch": 0.32474313626410645, "grad_norm": 1.1951372623443604, "learning_rate": 9.458875711582493e-06, "loss": 2.217, "step": 4338 }, { "epoch": 0.3248179963692849, "grad_norm": 1.1306966543197632, "learning_rate": 9.4323781056038e-06, "loss": 2.3055, "step": 4339 }, { "epoch": 0.32489285647446337, "grad_norm": 1.4470226764678955, "learning_rate": 9.40591582906285e-06, "loss": 2.0531, "step": 4340 }, { "epoch": 0.3249677165796418, "grad_norm": 1.4400477409362793, "learning_rate": 9.37948889228223e-06, "loss": 2.0791, "step": 4341 }, { "epoch": 0.32504257668482023, "grad_norm": 1.4107931852340698, "learning_rate": 9.353097305570824e-06, "loss": 1.8197, "step": 4342 }, { "epoch": 0.3251174367899987, "grad_norm": 1.297286868095398, "learning_rate": 9.32674107922371e-06, "loss": 1.5633, "step": 4343 }, { "epoch": 0.32519229689517715, "grad_norm": 1.2191598415374756, "learning_rate": 9.300420223522143e-06, "loss": 2.1477, "step": 4344 }, { "epoch": 0.3252671570003556, "grad_norm": 1.0640449523925781, "learning_rate": 9.274134748733643e-06, "loss": 1.5755, "step": 4345 }, { "epoch": 0.325342017105534, "grad_norm": 1.2535250186920166, "learning_rate": 9.24788466511185e-06, "loss": 1.8846, "step": 4346 }, { "epoch": 0.3254168772107125, "grad_norm": 1.205539584159851, "learning_rate": 9.221669982896619e-06, "loss": 1.8229, "step": 4347 }, { "epoch": 0.3254917373158909, "grad_norm": 1.3653366565704346, "learning_rate": 9.195490712314025e-06, "loss": 2.0382, "step": 4348 }, { "epoch": 0.32556659742106936, "grad_norm": 1.1930748224258423, "learning_rate": 9.169346863576334e-06, "loss": 1.533, "step": 4349 }, { "epoch": 0.32564145752624785, "grad_norm": 1.0746008157730103, "learning_rate": 9.143238446881952e-06, "loss": 1.9719, "step": 4350 }, { "epoch": 0.32564145752624785, "eval_loss": 1.969606637954712, "eval_runtime": 179.0432, "eval_samples_per_second": 27.926, "eval_steps_per_second": 13.963, "step": 4350 }, { "epoch": 0.3257163176314263, "grad_norm": 1.3905658721923828, "learning_rate": 9.117165472415512e-06, "loss": 2.1205, "step": 4351 }, { "epoch": 0.3257911777366047, "grad_norm": 1.3244316577911377, "learning_rate": 9.091127950347767e-06, "loss": 1.8565, "step": 4352 }, { "epoch": 0.32586603784178314, "grad_norm": 1.5965996980667114, "learning_rate": 9.065125890835702e-06, "loss": 2.4872, "step": 4353 }, { "epoch": 0.3259408979469616, "grad_norm": 1.4740352630615234, "learning_rate": 9.039159304022404e-06, "loss": 1.9009, "step": 4354 }, { "epoch": 0.32601575805214006, "grad_norm": 1.3802940845489502, "learning_rate": 9.013228200037161e-06, "loss": 1.9568, "step": 4355 }, { "epoch": 0.3260906181573185, "grad_norm": 1.1935075521469116, "learning_rate": 8.987332588995412e-06, "loss": 1.707, "step": 4356 }, { "epoch": 0.326165478262497, "grad_norm": 1.2085293531417847, "learning_rate": 8.961472480998778e-06, "loss": 2.065, "step": 4357 }, { "epoch": 0.3262403383676754, "grad_norm": 1.1489678621292114, "learning_rate": 8.935647886134968e-06, "loss": 1.8482, "step": 4358 }, { "epoch": 0.32631519847285384, "grad_norm": 1.3342386484146118, "learning_rate": 8.909858814477889e-06, "loss": 2.7056, "step": 4359 }, { "epoch": 0.3263900585780323, "grad_norm": 1.0669077634811401, "learning_rate": 8.884105276087584e-06, "loss": 1.3921, "step": 4360 }, { "epoch": 0.32646491868321076, "grad_norm": 1.388182282447815, "learning_rate": 8.858387281010227e-06, "loss": 2.0807, "step": 4361 }, { "epoch": 0.3265397787883892, "grad_norm": 1.2218077182769775, "learning_rate": 8.832704839278105e-06, "loss": 2.4558, "step": 4362 }, { "epoch": 0.3266146388935676, "grad_norm": 1.0980782508850098, "learning_rate": 8.807057960909681e-06, "loss": 2.2699, "step": 4363 }, { "epoch": 0.3266894989987461, "grad_norm": 1.2559531927108765, "learning_rate": 8.781446655909498e-06, "loss": 1.7444, "step": 4364 }, { "epoch": 0.32676435910392454, "grad_norm": 1.1109176874160767, "learning_rate": 8.75587093426825e-06, "loss": 2.0999, "step": 4365 }, { "epoch": 0.32683921920910297, "grad_norm": 0.9935234785079956, "learning_rate": 8.730330805962761e-06, "loss": 2.3238, "step": 4366 }, { "epoch": 0.32691407931428146, "grad_norm": 1.1573008298873901, "learning_rate": 8.704826280955946e-06, "loss": 1.8165, "step": 4367 }, { "epoch": 0.3269889394194599, "grad_norm": 1.2716618776321411, "learning_rate": 8.679357369196862e-06, "loss": 1.879, "step": 4368 }, { "epoch": 0.3270637995246383, "grad_norm": 1.2197717428207397, "learning_rate": 8.653924080620634e-06, "loss": 1.5881, "step": 4369 }, { "epoch": 0.32713865962981675, "grad_norm": 1.4899544715881348, "learning_rate": 8.628526425148498e-06, "loss": 1.7813, "step": 4370 }, { "epoch": 0.32721351973499524, "grad_norm": 1.2050772905349731, "learning_rate": 8.603164412687804e-06, "loss": 2.2112, "step": 4371 }, { "epoch": 0.32728837984017367, "grad_norm": 1.384883165359497, "learning_rate": 8.577838053132003e-06, "loss": 2.1957, "step": 4372 }, { "epoch": 0.3273632399453521, "grad_norm": 0.9097550511360168, "learning_rate": 8.55254735636063e-06, "loss": 1.1688, "step": 4373 }, { "epoch": 0.3274381000505306, "grad_norm": 1.1735903024673462, "learning_rate": 8.527292332239334e-06, "loss": 2.2201, "step": 4374 }, { "epoch": 0.327512960155709, "grad_norm": 1.334586501121521, "learning_rate": 8.50207299061978e-06, "loss": 2.2172, "step": 4375 }, { "epoch": 0.32758782026088745, "grad_norm": 1.2190319299697876, "learning_rate": 8.476889341339788e-06, "loss": 1.8169, "step": 4376 }, { "epoch": 0.32766268036606594, "grad_norm": 1.2611587047576904, "learning_rate": 8.451741394223223e-06, "loss": 1.812, "step": 4377 }, { "epoch": 0.32773754047124437, "grad_norm": 1.2816450595855713, "learning_rate": 8.42662915907999e-06, "loss": 1.9024, "step": 4378 }, { "epoch": 0.3278124005764228, "grad_norm": 1.3656351566314697, "learning_rate": 8.401552645706124e-06, "loss": 2.3825, "step": 4379 }, { "epoch": 0.32788726068160123, "grad_norm": 1.1080862283706665, "learning_rate": 8.376511863883718e-06, "loss": 1.5253, "step": 4380 }, { "epoch": 0.3279621207867797, "grad_norm": 1.471994400024414, "learning_rate": 8.351506823380861e-06, "loss": 2.2418, "step": 4381 }, { "epoch": 0.32803698089195815, "grad_norm": 1.3628584146499634, "learning_rate": 8.326537533951762e-06, "loss": 2.151, "step": 4382 }, { "epoch": 0.3281118409971366, "grad_norm": 1.1717501878738403, "learning_rate": 8.301604005336683e-06, "loss": 1.6913, "step": 4383 }, { "epoch": 0.32818670110231507, "grad_norm": 1.5502525568008423, "learning_rate": 8.276706247261912e-06, "loss": 2.3784, "step": 4384 }, { "epoch": 0.3282615612074935, "grad_norm": 1.2802622318267822, "learning_rate": 8.251844269439801e-06, "loss": 1.9703, "step": 4385 }, { "epoch": 0.32833642131267193, "grad_norm": 1.3509187698364258, "learning_rate": 8.227018081568738e-06, "loss": 1.9123, "step": 4386 }, { "epoch": 0.32841128141785036, "grad_norm": 1.2135541439056396, "learning_rate": 8.202227693333108e-06, "loss": 2.2466, "step": 4387 }, { "epoch": 0.32848614152302885, "grad_norm": 1.3049659729003906, "learning_rate": 8.177473114403401e-06, "loss": 2.0321, "step": 4388 }, { "epoch": 0.3285610016282073, "grad_norm": 1.241136074066162, "learning_rate": 8.152754354436099e-06, "loss": 1.7434, "step": 4389 }, { "epoch": 0.3286358617333857, "grad_norm": 1.1801185607910156, "learning_rate": 8.128071423073724e-06, "loss": 1.6053, "step": 4390 }, { "epoch": 0.3287107218385642, "grad_norm": 1.3641743659973145, "learning_rate": 8.10342432994483e-06, "loss": 1.7511, "step": 4391 }, { "epoch": 0.32878558194374263, "grad_norm": 1.2104682922363281, "learning_rate": 8.07881308466395e-06, "loss": 2.2639, "step": 4392 }, { "epoch": 0.32886044204892106, "grad_norm": 1.2274954319000244, "learning_rate": 8.054237696831701e-06, "loss": 1.6448, "step": 4393 }, { "epoch": 0.32893530215409955, "grad_norm": 1.5121643543243408, "learning_rate": 8.029698176034617e-06, "loss": 2.3079, "step": 4394 }, { "epoch": 0.329010162259278, "grad_norm": 1.1555250883102417, "learning_rate": 8.00519453184534e-06, "loss": 1.991, "step": 4395 }, { "epoch": 0.3290850223644564, "grad_norm": 1.034796953201294, "learning_rate": 7.980726773822456e-06, "loss": 1.9575, "step": 4396 }, { "epoch": 0.32915988246963485, "grad_norm": 1.1143454313278198, "learning_rate": 7.956294911510598e-06, "loss": 1.6958, "step": 4397 }, { "epoch": 0.32923474257481333, "grad_norm": 1.192488193511963, "learning_rate": 7.93189895444032e-06, "loss": 1.7494, "step": 4398 }, { "epoch": 0.32930960267999176, "grad_norm": 1.3084845542907715, "learning_rate": 7.907538912128243e-06, "loss": 2.3727, "step": 4399 }, { "epoch": 0.3293844627851702, "grad_norm": 1.4189320802688599, "learning_rate": 7.883214794076943e-06, "loss": 1.5607, "step": 4400 }, { "epoch": 0.3294593228903487, "grad_norm": 1.315454363822937, "learning_rate": 7.858926609775009e-06, "loss": 1.634, "step": 4401 }, { "epoch": 0.3295341829955271, "grad_norm": 1.4979063272476196, "learning_rate": 7.834674368697003e-06, "loss": 1.9172, "step": 4402 }, { "epoch": 0.32960904310070555, "grad_norm": 1.412748098373413, "learning_rate": 7.81045808030345e-06, "loss": 1.9045, "step": 4403 }, { "epoch": 0.329683903205884, "grad_norm": 1.1860309839248657, "learning_rate": 7.786277754040828e-06, "loss": 1.889, "step": 4404 }, { "epoch": 0.32975876331106246, "grad_norm": 1.4009395837783813, "learning_rate": 7.762133399341642e-06, "loss": 2.3985, "step": 4405 }, { "epoch": 0.3298336234162409, "grad_norm": 1.3755050897598267, "learning_rate": 7.738025025624352e-06, "loss": 2.2848, "step": 4406 }, { "epoch": 0.3299084835214193, "grad_norm": 1.1259900331497192, "learning_rate": 7.713952642293342e-06, "loss": 2.0114, "step": 4407 }, { "epoch": 0.3299833436265978, "grad_norm": 1.1992723941802979, "learning_rate": 7.689916258739027e-06, "loss": 1.9659, "step": 4408 }, { "epoch": 0.33005820373177625, "grad_norm": 1.323368787765503, "learning_rate": 7.665915884337693e-06, "loss": 1.5556, "step": 4409 }, { "epoch": 0.3301330638369547, "grad_norm": 1.4067720174789429, "learning_rate": 7.641951528451664e-06, "loss": 2.0456, "step": 4410 }, { "epoch": 0.33020792394213316, "grad_norm": 1.2704321146011353, "learning_rate": 7.618023200429137e-06, "loss": 2.0192, "step": 4411 }, { "epoch": 0.3302827840473116, "grad_norm": 1.4159473180770874, "learning_rate": 7.594130909604291e-06, "loss": 2.2038, "step": 4412 }, { "epoch": 0.33035764415249, "grad_norm": 1.542804479598999, "learning_rate": 7.570274665297272e-06, "loss": 1.7192, "step": 4413 }, { "epoch": 0.33043250425766846, "grad_norm": 1.2539608478546143, "learning_rate": 7.546454476814124e-06, "loss": 2.096, "step": 4414 }, { "epoch": 0.33050736436284694, "grad_norm": 1.3254759311676025, "learning_rate": 7.52267035344687e-06, "loss": 2.2033, "step": 4415 }, { "epoch": 0.3305822244680254, "grad_norm": 1.5012385845184326, "learning_rate": 7.498922304473388e-06, "loss": 2.4617, "step": 4416 }, { "epoch": 0.3306570845732038, "grad_norm": 1.3006514310836792, "learning_rate": 7.475210339157568e-06, "loss": 1.8796, "step": 4417 }, { "epoch": 0.3307319446783823, "grad_norm": 1.0804299116134644, "learning_rate": 7.451534466749188e-06, "loss": 1.9128, "step": 4418 }, { "epoch": 0.3308068047835607, "grad_norm": 1.388202428817749, "learning_rate": 7.427894696483917e-06, "loss": 1.8795, "step": 4419 }, { "epoch": 0.33088166488873916, "grad_norm": 1.444893717765808, "learning_rate": 7.404291037583389e-06, "loss": 1.9201, "step": 4420 }, { "epoch": 0.3309565249939176, "grad_norm": 1.2913213968276978, "learning_rate": 7.3807234992551375e-06, "loss": 2.1704, "step": 4421 }, { "epoch": 0.3310313850990961, "grad_norm": 1.2402644157409668, "learning_rate": 7.3571920906926e-06, "loss": 2.3608, "step": 4422 }, { "epoch": 0.3311062452042745, "grad_norm": 1.1276830434799194, "learning_rate": 7.333696821075109e-06, "loss": 1.7259, "step": 4423 }, { "epoch": 0.33118110530945294, "grad_norm": 1.1984943151474, "learning_rate": 7.310237699567912e-06, "loss": 1.9613, "step": 4424 }, { "epoch": 0.3312559654146314, "grad_norm": 1.1256036758422852, "learning_rate": 7.286814735322178e-06, "loss": 2.1164, "step": 4425 }, { "epoch": 0.33133082551980986, "grad_norm": 1.2741403579711914, "learning_rate": 7.263427937474942e-06, "loss": 1.8209, "step": 4426 }, { "epoch": 0.3314056856249883, "grad_norm": 1.558203935623169, "learning_rate": 7.240077315149141e-06, "loss": 1.8327, "step": 4427 }, { "epoch": 0.3314805457301668, "grad_norm": 1.3963170051574707, "learning_rate": 7.216762877453564e-06, "loss": 2.0738, "step": 4428 }, { "epoch": 0.3315554058353452, "grad_norm": 1.6008565425872803, "learning_rate": 7.193484633482939e-06, "loss": 2.2702, "step": 4429 }, { "epoch": 0.33163026594052364, "grad_norm": 1.0609760284423828, "learning_rate": 7.170242592317855e-06, "loss": 2.0826, "step": 4430 }, { "epoch": 0.33170512604570207, "grad_norm": 1.1889939308166504, "learning_rate": 7.147036763024783e-06, "loss": 1.8871, "step": 4431 }, { "epoch": 0.33177998615088056, "grad_norm": 1.4711006879806519, "learning_rate": 7.123867154656072e-06, "loss": 2.1012, "step": 4432 }, { "epoch": 0.331854846256059, "grad_norm": 1.2431904077529907, "learning_rate": 7.100733776249901e-06, "loss": 1.9886, "step": 4433 }, { "epoch": 0.3319297063612374, "grad_norm": 1.3750231266021729, "learning_rate": 7.077636636830387e-06, "loss": 2.0687, "step": 4434 }, { "epoch": 0.3320045664664159, "grad_norm": 1.3740020990371704, "learning_rate": 7.054575745407433e-06, "loss": 2.129, "step": 4435 }, { "epoch": 0.33207942657159434, "grad_norm": 1.1560695171356201, "learning_rate": 7.031551110976864e-06, "loss": 1.5444, "step": 4436 }, { "epoch": 0.33215428667677277, "grad_norm": 1.1945688724517822, "learning_rate": 7.0085627425203195e-06, "loss": 2.0741, "step": 4437 }, { "epoch": 0.3322291467819512, "grad_norm": 1.4020158052444458, "learning_rate": 6.985610649005336e-06, "loss": 2.0004, "step": 4438 }, { "epoch": 0.3323040068871297, "grad_norm": 1.2921391725540161, "learning_rate": 6.962694839385253e-06, "loss": 1.9492, "step": 4439 }, { "epoch": 0.3323788669923081, "grad_norm": 1.3309450149536133, "learning_rate": 6.9398153225992855e-06, "loss": 1.425, "step": 4440 }, { "epoch": 0.33245372709748655, "grad_norm": 1.2448668479919434, "learning_rate": 6.916972107572473e-06, "loss": 1.6236, "step": 4441 }, { "epoch": 0.33252858720266504, "grad_norm": 1.326363444328308, "learning_rate": 6.89416520321573e-06, "loss": 1.6501, "step": 4442 }, { "epoch": 0.33260344730784347, "grad_norm": 1.3676609992980957, "learning_rate": 6.871394618425741e-06, "loss": 2.1144, "step": 4443 }, { "epoch": 0.3326783074130219, "grad_norm": 1.4085878133773804, "learning_rate": 6.8486603620851134e-06, "loss": 2.0792, "step": 4444 }, { "epoch": 0.3327531675182004, "grad_norm": 1.3534120321273804, "learning_rate": 6.825962443062173e-06, "loss": 1.7903, "step": 4445 }, { "epoch": 0.3328280276233788, "grad_norm": 1.3016992807388306, "learning_rate": 6.80330087021116e-06, "loss": 1.8581, "step": 4446 }, { "epoch": 0.33290288772855725, "grad_norm": 1.2000764608383179, "learning_rate": 6.780675652372093e-06, "loss": 2.3044, "step": 4447 }, { "epoch": 0.3329777478337357, "grad_norm": 1.4830985069274902, "learning_rate": 6.758086798370844e-06, "loss": 1.9328, "step": 4448 }, { "epoch": 0.33305260793891417, "grad_norm": 1.2970929145812988, "learning_rate": 6.735534317019077e-06, "loss": 2.1614, "step": 4449 }, { "epoch": 0.3331274680440926, "grad_norm": 1.2095531225204468, "learning_rate": 6.713018217114264e-06, "loss": 2.2711, "step": 4450 }, { "epoch": 0.33320232814927103, "grad_norm": 1.1782187223434448, "learning_rate": 6.690538507439659e-06, "loss": 1.8764, "step": 4451 }, { "epoch": 0.3332771882544495, "grad_norm": 1.1599329710006714, "learning_rate": 6.668095196764368e-06, "loss": 1.9403, "step": 4452 }, { "epoch": 0.33335204835962795, "grad_norm": 1.2321711778640747, "learning_rate": 6.645688293843289e-06, "loss": 2.0978, "step": 4453 }, { "epoch": 0.3334269084648064, "grad_norm": 1.5362797975540161, "learning_rate": 6.623317807417107e-06, "loss": 2.1059, "step": 4454 }, { "epoch": 0.3335017685699848, "grad_norm": 1.2954950332641602, "learning_rate": 6.600983746212319e-06, "loss": 2.0127, "step": 4455 }, { "epoch": 0.3335766286751633, "grad_norm": 1.1956791877746582, "learning_rate": 6.578686118941169e-06, "loss": 2.5334, "step": 4456 }, { "epoch": 0.33365148878034173, "grad_norm": 1.3118082284927368, "learning_rate": 6.5564249343017306e-06, "loss": 2.0191, "step": 4457 }, { "epoch": 0.33372634888552016, "grad_norm": 1.29216468334198, "learning_rate": 6.534200200977847e-06, "loss": 1.8399, "step": 4458 }, { "epoch": 0.33380120899069865, "grad_norm": 1.2131433486938477, "learning_rate": 6.512011927639161e-06, "loss": 2.2759, "step": 4459 }, { "epoch": 0.3338760690958771, "grad_norm": 1.3825722932815552, "learning_rate": 6.489860122941039e-06, "loss": 1.629, "step": 4460 }, { "epoch": 0.3339509292010555, "grad_norm": 1.527295470237732, "learning_rate": 6.4677447955246995e-06, "loss": 2.0666, "step": 4461 }, { "epoch": 0.334025789306234, "grad_norm": 1.2060127258300781, "learning_rate": 6.445665954017044e-06, "loss": 2.3603, "step": 4462 }, { "epoch": 0.33410064941141243, "grad_norm": 1.3833404779434204, "learning_rate": 6.4236236070308044e-06, "loss": 2.3777, "step": 4463 }, { "epoch": 0.33417550951659086, "grad_norm": 1.053107738494873, "learning_rate": 6.40161776316448e-06, "loss": 1.6097, "step": 4464 }, { "epoch": 0.3342503696217693, "grad_norm": 1.231627345085144, "learning_rate": 6.379648431002283e-06, "loss": 1.6673, "step": 4465 }, { "epoch": 0.3343252297269478, "grad_norm": 1.2864612340927124, "learning_rate": 6.357715619114235e-06, "loss": 2.1678, "step": 4466 }, { "epoch": 0.3344000898321262, "grad_norm": 1.3971160650253296, "learning_rate": 6.33581933605607e-06, "loss": 2.1692, "step": 4467 }, { "epoch": 0.33447494993730464, "grad_norm": 1.1919745206832886, "learning_rate": 6.313959590369279e-06, "loss": 1.9913, "step": 4468 }, { "epoch": 0.33454981004248313, "grad_norm": 1.2282556295394897, "learning_rate": 6.292136390581116e-06, "loss": 1.9534, "step": 4469 }, { "epoch": 0.33462467014766156, "grad_norm": 1.195971131324768, "learning_rate": 6.270349745204584e-06, "loss": 1.4962, "step": 4470 }, { "epoch": 0.33469953025284, "grad_norm": 1.1929998397827148, "learning_rate": 6.248599662738397e-06, "loss": 2.0474, "step": 4471 }, { "epoch": 0.3347743903580184, "grad_norm": 1.371244192123413, "learning_rate": 6.226886151667055e-06, "loss": 2.1171, "step": 4472 }, { "epoch": 0.3348492504631969, "grad_norm": 1.9405146837234497, "learning_rate": 6.205209220460717e-06, "loss": 2.0981, "step": 4473 }, { "epoch": 0.33492411056837534, "grad_norm": 1.2556201219558716, "learning_rate": 6.183568877575352e-06, "loss": 1.6203, "step": 4474 }, { "epoch": 0.3349989706735538, "grad_norm": 1.212060809135437, "learning_rate": 6.161965131452618e-06, "loss": 1.9742, "step": 4475 }, { "epoch": 0.33507383077873226, "grad_norm": 1.5470325946807861, "learning_rate": 6.140397990519864e-06, "loss": 1.936, "step": 4476 }, { "epoch": 0.3351486908839107, "grad_norm": 1.0620369911193848, "learning_rate": 6.118867463190236e-06, "loss": 1.9962, "step": 4477 }, { "epoch": 0.3352235509890891, "grad_norm": 1.4855120182037354, "learning_rate": 6.097373557862551e-06, "loss": 2.2709, "step": 4478 }, { "epoch": 0.3352984110942676, "grad_norm": 1.2029447555541992, "learning_rate": 6.075916282921324e-06, "loss": 1.9973, "step": 4479 }, { "epoch": 0.33537327119944604, "grad_norm": 1.2604753971099854, "learning_rate": 6.054495646736813e-06, "loss": 1.4965, "step": 4480 }, { "epoch": 0.3354481313046245, "grad_norm": 1.1157619953155518, "learning_rate": 6.03311165766498e-06, "loss": 1.8553, "step": 4481 }, { "epoch": 0.3355229914098029, "grad_norm": 1.270527720451355, "learning_rate": 6.011764324047487e-06, "loss": 1.9392, "step": 4482 }, { "epoch": 0.3355978515149814, "grad_norm": 1.2749990224838257, "learning_rate": 5.990453654211703e-06, "loss": 2.3344, "step": 4483 }, { "epoch": 0.3356727116201598, "grad_norm": 1.236285924911499, "learning_rate": 5.969179656470669e-06, "loss": 1.8099, "step": 4484 }, { "epoch": 0.33574757172533826, "grad_norm": 1.2376283407211304, "learning_rate": 5.9479423391231355e-06, "loss": 1.9023, "step": 4485 }, { "epoch": 0.33582243183051674, "grad_norm": 1.3194891214370728, "learning_rate": 5.926741710453554e-06, "loss": 1.8627, "step": 4486 }, { "epoch": 0.3358972919356952, "grad_norm": 1.429590106010437, "learning_rate": 5.905577778732052e-06, "loss": 1.864, "step": 4487 }, { "epoch": 0.3359721520408736, "grad_norm": 1.2722073793411255, "learning_rate": 5.88445055221446e-06, "loss": 2.2522, "step": 4488 }, { "epoch": 0.33604701214605204, "grad_norm": 1.5981965065002441, "learning_rate": 5.863360039142296e-06, "loss": 2.5485, "step": 4489 }, { "epoch": 0.3361218722512305, "grad_norm": 1.3211497068405151, "learning_rate": 5.842306247742691e-06, "loss": 2.2295, "step": 4490 }, { "epoch": 0.33619673235640896, "grad_norm": 1.5702818632125854, "learning_rate": 5.8212891862285405e-06, "loss": 2.0385, "step": 4491 }, { "epoch": 0.3362715924615874, "grad_norm": 1.2147630453109741, "learning_rate": 5.8003088627983425e-06, "loss": 2.0027, "step": 4492 }, { "epoch": 0.3363464525667659, "grad_norm": 1.4685789346694946, "learning_rate": 5.779365285636296e-06, "loss": 2.0883, "step": 4493 }, { "epoch": 0.3364213126719443, "grad_norm": 1.1228058338165283, "learning_rate": 5.758458462912264e-06, "loss": 1.7361, "step": 4494 }, { "epoch": 0.33649617277712274, "grad_norm": 1.2063429355621338, "learning_rate": 5.73758840278179e-06, "loss": 2.0709, "step": 4495 }, { "epoch": 0.3365710328823012, "grad_norm": 1.0103908777236938, "learning_rate": 5.7167551133860185e-06, "loss": 1.9018, "step": 4496 }, { "epoch": 0.33664589298747966, "grad_norm": 1.4489710330963135, "learning_rate": 5.695958602851814e-06, "loss": 2.1125, "step": 4497 }, { "epoch": 0.3367207530926581, "grad_norm": 1.2138603925704956, "learning_rate": 5.675198879291655e-06, "loss": 2.0634, "step": 4498 }, { "epoch": 0.3367956131978365, "grad_norm": 1.3541476726531982, "learning_rate": 5.6544759508036945e-06, "loss": 1.6927, "step": 4499 }, { "epoch": 0.336870473303015, "grad_norm": 1.2563163042068481, "learning_rate": 5.6337898254717135e-06, "loss": 1.6564, "step": 4500 }, { "epoch": 0.336870473303015, "eval_loss": 1.9679548740386963, "eval_runtime": 178.9434, "eval_samples_per_second": 27.942, "eval_steps_per_second": 13.971, "step": 4500 }, { "epoch": 0.33694533340819344, "grad_norm": 1.4810017347335815, "learning_rate": 5.613140511365145e-06, "loss": 1.7605, "step": 4501 }, { "epoch": 0.33702019351337187, "grad_norm": 1.1984667778015137, "learning_rate": 5.592528016539056e-06, "loss": 2.3057, "step": 4502 }, { "epoch": 0.33709505361855036, "grad_norm": 1.126158595085144, "learning_rate": 5.571952349034148e-06, "loss": 2.0628, "step": 4503 }, { "epoch": 0.3371699137237288, "grad_norm": 1.2925643920898438, "learning_rate": 5.551413516876769e-06, "loss": 2.2757, "step": 4504 }, { "epoch": 0.3372447738289072, "grad_norm": 1.1219215393066406, "learning_rate": 5.530911528078908e-06, "loss": 1.8029, "step": 4505 }, { "epoch": 0.33731963393408565, "grad_norm": 1.16107177734375, "learning_rate": 5.510446390638158e-06, "loss": 1.7235, "step": 4506 }, { "epoch": 0.33739449403926414, "grad_norm": 1.2166976928710938, "learning_rate": 5.490018112537732e-06, "loss": 2.0464, "step": 4507 }, { "epoch": 0.33746935414444257, "grad_norm": 1.1722297668457031, "learning_rate": 5.4696267017465e-06, "loss": 1.5223, "step": 4508 }, { "epoch": 0.337544214249621, "grad_norm": 1.4316025972366333, "learning_rate": 5.449272166218899e-06, "loss": 2.2162, "step": 4509 }, { "epoch": 0.3376190743547995, "grad_norm": 1.3389047384262085, "learning_rate": 5.428954513895035e-06, "loss": 2.3095, "step": 4510 }, { "epoch": 0.3376939344599779, "grad_norm": 1.1408673524856567, "learning_rate": 5.408673752700588e-06, "loss": 2.1663, "step": 4511 }, { "epoch": 0.33776879456515635, "grad_norm": 1.3274317979812622, "learning_rate": 5.3884298905468645e-06, "loss": 2.0602, "step": 4512 }, { "epoch": 0.33784365467033484, "grad_norm": 1.296419620513916, "learning_rate": 5.368222935330791e-06, "loss": 2.0923, "step": 4513 }, { "epoch": 0.33791851477551327, "grad_norm": 1.3062355518341064, "learning_rate": 5.348052894934852e-06, "loss": 1.7155, "step": 4514 }, { "epoch": 0.3379933748806917, "grad_norm": 1.7694618701934814, "learning_rate": 5.3279197772271636e-06, "loss": 2.0182, "step": 4515 }, { "epoch": 0.33806823498587013, "grad_norm": 1.3689396381378174, "learning_rate": 5.307823590061467e-06, "loss": 2.3869, "step": 4516 }, { "epoch": 0.3381430950910486, "grad_norm": 1.2253679037094116, "learning_rate": 5.2877643412770125e-06, "loss": 1.4936, "step": 4517 }, { "epoch": 0.33821795519622705, "grad_norm": 1.176476001739502, "learning_rate": 5.267742038698731e-06, "loss": 1.6329, "step": 4518 }, { "epoch": 0.3382928153014055, "grad_norm": 1.2689753770828247, "learning_rate": 5.247756690137107e-06, "loss": 2.0669, "step": 4519 }, { "epoch": 0.33836767540658397, "grad_norm": 1.2251344919204712, "learning_rate": 5.227808303388182e-06, "loss": 1.8159, "step": 4520 }, { "epoch": 0.3384425355117624, "grad_norm": 1.125416874885559, "learning_rate": 5.207896886233599e-06, "loss": 2.1833, "step": 4521 }, { "epoch": 0.33851739561694083, "grad_norm": 1.3435468673706055, "learning_rate": 5.18802244644061e-06, "loss": 1.9395, "step": 4522 }, { "epoch": 0.33859225572211926, "grad_norm": 1.3793023824691772, "learning_rate": 5.168184991762004e-06, "loss": 2.0206, "step": 4523 }, { "epoch": 0.33866711582729775, "grad_norm": 1.235838532447815, "learning_rate": 5.148384529936168e-06, "loss": 2.1191, "step": 4524 }, { "epoch": 0.3387419759324762, "grad_norm": 1.605233073234558, "learning_rate": 5.128621068687034e-06, "loss": 2.2223, "step": 4525 }, { "epoch": 0.3388168360376546, "grad_norm": 1.408650279045105, "learning_rate": 5.108894615724102e-06, "loss": 2.0143, "step": 4526 }, { "epoch": 0.3388916961428331, "grad_norm": 1.5687154531478882, "learning_rate": 5.089205178742451e-06, "loss": 2.3578, "step": 4527 }, { "epoch": 0.33896655624801153, "grad_norm": 1.4328505992889404, "learning_rate": 5.069552765422714e-06, "loss": 2.3008, "step": 4528 }, { "epoch": 0.33904141635318996, "grad_norm": 1.3026410341262817, "learning_rate": 5.049937383431092e-06, "loss": 1.9366, "step": 4529 }, { "epoch": 0.33911627645836845, "grad_norm": 1.253153681755066, "learning_rate": 5.030359040419341e-06, "loss": 1.8756, "step": 4530 }, { "epoch": 0.3391911365635469, "grad_norm": 1.3611829280853271, "learning_rate": 5.01081774402472e-06, "loss": 1.6398, "step": 4531 }, { "epoch": 0.3392659966687253, "grad_norm": 1.395171880722046, "learning_rate": 4.991313501870121e-06, "loss": 2.154, "step": 4532 }, { "epoch": 0.33934085677390374, "grad_norm": 1.4445016384124756, "learning_rate": 4.971846321563889e-06, "loss": 1.7116, "step": 4533 }, { "epoch": 0.33941571687908223, "grad_norm": 1.231751561164856, "learning_rate": 4.952416210699984e-06, "loss": 1.9969, "step": 4534 }, { "epoch": 0.33949057698426066, "grad_norm": 1.3712412118911743, "learning_rate": 4.933023176857876e-06, "loss": 2.0731, "step": 4535 }, { "epoch": 0.3395654370894391, "grad_norm": 1.3415284156799316, "learning_rate": 4.913667227602592e-06, "loss": 2.0211, "step": 4536 }, { "epoch": 0.3396402971946176, "grad_norm": 1.1422545909881592, "learning_rate": 4.8943483704846475e-06, "loss": 1.7379, "step": 4537 }, { "epoch": 0.339715157299796, "grad_norm": 1.3054524660110474, "learning_rate": 4.875066613040125e-06, "loss": 1.9913, "step": 4538 }, { "epoch": 0.33979001740497444, "grad_norm": 1.3599408864974976, "learning_rate": 4.85582196279063e-06, "loss": 2.2468, "step": 4539 }, { "epoch": 0.3398648775101529, "grad_norm": 1.278880000114441, "learning_rate": 4.836614427243302e-06, "loss": 1.9222, "step": 4540 }, { "epoch": 0.33993973761533136, "grad_norm": 1.3722074031829834, "learning_rate": 4.8174440138907705e-06, "loss": 1.9081, "step": 4541 }, { "epoch": 0.3400145977205098, "grad_norm": 1.2537212371826172, "learning_rate": 4.7983107302112196e-06, "loss": 2.0784, "step": 4542 }, { "epoch": 0.3400894578256882, "grad_norm": 1.4048740863800049, "learning_rate": 4.779214583668323e-06, "loss": 1.4979, "step": 4543 }, { "epoch": 0.3401643179308667, "grad_norm": 1.2366307973861694, "learning_rate": 4.760155581711267e-06, "loss": 1.6565, "step": 4544 }, { "epoch": 0.34023917803604514, "grad_norm": 1.2021944522857666, "learning_rate": 4.741133731774783e-06, "loss": 1.8748, "step": 4545 }, { "epoch": 0.3403140381412236, "grad_norm": 1.1715115308761597, "learning_rate": 4.722149041279078e-06, "loss": 1.302, "step": 4546 }, { "epoch": 0.34038889824640206, "grad_norm": 1.3931176662445068, "learning_rate": 4.703201517629885e-06, "loss": 2.179, "step": 4547 }, { "epoch": 0.3404637583515805, "grad_norm": 1.3814362287521362, "learning_rate": 4.684291168218413e-06, "loss": 1.8616, "step": 4548 }, { "epoch": 0.3405386184567589, "grad_norm": 1.5033177137374878, "learning_rate": 4.665418000421362e-06, "loss": 1.8014, "step": 4549 }, { "epoch": 0.34061347856193736, "grad_norm": 1.2480769157409668, "learning_rate": 4.646582021600976e-06, "loss": 2.0747, "step": 4550 }, { "epoch": 0.34068833866711584, "grad_norm": 1.373628854751587, "learning_rate": 4.627783239104955e-06, "loss": 2.1831, "step": 4551 }, { "epoch": 0.3407631987722943, "grad_norm": 1.286186695098877, "learning_rate": 4.609021660266499e-06, "loss": 1.7044, "step": 4552 }, { "epoch": 0.3408380588774727, "grad_norm": 1.1876329183578491, "learning_rate": 4.5902972924043e-06, "loss": 2.232, "step": 4553 }, { "epoch": 0.3409129189826512, "grad_norm": 1.3626254796981812, "learning_rate": 4.571610142822513e-06, "loss": 2.0606, "step": 4554 }, { "epoch": 0.3409877790878296, "grad_norm": 1.1329621076583862, "learning_rate": 4.552960218810809e-06, "loss": 2.4244, "step": 4555 }, { "epoch": 0.34106263919300805, "grad_norm": 1.3104658126831055, "learning_rate": 4.534347527644323e-06, "loss": 1.9852, "step": 4556 }, { "epoch": 0.3411374992981865, "grad_norm": 1.2121812105178833, "learning_rate": 4.515772076583624e-06, "loss": 1.3558, "step": 4557 }, { "epoch": 0.341212359403365, "grad_norm": 1.1290010213851929, "learning_rate": 4.497233872874828e-06, "loss": 1.9576, "step": 4558 }, { "epoch": 0.3412872195085434, "grad_norm": 1.2438616752624512, "learning_rate": 4.4787329237494845e-06, "loss": 2.0231, "step": 4559 }, { "epoch": 0.34136207961372184, "grad_norm": 1.225235939025879, "learning_rate": 4.4602692364245965e-06, "loss": 2.0676, "step": 4560 }, { "epoch": 0.3414369397189003, "grad_norm": 1.1549967527389526, "learning_rate": 4.441842818102648e-06, "loss": 1.9661, "step": 4561 }, { "epoch": 0.34151179982407875, "grad_norm": 1.2064086198806763, "learning_rate": 4.423453675971578e-06, "loss": 2.1442, "step": 4562 }, { "epoch": 0.3415866599292572, "grad_norm": 1.219541311264038, "learning_rate": 4.405101817204816e-06, "loss": 1.7305, "step": 4563 }, { "epoch": 0.3416615200344357, "grad_norm": 1.3185049295425415, "learning_rate": 4.386787248961199e-06, "loss": 1.8793, "step": 4564 }, { "epoch": 0.3417363801396141, "grad_norm": 1.2376056909561157, "learning_rate": 4.3685099783850605e-06, "loss": 1.8657, "step": 4565 }, { "epoch": 0.34181124024479254, "grad_norm": 1.5029139518737793, "learning_rate": 4.350270012606117e-06, "loss": 1.3492, "step": 4566 }, { "epoch": 0.34188610034997097, "grad_norm": 1.2089899778366089, "learning_rate": 4.332067358739622e-06, "loss": 1.7283, "step": 4567 }, { "epoch": 0.34196096045514945, "grad_norm": 1.2148491144180298, "learning_rate": 4.313902023886218e-06, "loss": 2.1722, "step": 4568 }, { "epoch": 0.3420358205603279, "grad_norm": 1.4390515089035034, "learning_rate": 4.295774015132003e-06, "loss": 2.1155, "step": 4569 }, { "epoch": 0.3421106806655063, "grad_norm": 1.3047356605529785, "learning_rate": 4.277683339548522e-06, "loss": 1.7524, "step": 4570 }, { "epoch": 0.3421855407706848, "grad_norm": 1.3196558952331543, "learning_rate": 4.259630004192739e-06, "loss": 2.262, "step": 4571 }, { "epoch": 0.34226040087586324, "grad_norm": 1.1313893795013428, "learning_rate": 4.241614016107076e-06, "loss": 1.7131, "step": 4572 }, { "epoch": 0.34233526098104167, "grad_norm": 1.0311846733093262, "learning_rate": 4.223635382319357e-06, "loss": 1.6891, "step": 4573 }, { "epoch": 0.34241012108622015, "grad_norm": 1.2184085845947266, "learning_rate": 4.20569410984285e-06, "loss": 1.9424, "step": 4574 }, { "epoch": 0.3424849811913986, "grad_norm": 1.080529808998108, "learning_rate": 4.187790205676268e-06, "loss": 1.5174, "step": 4575 }, { "epoch": 0.342559841296577, "grad_norm": 1.2139588594436646, "learning_rate": 4.169923676803722e-06, "loss": 1.9176, "step": 4576 }, { "epoch": 0.34263470140175545, "grad_norm": 1.1079462766647339, "learning_rate": 4.152094530194728e-06, "loss": 2.1642, "step": 4577 }, { "epoch": 0.34270956150693394, "grad_norm": 1.2222208976745605, "learning_rate": 4.134302772804255e-06, "loss": 1.9995, "step": 4578 }, { "epoch": 0.34278442161211237, "grad_norm": 1.1748733520507812, "learning_rate": 4.116548411572674e-06, "loss": 1.8265, "step": 4579 }, { "epoch": 0.3428592817172908, "grad_norm": 1.076507568359375, "learning_rate": 4.098831453425766e-06, "loss": 1.7564, "step": 4580 }, { "epoch": 0.3429341418224693, "grad_norm": 1.184265375137329, "learning_rate": 4.081151905274738e-06, "loss": 1.9224, "step": 4581 }, { "epoch": 0.3430090019276477, "grad_norm": 1.2332978248596191, "learning_rate": 4.063509774016183e-06, "loss": 1.7429, "step": 4582 }, { "epoch": 0.34308386203282615, "grad_norm": 1.4396919012069702, "learning_rate": 4.045905066532063e-06, "loss": 2.1194, "step": 4583 }, { "epoch": 0.3431587221380046, "grad_norm": 1.330065131187439, "learning_rate": 4.0283377896898285e-06, "loss": 2.0043, "step": 4584 }, { "epoch": 0.34323358224318307, "grad_norm": 1.3443070650100708, "learning_rate": 4.010807950342255e-06, "loss": 2.038, "step": 4585 }, { "epoch": 0.3433084423483615, "grad_norm": 1.4051352739334106, "learning_rate": 3.993315555327559e-06, "loss": 1.9003, "step": 4586 }, { "epoch": 0.34338330245353993, "grad_norm": 1.1485977172851562, "learning_rate": 3.975860611469329e-06, "loss": 2.1166, "step": 4587 }, { "epoch": 0.3434581625587184, "grad_norm": 1.2538539171218872, "learning_rate": 3.958443125576539e-06, "loss": 2.1864, "step": 4588 }, { "epoch": 0.34353302266389685, "grad_norm": 1.4990195035934448, "learning_rate": 3.941063104443576e-06, "loss": 1.7862, "step": 4589 }, { "epoch": 0.3436078827690753, "grad_norm": 1.2596116065979004, "learning_rate": 3.923720554850164e-06, "loss": 1.8179, "step": 4590 }, { "epoch": 0.34368274287425377, "grad_norm": 1.3709298372268677, "learning_rate": 3.906415483561465e-06, "loss": 2.1358, "step": 4591 }, { "epoch": 0.3437576029794322, "grad_norm": 1.221257209777832, "learning_rate": 3.889147897328005e-06, "loss": 1.7133, "step": 4592 }, { "epoch": 0.34383246308461063, "grad_norm": 1.385835886001587, "learning_rate": 3.871917802885671e-06, "loss": 2.3332, "step": 4593 }, { "epoch": 0.34390732318978906, "grad_norm": 1.0814961194992065, "learning_rate": 3.854725206955722e-06, "loss": 1.9773, "step": 4594 }, { "epoch": 0.34398218329496755, "grad_norm": 1.1003409624099731, "learning_rate": 3.837570116244815e-06, "loss": 2.0169, "step": 4595 }, { "epoch": 0.344057043400146, "grad_norm": 1.2890812158584595, "learning_rate": 3.820452537444952e-06, "loss": 1.9974, "step": 4596 }, { "epoch": 0.3441319035053244, "grad_norm": 1.3188706636428833, "learning_rate": 3.8033724772335468e-06, "loss": 2.3923, "step": 4597 }, { "epoch": 0.3442067636105029, "grad_norm": 1.2769078016281128, "learning_rate": 3.7863299422732923e-06, "loss": 2.0763, "step": 4598 }, { "epoch": 0.34428162371568133, "grad_norm": 1.2590250968933105, "learning_rate": 3.7693249392123486e-06, "loss": 2.1258, "step": 4599 }, { "epoch": 0.34435648382085976, "grad_norm": 1.1261121034622192, "learning_rate": 3.7523574746841315e-06, "loss": 1.9399, "step": 4600 }, { "epoch": 0.3444313439260382, "grad_norm": 1.3801418542861938, "learning_rate": 3.735427555307491e-06, "loss": 1.9588, "step": 4601 }, { "epoch": 0.3445062040312167, "grad_norm": 1.2212085723876953, "learning_rate": 3.718535187686589e-06, "loss": 1.9726, "step": 4602 }, { "epoch": 0.3445810641363951, "grad_norm": 1.393600344657898, "learning_rate": 3.7016803784109654e-06, "loss": 1.8796, "step": 4603 }, { "epoch": 0.34465592424157354, "grad_norm": 1.3755656480789185, "learning_rate": 3.684863134055516e-06, "loss": 1.9832, "step": 4604 }, { "epoch": 0.34473078434675203, "grad_norm": 1.350975513458252, "learning_rate": 3.668083461180416e-06, "loss": 2.1357, "step": 4605 }, { "epoch": 0.34480564445193046, "grad_norm": 1.3265366554260254, "learning_rate": 3.6513413663312733e-06, "loss": 2.0447, "step": 4606 }, { "epoch": 0.3448805045571089, "grad_norm": 1.309421420097351, "learning_rate": 3.634636856038964e-06, "loss": 2.5869, "step": 4607 }, { "epoch": 0.3449553646622874, "grad_norm": 1.5961720943450928, "learning_rate": 3.617969936819754e-06, "loss": 1.278, "step": 4608 }, { "epoch": 0.3450302247674658, "grad_norm": 1.3921486139297485, "learning_rate": 3.6013406151752306e-06, "loss": 2.1268, "step": 4609 }, { "epoch": 0.34510508487264424, "grad_norm": 1.221779465675354, "learning_rate": 3.5847488975922937e-06, "loss": 2.0602, "step": 4610 }, { "epoch": 0.3451799449778227, "grad_norm": 1.3110703229904175, "learning_rate": 3.5681947905432e-06, "loss": 2.247, "step": 4611 }, { "epoch": 0.34525480508300116, "grad_norm": 1.2690353393554688, "learning_rate": 3.551678300485528e-06, "loss": 1.514, "step": 4612 }, { "epoch": 0.3453296651881796, "grad_norm": 1.3026281595230103, "learning_rate": 3.5351994338621796e-06, "loss": 1.7812, "step": 4613 }, { "epoch": 0.345404525293358, "grad_norm": 1.210710048675537, "learning_rate": 3.518758197101368e-06, "loss": 1.4519, "step": 4614 }, { "epoch": 0.3454793853985365, "grad_norm": 1.2986096143722534, "learning_rate": 3.5023545966166414e-06, "loss": 1.7607, "step": 4615 }, { "epoch": 0.34555424550371494, "grad_norm": 1.2815183401107788, "learning_rate": 3.485988638806881e-06, "loss": 2.1305, "step": 4616 }, { "epoch": 0.34562910560889337, "grad_norm": 1.2001575231552124, "learning_rate": 3.469660330056268e-06, "loss": 2.6433, "step": 4617 }, { "epoch": 0.3457039657140718, "grad_norm": 1.1708577871322632, "learning_rate": 3.453369676734264e-06, "loss": 1.8919, "step": 4618 }, { "epoch": 0.3457788258192503, "grad_norm": 1.1975778341293335, "learning_rate": 3.437116685195707e-06, "loss": 1.9456, "step": 4619 }, { "epoch": 0.3458536859244287, "grad_norm": 1.4327126741409302, "learning_rate": 3.420901361780704e-06, "loss": 2.2748, "step": 4620 }, { "epoch": 0.34592854602960715, "grad_norm": 1.2596209049224854, "learning_rate": 3.4047237128146837e-06, "loss": 1.8103, "step": 4621 }, { "epoch": 0.34600340613478564, "grad_norm": 1.4433529376983643, "learning_rate": 3.3885837446083424e-06, "loss": 2.2913, "step": 4622 }, { "epoch": 0.34607826623996407, "grad_norm": 1.4974435567855835, "learning_rate": 3.372481463457733e-06, "loss": 2.0991, "step": 4623 }, { "epoch": 0.3461531263451425, "grad_norm": 1.05780827999115, "learning_rate": 3.3564168756441525e-06, "loss": 1.4658, "step": 4624 }, { "epoch": 0.346227986450321, "grad_norm": 1.2845220565795898, "learning_rate": 3.340389987434245e-06, "loss": 1.6306, "step": 4625 }, { "epoch": 0.3463028465554994, "grad_norm": 1.345922589302063, "learning_rate": 3.324400805079908e-06, "loss": 2.1766, "step": 4626 }, { "epoch": 0.34637770666067785, "grad_norm": 1.3875548839569092, "learning_rate": 3.3084493348183642e-06, "loss": 2.2689, "step": 4627 }, { "epoch": 0.3464525667658563, "grad_norm": 1.2664425373077393, "learning_rate": 3.2925355828720915e-06, "loss": 1.7772, "step": 4628 }, { "epoch": 0.34652742687103477, "grad_norm": 1.4358547925949097, "learning_rate": 3.276659555448891e-06, "loss": 2.3201, "step": 4629 }, { "epoch": 0.3466022869762132, "grad_norm": 1.2995142936706543, "learning_rate": 3.260821258741786e-06, "loss": 2.4587, "step": 4630 }, { "epoch": 0.34667714708139163, "grad_norm": 1.3399537801742554, "learning_rate": 3.2450206989291576e-06, "loss": 2.3059, "step": 4631 }, { "epoch": 0.3467520071865701, "grad_norm": 1.3072859048843384, "learning_rate": 3.229257882174619e-06, "loss": 1.7803, "step": 4632 }, { "epoch": 0.34682686729174855, "grad_norm": 1.2386009693145752, "learning_rate": 3.2135328146270738e-06, "loss": 1.8493, "step": 4633 }, { "epoch": 0.346901727396927, "grad_norm": 1.3215731382369995, "learning_rate": 3.1978455024207155e-06, "loss": 2.0525, "step": 4634 }, { "epoch": 0.3469765875021054, "grad_norm": 1.3598041534423828, "learning_rate": 3.18219595167496e-06, "loss": 1.8988, "step": 4635 }, { "epoch": 0.3470514476072839, "grad_norm": 1.283776044845581, "learning_rate": 3.166584168494546e-06, "loss": 2.1585, "step": 4636 }, { "epoch": 0.34712630771246233, "grad_norm": 1.2166756391525269, "learning_rate": 3.1510101589694586e-06, "loss": 2.4006, "step": 4637 }, { "epoch": 0.34720116781764077, "grad_norm": 1.3717604875564575, "learning_rate": 3.135473929174959e-06, "loss": 2.0093, "step": 4638 }, { "epoch": 0.34727602792281925, "grad_norm": 1.270832896232605, "learning_rate": 3.1199754851715336e-06, "loss": 1.5808, "step": 4639 }, { "epoch": 0.3473508880279977, "grad_norm": 1.3722212314605713, "learning_rate": 3.104514833004979e-06, "loss": 2.0907, "step": 4640 }, { "epoch": 0.3474257481331761, "grad_norm": 1.285548448562622, "learning_rate": 3.0890919787063045e-06, "loss": 2.257, "step": 4641 }, { "epoch": 0.3475006082383546, "grad_norm": 1.408409833908081, "learning_rate": 3.0737069282918194e-06, "loss": 2.0686, "step": 4642 }, { "epoch": 0.34757546834353303, "grad_norm": 1.391724944114685, "learning_rate": 3.058359687763046e-06, "loss": 2.7496, "step": 4643 }, { "epoch": 0.34765032844871147, "grad_norm": 1.5066701173782349, "learning_rate": 3.0430502631067836e-06, "loss": 2.0663, "step": 4644 }, { "epoch": 0.3477251885538899, "grad_norm": 1.042411208152771, "learning_rate": 3.0277786602950776e-06, "loss": 1.751, "step": 4645 }, { "epoch": 0.3478000486590684, "grad_norm": 1.2254292964935303, "learning_rate": 3.0125448852851956e-06, "loss": 1.8533, "step": 4646 }, { "epoch": 0.3478749087642468, "grad_norm": 1.3644639253616333, "learning_rate": 2.9973489440196844e-06, "loss": 2.0269, "step": 4647 }, { "epoch": 0.34794976886942525, "grad_norm": 1.2592064142227173, "learning_rate": 2.9821908424262914e-06, "loss": 2.0861, "step": 4648 }, { "epoch": 0.34802462897460373, "grad_norm": 1.5635769367218018, "learning_rate": 2.9670705864180414e-06, "loss": 2.1319, "step": 4649 }, { "epoch": 0.34809948907978217, "grad_norm": 1.3212071657180786, "learning_rate": 2.9519881818931836e-06, "loss": 1.5252, "step": 4650 }, { "epoch": 0.34809948907978217, "eval_loss": 1.9670976400375366, "eval_runtime": 179.1455, "eval_samples_per_second": 27.91, "eval_steps_per_second": 13.955, "step": 4650 }, { "epoch": 0.3481743491849606, "grad_norm": 1.3401981592178345, "learning_rate": 2.9369436347352118e-06, "loss": 2.3211, "step": 4651 }, { "epoch": 0.34824920929013903, "grad_norm": 1.4028186798095703, "learning_rate": 2.9219369508128093e-06, "loss": 2.0651, "step": 4652 }, { "epoch": 0.3483240693953175, "grad_norm": 1.3562135696411133, "learning_rate": 2.906968135979926e-06, "loss": 1.9705, "step": 4653 }, { "epoch": 0.34839892950049595, "grad_norm": 1.3539448976516724, "learning_rate": 2.8920371960757475e-06, "loss": 2.133, "step": 4654 }, { "epoch": 0.3484737896056744, "grad_norm": 1.3303998708724976, "learning_rate": 2.877144136924659e-06, "loss": 2.0737, "step": 4655 }, { "epoch": 0.34854864971085286, "grad_norm": 1.3011642694473267, "learning_rate": 2.862288964336279e-06, "loss": 1.5676, "step": 4656 }, { "epoch": 0.3486235098160313, "grad_norm": 1.1873743534088135, "learning_rate": 2.847471684105463e-06, "loss": 1.5828, "step": 4657 }, { "epoch": 0.3486983699212097, "grad_norm": 1.2421154975891113, "learning_rate": 2.8326923020122407e-06, "loss": 2.0552, "step": 4658 }, { "epoch": 0.3487732300263882, "grad_norm": 1.2967408895492554, "learning_rate": 2.8179508238219133e-06, "loss": 1.9104, "step": 4659 }, { "epoch": 0.34884809013156665, "grad_norm": 1.4872585535049438, "learning_rate": 2.8032472552849577e-06, "loss": 2.2209, "step": 4660 }, { "epoch": 0.3489229502367451, "grad_norm": 1.2645149230957031, "learning_rate": 2.7885816021370747e-06, "loss": 2.1375, "step": 4661 }, { "epoch": 0.3489978103419235, "grad_norm": 1.7234541177749634, "learning_rate": 2.7739538700991886e-06, "loss": 2.0671, "step": 4662 }, { "epoch": 0.349072670447102, "grad_norm": 1.3294626474380493, "learning_rate": 2.759364064877412e-06, "loss": 1.489, "step": 4663 }, { "epoch": 0.3491475305522804, "grad_norm": 1.319984793663025, "learning_rate": 2.744812192163049e-06, "loss": 1.6896, "step": 4664 }, { "epoch": 0.34922239065745886, "grad_norm": 1.3685359954833984, "learning_rate": 2.7302982576326462e-06, "loss": 1.9214, "step": 4665 }, { "epoch": 0.34929725076263735, "grad_norm": 1.3376015424728394, "learning_rate": 2.7158222669479205e-06, "loss": 1.5299, "step": 4666 }, { "epoch": 0.3493721108678158, "grad_norm": 1.116471290588379, "learning_rate": 2.7013842257557985e-06, "loss": 2.1801, "step": 4667 }, { "epoch": 0.3494469709729942, "grad_norm": 1.1204684972763062, "learning_rate": 2.6869841396884197e-06, "loss": 2.2769, "step": 4668 }, { "epoch": 0.34952183107817264, "grad_norm": 1.246972680091858, "learning_rate": 2.67262201436308e-06, "loss": 1.9962, "step": 4669 }, { "epoch": 0.3495966911833511, "grad_norm": 1.30109703540802, "learning_rate": 2.658297855382297e-06, "loss": 2.0111, "step": 4670 }, { "epoch": 0.34967155128852956, "grad_norm": 1.2520473003387451, "learning_rate": 2.644011668333757e-06, "loss": 1.7951, "step": 4671 }, { "epoch": 0.349746411393708, "grad_norm": 0.9805873036384583, "learning_rate": 2.6297634587903465e-06, "loss": 1.6521, "step": 4672 }, { "epoch": 0.3498212714988865, "grad_norm": 1.2866811752319336, "learning_rate": 2.6155532323101417e-06, "loss": 2.1828, "step": 4673 }, { "epoch": 0.3498961316040649, "grad_norm": 1.1330819129943848, "learning_rate": 2.601380994436409e-06, "loss": 1.1754, "step": 4674 }, { "epoch": 0.34997099170924334, "grad_norm": 1.135083556175232, "learning_rate": 2.5872467506975697e-06, "loss": 1.9038, "step": 4675 }, { "epoch": 0.3500458518144218, "grad_norm": 1.3459707498550415, "learning_rate": 2.573150506607225e-06, "loss": 2.0948, "step": 4676 }, { "epoch": 0.35012071191960026, "grad_norm": 1.277839183807373, "learning_rate": 2.559092267664187e-06, "loss": 1.7929, "step": 4677 }, { "epoch": 0.3501955720247787, "grad_norm": 0.9159249067306519, "learning_rate": 2.5450720393524253e-06, "loss": 2.0505, "step": 4678 }, { "epoch": 0.3502704321299571, "grad_norm": 1.3078985214233398, "learning_rate": 2.5310898271410423e-06, "loss": 1.6495, "step": 4679 }, { "epoch": 0.3503452922351356, "grad_norm": 1.2950013875961304, "learning_rate": 2.517145636484386e-06, "loss": 1.755, "step": 4680 }, { "epoch": 0.35042015234031404, "grad_norm": 1.5121675729751587, "learning_rate": 2.5032394728219053e-06, "loss": 2.3922, "step": 4681 }, { "epoch": 0.35049501244549247, "grad_norm": 1.2426512241363525, "learning_rate": 2.4893713415782503e-06, "loss": 1.8246, "step": 4682 }, { "epoch": 0.35056987255067096, "grad_norm": 1.1959779262542725, "learning_rate": 2.475541248163238e-06, "loss": 2.1967, "step": 4683 }, { "epoch": 0.3506447326558494, "grad_norm": 1.3203898668289185, "learning_rate": 2.461749197971819e-06, "loss": 1.8341, "step": 4684 }, { "epoch": 0.3507195927610278, "grad_norm": 1.1833311319351196, "learning_rate": 2.447995196384134e-06, "loss": 1.8438, "step": 4685 }, { "epoch": 0.35079445286620625, "grad_norm": 1.0874465703964233, "learning_rate": 2.43427924876547e-06, "loss": 2.2704, "step": 4686 }, { "epoch": 0.35086931297138474, "grad_norm": 1.223625659942627, "learning_rate": 2.420601360466257e-06, "loss": 1.798, "step": 4687 }, { "epoch": 0.35094417307656317, "grad_norm": 1.1008864641189575, "learning_rate": 2.4069615368220834e-06, "loss": 1.5893, "step": 4688 }, { "epoch": 0.3510190331817416, "grad_norm": 1.264212727546692, "learning_rate": 2.3933597831537146e-06, "loss": 2.0519, "step": 4689 }, { "epoch": 0.3510938932869201, "grad_norm": 1.2397962808609009, "learning_rate": 2.3797961047670293e-06, "loss": 2.6031, "step": 4690 }, { "epoch": 0.3511687533920985, "grad_norm": 1.1585699319839478, "learning_rate": 2.3662705069530942e-06, "loss": 1.779, "step": 4691 }, { "epoch": 0.35124361349727695, "grad_norm": 1.3350188732147217, "learning_rate": 2.3527829949880674e-06, "loss": 1.954, "step": 4692 }, { "epoch": 0.35131847360245544, "grad_norm": 1.2255163192749023, "learning_rate": 2.3393335741332957e-06, "loss": 1.7874, "step": 4693 }, { "epoch": 0.35139333370763387, "grad_norm": 1.0834754705429077, "learning_rate": 2.325922249635248e-06, "loss": 1.5284, "step": 4694 }, { "epoch": 0.3514681938128123, "grad_norm": 1.8498111963272095, "learning_rate": 2.3125490267255414e-06, "loss": 1.9813, "step": 4695 }, { "epoch": 0.35154305391799073, "grad_norm": 1.3007972240447998, "learning_rate": 2.299213910620912e-06, "loss": 1.574, "step": 4696 }, { "epoch": 0.3516179140231692, "grad_norm": 1.2581608295440674, "learning_rate": 2.2859169065232556e-06, "loss": 1.2881, "step": 4697 }, { "epoch": 0.35169277412834765, "grad_norm": 1.3727989196777344, "learning_rate": 2.2726580196195666e-06, "loss": 2.4513, "step": 4698 }, { "epoch": 0.3517676342335261, "grad_norm": 1.269454836845398, "learning_rate": 2.2594372550820085e-06, "loss": 1.7256, "step": 4699 }, { "epoch": 0.35184249433870457, "grad_norm": 1.187708854675293, "learning_rate": 2.2462546180678557e-06, "loss": 1.8941, "step": 4700 }, { "epoch": 0.351917354443883, "grad_norm": 1.2607229948043823, "learning_rate": 2.2331101137195054e-06, "loss": 2.0609, "step": 4701 }, { "epoch": 0.35199221454906143, "grad_norm": 1.141001582145691, "learning_rate": 2.2200037471645006e-06, "loss": 1.5999, "step": 4702 }, { "epoch": 0.35206707465423986, "grad_norm": 1.4044742584228516, "learning_rate": 2.2069355235154634e-06, "loss": 2.3231, "step": 4703 }, { "epoch": 0.35214193475941835, "grad_norm": 1.3991650342941284, "learning_rate": 2.1939054478701928e-06, "loss": 1.7774, "step": 4704 }, { "epoch": 0.3522167948645968, "grad_norm": 0.9666325449943542, "learning_rate": 2.1809135253115565e-06, "loss": 2.046, "step": 4705 }, { "epoch": 0.3522916549697752, "grad_norm": 1.1266132593154907, "learning_rate": 2.1679597609075784e-06, "loss": 1.8757, "step": 4706 }, { "epoch": 0.3523665150749537, "grad_norm": 1.259818196296692, "learning_rate": 2.1550441597113615e-06, "loss": 1.4984, "step": 4707 }, { "epoch": 0.35244137518013213, "grad_norm": 1.2312493324279785, "learning_rate": 2.142166726761152e-06, "loss": 2.1444, "step": 4708 }, { "epoch": 0.35251623528531056, "grad_norm": 1.480087161064148, "learning_rate": 2.129327467080311e-06, "loss": 1.8274, "step": 4709 }, { "epoch": 0.35259109539048905, "grad_norm": 1.2718236446380615, "learning_rate": 2.1165263856772645e-06, "loss": 2.473, "step": 4710 }, { "epoch": 0.3526659554956675, "grad_norm": 1.1285791397094727, "learning_rate": 2.103763487545596e-06, "loss": 1.8222, "step": 4711 }, { "epoch": 0.3527408156008459, "grad_norm": 1.1967288255691528, "learning_rate": 2.091038777663956e-06, "loss": 1.743, "step": 4712 }, { "epoch": 0.35281567570602435, "grad_norm": 1.2441881895065308, "learning_rate": 2.078352260996119e-06, "loss": 1.9742, "step": 4713 }, { "epoch": 0.35289053581120283, "grad_norm": 1.108802318572998, "learning_rate": 2.06570394249096e-06, "loss": 1.7311, "step": 4714 }, { "epoch": 0.35296539591638126, "grad_norm": 1.223149061203003, "learning_rate": 2.0530938270824528e-06, "loss": 1.9674, "step": 4715 }, { "epoch": 0.3530402560215597, "grad_norm": 1.3285163640975952, "learning_rate": 2.040521919689664e-06, "loss": 1.9713, "step": 4716 }, { "epoch": 0.3531151161267382, "grad_norm": 1.240027904510498, "learning_rate": 2.027988225216737e-06, "loss": 1.9984, "step": 4717 }, { "epoch": 0.3531899762319166, "grad_norm": 1.1664494276046753, "learning_rate": 2.015492748552961e-06, "loss": 1.6277, "step": 4718 }, { "epoch": 0.35326483633709505, "grad_norm": 1.301286220550537, "learning_rate": 2.003035494572669e-06, "loss": 2.0526, "step": 4719 }, { "epoch": 0.3533396964422735, "grad_norm": 1.3743666410446167, "learning_rate": 1.990616468135298e-06, "loss": 2.1226, "step": 4720 }, { "epoch": 0.35341455654745196, "grad_norm": 1.2668309211730957, "learning_rate": 1.9782356740853826e-06, "loss": 2.3056, "step": 4721 }, { "epoch": 0.3534894166526304, "grad_norm": 1.3808245658874512, "learning_rate": 1.9658931172525154e-06, "loss": 2.161, "step": 4722 }, { "epoch": 0.3535642767578088, "grad_norm": 1.273419976234436, "learning_rate": 1.9535888024514004e-06, "loss": 2.1417, "step": 4723 }, { "epoch": 0.3536391368629873, "grad_norm": 1.413378119468689, "learning_rate": 1.9413227344818318e-06, "loss": 2.0766, "step": 4724 }, { "epoch": 0.35371399696816574, "grad_norm": 1.2383196353912354, "learning_rate": 1.92909491812866e-06, "loss": 1.8401, "step": 4725 }, { "epoch": 0.3537888570733442, "grad_norm": 1.0885154008865356, "learning_rate": 1.9169053581618248e-06, "loss": 1.8536, "step": 4726 }, { "epoch": 0.35386371717852266, "grad_norm": 1.3712769746780396, "learning_rate": 1.9047540593363333e-06, "loss": 2.1599, "step": 4727 }, { "epoch": 0.3539385772837011, "grad_norm": 1.1271910667419434, "learning_rate": 1.8926410263922722e-06, "loss": 1.4383, "step": 4728 }, { "epoch": 0.3540134373888795, "grad_norm": 1.2245129346847534, "learning_rate": 1.880566264054806e-06, "loss": 2.1517, "step": 4729 }, { "epoch": 0.35408829749405796, "grad_norm": 1.295994758605957, "learning_rate": 1.868529777034167e-06, "loss": 2.5187, "step": 4730 }, { "epoch": 0.35416315759923644, "grad_norm": 1.1289862394332886, "learning_rate": 1.8565315700256546e-06, "loss": 1.927, "step": 4731 }, { "epoch": 0.3542380177044149, "grad_norm": 1.231433629989624, "learning_rate": 1.8445716477096587e-06, "loss": 2.1194, "step": 4732 }, { "epoch": 0.3543128778095933, "grad_norm": 1.2786555290222168, "learning_rate": 1.8326500147515802e-06, "loss": 2.1342, "step": 4733 }, { "epoch": 0.3543877379147718, "grad_norm": 1.0546828508377075, "learning_rate": 1.8207666758019436e-06, "loss": 1.6651, "step": 4734 }, { "epoch": 0.3544625980199502, "grad_norm": 1.204908013343811, "learning_rate": 1.8089216354962967e-06, "loss": 1.8487, "step": 4735 }, { "epoch": 0.35453745812512866, "grad_norm": 1.2417758703231812, "learning_rate": 1.7971148984552656e-06, "loss": 2.0613, "step": 4736 }, { "epoch": 0.3546123182303071, "grad_norm": 1.1337772607803345, "learning_rate": 1.7853464692845323e-06, "loss": 1.5162, "step": 4737 }, { "epoch": 0.3546871783354856, "grad_norm": 1.281450629234314, "learning_rate": 1.773616352574825e-06, "loss": 1.5318, "step": 4738 }, { "epoch": 0.354762038440664, "grad_norm": 1.2737902402877808, "learning_rate": 1.761924552901928e-06, "loss": 1.8054, "step": 4739 }, { "epoch": 0.35483689854584244, "grad_norm": 1.5360839366912842, "learning_rate": 1.7502710748266926e-06, "loss": 2.1307, "step": 4740 }, { "epoch": 0.3549117586510209, "grad_norm": 1.3820832967758179, "learning_rate": 1.738655922895016e-06, "loss": 1.4559, "step": 4741 }, { "epoch": 0.35498661875619936, "grad_norm": 1.1601957082748413, "learning_rate": 1.7270791016378406e-06, "loss": 1.8693, "step": 4742 }, { "epoch": 0.3550614788613778, "grad_norm": 1.3373970985412598, "learning_rate": 1.715540615571165e-06, "loss": 2.4302, "step": 4743 }, { "epoch": 0.3551363389665563, "grad_norm": 1.5517210960388184, "learning_rate": 1.7040404691960111e-06, "loss": 2.4438, "step": 4744 }, { "epoch": 0.3552111990717347, "grad_norm": 1.2890070676803589, "learning_rate": 1.6925786669984566e-06, "loss": 2.2886, "step": 4745 }, { "epoch": 0.35528605917691314, "grad_norm": 1.1847519874572754, "learning_rate": 1.6811552134496366e-06, "loss": 1.6826, "step": 4746 }, { "epoch": 0.35536091928209157, "grad_norm": 1.3133333921432495, "learning_rate": 1.6697701130057198e-06, "loss": 1.5658, "step": 4747 }, { "epoch": 0.35543577938727006, "grad_norm": 1.3516374826431274, "learning_rate": 1.6584233701078865e-06, "loss": 1.9117, "step": 4748 }, { "epoch": 0.3555106394924485, "grad_norm": 1.2920116186141968, "learning_rate": 1.6471149891824188e-06, "loss": 2.1331, "step": 4749 }, { "epoch": 0.3555854995976269, "grad_norm": 1.4769059419631958, "learning_rate": 1.6358449746405436e-06, "loss": 2.2426, "step": 4750 }, { "epoch": 0.3556603597028054, "grad_norm": 1.5411772727966309, "learning_rate": 1.6246133308785993e-06, "loss": 1.7534, "step": 4751 }, { "epoch": 0.35573521980798384, "grad_norm": 1.1344534158706665, "learning_rate": 1.6134200622779261e-06, "loss": 2.4404, "step": 4752 }, { "epoch": 0.35581007991316227, "grad_norm": 1.4739525318145752, "learning_rate": 1.6022651732048865e-06, "loss": 2.1877, "step": 4753 }, { "epoch": 0.3558849400183407, "grad_norm": 1.2323731184005737, "learning_rate": 1.5911486680108779e-06, "loss": 1.8032, "step": 4754 }, { "epoch": 0.3559598001235192, "grad_norm": 1.4784456491470337, "learning_rate": 1.5800705510323532e-06, "loss": 2.0904, "step": 4755 }, { "epoch": 0.3560346602286976, "grad_norm": 1.2239915132522583, "learning_rate": 1.5690308265907228e-06, "loss": 1.8487, "step": 4756 }, { "epoch": 0.35610952033387605, "grad_norm": 1.3453365564346313, "learning_rate": 1.5580294989924971e-06, "loss": 1.807, "step": 4757 }, { "epoch": 0.35618438043905454, "grad_norm": 1.4742310047149658, "learning_rate": 1.5470665725291545e-06, "loss": 1.675, "step": 4758 }, { "epoch": 0.35625924054423297, "grad_norm": 1.24545419216156, "learning_rate": 1.5361420514772185e-06, "loss": 1.9456, "step": 4759 }, { "epoch": 0.3563341006494114, "grad_norm": 1.5866343975067139, "learning_rate": 1.5252559400982248e-06, "loss": 1.9522, "step": 4760 }, { "epoch": 0.3564089607545899, "grad_norm": 1.2893792390823364, "learning_rate": 1.514408242638743e-06, "loss": 1.8116, "step": 4761 }, { "epoch": 0.3564838208597683, "grad_norm": 1.185278058052063, "learning_rate": 1.5035989633303105e-06, "loss": 1.816, "step": 4762 }, { "epoch": 0.35655868096494675, "grad_norm": 1.18119215965271, "learning_rate": 1.492828106389521e-06, "loss": 1.8315, "step": 4763 }, { "epoch": 0.3566335410701252, "grad_norm": 1.4347643852233887, "learning_rate": 1.4820956760179693e-06, "loss": 1.7027, "step": 4764 }, { "epoch": 0.35670840117530367, "grad_norm": 1.2170485258102417, "learning_rate": 1.471401676402262e-06, "loss": 2.0832, "step": 4765 }, { "epoch": 0.3567832612804821, "grad_norm": 1.4111818075180054, "learning_rate": 1.4607461117140286e-06, "loss": 1.7891, "step": 4766 }, { "epoch": 0.35685812138566053, "grad_norm": 1.3345310688018799, "learning_rate": 1.4501289861098443e-06, "loss": 2.3296, "step": 4767 }, { "epoch": 0.356932981490839, "grad_norm": 1.3104097843170166, "learning_rate": 1.439550303731374e-06, "loss": 2.0865, "step": 4768 }, { "epoch": 0.35700784159601745, "grad_norm": 1.2689918279647827, "learning_rate": 1.4290100687052166e-06, "loss": 2.3491, "step": 4769 }, { "epoch": 0.3570827017011959, "grad_norm": 1.2147607803344727, "learning_rate": 1.4185082851430276e-06, "loss": 1.8853, "step": 4770 }, { "epoch": 0.3571575618063743, "grad_norm": 1.21406888961792, "learning_rate": 1.408044957141419e-06, "loss": 1.1247, "step": 4771 }, { "epoch": 0.3572324219115528, "grad_norm": 1.2421716451644897, "learning_rate": 1.3976200887820367e-06, "loss": 2.3309, "step": 4772 }, { "epoch": 0.35730728201673123, "grad_norm": 1.3337939977645874, "learning_rate": 1.3872336841314837e-06, "loss": 2.3731, "step": 4773 }, { "epoch": 0.35738214212190966, "grad_norm": 1.2841119766235352, "learning_rate": 1.3768857472414075e-06, "loss": 2.467, "step": 4774 }, { "epoch": 0.35745700222708815, "grad_norm": 1.4242182970046997, "learning_rate": 1.366576282148413e-06, "loss": 2.2642, "step": 4775 }, { "epoch": 0.3575318623322666, "grad_norm": 1.2528835535049438, "learning_rate": 1.356305292874116e-06, "loss": 1.4495, "step": 4776 }, { "epoch": 0.357606722437445, "grad_norm": 1.3492192029953003, "learning_rate": 1.3460727834251008e-06, "loss": 1.6189, "step": 4777 }, { "epoch": 0.3576815825426235, "grad_norm": 1.1135846376419067, "learning_rate": 1.3358787577929854e-06, "loss": 1.7254, "step": 4778 }, { "epoch": 0.35775644264780193, "grad_norm": 1.4133411645889282, "learning_rate": 1.3257232199543223e-06, "loss": 1.9492, "step": 4779 }, { "epoch": 0.35783130275298036, "grad_norm": 1.363173484802246, "learning_rate": 1.315606173870676e-06, "loss": 1.5082, "step": 4780 }, { "epoch": 0.3579061628581588, "grad_norm": 1.2061594724655151, "learning_rate": 1.3055276234886116e-06, "loss": 2.3175, "step": 4781 }, { "epoch": 0.3579810229633373, "grad_norm": 1.8743728399276733, "learning_rate": 1.295487572739651e-06, "loss": 1.7171, "step": 4782 }, { "epoch": 0.3580558830685157, "grad_norm": 1.2853257656097412, "learning_rate": 1.2854860255403279e-06, "loss": 2.087, "step": 4783 }, { "epoch": 0.35813074317369414, "grad_norm": 1.4842607975006104, "learning_rate": 1.2755229857921102e-06, "loss": 1.9859, "step": 4784 }, { "epoch": 0.35820560327887263, "grad_norm": 1.1833633184432983, "learning_rate": 1.2655984573814672e-06, "loss": 1.9455, "step": 4785 }, { "epoch": 0.35828046338405106, "grad_norm": 1.3112082481384277, "learning_rate": 1.2557124441798684e-06, "loss": 2.1803, "step": 4786 }, { "epoch": 0.3583553234892295, "grad_norm": 1.1095073223114014, "learning_rate": 1.2458649500437403e-06, "loss": 1.6338, "step": 4787 }, { "epoch": 0.3584301835944079, "grad_norm": 1.0903877019882202, "learning_rate": 1.2360559788144655e-06, "loss": 1.8656, "step": 4788 }, { "epoch": 0.3585050436995864, "grad_norm": 1.2582987546920776, "learning_rate": 1.2262855343184277e-06, "loss": 1.5925, "step": 4789 }, { "epoch": 0.35857990380476484, "grad_norm": 1.3843111991882324, "learning_rate": 1.2165536203669669e-06, "loss": 2.026, "step": 4790 }, { "epoch": 0.3586547639099433, "grad_norm": 1.345423936843872, "learning_rate": 1.2068602407563911e-06, "loss": 1.9924, "step": 4791 }, { "epoch": 0.35872962401512176, "grad_norm": 1.2851756811141968, "learning_rate": 1.1972053992680088e-06, "loss": 2.14, "step": 4792 }, { "epoch": 0.3588044841203002, "grad_norm": 1.1968756914138794, "learning_rate": 1.1875890996680295e-06, "loss": 1.9848, "step": 4793 }, { "epoch": 0.3588793442254786, "grad_norm": 1.247410535812378, "learning_rate": 1.1780113457076858e-06, "loss": 1.5603, "step": 4794 }, { "epoch": 0.3589542043306571, "grad_norm": 1.2045660018920898, "learning_rate": 1.1684721411231558e-06, "loss": 2.3644, "step": 4795 }, { "epoch": 0.35902906443583554, "grad_norm": 1.2834498882293701, "learning_rate": 1.1589714896355742e-06, "loss": 1.817, "step": 4796 }, { "epoch": 0.359103924541014, "grad_norm": 1.263255000114441, "learning_rate": 1.1495093949510316e-06, "loss": 1.7725, "step": 4797 }, { "epoch": 0.3591787846461924, "grad_norm": 1.3079043626785278, "learning_rate": 1.1400858607606092e-06, "loss": 2.3345, "step": 4798 }, { "epoch": 0.3592536447513709, "grad_norm": 1.2117722034454346, "learning_rate": 1.1307008907403105e-06, "loss": 2.0139, "step": 4799 }, { "epoch": 0.3593285048565493, "grad_norm": 1.3770304918289185, "learning_rate": 1.1213544885511184e-06, "loss": 1.5916, "step": 4800 }, { "epoch": 0.3593285048565493, "eval_loss": 1.9665676355361938, "eval_runtime": 179.0608, "eval_samples_per_second": 27.923, "eval_steps_per_second": 13.962, "step": 4800 }, { "epoch": 0.35940336496172776, "grad_norm": 1.3666555881500244, "learning_rate": 1.11204665783895e-06, "loss": 1.4583, "step": 4801 }, { "epoch": 0.35947822506690624, "grad_norm": 1.20809805393219, "learning_rate": 1.1027774022347004e-06, "loss": 1.8182, "step": 4802 }, { "epoch": 0.3595530851720847, "grad_norm": 1.1807363033294678, "learning_rate": 1.0935467253541997e-06, "loss": 2.2149, "step": 4803 }, { "epoch": 0.3596279452772631, "grad_norm": 1.1929394006729126, "learning_rate": 1.0843546307982232e-06, "loss": 2.0058, "step": 4804 }, { "epoch": 0.35970280538244154, "grad_norm": 1.3269634246826172, "learning_rate": 1.075201122152525e-06, "loss": 2.0606, "step": 4805 }, { "epoch": 0.35977766548762, "grad_norm": 1.556891679763794, "learning_rate": 1.0660862029877704e-06, "loss": 1.7795, "step": 4806 }, { "epoch": 0.35985252559279846, "grad_norm": 1.2965061664581299, "learning_rate": 1.0570098768596049e-06, "loss": 2.3629, "step": 4807 }, { "epoch": 0.3599273856979769, "grad_norm": 1.2918086051940918, "learning_rate": 1.0479721473085957e-06, "loss": 1.986, "step": 4808 }, { "epoch": 0.3600022458031554, "grad_norm": 1.4029771089553833, "learning_rate": 1.0389730178602786e-06, "loss": 2.4262, "step": 4809 }, { "epoch": 0.3600771059083338, "grad_norm": 1.3551044464111328, "learning_rate": 1.0300124920250898e-06, "loss": 1.8047, "step": 4810 }, { "epoch": 0.36015196601351224, "grad_norm": 1.1065592765808105, "learning_rate": 1.0210905732984333e-06, "loss": 2.4606, "step": 4811 }, { "epoch": 0.3602268261186907, "grad_norm": 1.3470953702926636, "learning_rate": 1.0122072651606695e-06, "loss": 2.2579, "step": 4812 }, { "epoch": 0.36030168622386916, "grad_norm": 1.1170743703842163, "learning_rate": 1.0033625710770822e-06, "loss": 1.661, "step": 4813 }, { "epoch": 0.3603765463290476, "grad_norm": 1.2535237073898315, "learning_rate": 9.94556494497878e-07, "loss": 1.5951, "step": 4814 }, { "epoch": 0.360451406434226, "grad_norm": 1.4549514055252075, "learning_rate": 9.857890388582091e-07, "loss": 1.6634, "step": 4815 }, { "epoch": 0.3605262665394045, "grad_norm": 1.3379424810409546, "learning_rate": 9.770602075781621e-07, "loss": 2.0988, "step": 4816 }, { "epoch": 0.36060112664458294, "grad_norm": 1.2470217943191528, "learning_rate": 9.683700040627685e-07, "loss": 1.4601, "step": 4817 }, { "epoch": 0.36067598674976137, "grad_norm": 1.1995042562484741, "learning_rate": 9.597184317019836e-07, "loss": 1.5097, "step": 4818 }, { "epoch": 0.36075084685493986, "grad_norm": 1.3935478925704956, "learning_rate": 9.511054938706854e-07, "loss": 2.3442, "step": 4819 }, { "epoch": 0.3608257069601183, "grad_norm": 1.425140142440796, "learning_rate": 9.425311939286752e-07, "loss": 1.9299, "step": 4820 }, { "epoch": 0.3609005670652967, "grad_norm": 1.3113347291946411, "learning_rate": 9.339955352207108e-07, "loss": 1.7162, "step": 4821 }, { "epoch": 0.36097542717047515, "grad_norm": 1.3793444633483887, "learning_rate": 9.254985210764511e-07, "loss": 2.345, "step": 4822 }, { "epoch": 0.36105028727565364, "grad_norm": 1.450742244720459, "learning_rate": 9.17040154810489e-07, "loss": 2.1551, "step": 4823 }, { "epoch": 0.36112514738083207, "grad_norm": 1.3210675716400146, "learning_rate": 9.086204397223519e-07, "loss": 1.5997, "step": 4824 }, { "epoch": 0.3612000074860105, "grad_norm": 1.0365413427352905, "learning_rate": 9.002393790964569e-07, "loss": 1.7115, "step": 4825 }, { "epoch": 0.361274867591189, "grad_norm": 1.3690142631530762, "learning_rate": 8.918969762021778e-07, "loss": 2.2153, "step": 4826 }, { "epoch": 0.3613497276963674, "grad_norm": 1.1770695447921753, "learning_rate": 8.835932342937892e-07, "loss": 1.5389, "step": 4827 }, { "epoch": 0.36142458780154585, "grad_norm": 1.128373384475708, "learning_rate": 8.753281566104998e-07, "loss": 1.6488, "step": 4828 }, { "epoch": 0.36149944790672434, "grad_norm": 1.2103967666625977, "learning_rate": 8.671017463764086e-07, "loss": 1.8346, "step": 4829 }, { "epoch": 0.36157430801190277, "grad_norm": 1.4798550605773926, "learning_rate": 8.589140068005708e-07, "loss": 1.957, "step": 4830 }, { "epoch": 0.3616491681170812, "grad_norm": 1.0252922773361206, "learning_rate": 8.507649410769092e-07, "loss": 1.8199, "step": 4831 }, { "epoch": 0.36172402822225963, "grad_norm": 1.2154515981674194, "learning_rate": 8.426545523843033e-07, "loss": 2.0045, "step": 4832 }, { "epoch": 0.3617988883274381, "grad_norm": 1.502529263496399, "learning_rate": 8.345828438865333e-07, "loss": 2.2072, "step": 4833 }, { "epoch": 0.36187374843261655, "grad_norm": 1.2568877935409546, "learning_rate": 8.265498187322584e-07, "loss": 1.7209, "step": 4834 }, { "epoch": 0.361948608537795, "grad_norm": 1.1955690383911133, "learning_rate": 8.185554800550832e-07, "loss": 1.7142, "step": 4835 }, { "epoch": 0.36202346864297347, "grad_norm": 1.164058804512024, "learning_rate": 8.105998309735352e-07, "loss": 1.6761, "step": 4836 }, { "epoch": 0.3620983287481519, "grad_norm": 1.2796891927719116, "learning_rate": 8.026828745909876e-07, "loss": 2.0994, "step": 4837 }, { "epoch": 0.36217318885333033, "grad_norm": 1.3017101287841797, "learning_rate": 7.94804613995781e-07, "loss": 2.0424, "step": 4838 }, { "epoch": 0.36224804895850876, "grad_norm": 1.1822599172592163, "learning_rate": 7.86965052261135e-07, "loss": 1.8043, "step": 4839 }, { "epoch": 0.36232290906368725, "grad_norm": 1.3519049882888794, "learning_rate": 7.791641924451809e-07, "loss": 2.2575, "step": 4840 }, { "epoch": 0.3623977691688657, "grad_norm": 1.2971651554107666, "learning_rate": 7.714020375909514e-07, "loss": 1.9735, "step": 4841 }, { "epoch": 0.3624726292740441, "grad_norm": 1.2707282304763794, "learning_rate": 7.636785907263688e-07, "loss": 1.8846, "step": 4842 }, { "epoch": 0.3625474893792226, "grad_norm": 1.2120001316070557, "learning_rate": 7.559938548642675e-07, "loss": 2.1078, "step": 4843 }, { "epoch": 0.36262234948440103, "grad_norm": 1.2335073947906494, "learning_rate": 7.483478330023719e-07, "loss": 1.6015, "step": 4844 }, { "epoch": 0.36269720958957946, "grad_norm": 1.3071846961975098, "learning_rate": 7.407405281233182e-07, "loss": 2.1699, "step": 4845 }, { "epoch": 0.36277206969475795, "grad_norm": 1.2320595979690552, "learning_rate": 7.331719431946437e-07, "loss": 2.2824, "step": 4846 }, { "epoch": 0.3628469297999364, "grad_norm": 1.1222375631332397, "learning_rate": 7.256420811687537e-07, "loss": 1.301, "step": 4847 }, { "epoch": 0.3629217899051148, "grad_norm": 1.3574960231781006, "learning_rate": 7.181509449829649e-07, "loss": 2.1557, "step": 4848 }, { "epoch": 0.36299665001029324, "grad_norm": 1.5463085174560547, "learning_rate": 7.106985375595066e-07, "loss": 1.8126, "step": 4849 }, { "epoch": 0.36307151011547173, "grad_norm": 1.4583035707473755, "learning_rate": 7.032848618054644e-07, "loss": 2.3425, "step": 4850 }, { "epoch": 0.36314637022065016, "grad_norm": 1.0839351415634155, "learning_rate": 6.959099206128361e-07, "loss": 1.95, "step": 4851 }, { "epoch": 0.3632212303258286, "grad_norm": 1.3000894784927368, "learning_rate": 6.885737168584983e-07, "loss": 2.0179, "step": 4852 }, { "epoch": 0.3632960904310071, "grad_norm": 1.1293402910232544, "learning_rate": 6.812762534042505e-07, "loss": 1.8015, "step": 4853 }, { "epoch": 0.3633709505361855, "grad_norm": 1.2404367923736572, "learning_rate": 6.740175330967158e-07, "loss": 1.8606, "step": 4854 }, { "epoch": 0.36344581064136394, "grad_norm": 1.2156977653503418, "learning_rate": 6.667975587674624e-07, "loss": 1.7551, "step": 4855 }, { "epoch": 0.3635206707465424, "grad_norm": 1.460497260093689, "learning_rate": 6.596163332329264e-07, "loss": 2.0023, "step": 4856 }, { "epoch": 0.36359553085172086, "grad_norm": 1.35504150390625, "learning_rate": 6.524738592944224e-07, "loss": 1.8718, "step": 4857 }, { "epoch": 0.3636703909568993, "grad_norm": 1.3027747869491577, "learning_rate": 6.453701397381329e-07, "loss": 1.9385, "step": 4858 }, { "epoch": 0.3637452510620777, "grad_norm": 1.0650755167007446, "learning_rate": 6.383051773351745e-07, "loss": 1.6844, "step": 4859 }, { "epoch": 0.3638201111672562, "grad_norm": 1.1924686431884766, "learning_rate": 6.312789748414872e-07, "loss": 1.7381, "step": 4860 }, { "epoch": 0.36389497127243464, "grad_norm": 1.2272838354110718, "learning_rate": 6.242915349979117e-07, "loss": 1.5656, "step": 4861 }, { "epoch": 0.3639698313776131, "grad_norm": 1.219632863998413, "learning_rate": 6.1734286053019e-07, "loss": 2.1276, "step": 4862 }, { "epoch": 0.36404469148279156, "grad_norm": 1.2751058340072632, "learning_rate": 6.104329541489207e-07, "loss": 2.0959, "step": 4863 }, { "epoch": 0.36411955158797, "grad_norm": 1.2794551849365234, "learning_rate": 6.035618185495806e-07, "loss": 1.959, "step": 4864 }, { "epoch": 0.3641944116931484, "grad_norm": 1.2748736143112183, "learning_rate": 5.967294564125147e-07, "loss": 1.8107, "step": 4865 }, { "epoch": 0.36426927179832685, "grad_norm": 1.2118269205093384, "learning_rate": 5.899358704029579e-07, "loss": 1.7428, "step": 4866 }, { "epoch": 0.36434413190350534, "grad_norm": 1.2712749242782593, "learning_rate": 5.831810631710233e-07, "loss": 2.2848, "step": 4867 }, { "epoch": 0.3644189920086838, "grad_norm": 1.3176336288452148, "learning_rate": 5.7646503735167e-07, "loss": 1.6594, "step": 4868 }, { "epoch": 0.3644938521138622, "grad_norm": 1.6437129974365234, "learning_rate": 5.69787795564758e-07, "loss": 1.8737, "step": 4869 }, { "epoch": 0.3645687122190407, "grad_norm": 1.239113450050354, "learning_rate": 5.631493404150146e-07, "loss": 1.8125, "step": 4870 }, { "epoch": 0.3646435723242191, "grad_norm": 1.220037579536438, "learning_rate": 5.565496744920129e-07, "loss": 1.8927, "step": 4871 }, { "epoch": 0.36471843242939755, "grad_norm": 1.4726874828338623, "learning_rate": 5.499888003702047e-07, "loss": 1.8574, "step": 4872 }, { "epoch": 0.364793292534576, "grad_norm": 1.0971484184265137, "learning_rate": 5.434667206089427e-07, "loss": 1.8033, "step": 4873 }, { "epoch": 0.3648681526397545, "grad_norm": 1.1831159591674805, "learning_rate": 5.369834377523919e-07, "loss": 1.914, "step": 4874 }, { "epoch": 0.3649430127449329, "grad_norm": 1.7039172649383545, "learning_rate": 5.30538954329618e-07, "loss": 2.4755, "step": 4875 }, { "epoch": 0.36501787285011134, "grad_norm": 1.2353792190551758, "learning_rate": 5.241332728545545e-07, "loss": 2.1262, "step": 4876 }, { "epoch": 0.3650927329552898, "grad_norm": 1.2258747816085815, "learning_rate": 5.177663958259804e-07, "loss": 2.4955, "step": 4877 }, { "epoch": 0.36516759306046825, "grad_norm": 1.4445019960403442, "learning_rate": 5.114383257275312e-07, "loss": 2.0394, "step": 4878 }, { "epoch": 0.3652424531656467, "grad_norm": 1.150300145149231, "learning_rate": 5.051490650277324e-07, "loss": 1.9648, "step": 4879 }, { "epoch": 0.3653173132708252, "grad_norm": 1.4390690326690674, "learning_rate": 4.988986161799548e-07, "loss": 1.8336, "step": 4880 }, { "epoch": 0.3653921733760036, "grad_norm": 1.2658113241195679, "learning_rate": 4.926869816224256e-07, "loss": 1.9253, "step": 4881 }, { "epoch": 0.36546703348118204, "grad_norm": 1.0818787813186646, "learning_rate": 4.865141637782511e-07, "loss": 1.7486, "step": 4882 }, { "epoch": 0.36554189358636047, "grad_norm": 1.1822084188461304, "learning_rate": 4.803801650553496e-07, "loss": 1.8316, "step": 4883 }, { "epoch": 0.36561675369153895, "grad_norm": 1.2775869369506836, "learning_rate": 4.7428498784655164e-07, "loss": 1.6655, "step": 4884 }, { "epoch": 0.3656916137967174, "grad_norm": 1.5697925090789795, "learning_rate": 4.682286345295106e-07, "loss": 2.3465, "step": 4885 }, { "epoch": 0.3657664739018958, "grad_norm": 1.4389771223068237, "learning_rate": 4.622111074667368e-07, "loss": 1.771, "step": 4886 }, { "epoch": 0.3658413340070743, "grad_norm": 1.1903655529022217, "learning_rate": 4.562324090056191e-07, "loss": 1.6575, "step": 4887 }, { "epoch": 0.36591619411225274, "grad_norm": 1.2551591396331787, "learning_rate": 4.5029254147836985e-07, "loss": 1.6109, "step": 4888 }, { "epoch": 0.36599105421743117, "grad_norm": 1.3494244813919067, "learning_rate": 4.443915072020688e-07, "loss": 2.1705, "step": 4889 }, { "epoch": 0.3660659143226096, "grad_norm": 1.3069137334823608, "learning_rate": 4.3852930847865236e-07, "loss": 1.5388, "step": 4890 }, { "epoch": 0.3661407744277881, "grad_norm": 1.2429250478744507, "learning_rate": 4.3270594759488027e-07, "loss": 2.0469, "step": 4891 }, { "epoch": 0.3662156345329665, "grad_norm": 1.253157138824463, "learning_rate": 4.2692142682240197e-07, "loss": 1.8055, "step": 4892 }, { "epoch": 0.36629049463814495, "grad_norm": 1.184638500213623, "learning_rate": 4.2117574841769037e-07, "loss": 1.6842, "step": 4893 }, { "epoch": 0.36636535474332343, "grad_norm": 1.4168773889541626, "learning_rate": 4.154689146220636e-07, "loss": 2.0017, "step": 4894 }, { "epoch": 0.36644021484850187, "grad_norm": 1.2037566900253296, "learning_rate": 4.0980092766169653e-07, "loss": 2.2403, "step": 4895 }, { "epoch": 0.3665150749536803, "grad_norm": 1.1409797668457031, "learning_rate": 4.041717897476205e-07, "loss": 1.9627, "step": 4896 }, { "epoch": 0.3665899350588588, "grad_norm": 1.155261754989624, "learning_rate": 3.985815030756901e-07, "loss": 1.5608, "step": 4897 }, { "epoch": 0.3666647951640372, "grad_norm": 1.2428401708602905, "learning_rate": 3.930300698266165e-07, "loss": 1.9801, "step": 4898 }, { "epoch": 0.36673965526921565, "grad_norm": 1.0919442176818848, "learning_rate": 3.8751749216595633e-07, "loss": 1.6696, "step": 4899 }, { "epoch": 0.3668145153743941, "grad_norm": 1.0419034957885742, "learning_rate": 3.820437722441117e-07, "loss": 2.3604, "step": 4900 }, { "epoch": 0.36688937547957257, "grad_norm": 1.2276426553726196, "learning_rate": 3.766089121963079e-07, "loss": 2.1043, "step": 4901 }, { "epoch": 0.366964235584751, "grad_norm": 1.0536746978759766, "learning_rate": 3.7121291414262683e-07, "loss": 2.2779, "step": 4902 }, { "epoch": 0.36703909568992943, "grad_norm": 1.2664833068847656, "learning_rate": 3.6585578018798474e-07, "loss": 1.5661, "step": 4903 }, { "epoch": 0.3671139557951079, "grad_norm": 1.2505574226379395, "learning_rate": 3.605375124221544e-07, "loss": 2.1854, "step": 4904 }, { "epoch": 0.36718881590028635, "grad_norm": 1.2223308086395264, "learning_rate": 3.552581129197319e-07, "loss": 2.1928, "step": 4905 }, { "epoch": 0.3672636760054648, "grad_norm": 1.209747076034546, "learning_rate": 3.500175837401476e-07, "loss": 2.3185, "step": 4906 }, { "epoch": 0.3673385361106432, "grad_norm": 1.2402360439300537, "learning_rate": 3.448159269276663e-07, "loss": 1.7535, "step": 4907 }, { "epoch": 0.3674133962158217, "grad_norm": 1.2397148609161377, "learning_rate": 3.396531445114093e-07, "loss": 2.1873, "step": 4908 }, { "epoch": 0.36748825632100013, "grad_norm": 1.1321591138839722, "learning_rate": 3.345292385053211e-07, "loss": 1.8797, "step": 4909 }, { "epoch": 0.36756311642617856, "grad_norm": 1.300744652748108, "learning_rate": 3.2944421090816966e-07, "loss": 2.2295, "step": 4910 }, { "epoch": 0.36763797653135705, "grad_norm": 2.217629909515381, "learning_rate": 3.243980637035904e-07, "loss": 1.7606, "step": 4911 }, { "epoch": 0.3677128366365355, "grad_norm": 1.3869738578796387, "learning_rate": 3.193907988600087e-07, "loss": 2.2472, "step": 4912 }, { "epoch": 0.3677876967417139, "grad_norm": 1.2921842336654663, "learning_rate": 3.144224183307065e-07, "loss": 1.7667, "step": 4913 }, { "epoch": 0.3678625568468924, "grad_norm": 1.1030547618865967, "learning_rate": 3.094929240538114e-07, "loss": 1.9136, "step": 4914 }, { "epoch": 0.36793741695207083, "grad_norm": 1.3629906177520752, "learning_rate": 3.046023179522517e-07, "loss": 2.3002, "step": 4915 }, { "epoch": 0.36801227705724926, "grad_norm": 1.4827508926391602, "learning_rate": 2.9975060193380144e-07, "loss": 2.194, "step": 4916 }, { "epoch": 0.3680871371624277, "grad_norm": 1.1288799047470093, "learning_rate": 2.949377778910578e-07, "loss": 1.9908, "step": 4917 }, { "epoch": 0.3681619972676062, "grad_norm": 1.5025713443756104, "learning_rate": 2.901638477014523e-07, "loss": 1.9856, "step": 4918 }, { "epoch": 0.3682368573727846, "grad_norm": 1.3653533458709717, "learning_rate": 2.854288132272509e-07, "loss": 1.7599, "step": 4919 }, { "epoch": 0.36831171747796304, "grad_norm": 1.775288462638855, "learning_rate": 2.807326763155316e-07, "loss": 1.9025, "step": 4920 }, { "epoch": 0.36838657758314153, "grad_norm": 1.168463110923767, "learning_rate": 2.7607543879820673e-07, "loss": 2.0069, "step": 4921 }, { "epoch": 0.36846143768831996, "grad_norm": 1.4156184196472168, "learning_rate": 2.7145710249202317e-07, "loss": 2.0641, "step": 4922 }, { "epoch": 0.3685362977934984, "grad_norm": 1.2704062461853027, "learning_rate": 2.668776691985286e-07, "loss": 1.9054, "step": 4923 }, { "epoch": 0.3686111578986768, "grad_norm": 1.113425374031067, "learning_rate": 2.623371407041164e-07, "loss": 1.7273, "step": 4924 }, { "epoch": 0.3686860180038553, "grad_norm": 1.216292142868042, "learning_rate": 2.578355187799919e-07, "loss": 1.3275, "step": 4925 }, { "epoch": 0.36876087810903374, "grad_norm": 1.3145354986190796, "learning_rate": 2.5337280518219484e-07, "loss": 1.7351, "step": 4926 }, { "epoch": 0.36883573821421217, "grad_norm": 1.482887864112854, "learning_rate": 2.489490016515883e-07, "loss": 2.4417, "step": 4927 }, { "epoch": 0.36891059831939066, "grad_norm": 1.3228777647018433, "learning_rate": 2.4456410991383626e-07, "loss": 1.6222, "step": 4928 }, { "epoch": 0.3689854584245691, "grad_norm": 1.1918296813964844, "learning_rate": 2.4021813167943717e-07, "loss": 2.1735, "step": 4929 }, { "epoch": 0.3690603185297475, "grad_norm": 1.2862356901168823, "learning_rate": 2.359110686437127e-07, "loss": 2.0697, "step": 4930 }, { "epoch": 0.369135178634926, "grad_norm": 1.2714046239852905, "learning_rate": 2.3164292248680774e-07, "loss": 2.4273, "step": 4931 }, { "epoch": 0.36921003874010444, "grad_norm": 1.1355618238449097, "learning_rate": 2.2741369487366827e-07, "loss": 2.0719, "step": 4932 }, { "epoch": 0.36928489884528287, "grad_norm": 1.8375118970870972, "learning_rate": 2.2322338745407455e-07, "loss": 1.8741, "step": 4933 }, { "epoch": 0.3693597589504613, "grad_norm": 1.1978259086608887, "learning_rate": 2.190720018626302e-07, "loss": 1.9827, "step": 4934 }, { "epoch": 0.3694346190556398, "grad_norm": 1.2580115795135498, "learning_rate": 2.1495953971872872e-07, "loss": 2.2007, "step": 4935 }, { "epoch": 0.3695094791608182, "grad_norm": 1.1597857475280762, "learning_rate": 2.1088600262659798e-07, "loss": 1.2902, "step": 4936 }, { "epoch": 0.36958433926599665, "grad_norm": 1.350406289100647, "learning_rate": 2.0685139217528904e-07, "loss": 1.8741, "step": 4937 }, { "epoch": 0.36965919937117514, "grad_norm": 1.168755054473877, "learning_rate": 2.028557099386541e-07, "loss": 1.9249, "step": 4938 }, { "epoch": 0.36973405947635357, "grad_norm": 1.5015709400177002, "learning_rate": 1.9889895747536856e-07, "loss": 2.468, "step": 4939 }, { "epoch": 0.369808919581532, "grad_norm": 1.311902642250061, "learning_rate": 1.9498113632892e-07, "loss": 1.999, "step": 4940 }, { "epoch": 0.36988377968671043, "grad_norm": 1.3193798065185547, "learning_rate": 1.91102248027597e-07, "loss": 2.2362, "step": 4941 }, { "epoch": 0.3699586397918889, "grad_norm": 1.320547103881836, "learning_rate": 1.8726229408452257e-07, "loss": 2.1357, "step": 4942 }, { "epoch": 0.37003349989706735, "grad_norm": 1.224130630493164, "learning_rate": 1.8346127599760954e-07, "loss": 1.8214, "step": 4943 }, { "epoch": 0.3701083600022458, "grad_norm": 1.5769799947738647, "learning_rate": 1.7969919524960522e-07, "loss": 2.0376, "step": 4944 }, { "epoch": 0.37018322010742427, "grad_norm": 1.3981033563613892, "learning_rate": 1.7597605330805788e-07, "loss": 1.9262, "step": 4945 }, { "epoch": 0.3702580802126027, "grad_norm": 1.335249662399292, "learning_rate": 1.722918516253058e-07, "loss": 1.8509, "step": 4946 }, { "epoch": 0.37033294031778113, "grad_norm": 1.125611662864685, "learning_rate": 1.686465916385327e-07, "loss": 1.8926, "step": 4947 }, { "epoch": 0.3704078004229596, "grad_norm": 1.2969744205474854, "learning_rate": 1.6504027476971217e-07, "loss": 1.9572, "step": 4948 }, { "epoch": 0.37048266052813805, "grad_norm": 1.0507197380065918, "learning_rate": 1.6147290242561896e-07, "loss": 2.0563, "step": 4949 }, { "epoch": 0.3705575206333165, "grad_norm": 1.3633369207382202, "learning_rate": 1.579444759978621e-07, "loss": 1.7533, "step": 4950 }, { "epoch": 0.3705575206333165, "eval_loss": 1.9663305282592773, "eval_runtime": 178.8911, "eval_samples_per_second": 27.95, "eval_steps_per_second": 13.975, "step": 4950 }, { "epoch": 0.3706323807384949, "grad_norm": 1.2411137819290161, "learning_rate": 1.544549968628295e-07, "loss": 2.1719, "step": 4951 }, { "epoch": 0.3707072408436734, "grad_norm": 1.2201836109161377, "learning_rate": 1.5100446638173228e-07, "loss": 1.5187, "step": 4952 }, { "epoch": 0.37078210094885183, "grad_norm": 1.0959951877593994, "learning_rate": 1.4759288590058263e-07, "loss": 1.2851, "step": 4953 }, { "epoch": 0.37085696105403027, "grad_norm": 1.2576202154159546, "learning_rate": 1.4422025675020488e-07, "loss": 1.8899, "step": 4954 }, { "epoch": 0.37093182115920875, "grad_norm": 1.198002576828003, "learning_rate": 1.4088658024622448e-07, "loss": 2.171, "step": 4955 }, { "epoch": 0.3710066812643872, "grad_norm": 1.283689022064209, "learning_rate": 1.375918576890678e-07, "loss": 2.2446, "step": 4956 }, { "epoch": 0.3710815413695656, "grad_norm": 1.3751139640808105, "learning_rate": 1.3433609036397342e-07, "loss": 1.8505, "step": 4957 }, { "epoch": 0.37115640147474405, "grad_norm": 1.064444661140442, "learning_rate": 1.3111927954098102e-07, "loss": 1.7843, "step": 4958 }, { "epoch": 0.37123126157992253, "grad_norm": 1.0398433208465576, "learning_rate": 1.2794142647492013e-07, "loss": 1.9175, "step": 4959 }, { "epoch": 0.37130612168510096, "grad_norm": 1.3033219575881958, "learning_rate": 1.248025324054658e-07, "loss": 2.3778, "step": 4960 }, { "epoch": 0.3713809817902794, "grad_norm": 1.0516605377197266, "learning_rate": 1.2170259855703858e-07, "loss": 2.3922, "step": 4961 }, { "epoch": 0.3714558418954579, "grad_norm": 1.3150949478149414, "learning_rate": 1.186416261389045e-07, "loss": 2.2298, "step": 4962 }, { "epoch": 0.3715307020006363, "grad_norm": 1.3337782621383667, "learning_rate": 1.1561961634510843e-07, "loss": 2.1364, "step": 4963 }, { "epoch": 0.37160556210581475, "grad_norm": 1.1191245317459106, "learning_rate": 1.1263657035449627e-07, "loss": 1.8137, "step": 4964 }, { "epoch": 0.37168042221099323, "grad_norm": 1.032953143119812, "learning_rate": 1.0969248933073717e-07, "loss": 1.5611, "step": 4965 }, { "epoch": 0.37175528231617166, "grad_norm": 1.4030370712280273, "learning_rate": 1.0678737442227915e-07, "loss": 1.856, "step": 4966 }, { "epoch": 0.3718301424213501, "grad_norm": 1.1899338960647583, "learning_rate": 1.0392122676237126e-07, "loss": 1.4475, "step": 4967 }, { "epoch": 0.3719050025265285, "grad_norm": 1.318682312965393, "learning_rate": 1.0109404746907469e-07, "loss": 2.1785, "step": 4968 }, { "epoch": 0.371979862631707, "grad_norm": 1.1499639749526978, "learning_rate": 9.830583764522949e-08, "loss": 1.7568, "step": 4969 }, { "epoch": 0.37205472273688545, "grad_norm": 1.2958184480667114, "learning_rate": 9.555659837849895e-08, "loss": 1.8185, "step": 4970 }, { "epoch": 0.3721295828420639, "grad_norm": 1.2534544467926025, "learning_rate": 9.28463307413141e-08, "loss": 2.1521, "step": 4971 }, { "epoch": 0.37220444294724236, "grad_norm": 1.4329583644866943, "learning_rate": 9.017503579094033e-08, "loss": 2.2751, "step": 4972 }, { "epoch": 0.3722793030524208, "grad_norm": 1.1662907600402832, "learning_rate": 8.754271456941077e-08, "loss": 1.9949, "step": 4973 }, { "epoch": 0.3723541631575992, "grad_norm": 1.1763770580291748, "learning_rate": 8.494936810355958e-08, "loss": 2.207, "step": 4974 }, { "epoch": 0.37242902326277766, "grad_norm": 1.3006685972213745, "learning_rate": 8.23949974050331e-08, "loss": 1.8742, "step": 4975 }, { "epoch": 0.37250388336795615, "grad_norm": 1.51710844039917, "learning_rate": 7.987960347025647e-08, "loss": 2.1904, "step": 4976 }, { "epoch": 0.3725787434731346, "grad_norm": 1.4190149307250977, "learning_rate": 7.740318728045593e-08, "loss": 2.177, "step": 4977 }, { "epoch": 0.372653603578313, "grad_norm": 1.3400824069976807, "learning_rate": 7.496574980166982e-08, "loss": 1.7044, "step": 4978 }, { "epoch": 0.3727284636834915, "grad_norm": 1.1763372421264648, "learning_rate": 7.256729198469314e-08, "loss": 1.7179, "step": 4979 }, { "epoch": 0.3728033237886699, "grad_norm": 1.2219486236572266, "learning_rate": 7.020781476515525e-08, "loss": 1.726, "step": 4980 }, { "epoch": 0.37287818389384836, "grad_norm": 1.2846757173538208, "learning_rate": 6.788731906345325e-08, "loss": 2.4991, "step": 4981 }, { "epoch": 0.37295304399902685, "grad_norm": 1.2243740558624268, "learning_rate": 6.560580578479636e-08, "loss": 1.6773, "step": 4982 }, { "epoch": 0.3730279041042053, "grad_norm": 1.3469129800796509, "learning_rate": 6.336327581916157e-08, "loss": 1.9793, "step": 4983 }, { "epoch": 0.3731027642093837, "grad_norm": 1.2938308715820312, "learning_rate": 6.115973004134912e-08, "loss": 2.1317, "step": 4984 }, { "epoch": 0.37317762431456214, "grad_norm": 1.362736463546753, "learning_rate": 5.899516931093807e-08, "loss": 2.272, "step": 4985 }, { "epoch": 0.3732524844197406, "grad_norm": 1.1034882068634033, "learning_rate": 5.686959447229745e-08, "loss": 1.6906, "step": 4986 }, { "epoch": 0.37332734452491906, "grad_norm": 1.1260019540786743, "learning_rate": 5.478300635458622e-08, "loss": 2.0264, "step": 4987 }, { "epoch": 0.3734022046300975, "grad_norm": 1.3279393911361694, "learning_rate": 5.273540577176439e-08, "loss": 1.6167, "step": 4988 }, { "epoch": 0.373477064735276, "grad_norm": 1.314508318901062, "learning_rate": 5.0726793522570814e-08, "loss": 1.5518, "step": 4989 }, { "epoch": 0.3735519248404544, "grad_norm": 1.2558382749557495, "learning_rate": 4.8757170390556495e-08, "loss": 1.9004, "step": 4990 }, { "epoch": 0.37362678494563284, "grad_norm": 1.2909865379333496, "learning_rate": 4.682653714404017e-08, "loss": 1.7055, "step": 4991 }, { "epoch": 0.37370164505081127, "grad_norm": 1.1747167110443115, "learning_rate": 4.493489453614164e-08, "loss": 2.4907, "step": 4992 }, { "epoch": 0.37377650515598976, "grad_norm": 1.4869656562805176, "learning_rate": 4.3082243304770617e-08, "loss": 2.4387, "step": 4993 }, { "epoch": 0.3738513652611682, "grad_norm": 1.2070189714431763, "learning_rate": 4.12685841726268e-08, "loss": 1.8671, "step": 4994 }, { "epoch": 0.3739262253663466, "grad_norm": 1.234904408454895, "learning_rate": 3.94939178471998e-08, "loss": 1.9867, "step": 4995 }, { "epoch": 0.3740010854715251, "grad_norm": 1.1703590154647827, "learning_rate": 3.775824502076919e-08, "loss": 2.0854, "step": 4996 }, { "epoch": 0.37407594557670354, "grad_norm": 1.4773565530776978, "learning_rate": 3.6061566370393376e-08, "loss": 2.1288, "step": 4997 }, { "epoch": 0.37415080568188197, "grad_norm": 1.5366137027740479, "learning_rate": 3.4403882557942915e-08, "loss": 2.2701, "step": 4998 }, { "epoch": 0.37422566578706046, "grad_norm": 1.0518525838851929, "learning_rate": 3.2785194230045004e-08, "loss": 1.7945, "step": 4999 }, { "epoch": 0.3743005258922389, "grad_norm": 1.3006261587142944, "learning_rate": 3.120550201815009e-08, "loss": 1.9247, "step": 5000 }, { "epoch": 0.3743753859974173, "grad_norm": 1.5011438131332397, "learning_rate": 2.9664806538465262e-08, "loss": 2.2498, "step": 5001 }, { "epoch": 0.37445024610259575, "grad_norm": 1.3992769718170166, "learning_rate": 2.816310839199865e-08, "loss": 1.8977, "step": 5002 }, { "epoch": 0.37452510620777424, "grad_norm": 1.1042829751968384, "learning_rate": 2.6700408164548328e-08, "loss": 1.2875, "step": 5003 }, { "epoch": 0.37459996631295267, "grad_norm": 1.28590726852417, "learning_rate": 2.5276706426713425e-08, "loss": 1.9616, "step": 5004 }, { "epoch": 0.3746748264181311, "grad_norm": 1.1898860931396484, "learning_rate": 2.3892003733838598e-08, "loss": 1.7639, "step": 5005 }, { "epoch": 0.3747496865233096, "grad_norm": 1.2147506475448608, "learning_rate": 2.2546300626091753e-08, "loss": 1.849, "step": 5006 }, { "epoch": 0.374824546628488, "grad_norm": 1.2294347286224365, "learning_rate": 2.123959762843075e-08, "loss": 2.1053, "step": 5007 }, { "epoch": 0.37489940673366645, "grad_norm": 1.2910950183868408, "learning_rate": 1.997189525055898e-08, "loss": 2.2129, "step": 5008 }, { "epoch": 0.3749742668388449, "grad_norm": 1.4780951738357544, "learning_rate": 1.874319398702529e-08, "loss": 1.9618, "step": 5009 }, { "epoch": 0.37504912694402337, "grad_norm": 1.4901782274246216, "learning_rate": 1.755349431710185e-08, "loss": 2.0023, "step": 5010 }, { "epoch": 0.3751239870492018, "grad_norm": 1.3150880336761475, "learning_rate": 1.6402796704895196e-08, "loss": 1.5646, "step": 5011 }, { "epoch": 0.37519884715438023, "grad_norm": 1.3421481847763062, "learning_rate": 1.52911015992796e-08, "loss": 2.0128, "step": 5012 }, { "epoch": 0.3752737072595587, "grad_norm": 1.2243260145187378, "learning_rate": 1.4218409433908175e-08, "loss": 2.037, "step": 5013 }, { "epoch": 0.37534856736473715, "grad_norm": 1.2326892614364624, "learning_rate": 1.3184720627235081e-08, "loss": 1.8788, "step": 5014 }, { "epoch": 0.3754234274699156, "grad_norm": 1.4980422258377075, "learning_rate": 1.2190035582471115e-08, "loss": 2.3279, "step": 5015 }, { "epoch": 0.37549828757509407, "grad_norm": 1.1286108493804932, "learning_rate": 1.123435468766143e-08, "loss": 1.798, "step": 5016 }, { "epoch": 0.3755731476802725, "grad_norm": 1.1455106735229492, "learning_rate": 1.031767831558561e-08, "loss": 1.6733, "step": 5017 }, { "epoch": 0.37564800778545093, "grad_norm": 1.0452263355255127, "learning_rate": 9.44000682383539e-09, "loss": 1.8773, "step": 5018 }, { "epoch": 0.37572286789062936, "grad_norm": 1.4218225479125977, "learning_rate": 8.601340554781346e-09, "loss": 2.0528, "step": 5019 }, { "epoch": 0.37579772799580785, "grad_norm": 1.6470973491668701, "learning_rate": 7.801679835572895e-09, "loss": 1.7786, "step": 5020 }, { "epoch": 0.3758725881009863, "grad_norm": 1.2469727993011475, "learning_rate": 7.041024978160504e-09, "loss": 2.051, "step": 5021 }, { "epoch": 0.3759474482061647, "grad_norm": 1.224454641342163, "learning_rate": 6.319376279262379e-09, "loss": 2.2662, "step": 5022 }, { "epoch": 0.3760223083113432, "grad_norm": 1.1871840953826904, "learning_rate": 5.636734020375567e-09, "loss": 1.5023, "step": 5023 }, { "epoch": 0.37609716841652163, "grad_norm": 1.135377049446106, "learning_rate": 4.993098467798163e-09, "loss": 1.3566, "step": 5024 }, { "epoch": 0.37617202852170006, "grad_norm": 1.3303520679473877, "learning_rate": 4.388469872618206e-09, "loss": 1.3843, "step": 5025 }, { "epoch": 0.3762468886268785, "grad_norm": 1.462297797203064, "learning_rate": 3.822848470669272e-09, "loss": 2.2968, "step": 5026 }, { "epoch": 0.376321748732057, "grad_norm": 1.2747176885604858, "learning_rate": 3.296234482619287e-09, "loss": 1.7962, "step": 5027 }, { "epoch": 0.3763966088372354, "grad_norm": 1.284083604812622, "learning_rate": 2.8086281138706148e-09, "loss": 2.1965, "step": 5028 }, { "epoch": 0.37647146894241384, "grad_norm": 1.3260568380355835, "learning_rate": 2.3600295546599704e-09, "loss": 1.9976, "step": 5029 }, { "epoch": 0.37654632904759233, "grad_norm": 1.0794789791107178, "learning_rate": 1.950438979958502e-09, "loss": 1.7483, "step": 5030 }, { "epoch": 0.37662118915277076, "grad_norm": 1.2783838510513306, "learning_rate": 1.5798565495495076e-09, "loss": 1.5004, "step": 5031 }, { "epoch": 0.3766960492579492, "grad_norm": 1.3007869720458984, "learning_rate": 1.2482824079951271e-09, "loss": 2.0624, "step": 5032 }, { "epoch": 0.3767709093631277, "grad_norm": 1.372740387916565, "learning_rate": 9.55716684636343e-10, "loss": 1.866, "step": 5033 }, { "epoch": 0.3768457694683061, "grad_norm": 1.371090054512024, "learning_rate": 7.021594936040821e-10, "loss": 1.9814, "step": 5034 }, { "epoch": 0.37692062957348454, "grad_norm": 1.1047563552856445, "learning_rate": 4.876109338081137e-10, "loss": 1.2821, "step": 5035 }, { "epoch": 0.376995489678663, "grad_norm": 1.2861955165863037, "learning_rate": 3.1207108893704927e-10, "loss": 1.6591, "step": 5036 }, { "epoch": 0.37707034978384146, "grad_norm": 1.1517539024353027, "learning_rate": 1.755400274694452e-10, "loss": 1.8742, "step": 5037 }, { "epoch": 0.3771452098890199, "grad_norm": 1.1889746189117432, "learning_rate": 7.80178026738021e-11, "loss": 1.9279, "step": 5038 }, { "epoch": 0.3772200699941983, "grad_norm": 1.262064814567566, "learning_rate": 1.950445256415634e-11, "loss": 1.9269, "step": 5039 }, { "epoch": 0.3772949300993768, "grad_norm": 1.4935128688812256, "learning_rate": 0.0, "loss": 1.9802, "step": 5040 } ], "logging_steps": 1, "max_steps": 5040, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 150, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 4, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.431234782573363e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }