diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": 11.5, "best_model_checkpoint": "miner_id_24/checkpoint-1000", - "epoch": 1.800720288115246, + "epoch": 2.4009603841536613, "eval_steps": 1000, - "global_step": 3000, + "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -21039,6 +21039,7014 @@ "eval_samples_per_second": 261.522, "eval_steps_per_second": 32.807, "step": 3000 + }, + { + "epoch": 1.8013205282112845, + "grad_norm": 0.0011961712734773755, + "learning_rate": 0.00018465877693180072, + "loss": 23.0, + "step": 3001 + }, + { + "epoch": 1.801920768307323, + "grad_norm": 0.0015885218745097518, + "learning_rate": 0.00018464872064075577, + "loss": 23.0, + "step": 3002 + }, + { + "epoch": 1.8025210084033614, + "grad_norm": 0.0019562295638024807, + "learning_rate": 0.00018463866132881578, + "loss": 23.0, + "step": 3003 + }, + { + "epoch": 1.8031212484993997, + "grad_norm": 0.003253971692174673, + "learning_rate": 0.00018462859899633967, + "loss": 23.0, + "step": 3004 + }, + { + "epoch": 1.8037214885954382, + "grad_norm": 0.000821275869384408, + "learning_rate": 0.00018461853364368653, + "loss": 23.0, + "step": 3005 + }, + { + "epoch": 1.8043217286914766, + "grad_norm": 0.0012803277932107449, + "learning_rate": 0.0001846084652712156, + "loss": 23.0, + "step": 3006 + }, + { + "epoch": 1.8049219687875149, + "grad_norm": 0.0007911855354905128, + "learning_rate": 0.00018459839387928619, + "loss": 23.0, + "step": 3007 + }, + { + "epoch": 1.8055222088835534, + "grad_norm": 0.0007396016153506935, + "learning_rate": 0.0001845883194682577, + "loss": 23.0, + "step": 3008 + }, + { + "epoch": 1.806122448979592, + "grad_norm": 0.0014897441724315286, + "learning_rate": 0.0001845782420384897, + "loss": 23.0, + "step": 3009 + }, + { + "epoch": 1.8067226890756303, + "grad_norm": 0.0004990187590010464, + "learning_rate": 0.0001845681615903418, + "loss": 23.0, + "step": 3010 + }, + { + "epoch": 1.8073229291716686, + "grad_norm": 0.0008598102140240371, + "learning_rate": 0.00018455807812417372, + "loss": 23.0, + "step": 3011 + }, + { + "epoch": 1.8079231692677071, + "grad_norm": 0.0011542581487447023, + "learning_rate": 0.00018454799164034538, + "loss": 23.0, + "step": 3012 + }, + { + "epoch": 1.8085234093637454, + "grad_norm": 0.0013645098078995943, + "learning_rate": 0.00018453790213921665, + "loss": 23.0, + "step": 3013 + }, + { + "epoch": 1.8091236494597838, + "grad_norm": 0.0008868533186614513, + "learning_rate": 0.00018452780962114772, + "loss": 23.0, + "step": 3014 + }, + { + "epoch": 1.8097238895558223, + "grad_norm": 0.0018087849020957947, + "learning_rate": 0.00018451771408649865, + "loss": 23.0, + "step": 3015 + }, + { + "epoch": 1.8103241296518608, + "grad_norm": 0.00035064207622781396, + "learning_rate": 0.00018450761553562975, + "loss": 23.0, + "step": 3016 + }, + { + "epoch": 1.8109243697478992, + "grad_norm": 0.0014904963318258524, + "learning_rate": 0.00018449751396890148, + "loss": 23.0, + "step": 3017 + }, + { + "epoch": 1.8115246098439375, + "grad_norm": 0.0019730026833713055, + "learning_rate": 0.00018448740938667427, + "loss": 23.0, + "step": 3018 + }, + { + "epoch": 1.812124849939976, + "grad_norm": 0.0017877138452604413, + "learning_rate": 0.00018447730178930874, + "loss": 23.0, + "step": 3019 + }, + { + "epoch": 1.8127250900360146, + "grad_norm": 0.0008809540886431932, + "learning_rate": 0.0001844671911771656, + "loss": 23.0, + "step": 3020 + }, + { + "epoch": 1.8133253301320527, + "grad_norm": 0.0015870918286964297, + "learning_rate": 0.0001844570775506057, + "loss": 23.0, + "step": 3021 + }, + { + "epoch": 1.8139255702280912, + "grad_norm": 0.001148207113146782, + "learning_rate": 0.00018444696090998995, + "loss": 23.0, + "step": 3022 + }, + { + "epoch": 1.8145258103241297, + "grad_norm": 0.0025478298775851727, + "learning_rate": 0.00018443684125567938, + "loss": 23.0, + "step": 3023 + }, + { + "epoch": 1.815126050420168, + "grad_norm": 0.0015544770285487175, + "learning_rate": 0.00018442671858803514, + "loss": 23.0, + "step": 3024 + }, + { + "epoch": 1.8157262905162064, + "grad_norm": 0.001434922218322754, + "learning_rate": 0.00018441659290741852, + "loss": 23.0, + "step": 3025 + }, + { + "epoch": 1.816326530612245, + "grad_norm": 0.0009723508264869452, + "learning_rate": 0.00018440646421419085, + "loss": 23.0, + "step": 3026 + }, + { + "epoch": 1.8169267707082835, + "grad_norm": 0.0012221380602568388, + "learning_rate": 0.00018439633250871354, + "loss": 23.0, + "step": 3027 + }, + { + "epoch": 1.8175270108043218, + "grad_norm": 0.0012381580891087651, + "learning_rate": 0.00018438619779134822, + "loss": 23.0, + "step": 3028 + }, + { + "epoch": 1.81812725090036, + "grad_norm": 0.0011028312146663666, + "learning_rate": 0.0001843760600624566, + "loss": 23.0, + "step": 3029 + }, + { + "epoch": 1.8187274909963986, + "grad_norm": 0.0011208378709852695, + "learning_rate": 0.0001843659193224004, + "loss": 23.0, + "step": 3030 + }, + { + "epoch": 1.819327731092437, + "grad_norm": 0.001310601131990552, + "learning_rate": 0.0001843557755715416, + "loss": 23.0, + "step": 3031 + }, + { + "epoch": 1.8199279711884753, + "grad_norm": 0.0017411686712875962, + "learning_rate": 0.00018434562881024214, + "loss": 23.0, + "step": 3032 + }, + { + "epoch": 1.8205282112845138, + "grad_norm": 0.0012776413932442665, + "learning_rate": 0.00018433547903886415, + "loss": 23.0, + "step": 3033 + }, + { + "epoch": 1.8211284513805523, + "grad_norm": 0.00228596362285316, + "learning_rate": 0.00018432532625776987, + "loss": 23.0, + "step": 3034 + }, + { + "epoch": 1.8217286914765907, + "grad_norm": 0.0009363253484480083, + "learning_rate": 0.00018431517046732157, + "loss": 23.0, + "step": 3035 + }, + { + "epoch": 1.822328931572629, + "grad_norm": 0.0019296659156680107, + "learning_rate": 0.00018430501166788173, + "loss": 23.0, + "step": 3036 + }, + { + "epoch": 1.8229291716686675, + "grad_norm": 0.0037253268528729677, + "learning_rate": 0.0001842948498598129, + "loss": 23.0, + "step": 3037 + }, + { + "epoch": 1.8235294117647058, + "grad_norm": 0.0003991881385445595, + "learning_rate": 0.00018428468504347767, + "loss": 23.0, + "step": 3038 + }, + { + "epoch": 1.8241296518607442, + "grad_norm": 0.004006813745945692, + "learning_rate": 0.00018427451721923888, + "loss": 23.0, + "step": 3039 + }, + { + "epoch": 1.8247298919567827, + "grad_norm": 0.0013116990448907018, + "learning_rate": 0.00018426434638745933, + "loss": 23.0, + "step": 3040 + }, + { + "epoch": 1.8253301320528212, + "grad_norm": 0.0044224620796740055, + "learning_rate": 0.000184254172548502, + "loss": 23.0, + "step": 3041 + }, + { + "epoch": 1.8259303721488596, + "grad_norm": 0.0010624263668432832, + "learning_rate": 0.00018424399570273002, + "loss": 23.0, + "step": 3042 + }, + { + "epoch": 1.8265306122448979, + "grad_norm": 0.00032893690513446927, + "learning_rate": 0.0001842338158505065, + "loss": 23.0, + "step": 3043 + }, + { + "epoch": 1.8271308523409364, + "grad_norm": 0.0007519045611843467, + "learning_rate": 0.00018422363299219478, + "loss": 23.0, + "step": 3044 + }, + { + "epoch": 1.8277310924369747, + "grad_norm": 0.0011156450491398573, + "learning_rate": 0.00018421344712815825, + "loss": 23.0, + "step": 3045 + }, + { + "epoch": 1.828331332533013, + "grad_norm": 0.0012068160576745868, + "learning_rate": 0.0001842032582587604, + "loss": 23.0, + "step": 3046 + }, + { + "epoch": 1.8289315726290516, + "grad_norm": 0.00150686118286103, + "learning_rate": 0.00018419306638436486, + "loss": 23.0, + "step": 3047 + }, + { + "epoch": 1.8295318127250901, + "grad_norm": 0.0015496856067329645, + "learning_rate": 0.00018418287150533537, + "loss": 23.0, + "step": 3048 + }, + { + "epoch": 1.8301320528211285, + "grad_norm": 0.0019011717522516847, + "learning_rate": 0.00018417267362203575, + "loss": 23.0, + "step": 3049 + }, + { + "epoch": 1.8307322929171668, + "grad_norm": 0.001324790413491428, + "learning_rate": 0.00018416247273482988, + "loss": 23.0, + "step": 3050 + }, + { + "epoch": 1.8313325330132053, + "grad_norm": 0.0008508468163199723, + "learning_rate": 0.0001841522688440819, + "loss": 23.0, + "step": 3051 + }, + { + "epoch": 1.8319327731092439, + "grad_norm": 0.0004634323122445494, + "learning_rate": 0.00018414206195015587, + "loss": 23.0, + "step": 3052 + }, + { + "epoch": 1.832533013205282, + "grad_norm": 0.003283916972577572, + "learning_rate": 0.0001841318520534161, + "loss": 23.0, + "step": 3053 + }, + { + "epoch": 1.8331332533013205, + "grad_norm": 0.0021354639902710915, + "learning_rate": 0.00018412163915422695, + "loss": 23.0, + "step": 3054 + }, + { + "epoch": 1.833733493397359, + "grad_norm": 0.0010375322308391333, + "learning_rate": 0.0001841114232529529, + "loss": 23.0, + "step": 3055 + }, + { + "epoch": 1.8343337334933973, + "grad_norm": 0.0026274758856743574, + "learning_rate": 0.0001841012043499585, + "loss": 23.0, + "step": 3056 + }, + { + "epoch": 1.8349339735894357, + "grad_norm": 0.0008885942515917122, + "learning_rate": 0.00018409098244560844, + "loss": 23.0, + "step": 3057 + }, + { + "epoch": 1.8355342136854742, + "grad_norm": 0.0019456876907497644, + "learning_rate": 0.00018408075754026753, + "loss": 23.0, + "step": 3058 + }, + { + "epoch": 1.8361344537815127, + "grad_norm": 0.002114507369697094, + "learning_rate": 0.00018407052963430068, + "loss": 23.0, + "step": 3059 + }, + { + "epoch": 1.836734693877551, + "grad_norm": 0.0008166294428519905, + "learning_rate": 0.00018406029872807287, + "loss": 23.0, + "step": 3060 + }, + { + "epoch": 1.8373349339735894, + "grad_norm": 0.0016927891410887241, + "learning_rate": 0.00018405006482194921, + "loss": 23.0, + "step": 3061 + }, + { + "epoch": 1.837935174069628, + "grad_norm": 0.0014067868469282985, + "learning_rate": 0.00018403982791629497, + "loss": 23.0, + "step": 3062 + }, + { + "epoch": 1.8385354141656662, + "grad_norm": 0.0017834990285336971, + "learning_rate": 0.00018402958801147542, + "loss": 23.0, + "step": 3063 + }, + { + "epoch": 1.8391356542617046, + "grad_norm": 0.0011745449155569077, + "learning_rate": 0.00018401934510785606, + "loss": 23.0, + "step": 3064 + }, + { + "epoch": 1.839735894357743, + "grad_norm": 0.0009545880602672696, + "learning_rate": 0.00018400909920580238, + "loss": 23.0, + "step": 3065 + }, + { + "epoch": 1.8403361344537816, + "grad_norm": 0.003715930739417672, + "learning_rate": 0.00018399885030568003, + "loss": 23.0, + "step": 3066 + }, + { + "epoch": 1.84093637454982, + "grad_norm": 0.0006399541744031012, + "learning_rate": 0.0001839885984078548, + "loss": 23.0, + "step": 3067 + }, + { + "epoch": 1.8415366146458583, + "grad_norm": 0.001763959531672299, + "learning_rate": 0.00018397834351269255, + "loss": 23.0, + "step": 3068 + }, + { + "epoch": 1.8421368547418968, + "grad_norm": 0.0012634745799005032, + "learning_rate": 0.0001839680856205592, + "loss": 23.0, + "step": 3069 + }, + { + "epoch": 1.8427370948379351, + "grad_norm": 0.0015287197893485427, + "learning_rate": 0.0001839578247318209, + "loss": 23.0, + "step": 3070 + }, + { + "epoch": 1.8433373349339734, + "grad_norm": 0.0009792933706194162, + "learning_rate": 0.00018394756084684377, + "loss": 23.0, + "step": 3071 + }, + { + "epoch": 1.843937575030012, + "grad_norm": 0.0009839729173108935, + "learning_rate": 0.00018393729396599416, + "loss": 23.0, + "step": 3072 + }, + { + "epoch": 1.8445378151260505, + "grad_norm": 0.0008178153657354414, + "learning_rate": 0.00018392702408963842, + "loss": 23.0, + "step": 3073 + }, + { + "epoch": 1.8451380552220888, + "grad_norm": 0.0018251874716952443, + "learning_rate": 0.0001839167512181431, + "loss": 23.0, + "step": 3074 + }, + { + "epoch": 1.8457382953181272, + "grad_norm": 0.002089198911562562, + "learning_rate": 0.00018390647535187476, + "loss": 23.0, + "step": 3075 + }, + { + "epoch": 1.8463385354141657, + "grad_norm": 0.0021562918554991484, + "learning_rate": 0.00018389619649120017, + "loss": 23.0, + "step": 3076 + }, + { + "epoch": 1.8469387755102042, + "grad_norm": 0.0017833629390224814, + "learning_rate": 0.00018388591463648616, + "loss": 23.0, + "step": 3077 + }, + { + "epoch": 1.8475390156062423, + "grad_norm": 0.000439735857071355, + "learning_rate": 0.00018387562978809958, + "loss": 23.0, + "step": 3078 + }, + { + "epoch": 1.8481392557022809, + "grad_norm": 0.001050430117174983, + "learning_rate": 0.00018386534194640754, + "loss": 23.0, + "step": 3079 + }, + { + "epoch": 1.8487394957983194, + "grad_norm": 0.001189610455185175, + "learning_rate": 0.00018385505111177719, + "loss": 23.0, + "step": 3080 + }, + { + "epoch": 1.8493397358943577, + "grad_norm": 0.0010707725305110216, + "learning_rate": 0.00018384475728457574, + "loss": 23.0, + "step": 3081 + }, + { + "epoch": 1.849939975990396, + "grad_norm": 0.002398673677816987, + "learning_rate": 0.00018383446046517062, + "loss": 23.0, + "step": 3082 + }, + { + "epoch": 1.8505402160864346, + "grad_norm": 0.0018957408610731363, + "learning_rate": 0.00018382416065392923, + "loss": 23.0, + "step": 3083 + }, + { + "epoch": 1.8511404561824731, + "grad_norm": 0.0022743563167750835, + "learning_rate": 0.0001838138578512192, + "loss": 23.0, + "step": 3084 + }, + { + "epoch": 1.8517406962785115, + "grad_norm": 0.0014944793656468391, + "learning_rate": 0.00018380355205740813, + "loss": 23.0, + "step": 3085 + }, + { + "epoch": 1.8523409363745498, + "grad_norm": 0.0011577775003388524, + "learning_rate": 0.00018379324327286387, + "loss": 23.0, + "step": 3086 + }, + { + "epoch": 1.8529411764705883, + "grad_norm": 0.0023493997287005186, + "learning_rate": 0.0001837829314979543, + "loss": 23.0, + "step": 3087 + }, + { + "epoch": 1.8535414165666266, + "grad_norm": 0.0017023790860548615, + "learning_rate": 0.00018377261673304743, + "loss": 23.0, + "step": 3088 + }, + { + "epoch": 1.854141656662665, + "grad_norm": 0.001627809484489262, + "learning_rate": 0.00018376229897851135, + "loss": 23.0, + "step": 3089 + }, + { + "epoch": 1.8547418967587035, + "grad_norm": 0.0016393931582570076, + "learning_rate": 0.00018375197823471427, + "loss": 23.0, + "step": 3090 + }, + { + "epoch": 1.855342136854742, + "grad_norm": 0.0018370094476267695, + "learning_rate": 0.00018374165450202456, + "loss": 23.0, + "step": 3091 + }, + { + "epoch": 1.8559423769507803, + "grad_norm": 0.0014342045178636909, + "learning_rate": 0.0001837313277808106, + "loss": 23.0, + "step": 3092 + }, + { + "epoch": 1.8565426170468187, + "grad_norm": 0.0029582458082586527, + "learning_rate": 0.0001837209980714409, + "loss": 23.0, + "step": 3093 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.0012816624948754907, + "learning_rate": 0.00018371066537428417, + "loss": 23.0, + "step": 3094 + }, + { + "epoch": 1.8577430972388955, + "grad_norm": 0.002697428921237588, + "learning_rate": 0.0001837003296897091, + "loss": 23.0, + "step": 3095 + }, + { + "epoch": 1.8583433373349338, + "grad_norm": 0.0037919096648693085, + "learning_rate": 0.00018368999101808458, + "loss": 23.0, + "step": 3096 + }, + { + "epoch": 1.8589435774309724, + "grad_norm": 0.003335389541462064, + "learning_rate": 0.00018367964935977953, + "loss": 23.0, + "step": 3097 + }, + { + "epoch": 1.859543817527011, + "grad_norm": 0.0011625085026025772, + "learning_rate": 0.00018366930471516306, + "loss": 23.0, + "step": 3098 + }, + { + "epoch": 1.8601440576230492, + "grad_norm": 0.001312834327109158, + "learning_rate": 0.00018365895708460434, + "loss": 23.0, + "step": 3099 + }, + { + "epoch": 1.8607442977190876, + "grad_norm": 0.0009217692422680557, + "learning_rate": 0.00018364860646847265, + "loss": 23.0, + "step": 3100 + }, + { + "epoch": 1.861344537815126, + "grad_norm": 0.001049682148732245, + "learning_rate": 0.00018363825286713735, + "loss": 23.0, + "step": 3101 + }, + { + "epoch": 1.8619447779111644, + "grad_norm": 0.0009720840607769787, + "learning_rate": 0.00018362789628096794, + "loss": 23.0, + "step": 3102 + }, + { + "epoch": 1.8625450180072027, + "grad_norm": 0.0013249212643131614, + "learning_rate": 0.000183617536710334, + "loss": 23.0, + "step": 3103 + }, + { + "epoch": 1.8631452581032413, + "grad_norm": 0.0006475619738921523, + "learning_rate": 0.00018360717415560532, + "loss": 23.0, + "step": 3104 + }, + { + "epoch": 1.8637454981992798, + "grad_norm": 0.0009137241286225617, + "learning_rate": 0.00018359680861715163, + "loss": 23.0, + "step": 3105 + }, + { + "epoch": 1.8643457382953181, + "grad_norm": 0.001901013427414, + "learning_rate": 0.00018358644009534286, + "loss": 23.0, + "step": 3106 + }, + { + "epoch": 1.8649459783913565, + "grad_norm": 0.0014280707109719515, + "learning_rate": 0.0001835760685905491, + "loss": 23.0, + "step": 3107 + }, + { + "epoch": 1.865546218487395, + "grad_norm": 0.0019309140043333173, + "learning_rate": 0.00018356569410314042, + "loss": 23.0, + "step": 3108 + }, + { + "epoch": 1.8661464585834335, + "grad_norm": 0.0008051837212406099, + "learning_rate": 0.00018355531663348705, + "loss": 23.0, + "step": 3109 + }, + { + "epoch": 1.8667466986794716, + "grad_norm": 0.0013398687588050961, + "learning_rate": 0.00018354493618195935, + "loss": 23.0, + "step": 3110 + }, + { + "epoch": 1.8673469387755102, + "grad_norm": 0.000537229236215353, + "learning_rate": 0.00018353455274892783, + "loss": 23.0, + "step": 3111 + }, + { + "epoch": 1.8679471788715487, + "grad_norm": 0.00452477065846324, + "learning_rate": 0.00018352416633476298, + "loss": 23.0, + "step": 3112 + }, + { + "epoch": 1.868547418967587, + "grad_norm": 0.002200663322582841, + "learning_rate": 0.0001835137769398355, + "loss": 23.0, + "step": 3113 + }, + { + "epoch": 1.8691476590636253, + "grad_norm": 0.0013940661447122693, + "learning_rate": 0.0001835033845645161, + "loss": 23.0, + "step": 3114 + }, + { + "epoch": 1.8697478991596639, + "grad_norm": 0.002113936236128211, + "learning_rate": 0.00018349298920917572, + "loss": 23.0, + "step": 3115 + }, + { + "epoch": 1.8703481392557024, + "grad_norm": 0.0012045325711369514, + "learning_rate": 0.00018348259087418533, + "loss": 23.0, + "step": 3116 + }, + { + "epoch": 1.8709483793517407, + "grad_norm": 0.0033185603097081184, + "learning_rate": 0.000183472189559916, + "loss": 23.0, + "step": 3117 + }, + { + "epoch": 1.871548619447779, + "grad_norm": 0.0011681788600981236, + "learning_rate": 0.00018346178526673897, + "loss": 23.0, + "step": 3118 + }, + { + "epoch": 1.8721488595438176, + "grad_norm": 0.0022086468525230885, + "learning_rate": 0.0001834513779950255, + "loss": 23.0, + "step": 3119 + }, + { + "epoch": 1.872749099639856, + "grad_norm": 0.0005794914322905242, + "learning_rate": 0.000183440967745147, + "loss": 23.0, + "step": 3120 + }, + { + "epoch": 1.8733493397358942, + "grad_norm": 0.0011907683219760656, + "learning_rate": 0.00018343055451747498, + "loss": 23.0, + "step": 3121 + }, + { + "epoch": 1.8739495798319328, + "grad_norm": 0.0005777979386039078, + "learning_rate": 0.0001834201383123811, + "loss": 23.0, + "step": 3122 + }, + { + "epoch": 1.8745498199279713, + "grad_norm": 0.004292360506951809, + "learning_rate": 0.00018340971913023707, + "loss": 23.0, + "step": 3123 + }, + { + "epoch": 1.8751500600240096, + "grad_norm": 0.0035806214436888695, + "learning_rate": 0.00018339929697141474, + "loss": 23.0, + "step": 3124 + }, + { + "epoch": 1.875750300120048, + "grad_norm": 0.0015884626191109419, + "learning_rate": 0.00018338887183628599, + "loss": 23.0, + "step": 3125 + }, + { + "epoch": 1.8763505402160865, + "grad_norm": 0.0011369887506589293, + "learning_rate": 0.00018337844372522292, + "loss": 23.0, + "step": 3126 + }, + { + "epoch": 1.8769507803121248, + "grad_norm": 0.005162140820175409, + "learning_rate": 0.00018336801263859768, + "loss": 23.0, + "step": 3127 + }, + { + "epoch": 1.8775510204081631, + "grad_norm": 0.0024860259145498276, + "learning_rate": 0.00018335757857678253, + "loss": 23.0, + "step": 3128 + }, + { + "epoch": 1.8781512605042017, + "grad_norm": 0.0012651464203372598, + "learning_rate": 0.0001833471415401498, + "loss": 23.0, + "step": 3129 + }, + { + "epoch": 1.8787515006002402, + "grad_norm": 0.00254456396214664, + "learning_rate": 0.00018333670152907198, + "loss": 23.0, + "step": 3130 + }, + { + "epoch": 1.8793517406962785, + "grad_norm": 0.0015262291999533772, + "learning_rate": 0.00018332625854392164, + "loss": 23.0, + "step": 3131 + }, + { + "epoch": 1.8799519807923168, + "grad_norm": 0.002018503611907363, + "learning_rate": 0.0001833158125850715, + "loss": 23.0, + "step": 3132 + }, + { + "epoch": 1.8805522208883554, + "grad_norm": 0.00178581103682518, + "learning_rate": 0.0001833053636528943, + "loss": 23.0, + "step": 3133 + }, + { + "epoch": 1.8811524609843937, + "grad_norm": 0.002688503125682473, + "learning_rate": 0.00018329491174776295, + "loss": 23.0, + "step": 3134 + }, + { + "epoch": 1.881752701080432, + "grad_norm": 0.0027273113373667, + "learning_rate": 0.0001832844568700505, + "loss": 23.0, + "step": 3135 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 0.0010961064836010337, + "learning_rate": 0.00018327399902013, + "loss": 23.0, + "step": 3136 + }, + { + "epoch": 1.882953181272509, + "grad_norm": 0.0035878592170774937, + "learning_rate": 0.00018326353819837467, + "loss": 23.0, + "step": 3137 + }, + { + "epoch": 1.8835534213685474, + "grad_norm": 0.0022766878828406334, + "learning_rate": 0.00018325307440515785, + "loss": 23.0, + "step": 3138 + }, + { + "epoch": 1.8841536614645857, + "grad_norm": 0.0014452520990744233, + "learning_rate": 0.00018324260764085294, + "loss": 23.0, + "step": 3139 + }, + { + "epoch": 1.8847539015606243, + "grad_norm": 0.001565837417729199, + "learning_rate": 0.00018323213790583353, + "loss": 23.0, + "step": 3140 + }, + { + "epoch": 1.8853541416566628, + "grad_norm": 0.002488331636413932, + "learning_rate": 0.0001832216652004732, + "loss": 23.0, + "step": 3141 + }, + { + "epoch": 1.885954381752701, + "grad_norm": 0.0023180756252259016, + "learning_rate": 0.00018321118952514568, + "loss": 23.0, + "step": 3142 + }, + { + "epoch": 1.8865546218487395, + "grad_norm": 0.0013001611223444343, + "learning_rate": 0.0001832007108802249, + "loss": 23.0, + "step": 3143 + }, + { + "epoch": 1.887154861944778, + "grad_norm": 0.00461295573040843, + "learning_rate": 0.0001831902292660847, + "loss": 23.0, + "step": 3144 + }, + { + "epoch": 1.8877551020408163, + "grad_norm": 0.0016194535419344902, + "learning_rate": 0.00018317974468309926, + "loss": 23.0, + "step": 3145 + }, + { + "epoch": 1.8883553421368546, + "grad_norm": 0.002495100488886237, + "learning_rate": 0.00018316925713164267, + "loss": 23.0, + "step": 3146 + }, + { + "epoch": 1.8889555822328932, + "grad_norm": 0.0009624058147892356, + "learning_rate": 0.00018315876661208926, + "loss": 23.0, + "step": 3147 + }, + { + "epoch": 1.8895558223289317, + "grad_norm": 0.002280160551890731, + "learning_rate": 0.00018314827312481335, + "loss": 23.0, + "step": 3148 + }, + { + "epoch": 1.89015606242497, + "grad_norm": 0.001309713814407587, + "learning_rate": 0.00018313777667018947, + "loss": 23.0, + "step": 3149 + }, + { + "epoch": 1.8907563025210083, + "grad_norm": 0.00454369792714715, + "learning_rate": 0.0001831272772485922, + "loss": 23.0, + "step": 3150 + }, + { + "epoch": 1.8913565426170469, + "grad_norm": 0.0026175181847065687, + "learning_rate": 0.0001831167748603962, + "loss": 23.0, + "step": 3151 + }, + { + "epoch": 1.8919567827130852, + "grad_norm": 0.0012701862724497914, + "learning_rate": 0.00018310626950597637, + "loss": 23.0, + "step": 3152 + }, + { + "epoch": 1.8925570228091235, + "grad_norm": 0.0033725642133504152, + "learning_rate": 0.0001830957611857075, + "loss": 23.0, + "step": 3153 + }, + { + "epoch": 1.893157262905162, + "grad_norm": 0.005127619951963425, + "learning_rate": 0.0001830852498999647, + "loss": 23.0, + "step": 3154 + }, + { + "epoch": 1.8937575030012006, + "grad_norm": 0.0004975322517566383, + "learning_rate": 0.00018307473564912303, + "loss": 23.0, + "step": 3155 + }, + { + "epoch": 1.894357743097239, + "grad_norm": 0.002670707879588008, + "learning_rate": 0.00018306421843355773, + "loss": 23.0, + "step": 3156 + }, + { + "epoch": 1.8949579831932772, + "grad_norm": 0.003962348680943251, + "learning_rate": 0.00018305369825364417, + "loss": 23.0, + "step": 3157 + }, + { + "epoch": 1.8955582232893158, + "grad_norm": 0.000557166466023773, + "learning_rate": 0.00018304317510975775, + "loss": 23.0, + "step": 3158 + }, + { + "epoch": 1.896158463385354, + "grad_norm": 0.0021320811938494444, + "learning_rate": 0.00018303264900227398, + "loss": 23.0, + "step": 3159 + }, + { + "epoch": 1.8967587034813924, + "grad_norm": 0.003195348661392927, + "learning_rate": 0.0001830221199315686, + "loss": 23.0, + "step": 3160 + }, + { + "epoch": 1.897358943577431, + "grad_norm": 0.0013635127106681466, + "learning_rate": 0.0001830115878980173, + "loss": 23.0, + "step": 3161 + }, + { + "epoch": 1.8979591836734695, + "grad_norm": 0.002584997797384858, + "learning_rate": 0.00018300105290199595, + "loss": 23.0, + "step": 3162 + }, + { + "epoch": 1.8985594237695078, + "grad_norm": 0.0012520054588094354, + "learning_rate": 0.00018299051494388054, + "loss": 23.0, + "step": 3163 + }, + { + "epoch": 1.8991596638655461, + "grad_norm": 0.0018940556328743696, + "learning_rate": 0.00018297997402404715, + "loss": 23.0, + "step": 3164 + }, + { + "epoch": 1.8997599039615847, + "grad_norm": 0.0019348510541021824, + "learning_rate": 0.00018296943014287193, + "loss": 23.0, + "step": 3165 + }, + { + "epoch": 1.9003601440576232, + "grad_norm": 0.0018156596925109625, + "learning_rate": 0.00018295888330073113, + "loss": 23.0, + "step": 3166 + }, + { + "epoch": 1.9009603841536613, + "grad_norm": 0.0035436146426945925, + "learning_rate": 0.0001829483334980012, + "loss": 23.0, + "step": 3167 + }, + { + "epoch": 1.9015606242496998, + "grad_norm": 0.0012572517152875662, + "learning_rate": 0.00018293778073505865, + "loss": 23.0, + "step": 3168 + }, + { + "epoch": 1.9021608643457384, + "grad_norm": 0.0008448131266050041, + "learning_rate": 0.00018292722501228004, + "loss": 23.0, + "step": 3169 + }, + { + "epoch": 1.9027611044417767, + "grad_norm": 0.0037472823169082403, + "learning_rate": 0.00018291666633004204, + "loss": 23.0, + "step": 3170 + }, + { + "epoch": 1.903361344537815, + "grad_norm": 0.0018287114799022675, + "learning_rate": 0.00018290610468872155, + "loss": 23.0, + "step": 3171 + }, + { + "epoch": 1.9039615846338536, + "grad_norm": 0.0022742743603885174, + "learning_rate": 0.0001828955400886954, + "loss": 23.0, + "step": 3172 + }, + { + "epoch": 1.904561824729892, + "grad_norm": 0.0018581197364255786, + "learning_rate": 0.0001828849725303407, + "loss": 23.0, + "step": 3173 + }, + { + "epoch": 1.9051620648259304, + "grad_norm": 0.001101133180782199, + "learning_rate": 0.0001828744020140345, + "loss": 23.0, + "step": 3174 + }, + { + "epoch": 1.9057623049219687, + "grad_norm": 0.0020204149186611176, + "learning_rate": 0.0001828638285401541, + "loss": 23.0, + "step": 3175 + }, + { + "epoch": 1.9063625450180073, + "grad_norm": 0.0028773958329111338, + "learning_rate": 0.0001828532521090768, + "loss": 23.0, + "step": 3176 + }, + { + "epoch": 1.9069627851140456, + "grad_norm": 0.0018713538302108645, + "learning_rate": 0.00018284267272118004, + "loss": 23.0, + "step": 3177 + }, + { + "epoch": 1.907563025210084, + "grad_norm": 0.0018324162811040878, + "learning_rate": 0.00018283209037684141, + "loss": 23.0, + "step": 3178 + }, + { + "epoch": 1.9081632653061225, + "grad_norm": 0.002509085228666663, + "learning_rate": 0.00018282150507643855, + "loss": 23.0, + "step": 3179 + }, + { + "epoch": 1.908763505402161, + "grad_norm": 0.002151481108739972, + "learning_rate": 0.0001828109168203492, + "loss": 23.0, + "step": 3180 + }, + { + "epoch": 1.9093637454981993, + "grad_norm": 0.0014471776084974408, + "learning_rate": 0.00018280032560895123, + "loss": 23.0, + "step": 3181 + }, + { + "epoch": 1.9099639855942376, + "grad_norm": 0.0023641546722501516, + "learning_rate": 0.00018278973144262266, + "loss": 23.0, + "step": 3182 + }, + { + "epoch": 1.9105642256902762, + "grad_norm": 0.003392763901501894, + "learning_rate": 0.0001827791343217415, + "loss": 23.0, + "step": 3183 + }, + { + "epoch": 1.9111644657863145, + "grad_norm": 0.003529018722474575, + "learning_rate": 0.00018276853424668602, + "loss": 23.0, + "step": 3184 + }, + { + "epoch": 1.9117647058823528, + "grad_norm": 0.003656035289168358, + "learning_rate": 0.0001827579312178344, + "loss": 23.0, + "step": 3185 + }, + { + "epoch": 1.9123649459783914, + "grad_norm": 0.0014189134817570448, + "learning_rate": 0.00018274732523556513, + "loss": 23.0, + "step": 3186 + }, + { + "epoch": 1.91296518607443, + "grad_norm": 0.0013264564331620932, + "learning_rate": 0.00018273671630025665, + "loss": 23.0, + "step": 3187 + }, + { + "epoch": 1.9135654261704682, + "grad_norm": 0.004117329139262438, + "learning_rate": 0.00018272610441228763, + "loss": 23.0, + "step": 3188 + }, + { + "epoch": 1.9141656662665065, + "grad_norm": 0.0024092942476272583, + "learning_rate": 0.0001827154895720367, + "loss": 23.0, + "step": 3189 + }, + { + "epoch": 1.914765906362545, + "grad_norm": 0.001247992506250739, + "learning_rate": 0.00018270487177988272, + "loss": 23.0, + "step": 3190 + }, + { + "epoch": 1.9153661464585834, + "grad_norm": 0.006278992164880037, + "learning_rate": 0.00018269425103620464, + "loss": 23.0, + "step": 3191 + }, + { + "epoch": 1.9159663865546217, + "grad_norm": 0.001128924312070012, + "learning_rate": 0.00018268362734138142, + "loss": 23.0, + "step": 3192 + }, + { + "epoch": 1.9165666266506602, + "grad_norm": 0.004643549211323261, + "learning_rate": 0.00018267300069579222, + "loss": 23.0, + "step": 3193 + }, + { + "epoch": 1.9171668667466988, + "grad_norm": 0.002494530752301216, + "learning_rate": 0.00018266237109981632, + "loss": 23.0, + "step": 3194 + }, + { + "epoch": 1.917767106842737, + "grad_norm": 0.00441815285012126, + "learning_rate": 0.000182651738553833, + "loss": 23.0, + "step": 3195 + }, + { + "epoch": 1.9183673469387754, + "grad_norm": 0.0028012660332024097, + "learning_rate": 0.00018264110305822178, + "loss": 23.0, + "step": 3196 + }, + { + "epoch": 1.918967587034814, + "grad_norm": 0.0017156752292066813, + "learning_rate": 0.00018263046461336214, + "loss": 23.0, + "step": 3197 + }, + { + "epoch": 1.9195678271308525, + "grad_norm": 0.002733529545366764, + "learning_rate": 0.00018261982321963379, + "loss": 23.0, + "step": 3198 + }, + { + "epoch": 1.9201680672268906, + "grad_norm": 0.0011137155815958977, + "learning_rate": 0.00018260917887741643, + "loss": 23.0, + "step": 3199 + }, + { + "epoch": 1.9207683073229291, + "grad_norm": 0.0014791876310482621, + "learning_rate": 0.00018259853158708998, + "loss": 23.0, + "step": 3200 + }, + { + "epoch": 1.9213685474189677, + "grad_norm": 0.0009261185768991709, + "learning_rate": 0.00018258788134903445, + "loss": 23.0, + "step": 3201 + }, + { + "epoch": 1.921968787515006, + "grad_norm": 0.0030771689489483833, + "learning_rate": 0.00018257722816362985, + "loss": 23.0, + "step": 3202 + }, + { + "epoch": 1.9225690276110443, + "grad_norm": 0.003100763075053692, + "learning_rate": 0.00018256657203125637, + "loss": 23.0, + "step": 3203 + }, + { + "epoch": 1.9231692677070829, + "grad_norm": 0.002674756571650505, + "learning_rate": 0.00018255591295229437, + "loss": 23.0, + "step": 3204 + }, + { + "epoch": 1.9237695078031214, + "grad_norm": 0.0022848814260214567, + "learning_rate": 0.00018254525092712415, + "loss": 23.0, + "step": 3205 + }, + { + "epoch": 1.9243697478991597, + "grad_norm": 0.0018523391336202621, + "learning_rate": 0.00018253458595612627, + "loss": 23.0, + "step": 3206 + }, + { + "epoch": 1.924969987995198, + "grad_norm": 0.0009723806870169938, + "learning_rate": 0.00018252391803968133, + "loss": 23.0, + "step": 3207 + }, + { + "epoch": 1.9255702280912366, + "grad_norm": 0.001458384795114398, + "learning_rate": 0.00018251324717817005, + "loss": 23.0, + "step": 3208 + }, + { + "epoch": 1.9261704681872749, + "grad_norm": 0.0013265210436657071, + "learning_rate": 0.0001825025733719732, + "loss": 23.0, + "step": 3209 + }, + { + "epoch": 1.9267707082833132, + "grad_norm": 0.0010744367027655244, + "learning_rate": 0.00018249189662147171, + "loss": 23.0, + "step": 3210 + }, + { + "epoch": 1.9273709483793517, + "grad_norm": 0.0026470967568457127, + "learning_rate": 0.00018248121692704668, + "loss": 23.0, + "step": 3211 + }, + { + "epoch": 1.9279711884753903, + "grad_norm": 0.0016062292270362377, + "learning_rate": 0.00018247053428907913, + "loss": 23.0, + "step": 3212 + }, + { + "epoch": 1.9285714285714286, + "grad_norm": 0.000824375543743372, + "learning_rate": 0.00018245984870795038, + "loss": 23.0, + "step": 3213 + }, + { + "epoch": 1.929171668667467, + "grad_norm": 0.002928749890998006, + "learning_rate": 0.00018244916018404173, + "loss": 23.0, + "step": 3214 + }, + { + "epoch": 1.9297719087635055, + "grad_norm": 0.002207791665568948, + "learning_rate": 0.00018243846871773465, + "loss": 23.0, + "step": 3215 + }, + { + "epoch": 1.9303721488595438, + "grad_norm": 0.0028601568192243576, + "learning_rate": 0.00018242777430941068, + "loss": 23.0, + "step": 3216 + }, + { + "epoch": 1.930972388955582, + "grad_norm": 0.0021290185395628214, + "learning_rate": 0.00018241707695945144, + "loss": 23.0, + "step": 3217 + }, + { + "epoch": 1.9315726290516206, + "grad_norm": 0.000839258951600641, + "learning_rate": 0.00018240637666823876, + "loss": 23.0, + "step": 3218 + }, + { + "epoch": 1.9321728691476592, + "grad_norm": 0.002701072720810771, + "learning_rate": 0.00018239567343615443, + "loss": 23.0, + "step": 3219 + }, + { + "epoch": 1.9327731092436975, + "grad_norm": 0.0020356529857963324, + "learning_rate": 0.00018238496726358052, + "loss": 23.0, + "step": 3220 + }, + { + "epoch": 1.9333733493397358, + "grad_norm": 0.0014873469481244683, + "learning_rate": 0.00018237425815089903, + "loss": 23.0, + "step": 3221 + }, + { + "epoch": 1.9339735894357744, + "grad_norm": 0.002182344440370798, + "learning_rate": 0.00018236354609849214, + "loss": 23.0, + "step": 3222 + }, + { + "epoch": 1.9345738295318127, + "grad_norm": 0.0016528373816981912, + "learning_rate": 0.00018235283110674218, + "loss": 23.0, + "step": 3223 + }, + { + "epoch": 1.935174069627851, + "grad_norm": 0.0012866542674601078, + "learning_rate": 0.0001823421131760315, + "loss": 23.0, + "step": 3224 + }, + { + "epoch": 1.9357743097238895, + "grad_norm": 0.0019892966374754906, + "learning_rate": 0.00018233139230674258, + "loss": 23.0, + "step": 3225 + }, + { + "epoch": 1.936374549819928, + "grad_norm": 0.002182098338380456, + "learning_rate": 0.0001823206684992581, + "loss": 23.0, + "step": 3226 + }, + { + "epoch": 1.9369747899159664, + "grad_norm": 0.0021431762725114822, + "learning_rate": 0.0001823099417539607, + "loss": 23.0, + "step": 3227 + }, + { + "epoch": 1.9375750300120047, + "grad_norm": 0.0024950047954916954, + "learning_rate": 0.00018229921207123318, + "loss": 23.0, + "step": 3228 + }, + { + "epoch": 1.9381752701080432, + "grad_norm": 0.0017660352168604732, + "learning_rate": 0.0001822884794514585, + "loss": 23.0, + "step": 3229 + }, + { + "epoch": 1.9387755102040818, + "grad_norm": 0.0065404558554291725, + "learning_rate": 0.00018227774389501967, + "loss": 23.0, + "step": 3230 + }, + { + "epoch": 1.9393757503001199, + "grad_norm": 0.0011474026832729578, + "learning_rate": 0.00018226700540229976, + "loss": 23.0, + "step": 3231 + }, + { + "epoch": 1.9399759903961584, + "grad_norm": 0.00047521217493340373, + "learning_rate": 0.0001822562639736821, + "loss": 23.0, + "step": 3232 + }, + { + "epoch": 1.940576230492197, + "grad_norm": 0.0019478441681712866, + "learning_rate": 0.00018224551960954995, + "loss": 23.0, + "step": 3233 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 0.0010433909483253956, + "learning_rate": 0.00018223477231028677, + "loss": 23.0, + "step": 3234 + }, + { + "epoch": 1.9417767106842736, + "grad_norm": 0.0018290705047547817, + "learning_rate": 0.00018222402207627606, + "loss": 23.0, + "step": 3235 + }, + { + "epoch": 1.9423769507803121, + "grad_norm": 0.0017392985755577683, + "learning_rate": 0.00018221326890790153, + "loss": 23.0, + "step": 3236 + }, + { + "epoch": 1.9429771908763507, + "grad_norm": 0.002610771218314767, + "learning_rate": 0.00018220251280554696, + "loss": 23.0, + "step": 3237 + }, + { + "epoch": 1.943577430972389, + "grad_norm": 0.0016300787683576345, + "learning_rate": 0.00018219175376959606, + "loss": 23.0, + "step": 3238 + }, + { + "epoch": 1.9441776710684273, + "grad_norm": 0.0022878015879541636, + "learning_rate": 0.00018218099180043295, + "loss": 23.0, + "step": 3239 + }, + { + "epoch": 1.9447779111644659, + "grad_norm": 0.0036289305426180363, + "learning_rate": 0.00018217022689844163, + "loss": 23.0, + "step": 3240 + }, + { + "epoch": 1.9453781512605042, + "grad_norm": 0.0009557732264511287, + "learning_rate": 0.00018215945906400628, + "loss": 23.0, + "step": 3241 + }, + { + "epoch": 1.9459783913565425, + "grad_norm": 0.0023918780498206615, + "learning_rate": 0.0001821486882975112, + "loss": 23.0, + "step": 3242 + }, + { + "epoch": 1.946578631452581, + "grad_norm": 0.001407388597726822, + "learning_rate": 0.0001821379145993407, + "loss": 23.0, + "step": 3243 + }, + { + "epoch": 1.9471788715486196, + "grad_norm": 0.0011877256911247969, + "learning_rate": 0.00018212713796987936, + "loss": 23.0, + "step": 3244 + }, + { + "epoch": 1.947779111644658, + "grad_norm": 0.001021384377963841, + "learning_rate": 0.0001821163584095117, + "loss": 23.0, + "step": 3245 + }, + { + "epoch": 1.9483793517406962, + "grad_norm": 0.0026115647051483393, + "learning_rate": 0.0001821055759186224, + "loss": 23.0, + "step": 3246 + }, + { + "epoch": 1.9489795918367347, + "grad_norm": 0.001517315162345767, + "learning_rate": 0.00018209479049759636, + "loss": 23.0, + "step": 3247 + }, + { + "epoch": 1.949579831932773, + "grad_norm": 0.003172236494719982, + "learning_rate": 0.00018208400214681837, + "loss": 23.0, + "step": 3248 + }, + { + "epoch": 1.9501800720288114, + "grad_norm": 0.001906992169097066, + "learning_rate": 0.00018207321086667353, + "loss": 23.0, + "step": 3249 + }, + { + "epoch": 1.95078031212485, + "grad_norm": 0.003104976611211896, + "learning_rate": 0.00018206241665754688, + "loss": 23.0, + "step": 3250 + }, + { + "epoch": 1.9513805522208885, + "grad_norm": 0.0015054944669827819, + "learning_rate": 0.00018205161951982367, + "loss": 23.0, + "step": 3251 + }, + { + "epoch": 1.9519807923169268, + "grad_norm": 0.0020006780978292227, + "learning_rate": 0.00018204081945388922, + "loss": 23.0, + "step": 3252 + }, + { + "epoch": 1.952581032412965, + "grad_norm": 0.0018055624095723033, + "learning_rate": 0.00018203001646012899, + "loss": 23.0, + "step": 3253 + }, + { + "epoch": 1.9531812725090036, + "grad_norm": 0.0017827142728492618, + "learning_rate": 0.00018201921053892845, + "loss": 23.0, + "step": 3254 + }, + { + "epoch": 1.9537815126050422, + "grad_norm": 0.0020725359208881855, + "learning_rate": 0.0001820084016906733, + "loss": 23.0, + "step": 3255 + }, + { + "epoch": 1.9543817527010803, + "grad_norm": 0.001384815899655223, + "learning_rate": 0.0001819975899157492, + "loss": 23.0, + "step": 3256 + }, + { + "epoch": 1.9549819927971188, + "grad_norm": 0.002811410464346409, + "learning_rate": 0.00018198677521454206, + "loss": 23.0, + "step": 3257 + }, + { + "epoch": 1.9555822328931574, + "grad_norm": 0.0008216790156438947, + "learning_rate": 0.0001819759575874378, + "loss": 23.0, + "step": 3258 + }, + { + "epoch": 1.9561824729891957, + "grad_norm": 0.0014283070340752602, + "learning_rate": 0.00018196513703482248, + "loss": 23.0, + "step": 3259 + }, + { + "epoch": 1.956782713085234, + "grad_norm": 0.0016851375112310052, + "learning_rate": 0.00018195431355708225, + "loss": 23.0, + "step": 3260 + }, + { + "epoch": 1.9573829531812725, + "grad_norm": 0.0019182129763066769, + "learning_rate": 0.0001819434871546034, + "loss": 23.0, + "step": 3261 + }, + { + "epoch": 1.957983193277311, + "grad_norm": 0.0008913344354368746, + "learning_rate": 0.00018193265782777228, + "loss": 23.0, + "step": 3262 + }, + { + "epoch": 1.9585834333733494, + "grad_norm": 0.0013015334261581302, + "learning_rate": 0.00018192182557697534, + "loss": 23.0, + "step": 3263 + }, + { + "epoch": 1.9591836734693877, + "grad_norm": 0.0009710061131045222, + "learning_rate": 0.00018191099040259918, + "loss": 23.0, + "step": 3264 + }, + { + "epoch": 1.9597839135654262, + "grad_norm": 0.0013321270234882832, + "learning_rate": 0.0001819001523050305, + "loss": 23.0, + "step": 3265 + }, + { + "epoch": 1.9603841536614646, + "grad_norm": 0.004230269230902195, + "learning_rate": 0.00018188931128465603, + "loss": 23.0, + "step": 3266 + }, + { + "epoch": 1.9609843937575029, + "grad_norm": 0.0016738398699089885, + "learning_rate": 0.00018187846734186267, + "loss": 23.0, + "step": 3267 + }, + { + "epoch": 1.9615846338535414, + "grad_norm": 0.0009718580404296517, + "learning_rate": 0.00018186762047703745, + "loss": 23.0, + "step": 3268 + }, + { + "epoch": 1.96218487394958, + "grad_norm": 0.0022842460311949253, + "learning_rate": 0.00018185677069056742, + "loss": 23.0, + "step": 3269 + }, + { + "epoch": 1.9627851140456183, + "grad_norm": 0.0014329022960737348, + "learning_rate": 0.00018184591798283984, + "loss": 23.0, + "step": 3270 + }, + { + "epoch": 1.9633853541416566, + "grad_norm": 0.0033265934325754642, + "learning_rate": 0.00018183506235424198, + "loss": 23.0, + "step": 3271 + }, + { + "epoch": 1.9639855942376951, + "grad_norm": 0.0007821978651918471, + "learning_rate": 0.0001818242038051612, + "loss": 23.0, + "step": 3272 + }, + { + "epoch": 1.9645858343337335, + "grad_norm": 0.0015688884304836392, + "learning_rate": 0.00018181334233598507, + "loss": 23.0, + "step": 3273 + }, + { + "epoch": 1.9651860744297718, + "grad_norm": 0.0008193565299734473, + "learning_rate": 0.00018180247794710125, + "loss": 23.0, + "step": 3274 + }, + { + "epoch": 1.9657863145258103, + "grad_norm": 0.002213628264144063, + "learning_rate": 0.0001817916106388974, + "loss": 23.0, + "step": 3275 + }, + { + "epoch": 1.9663865546218489, + "grad_norm": 0.0014024798292666674, + "learning_rate": 0.00018178074041176135, + "loss": 23.0, + "step": 3276 + }, + { + "epoch": 1.9669867947178872, + "grad_norm": 0.0037618144415318966, + "learning_rate": 0.00018176986726608103, + "loss": 23.0, + "step": 3277 + }, + { + "epoch": 1.9675870348139255, + "grad_norm": 0.0024038751143962145, + "learning_rate": 0.00018175899120224449, + "loss": 23.0, + "step": 3278 + }, + { + "epoch": 1.968187274909964, + "grad_norm": 0.0033458671532571316, + "learning_rate": 0.00018174811222063987, + "loss": 23.0, + "step": 3279 + }, + { + "epoch": 1.9687875150060024, + "grad_norm": 0.0019863531924784184, + "learning_rate": 0.0001817372303216554, + "loss": 23.0, + "step": 3280 + }, + { + "epoch": 1.9693877551020407, + "grad_norm": 0.0015641896752640605, + "learning_rate": 0.0001817263455056794, + "loss": 23.0, + "step": 3281 + }, + { + "epoch": 1.9699879951980792, + "grad_norm": 0.0012357387458905578, + "learning_rate": 0.00018171545777310042, + "loss": 23.0, + "step": 3282 + }, + { + "epoch": 1.9705882352941178, + "grad_norm": 0.002113505732268095, + "learning_rate": 0.00018170456712430691, + "loss": 23.0, + "step": 3283 + }, + { + "epoch": 1.971188475390156, + "grad_norm": 0.002042030915617943, + "learning_rate": 0.0001816936735596876, + "loss": 23.0, + "step": 3284 + }, + { + "epoch": 1.9717887154861944, + "grad_norm": 0.0012755439383909106, + "learning_rate": 0.00018168277707963118, + "loss": 23.0, + "step": 3285 + }, + { + "epoch": 1.972388955582233, + "grad_norm": 0.0019094778690487146, + "learning_rate": 0.0001816718776845266, + "loss": 23.0, + "step": 3286 + }, + { + "epoch": 1.9729891956782715, + "grad_norm": 0.00413705175742507, + "learning_rate": 0.00018166097537476277, + "loss": 23.0, + "step": 3287 + }, + { + "epoch": 1.9735894357743096, + "grad_norm": 0.0020690206438302994, + "learning_rate": 0.00018165007015072876, + "loss": 23.0, + "step": 3288 + }, + { + "epoch": 1.974189675870348, + "grad_norm": 0.00169268692843616, + "learning_rate": 0.00018163916201281382, + "loss": 23.0, + "step": 3289 + }, + { + "epoch": 1.9747899159663866, + "grad_norm": 0.001092193415388465, + "learning_rate": 0.00018162825096140718, + "loss": 23.0, + "step": 3290 + }, + { + "epoch": 1.975390156062425, + "grad_norm": 0.0015564619097858667, + "learning_rate": 0.0001816173369968982, + "loss": 23.0, + "step": 3291 + }, + { + "epoch": 1.9759903961584633, + "grad_norm": 0.0038381817284971476, + "learning_rate": 0.00018160642011967644, + "loss": 23.0, + "step": 3292 + }, + { + "epoch": 1.9765906362545018, + "grad_norm": 0.0013099567731842399, + "learning_rate": 0.00018159550033013144, + "loss": 23.0, + "step": 3293 + }, + { + "epoch": 1.9771908763505404, + "grad_norm": 0.00048246965161524713, + "learning_rate": 0.00018158457762865296, + "loss": 23.0, + "step": 3294 + }, + { + "epoch": 1.9777911164465787, + "grad_norm": 0.0024292506277561188, + "learning_rate": 0.00018157365201563073, + "loss": 23.0, + "step": 3295 + }, + { + "epoch": 1.978391356542617, + "grad_norm": 0.0020980066619813442, + "learning_rate": 0.0001815627234914547, + "loss": 23.0, + "step": 3296 + }, + { + "epoch": 1.9789915966386555, + "grad_norm": 0.0009206788381561637, + "learning_rate": 0.00018155179205651487, + "loss": 23.0, + "step": 3297 + }, + { + "epoch": 1.9795918367346939, + "grad_norm": 0.0014182684244588017, + "learning_rate": 0.00018154085771120138, + "loss": 23.0, + "step": 3298 + }, + { + "epoch": 1.9801920768307322, + "grad_norm": 0.0010655620135366917, + "learning_rate": 0.0001815299204559044, + "loss": 23.0, + "step": 3299 + }, + { + "epoch": 1.9807923169267707, + "grad_norm": 0.0018564078491181135, + "learning_rate": 0.0001815189802910143, + "loss": 23.0, + "step": 3300 + }, + { + "epoch": 1.9813925570228093, + "grad_norm": 0.0016347630880773067, + "learning_rate": 0.00018150803721692148, + "loss": 23.0, + "step": 3301 + }, + { + "epoch": 1.9819927971188476, + "grad_norm": 0.0021132517140358686, + "learning_rate": 0.00018149709123401646, + "loss": 23.0, + "step": 3302 + }, + { + "epoch": 1.982593037214886, + "grad_norm": 0.002489116508513689, + "learning_rate": 0.0001814861423426899, + "loss": 23.0, + "step": 3303 + }, + { + "epoch": 1.9831932773109244, + "grad_norm": 0.0014195957919582725, + "learning_rate": 0.0001814751905433325, + "loss": 23.0, + "step": 3304 + }, + { + "epoch": 1.9837935174069627, + "grad_norm": 0.0011736804153770208, + "learning_rate": 0.00018146423583633518, + "loss": 23.0, + "step": 3305 + }, + { + "epoch": 1.984393757503001, + "grad_norm": 0.002241331385448575, + "learning_rate": 0.00018145327822208883, + "loss": 23.0, + "step": 3306 + }, + { + "epoch": 1.9849939975990396, + "grad_norm": 0.0017535320948809385, + "learning_rate": 0.00018144231770098447, + "loss": 23.0, + "step": 3307 + }, + { + "epoch": 1.9855942376950781, + "grad_norm": 0.0021708435378968716, + "learning_rate": 0.00018143135427341328, + "loss": 23.0, + "step": 3308 + }, + { + "epoch": 1.9861944777911165, + "grad_norm": 0.0023300815373659134, + "learning_rate": 0.00018142038793976655, + "loss": 23.0, + "step": 3309 + }, + { + "epoch": 1.9867947178871548, + "grad_norm": 0.0014577904948964715, + "learning_rate": 0.0001814094187004356, + "loss": 23.0, + "step": 3310 + }, + { + "epoch": 1.9873949579831933, + "grad_norm": 0.0024585998617112637, + "learning_rate": 0.00018139844655581193, + "loss": 23.0, + "step": 3311 + }, + { + "epoch": 1.9879951980792316, + "grad_norm": 0.0038247143384069204, + "learning_rate": 0.00018138747150628706, + "loss": 23.0, + "step": 3312 + }, + { + "epoch": 1.98859543817527, + "grad_norm": 0.0011696047149598598, + "learning_rate": 0.0001813764935522527, + "loss": 23.0, + "step": 3313 + }, + { + "epoch": 1.9891956782713085, + "grad_norm": 0.0028556687757372856, + "learning_rate": 0.00018136551269410062, + "loss": 23.0, + "step": 3314 + }, + { + "epoch": 1.989795918367347, + "grad_norm": 0.0023186022881418467, + "learning_rate": 0.00018135452893222262, + "loss": 23.0, + "step": 3315 + }, + { + "epoch": 1.9903961584633854, + "grad_norm": 0.005861006677150726, + "learning_rate": 0.0001813435422670108, + "loss": 23.0, + "step": 3316 + }, + { + "epoch": 1.9909963985594237, + "grad_norm": 0.0006705807754769921, + "learning_rate": 0.00018133255269885723, + "loss": 23.0, + "step": 3317 + }, + { + "epoch": 1.9915966386554622, + "grad_norm": 0.0016647104639559984, + "learning_rate": 0.00018132156022815404, + "loss": 23.0, + "step": 3318 + }, + { + "epoch": 1.9921968787515008, + "grad_norm": 0.0013882304774597287, + "learning_rate": 0.00018131056485529352, + "loss": 23.0, + "step": 3319 + }, + { + "epoch": 1.9927971188475389, + "grad_norm": 0.0024001754354685545, + "learning_rate": 0.00018129956658066812, + "loss": 23.0, + "step": 3320 + }, + { + "epoch": 1.9933973589435774, + "grad_norm": 0.0018285515252500772, + "learning_rate": 0.0001812885654046703, + "loss": 23.0, + "step": 3321 + }, + { + "epoch": 1.993997599039616, + "grad_norm": 0.0015243416419252753, + "learning_rate": 0.00018127756132769272, + "loss": 23.0, + "step": 3322 + }, + { + "epoch": 1.9945978391356542, + "grad_norm": 0.005187567323446274, + "learning_rate": 0.00018126655435012803, + "loss": 23.0, + "step": 3323 + }, + { + "epoch": 1.9951980792316926, + "grad_norm": 0.0017248830990865827, + "learning_rate": 0.00018125554447236906, + "loss": 23.0, + "step": 3324 + }, + { + "epoch": 1.995798319327731, + "grad_norm": 0.0024234713055193424, + "learning_rate": 0.00018124453169480868, + "loss": 23.0, + "step": 3325 + }, + { + "epoch": 1.9963985594237696, + "grad_norm": 0.0025638325605541468, + "learning_rate": 0.00018123351601784, + "loss": 23.0, + "step": 3326 + }, + { + "epoch": 1.996998799519808, + "grad_norm": 0.002679846016690135, + "learning_rate": 0.00018122249744185605, + "loss": 23.0, + "step": 3327 + }, + { + "epoch": 1.9975990396158463, + "grad_norm": 0.003047887934371829, + "learning_rate": 0.00018121147596725015, + "loss": 23.0, + "step": 3328 + }, + { + "epoch": 1.9981992797118848, + "grad_norm": 0.0017722995253279805, + "learning_rate": 0.00018120045159441555, + "loss": 23.0, + "step": 3329 + }, + { + "epoch": 1.9987995198079231, + "grad_norm": 0.0012083728797733784, + "learning_rate": 0.00018118942432374569, + "loss": 23.0, + "step": 3330 + }, + { + "epoch": 1.9993997599039615, + "grad_norm": 0.0022785505279898643, + "learning_rate": 0.00018117839415563412, + "loss": 23.0, + "step": 3331 + }, + { + "epoch": 2.0, + "grad_norm": 0.0014194271061569452, + "learning_rate": 0.0001811673610904745, + "loss": 23.0, + "step": 3332 + }, + { + "epoch": 2.0006002400960385, + "grad_norm": 0.0032136687077581882, + "learning_rate": 0.0001811563251286605, + "loss": 23.0, + "step": 3333 + }, + { + "epoch": 2.0012004801920766, + "grad_norm": 0.00217724172398448, + "learning_rate": 0.00018114528627058605, + "loss": 23.0, + "step": 3334 + }, + { + "epoch": 2.001800720288115, + "grad_norm": 0.0017727838130667806, + "learning_rate": 0.00018113424451664508, + "loss": 23.0, + "step": 3335 + }, + { + "epoch": 2.0024009603841537, + "grad_norm": 0.00423310836777091, + "learning_rate": 0.00018112319986723162, + "loss": 23.0, + "step": 3336 + }, + { + "epoch": 2.0030012004801923, + "grad_norm": 0.0028058600146323442, + "learning_rate": 0.00018111215232273983, + "loss": 23.0, + "step": 3337 + }, + { + "epoch": 2.0036014405762304, + "grad_norm": 0.0015503312461078167, + "learning_rate": 0.00018110110188356395, + "loss": 23.0, + "step": 3338 + }, + { + "epoch": 2.004201680672269, + "grad_norm": 0.0017939833924174309, + "learning_rate": 0.00018109004855009837, + "loss": 23.0, + "step": 3339 + }, + { + "epoch": 2.0048019207683074, + "grad_norm": 0.0033577047288417816, + "learning_rate": 0.00018107899232273756, + "loss": 23.0, + "step": 3340 + }, + { + "epoch": 2.0054021608643455, + "grad_norm": 0.0008726062951609492, + "learning_rate": 0.00018106793320187605, + "loss": 23.0, + "step": 3341 + }, + { + "epoch": 2.006002400960384, + "grad_norm": 0.0011513312347233295, + "learning_rate": 0.00018105687118790854, + "loss": 23.0, + "step": 3342 + }, + { + "epoch": 2.0066026410564226, + "grad_norm": 0.001895427005365491, + "learning_rate": 0.00018104580628122984, + "loss": 23.0, + "step": 3343 + }, + { + "epoch": 2.007202881152461, + "grad_norm": 0.0009158776374533772, + "learning_rate": 0.0001810347384822347, + "loss": 23.0, + "step": 3344 + }, + { + "epoch": 2.0078031212484992, + "grad_norm": 0.0008634175173938274, + "learning_rate": 0.00018102366779131827, + "loss": 23.0, + "step": 3345 + }, + { + "epoch": 2.008403361344538, + "grad_norm": 0.0019585357513278723, + "learning_rate": 0.00018101259420887553, + "loss": 23.0, + "step": 3346 + }, + { + "epoch": 2.0090036014405763, + "grad_norm": 0.005161345470696688, + "learning_rate": 0.00018100151773530168, + "loss": 23.0, + "step": 3347 + }, + { + "epoch": 2.009603841536615, + "grad_norm": 0.003942587878555059, + "learning_rate": 0.00018099043837099203, + "loss": 23.0, + "step": 3348 + }, + { + "epoch": 2.010204081632653, + "grad_norm": 0.0013700701529160142, + "learning_rate": 0.00018097935611634198, + "loss": 23.0, + "step": 3349 + }, + { + "epoch": 2.0108043217286915, + "grad_norm": 0.0007138294167816639, + "learning_rate": 0.000180968270971747, + "loss": 23.0, + "step": 3350 + }, + { + "epoch": 2.01140456182473, + "grad_norm": 0.005530905444175005, + "learning_rate": 0.0001809571829376027, + "loss": 23.0, + "step": 3351 + }, + { + "epoch": 2.012004801920768, + "grad_norm": 0.0032923435792326927, + "learning_rate": 0.0001809460920143048, + "loss": 23.0, + "step": 3352 + }, + { + "epoch": 2.0126050420168067, + "grad_norm": 0.0005104127340018749, + "learning_rate": 0.0001809349982022491, + "loss": 23.0, + "step": 3353 + }, + { + "epoch": 2.013205282112845, + "grad_norm": 0.001574002206325531, + "learning_rate": 0.0001809239015018315, + "loss": 23.0, + "step": 3354 + }, + { + "epoch": 2.0138055222088838, + "grad_norm": 0.002212009159848094, + "learning_rate": 0.000180912801913448, + "loss": 23.0, + "step": 3355 + }, + { + "epoch": 2.014405762304922, + "grad_norm": 0.0011559105478227139, + "learning_rate": 0.00018090169943749476, + "loss": 23.0, + "step": 3356 + }, + { + "epoch": 2.0150060024009604, + "grad_norm": 0.0033464916050434113, + "learning_rate": 0.00018089059407436794, + "loss": 23.0, + "step": 3357 + }, + { + "epoch": 2.015606242496999, + "grad_norm": 0.0011934455251321197, + "learning_rate": 0.0001808794858244639, + "loss": 23.0, + "step": 3358 + }, + { + "epoch": 2.016206482593037, + "grad_norm": 0.0006701741367578506, + "learning_rate": 0.00018086837468817907, + "loss": 23.0, + "step": 3359 + }, + { + "epoch": 2.0168067226890756, + "grad_norm": 0.0014794671442359686, + "learning_rate": 0.00018085726066590998, + "loss": 23.0, + "step": 3360 + }, + { + "epoch": 2.017406962785114, + "grad_norm": 0.00047398850438185036, + "learning_rate": 0.00018084614375805323, + "loss": 23.0, + "step": 3361 + }, + { + "epoch": 2.0180072028811527, + "grad_norm": 0.0023361060302704573, + "learning_rate": 0.00018083502396500556, + "loss": 23.0, + "step": 3362 + }, + { + "epoch": 2.0186074429771907, + "grad_norm": 0.0012610971461981535, + "learning_rate": 0.0001808239012871638, + "loss": 23.0, + "step": 3363 + }, + { + "epoch": 2.0192076830732293, + "grad_norm": 0.004134600982069969, + "learning_rate": 0.00018081277572492494, + "loss": 23.0, + "step": 3364 + }, + { + "epoch": 2.019807923169268, + "grad_norm": 0.000841438421048224, + "learning_rate": 0.000180801647278686, + "loss": 23.0, + "step": 3365 + }, + { + "epoch": 2.020408163265306, + "grad_norm": 0.002190615748986602, + "learning_rate": 0.0001807905159488441, + "loss": 23.0, + "step": 3366 + }, + { + "epoch": 2.0210084033613445, + "grad_norm": 0.0028436193242669106, + "learning_rate": 0.00018077938173579648, + "loss": 23.0, + "step": 3367 + }, + { + "epoch": 2.021608643457383, + "grad_norm": 0.003500116989016533, + "learning_rate": 0.00018076824463994056, + "loss": 23.0, + "step": 3368 + }, + { + "epoch": 2.0222088835534215, + "grad_norm": 0.0026991087943315506, + "learning_rate": 0.0001807571046616737, + "loss": 23.0, + "step": 3369 + }, + { + "epoch": 2.0228091236494596, + "grad_norm": 0.0010631488403305411, + "learning_rate": 0.00018074596180139353, + "loss": 23.0, + "step": 3370 + }, + { + "epoch": 2.023409363745498, + "grad_norm": 0.0020518836099654436, + "learning_rate": 0.00018073481605949772, + "loss": 23.0, + "step": 3371 + }, + { + "epoch": 2.0240096038415367, + "grad_norm": 0.006052622105926275, + "learning_rate": 0.00018072366743638395, + "loss": 23.0, + "step": 3372 + }, + { + "epoch": 2.024609843937575, + "grad_norm": 0.0021911882795393467, + "learning_rate": 0.00018071251593245016, + "loss": 23.0, + "step": 3373 + }, + { + "epoch": 2.0252100840336134, + "grad_norm": 0.0032586664892733097, + "learning_rate": 0.00018070136154809427, + "loss": 23.0, + "step": 3374 + }, + { + "epoch": 2.025810324129652, + "grad_norm": 0.000785050040576607, + "learning_rate": 0.0001806902042837144, + "loss": 23.0, + "step": 3375 + }, + { + "epoch": 2.0264105642256904, + "grad_norm": 0.0007825555512681603, + "learning_rate": 0.00018067904413970868, + "loss": 23.0, + "step": 3376 + }, + { + "epoch": 2.0270108043217285, + "grad_norm": 0.003356239991262555, + "learning_rate": 0.00018066788111647543, + "loss": 23.0, + "step": 3377 + }, + { + "epoch": 2.027611044417767, + "grad_norm": 0.0014766575768589973, + "learning_rate": 0.00018065671521441302, + "loss": 23.0, + "step": 3378 + }, + { + "epoch": 2.0282112845138056, + "grad_norm": 0.0009158303728327155, + "learning_rate": 0.00018064554643391987, + "loss": 23.0, + "step": 3379 + }, + { + "epoch": 2.028811524609844, + "grad_norm": 0.0009348996100015938, + "learning_rate": 0.00018063437477539466, + "loss": 23.0, + "step": 3380 + }, + { + "epoch": 2.0294117647058822, + "grad_norm": 0.0008702090126462281, + "learning_rate": 0.00018062320023923598, + "loss": 23.0, + "step": 3381 + }, + { + "epoch": 2.030012004801921, + "grad_norm": 0.002282114699482918, + "learning_rate": 0.0001806120228258427, + "loss": 23.0, + "step": 3382 + }, + { + "epoch": 2.0306122448979593, + "grad_norm": 0.005010796710848808, + "learning_rate": 0.00018060084253561368, + "loss": 23.0, + "step": 3383 + }, + { + "epoch": 2.0312124849939974, + "grad_norm": 0.0017042094841599464, + "learning_rate": 0.0001805896593689479, + "loss": 23.0, + "step": 3384 + }, + { + "epoch": 2.031812725090036, + "grad_norm": 0.0037521664053201675, + "learning_rate": 0.00018057847332624452, + "loss": 23.0, + "step": 3385 + }, + { + "epoch": 2.0324129651860745, + "grad_norm": 0.0015214249724522233, + "learning_rate": 0.00018056728440790266, + "loss": 23.0, + "step": 3386 + }, + { + "epoch": 2.033013205282113, + "grad_norm": 0.0004935787292197347, + "learning_rate": 0.00018055609261432169, + "loss": 23.0, + "step": 3387 + }, + { + "epoch": 2.033613445378151, + "grad_norm": 0.003463397966697812, + "learning_rate": 0.00018054489794590097, + "loss": 23.0, + "step": 3388 + }, + { + "epoch": 2.0342136854741897, + "grad_norm": 0.002584519563242793, + "learning_rate": 0.00018053370040304004, + "loss": 23.0, + "step": 3389 + }, + { + "epoch": 2.034813925570228, + "grad_norm": 0.0012157384771853685, + "learning_rate": 0.0001805224999861385, + "loss": 23.0, + "step": 3390 + }, + { + "epoch": 2.0354141656662663, + "grad_norm": 0.0029177102260291576, + "learning_rate": 0.00018051129669559605, + "loss": 23.0, + "step": 3391 + }, + { + "epoch": 2.036014405762305, + "grad_norm": 0.0015428938204422593, + "learning_rate": 0.00018050009053181257, + "loss": 23.0, + "step": 3392 + }, + { + "epoch": 2.0366146458583434, + "grad_norm": 0.0007340401643887162, + "learning_rate": 0.00018048888149518788, + "loss": 23.0, + "step": 3393 + }, + { + "epoch": 2.037214885954382, + "grad_norm": 0.0011957536917179823, + "learning_rate": 0.00018047766958612208, + "loss": 23.0, + "step": 3394 + }, + { + "epoch": 2.03781512605042, + "grad_norm": 0.0017125644953921437, + "learning_rate": 0.00018046645480501524, + "loss": 23.0, + "step": 3395 + }, + { + "epoch": 2.0384153661464586, + "grad_norm": 0.0023938282392919064, + "learning_rate": 0.0001804552371522676, + "loss": 23.0, + "step": 3396 + }, + { + "epoch": 2.039015606242497, + "grad_norm": 0.0033557454589754343, + "learning_rate": 0.00018044401662827955, + "loss": 23.0, + "step": 3397 + }, + { + "epoch": 2.039615846338535, + "grad_norm": 0.0025200413074344397, + "learning_rate": 0.00018043279323345143, + "loss": 23.0, + "step": 3398 + }, + { + "epoch": 2.0402160864345738, + "grad_norm": 0.0025761276483535767, + "learning_rate": 0.00018042156696818385, + "loss": 23.0, + "step": 3399 + }, + { + "epoch": 2.0408163265306123, + "grad_norm": 0.0009423838928341866, + "learning_rate": 0.00018041033783287738, + "loss": 23.0, + "step": 3400 + }, + { + "epoch": 2.041416566626651, + "grad_norm": 0.004442538134753704, + "learning_rate": 0.00018039910582793282, + "loss": 23.0, + "step": 3401 + }, + { + "epoch": 2.042016806722689, + "grad_norm": 0.002732986817136407, + "learning_rate": 0.000180387870953751, + "loss": 23.0, + "step": 3402 + }, + { + "epoch": 2.0426170468187275, + "grad_norm": 0.003854157403111458, + "learning_rate": 0.0001803766332107328, + "loss": 23.0, + "step": 3403 + }, + { + "epoch": 2.043217286914766, + "grad_norm": 0.00363258458673954, + "learning_rate": 0.00018036539259927932, + "loss": 23.0, + "step": 3404 + }, + { + "epoch": 2.0438175270108045, + "grad_norm": 0.004020926542580128, + "learning_rate": 0.00018035414911979177, + "loss": 23.0, + "step": 3405 + }, + { + "epoch": 2.0444177671068426, + "grad_norm": 0.0015982519835233688, + "learning_rate": 0.00018034290277267126, + "loss": 23.0, + "step": 3406 + }, + { + "epoch": 2.045018007202881, + "grad_norm": 0.0038941625971347094, + "learning_rate": 0.00018033165355831924, + "loss": 23.0, + "step": 3407 + }, + { + "epoch": 2.0456182472989197, + "grad_norm": 0.0026739637833088636, + "learning_rate": 0.00018032040147713716, + "loss": 23.0, + "step": 3408 + }, + { + "epoch": 2.046218487394958, + "grad_norm": 0.0016816748538985848, + "learning_rate": 0.00018030914652952652, + "loss": 23.0, + "step": 3409 + }, + { + "epoch": 2.0468187274909964, + "grad_norm": 0.001945959753356874, + "learning_rate": 0.00018029788871588905, + "loss": 23.0, + "step": 3410 + }, + { + "epoch": 2.047418967587035, + "grad_norm": 0.002473194617778063, + "learning_rate": 0.0001802866280366265, + "loss": 23.0, + "step": 3411 + }, + { + "epoch": 2.0480192076830734, + "grad_norm": 0.003011916531249881, + "learning_rate": 0.00018027536449214066, + "loss": 23.0, + "step": 3412 + }, + { + "epoch": 2.0486194477791115, + "grad_norm": 0.0023058606311678886, + "learning_rate": 0.0001802640980828336, + "loss": 23.0, + "step": 3413 + }, + { + "epoch": 2.04921968787515, + "grad_norm": 0.0021154494024813175, + "learning_rate": 0.00018025282880910733, + "loss": 23.0, + "step": 3414 + }, + { + "epoch": 2.0498199279711886, + "grad_norm": 0.000943707418628037, + "learning_rate": 0.00018024155667136405, + "loss": 23.0, + "step": 3415 + }, + { + "epoch": 2.0504201680672267, + "grad_norm": 0.0005733675789088011, + "learning_rate": 0.000180230281670006, + "loss": 23.0, + "step": 3416 + }, + { + "epoch": 2.0510204081632653, + "grad_norm": 0.0012774687493219972, + "learning_rate": 0.00018021900380543558, + "loss": 23.0, + "step": 3417 + }, + { + "epoch": 2.051620648259304, + "grad_norm": 0.0015681550139561296, + "learning_rate": 0.00018020772307805524, + "loss": 23.0, + "step": 3418 + }, + { + "epoch": 2.0522208883553423, + "grad_norm": 0.002922586165368557, + "learning_rate": 0.00018019643948826762, + "loss": 23.0, + "step": 3419 + }, + { + "epoch": 2.0528211284513804, + "grad_norm": 0.000941049656830728, + "learning_rate": 0.00018018515303647533, + "loss": 23.0, + "step": 3420 + }, + { + "epoch": 2.053421368547419, + "grad_norm": 0.0013102737721055746, + "learning_rate": 0.0001801738637230812, + "loss": 23.0, + "step": 3421 + }, + { + "epoch": 2.0540216086434575, + "grad_norm": 0.0010620380053296685, + "learning_rate": 0.00018016257154848813, + "loss": 23.0, + "step": 3422 + }, + { + "epoch": 2.0546218487394956, + "grad_norm": 0.001468815840780735, + "learning_rate": 0.00018015127651309907, + "loss": 23.0, + "step": 3423 + }, + { + "epoch": 2.055222088835534, + "grad_norm": 0.002112776506692171, + "learning_rate": 0.0001801399786173171, + "loss": 23.0, + "step": 3424 + }, + { + "epoch": 2.0558223289315727, + "grad_norm": 0.0043845935724675655, + "learning_rate": 0.00018012867786154545, + "loss": 23.0, + "step": 3425 + }, + { + "epoch": 2.0564225690276112, + "grad_norm": 0.0023064163979142904, + "learning_rate": 0.0001801173742461874, + "loss": 23.0, + "step": 3426 + }, + { + "epoch": 2.0570228091236493, + "grad_norm": 0.0015078387223184109, + "learning_rate": 0.0001801060677716464, + "loss": 23.0, + "step": 3427 + }, + { + "epoch": 2.057623049219688, + "grad_norm": 0.0014458864461630583, + "learning_rate": 0.00018009475843832587, + "loss": 23.0, + "step": 3428 + }, + { + "epoch": 2.0582232893157264, + "grad_norm": 0.0032603242434561253, + "learning_rate": 0.00018008344624662943, + "loss": 23.0, + "step": 3429 + }, + { + "epoch": 2.0588235294117645, + "grad_norm": 0.0012629261473193765, + "learning_rate": 0.00018007213119696077, + "loss": 23.0, + "step": 3430 + }, + { + "epoch": 2.059423769507803, + "grad_norm": 0.0016698999097570777, + "learning_rate": 0.00018006081328972374, + "loss": 23.0, + "step": 3431 + }, + { + "epoch": 2.0600240096038416, + "grad_norm": 0.001395948464050889, + "learning_rate": 0.00018004949252532223, + "loss": 23.0, + "step": 3432 + }, + { + "epoch": 2.06062424969988, + "grad_norm": 0.004593064077198505, + "learning_rate": 0.00018003816890416023, + "loss": 23.0, + "step": 3433 + }, + { + "epoch": 2.061224489795918, + "grad_norm": 0.0021219372283667326, + "learning_rate": 0.00018002684242664186, + "loss": 23.0, + "step": 3434 + }, + { + "epoch": 2.0618247298919568, + "grad_norm": 0.0006292082252912223, + "learning_rate": 0.00018001551309317136, + "loss": 23.0, + "step": 3435 + }, + { + "epoch": 2.0624249699879953, + "grad_norm": 0.0024436218664050102, + "learning_rate": 0.000180004180904153, + "loss": 23.0, + "step": 3436 + }, + { + "epoch": 2.0630252100840334, + "grad_norm": 0.0022101947106420994, + "learning_rate": 0.00017999284585999126, + "loss": 23.0, + "step": 3437 + }, + { + "epoch": 2.063625450180072, + "grad_norm": 0.0024126351345330477, + "learning_rate": 0.00017998150796109057, + "loss": 23.0, + "step": 3438 + }, + { + "epoch": 2.0642256902761105, + "grad_norm": 0.00167564966250211, + "learning_rate": 0.00017997016720785563, + "loss": 23.0, + "step": 3439 + }, + { + "epoch": 2.064825930372149, + "grad_norm": 0.001377441338263452, + "learning_rate": 0.0001799588236006911, + "loss": 23.0, + "step": 3440 + }, + { + "epoch": 2.065426170468187, + "grad_norm": 0.002041054656729102, + "learning_rate": 0.00017994747714000187, + "loss": 23.0, + "step": 3441 + }, + { + "epoch": 2.0660264105642256, + "grad_norm": 0.003891473403200507, + "learning_rate": 0.0001799361278261928, + "loss": 23.0, + "step": 3442 + }, + { + "epoch": 2.066626650660264, + "grad_norm": 0.0010193546768277884, + "learning_rate": 0.00017992477565966896, + "loss": 23.0, + "step": 3443 + }, + { + "epoch": 2.0672268907563027, + "grad_norm": 0.0011679233284667134, + "learning_rate": 0.00017991342064083547, + "loss": 23.0, + "step": 3444 + }, + { + "epoch": 2.067827130852341, + "grad_norm": 0.0009892381494864821, + "learning_rate": 0.00017990206277009756, + "loss": 23.0, + "step": 3445 + }, + { + "epoch": 2.0684273709483794, + "grad_norm": 0.0028641563840210438, + "learning_rate": 0.00017989070204786053, + "loss": 23.0, + "step": 3446 + }, + { + "epoch": 2.069027611044418, + "grad_norm": 0.002391669899225235, + "learning_rate": 0.0001798793384745299, + "loss": 23.0, + "step": 3447 + }, + { + "epoch": 2.069627851140456, + "grad_norm": 0.0008025117567740381, + "learning_rate": 0.0001798679720505111, + "loss": 23.0, + "step": 3448 + }, + { + "epoch": 2.0702280912364945, + "grad_norm": 0.0011154050007462502, + "learning_rate": 0.00017985660277620985, + "loss": 23.0, + "step": 3449 + }, + { + "epoch": 2.070828331332533, + "grad_norm": 0.001934365602210164, + "learning_rate": 0.00017984523065203188, + "loss": 23.0, + "step": 3450 + }, + { + "epoch": 2.0714285714285716, + "grad_norm": 0.0007151182508096099, + "learning_rate": 0.000179833855678383, + "loss": 23.0, + "step": 3451 + }, + { + "epoch": 2.0720288115246097, + "grad_norm": 0.001716459752060473, + "learning_rate": 0.00017982247785566918, + "loss": 23.0, + "step": 3452 + }, + { + "epoch": 2.0726290516206483, + "grad_norm": 0.0010225954465568066, + "learning_rate": 0.00017981109718429643, + "loss": 23.0, + "step": 3453 + }, + { + "epoch": 2.073229291716687, + "grad_norm": 0.0027839874383062124, + "learning_rate": 0.00017979971366467094, + "loss": 23.0, + "step": 3454 + }, + { + "epoch": 2.073829531812725, + "grad_norm": 0.0011828261194750667, + "learning_rate": 0.00017978832729719893, + "loss": 23.0, + "step": 3455 + }, + { + "epoch": 2.0744297719087634, + "grad_norm": 0.00159098906442523, + "learning_rate": 0.00017977693808228677, + "loss": 23.0, + "step": 3456 + }, + { + "epoch": 2.075030012004802, + "grad_norm": 0.0014060731045901775, + "learning_rate": 0.0001797655460203409, + "loss": 23.0, + "step": 3457 + }, + { + "epoch": 2.0756302521008405, + "grad_norm": 0.001484134467318654, + "learning_rate": 0.00017975415111176783, + "loss": 23.0, + "step": 3458 + }, + { + "epoch": 2.0762304921968786, + "grad_norm": 0.0017137357499450445, + "learning_rate": 0.0001797427533569743, + "loss": 23.0, + "step": 3459 + }, + { + "epoch": 2.076830732292917, + "grad_norm": 0.0030945942271500826, + "learning_rate": 0.000179731352756367, + "loss": 23.0, + "step": 3460 + }, + { + "epoch": 2.0774309723889557, + "grad_norm": 0.0011171615915372968, + "learning_rate": 0.00017971994931035283, + "loss": 23.0, + "step": 3461 + }, + { + "epoch": 2.078031212484994, + "grad_norm": 0.001267662737518549, + "learning_rate": 0.00017970854301933874, + "loss": 23.0, + "step": 3462 + }, + { + "epoch": 2.0786314525810323, + "grad_norm": 0.0021587491501122713, + "learning_rate": 0.0001796971338837318, + "loss": 23.0, + "step": 3463 + }, + { + "epoch": 2.079231692677071, + "grad_norm": 0.002123309997841716, + "learning_rate": 0.00017968572190393912, + "loss": 23.0, + "step": 3464 + }, + { + "epoch": 2.0798319327731094, + "grad_norm": 0.0012843715958297253, + "learning_rate": 0.000179674307080368, + "loss": 23.0, + "step": 3465 + }, + { + "epoch": 2.0804321728691475, + "grad_norm": 0.0021665177773684263, + "learning_rate": 0.00017966288941342583, + "loss": 23.0, + "step": 3466 + }, + { + "epoch": 2.081032412965186, + "grad_norm": 0.0019804276525974274, + "learning_rate": 0.00017965146890352006, + "loss": 23.0, + "step": 3467 + }, + { + "epoch": 2.0816326530612246, + "grad_norm": 0.0025594120379537344, + "learning_rate": 0.00017964004555105824, + "loss": 23.0, + "step": 3468 + }, + { + "epoch": 2.082232893157263, + "grad_norm": 0.002886942122131586, + "learning_rate": 0.00017962861935644805, + "loss": 23.0, + "step": 3469 + }, + { + "epoch": 2.082833133253301, + "grad_norm": 0.001857664086855948, + "learning_rate": 0.0001796171903200973, + "loss": 23.0, + "step": 3470 + }, + { + "epoch": 2.0834333733493398, + "grad_norm": 0.002572208410128951, + "learning_rate": 0.0001796057584424138, + "loss": 23.0, + "step": 3471 + }, + { + "epoch": 2.0840336134453783, + "grad_norm": 0.002026068978011608, + "learning_rate": 0.00017959432372380554, + "loss": 23.0, + "step": 3472 + }, + { + "epoch": 2.0846338535414164, + "grad_norm": 0.0016905996017158031, + "learning_rate": 0.0001795828861646806, + "loss": 23.0, + "step": 3473 + }, + { + "epoch": 2.085234093637455, + "grad_norm": 0.0011786259710788727, + "learning_rate": 0.00017957144576544718, + "loss": 23.0, + "step": 3474 + }, + { + "epoch": 2.0858343337334935, + "grad_norm": 0.0007971758022904396, + "learning_rate": 0.00017956000252651353, + "loss": 23.0, + "step": 3475 + }, + { + "epoch": 2.086434573829532, + "grad_norm": 0.0009448099881410599, + "learning_rate": 0.00017954855644828808, + "loss": 23.0, + "step": 3476 + }, + { + "epoch": 2.08703481392557, + "grad_norm": 0.002794134197756648, + "learning_rate": 0.00017953710753117923, + "loss": 23.0, + "step": 3477 + }, + { + "epoch": 2.0876350540216086, + "grad_norm": 0.0017918311059474945, + "learning_rate": 0.00017952565577559562, + "loss": 23.0, + "step": 3478 + }, + { + "epoch": 2.088235294117647, + "grad_norm": 0.0032223749440163374, + "learning_rate": 0.00017951420118194594, + "loss": 23.0, + "step": 3479 + }, + { + "epoch": 2.0888355342136853, + "grad_norm": 0.0009358105598948896, + "learning_rate": 0.00017950274375063893, + "loss": 23.0, + "step": 3480 + }, + { + "epoch": 2.089435774309724, + "grad_norm": 0.004243810195475817, + "learning_rate": 0.00017949128348208353, + "loss": 23.0, + "step": 3481 + }, + { + "epoch": 2.0900360144057624, + "grad_norm": 0.002074790420010686, + "learning_rate": 0.00017947982037668867, + "loss": 23.0, + "step": 3482 + }, + { + "epoch": 2.090636254501801, + "grad_norm": 0.0004455752205103636, + "learning_rate": 0.00017946835443486347, + "loss": 23.0, + "step": 3483 + }, + { + "epoch": 2.091236494597839, + "grad_norm": 0.005418194457888603, + "learning_rate": 0.00017945688565701714, + "loss": 23.0, + "step": 3484 + }, + { + "epoch": 2.0918367346938775, + "grad_norm": 0.0012525448109954596, + "learning_rate": 0.0001794454140435589, + "loss": 23.0, + "step": 3485 + }, + { + "epoch": 2.092436974789916, + "grad_norm": 0.0017173114465549588, + "learning_rate": 0.00017943393959489825, + "loss": 23.0, + "step": 3486 + }, + { + "epoch": 2.093037214885954, + "grad_norm": 0.0008452086476609111, + "learning_rate": 0.00017942246231144457, + "loss": 23.0, + "step": 3487 + }, + { + "epoch": 2.0936374549819927, + "grad_norm": 0.0011205211048945785, + "learning_rate": 0.00017941098219360755, + "loss": 23.0, + "step": 3488 + }, + { + "epoch": 2.0942376950780313, + "grad_norm": 0.0008696285076439381, + "learning_rate": 0.00017939949924179684, + "loss": 23.0, + "step": 3489 + }, + { + "epoch": 2.09483793517407, + "grad_norm": 0.0013399510644376278, + "learning_rate": 0.00017938801345642223, + "loss": 23.0, + "step": 3490 + }, + { + "epoch": 2.095438175270108, + "grad_norm": 0.0006139983888715506, + "learning_rate": 0.00017937652483789363, + "loss": 23.0, + "step": 3491 + }, + { + "epoch": 2.0960384153661464, + "grad_norm": 0.0014349283883348107, + "learning_rate": 0.00017936503338662104, + "loss": 23.0, + "step": 3492 + }, + { + "epoch": 2.096638655462185, + "grad_norm": 0.0010579722002148628, + "learning_rate": 0.00017935353910301455, + "loss": 23.0, + "step": 3493 + }, + { + "epoch": 2.097238895558223, + "grad_norm": 0.0010757379932329059, + "learning_rate": 0.0001793420419874844, + "loss": 23.0, + "step": 3494 + }, + { + "epoch": 2.0978391356542616, + "grad_norm": 0.0016692588105797768, + "learning_rate": 0.00017933054204044083, + "loss": 23.0, + "step": 3495 + }, + { + "epoch": 2.0984393757503, + "grad_norm": 0.002716292394325137, + "learning_rate": 0.00017931903926229428, + "loss": 23.0, + "step": 3496 + }, + { + "epoch": 2.0990396158463387, + "grad_norm": 0.0011049581225961447, + "learning_rate": 0.00017930753365345527, + "loss": 23.0, + "step": 3497 + }, + { + "epoch": 2.099639855942377, + "grad_norm": 0.001968407304957509, + "learning_rate": 0.0001792960252143344, + "loss": 23.0, + "step": 3498 + }, + { + "epoch": 2.1002400960384153, + "grad_norm": 0.0006715644267387688, + "learning_rate": 0.00017928451394534233, + "loss": 23.0, + "step": 3499 + }, + { + "epoch": 2.100840336134454, + "grad_norm": 0.0011009502923116088, + "learning_rate": 0.00017927299984688992, + "loss": 23.0, + "step": 3500 + }, + { + "epoch": 2.1014405762304924, + "grad_norm": 0.002423146739602089, + "learning_rate": 0.00017926148291938808, + "loss": 23.0, + "step": 3501 + }, + { + "epoch": 2.1020408163265305, + "grad_norm": 0.004653971176594496, + "learning_rate": 0.00017924996316324778, + "loss": 23.0, + "step": 3502 + }, + { + "epoch": 2.102641056422569, + "grad_norm": 0.002340496750548482, + "learning_rate": 0.00017923844057888012, + "loss": 23.0, + "step": 3503 + }, + { + "epoch": 2.1032412965186076, + "grad_norm": 0.0018698208732530475, + "learning_rate": 0.00017922691516669635, + "loss": 23.0, + "step": 3504 + }, + { + "epoch": 2.1038415366146457, + "grad_norm": 0.0017265928909182549, + "learning_rate": 0.00017921538692710782, + "loss": 23.0, + "step": 3505 + }, + { + "epoch": 2.104441776710684, + "grad_norm": 0.0035198975820094347, + "learning_rate": 0.00017920385586052588, + "loss": 23.0, + "step": 3506 + }, + { + "epoch": 2.1050420168067228, + "grad_norm": 0.005279972683638334, + "learning_rate": 0.00017919232196736203, + "loss": 23.0, + "step": 3507 + }, + { + "epoch": 2.1056422569027613, + "grad_norm": 0.0019218252273276448, + "learning_rate": 0.00017918078524802795, + "loss": 23.0, + "step": 3508 + }, + { + "epoch": 2.1062424969987994, + "grad_norm": 0.0037660326343029737, + "learning_rate": 0.00017916924570293528, + "loss": 23.0, + "step": 3509 + }, + { + "epoch": 2.106842737094838, + "grad_norm": 0.0016380378510802984, + "learning_rate": 0.00017915770333249594, + "loss": 23.0, + "step": 3510 + }, + { + "epoch": 2.1074429771908765, + "grad_norm": 0.0015107912477105856, + "learning_rate": 0.00017914615813712173, + "loss": 23.0, + "step": 3511 + }, + { + "epoch": 2.1080432172869146, + "grad_norm": 0.0029202804435044527, + "learning_rate": 0.00017913461011722474, + "loss": 23.0, + "step": 3512 + }, + { + "epoch": 2.108643457382953, + "grad_norm": 0.0013868259266018867, + "learning_rate": 0.0001791230592732171, + "loss": 23.0, + "step": 3513 + }, + { + "epoch": 2.1092436974789917, + "grad_norm": 0.0014497030060738325, + "learning_rate": 0.000179111505605511, + "loss": 23.0, + "step": 3514 + }, + { + "epoch": 2.10984393757503, + "grad_norm": 0.0015206874813884497, + "learning_rate": 0.00017909994911451872, + "loss": 23.0, + "step": 3515 + }, + { + "epoch": 2.1104441776710683, + "grad_norm": 0.003641763236373663, + "learning_rate": 0.00017908838980065274, + "loss": 23.0, + "step": 3516 + }, + { + "epoch": 2.111044417767107, + "grad_norm": 0.0021101627498865128, + "learning_rate": 0.0001790768276643256, + "loss": 23.0, + "step": 3517 + }, + { + "epoch": 2.1116446578631454, + "grad_norm": 0.0026833494193851948, + "learning_rate": 0.00017906526270594988, + "loss": 23.0, + "step": 3518 + }, + { + "epoch": 2.1122448979591835, + "grad_norm": 0.0010312756057828665, + "learning_rate": 0.0001790536949259383, + "loss": 23.0, + "step": 3519 + }, + { + "epoch": 2.112845138055222, + "grad_norm": 0.0033930472563952208, + "learning_rate": 0.00017904212432470371, + "loss": 23.0, + "step": 3520 + }, + { + "epoch": 2.1134453781512605, + "grad_norm": 0.0020478384103626013, + "learning_rate": 0.00017903055090265902, + "loss": 23.0, + "step": 3521 + }, + { + "epoch": 2.114045618247299, + "grad_norm": 0.0017036633798852563, + "learning_rate": 0.00017901897466021725, + "loss": 23.0, + "step": 3522 + }, + { + "epoch": 2.114645858343337, + "grad_norm": 0.0015527663053944707, + "learning_rate": 0.00017900739559779155, + "loss": 23.0, + "step": 3523 + }, + { + "epoch": 2.1152460984393757, + "grad_norm": 0.001124212285503745, + "learning_rate": 0.00017899581371579512, + "loss": 23.0, + "step": 3524 + }, + { + "epoch": 2.1158463385354143, + "grad_norm": 0.0014042783295735717, + "learning_rate": 0.0001789842290146413, + "loss": 23.0, + "step": 3525 + }, + { + "epoch": 2.116446578631453, + "grad_norm": 0.003444352652877569, + "learning_rate": 0.0001789726414947435, + "loss": 23.0, + "step": 3526 + }, + { + "epoch": 2.117046818727491, + "grad_norm": 0.003073784988373518, + "learning_rate": 0.00017896105115651533, + "loss": 23.0, + "step": 3527 + }, + { + "epoch": 2.1176470588235294, + "grad_norm": 0.0012429555645212531, + "learning_rate": 0.0001789494580003703, + "loss": 23.0, + "step": 3528 + }, + { + "epoch": 2.118247298919568, + "grad_norm": 0.004864856600761414, + "learning_rate": 0.0001789378620267222, + "loss": 23.0, + "step": 3529 + }, + { + "epoch": 2.118847539015606, + "grad_norm": 0.0022995832841843367, + "learning_rate": 0.00017892626323598486, + "loss": 23.0, + "step": 3530 + }, + { + "epoch": 2.1194477791116446, + "grad_norm": 0.00140533153899014, + "learning_rate": 0.00017891466162857223, + "loss": 23.0, + "step": 3531 + }, + { + "epoch": 2.120048019207683, + "grad_norm": 0.0009306234424002469, + "learning_rate": 0.0001789030572048983, + "loss": 23.0, + "step": 3532 + }, + { + "epoch": 2.1206482593037217, + "grad_norm": 0.003682507202029228, + "learning_rate": 0.00017889144996537725, + "loss": 23.0, + "step": 3533 + }, + { + "epoch": 2.12124849939976, + "grad_norm": 0.0009380745468661189, + "learning_rate": 0.00017887983991042323, + "loss": 23.0, + "step": 3534 + }, + { + "epoch": 2.1218487394957983, + "grad_norm": 0.0013799188891425729, + "learning_rate": 0.00017886822704045064, + "loss": 23.0, + "step": 3535 + }, + { + "epoch": 2.122448979591837, + "grad_norm": 0.0009243764216080308, + "learning_rate": 0.00017885661135587393, + "loss": 23.0, + "step": 3536 + }, + { + "epoch": 2.123049219687875, + "grad_norm": 0.0008515262743458152, + "learning_rate": 0.00017884499285710762, + "loss": 23.0, + "step": 3537 + }, + { + "epoch": 2.1236494597839135, + "grad_norm": 0.0015884919557720423, + "learning_rate": 0.00017883337154456629, + "loss": 23.0, + "step": 3538 + }, + { + "epoch": 2.124249699879952, + "grad_norm": 0.0032588981557637453, + "learning_rate": 0.0001788217474186647, + "loss": 23.0, + "step": 3539 + }, + { + "epoch": 2.1248499399759906, + "grad_norm": 0.0024941442534327507, + "learning_rate": 0.00017881012047981772, + "loss": 23.0, + "step": 3540 + }, + { + "epoch": 2.1254501800720287, + "grad_norm": 0.0017060820246115327, + "learning_rate": 0.00017879849072844028, + "loss": 23.0, + "step": 3541 + }, + { + "epoch": 2.1260504201680672, + "grad_norm": 0.003131990320980549, + "learning_rate": 0.0001787868581649474, + "loss": 23.0, + "step": 3542 + }, + { + "epoch": 2.1266506602641058, + "grad_norm": 0.0024005863815546036, + "learning_rate": 0.00017877522278975417, + "loss": 23.0, + "step": 3543 + }, + { + "epoch": 2.127250900360144, + "grad_norm": 0.0006045355694368482, + "learning_rate": 0.0001787635846032759, + "loss": 23.0, + "step": 3544 + }, + { + "epoch": 2.1278511404561824, + "grad_norm": 0.0035146684385836124, + "learning_rate": 0.0001787519436059279, + "loss": 23.0, + "step": 3545 + }, + { + "epoch": 2.128451380552221, + "grad_norm": 0.0014111412456259131, + "learning_rate": 0.00017874029979812563, + "loss": 23.0, + "step": 3546 + }, + { + "epoch": 2.1290516206482595, + "grad_norm": 0.0018674216698855162, + "learning_rate": 0.00017872865318028459, + "loss": 23.0, + "step": 3547 + }, + { + "epoch": 2.1296518607442976, + "grad_norm": 0.0012715421617031097, + "learning_rate": 0.00017871700375282043, + "loss": 23.0, + "step": 3548 + }, + { + "epoch": 2.130252100840336, + "grad_norm": 0.003209817223250866, + "learning_rate": 0.0001787053515161489, + "loss": 23.0, + "step": 3549 + }, + { + "epoch": 2.1308523409363747, + "grad_norm": 0.001961090601980686, + "learning_rate": 0.0001786936964706858, + "loss": 23.0, + "step": 3550 + }, + { + "epoch": 2.1314525810324128, + "grad_norm": 0.0013860109029337764, + "learning_rate": 0.0001786820386168471, + "loss": 23.0, + "step": 3551 + }, + { + "epoch": 2.1320528211284513, + "grad_norm": 0.00047536773490719497, + "learning_rate": 0.00017867037795504887, + "loss": 23.0, + "step": 3552 + }, + { + "epoch": 2.13265306122449, + "grad_norm": 0.0019673663191497326, + "learning_rate": 0.00017865871448570717, + "loss": 23.0, + "step": 3553 + }, + { + "epoch": 2.1332533013205284, + "grad_norm": 0.0021188133396208286, + "learning_rate": 0.00017864704820923832, + "loss": 23.0, + "step": 3554 + }, + { + "epoch": 2.1338535414165665, + "grad_norm": 0.0027213292196393013, + "learning_rate": 0.0001786353791260586, + "loss": 23.0, + "step": 3555 + }, + { + "epoch": 2.134453781512605, + "grad_norm": 0.0027741962112486362, + "learning_rate": 0.0001786237072365845, + "loss": 23.0, + "step": 3556 + }, + { + "epoch": 2.1350540216086435, + "grad_norm": 0.002745026256889105, + "learning_rate": 0.00017861203254123252, + "loss": 23.0, + "step": 3557 + }, + { + "epoch": 2.1356542617046816, + "grad_norm": 0.0020450023002922535, + "learning_rate": 0.00017860035504041932, + "loss": 23.0, + "step": 3558 + }, + { + "epoch": 2.13625450180072, + "grad_norm": 0.0010032516438513994, + "learning_rate": 0.00017858867473456163, + "loss": 23.0, + "step": 3559 + }, + { + "epoch": 2.1368547418967587, + "grad_norm": 0.0014476818032562733, + "learning_rate": 0.00017857699162407626, + "loss": 23.0, + "step": 3560 + }, + { + "epoch": 2.1374549819927973, + "grad_norm": 0.001177955069579184, + "learning_rate": 0.00017856530570938022, + "loss": 23.0, + "step": 3561 + }, + { + "epoch": 2.1380552220888354, + "grad_norm": 0.002862164517864585, + "learning_rate": 0.0001785536169908905, + "loss": 23.0, + "step": 3562 + }, + { + "epoch": 2.138655462184874, + "grad_norm": 0.0010038964683189988, + "learning_rate": 0.00017854192546902427, + "loss": 23.0, + "step": 3563 + }, + { + "epoch": 2.1392557022809124, + "grad_norm": 0.0017746913945302367, + "learning_rate": 0.00017853023114419877, + "loss": 23.0, + "step": 3564 + }, + { + "epoch": 2.139855942376951, + "grad_norm": 0.0028854040428996086, + "learning_rate": 0.00017851853401683126, + "loss": 23.0, + "step": 3565 + }, + { + "epoch": 2.140456182472989, + "grad_norm": 0.001586511847563088, + "learning_rate": 0.00017850683408733928, + "loss": 23.0, + "step": 3566 + }, + { + "epoch": 2.1410564225690276, + "grad_norm": 0.003254214534536004, + "learning_rate": 0.00017849513135614035, + "loss": 23.0, + "step": 3567 + }, + { + "epoch": 2.141656662665066, + "grad_norm": 0.0012385440059006214, + "learning_rate": 0.00017848342582365206, + "loss": 23.0, + "step": 3568 + }, + { + "epoch": 2.1422569027611043, + "grad_norm": 0.0018727433634921908, + "learning_rate": 0.0001784717174902922, + "loss": 23.0, + "step": 3569 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.004589486867189407, + "learning_rate": 0.00017846000635647863, + "loss": 23.0, + "step": 3570 + }, + { + "epoch": 2.1434573829531813, + "grad_norm": 0.0021497050765901804, + "learning_rate": 0.0001784482924226292, + "loss": 23.0, + "step": 3571 + }, + { + "epoch": 2.14405762304922, + "grad_norm": 0.0012627036776393652, + "learning_rate": 0.00017843657568916206, + "loss": 23.0, + "step": 3572 + }, + { + "epoch": 2.144657863145258, + "grad_norm": 0.001609223079867661, + "learning_rate": 0.00017842485615649528, + "loss": 23.0, + "step": 3573 + }, + { + "epoch": 2.1452581032412965, + "grad_norm": 0.001351055921986699, + "learning_rate": 0.00017841313382504711, + "loss": 23.0, + "step": 3574 + }, + { + "epoch": 2.145858343337335, + "grad_norm": 0.004179363604635, + "learning_rate": 0.00017840140869523587, + "loss": 23.0, + "step": 3575 + }, + { + "epoch": 2.146458583433373, + "grad_norm": 0.0016952658770605922, + "learning_rate": 0.00017838968076748008, + "loss": 23.0, + "step": 3576 + }, + { + "epoch": 2.1470588235294117, + "grad_norm": 0.0015167961828410625, + "learning_rate": 0.00017837795004219817, + "loss": 23.0, + "step": 3577 + }, + { + "epoch": 2.1476590636254502, + "grad_norm": 0.003133446676656604, + "learning_rate": 0.00017836621651980887, + "loss": 23.0, + "step": 3578 + }, + { + "epoch": 2.1482593037214888, + "grad_norm": 0.00226042908616364, + "learning_rate": 0.00017835448020073088, + "loss": 23.0, + "step": 3579 + }, + { + "epoch": 2.148859543817527, + "grad_norm": 0.0013810255331918597, + "learning_rate": 0.00017834274108538304, + "loss": 23.0, + "step": 3580 + }, + { + "epoch": 2.1494597839135654, + "grad_norm": 0.001641986076720059, + "learning_rate": 0.0001783309991741843, + "loss": 23.0, + "step": 3581 + }, + { + "epoch": 2.150060024009604, + "grad_norm": 0.002287675393745303, + "learning_rate": 0.0001783192544675537, + "loss": 23.0, + "step": 3582 + }, + { + "epoch": 2.1506602641056425, + "grad_norm": 0.0009976879227906466, + "learning_rate": 0.00017830750696591033, + "loss": 23.0, + "step": 3583 + }, + { + "epoch": 2.1512605042016806, + "grad_norm": 0.0018353014020249248, + "learning_rate": 0.0001782957566696735, + "loss": 23.0, + "step": 3584 + }, + { + "epoch": 2.151860744297719, + "grad_norm": 0.0019918165635317564, + "learning_rate": 0.00017828400357926253, + "loss": 23.0, + "step": 3585 + }, + { + "epoch": 2.1524609843937577, + "grad_norm": 0.0026910703163594007, + "learning_rate": 0.0001782722476950968, + "loss": 23.0, + "step": 3586 + }, + { + "epoch": 2.1530612244897958, + "grad_norm": 0.002473343163728714, + "learning_rate": 0.00017826048901759588, + "loss": 23.0, + "step": 3587 + }, + { + "epoch": 2.1536614645858343, + "grad_norm": 0.002732550958171487, + "learning_rate": 0.00017824872754717945, + "loss": 23.0, + "step": 3588 + }, + { + "epoch": 2.154261704681873, + "grad_norm": 0.0010042079957202077, + "learning_rate": 0.00017823696328426723, + "loss": 23.0, + "step": 3589 + }, + { + "epoch": 2.1548619447779114, + "grad_norm": 0.0027955600526183844, + "learning_rate": 0.000178225196229279, + "loss": 23.0, + "step": 3590 + }, + { + "epoch": 2.1554621848739495, + "grad_norm": 0.004130184184759855, + "learning_rate": 0.00017821342638263478, + "loss": 23.0, + "step": 3591 + }, + { + "epoch": 2.156062424969988, + "grad_norm": 0.003868487663567066, + "learning_rate": 0.00017820165374475452, + "loss": 23.0, + "step": 3592 + }, + { + "epoch": 2.1566626650660266, + "grad_norm": 0.0011650120140984654, + "learning_rate": 0.00017818987831605843, + "loss": 23.0, + "step": 3593 + }, + { + "epoch": 2.1572629051620646, + "grad_norm": 0.002393965143710375, + "learning_rate": 0.00017817810009696666, + "loss": 23.0, + "step": 3594 + }, + { + "epoch": 2.157863145258103, + "grad_norm": 0.0035790205001831055, + "learning_rate": 0.00017816631908789966, + "loss": 23.0, + "step": 3595 + }, + { + "epoch": 2.1584633853541417, + "grad_norm": 0.003235345706343651, + "learning_rate": 0.00017815453528927778, + "loss": 23.0, + "step": 3596 + }, + { + "epoch": 2.1590636254501803, + "grad_norm": 0.0013717797119170427, + "learning_rate": 0.00017814274870152155, + "loss": 23.0, + "step": 3597 + }, + { + "epoch": 2.1596638655462184, + "grad_norm": 0.0011841889936476946, + "learning_rate": 0.0001781309593250516, + "loss": 23.0, + "step": 3598 + }, + { + "epoch": 2.160264105642257, + "grad_norm": 0.0035572301130741835, + "learning_rate": 0.00017811916716028872, + "loss": 23.0, + "step": 3599 + }, + { + "epoch": 2.1608643457382954, + "grad_norm": 0.0009041174198500812, + "learning_rate": 0.00017810737220765374, + "loss": 23.0, + "step": 3600 + }, + { + "epoch": 2.1614645858343335, + "grad_norm": 0.0011678171576932073, + "learning_rate": 0.00017809557446756756, + "loss": 23.0, + "step": 3601 + }, + { + "epoch": 2.162064825930372, + "grad_norm": 0.0014870171435177326, + "learning_rate": 0.0001780837739404512, + "loss": 23.0, + "step": 3602 + }, + { + "epoch": 2.1626650660264106, + "grad_norm": 0.0015179289039224386, + "learning_rate": 0.00017807197062672577, + "loss": 23.0, + "step": 3603 + }, + { + "epoch": 2.163265306122449, + "grad_norm": 0.0020413794554769993, + "learning_rate": 0.00017806016452681254, + "loss": 23.0, + "step": 3604 + }, + { + "epoch": 2.1638655462184873, + "grad_norm": 0.0032616599928587675, + "learning_rate": 0.00017804835564113288, + "loss": 23.0, + "step": 3605 + }, + { + "epoch": 2.164465786314526, + "grad_norm": 0.0011516850208863616, + "learning_rate": 0.00017803654397010815, + "loss": 23.0, + "step": 3606 + }, + { + "epoch": 2.1650660264105643, + "grad_norm": 0.003489278955385089, + "learning_rate": 0.0001780247295141599, + "loss": 23.0, + "step": 3607 + }, + { + "epoch": 2.1656662665066024, + "grad_norm": 0.003842744044959545, + "learning_rate": 0.00017801291227370974, + "loss": 23.0, + "step": 3608 + }, + { + "epoch": 2.166266506602641, + "grad_norm": 0.001340657938271761, + "learning_rate": 0.0001780010922491794, + "loss": 23.0, + "step": 3609 + }, + { + "epoch": 2.1668667466986795, + "grad_norm": 0.0008930175099521875, + "learning_rate": 0.00017798926944099077, + "loss": 23.0, + "step": 3610 + }, + { + "epoch": 2.167466986794718, + "grad_norm": 0.0036189949605613947, + "learning_rate": 0.0001779774438495657, + "loss": 23.0, + "step": 3611 + }, + { + "epoch": 2.168067226890756, + "grad_norm": 0.003450029995292425, + "learning_rate": 0.00017796561547532623, + "loss": 23.0, + "step": 3612 + }, + { + "epoch": 2.1686674669867947, + "grad_norm": 0.0013816222781315446, + "learning_rate": 0.00017795378431869453, + "loss": 23.0, + "step": 3613 + }, + { + "epoch": 2.1692677070828332, + "grad_norm": 0.003153825644403696, + "learning_rate": 0.00017794195038009275, + "loss": 23.0, + "step": 3614 + }, + { + "epoch": 2.1698679471788713, + "grad_norm": 0.001024477300234139, + "learning_rate": 0.00017793011365994325, + "loss": 23.0, + "step": 3615 + }, + { + "epoch": 2.17046818727491, + "grad_norm": 0.003891037544235587, + "learning_rate": 0.00017791827415866848, + "loss": 23.0, + "step": 3616 + }, + { + "epoch": 2.1710684273709484, + "grad_norm": 0.002924687694758177, + "learning_rate": 0.00017790643187669093, + "loss": 23.0, + "step": 3617 + }, + { + "epoch": 2.171668667466987, + "grad_norm": 0.009289652109146118, + "learning_rate": 0.00017789458681443323, + "loss": 23.0, + "step": 3618 + }, + { + "epoch": 2.172268907563025, + "grad_norm": 0.0021330954041332006, + "learning_rate": 0.00017788273897231807, + "loss": 23.0, + "step": 3619 + }, + { + "epoch": 2.1728691476590636, + "grad_norm": 0.0019930656999349594, + "learning_rate": 0.0001778708883507683, + "loss": 23.0, + "step": 3620 + }, + { + "epoch": 2.173469387755102, + "grad_norm": 0.00517197186127305, + "learning_rate": 0.00017785903495020686, + "loss": 23.0, + "step": 3621 + }, + { + "epoch": 2.1740696278511407, + "grad_norm": 0.0015679982025176287, + "learning_rate": 0.00017784717877105673, + "loss": 23.0, + "step": 3622 + }, + { + "epoch": 2.1746698679471788, + "grad_norm": 0.004141742363572121, + "learning_rate": 0.00017783531981374102, + "loss": 23.0, + "step": 3623 + }, + { + "epoch": 2.1752701080432173, + "grad_norm": 0.00065791723318398, + "learning_rate": 0.00017782345807868297, + "loss": 23.0, + "step": 3624 + }, + { + "epoch": 2.175870348139256, + "grad_norm": 0.004961501341313124, + "learning_rate": 0.0001778115935663059, + "loss": 23.0, + "step": 3625 + }, + { + "epoch": 2.176470588235294, + "grad_norm": 0.0023005243856459856, + "learning_rate": 0.0001777997262770332, + "loss": 23.0, + "step": 3626 + }, + { + "epoch": 2.1770708283313325, + "grad_norm": 0.00899723544716835, + "learning_rate": 0.00017778785621128836, + "loss": 23.0, + "step": 3627 + }, + { + "epoch": 2.177671068427371, + "grad_norm": 0.0029707015492022038, + "learning_rate": 0.0001777759833694951, + "loss": 23.0, + "step": 3628 + }, + { + "epoch": 2.1782713085234096, + "grad_norm": 0.0018353236373513937, + "learning_rate": 0.000177764107752077, + "loss": 23.0, + "step": 3629 + }, + { + "epoch": 2.1788715486194477, + "grad_norm": 0.0010108323767781258, + "learning_rate": 0.00017775222935945796, + "loss": 23.0, + "step": 3630 + }, + { + "epoch": 2.179471788715486, + "grad_norm": 0.0008860941743478179, + "learning_rate": 0.00017774034819206181, + "loss": 23.0, + "step": 3631 + }, + { + "epoch": 2.1800720288115247, + "grad_norm": 0.003606445388868451, + "learning_rate": 0.00017772846425031266, + "loss": 23.0, + "step": 3632 + }, + { + "epoch": 2.180672268907563, + "grad_norm": 0.0016308276681229472, + "learning_rate": 0.00017771657753463453, + "loss": 23.0, + "step": 3633 + }, + { + "epoch": 2.1812725090036014, + "grad_norm": 0.0007045452948659658, + "learning_rate": 0.00017770468804545169, + "loss": 23.0, + "step": 3634 + }, + { + "epoch": 2.18187274909964, + "grad_norm": 0.0013048743130639195, + "learning_rate": 0.0001776927957831884, + "loss": 23.0, + "step": 3635 + }, + { + "epoch": 2.1824729891956784, + "grad_norm": 0.0012544161872938275, + "learning_rate": 0.00017768090074826904, + "loss": 23.0, + "step": 3636 + }, + { + "epoch": 2.1830732292917165, + "grad_norm": 0.00237638084217906, + "learning_rate": 0.0001776690029411182, + "loss": 23.0, + "step": 3637 + }, + { + "epoch": 2.183673469387755, + "grad_norm": 0.0048915608786046505, + "learning_rate": 0.0001776571023621604, + "loss": 23.0, + "step": 3638 + }, + { + "epoch": 2.1842737094837936, + "grad_norm": 0.000844002643134445, + "learning_rate": 0.0001776451990118204, + "loss": 23.0, + "step": 3639 + }, + { + "epoch": 2.184873949579832, + "grad_norm": 0.00372367724776268, + "learning_rate": 0.00017763329289052296, + "loss": 23.0, + "step": 3640 + }, + { + "epoch": 2.1854741896758703, + "grad_norm": 0.0023307260125875473, + "learning_rate": 0.000177621383998693, + "loss": 23.0, + "step": 3641 + }, + { + "epoch": 2.186074429771909, + "grad_norm": 0.0026479533407837152, + "learning_rate": 0.0001776094723367555, + "loss": 23.0, + "step": 3642 + }, + { + "epoch": 2.1866746698679473, + "grad_norm": 0.00475787278264761, + "learning_rate": 0.00017759755790513557, + "loss": 23.0, + "step": 3643 + }, + { + "epoch": 2.1872749099639854, + "grad_norm": 0.0008646330097690225, + "learning_rate": 0.00017758564070425843, + "loss": 23.0, + "step": 3644 + }, + { + "epoch": 2.187875150060024, + "grad_norm": 0.000730241765268147, + "learning_rate": 0.00017757372073454931, + "loss": 23.0, + "step": 3645 + }, + { + "epoch": 2.1884753901560625, + "grad_norm": 0.0020450761076062918, + "learning_rate": 0.00017756179799643368, + "loss": 23.0, + "step": 3646 + }, + { + "epoch": 2.189075630252101, + "grad_norm": 0.001998509978875518, + "learning_rate": 0.00017754987249033694, + "loss": 23.0, + "step": 3647 + }, + { + "epoch": 2.189675870348139, + "grad_norm": 0.0017022277461364865, + "learning_rate": 0.00017753794421668478, + "loss": 23.0, + "step": 3648 + }, + { + "epoch": 2.1902761104441777, + "grad_norm": 0.0010228764731436968, + "learning_rate": 0.0001775260131759028, + "loss": 23.0, + "step": 3649 + }, + { + "epoch": 2.1908763505402162, + "grad_norm": 0.0021690605208277702, + "learning_rate": 0.00017751407936841688, + "loss": 23.0, + "step": 3650 + }, + { + "epoch": 2.1914765906362543, + "grad_norm": 0.0016400478780269623, + "learning_rate": 0.00017750214279465282, + "loss": 23.0, + "step": 3651 + }, + { + "epoch": 2.192076830732293, + "grad_norm": 0.002070733578875661, + "learning_rate": 0.00017749020345503666, + "loss": 23.0, + "step": 3652 + }, + { + "epoch": 2.1926770708283314, + "grad_norm": 0.0014095326187089086, + "learning_rate": 0.00017747826134999445, + "loss": 23.0, + "step": 3653 + }, + { + "epoch": 2.19327731092437, + "grad_norm": 0.0019122892990708351, + "learning_rate": 0.00017746631647995242, + "loss": 23.0, + "step": 3654 + }, + { + "epoch": 2.193877551020408, + "grad_norm": 0.0029271540697664022, + "learning_rate": 0.00017745436884533683, + "loss": 23.0, + "step": 3655 + }, + { + "epoch": 2.1944777911164466, + "grad_norm": 0.002827174263074994, + "learning_rate": 0.00017744241844657398, + "loss": 23.0, + "step": 3656 + }, + { + "epoch": 2.195078031212485, + "grad_norm": 0.0012498522410169244, + "learning_rate": 0.00017743046528409049, + "loss": 23.0, + "step": 3657 + }, + { + "epoch": 2.195678271308523, + "grad_norm": 0.003829399822279811, + "learning_rate": 0.00017741850935831287, + "loss": 23.0, + "step": 3658 + }, + { + "epoch": 2.1962785114045618, + "grad_norm": 0.0021660744678229094, + "learning_rate": 0.00017740655066966773, + "loss": 23.0, + "step": 3659 + }, + { + "epoch": 2.1968787515006003, + "grad_norm": 0.002248313743621111, + "learning_rate": 0.00017739458921858198, + "loss": 23.0, + "step": 3660 + }, + { + "epoch": 2.197478991596639, + "grad_norm": 0.0018800124526023865, + "learning_rate": 0.00017738262500548236, + "loss": 23.0, + "step": 3661 + }, + { + "epoch": 2.198079231692677, + "grad_norm": 0.0030018959660083055, + "learning_rate": 0.00017737065803079597, + "loss": 23.0, + "step": 3662 + }, + { + "epoch": 2.1986794717887155, + "grad_norm": 0.0009485651389695704, + "learning_rate": 0.00017735868829494978, + "loss": 23.0, + "step": 3663 + }, + { + "epoch": 2.199279711884754, + "grad_norm": 0.002609929535537958, + "learning_rate": 0.000177346715798371, + "loss": 23.0, + "step": 3664 + }, + { + "epoch": 2.199879951980792, + "grad_norm": 0.0013200175017118454, + "learning_rate": 0.0001773347405414869, + "loss": 23.0, + "step": 3665 + }, + { + "epoch": 2.2004801920768307, + "grad_norm": 0.0007020602351985872, + "learning_rate": 0.00017732276252472483, + "loss": 23.0, + "step": 3666 + }, + { + "epoch": 2.201080432172869, + "grad_norm": 0.0026193074882030487, + "learning_rate": 0.00017731078174851227, + "loss": 23.0, + "step": 3667 + }, + { + "epoch": 2.2016806722689077, + "grad_norm": 0.001026253798045218, + "learning_rate": 0.00017729879821327678, + "loss": 23.0, + "step": 3668 + }, + { + "epoch": 2.202280912364946, + "grad_norm": 0.0017351701389998198, + "learning_rate": 0.000177286811919446, + "loss": 23.0, + "step": 3669 + }, + { + "epoch": 2.2028811524609844, + "grad_norm": 0.0014646050985902548, + "learning_rate": 0.0001772748228674477, + "loss": 23.0, + "step": 3670 + }, + { + "epoch": 2.203481392557023, + "grad_norm": 0.0030125817283988, + "learning_rate": 0.0001772628310577098, + "loss": 23.0, + "step": 3671 + }, + { + "epoch": 2.204081632653061, + "grad_norm": 0.004487418103963137, + "learning_rate": 0.00017725083649066016, + "loss": 23.0, + "step": 3672 + }, + { + "epoch": 2.2046818727490995, + "grad_norm": 0.0021111650858074427, + "learning_rate": 0.0001772388391667269, + "loss": 23.0, + "step": 3673 + }, + { + "epoch": 2.205282112845138, + "grad_norm": 0.00048810808220878243, + "learning_rate": 0.00017722683908633812, + "loss": 23.0, + "step": 3674 + }, + { + "epoch": 2.2058823529411766, + "grad_norm": 0.003283157479017973, + "learning_rate": 0.00017721483624992215, + "loss": 23.0, + "step": 3675 + }, + { + "epoch": 2.2064825930372147, + "grad_norm": 0.0016707901377230883, + "learning_rate": 0.00017720283065790726, + "loss": 23.0, + "step": 3676 + }, + { + "epoch": 2.2070828331332533, + "grad_norm": 0.001358469482511282, + "learning_rate": 0.00017719082231072193, + "loss": 23.0, + "step": 3677 + }, + { + "epoch": 2.207683073229292, + "grad_norm": 0.0013346460182219744, + "learning_rate": 0.00017717881120879475, + "loss": 23.0, + "step": 3678 + }, + { + "epoch": 2.20828331332533, + "grad_norm": 0.001013239729218185, + "learning_rate": 0.0001771667973525543, + "loss": 23.0, + "step": 3679 + }, + { + "epoch": 2.2088835534213684, + "grad_norm": 0.001471264404244721, + "learning_rate": 0.00017715478074242932, + "loss": 23.0, + "step": 3680 + }, + { + "epoch": 2.209483793517407, + "grad_norm": 0.0015335743082687259, + "learning_rate": 0.00017714276137884873, + "loss": 23.0, + "step": 3681 + }, + { + "epoch": 2.2100840336134455, + "grad_norm": 0.005060260649770498, + "learning_rate": 0.0001771307392622414, + "loss": 23.0, + "step": 3682 + }, + { + "epoch": 2.2106842737094836, + "grad_norm": 0.0011684778146445751, + "learning_rate": 0.00017711871439303638, + "loss": 23.0, + "step": 3683 + }, + { + "epoch": 2.211284513805522, + "grad_norm": 0.0035212927032262087, + "learning_rate": 0.00017710668677166282, + "loss": 23.0, + "step": 3684 + }, + { + "epoch": 2.2118847539015607, + "grad_norm": 0.0023291881661862135, + "learning_rate": 0.00017709465639854997, + "loss": 23.0, + "step": 3685 + }, + { + "epoch": 2.2124849939975992, + "grad_norm": 0.0015391027554869652, + "learning_rate": 0.00017708262327412712, + "loss": 23.0, + "step": 3686 + }, + { + "epoch": 2.2130852340936373, + "grad_norm": 0.001002484350465238, + "learning_rate": 0.00017707058739882374, + "loss": 23.0, + "step": 3687 + }, + { + "epoch": 2.213685474189676, + "grad_norm": 0.0014180615544319153, + "learning_rate": 0.00017705854877306934, + "loss": 23.0, + "step": 3688 + }, + { + "epoch": 2.2142857142857144, + "grad_norm": 0.0008763477671891451, + "learning_rate": 0.00017704650739729357, + "loss": 23.0, + "step": 3689 + }, + { + "epoch": 2.2148859543817525, + "grad_norm": 0.0022361702285706997, + "learning_rate": 0.00017703446327192612, + "loss": 23.0, + "step": 3690 + }, + { + "epoch": 2.215486194477791, + "grad_norm": 0.0015827770112082362, + "learning_rate": 0.0001770224163973968, + "loss": 23.0, + "step": 3691 + }, + { + "epoch": 2.2160864345738296, + "grad_norm": 0.001904232893139124, + "learning_rate": 0.00017701036677413563, + "loss": 23.0, + "step": 3692 + }, + { + "epoch": 2.216686674669868, + "grad_norm": 0.0013081013457849622, + "learning_rate": 0.00017699831440257251, + "loss": 23.0, + "step": 3693 + }, + { + "epoch": 2.2172869147659062, + "grad_norm": 0.004817170090973377, + "learning_rate": 0.00017698625928313765, + "loss": 23.0, + "step": 3694 + }, + { + "epoch": 2.2178871548619448, + "grad_norm": 0.004000441636890173, + "learning_rate": 0.0001769742014162612, + "loss": 23.0, + "step": 3695 + }, + { + "epoch": 2.2184873949579833, + "grad_norm": 0.0027444609440863132, + "learning_rate": 0.00017696214080237352, + "loss": 23.0, + "step": 3696 + }, + { + "epoch": 2.2190876350540214, + "grad_norm": 0.002896189922466874, + "learning_rate": 0.000176950077441905, + "loss": 23.0, + "step": 3697 + }, + { + "epoch": 2.21968787515006, + "grad_norm": 0.002128873486071825, + "learning_rate": 0.00017693801133528616, + "loss": 23.0, + "step": 3698 + }, + { + "epoch": 2.2202881152460985, + "grad_norm": 0.0011179738212376833, + "learning_rate": 0.0001769259424829476, + "loss": 23.0, + "step": 3699 + }, + { + "epoch": 2.220888355342137, + "grad_norm": 0.002058587037026882, + "learning_rate": 0.00017691387088532003, + "loss": 23.0, + "step": 3700 + }, + { + "epoch": 2.221488595438175, + "grad_norm": 0.0022081804927438498, + "learning_rate": 0.00017690179654283425, + "loss": 23.0, + "step": 3701 + }, + { + "epoch": 2.2220888355342137, + "grad_norm": 0.0023553965147584677, + "learning_rate": 0.0001768897194559212, + "loss": 23.0, + "step": 3702 + }, + { + "epoch": 2.222689075630252, + "grad_norm": 0.001018383540213108, + "learning_rate": 0.00017687763962501182, + "loss": 23.0, + "step": 3703 + }, + { + "epoch": 2.2232893157262907, + "grad_norm": 0.0007036729366518557, + "learning_rate": 0.0001768655570505372, + "loss": 23.0, + "step": 3704 + }, + { + "epoch": 2.223889555822329, + "grad_norm": 0.0008182197925634682, + "learning_rate": 0.0001768534717329286, + "loss": 23.0, + "step": 3705 + }, + { + "epoch": 2.2244897959183674, + "grad_norm": 0.0010957723716273904, + "learning_rate": 0.00017684138367261731, + "loss": 23.0, + "step": 3706 + }, + { + "epoch": 2.225090036014406, + "grad_norm": 0.0017919333186000586, + "learning_rate": 0.00017682929287003466, + "loss": 23.0, + "step": 3707 + }, + { + "epoch": 2.225690276110444, + "grad_norm": 0.003724980168044567, + "learning_rate": 0.0001768171993256122, + "loss": 23.0, + "step": 3708 + }, + { + "epoch": 2.2262905162064826, + "grad_norm": 0.0013054338051006198, + "learning_rate": 0.00017680510303978147, + "loss": 23.0, + "step": 3709 + }, + { + "epoch": 2.226890756302521, + "grad_norm": 0.0018630026606842875, + "learning_rate": 0.0001767930040129742, + "loss": 23.0, + "step": 3710 + }, + { + "epoch": 2.2274909963985596, + "grad_norm": 0.0014727768721058965, + "learning_rate": 0.00017678090224562213, + "loss": 23.0, + "step": 3711 + }, + { + "epoch": 2.2280912364945977, + "grad_norm": 0.0010810722596943378, + "learning_rate": 0.0001767687977381572, + "loss": 23.0, + "step": 3712 + }, + { + "epoch": 2.2286914765906363, + "grad_norm": 0.001895120949484408, + "learning_rate": 0.00017675669049101132, + "loss": 23.0, + "step": 3713 + }, + { + "epoch": 2.229291716686675, + "grad_norm": 0.0021297503262758255, + "learning_rate": 0.0001767445805046166, + "loss": 23.0, + "step": 3714 + }, + { + "epoch": 2.229891956782713, + "grad_norm": 0.0009311452740803361, + "learning_rate": 0.00017673246777940522, + "loss": 23.0, + "step": 3715 + }, + { + "epoch": 2.2304921968787514, + "grad_norm": 0.0026380294002592564, + "learning_rate": 0.00017672035231580942, + "loss": 23.0, + "step": 3716 + }, + { + "epoch": 2.23109243697479, + "grad_norm": 0.0012547700898721814, + "learning_rate": 0.00017670823411426162, + "loss": 23.0, + "step": 3717 + }, + { + "epoch": 2.2316926770708285, + "grad_norm": 0.0012008714256808162, + "learning_rate": 0.0001766961131751943, + "loss": 23.0, + "step": 3718 + }, + { + "epoch": 2.2322929171668666, + "grad_norm": 0.0007726665935479105, + "learning_rate": 0.00017668398949903993, + "loss": 23.0, + "step": 3719 + }, + { + "epoch": 2.232893157262905, + "grad_norm": 0.003182122251018882, + "learning_rate": 0.00017667186308623122, + "loss": 23.0, + "step": 3720 + }, + { + "epoch": 2.2334933973589437, + "grad_norm": 0.0025740358978509903, + "learning_rate": 0.000176659733937201, + "loss": 23.0, + "step": 3721 + }, + { + "epoch": 2.234093637454982, + "grad_norm": 0.0025537547189742327, + "learning_rate": 0.00017664760205238203, + "loss": 23.0, + "step": 3722 + }, + { + "epoch": 2.2346938775510203, + "grad_norm": 0.0030393393244594336, + "learning_rate": 0.00017663546743220733, + "loss": 23.0, + "step": 3723 + }, + { + "epoch": 2.235294117647059, + "grad_norm": 0.0024942343588918447, + "learning_rate": 0.00017662333007710987, + "loss": 23.0, + "step": 3724 + }, + { + "epoch": 2.2358943577430974, + "grad_norm": 0.0016120598884299397, + "learning_rate": 0.0001766111899875229, + "loss": 23.0, + "step": 3725 + }, + { + "epoch": 2.2364945978391355, + "grad_norm": 0.00033892018836922944, + "learning_rate": 0.0001765990471638796, + "loss": 23.0, + "step": 3726 + }, + { + "epoch": 2.237094837935174, + "grad_norm": 0.002562824171036482, + "learning_rate": 0.00017658690160661338, + "loss": 23.0, + "step": 3727 + }, + { + "epoch": 2.2376950780312126, + "grad_norm": 0.0022175584454089403, + "learning_rate": 0.00017657475331615762, + "loss": 23.0, + "step": 3728 + }, + { + "epoch": 2.2382953181272507, + "grad_norm": 0.001756917918100953, + "learning_rate": 0.0001765626022929459, + "loss": 23.0, + "step": 3729 + }, + { + "epoch": 2.2388955582232892, + "grad_norm": 0.003884568577632308, + "learning_rate": 0.00017655044853741184, + "loss": 23.0, + "step": 3730 + }, + { + "epoch": 2.2394957983193278, + "grad_norm": 0.001167623675428331, + "learning_rate": 0.00017653829204998918, + "loss": 23.0, + "step": 3731 + }, + { + "epoch": 2.2400960384153663, + "grad_norm": 0.0010216180235147476, + "learning_rate": 0.00017652613283111176, + "loss": 23.0, + "step": 3732 + }, + { + "epoch": 2.2406962785114044, + "grad_norm": 0.003365274053066969, + "learning_rate": 0.00017651397088121353, + "loss": 23.0, + "step": 3733 + }, + { + "epoch": 2.241296518607443, + "grad_norm": 0.0018816734664142132, + "learning_rate": 0.00017650180620072846, + "loss": 23.0, + "step": 3734 + }, + { + "epoch": 2.2418967587034815, + "grad_norm": 0.0029328542295843363, + "learning_rate": 0.00017648963879009075, + "loss": 23.0, + "step": 3735 + }, + { + "epoch": 2.2424969987995196, + "grad_norm": 0.0007874789298512042, + "learning_rate": 0.00017647746864973458, + "loss": 23.0, + "step": 3736 + }, + { + "epoch": 2.243097238895558, + "grad_norm": 0.0030753707978874445, + "learning_rate": 0.00017646529578009426, + "loss": 23.0, + "step": 3737 + }, + { + "epoch": 2.2436974789915967, + "grad_norm": 0.0010473758447915316, + "learning_rate": 0.00017645312018160422, + "loss": 23.0, + "step": 3738 + }, + { + "epoch": 2.244297719087635, + "grad_norm": 0.0018478642450645566, + "learning_rate": 0.00017644094185469898, + "loss": 23.0, + "step": 3739 + }, + { + "epoch": 2.2448979591836733, + "grad_norm": 0.0013350150547921658, + "learning_rate": 0.0001764287607998132, + "loss": 23.0, + "step": 3740 + }, + { + "epoch": 2.245498199279712, + "grad_norm": 0.0012342286063358188, + "learning_rate": 0.00017641657701738148, + "loss": 23.0, + "step": 3741 + }, + { + "epoch": 2.2460984393757504, + "grad_norm": 0.0013594189658761024, + "learning_rate": 0.00017640439050783875, + "loss": 23.0, + "step": 3742 + }, + { + "epoch": 2.246698679471789, + "grad_norm": 0.0033450396731495857, + "learning_rate": 0.00017639220127161983, + "loss": 23.0, + "step": 3743 + }, + { + "epoch": 2.247298919567827, + "grad_norm": 0.002889930969104171, + "learning_rate": 0.00017638000930915974, + "loss": 23.0, + "step": 3744 + }, + { + "epoch": 2.2478991596638656, + "grad_norm": 0.0019997726194560528, + "learning_rate": 0.00017636781462089364, + "loss": 23.0, + "step": 3745 + }, + { + "epoch": 2.248499399759904, + "grad_norm": 0.0012661672662943602, + "learning_rate": 0.0001763556172072566, + "loss": 23.0, + "step": 3746 + }, + { + "epoch": 2.249099639855942, + "grad_norm": 0.007048753555864096, + "learning_rate": 0.00017634341706868403, + "loss": 23.0, + "step": 3747 + }, + { + "epoch": 2.2496998799519807, + "grad_norm": 0.0015775463543832302, + "learning_rate": 0.00017633121420561131, + "loss": 23.0, + "step": 3748 + }, + { + "epoch": 2.2503001200480193, + "grad_norm": 0.0020231371745467186, + "learning_rate": 0.00017631900861847389, + "loss": 23.0, + "step": 3749 + }, + { + "epoch": 2.250900360144058, + "grad_norm": 0.0012863476295024157, + "learning_rate": 0.00017630680030770735, + "loss": 23.0, + "step": 3750 + }, + { + "epoch": 2.251500600240096, + "grad_norm": 0.0016621587565168738, + "learning_rate": 0.0001762945892737474, + "loss": 23.0, + "step": 3751 + }, + { + "epoch": 2.2521008403361344, + "grad_norm": 0.00226738303899765, + "learning_rate": 0.00017628237551702982, + "loss": 23.0, + "step": 3752 + }, + { + "epoch": 2.252701080432173, + "grad_norm": 0.0014520992990583181, + "learning_rate": 0.00017627015903799048, + "loss": 23.0, + "step": 3753 + }, + { + "epoch": 2.2533013205282115, + "grad_norm": 0.0013434399152174592, + "learning_rate": 0.00017625793983706533, + "loss": 23.0, + "step": 3754 + }, + { + "epoch": 2.2539015606242496, + "grad_norm": 0.0007136189378798008, + "learning_rate": 0.00017624571791469048, + "loss": 23.0, + "step": 3755 + }, + { + "epoch": 2.254501800720288, + "grad_norm": 0.002455267123878002, + "learning_rate": 0.00017623349327130206, + "loss": 23.0, + "step": 3756 + }, + { + "epoch": 2.2551020408163267, + "grad_norm": 0.0009659494389779866, + "learning_rate": 0.00017622126590733644, + "loss": 23.0, + "step": 3757 + }, + { + "epoch": 2.255702280912365, + "grad_norm": 0.002808001358062029, + "learning_rate": 0.00017620903582322983, + "loss": 23.0, + "step": 3758 + }, + { + "epoch": 2.2563025210084033, + "grad_norm": 0.0018349566962569952, + "learning_rate": 0.0001761968030194188, + "loss": 23.0, + "step": 3759 + }, + { + "epoch": 2.256902761104442, + "grad_norm": 0.002876591170206666, + "learning_rate": 0.00017618456749633987, + "loss": 23.0, + "step": 3760 + }, + { + "epoch": 2.2575030012004804, + "grad_norm": 0.0012843102449551225, + "learning_rate": 0.00017617232925442968, + "loss": 23.0, + "step": 3761 + }, + { + "epoch": 2.2581032412965185, + "grad_norm": 0.0011905563296750188, + "learning_rate": 0.00017616008829412502, + "loss": 23.0, + "step": 3762 + }, + { + "epoch": 2.258703481392557, + "grad_norm": 0.000979992444626987, + "learning_rate": 0.00017614784461586266, + "loss": 23.0, + "step": 3763 + }, + { + "epoch": 2.2593037214885956, + "grad_norm": 0.002167720813304186, + "learning_rate": 0.00017613559822007963, + "loss": 23.0, + "step": 3764 + }, + { + "epoch": 2.2599039615846337, + "grad_norm": 0.0008691643597558141, + "learning_rate": 0.00017612334910721298, + "loss": 23.0, + "step": 3765 + }, + { + "epoch": 2.2605042016806722, + "grad_norm": 0.0019571988377720118, + "learning_rate": 0.00017611109727769977, + "loss": 23.0, + "step": 3766 + }, + { + "epoch": 2.2611044417767108, + "grad_norm": 0.0012904416071251035, + "learning_rate": 0.00017609884273197732, + "loss": 23.0, + "step": 3767 + }, + { + "epoch": 2.2617046818727493, + "grad_norm": 0.0017439661314710975, + "learning_rate": 0.00017608658547048288, + "loss": 23.0, + "step": 3768 + }, + { + "epoch": 2.2623049219687874, + "grad_norm": 0.003602192969992757, + "learning_rate": 0.00017607432549365394, + "loss": 23.0, + "step": 3769 + }, + { + "epoch": 2.262905162064826, + "grad_norm": 0.002050132257863879, + "learning_rate": 0.00017606206280192801, + "loss": 23.0, + "step": 3770 + }, + { + "epoch": 2.2635054021608645, + "grad_norm": 0.0030350079759955406, + "learning_rate": 0.00017604979739574273, + "loss": 23.0, + "step": 3771 + }, + { + "epoch": 2.2641056422569026, + "grad_norm": 0.0011385931866243482, + "learning_rate": 0.0001760375292755358, + "loss": 23.0, + "step": 3772 + }, + { + "epoch": 2.264705882352941, + "grad_norm": 0.0015834002988412976, + "learning_rate": 0.000176025258441745, + "loss": 23.0, + "step": 3773 + }, + { + "epoch": 2.2653061224489797, + "grad_norm": 0.002169420477002859, + "learning_rate": 0.00017601298489480834, + "loss": 23.0, + "step": 3774 + }, + { + "epoch": 2.265906362545018, + "grad_norm": 0.0020891001913696527, + "learning_rate": 0.00017600070863516373, + "loss": 23.0, + "step": 3775 + }, + { + "epoch": 2.2665066026410563, + "grad_norm": 0.0015407392056658864, + "learning_rate": 0.00017598842966324937, + "loss": 23.0, + "step": 3776 + }, + { + "epoch": 2.267106842737095, + "grad_norm": 0.0012298728106543422, + "learning_rate": 0.0001759761479795034, + "loss": 23.0, + "step": 3777 + }, + { + "epoch": 2.2677070828331334, + "grad_norm": 0.0022670759353786707, + "learning_rate": 0.00017596386358436413, + "loss": 23.0, + "step": 3778 + }, + { + "epoch": 2.2683073229291715, + "grad_norm": 0.0009981478797271848, + "learning_rate": 0.00017595157647827, + "loss": 23.0, + "step": 3779 + }, + { + "epoch": 2.26890756302521, + "grad_norm": 0.0004938762285746634, + "learning_rate": 0.00017593928666165945, + "loss": 23.0, + "step": 3780 + }, + { + "epoch": 2.2695078031212486, + "grad_norm": 0.0011169624049216509, + "learning_rate": 0.00017592699413497114, + "loss": 23.0, + "step": 3781 + }, + { + "epoch": 2.270108043217287, + "grad_norm": 0.003080305876210332, + "learning_rate": 0.00017591469889864368, + "loss": 23.0, + "step": 3782 + }, + { + "epoch": 2.270708283313325, + "grad_norm": 0.004471518564969301, + "learning_rate": 0.00017590240095311587, + "loss": 23.0, + "step": 3783 + }, + { + "epoch": 2.2713085234093637, + "grad_norm": 0.0011711474508047104, + "learning_rate": 0.00017589010029882664, + "loss": 23.0, + "step": 3784 + }, + { + "epoch": 2.2719087635054023, + "grad_norm": 0.0011285408399999142, + "learning_rate": 0.00017587779693621495, + "loss": 23.0, + "step": 3785 + }, + { + "epoch": 2.2725090036014404, + "grad_norm": 0.0020412777084857225, + "learning_rate": 0.00017586549086571988, + "loss": 23.0, + "step": 3786 + }, + { + "epoch": 2.273109243697479, + "grad_norm": 0.001386416843160987, + "learning_rate": 0.00017585318208778058, + "loss": 23.0, + "step": 3787 + }, + { + "epoch": 2.2737094837935174, + "grad_norm": 0.0021522578317672014, + "learning_rate": 0.00017584087060283632, + "loss": 23.0, + "step": 3788 + }, + { + "epoch": 2.274309723889556, + "grad_norm": 0.0017842737725004554, + "learning_rate": 0.00017582855641132646, + "loss": 23.0, + "step": 3789 + }, + { + "epoch": 2.274909963985594, + "grad_norm": 0.0008319435291923583, + "learning_rate": 0.0001758162395136905, + "loss": 23.0, + "step": 3790 + }, + { + "epoch": 2.2755102040816326, + "grad_norm": 0.0022286258172243834, + "learning_rate": 0.000175803919910368, + "loss": 23.0, + "step": 3791 + }, + { + "epoch": 2.276110444177671, + "grad_norm": 0.0015450834762305021, + "learning_rate": 0.00017579159760179854, + "loss": 23.0, + "step": 3792 + }, + { + "epoch": 2.2767106842737093, + "grad_norm": 0.0017403220990672708, + "learning_rate": 0.00017577927258842193, + "loss": 23.0, + "step": 3793 + }, + { + "epoch": 2.277310924369748, + "grad_norm": 0.0021773509215563536, + "learning_rate": 0.000175766944870678, + "loss": 23.0, + "step": 3794 + }, + { + "epoch": 2.2779111644657863, + "grad_norm": 0.0007699204725213349, + "learning_rate": 0.00017575461444900676, + "loss": 23.0, + "step": 3795 + }, + { + "epoch": 2.278511404561825, + "grad_norm": 0.001452354365028441, + "learning_rate": 0.00017574228132384814, + "loss": 23.0, + "step": 3796 + }, + { + "epoch": 2.279111644657863, + "grad_norm": 0.0027013334911316633, + "learning_rate": 0.00017572994549564235, + "loss": 23.0, + "step": 3797 + }, + { + "epoch": 2.2797118847539015, + "grad_norm": 0.0016655053477734327, + "learning_rate": 0.0001757176069648296, + "loss": 23.0, + "step": 3798 + }, + { + "epoch": 2.28031212484994, + "grad_norm": 0.0017022525426000357, + "learning_rate": 0.00017570526573185022, + "loss": 23.0, + "step": 3799 + }, + { + "epoch": 2.280912364945978, + "grad_norm": 0.0011267677182331681, + "learning_rate": 0.00017569292179714466, + "loss": 23.0, + "step": 3800 + }, + { + "epoch": 2.2815126050420167, + "grad_norm": 0.0012624160153791308, + "learning_rate": 0.00017568057516115343, + "loss": 23.0, + "step": 3801 + }, + { + "epoch": 2.2821128451380552, + "grad_norm": 0.0012623511720448732, + "learning_rate": 0.00017566822582431714, + "loss": 23.0, + "step": 3802 + }, + { + "epoch": 2.2827130852340938, + "grad_norm": 0.002993621863424778, + "learning_rate": 0.0001756558737870765, + "loss": 23.0, + "step": 3803 + }, + { + "epoch": 2.283313325330132, + "grad_norm": 0.001764954999089241, + "learning_rate": 0.00017564351904987232, + "loss": 23.0, + "step": 3804 + }, + { + "epoch": 2.2839135654261704, + "grad_norm": 0.0033577526919543743, + "learning_rate": 0.00017563116161314557, + "loss": 23.0, + "step": 3805 + }, + { + "epoch": 2.284513805522209, + "grad_norm": 0.0030459025874733925, + "learning_rate": 0.0001756188014773372, + "loss": 23.0, + "step": 3806 + }, + { + "epoch": 2.2851140456182475, + "grad_norm": 0.0020036788191646338, + "learning_rate": 0.00017560643864288831, + "loss": 23.0, + "step": 3807 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.003118950640782714, + "learning_rate": 0.00017559407311024008, + "loss": 23.0, + "step": 3808 + }, + { + "epoch": 2.286314525810324, + "grad_norm": 0.0023817415349185467, + "learning_rate": 0.00017558170487983388, + "loss": 23.0, + "step": 3809 + }, + { + "epoch": 2.2869147659063627, + "grad_norm": 0.0020011558663100004, + "learning_rate": 0.00017556933395211099, + "loss": 23.0, + "step": 3810 + }, + { + "epoch": 2.287515006002401, + "grad_norm": 0.0019172454485669732, + "learning_rate": 0.000175556960327513, + "loss": 23.0, + "step": 3811 + }, + { + "epoch": 2.2881152460984393, + "grad_norm": 0.005320819094777107, + "learning_rate": 0.00017554458400648146, + "loss": 23.0, + "step": 3812 + }, + { + "epoch": 2.288715486194478, + "grad_norm": 0.0011504385620355606, + "learning_rate": 0.00017553220498945802, + "loss": 23.0, + "step": 3813 + }, + { + "epoch": 2.2893157262905164, + "grad_norm": 0.004282000940293074, + "learning_rate": 0.00017551982327688447, + "loss": 23.0, + "step": 3814 + }, + { + "epoch": 2.2899159663865545, + "grad_norm": 0.0016033421270549297, + "learning_rate": 0.0001755074388692027, + "loss": 23.0, + "step": 3815 + }, + { + "epoch": 2.290516206482593, + "grad_norm": 0.001392611302435398, + "learning_rate": 0.00017549505176685468, + "loss": 23.0, + "step": 3816 + }, + { + "epoch": 2.2911164465786316, + "grad_norm": 0.0024148111697286367, + "learning_rate": 0.00017548266197028243, + "loss": 23.0, + "step": 3817 + }, + { + "epoch": 2.29171668667467, + "grad_norm": 0.0022317797411233187, + "learning_rate": 0.00017547026947992817, + "loss": 23.0, + "step": 3818 + }, + { + "epoch": 2.292316926770708, + "grad_norm": 0.00245701358653605, + "learning_rate": 0.0001754578742962341, + "loss": 23.0, + "step": 3819 + }, + { + "epoch": 2.2929171668667467, + "grad_norm": 0.0017128176987171173, + "learning_rate": 0.00017544547641964263, + "loss": 23.0, + "step": 3820 + }, + { + "epoch": 2.2935174069627853, + "grad_norm": 0.0031775077804923058, + "learning_rate": 0.00017543307585059613, + "loss": 23.0, + "step": 3821 + }, + { + "epoch": 2.2941176470588234, + "grad_norm": 0.002050775336101651, + "learning_rate": 0.00017542067258953723, + "loss": 23.0, + "step": 3822 + }, + { + "epoch": 2.294717887154862, + "grad_norm": 0.0014890647726133466, + "learning_rate": 0.00017540826663690851, + "loss": 23.0, + "step": 3823 + }, + { + "epoch": 2.2953181272509005, + "grad_norm": 0.004480647388845682, + "learning_rate": 0.00017539585799315272, + "loss": 23.0, + "step": 3824 + }, + { + "epoch": 2.295918367346939, + "grad_norm": 0.0015886913752183318, + "learning_rate": 0.0001753834466587127, + "loss": 23.0, + "step": 3825 + }, + { + "epoch": 2.296518607442977, + "grad_norm": 0.0019695230294018984, + "learning_rate": 0.0001753710326340314, + "loss": 23.0, + "step": 3826 + }, + { + "epoch": 2.2971188475390156, + "grad_norm": 0.0018076825654134154, + "learning_rate": 0.0001753586159195518, + "loss": 23.0, + "step": 3827 + }, + { + "epoch": 2.297719087635054, + "grad_norm": 0.002728229621425271, + "learning_rate": 0.00017534619651571705, + "loss": 23.0, + "step": 3828 + }, + { + "epoch": 2.2983193277310923, + "grad_norm": 0.0017432133900001645, + "learning_rate": 0.0001753337744229704, + "loss": 23.0, + "step": 3829 + }, + { + "epoch": 2.298919567827131, + "grad_norm": 0.0018603568896651268, + "learning_rate": 0.00017532134964175504, + "loss": 23.0, + "step": 3830 + }, + { + "epoch": 2.2995198079231693, + "grad_norm": 0.002048965310677886, + "learning_rate": 0.0001753089221725145, + "loss": 23.0, + "step": 3831 + }, + { + "epoch": 2.300120048019208, + "grad_norm": 0.001226543914526701, + "learning_rate": 0.00017529649201569226, + "loss": 23.0, + "step": 3832 + }, + { + "epoch": 2.300720288115246, + "grad_norm": 0.004013392608612776, + "learning_rate": 0.0001752840591717319, + "loss": 23.0, + "step": 3833 + }, + { + "epoch": 2.3013205282112845, + "grad_norm": 0.0008342837681993842, + "learning_rate": 0.00017527162364107713, + "loss": 23.0, + "step": 3834 + }, + { + "epoch": 2.301920768307323, + "grad_norm": 0.0007289492059499025, + "learning_rate": 0.00017525918542417171, + "loss": 23.0, + "step": 3835 + }, + { + "epoch": 2.302521008403361, + "grad_norm": 0.0029447299893945456, + "learning_rate": 0.00017524674452145956, + "loss": 23.0, + "step": 3836 + }, + { + "epoch": 2.3031212484993997, + "grad_norm": 0.002568695694208145, + "learning_rate": 0.00017523430093338468, + "loss": 23.0, + "step": 3837 + }, + { + "epoch": 2.3037214885954382, + "grad_norm": 0.0012608342804014683, + "learning_rate": 0.0001752218546603911, + "loss": 23.0, + "step": 3838 + }, + { + "epoch": 2.304321728691477, + "grad_norm": 0.00195410312153399, + "learning_rate": 0.00017520940570292303, + "loss": 23.0, + "step": 3839 + }, + { + "epoch": 2.304921968787515, + "grad_norm": 0.003413090016692877, + "learning_rate": 0.00017519695406142472, + "loss": 23.0, + "step": 3840 + }, + { + "epoch": 2.3055222088835534, + "grad_norm": 0.002465988975018263, + "learning_rate": 0.00017518449973634055, + "loss": 23.0, + "step": 3841 + }, + { + "epoch": 2.306122448979592, + "grad_norm": 0.00117593037430197, + "learning_rate": 0.00017517204272811501, + "loss": 23.0, + "step": 3842 + }, + { + "epoch": 2.30672268907563, + "grad_norm": 0.00045529394992627203, + "learning_rate": 0.0001751595830371926, + "loss": 23.0, + "step": 3843 + }, + { + "epoch": 2.3073229291716686, + "grad_norm": 0.0013716558460146189, + "learning_rate": 0.000175147120664018, + "loss": 23.0, + "step": 3844 + }, + { + "epoch": 2.307923169267707, + "grad_norm": 0.0009694201871752739, + "learning_rate": 0.000175134655609036, + "loss": 23.0, + "step": 3845 + }, + { + "epoch": 2.3085234093637457, + "grad_norm": 0.002793167717754841, + "learning_rate": 0.00017512218787269138, + "loss": 23.0, + "step": 3846 + }, + { + "epoch": 2.3091236494597838, + "grad_norm": 0.0012160228798165917, + "learning_rate": 0.0001751097174554291, + "loss": 23.0, + "step": 3847 + }, + { + "epoch": 2.3097238895558223, + "grad_norm": 0.0041273715905845165, + "learning_rate": 0.00017509724435769427, + "loss": 23.0, + "step": 3848 + }, + { + "epoch": 2.310324129651861, + "grad_norm": 0.0019413263071328402, + "learning_rate": 0.00017508476857993192, + "loss": 23.0, + "step": 3849 + }, + { + "epoch": 2.310924369747899, + "grad_norm": 0.000679299992043525, + "learning_rate": 0.00017507229012258732, + "loss": 23.0, + "step": 3850 + }, + { + "epoch": 2.3115246098439375, + "grad_norm": 0.0014612323138862848, + "learning_rate": 0.0001750598089861058, + "loss": 23.0, + "step": 3851 + }, + { + "epoch": 2.312124849939976, + "grad_norm": 0.002256182488054037, + "learning_rate": 0.00017504732517093278, + "loss": 23.0, + "step": 3852 + }, + { + "epoch": 2.3127250900360146, + "grad_norm": 0.0009005808969959617, + "learning_rate": 0.00017503483867751377, + "loss": 23.0, + "step": 3853 + }, + { + "epoch": 2.3133253301320527, + "grad_norm": 0.0015893650706857443, + "learning_rate": 0.00017502234950629437, + "loss": 23.0, + "step": 3854 + }, + { + "epoch": 2.313925570228091, + "grad_norm": 0.001747640548273921, + "learning_rate": 0.0001750098576577203, + "loss": 23.0, + "step": 3855 + }, + { + "epoch": 2.3145258103241297, + "grad_norm": 0.0005708754761144519, + "learning_rate": 0.00017499736313223737, + "loss": 23.0, + "step": 3856 + }, + { + "epoch": 2.315126050420168, + "grad_norm": 0.0028467399533838034, + "learning_rate": 0.00017498486593029144, + "loss": 23.0, + "step": 3857 + }, + { + "epoch": 2.3157262905162064, + "grad_norm": 0.0014178903074935079, + "learning_rate": 0.00017497236605232855, + "loss": 23.0, + "step": 3858 + }, + { + "epoch": 2.316326530612245, + "grad_norm": 0.005164925009012222, + "learning_rate": 0.00017495986349879477, + "loss": 23.0, + "step": 3859 + }, + { + "epoch": 2.3169267707082835, + "grad_norm": 0.0027939407154917717, + "learning_rate": 0.00017494735827013627, + "loss": 23.0, + "step": 3860 + }, + { + "epoch": 2.3175270108043216, + "grad_norm": 0.002403461141511798, + "learning_rate": 0.00017493485036679935, + "loss": 23.0, + "step": 3861 + }, + { + "epoch": 2.31812725090036, + "grad_norm": 0.003365919226780534, + "learning_rate": 0.00017492233978923035, + "loss": 23.0, + "step": 3862 + }, + { + "epoch": 2.3187274909963986, + "grad_norm": 0.001767502399161458, + "learning_rate": 0.00017490982653787577, + "loss": 23.0, + "step": 3863 + }, + { + "epoch": 2.3193277310924367, + "grad_norm": 0.0037461065221577883, + "learning_rate": 0.0001748973106131822, + "loss": 23.0, + "step": 3864 + }, + { + "epoch": 2.3199279711884753, + "grad_norm": 0.0054776170291006565, + "learning_rate": 0.00017488479201559627, + "loss": 23.0, + "step": 3865 + }, + { + "epoch": 2.320528211284514, + "grad_norm": 0.0020852498710155487, + "learning_rate": 0.00017487227074556474, + "loss": 23.0, + "step": 3866 + }, + { + "epoch": 2.3211284513805523, + "grad_norm": 0.001896170899271965, + "learning_rate": 0.00017485974680353446, + "loss": 23.0, + "step": 3867 + }, + { + "epoch": 2.3217286914765904, + "grad_norm": 0.0022559871431440115, + "learning_rate": 0.00017484722018995235, + "loss": 23.0, + "step": 3868 + }, + { + "epoch": 2.322328931572629, + "grad_norm": 0.0008233110420405865, + "learning_rate": 0.00017483469090526552, + "loss": 23.0, + "step": 3869 + }, + { + "epoch": 2.3229291716686675, + "grad_norm": 0.003072496736422181, + "learning_rate": 0.00017482215894992106, + "loss": 23.0, + "step": 3870 + }, + { + "epoch": 2.323529411764706, + "grad_norm": 0.0034991884604096413, + "learning_rate": 0.00017480962432436618, + "loss": 23.0, + "step": 3871 + }, + { + "epoch": 2.324129651860744, + "grad_norm": 0.002439623698592186, + "learning_rate": 0.00017479708702904827, + "loss": 23.0, + "step": 3872 + }, + { + "epoch": 2.3247298919567827, + "grad_norm": 0.0018866292666643858, + "learning_rate": 0.00017478454706441472, + "loss": 23.0, + "step": 3873 + }, + { + "epoch": 2.3253301320528212, + "grad_norm": 0.00198563514277339, + "learning_rate": 0.00017477200443091306, + "loss": 23.0, + "step": 3874 + }, + { + "epoch": 2.32593037214886, + "grad_norm": 0.005812112707644701, + "learning_rate": 0.0001747594591289909, + "loss": 23.0, + "step": 3875 + }, + { + "epoch": 2.326530612244898, + "grad_norm": 0.0013709774939343333, + "learning_rate": 0.00017474691115909594, + "loss": 23.0, + "step": 3876 + }, + { + "epoch": 2.3271308523409364, + "grad_norm": 0.0018185654189437628, + "learning_rate": 0.00017473436052167597, + "loss": 23.0, + "step": 3877 + }, + { + "epoch": 2.327731092436975, + "grad_norm": 0.002282187808305025, + "learning_rate": 0.00017472180721717895, + "loss": 23.0, + "step": 3878 + }, + { + "epoch": 2.328331332533013, + "grad_norm": 0.0013887138338759542, + "learning_rate": 0.00017470925124605282, + "loss": 23.0, + "step": 3879 + }, + { + "epoch": 2.3289315726290516, + "grad_norm": 0.00287288217805326, + "learning_rate": 0.0001746966926087457, + "loss": 23.0, + "step": 3880 + }, + { + "epoch": 2.32953181272509, + "grad_norm": 0.0033778510987758636, + "learning_rate": 0.00017468413130570577, + "loss": 23.0, + "step": 3881 + }, + { + "epoch": 2.3301320528211287, + "grad_norm": 0.0016036947490647435, + "learning_rate": 0.00017467156733738128, + "loss": 23.0, + "step": 3882 + }, + { + "epoch": 2.3307322929171668, + "grad_norm": 0.0013769504148513079, + "learning_rate": 0.00017465900070422064, + "loss": 23.0, + "step": 3883 + }, + { + "epoch": 2.3313325330132053, + "grad_norm": 0.00230945716612041, + "learning_rate": 0.0001746464314066723, + "loss": 23.0, + "step": 3884 + }, + { + "epoch": 2.331932773109244, + "grad_norm": 0.0031504861544817686, + "learning_rate": 0.00017463385944518485, + "loss": 23.0, + "step": 3885 + }, + { + "epoch": 2.332533013205282, + "grad_norm": 0.0008833253523334861, + "learning_rate": 0.00017462128482020694, + "loss": 23.0, + "step": 3886 + }, + { + "epoch": 2.3331332533013205, + "grad_norm": 0.002419440308585763, + "learning_rate": 0.00017460870753218733, + "loss": 23.0, + "step": 3887 + }, + { + "epoch": 2.333733493397359, + "grad_norm": 0.00045076129026710987, + "learning_rate": 0.00017459612758157484, + "loss": 23.0, + "step": 3888 + }, + { + "epoch": 2.3343337334933976, + "grad_norm": 0.0007832158589735627, + "learning_rate": 0.00017458354496881846, + "loss": 23.0, + "step": 3889 + }, + { + "epoch": 2.3349339735894357, + "grad_norm": 0.0010271031642332673, + "learning_rate": 0.00017457095969436722, + "loss": 23.0, + "step": 3890 + }, + { + "epoch": 2.335534213685474, + "grad_norm": 0.002968541579321027, + "learning_rate": 0.0001745583717586702, + "loss": 23.0, + "step": 3891 + }, + { + "epoch": 2.3361344537815127, + "grad_norm": 0.005150226876139641, + "learning_rate": 0.00017454578116217674, + "loss": 23.0, + "step": 3892 + }, + { + "epoch": 2.336734693877551, + "grad_norm": 0.0014614604879170656, + "learning_rate": 0.00017453318790533606, + "loss": 23.0, + "step": 3893 + }, + { + "epoch": 2.3373349339735894, + "grad_norm": 0.003210582537576556, + "learning_rate": 0.00017452059198859766, + "loss": 23.0, + "step": 3894 + }, + { + "epoch": 2.337935174069628, + "grad_norm": 0.0043360283598303795, + "learning_rate": 0.000174507993412411, + "loss": 23.0, + "step": 3895 + }, + { + "epoch": 2.3385354141656665, + "grad_norm": 0.0008345422684215009, + "learning_rate": 0.00017449539217722571, + "loss": 23.0, + "step": 3896 + }, + { + "epoch": 2.3391356542617046, + "grad_norm": 0.003543426748365164, + "learning_rate": 0.00017448278828349152, + "loss": 23.0, + "step": 3897 + }, + { + "epoch": 2.339735894357743, + "grad_norm": 0.0006024944595992565, + "learning_rate": 0.00017447018173165818, + "loss": 23.0, + "step": 3898 + }, + { + "epoch": 2.3403361344537816, + "grad_norm": 0.002525531454011798, + "learning_rate": 0.00017445757252217562, + "loss": 23.0, + "step": 3899 + }, + { + "epoch": 2.3409363745498197, + "grad_norm": 0.0018678716151043773, + "learning_rate": 0.00017444496065549385, + "loss": 23.0, + "step": 3900 + }, + { + "epoch": 2.3415366146458583, + "grad_norm": 0.002660698490217328, + "learning_rate": 0.00017443234613206293, + "loss": 23.0, + "step": 3901 + }, + { + "epoch": 2.342136854741897, + "grad_norm": 0.0027779710944741964, + "learning_rate": 0.000174419728952333, + "loss": 23.0, + "step": 3902 + }, + { + "epoch": 2.3427370948379354, + "grad_norm": 0.0036743993405252695, + "learning_rate": 0.0001744071091167544, + "loss": 23.0, + "step": 3903 + }, + { + "epoch": 2.3433373349339734, + "grad_norm": 0.002380775986239314, + "learning_rate": 0.00017439448662577749, + "loss": 23.0, + "step": 3904 + }, + { + "epoch": 2.343937575030012, + "grad_norm": 0.0030045825988054276, + "learning_rate": 0.0001743818614798527, + "loss": 23.0, + "step": 3905 + }, + { + "epoch": 2.3445378151260505, + "grad_norm": 0.0010804989142343402, + "learning_rate": 0.00017436923367943058, + "loss": 23.0, + "step": 3906 + }, + { + "epoch": 2.3451380552220886, + "grad_norm": 0.0031437580473721027, + "learning_rate": 0.00017435660322496187, + "loss": 23.0, + "step": 3907 + }, + { + "epoch": 2.345738295318127, + "grad_norm": 0.002179528819397092, + "learning_rate": 0.0001743439701168972, + "loss": 23.0, + "step": 3908 + }, + { + "epoch": 2.3463385354141657, + "grad_norm": 0.0047058421187102795, + "learning_rate": 0.0001743313343556875, + "loss": 23.0, + "step": 3909 + }, + { + "epoch": 2.3469387755102042, + "grad_norm": 0.0017768297111615539, + "learning_rate": 0.00017431869594178366, + "loss": 23.0, + "step": 3910 + }, + { + "epoch": 2.3475390156062423, + "grad_norm": 0.0016148180002346635, + "learning_rate": 0.00017430605487563677, + "loss": 23.0, + "step": 3911 + }, + { + "epoch": 2.348139255702281, + "grad_norm": 0.001844774466007948, + "learning_rate": 0.00017429341115769787, + "loss": 23.0, + "step": 3912 + }, + { + "epoch": 2.3487394957983194, + "grad_norm": 0.001695389044471085, + "learning_rate": 0.0001742807647884183, + "loss": 23.0, + "step": 3913 + }, + { + "epoch": 2.3493397358943575, + "grad_norm": 0.0021679711062461138, + "learning_rate": 0.00017426811576824923, + "loss": 23.0, + "step": 3914 + }, + { + "epoch": 2.349939975990396, + "grad_norm": 0.0033461309503763914, + "learning_rate": 0.00017425546409764218, + "loss": 23.0, + "step": 3915 + }, + { + "epoch": 2.3505402160864346, + "grad_norm": 0.0019258579704910517, + "learning_rate": 0.00017424280977704863, + "loss": 23.0, + "step": 3916 + }, + { + "epoch": 2.351140456182473, + "grad_norm": 0.0012793492060154676, + "learning_rate": 0.0001742301528069202, + "loss": 23.0, + "step": 3917 + }, + { + "epoch": 2.3517406962785112, + "grad_norm": 0.0014299890026450157, + "learning_rate": 0.00017421749318770853, + "loss": 23.0, + "step": 3918 + }, + { + "epoch": 2.3523409363745498, + "grad_norm": 0.0012088280636817217, + "learning_rate": 0.00017420483091986543, + "loss": 23.0, + "step": 3919 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 0.0010071590077131987, + "learning_rate": 0.0001741921660038428, + "loss": 23.0, + "step": 3920 + }, + { + "epoch": 2.3535414165666264, + "grad_norm": 0.0018569778185337782, + "learning_rate": 0.0001741794984400926, + "loss": 23.0, + "step": 3921 + }, + { + "epoch": 2.354141656662665, + "grad_norm": 0.0018122204346582294, + "learning_rate": 0.00017416682822906694, + "loss": 23.0, + "step": 3922 + }, + { + "epoch": 2.3547418967587035, + "grad_norm": 0.0012169646797701716, + "learning_rate": 0.00017415415537121797, + "loss": 23.0, + "step": 3923 + }, + { + "epoch": 2.355342136854742, + "grad_norm": 0.0032757974695414305, + "learning_rate": 0.00017414147986699792, + "loss": 23.0, + "step": 3924 + }, + { + "epoch": 2.35594237695078, + "grad_norm": 0.0015983044868335128, + "learning_rate": 0.00017412880171685914, + "loss": 23.0, + "step": 3925 + }, + { + "epoch": 2.3565426170468187, + "grad_norm": 0.000568926683627069, + "learning_rate": 0.00017411612092125417, + "loss": 23.0, + "step": 3926 + }, + { + "epoch": 2.357142857142857, + "grad_norm": 0.0018514172406867146, + "learning_rate": 0.00017410343748063545, + "loss": 23.0, + "step": 3927 + }, + { + "epoch": 2.3577430972388957, + "grad_norm": 0.0015337765216827393, + "learning_rate": 0.00017409075139545568, + "loss": 23.0, + "step": 3928 + }, + { + "epoch": 2.358343337334934, + "grad_norm": 0.0009689840371720493, + "learning_rate": 0.00017407806266616754, + "loss": 23.0, + "step": 3929 + }, + { + "epoch": 2.3589435774309724, + "grad_norm": 0.002194127067923546, + "learning_rate": 0.00017406537129322394, + "loss": 23.0, + "step": 3930 + }, + { + "epoch": 2.359543817527011, + "grad_norm": 0.0015414162771776319, + "learning_rate": 0.00017405267727707774, + "loss": 23.0, + "step": 3931 + }, + { + "epoch": 2.3601440576230495, + "grad_norm": 0.00167924037668854, + "learning_rate": 0.000174039980618182, + "loss": 23.0, + "step": 3932 + }, + { + "epoch": 2.3607442977190876, + "grad_norm": 0.001501681748777628, + "learning_rate": 0.00017402728131698977, + "loss": 23.0, + "step": 3933 + }, + { + "epoch": 2.361344537815126, + "grad_norm": 0.0010487270774319768, + "learning_rate": 0.0001740145793739543, + "loss": 23.0, + "step": 3934 + }, + { + "epoch": 2.3619447779111646, + "grad_norm": 0.0017754529835656285, + "learning_rate": 0.0001740018747895289, + "loss": 23.0, + "step": 3935 + }, + { + "epoch": 2.3625450180072027, + "grad_norm": 0.00277283089235425, + "learning_rate": 0.0001739891675641669, + "loss": 23.0, + "step": 3936 + }, + { + "epoch": 2.3631452581032413, + "grad_norm": 0.0016696704551577568, + "learning_rate": 0.00017397645769832187, + "loss": 23.0, + "step": 3937 + }, + { + "epoch": 2.36374549819928, + "grad_norm": 0.0018485120963305235, + "learning_rate": 0.00017396374519244737, + "loss": 23.0, + "step": 3938 + }, + { + "epoch": 2.3643457382953184, + "grad_norm": 0.0054781329818069935, + "learning_rate": 0.000173951030046997, + "loss": 23.0, + "step": 3939 + }, + { + "epoch": 2.3649459783913565, + "grad_norm": 0.0019205682910978794, + "learning_rate": 0.00017393831226242466, + "loss": 23.0, + "step": 3940 + }, + { + "epoch": 2.365546218487395, + "grad_norm": 0.003620882984250784, + "learning_rate": 0.0001739255918391841, + "loss": 23.0, + "step": 3941 + }, + { + "epoch": 2.3661464585834335, + "grad_norm": 0.0012869867496192455, + "learning_rate": 0.00017391286877772933, + "loss": 23.0, + "step": 3942 + }, + { + "epoch": 2.3667466986794716, + "grad_norm": 0.002010358963161707, + "learning_rate": 0.00017390014307851442, + "loss": 23.0, + "step": 3943 + }, + { + "epoch": 2.36734693877551, + "grad_norm": 0.0021779662929475307, + "learning_rate": 0.0001738874147419935, + "loss": 23.0, + "step": 3944 + }, + { + "epoch": 2.3679471788715487, + "grad_norm": 0.0021605216898024082, + "learning_rate": 0.0001738746837686208, + "loss": 23.0, + "step": 3945 + }, + { + "epoch": 2.3685474189675872, + "grad_norm": 0.0006803186261095107, + "learning_rate": 0.00017386195015885064, + "loss": 23.0, + "step": 3946 + }, + { + "epoch": 2.3691476590636253, + "grad_norm": 0.0011268210364505649, + "learning_rate": 0.0001738492139131375, + "loss": 23.0, + "step": 3947 + }, + { + "epoch": 2.369747899159664, + "grad_norm": 0.0007069764542393386, + "learning_rate": 0.0001738364750319359, + "loss": 23.0, + "step": 3948 + }, + { + "epoch": 2.3703481392557024, + "grad_norm": 0.007430940866470337, + "learning_rate": 0.0001738237335157004, + "loss": 23.0, + "step": 3949 + }, + { + "epoch": 2.3709483793517405, + "grad_norm": 0.003919681068509817, + "learning_rate": 0.00017381098936488574, + "loss": 23.0, + "step": 3950 + }, + { + "epoch": 2.371548619447779, + "grad_norm": 0.0008578869164921343, + "learning_rate": 0.0001737982425799468, + "loss": 23.0, + "step": 3951 + }, + { + "epoch": 2.3721488595438176, + "grad_norm": 0.0018030635546892881, + "learning_rate": 0.00017378549316133835, + "loss": 23.0, + "step": 3952 + }, + { + "epoch": 2.372749099639856, + "grad_norm": 0.0018941910238936543, + "learning_rate": 0.00017377274110951543, + "loss": 23.0, + "step": 3953 + }, + { + "epoch": 2.3733493397358942, + "grad_norm": 0.0007153527112677693, + "learning_rate": 0.00017375998642493322, + "loss": 23.0, + "step": 3954 + }, + { + "epoch": 2.3739495798319328, + "grad_norm": 0.0009323756094090641, + "learning_rate": 0.00017374722910804677, + "loss": 23.0, + "step": 3955 + }, + { + "epoch": 2.3745498199279713, + "grad_norm": 0.0036982784513384104, + "learning_rate": 0.00017373446915931143, + "loss": 23.0, + "step": 3956 + }, + { + "epoch": 2.3751500600240094, + "grad_norm": 0.0014185566687956452, + "learning_rate": 0.00017372170657918256, + "loss": 23.0, + "step": 3957 + }, + { + "epoch": 2.375750300120048, + "grad_norm": 0.002080840989947319, + "learning_rate": 0.00017370894136811563, + "loss": 23.0, + "step": 3958 + }, + { + "epoch": 2.3763505402160865, + "grad_norm": 0.0006996280862949789, + "learning_rate": 0.00017369617352656618, + "loss": 23.0, + "step": 3959 + }, + { + "epoch": 2.376950780312125, + "grad_norm": 0.0018821045523509383, + "learning_rate": 0.00017368340305498985, + "loss": 23.0, + "step": 3960 + }, + { + "epoch": 2.377551020408163, + "grad_norm": 0.0013187207514420152, + "learning_rate": 0.0001736706299538424, + "loss": 23.0, + "step": 3961 + }, + { + "epoch": 2.3781512605042017, + "grad_norm": 0.001449449686333537, + "learning_rate": 0.0001736578542235797, + "loss": 23.0, + "step": 3962 + }, + { + "epoch": 2.37875150060024, + "grad_norm": 0.002266083611175418, + "learning_rate": 0.00017364507586465762, + "loss": 23.0, + "step": 3963 + }, + { + "epoch": 2.3793517406962783, + "grad_norm": 0.002582366345450282, + "learning_rate": 0.00017363229487753223, + "loss": 23.0, + "step": 3964 + }, + { + "epoch": 2.379951980792317, + "grad_norm": 0.0003937427536584437, + "learning_rate": 0.00017361951126265964, + "loss": 23.0, + "step": 3965 + }, + { + "epoch": 2.3805522208883554, + "grad_norm": 0.0017639965517446399, + "learning_rate": 0.00017360672502049606, + "loss": 23.0, + "step": 3966 + }, + { + "epoch": 2.381152460984394, + "grad_norm": 0.004503778647631407, + "learning_rate": 0.0001735939361514978, + "loss": 23.0, + "step": 3967 + }, + { + "epoch": 2.381752701080432, + "grad_norm": 0.0036358425859361887, + "learning_rate": 0.0001735811446561213, + "loss": 23.0, + "step": 3968 + }, + { + "epoch": 2.3823529411764706, + "grad_norm": 0.0023689253721386194, + "learning_rate": 0.00017356835053482299, + "loss": 23.0, + "step": 3969 + }, + { + "epoch": 2.382953181272509, + "grad_norm": 0.002466338686645031, + "learning_rate": 0.0001735555537880595, + "loss": 23.0, + "step": 3970 + }, + { + "epoch": 2.383553421368547, + "grad_norm": 0.0021039212588220835, + "learning_rate": 0.0001735427544162875, + "loss": 23.0, + "step": 3971 + }, + { + "epoch": 2.3841536614645857, + "grad_norm": 0.0027853043284267187, + "learning_rate": 0.00017352995241996374, + "loss": 23.0, + "step": 3972 + }, + { + "epoch": 2.3847539015606243, + "grad_norm": 0.0033636244479566813, + "learning_rate": 0.00017351714779954516, + "loss": 23.0, + "step": 3973 + }, + { + "epoch": 2.385354141656663, + "grad_norm": 0.002105874475091696, + "learning_rate": 0.00017350434055548867, + "loss": 23.0, + "step": 3974 + }, + { + "epoch": 2.385954381752701, + "grad_norm": 0.0009028427302837372, + "learning_rate": 0.00017349153068825133, + "loss": 23.0, + "step": 3975 + }, + { + "epoch": 2.3865546218487395, + "grad_norm": 0.0030058857519179583, + "learning_rate": 0.00017347871819829033, + "loss": 23.0, + "step": 3976 + }, + { + "epoch": 2.387154861944778, + "grad_norm": 0.0017001151572912931, + "learning_rate": 0.00017346590308606286, + "loss": 23.0, + "step": 3977 + }, + { + "epoch": 2.387755102040816, + "grad_norm": 0.0008253180421888828, + "learning_rate": 0.0001734530853520263, + "loss": 23.0, + "step": 3978 + }, + { + "epoch": 2.3883553421368546, + "grad_norm": 0.0019861324690282345, + "learning_rate": 0.00017344026499663807, + "loss": 23.0, + "step": 3979 + }, + { + "epoch": 2.388955582232893, + "grad_norm": 0.002510624472051859, + "learning_rate": 0.0001734274420203557, + "loss": 23.0, + "step": 3980 + }, + { + "epoch": 2.3895558223289317, + "grad_norm": 0.0011581958970054984, + "learning_rate": 0.00017341461642363676, + "loss": 23.0, + "step": 3981 + }, + { + "epoch": 2.39015606242497, + "grad_norm": 0.002012391574680805, + "learning_rate": 0.00017340178820693906, + "loss": 23.0, + "step": 3982 + }, + { + "epoch": 2.3907563025210083, + "grad_norm": 0.002154824323952198, + "learning_rate": 0.0001733889573707203, + "loss": 23.0, + "step": 3983 + }, + { + "epoch": 2.391356542617047, + "grad_norm": 0.0009000212885439396, + "learning_rate": 0.00017337612391543847, + "loss": 23.0, + "step": 3984 + }, + { + "epoch": 2.3919567827130854, + "grad_norm": 0.003189866431057453, + "learning_rate": 0.0001733632878415515, + "loss": 23.0, + "step": 3985 + }, + { + "epoch": 2.3925570228091235, + "grad_norm": 0.0023710643872618675, + "learning_rate": 0.00017335044914951753, + "loss": 23.0, + "step": 3986 + }, + { + "epoch": 2.393157262905162, + "grad_norm": 0.0008908773888833821, + "learning_rate": 0.00017333760783979466, + "loss": 23.0, + "step": 3987 + }, + { + "epoch": 2.3937575030012006, + "grad_norm": 0.0027625104412436485, + "learning_rate": 0.00017332476391284128, + "loss": 23.0, + "step": 3988 + }, + { + "epoch": 2.394357743097239, + "grad_norm": 0.002077084966003895, + "learning_rate": 0.00017331191736911564, + "loss": 23.0, + "step": 3989 + }, + { + "epoch": 2.3949579831932772, + "grad_norm": 0.0020401370711624622, + "learning_rate": 0.00017329906820907627, + "loss": 23.0, + "step": 3990 + }, + { + "epoch": 2.395558223289316, + "grad_norm": 0.00204063905403018, + "learning_rate": 0.0001732862164331817, + "loss": 23.0, + "step": 3991 + }, + { + "epoch": 2.3961584633853543, + "grad_norm": 0.0018513710238039494, + "learning_rate": 0.00017327336204189055, + "loss": 23.0, + "step": 3992 + }, + { + "epoch": 2.3967587034813924, + "grad_norm": 0.0022755179088562727, + "learning_rate": 0.00017326050503566162, + "loss": 23.0, + "step": 3993 + }, + { + "epoch": 2.397358943577431, + "grad_norm": 0.0027569227386265993, + "learning_rate": 0.00017324764541495373, + "loss": 23.0, + "step": 3994 + }, + { + "epoch": 2.3979591836734695, + "grad_norm": 0.0031000536400824785, + "learning_rate": 0.00017323478318022578, + "loss": 23.0, + "step": 3995 + }, + { + "epoch": 2.398559423769508, + "grad_norm": 0.0016642792616039515, + "learning_rate": 0.00017322191833193674, + "loss": 23.0, + "step": 3996 + }, + { + "epoch": 2.399159663865546, + "grad_norm": 0.0015426806639879942, + "learning_rate": 0.00017320905087054585, + "loss": 23.0, + "step": 3997 + }, + { + "epoch": 2.3997599039615847, + "grad_norm": 0.0016276773530989885, + "learning_rate": 0.00017319618079651226, + "loss": 23.0, + "step": 3998 + }, + { + "epoch": 2.400360144057623, + "grad_norm": 0.00199799588881433, + "learning_rate": 0.0001731833081102952, + "loss": 23.0, + "step": 3999 + }, + { + "epoch": 2.4009603841536613, + "grad_norm": 0.001323958276771009, + "learning_rate": 0.00017317043281235418, + "loss": 23.0, + "step": 4000 + }, + { + "epoch": 2.4009603841536613, + "eval_loss": 11.5, + "eval_runtime": 5.4819, + "eval_samples_per_second": 255.931, + "eval_steps_per_second": 32.105, + "step": 4000 } ], "logging_steps": 1, @@ -21053,7 +28061,7 @@ "early_stopping_threshold": 0.0 }, "attributes": { - "early_stopping_patience_counter": 2 + "early_stopping_patience_counter": 3 } }, "TrainerControl": { @@ -21062,12 +28070,12 @@ "should_evaluate": false, "should_log": false, "should_save": true, - "should_training_stop": false + "should_training_stop": true }, "attributes": {} } }, - "total_flos": 60013876285440.0, + "total_flos": 80021001510912.0, "train_batch_size": 8, "trial_name": null, "trial_params": null