diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,21033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9106085900743663, + "eval_steps": 500, + "global_step": 3000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00030353619669145547, + "grad_norm": 9.667811393737793, + "learning_rate": 1e-05, + "loss": 5.0202, + "step": 1 + }, + { + "epoch": 0.0006070723933829109, + "grad_norm": 10.303421974182129, + "learning_rate": 2e-05, + "loss": 4.7469, + "step": 2 + }, + { + "epoch": 0.0009106085900743664, + "grad_norm": 7.488056182861328, + "learning_rate": 3e-05, + "loss": 5.0105, + "step": 3 + }, + { + "epoch": 0.0012141447867658219, + "grad_norm": 4.885837078094482, + "learning_rate": 4e-05, + "loss": 4.3945, + "step": 4 + }, + { + "epoch": 0.0015176809834572772, + "grad_norm": 3.793656587600708, + "learning_rate": 5e-05, + "loss": 4.0574, + "step": 5 + }, + { + "epoch": 0.0018212171801487327, + "grad_norm": 3.9249916076660156, + "learning_rate": 6e-05, + "loss": 3.8179, + "step": 6 + }, + { + "epoch": 0.002124753376840188, + "grad_norm": 3.4937145709991455, + "learning_rate": 7e-05, + "loss": 3.5297, + "step": 7 + }, + { + "epoch": 0.0024282895735316438, + "grad_norm": 2.499041795730591, + "learning_rate": 8e-05, + "loss": 3.15, + "step": 8 + }, + { + "epoch": 0.002731825770223099, + "grad_norm": 2.0781290531158447, + "learning_rate": 9e-05, + "loss": 2.8658, + "step": 9 + }, + { + "epoch": 0.0030353619669145544, + "grad_norm": 2.0124764442443848, + "learning_rate": 0.0001, + "loss": 2.6826, + "step": 10 + }, + { + "epoch": 0.00333889816360601, + "grad_norm": 1.4209256172180176, + "learning_rate": 9.99949377341298e-05, + "loss": 2.5608, + "step": 11 + }, + { + "epoch": 0.0036424343602974654, + "grad_norm": 3.176084041595459, + "learning_rate": 9.99898754682596e-05, + "loss": 2.2416, + "step": 12 + }, + { + "epoch": 0.003945970556988921, + "grad_norm": 1.4457614421844482, + "learning_rate": 9.998481320238939e-05, + "loss": 2.1925, + "step": 13 + }, + { + "epoch": 0.004249506753680376, + "grad_norm": 1.3989348411560059, + "learning_rate": 9.997975093651918e-05, + "loss": 2.2165, + "step": 14 + }, + { + "epoch": 0.004553042950371832, + "grad_norm": 1.0647027492523193, + "learning_rate": 9.997468867064899e-05, + "loss": 2.3486, + "step": 15 + }, + { + "epoch": 0.0048565791470632875, + "grad_norm": 1.0246940851211548, + "learning_rate": 9.996962640477879e-05, + "loss": 2.19, + "step": 16 + }, + { + "epoch": 0.005160115343754742, + "grad_norm": 1.029646873474121, + "learning_rate": 9.996456413890858e-05, + "loss": 2.4052, + "step": 17 + }, + { + "epoch": 0.005463651540446198, + "grad_norm": 1.322654128074646, + "learning_rate": 9.995950187303838e-05, + "loss": 2.1927, + "step": 18 + }, + { + "epoch": 0.005767187737137654, + "grad_norm": 2.061326026916504, + "learning_rate": 9.995443960716817e-05, + "loss": 2.4574, + "step": 19 + }, + { + "epoch": 0.006070723933829109, + "grad_norm": 1.1343607902526855, + "learning_rate": 9.994937734129797e-05, + "loss": 1.9598, + "step": 20 + }, + { + "epoch": 0.0063742601305205645, + "grad_norm": 1.13712477684021, + "learning_rate": 9.994431507542776e-05, + "loss": 2.8643, + "step": 21 + }, + { + "epoch": 0.00667779632721202, + "grad_norm": 0.8220421671867371, + "learning_rate": 9.993925280955756e-05, + "loss": 2.0474, + "step": 22 + }, + { + "epoch": 0.006981332523903475, + 
"grad_norm": 0.8233473300933838, + "learning_rate": 9.993419054368735e-05, + "loss": 2.3597, + "step": 23 + }, + { + "epoch": 0.007284868720594931, + "grad_norm": 0.8661925196647644, + "learning_rate": 9.992912827781716e-05, + "loss": 2.2163, + "step": 24 + }, + { + "epoch": 0.007588404917286387, + "grad_norm": 0.7995729446411133, + "learning_rate": 9.992406601194695e-05, + "loss": 1.8051, + "step": 25 + }, + { + "epoch": 0.007891941113977842, + "grad_norm": 0.810165286064148, + "learning_rate": 9.991900374607675e-05, + "loss": 1.9189, + "step": 26 + }, + { + "epoch": 0.008195477310669297, + "grad_norm": 0.8240752220153809, + "learning_rate": 9.991394148020654e-05, + "loss": 1.7, + "step": 27 + }, + { + "epoch": 0.008499013507360752, + "grad_norm": 1.0160635709762573, + "learning_rate": 9.990887921433634e-05, + "loss": 2.2964, + "step": 28 + }, + { + "epoch": 0.008802549704052209, + "grad_norm": 0.794966995716095, + "learning_rate": 9.990381694846613e-05, + "loss": 1.7333, + "step": 29 + }, + { + "epoch": 0.009106085900743664, + "grad_norm": 0.5594797134399414, + "learning_rate": 9.989875468259593e-05, + "loss": 2.0925, + "step": 30 + }, + { + "epoch": 0.009409622097435118, + "grad_norm": 0.8100740909576416, + "learning_rate": 9.989369241672572e-05, + "loss": 2.1218, + "step": 31 + }, + { + "epoch": 0.009713158294126575, + "grad_norm": 0.7057996392250061, + "learning_rate": 9.988863015085552e-05, + "loss": 2.005, + "step": 32 + }, + { + "epoch": 0.01001669449081803, + "grad_norm": 0.8970999121665955, + "learning_rate": 9.988356788498533e-05, + "loss": 2.2414, + "step": 33 + }, + { + "epoch": 0.010320230687509485, + "grad_norm": 0.6290627717971802, + "learning_rate": 9.987850561911512e-05, + "loss": 2.2422, + "step": 34 + }, + { + "epoch": 0.010623766884200941, + "grad_norm": 0.5665722489356995, + "learning_rate": 9.987344335324492e-05, + "loss": 1.9342, + "step": 35 + }, + { + "epoch": 0.010927303080892396, + "grad_norm": 0.5792561173439026, + "learning_rate": 9.986838108737472e-05, + "loss": 1.8733, + "step": 36 + }, + { + "epoch": 0.011230839277583851, + "grad_norm": 0.5264159440994263, + "learning_rate": 9.986331882150452e-05, + "loss": 2.1739, + "step": 37 + }, + { + "epoch": 0.011534375474275308, + "grad_norm": 0.5069584250450134, + "learning_rate": 9.985825655563431e-05, + "loss": 1.6235, + "step": 38 + }, + { + "epoch": 0.011837911670966763, + "grad_norm": 0.7689110636711121, + "learning_rate": 9.985319428976411e-05, + "loss": 1.711, + "step": 39 + }, + { + "epoch": 0.012141447867658217, + "grad_norm": 0.7001574635505676, + "learning_rate": 9.98481320238939e-05, + "loss": 1.651, + "step": 40 + }, + { + "epoch": 0.012444984064349674, + "grad_norm": 0.5615801811218262, + "learning_rate": 9.98430697580237e-05, + "loss": 2.128, + "step": 41 + }, + { + "epoch": 0.012748520261041129, + "grad_norm": 0.8766308426856995, + "learning_rate": 9.983800749215349e-05, + "loss": 2.4421, + "step": 42 + }, + { + "epoch": 0.013052056457732584, + "grad_norm": 0.704547107219696, + "learning_rate": 9.983294522628329e-05, + "loss": 1.6921, + "step": 43 + }, + { + "epoch": 0.01335559265442404, + "grad_norm": 0.5749143362045288, + "learning_rate": 9.982788296041308e-05, + "loss": 2.0173, + "step": 44 + }, + { + "epoch": 0.013659128851115495, + "grad_norm": 0.7929263710975647, + "learning_rate": 9.982282069454289e-05, + "loss": 2.1755, + "step": 45 + }, + { + "epoch": 0.01396266504780695, + "grad_norm": 1.6391934156417847, + "learning_rate": 9.981775842867269e-05, + "loss": 2.4995, + "step": 46 + }, + { + 
"epoch": 0.014266201244498407, + "grad_norm": 0.49616461992263794, + "learning_rate": 9.981269616280248e-05, + "loss": 2.3363, + "step": 47 + }, + { + "epoch": 0.014569737441189862, + "grad_norm": 0.614272952079773, + "learning_rate": 9.980763389693227e-05, + "loss": 2.0277, + "step": 48 + }, + { + "epoch": 0.014873273637881317, + "grad_norm": 0.6181132197380066, + "learning_rate": 9.980257163106207e-05, + "loss": 2.2867, + "step": 49 + }, + { + "epoch": 0.015176809834572773, + "grad_norm": 0.5342630743980408, + "learning_rate": 9.979750936519186e-05, + "loss": 1.7314, + "step": 50 + }, + { + "epoch": 0.015480346031264228, + "grad_norm": 0.4582519233226776, + "learning_rate": 9.979244709932166e-05, + "loss": 1.9893, + "step": 51 + }, + { + "epoch": 0.015783882227955685, + "grad_norm": 0.5448606014251709, + "learning_rate": 9.978738483345145e-05, + "loss": 2.3266, + "step": 52 + }, + { + "epoch": 0.01608741842464714, + "grad_norm": 1.0823545455932617, + "learning_rate": 9.978232256758125e-05, + "loss": 2.1919, + "step": 53 + }, + { + "epoch": 0.016390954621338594, + "grad_norm": 0.5506464838981628, + "learning_rate": 9.977726030171106e-05, + "loss": 2.0735, + "step": 54 + }, + { + "epoch": 0.01669449081803005, + "grad_norm": 0.568626344203949, + "learning_rate": 9.977219803584085e-05, + "loss": 2.051, + "step": 55 + }, + { + "epoch": 0.016998027014721504, + "grad_norm": 0.512907087802887, + "learning_rate": 9.976713576997065e-05, + "loss": 1.6473, + "step": 56 + }, + { + "epoch": 0.017301563211412962, + "grad_norm": 0.5541898012161255, + "learning_rate": 9.976207350410044e-05, + "loss": 1.8184, + "step": 57 + }, + { + "epoch": 0.017605099408104417, + "grad_norm": 0.5083638429641724, + "learning_rate": 9.975701123823024e-05, + "loss": 1.7573, + "step": 58 + }, + { + "epoch": 0.017908635604795872, + "grad_norm": 0.4722895920276642, + "learning_rate": 9.975194897236003e-05, + "loss": 2.0311, + "step": 59 + }, + { + "epoch": 0.018212171801487327, + "grad_norm": 0.5068002343177795, + "learning_rate": 9.974688670648983e-05, + "loss": 2.1245, + "step": 60 + }, + { + "epoch": 0.018515707998178782, + "grad_norm": 0.5726852416992188, + "learning_rate": 9.974182444061962e-05, + "loss": 2.1017, + "step": 61 + }, + { + "epoch": 0.018819244194870237, + "grad_norm": 0.5240160226821899, + "learning_rate": 9.973676217474942e-05, + "loss": 2.2665, + "step": 62 + }, + { + "epoch": 0.019122780391561695, + "grad_norm": 0.4728144705295563, + "learning_rate": 9.973169990887921e-05, + "loss": 2.0537, + "step": 63 + }, + { + "epoch": 0.01942631658825315, + "grad_norm": 0.47115418314933777, + "learning_rate": 9.972663764300902e-05, + "loss": 1.2815, + "step": 64 + }, + { + "epoch": 0.019729852784944605, + "grad_norm": 0.7070208191871643, + "learning_rate": 9.972157537713881e-05, + "loss": 1.8514, + "step": 65 + }, + { + "epoch": 0.02003338898163606, + "grad_norm": 0.529069185256958, + "learning_rate": 9.971651311126861e-05, + "loss": 1.7602, + "step": 66 + }, + { + "epoch": 0.020336925178327515, + "grad_norm": 0.7532087564468384, + "learning_rate": 9.97114508453984e-05, + "loss": 2.2168, + "step": 67 + }, + { + "epoch": 0.02064046137501897, + "grad_norm": 0.5654622912406921, + "learning_rate": 9.97063885795282e-05, + "loss": 1.9634, + "step": 68 + }, + { + "epoch": 0.020943997571710428, + "grad_norm": 0.701452910900116, + "learning_rate": 9.970132631365799e-05, + "loss": 2.044, + "step": 69 + }, + { + "epoch": 0.021247533768401883, + "grad_norm": 0.5750812888145447, + "learning_rate": 9.969626404778779e-05, + 
"loss": 1.8015, + "step": 70 + }, + { + "epoch": 0.021551069965093338, + "grad_norm": 0.49930402636528015, + "learning_rate": 9.969120178191758e-05, + "loss": 1.7998, + "step": 71 + }, + { + "epoch": 0.021854606161784793, + "grad_norm": 0.4348014295101166, + "learning_rate": 9.968613951604738e-05, + "loss": 1.9959, + "step": 72 + }, + { + "epoch": 0.022158142358476247, + "grad_norm": 0.5268503427505493, + "learning_rate": 9.968107725017719e-05, + "loss": 1.8497, + "step": 73 + }, + { + "epoch": 0.022461678555167702, + "grad_norm": 0.578822135925293, + "learning_rate": 9.967601498430698e-05, + "loss": 2.3277, + "step": 74 + }, + { + "epoch": 0.02276521475185916, + "grad_norm": 0.52215975522995, + "learning_rate": 9.967095271843677e-05, + "loss": 2.1179, + "step": 75 + }, + { + "epoch": 0.023068750948550616, + "grad_norm": 0.4557477533817291, + "learning_rate": 9.966589045256657e-05, + "loss": 2.0132, + "step": 76 + }, + { + "epoch": 0.02337228714524207, + "grad_norm": 0.5032123327255249, + "learning_rate": 9.966082818669638e-05, + "loss": 1.8608, + "step": 77 + }, + { + "epoch": 0.023675823341933525, + "grad_norm": 0.42689865827560425, + "learning_rate": 9.965576592082617e-05, + "loss": 2.0437, + "step": 78 + }, + { + "epoch": 0.02397935953862498, + "grad_norm": 0.44310206174850464, + "learning_rate": 9.965070365495597e-05, + "loss": 2.1222, + "step": 79 + }, + { + "epoch": 0.024282895735316435, + "grad_norm": 0.4377008378505707, + "learning_rate": 9.964564138908576e-05, + "loss": 2.0418, + "step": 80 + }, + { + "epoch": 0.024586431932007893, + "grad_norm": 0.35174912214279175, + "learning_rate": 9.964057912321556e-05, + "loss": 1.6931, + "step": 81 + }, + { + "epoch": 0.024889968128699348, + "grad_norm": 0.47877687215805054, + "learning_rate": 9.963551685734535e-05, + "loss": 1.7049, + "step": 82 + }, + { + "epoch": 0.025193504325390803, + "grad_norm": 0.4063829183578491, + "learning_rate": 9.963045459147515e-05, + "loss": 1.8611, + "step": 83 + }, + { + "epoch": 0.025497040522082258, + "grad_norm": 0.4149170219898224, + "learning_rate": 9.962539232560496e-05, + "loss": 1.9439, + "step": 84 + }, + { + "epoch": 0.025800576718773713, + "grad_norm": 0.4882602393627167, + "learning_rate": 9.962033005973475e-05, + "loss": 1.5723, + "step": 85 + }, + { + "epoch": 0.026104112915465168, + "grad_norm": 0.4600992202758789, + "learning_rate": 9.961526779386454e-05, + "loss": 2.0142, + "step": 86 + }, + { + "epoch": 0.026407649112156626, + "grad_norm": 0.43366697430610657, + "learning_rate": 9.961020552799434e-05, + "loss": 1.9175, + "step": 87 + }, + { + "epoch": 0.02671118530884808, + "grad_norm": 0.501487135887146, + "learning_rate": 9.960514326212413e-05, + "loss": 1.5043, + "step": 88 + }, + { + "epoch": 0.027014721505539536, + "grad_norm": 0.43821993470191956, + "learning_rate": 9.960008099625393e-05, + "loss": 1.8622, + "step": 89 + }, + { + "epoch": 0.02731825770223099, + "grad_norm": 0.4433805048465729, + "learning_rate": 9.959501873038372e-05, + "loss": 1.9459, + "step": 90 + }, + { + "epoch": 0.027621793898922446, + "grad_norm": 0.4686216115951538, + "learning_rate": 9.958995646451352e-05, + "loss": 1.7405, + "step": 91 + }, + { + "epoch": 0.0279253300956139, + "grad_norm": 0.48586198687553406, + "learning_rate": 9.958489419864331e-05, + "loss": 2.2233, + "step": 92 + }, + { + "epoch": 0.02822886629230536, + "grad_norm": 0.4018734097480774, + "learning_rate": 9.957983193277312e-05, + "loss": 2.0027, + "step": 93 + }, + { + "epoch": 0.028532402488996814, + "grad_norm": 0.4996435344219208, + 
"learning_rate": 9.957476966690292e-05, + "loss": 1.5949, + "step": 94 + }, + { + "epoch": 0.02883593868568827, + "grad_norm": 0.45447826385498047, + "learning_rate": 9.956970740103271e-05, + "loss": 1.7636, + "step": 95 + }, + { + "epoch": 0.029139474882379723, + "grad_norm": 0.4209904372692108, + "learning_rate": 9.95646451351625e-05, + "loss": 1.7523, + "step": 96 + }, + { + "epoch": 0.029443011079071178, + "grad_norm": 0.3740164637565613, + "learning_rate": 9.95595828692923e-05, + "loss": 1.9136, + "step": 97 + }, + { + "epoch": 0.029746547275762633, + "grad_norm": 0.4169963598251343, + "learning_rate": 9.95545206034221e-05, + "loss": 1.9136, + "step": 98 + }, + { + "epoch": 0.03005008347245409, + "grad_norm": 0.4683006703853607, + "learning_rate": 9.954945833755189e-05, + "loss": 2.0657, + "step": 99 + }, + { + "epoch": 0.030353619669145546, + "grad_norm": 0.4508633017539978, + "learning_rate": 9.954439607168169e-05, + "loss": 2.1099, + "step": 100 + }, + { + "epoch": 0.030657155865837, + "grad_norm": 0.4136218726634979, + "learning_rate": 9.953933380581148e-05, + "loss": 2.0183, + "step": 101 + }, + { + "epoch": 0.030960692062528456, + "grad_norm": 0.44510790705680847, + "learning_rate": 9.953427153994127e-05, + "loss": 1.9307, + "step": 102 + }, + { + "epoch": 0.031264228259219914, + "grad_norm": 0.3713892698287964, + "learning_rate": 9.952920927407108e-05, + "loss": 1.7017, + "step": 103 + }, + { + "epoch": 0.03156776445591137, + "grad_norm": 0.47902294993400574, + "learning_rate": 9.952414700820088e-05, + "loss": 2.1172, + "step": 104 + }, + { + "epoch": 0.031871300652602824, + "grad_norm": 0.4492317736148834, + "learning_rate": 9.951908474233067e-05, + "loss": 1.9752, + "step": 105 + }, + { + "epoch": 0.03217483684929428, + "grad_norm": 0.4096255302429199, + "learning_rate": 9.951402247646047e-05, + "loss": 1.5511, + "step": 106 + }, + { + "epoch": 0.032478373045985734, + "grad_norm": 0.39630818367004395, + "learning_rate": 9.950896021059026e-05, + "loss": 2.11, + "step": 107 + }, + { + "epoch": 0.03278190924267719, + "grad_norm": 0.42648032307624817, + "learning_rate": 9.950389794472006e-05, + "loss": 2.1784, + "step": 108 + }, + { + "epoch": 0.033085445439368644, + "grad_norm": 0.4814178943634033, + "learning_rate": 9.949883567884985e-05, + "loss": 1.955, + "step": 109 + }, + { + "epoch": 0.0333889816360601, + "grad_norm": 0.41600191593170166, + "learning_rate": 9.949377341297965e-05, + "loss": 1.9163, + "step": 110 + }, + { + "epoch": 0.03369251783275155, + "grad_norm": 0.4610773026943207, + "learning_rate": 9.948871114710944e-05, + "loss": 1.7934, + "step": 111 + }, + { + "epoch": 0.03399605402944301, + "grad_norm": 0.43061718344688416, + "learning_rate": 9.948364888123925e-05, + "loss": 1.9278, + "step": 112 + }, + { + "epoch": 0.03429959022613446, + "grad_norm": 0.3907497227191925, + "learning_rate": 9.947858661536904e-05, + "loss": 1.996, + "step": 113 + }, + { + "epoch": 0.034603126422825925, + "grad_norm": 0.3984166383743286, + "learning_rate": 9.947352434949884e-05, + "loss": 1.5936, + "step": 114 + }, + { + "epoch": 0.03490666261951738, + "grad_norm": 0.43406423926353455, + "learning_rate": 9.946846208362863e-05, + "loss": 1.8866, + "step": 115 + }, + { + "epoch": 0.035210198816208835, + "grad_norm": 0.45913639664649963, + "learning_rate": 9.946339981775843e-05, + "loss": 1.972, + "step": 116 + }, + { + "epoch": 0.03551373501290029, + "grad_norm": 0.42077311873435974, + "learning_rate": 9.945833755188822e-05, + "loss": 2.0081, + "step": 117 + }, + { + "epoch": 
0.035817271209591744, + "grad_norm": 0.41479435563087463, + "learning_rate": 9.945327528601802e-05, + "loss": 2.0096, + "step": 118 + }, + { + "epoch": 0.0361208074062832, + "grad_norm": 0.35669025778770447, + "learning_rate": 9.944821302014781e-05, + "loss": 2.0074, + "step": 119 + }, + { + "epoch": 0.036424343602974654, + "grad_norm": 0.4088069796562195, + "learning_rate": 9.944315075427761e-05, + "loss": 1.817, + "step": 120 + }, + { + "epoch": 0.03672787979966611, + "grad_norm": 0.49982163310050964, + "learning_rate": 9.943808848840742e-05, + "loss": 1.9218, + "step": 121 + }, + { + "epoch": 0.037031415996357564, + "grad_norm": 0.39924055337905884, + "learning_rate": 9.943302622253721e-05, + "loss": 2.2463, + "step": 122 + }, + { + "epoch": 0.03733495219304902, + "grad_norm": 0.40462177991867065, + "learning_rate": 9.942796395666702e-05, + "loss": 2.0844, + "step": 123 + }, + { + "epoch": 0.037638488389740474, + "grad_norm": 0.43440741300582886, + "learning_rate": 9.942290169079681e-05, + "loss": 1.8808, + "step": 124 + }, + { + "epoch": 0.03794202458643193, + "grad_norm": 0.4029730260372162, + "learning_rate": 9.941783942492661e-05, + "loss": 1.9427, + "step": 125 + }, + { + "epoch": 0.03824556078312339, + "grad_norm": 0.7807103395462036, + "learning_rate": 9.94127771590564e-05, + "loss": 1.9072, + "step": 126 + }, + { + "epoch": 0.038549096979814845, + "grad_norm": 0.5021561980247498, + "learning_rate": 9.94077148931862e-05, + "loss": 2.0582, + "step": 127 + }, + { + "epoch": 0.0388526331765063, + "grad_norm": 0.5161197781562805, + "learning_rate": 9.9402652627316e-05, + "loss": 1.9861, + "step": 128 + }, + { + "epoch": 0.039156169373197755, + "grad_norm": 0.5553935766220093, + "learning_rate": 9.939759036144579e-05, + "loss": 2.1893, + "step": 129 + }, + { + "epoch": 0.03945970556988921, + "grad_norm": 0.4241655170917511, + "learning_rate": 9.939252809557558e-05, + "loss": 1.9722, + "step": 130 + }, + { + "epoch": 0.039763241766580665, + "grad_norm": 0.43290001153945923, + "learning_rate": 9.938746582970538e-05, + "loss": 1.5364, + "step": 131 + }, + { + "epoch": 0.04006677796327212, + "grad_norm": 0.40089091658592224, + "learning_rate": 9.938240356383519e-05, + "loss": 1.9686, + "step": 132 + }, + { + "epoch": 0.040370314159963575, + "grad_norm": 0.4152032434940338, + "learning_rate": 9.937734129796498e-05, + "loss": 1.913, + "step": 133 + }, + { + "epoch": 0.04067385035665503, + "grad_norm": 0.4443211555480957, + "learning_rate": 9.937227903209478e-05, + "loss": 2.2354, + "step": 134 + }, + { + "epoch": 0.040977386553346484, + "grad_norm": 0.41355323791503906, + "learning_rate": 9.936721676622457e-05, + "loss": 2.1055, + "step": 135 + }, + { + "epoch": 0.04128092275003794, + "grad_norm": 0.5837479829788208, + "learning_rate": 9.936215450035437e-05, + "loss": 1.9085, + "step": 136 + }, + { + "epoch": 0.041584458946729394, + "grad_norm": 0.40269389748573303, + "learning_rate": 9.935709223448416e-05, + "loss": 2.0368, + "step": 137 + }, + { + "epoch": 0.041887995143420856, + "grad_norm": 0.5898969769477844, + "learning_rate": 9.935202996861396e-05, + "loss": 1.7933, + "step": 138 + }, + { + "epoch": 0.04219153134011231, + "grad_norm": 0.41117680072784424, + "learning_rate": 9.934696770274375e-05, + "loss": 1.7452, + "step": 139 + }, + { + "epoch": 0.042495067536803766, + "grad_norm": 0.5090368390083313, + "learning_rate": 9.934190543687354e-05, + "loss": 2.0141, + "step": 140 + }, + { + "epoch": 0.04279860373349522, + "grad_norm": 0.4821307957172394, + "learning_rate": 
9.933684317100334e-05, + "loss": 1.9443, + "step": 141 + }, + { + "epoch": 0.043102139930186675, + "grad_norm": 0.41939428448677063, + "learning_rate": 9.933178090513315e-05, + "loss": 1.7401, + "step": 142 + }, + { + "epoch": 0.04340567612687813, + "grad_norm": 0.4531096816062927, + "learning_rate": 9.932671863926294e-05, + "loss": 1.9944, + "step": 143 + }, + { + "epoch": 0.043709212323569585, + "grad_norm": 0.44440799951553345, + "learning_rate": 9.932165637339274e-05, + "loss": 1.9648, + "step": 144 + }, + { + "epoch": 0.04401274852026104, + "grad_norm": 0.36847150325775146, + "learning_rate": 9.931659410752253e-05, + "loss": 2.0638, + "step": 145 + }, + { + "epoch": 0.044316284716952495, + "grad_norm": 0.6394171118736267, + "learning_rate": 9.931153184165233e-05, + "loss": 1.9476, + "step": 146 + }, + { + "epoch": 0.04461982091364395, + "grad_norm": 0.41597506403923035, + "learning_rate": 9.930646957578212e-05, + "loss": 1.535, + "step": 147 + }, + { + "epoch": 0.044923357110335405, + "grad_norm": 0.5597077012062073, + "learning_rate": 9.930140730991192e-05, + "loss": 1.6826, + "step": 148 + }, + { + "epoch": 0.045226893307026866, + "grad_norm": 0.5532084703445435, + "learning_rate": 9.929634504404171e-05, + "loss": 1.8063, + "step": 149 + }, + { + "epoch": 0.04553042950371832, + "grad_norm": 0.467339426279068, + "learning_rate": 9.92912827781715e-05, + "loss": 2.017, + "step": 150 + }, + { + "epoch": 0.045833965700409776, + "grad_norm": 0.4054040312767029, + "learning_rate": 9.928622051230131e-05, + "loss": 1.7582, + "step": 151 + }, + { + "epoch": 0.04613750189710123, + "grad_norm": 1.2743823528289795, + "learning_rate": 9.928115824643111e-05, + "loss": 2.0202, + "step": 152 + }, + { + "epoch": 0.046441038093792686, + "grad_norm": 0.4357397258281708, + "learning_rate": 9.92760959805609e-05, + "loss": 1.8788, + "step": 153 + }, + { + "epoch": 0.04674457429048414, + "grad_norm": 2.8793208599090576, + "learning_rate": 9.92710337146907e-05, + "loss": 2.1204, + "step": 154 + }, + { + "epoch": 0.047048110487175596, + "grad_norm": 0.9585952162742615, + "learning_rate": 9.92659714488205e-05, + "loss": 1.9356, + "step": 155 + }, + { + "epoch": 0.04735164668386705, + "grad_norm": 0.7857603430747986, + "learning_rate": 9.926090918295029e-05, + "loss": 1.9097, + "step": 156 + }, + { + "epoch": 0.047655182880558505, + "grad_norm": 0.5259221792221069, + "learning_rate": 9.925584691708008e-05, + "loss": 2.1589, + "step": 157 + }, + { + "epoch": 0.04795871907724996, + "grad_norm": 2.793253183364868, + "learning_rate": 9.925078465120988e-05, + "loss": 1.7202, + "step": 158 + }, + { + "epoch": 0.048262255273941415, + "grad_norm": 0.4432888627052307, + "learning_rate": 9.924572238533967e-05, + "loss": 1.9898, + "step": 159 + }, + { + "epoch": 0.04856579147063287, + "grad_norm": 0.4347291588783264, + "learning_rate": 9.924066011946948e-05, + "loss": 1.8142, + "step": 160 + }, + { + "epoch": 0.04886932766732433, + "grad_norm": 5.273514747619629, + "learning_rate": 9.923559785359928e-05, + "loss": 1.8665, + "step": 161 + }, + { + "epoch": 0.04917286386401579, + "grad_norm": 0.47988301515579224, + "learning_rate": 9.923053558772907e-05, + "loss": 1.9439, + "step": 162 + }, + { + "epoch": 0.04947640006070724, + "grad_norm": 0.3584117293357849, + "learning_rate": 9.922547332185887e-05, + "loss": 1.8109, + "step": 163 + }, + { + "epoch": 0.049779936257398696, + "grad_norm": 0.4074074923992157, + "learning_rate": 9.922041105598866e-05, + "loss": 2.1056, + "step": 164 + }, + { + "epoch": 0.05008347245409015, + 
"grad_norm": 3.159336566925049, + "learning_rate": 9.921534879011846e-05, + "loss": 1.8672, + "step": 165 + }, + { + "epoch": 0.050387008650781606, + "grad_norm": 0.38132309913635254, + "learning_rate": 9.921028652424826e-05, + "loss": 1.8423, + "step": 166 + }, + { + "epoch": 0.05069054484747306, + "grad_norm": 0.39241936802864075, + "learning_rate": 9.920522425837806e-05, + "loss": 1.5949, + "step": 167 + }, + { + "epoch": 0.050994081044164516, + "grad_norm": 0.38212037086486816, + "learning_rate": 9.920016199250785e-05, + "loss": 1.9669, + "step": 168 + }, + { + "epoch": 0.05129761724085597, + "grad_norm": 0.5353955030441284, + "learning_rate": 9.919509972663765e-05, + "loss": 2.1806, + "step": 169 + }, + { + "epoch": 0.051601153437547426, + "grad_norm": 0.4129483699798584, + "learning_rate": 9.919003746076744e-05, + "loss": 1.8858, + "step": 170 + }, + { + "epoch": 0.05190468963423888, + "grad_norm": 0.3832380771636963, + "learning_rate": 9.918497519489725e-05, + "loss": 2.0321, + "step": 171 + }, + { + "epoch": 0.052208225830930335, + "grad_norm": 0.4078863859176636, + "learning_rate": 9.917991292902705e-05, + "loss": 1.6213, + "step": 172 + }, + { + "epoch": 0.0525117620276218, + "grad_norm": 0.38865014910697937, + "learning_rate": 9.917485066315684e-05, + "loss": 2.0052, + "step": 173 + }, + { + "epoch": 0.05281529822431325, + "grad_norm": 0.4339440166950226, + "learning_rate": 9.916978839728664e-05, + "loss": 2.2405, + "step": 174 + }, + { + "epoch": 0.05311883442100471, + "grad_norm": 0.42063045501708984, + "learning_rate": 9.916472613141643e-05, + "loss": 1.6529, + "step": 175 + }, + { + "epoch": 0.05342237061769616, + "grad_norm": 0.4765849709510803, + "learning_rate": 9.915966386554623e-05, + "loss": 1.9645, + "step": 176 + }, + { + "epoch": 0.05372590681438762, + "grad_norm": 0.41431936621665955, + "learning_rate": 9.915460159967602e-05, + "loss": 1.9709, + "step": 177 + }, + { + "epoch": 0.05402944301107907, + "grad_norm": 0.3591434359550476, + "learning_rate": 9.914953933380581e-05, + "loss": 1.685, + "step": 178 + }, + { + "epoch": 0.054332979207770526, + "grad_norm": 0.45483240485191345, + "learning_rate": 9.914447706793561e-05, + "loss": 1.9362, + "step": 179 + }, + { + "epoch": 0.05463651540446198, + "grad_norm": 0.5468000173568726, + "learning_rate": 9.91394148020654e-05, + "loss": 1.6984, + "step": 180 + }, + { + "epoch": 0.054940051601153436, + "grad_norm": 0.4057190716266632, + "learning_rate": 9.913435253619521e-05, + "loss": 1.9887, + "step": 181 + }, + { + "epoch": 0.05524358779784489, + "grad_norm": 0.383211612701416, + "learning_rate": 9.912929027032501e-05, + "loss": 1.7825, + "step": 182 + }, + { + "epoch": 0.055547123994536346, + "grad_norm": 0.3480004668235779, + "learning_rate": 9.91242280044548e-05, + "loss": 1.8721, + "step": 183 + }, + { + "epoch": 0.0558506601912278, + "grad_norm": 0.47680413722991943, + "learning_rate": 9.91191657385846e-05, + "loss": 1.8113, + "step": 184 + }, + { + "epoch": 0.05615419638791926, + "grad_norm": 0.37727096676826477, + "learning_rate": 9.911410347271439e-05, + "loss": 1.7398, + "step": 185 + }, + { + "epoch": 0.05645773258461072, + "grad_norm": 0.47738176584243774, + "learning_rate": 9.910904120684419e-05, + "loss": 1.4651, + "step": 186 + }, + { + "epoch": 0.05676126878130217, + "grad_norm": 0.44533729553222656, + "learning_rate": 9.910397894097398e-05, + "loss": 1.5697, + "step": 187 + }, + { + "epoch": 0.05706480497799363, + "grad_norm": 0.45051974058151245, + "learning_rate": 9.909891667510378e-05, + "loss": 2.1577, 
+ "step": 188 + }, + { + "epoch": 0.05736834117468508, + "grad_norm": 0.4709470272064209, + "learning_rate": 9.909385440923357e-05, + "loss": 2.0486, + "step": 189 + }, + { + "epoch": 0.05767187737137654, + "grad_norm": 0.4063846170902252, + "learning_rate": 9.908879214336338e-05, + "loss": 1.5453, + "step": 190 + }, + { + "epoch": 0.05797541356806799, + "grad_norm": 0.374362587928772, + "learning_rate": 9.908372987749317e-05, + "loss": 1.5611, + "step": 191 + }, + { + "epoch": 0.05827894976475945, + "grad_norm": 0.4852111041545868, + "learning_rate": 9.907866761162297e-05, + "loss": 1.6234, + "step": 192 + }, + { + "epoch": 0.0585824859614509, + "grad_norm": 0.6863122582435608, + "learning_rate": 9.907360534575276e-05, + "loss": 2.1612, + "step": 193 + }, + { + "epoch": 0.058886022158142357, + "grad_norm": 0.6040588021278381, + "learning_rate": 9.906854307988256e-05, + "loss": 2.1092, + "step": 194 + }, + { + "epoch": 0.05918955835483381, + "grad_norm": 0.4148467779159546, + "learning_rate": 9.906348081401235e-05, + "loss": 2.1108, + "step": 195 + }, + { + "epoch": 0.059493094551525266, + "grad_norm": 0.36098209023475647, + "learning_rate": 9.905841854814215e-05, + "loss": 2.0002, + "step": 196 + }, + { + "epoch": 0.05979663074821673, + "grad_norm": 0.42360183596611023, + "learning_rate": 9.905335628227194e-05, + "loss": 2.3124, + "step": 197 + }, + { + "epoch": 0.06010016694490818, + "grad_norm": 0.3650914430618286, + "learning_rate": 9.904829401640174e-05, + "loss": 1.8778, + "step": 198 + }, + { + "epoch": 0.06040370314159964, + "grad_norm": 0.392995148897171, + "learning_rate": 9.904323175053155e-05, + "loss": 2.16, + "step": 199 + }, + { + "epoch": 0.06070723933829109, + "grad_norm": 0.46390387415885925, + "learning_rate": 9.903816948466134e-05, + "loss": 1.8695, + "step": 200 + }, + { + "epoch": 0.06101077553498255, + "grad_norm": 0.3954870402812958, + "learning_rate": 9.903310721879114e-05, + "loss": 1.9233, + "step": 201 + }, + { + "epoch": 0.061314311731674, + "grad_norm": 0.3650193214416504, + "learning_rate": 9.902804495292093e-05, + "loss": 2.2504, + "step": 202 + }, + { + "epoch": 0.06161784792836546, + "grad_norm": 0.3582104742527008, + "learning_rate": 9.902298268705073e-05, + "loss": 1.9303, + "step": 203 + }, + { + "epoch": 0.06192138412505691, + "grad_norm": 0.35688868165016174, + "learning_rate": 9.901792042118052e-05, + "loss": 1.7078, + "step": 204 + }, + { + "epoch": 0.06222492032174837, + "grad_norm": 0.3666802942752838, + "learning_rate": 9.901285815531031e-05, + "loss": 1.941, + "step": 205 + }, + { + "epoch": 0.06252845651843983, + "grad_norm": 0.42375093698501587, + "learning_rate": 9.900779588944011e-05, + "loss": 2.0858, + "step": 206 + }, + { + "epoch": 0.06283199271513128, + "grad_norm": 0.3913770318031311, + "learning_rate": 9.90027336235699e-05, + "loss": 2.1423, + "step": 207 + }, + { + "epoch": 0.06313552891182274, + "grad_norm": 0.4101809859275818, + "learning_rate": 9.89976713576997e-05, + "loss": 2.0497, + "step": 208 + }, + { + "epoch": 0.06343906510851419, + "grad_norm": 0.3696439564228058, + "learning_rate": 9.899260909182951e-05, + "loss": 1.9692, + "step": 209 + }, + { + "epoch": 0.06374260130520565, + "grad_norm": 0.3725574016571045, + "learning_rate": 9.89875468259593e-05, + "loss": 2.2053, + "step": 210 + }, + { + "epoch": 0.0640461375018971, + "grad_norm": 0.4886903166770935, + "learning_rate": 9.898248456008911e-05, + "loss": 1.8981, + "step": 211 + }, + { + "epoch": 0.06434967369858856, + "grad_norm": 0.4423249661922455, + "learning_rate": 
9.89774222942189e-05, + "loss": 1.9058, + "step": 212 + }, + { + "epoch": 0.06465320989528, + "grad_norm": 0.4045765697956085, + "learning_rate": 9.89723600283487e-05, + "loss": 1.8056, + "step": 213 + }, + { + "epoch": 0.06495674609197147, + "grad_norm": 0.43866047263145447, + "learning_rate": 9.89672977624785e-05, + "loss": 1.6315, + "step": 214 + }, + { + "epoch": 0.06526028228866293, + "grad_norm": 0.524714469909668, + "learning_rate": 9.896223549660829e-05, + "loss": 2.0156, + "step": 215 + }, + { + "epoch": 0.06556381848535438, + "grad_norm": 0.3752996325492859, + "learning_rate": 9.895717323073808e-05, + "loss": 2.2768, + "step": 216 + }, + { + "epoch": 0.06586735468204584, + "grad_norm": 0.4371670186519623, + "learning_rate": 9.895211096486788e-05, + "loss": 2.0755, + "step": 217 + }, + { + "epoch": 0.06617089087873729, + "grad_norm": 0.3751063644886017, + "learning_rate": 9.894704869899767e-05, + "loss": 2.2451, + "step": 218 + }, + { + "epoch": 0.06647442707542875, + "grad_norm": 0.6649600267410278, + "learning_rate": 9.894198643312747e-05, + "loss": 1.9835, + "step": 219 + }, + { + "epoch": 0.0667779632721202, + "grad_norm": 0.3941735625267029, + "learning_rate": 9.893692416725728e-05, + "loss": 2.0203, + "step": 220 + }, + { + "epoch": 0.06708149946881166, + "grad_norm": 0.41888293623924255, + "learning_rate": 9.893186190138707e-05, + "loss": 1.7572, + "step": 221 + }, + { + "epoch": 0.0673850356655031, + "grad_norm": 0.4820149838924408, + "learning_rate": 9.892679963551687e-05, + "loss": 2.0591, + "step": 222 + }, + { + "epoch": 0.06768857186219457, + "grad_norm": 0.3516736626625061, + "learning_rate": 9.892173736964666e-05, + "loss": 1.9398, + "step": 223 + }, + { + "epoch": 0.06799210805888602, + "grad_norm": 0.3873218894004822, + "learning_rate": 9.891667510377646e-05, + "loss": 1.6389, + "step": 224 + }, + { + "epoch": 0.06829564425557748, + "grad_norm": 0.3793487846851349, + "learning_rate": 9.891161283790625e-05, + "loss": 2.0075, + "step": 225 + }, + { + "epoch": 0.06859918045226893, + "grad_norm": 0.38987675309181213, + "learning_rate": 9.890655057203605e-05, + "loss": 2.0903, + "step": 226 + }, + { + "epoch": 0.06890271664896039, + "grad_norm": 0.4293549358844757, + "learning_rate": 9.890148830616584e-05, + "loss": 2.2099, + "step": 227 + }, + { + "epoch": 0.06920625284565185, + "grad_norm": 0.39895692467689514, + "learning_rate": 9.889642604029564e-05, + "loss": 1.8615, + "step": 228 + }, + { + "epoch": 0.0695097890423433, + "grad_norm": 0.4543936252593994, + "learning_rate": 9.889136377442544e-05, + "loss": 2.0828, + "step": 229 + }, + { + "epoch": 0.06981332523903476, + "grad_norm": 0.448477566242218, + "learning_rate": 9.888630150855524e-05, + "loss": 1.5524, + "step": 230 + }, + { + "epoch": 0.07011686143572621, + "grad_norm": 0.428975373506546, + "learning_rate": 9.888123924268503e-05, + "loss": 1.3828, + "step": 231 + }, + { + "epoch": 0.07042039763241767, + "grad_norm": 0.42287349700927734, + "learning_rate": 9.887617697681483e-05, + "loss": 2.096, + "step": 232 + }, + { + "epoch": 0.07072393382910912, + "grad_norm": 0.43614649772644043, + "learning_rate": 9.887111471094462e-05, + "loss": 1.8238, + "step": 233 + }, + { + "epoch": 0.07102747002580058, + "grad_norm": 0.47309553623199463, + "learning_rate": 9.886605244507442e-05, + "loss": 2.3526, + "step": 234 + }, + { + "epoch": 0.07133100622249203, + "grad_norm": 0.9558483362197876, + "learning_rate": 9.886099017920421e-05, + "loss": 1.9816, + "step": 235 + }, + { + "epoch": 0.07163454241918349, + "grad_norm": 
0.3529858887195587, + "learning_rate": 9.885592791333401e-05, + "loss": 2.0314, + "step": 236 + }, + { + "epoch": 0.07193807861587494, + "grad_norm": 0.37652599811553955, + "learning_rate": 9.88508656474638e-05, + "loss": 1.9381, + "step": 237 + }, + { + "epoch": 0.0722416148125664, + "grad_norm": 0.40783143043518066, + "learning_rate": 9.884580338159361e-05, + "loss": 1.966, + "step": 238 + }, + { + "epoch": 0.07254515100925786, + "grad_norm": 0.4160328805446625, + "learning_rate": 9.88407411157234e-05, + "loss": 1.8176, + "step": 239 + }, + { + "epoch": 0.07284868720594931, + "grad_norm": 0.4397304952144623, + "learning_rate": 9.88356788498532e-05, + "loss": 1.6766, + "step": 240 + }, + { + "epoch": 0.07315222340264077, + "grad_norm": 0.42549702525138855, + "learning_rate": 9.8830616583983e-05, + "loss": 2.1176, + "step": 241 + }, + { + "epoch": 0.07345575959933222, + "grad_norm": 0.3747939169406891, + "learning_rate": 9.882555431811279e-05, + "loss": 1.5494, + "step": 242 + }, + { + "epoch": 0.07375929579602368, + "grad_norm": 3.4551990032196045, + "learning_rate": 9.882049205224258e-05, + "loss": 2.0336, + "step": 243 + }, + { + "epoch": 0.07406283199271513, + "grad_norm": 1.5632964372634888, + "learning_rate": 9.881542978637238e-05, + "loss": 1.7452, + "step": 244 + }, + { + "epoch": 0.07436636818940659, + "grad_norm": 0.41575855016708374, + "learning_rate": 9.881036752050217e-05, + "loss": 2.0243, + "step": 245 + }, + { + "epoch": 0.07466990438609804, + "grad_norm": 0.44168713688850403, + "learning_rate": 9.880530525463197e-05, + "loss": 2.0022, + "step": 246 + }, + { + "epoch": 0.0749734405827895, + "grad_norm": 0.46640321612358093, + "learning_rate": 9.880024298876176e-05, + "loss": 1.555, + "step": 247 + }, + { + "epoch": 0.07527697677948095, + "grad_norm": 0.3622835576534271, + "learning_rate": 9.879518072289157e-05, + "loss": 1.876, + "step": 248 + }, + { + "epoch": 0.07558051297617241, + "grad_norm": 0.6277987957000732, + "learning_rate": 9.879011845702137e-05, + "loss": 2.2753, + "step": 249 + }, + { + "epoch": 0.07588404917286386, + "grad_norm": 0.40246644616127014, + "learning_rate": 9.878505619115116e-05, + "loss": 1.5991, + "step": 250 + }, + { + "epoch": 0.07618758536955532, + "grad_norm": 0.38388529419898987, + "learning_rate": 9.877999392528096e-05, + "loss": 1.9226, + "step": 251 + }, + { + "epoch": 0.07649112156624678, + "grad_norm": 0.39985090494155884, + "learning_rate": 9.877493165941075e-05, + "loss": 2.0722, + "step": 252 + }, + { + "epoch": 0.07679465776293823, + "grad_norm": 0.3872128427028656, + "learning_rate": 9.876986939354055e-05, + "loss": 1.9132, + "step": 253 + }, + { + "epoch": 0.07709819395962969, + "grad_norm": 0.3665171265602112, + "learning_rate": 9.876480712767034e-05, + "loss": 1.6244, + "step": 254 + }, + { + "epoch": 0.07740173015632114, + "grad_norm": 0.4011310040950775, + "learning_rate": 9.875974486180015e-05, + "loss": 2.1289, + "step": 255 + }, + { + "epoch": 0.0777052663530126, + "grad_norm": 0.35013166069984436, + "learning_rate": 9.875468259592994e-05, + "loss": 1.9738, + "step": 256 + }, + { + "epoch": 0.07800880254970405, + "grad_norm": 0.48468607664108276, + "learning_rate": 9.874962033005974e-05, + "loss": 2.1368, + "step": 257 + }, + { + "epoch": 0.07831233874639551, + "grad_norm": 0.5015551447868347, + "learning_rate": 9.874455806418953e-05, + "loss": 2.1218, + "step": 258 + }, + { + "epoch": 0.07861587494308696, + "grad_norm": 0.41915133595466614, + "learning_rate": 9.873949579831934e-05, + "loss": 2.0052, + "step": 259 + }, + { 
+ "epoch": 0.07891941113977842, + "grad_norm": 0.4414760172367096, + "learning_rate": 9.873443353244914e-05, + "loss": 1.7249, + "step": 260 + }, + { + "epoch": 0.07922294733646987, + "grad_norm": 0.47259169816970825, + "learning_rate": 9.872937126657893e-05, + "loss": 2.1041, + "step": 261 + }, + { + "epoch": 0.07952648353316133, + "grad_norm": 0.3689124882221222, + "learning_rate": 9.872430900070873e-05, + "loss": 1.8956, + "step": 262 + }, + { + "epoch": 0.07983001972985279, + "grad_norm": 0.3948320150375366, + "learning_rate": 9.871924673483852e-05, + "loss": 1.9211, + "step": 263 + }, + { + "epoch": 0.08013355592654424, + "grad_norm": 0.4235248267650604, + "learning_rate": 9.871418446896832e-05, + "loss": 1.7115, + "step": 264 + }, + { + "epoch": 0.0804370921232357, + "grad_norm": 0.48399198055267334, + "learning_rate": 9.870912220309811e-05, + "loss": 1.77, + "step": 265 + }, + { + "epoch": 0.08074062831992715, + "grad_norm": 0.34047526121139526, + "learning_rate": 9.87040599372279e-05, + "loss": 1.7189, + "step": 266 + }, + { + "epoch": 0.08104416451661861, + "grad_norm": 0.47203269600868225, + "learning_rate": 9.86989976713577e-05, + "loss": 1.7674, + "step": 267 + }, + { + "epoch": 0.08134770071331006, + "grad_norm": 0.3752756118774414, + "learning_rate": 9.869393540548751e-05, + "loss": 1.8716, + "step": 268 + }, + { + "epoch": 0.08165123691000152, + "grad_norm": 0.3437153697013855, + "learning_rate": 9.86888731396173e-05, + "loss": 1.9824, + "step": 269 + }, + { + "epoch": 0.08195477310669297, + "grad_norm": 0.4854094088077545, + "learning_rate": 9.86838108737471e-05, + "loss": 1.4385, + "step": 270 + }, + { + "epoch": 0.08225830930338443, + "grad_norm": 0.37674829363822937, + "learning_rate": 9.86787486078769e-05, + "loss": 1.7877, + "step": 271 + }, + { + "epoch": 0.08256184550007588, + "grad_norm": 0.4215140640735626, + "learning_rate": 9.867368634200669e-05, + "loss": 2.1854, + "step": 272 + }, + { + "epoch": 0.08286538169676734, + "grad_norm": 0.3680359423160553, + "learning_rate": 9.866862407613648e-05, + "loss": 2.104, + "step": 273 + }, + { + "epoch": 0.08316891789345879, + "grad_norm": 0.4195649325847626, + "learning_rate": 9.866356181026628e-05, + "loss": 1.469, + "step": 274 + }, + { + "epoch": 0.08347245409015025, + "grad_norm": 0.480640709400177, + "learning_rate": 9.865849954439607e-05, + "loss": 1.8329, + "step": 275 + }, + { + "epoch": 0.08377599028684171, + "grad_norm": 0.34760695695877075, + "learning_rate": 9.865343727852587e-05, + "loss": 1.9495, + "step": 276 + }, + { + "epoch": 0.08407952648353316, + "grad_norm": 0.3803161680698395, + "learning_rate": 9.864837501265568e-05, + "loss": 1.9294, + "step": 277 + }, + { + "epoch": 0.08438306268022462, + "grad_norm": 0.41739675402641296, + "learning_rate": 9.864331274678547e-05, + "loss": 2.059, + "step": 278 + }, + { + "epoch": 0.08468659887691607, + "grad_norm": 0.3807448744773865, + "learning_rate": 9.863825048091527e-05, + "loss": 1.9741, + "step": 279 + }, + { + "epoch": 0.08499013507360753, + "grad_norm": 0.3610997200012207, + "learning_rate": 9.863318821504506e-05, + "loss": 1.9815, + "step": 280 + }, + { + "epoch": 0.08529367127029898, + "grad_norm": 0.3797460198402405, + "learning_rate": 9.862812594917485e-05, + "loss": 2.1394, + "step": 281 + }, + { + "epoch": 0.08559720746699044, + "grad_norm": 0.3922887444496155, + "learning_rate": 9.862306368330465e-05, + "loss": 2.184, + "step": 282 + }, + { + "epoch": 0.08590074366368189, + "grad_norm": 0.38251930475234985, + "learning_rate": 9.861800141743444e-05, 
+ "loss": 2.0186, + "step": 283 + }, + { + "epoch": 0.08620427986037335, + "grad_norm": 0.35968562960624695, + "learning_rate": 9.861293915156424e-05, + "loss": 2.0, + "step": 284 + }, + { + "epoch": 0.0865078160570648, + "grad_norm": 0.37149590253829956, + "learning_rate": 9.860787688569403e-05, + "loss": 1.7941, + "step": 285 + }, + { + "epoch": 0.08681135225375626, + "grad_norm": 0.36890628933906555, + "learning_rate": 9.860281461982383e-05, + "loss": 1.906, + "step": 286 + }, + { + "epoch": 0.08711488845044772, + "grad_norm": 0.36025917530059814, + "learning_rate": 9.859775235395364e-05, + "loss": 1.9655, + "step": 287 + }, + { + "epoch": 0.08741842464713917, + "grad_norm": 0.3704364001750946, + "learning_rate": 9.859269008808343e-05, + "loss": 1.8657, + "step": 288 + }, + { + "epoch": 0.08772196084383063, + "grad_norm": 0.5996513962745667, + "learning_rate": 9.858762782221323e-05, + "loss": 1.7448, + "step": 289 + }, + { + "epoch": 0.08802549704052208, + "grad_norm": 0.3615630269050598, + "learning_rate": 9.858256555634302e-05, + "loss": 1.9007, + "step": 290 + }, + { + "epoch": 0.08832903323721354, + "grad_norm": 0.36014246940612793, + "learning_rate": 9.857750329047282e-05, + "loss": 1.927, + "step": 291 + }, + { + "epoch": 0.08863256943390499, + "grad_norm": 0.5038754940032959, + "learning_rate": 9.857244102460261e-05, + "loss": 1.6613, + "step": 292 + }, + { + "epoch": 0.08893610563059645, + "grad_norm": 0.3880213797092438, + "learning_rate": 9.85673787587324e-05, + "loss": 1.5563, + "step": 293 + }, + { + "epoch": 0.0892396418272879, + "grad_norm": 0.43225082755088806, + "learning_rate": 9.85623164928622e-05, + "loss": 1.5534, + "step": 294 + }, + { + "epoch": 0.08954317802397936, + "grad_norm": 0.44342055916786194, + "learning_rate": 9.8557254226992e-05, + "loss": 1.6211, + "step": 295 + }, + { + "epoch": 0.08984671422067081, + "grad_norm": 0.42114123702049255, + "learning_rate": 9.85521919611218e-05, + "loss": 1.9731, + "step": 296 + }, + { + "epoch": 0.09015025041736227, + "grad_norm": 0.43151113390922546, + "learning_rate": 9.85471296952516e-05, + "loss": 1.9519, + "step": 297 + }, + { + "epoch": 0.09045378661405373, + "grad_norm": 0.38092517852783203, + "learning_rate": 9.85420674293814e-05, + "loss": 2.0973, + "step": 298 + }, + { + "epoch": 0.09075732281074518, + "grad_norm": 0.40729570388793945, + "learning_rate": 9.853700516351119e-05, + "loss": 1.4395, + "step": 299 + }, + { + "epoch": 0.09106085900743664, + "grad_norm": 0.3631846308708191, + "learning_rate": 9.8531942897641e-05, + "loss": 1.2255, + "step": 300 + }, + { + "epoch": 0.09136439520412809, + "grad_norm": 0.37764397263526917, + "learning_rate": 9.852688063177079e-05, + "loss": 1.9941, + "step": 301 + }, + { + "epoch": 0.09166793140081955, + "grad_norm": 0.3755379319190979, + "learning_rate": 9.852181836590059e-05, + "loss": 1.7154, + "step": 302 + }, + { + "epoch": 0.091971467597511, + "grad_norm": 0.39003854990005493, + "learning_rate": 9.851675610003038e-05, + "loss": 1.928, + "step": 303 + }, + { + "epoch": 0.09227500379420246, + "grad_norm": 0.39592432975769043, + "learning_rate": 9.851169383416018e-05, + "loss": 2.1913, + "step": 304 + }, + { + "epoch": 0.09257853999089391, + "grad_norm": 0.4315894842147827, + "learning_rate": 9.850663156828997e-05, + "loss": 1.6432, + "step": 305 + }, + { + "epoch": 0.09288207618758537, + "grad_norm": 0.4103511571884155, + "learning_rate": 9.850156930241977e-05, + "loss": 1.9944, + "step": 306 + }, + { + "epoch": 0.09318561238427682, + "grad_norm": 0.4236547350883484, + 
"learning_rate": 9.849650703654957e-05, + "loss": 1.875, + "step": 307 + }, + { + "epoch": 0.09348914858096828, + "grad_norm": 0.41012468934059143, + "learning_rate": 9.849144477067937e-05, + "loss": 2.008, + "step": 308 + }, + { + "epoch": 0.09379268477765973, + "grad_norm": 0.35538622736930847, + "learning_rate": 9.848638250480916e-05, + "loss": 1.7322, + "step": 309 + }, + { + "epoch": 0.09409622097435119, + "grad_norm": 0.3874755799770355, + "learning_rate": 9.848132023893896e-05, + "loss": 1.9818, + "step": 310 + }, + { + "epoch": 0.09439975717104265, + "grad_norm": 0.42444977164268494, + "learning_rate": 9.847625797306875e-05, + "loss": 2.1606, + "step": 311 + }, + { + "epoch": 0.0947032933677341, + "grad_norm": 0.5855305194854736, + "learning_rate": 9.847119570719855e-05, + "loss": 1.4887, + "step": 312 + }, + { + "epoch": 0.09500682956442556, + "grad_norm": 0.35223227739334106, + "learning_rate": 9.846613344132834e-05, + "loss": 2.0025, + "step": 313 + }, + { + "epoch": 0.09531036576111701, + "grad_norm": 0.4013148844242096, + "learning_rate": 9.846107117545814e-05, + "loss": 1.9702, + "step": 314 + }, + { + "epoch": 0.09561390195780847, + "grad_norm": 0.5038349032402039, + "learning_rate": 9.845600890958793e-05, + "loss": 2.1532, + "step": 315 + }, + { + "epoch": 0.09591743815449992, + "grad_norm": 0.4826093018054962, + "learning_rate": 9.845094664371774e-05, + "loss": 2.0118, + "step": 316 + }, + { + "epoch": 0.09622097435119138, + "grad_norm": 0.41135913133621216, + "learning_rate": 9.844588437784754e-05, + "loss": 2.0707, + "step": 317 + }, + { + "epoch": 0.09652451054788283, + "grad_norm": 0.4353053569793701, + "learning_rate": 9.844082211197733e-05, + "loss": 2.104, + "step": 318 + }, + { + "epoch": 0.09682804674457429, + "grad_norm": 0.4192908704280853, + "learning_rate": 9.843575984610712e-05, + "loss": 1.9489, + "step": 319 + }, + { + "epoch": 0.09713158294126574, + "grad_norm": 0.380562424659729, + "learning_rate": 9.843069758023692e-05, + "loss": 1.3602, + "step": 320 + }, + { + "epoch": 0.0974351191379572, + "grad_norm": 0.3394995331764221, + "learning_rate": 9.842563531436671e-05, + "loss": 2.2161, + "step": 321 + }, + { + "epoch": 0.09773865533464866, + "grad_norm": 0.3419237434864044, + "learning_rate": 9.842057304849651e-05, + "loss": 1.7146, + "step": 322 + }, + { + "epoch": 0.09804219153134011, + "grad_norm": 0.3590264618396759, + "learning_rate": 9.84155107826263e-05, + "loss": 1.8654, + "step": 323 + }, + { + "epoch": 0.09834572772803157, + "grad_norm": 0.40006300806999207, + "learning_rate": 9.84104485167561e-05, + "loss": 1.5787, + "step": 324 + }, + { + "epoch": 0.09864926392472302, + "grad_norm": 0.33313074707984924, + "learning_rate": 9.84053862508859e-05, + "loss": 1.8653, + "step": 325 + }, + { + "epoch": 0.09895280012141448, + "grad_norm": 0.39681655168533325, + "learning_rate": 9.84003239850157e-05, + "loss": 2.178, + "step": 326 + }, + { + "epoch": 0.09925633631810593, + "grad_norm": 0.41945868730545044, + "learning_rate": 9.83952617191455e-05, + "loss": 1.8324, + "step": 327 + }, + { + "epoch": 0.09955987251479739, + "grad_norm": 0.3957304060459137, + "learning_rate": 9.839019945327529e-05, + "loss": 1.6468, + "step": 328 + }, + { + "epoch": 0.09986340871148884, + "grad_norm": 0.35814937949180603, + "learning_rate": 9.838513718740509e-05, + "loss": 1.6492, + "step": 329 + }, + { + "epoch": 0.1001669449081803, + "grad_norm": 0.38410916924476624, + "learning_rate": 9.838007492153488e-05, + "loss": 1.7223, + "step": 330 + }, + { + "epoch": 
0.10047048110487175, + "grad_norm": 0.38490885496139526, + "learning_rate": 9.837501265566468e-05, + "loss": 2.0166, + "step": 331 + }, + { + "epoch": 0.10077401730156321, + "grad_norm": 0.38943415880203247, + "learning_rate": 9.836995038979447e-05, + "loss": 1.371, + "step": 332 + }, + { + "epoch": 0.10107755349825466, + "grad_norm": 0.39741018414497375, + "learning_rate": 9.836488812392427e-05, + "loss": 1.6233, + "step": 333 + }, + { + "epoch": 0.10138108969494612, + "grad_norm": 0.4663957357406616, + "learning_rate": 9.835982585805406e-05, + "loss": 1.746, + "step": 334 + }, + { + "epoch": 0.10168462589163758, + "grad_norm": 0.37118905782699585, + "learning_rate": 9.835476359218387e-05, + "loss": 1.9684, + "step": 335 + }, + { + "epoch": 0.10198816208832903, + "grad_norm": 0.40275588631629944, + "learning_rate": 9.834970132631366e-05, + "loss": 1.9551, + "step": 336 + }, + { + "epoch": 0.1022916982850205, + "grad_norm": 0.4336283206939697, + "learning_rate": 9.834463906044346e-05, + "loss": 2.0711, + "step": 337 + }, + { + "epoch": 0.10259523448171194, + "grad_norm": 0.35735735297203064, + "learning_rate": 9.833957679457325e-05, + "loss": 2.1397, + "step": 338 + }, + { + "epoch": 0.1028987706784034, + "grad_norm": 0.37825390696525574, + "learning_rate": 9.833451452870305e-05, + "loss": 1.7494, + "step": 339 + }, + { + "epoch": 0.10320230687509485, + "grad_norm": 0.3384961783885956, + "learning_rate": 9.832945226283284e-05, + "loss": 2.0197, + "step": 340 + }, + { + "epoch": 0.10350584307178631, + "grad_norm": 0.46276888251304626, + "learning_rate": 9.832438999696264e-05, + "loss": 1.797, + "step": 341 + }, + { + "epoch": 0.10380937926847776, + "grad_norm": 0.3685421347618103, + "learning_rate": 9.831932773109243e-05, + "loss": 1.9301, + "step": 342 + }, + { + "epoch": 0.10411291546516922, + "grad_norm": 0.38931936025619507, + "learning_rate": 9.831426546522223e-05, + "loss": 1.9623, + "step": 343 + }, + { + "epoch": 0.10441645166186067, + "grad_norm": 0.46678805351257324, + "learning_rate": 9.830920319935204e-05, + "loss": 1.6708, + "step": 344 + }, + { + "epoch": 0.10471998785855213, + "grad_norm": 0.4199204444885254, + "learning_rate": 9.830414093348183e-05, + "loss": 1.8014, + "step": 345 + }, + { + "epoch": 0.1050235240552436, + "grad_norm": 0.41024506092071533, + "learning_rate": 9.829907866761164e-05, + "loss": 1.8829, + "step": 346 + }, + { + "epoch": 0.10532706025193504, + "grad_norm": 0.5271286368370056, + "learning_rate": 9.829401640174143e-05, + "loss": 1.7796, + "step": 347 + }, + { + "epoch": 0.1056305964486265, + "grad_norm": 0.3593878448009491, + "learning_rate": 9.828895413587123e-05, + "loss": 2.0697, + "step": 348 + }, + { + "epoch": 0.10593413264531795, + "grad_norm": 0.44404372572898865, + "learning_rate": 9.828389187000102e-05, + "loss": 2.3235, + "step": 349 + }, + { + "epoch": 0.10623766884200941, + "grad_norm": 0.4072231650352478, + "learning_rate": 9.827882960413082e-05, + "loss": 1.5391, + "step": 350 + }, + { + "epoch": 0.10654120503870086, + "grad_norm": 0.3924303352832794, + "learning_rate": 9.827376733826061e-05, + "loss": 2.0649, + "step": 351 + }, + { + "epoch": 0.10684474123539232, + "grad_norm": 0.3815264105796814, + "learning_rate": 9.826870507239041e-05, + "loss": 1.5821, + "step": 352 + }, + { + "epoch": 0.10714827743208377, + "grad_norm": 0.40832409262657166, + "learning_rate": 9.82636428065202e-05, + "loss": 2.1135, + "step": 353 + }, + { + "epoch": 0.10745181362877523, + "grad_norm": 0.40270155668258667, + "learning_rate": 9.825858054065e-05, + 
"loss": 1.6561, + "step": 354 + }, + { + "epoch": 0.10775534982546668, + "grad_norm": 0.38295283913612366, + "learning_rate": 9.82535182747798e-05, + "loss": 1.8938, + "step": 355 + }, + { + "epoch": 0.10805888602215814, + "grad_norm": 0.41975417733192444, + "learning_rate": 9.82484560089096e-05, + "loss": 1.8605, + "step": 356 + }, + { + "epoch": 0.10836242221884959, + "grad_norm": 0.41388946771621704, + "learning_rate": 9.82433937430394e-05, + "loss": 1.812, + "step": 357 + }, + { + "epoch": 0.10866595841554105, + "grad_norm": 0.3470607101917267, + "learning_rate": 9.823833147716919e-05, + "loss": 2.1914, + "step": 358 + }, + { + "epoch": 0.10896949461223251, + "grad_norm": 0.4417155385017395, + "learning_rate": 9.823326921129898e-05, + "loss": 1.7644, + "step": 359 + }, + { + "epoch": 0.10927303080892396, + "grad_norm": 0.33910539746284485, + "learning_rate": 9.822820694542878e-05, + "loss": 1.8821, + "step": 360 + }, + { + "epoch": 0.10957656700561542, + "grad_norm": 0.36742356419563293, + "learning_rate": 9.822314467955857e-05, + "loss": 1.9684, + "step": 361 + }, + { + "epoch": 0.10988010320230687, + "grad_norm": 0.407844603061676, + "learning_rate": 9.821808241368837e-05, + "loss": 1.8797, + "step": 362 + }, + { + "epoch": 0.11018363939899833, + "grad_norm": 0.4090898036956787, + "learning_rate": 9.821302014781816e-05, + "loss": 1.8401, + "step": 363 + }, + { + "epoch": 0.11048717559568978, + "grad_norm": 0.3852720260620117, + "learning_rate": 9.820795788194796e-05, + "loss": 1.6887, + "step": 364 + }, + { + "epoch": 0.11079071179238124, + "grad_norm": 0.4147186875343323, + "learning_rate": 9.820289561607777e-05, + "loss": 1.7263, + "step": 365 + }, + { + "epoch": 0.11109424798907269, + "grad_norm": 0.7032086849212646, + "learning_rate": 9.819783335020756e-05, + "loss": 1.5382, + "step": 366 + }, + { + "epoch": 0.11139778418576415, + "grad_norm": 0.3547534644603729, + "learning_rate": 9.819277108433736e-05, + "loss": 1.5988, + "step": 367 + }, + { + "epoch": 0.1117013203824556, + "grad_norm": 0.45878785848617554, + "learning_rate": 9.818770881846715e-05, + "loss": 2.2467, + "step": 368 + }, + { + "epoch": 0.11200485657914706, + "grad_norm": 0.39183077216148376, + "learning_rate": 9.818264655259695e-05, + "loss": 1.848, + "step": 369 + }, + { + "epoch": 0.11230839277583853, + "grad_norm": 0.3735283315181732, + "learning_rate": 9.817758428672674e-05, + "loss": 1.6925, + "step": 370 + }, + { + "epoch": 0.11261192897252997, + "grad_norm": 0.3878265917301178, + "learning_rate": 9.817252202085654e-05, + "loss": 2.04, + "step": 371 + }, + { + "epoch": 0.11291546516922144, + "grad_norm": 0.38978812098503113, + "learning_rate": 9.816745975498633e-05, + "loss": 1.869, + "step": 372 + }, + { + "epoch": 0.11321900136591288, + "grad_norm": 0.39212337136268616, + "learning_rate": 9.816239748911613e-05, + "loss": 2.0549, + "step": 373 + }, + { + "epoch": 0.11352253756260434, + "grad_norm": 0.39528506994247437, + "learning_rate": 9.815733522324593e-05, + "loss": 1.5653, + "step": 374 + }, + { + "epoch": 0.11382607375929579, + "grad_norm": 0.4226018786430359, + "learning_rate": 9.815227295737573e-05, + "loss": 1.6231, + "step": 375 + }, + { + "epoch": 0.11412960995598725, + "grad_norm": 0.3577810823917389, + "learning_rate": 9.814721069150552e-05, + "loss": 1.9599, + "step": 376 + }, + { + "epoch": 0.1144331461526787, + "grad_norm": 0.33580708503723145, + "learning_rate": 9.814214842563532e-05, + "loss": 2.0419, + "step": 377 + }, + { + "epoch": 0.11473668234937016, + "grad_norm": 
0.38860392570495605, + "learning_rate": 9.813708615976511e-05, + "loss": 1.7186, + "step": 378 + }, + { + "epoch": 0.11504021854606161, + "grad_norm": 0.38994479179382324, + "learning_rate": 9.813202389389491e-05, + "loss": 2.1848, + "step": 379 + }, + { + "epoch": 0.11534375474275307, + "grad_norm": 0.3947262763977051, + "learning_rate": 9.81269616280247e-05, + "loss": 2.1868, + "step": 380 + }, + { + "epoch": 0.11564729093944452, + "grad_norm": 0.3112877607345581, + "learning_rate": 9.81218993621545e-05, + "loss": 1.8604, + "step": 381 + }, + { + "epoch": 0.11595082713613598, + "grad_norm": 0.375689834356308, + "learning_rate": 9.811683709628429e-05, + "loss": 2.0418, + "step": 382 + }, + { + "epoch": 0.11625436333282745, + "grad_norm": 0.34537243843078613, + "learning_rate": 9.81117748304141e-05, + "loss": 1.8874, + "step": 383 + }, + { + "epoch": 0.1165578995295189, + "grad_norm": 0.5077370405197144, + "learning_rate": 9.81067125645439e-05, + "loss": 1.7497, + "step": 384 + }, + { + "epoch": 0.11686143572621036, + "grad_norm": 0.3703441023826599, + "learning_rate": 9.810165029867369e-05, + "loss": 1.781, + "step": 385 + }, + { + "epoch": 0.1171649719229018, + "grad_norm": 0.4386610984802246, + "learning_rate": 9.809658803280348e-05, + "loss": 1.8428, + "step": 386 + }, + { + "epoch": 0.11746850811959327, + "grad_norm": 0.37781745195388794, + "learning_rate": 9.809152576693328e-05, + "loss": 2.0384, + "step": 387 + }, + { + "epoch": 0.11777204431628471, + "grad_norm": 0.38956716656684875, + "learning_rate": 9.808646350106307e-05, + "loss": 2.3534, + "step": 388 + }, + { + "epoch": 0.11807558051297617, + "grad_norm": 0.3444838523864746, + "learning_rate": 9.808140123519288e-05, + "loss": 1.921, + "step": 389 + }, + { + "epoch": 0.11837911670966762, + "grad_norm": 0.39881742000579834, + "learning_rate": 9.807633896932268e-05, + "loss": 2.1758, + "step": 390 + }, + { + "epoch": 0.11868265290635908, + "grad_norm": 0.384226530790329, + "learning_rate": 9.807127670345247e-05, + "loss": 1.7651, + "step": 391 + }, + { + "epoch": 0.11898618910305053, + "grad_norm": 0.36255109310150146, + "learning_rate": 9.806621443758227e-05, + "loss": 1.8122, + "step": 392 + }, + { + "epoch": 0.119289725299742, + "grad_norm": 0.3627421259880066, + "learning_rate": 9.806115217171206e-05, + "loss": 1.6304, + "step": 393 + }, + { + "epoch": 0.11959326149643346, + "grad_norm": 0.8936781883239746, + "learning_rate": 9.805608990584187e-05, + "loss": 1.8827, + "step": 394 + }, + { + "epoch": 0.1198967976931249, + "grad_norm": 0.5008642673492432, + "learning_rate": 9.805102763997166e-05, + "loss": 1.3597, + "step": 395 + }, + { + "epoch": 0.12020033388981637, + "grad_norm": 0.4444289207458496, + "learning_rate": 9.804596537410146e-05, + "loss": 2.1768, + "step": 396 + }, + { + "epoch": 0.12050387008650781, + "grad_norm": 0.3963356912136078, + "learning_rate": 9.804090310823125e-05, + "loss": 1.8373, + "step": 397 + }, + { + "epoch": 0.12080740628319928, + "grad_norm": 0.44095271825790405, + "learning_rate": 9.803584084236105e-05, + "loss": 1.7893, + "step": 398 + }, + { + "epoch": 0.12111094247989072, + "grad_norm": 0.4162418246269226, + "learning_rate": 9.803077857649084e-05, + "loss": 1.7482, + "step": 399 + }, + { + "epoch": 0.12141447867658219, + "grad_norm": 0.3853035271167755, + "learning_rate": 9.802571631062064e-05, + "loss": 1.6274, + "step": 400 + }, + { + "epoch": 0.12171801487327363, + "grad_norm": 1.1697463989257812, + "learning_rate": 9.802065404475043e-05, + "loss": 2.2254, + "step": 401 + }, + { + 
"epoch": 0.1220215510699651, + "grad_norm": 0.3899803161621094, + "learning_rate": 9.801559177888023e-05, + "loss": 1.9754, + "step": 402 + }, + { + "epoch": 0.12232508726665654, + "grad_norm": 0.43946412205696106, + "learning_rate": 9.801052951301002e-05, + "loss": 2.1184, + "step": 403 + }, + { + "epoch": 0.122628623463348, + "grad_norm": 0.46882718801498413, + "learning_rate": 9.800546724713983e-05, + "loss": 1.4423, + "step": 404 + }, + { + "epoch": 0.12293215966003945, + "grad_norm": 0.4379485547542572, + "learning_rate": 9.800040498126963e-05, + "loss": 2.0614, + "step": 405 + }, + { + "epoch": 0.12323569585673091, + "grad_norm": 0.3837740123271942, + "learning_rate": 9.799534271539942e-05, + "loss": 1.9974, + "step": 406 + }, + { + "epoch": 0.12353923205342238, + "grad_norm": 0.35403695702552795, + "learning_rate": 9.799028044952922e-05, + "loss": 1.5693, + "step": 407 + }, + { + "epoch": 0.12384276825011382, + "grad_norm": 0.4070426821708679, + "learning_rate": 9.798521818365901e-05, + "loss": 1.8704, + "step": 408 + }, + { + "epoch": 0.12414630444680529, + "grad_norm": 0.4301077425479889, + "learning_rate": 9.79801559177888e-05, + "loss": 1.077, + "step": 409 + }, + { + "epoch": 0.12444984064349673, + "grad_norm": 0.37687429785728455, + "learning_rate": 9.79750936519186e-05, + "loss": 1.7323, + "step": 410 + }, + { + "epoch": 0.1247533768401882, + "grad_norm": 0.37393873929977417, + "learning_rate": 9.79700313860484e-05, + "loss": 1.9532, + "step": 411 + }, + { + "epoch": 0.12505691303687966, + "grad_norm": 0.4518846869468689, + "learning_rate": 9.796496912017819e-05, + "loss": 2.0123, + "step": 412 + }, + { + "epoch": 0.1253604492335711, + "grad_norm": 0.39417609572410583, + "learning_rate": 9.7959906854308e-05, + "loss": 2.2669, + "step": 413 + }, + { + "epoch": 0.12566398543026255, + "grad_norm": 0.3802976608276367, + "learning_rate": 9.795484458843779e-05, + "loss": 2.0506, + "step": 414 + }, + { + "epoch": 0.12596752162695402, + "grad_norm": 1.3118431568145752, + "learning_rate": 9.794978232256759e-05, + "loss": 2.2551, + "step": 415 + }, + { + "epoch": 0.12627105782364548, + "grad_norm": 0.9459638595581055, + "learning_rate": 9.794472005669738e-05, + "loss": 1.7829, + "step": 416 + }, + { + "epoch": 0.1265745940203369, + "grad_norm": 0.571232795715332, + "learning_rate": 9.793965779082718e-05, + "loss": 1.7768, + "step": 417 + }, + { + "epoch": 0.12687813021702837, + "grad_norm": 0.3973385989665985, + "learning_rate": 9.793459552495697e-05, + "loss": 1.88, + "step": 418 + }, + { + "epoch": 0.12718166641371983, + "grad_norm": 0.3883122503757477, + "learning_rate": 9.792953325908677e-05, + "loss": 1.9592, + "step": 419 + }, + { + "epoch": 0.1274852026104113, + "grad_norm": 0.40379586815834045, + "learning_rate": 9.792447099321656e-05, + "loss": 1.9697, + "step": 420 + }, + { + "epoch": 0.12778873880710276, + "grad_norm": 0.3288556635379791, + "learning_rate": 9.791940872734636e-05, + "loss": 1.7282, + "step": 421 + }, + { + "epoch": 0.1280922750037942, + "grad_norm": 0.3872746527194977, + "learning_rate": 9.791434646147616e-05, + "loss": 1.9348, + "step": 422 + }, + { + "epoch": 0.12839581120048565, + "grad_norm": 0.37058207392692566, + "learning_rate": 9.790928419560596e-05, + "loss": 1.5684, + "step": 423 + }, + { + "epoch": 0.12869934739717712, + "grad_norm": 0.37466561794281006, + "learning_rate": 9.790422192973575e-05, + "loss": 1.9535, + "step": 424 + }, + { + "epoch": 0.12900288359386858, + "grad_norm": 0.32176846265792847, + "learning_rate": 9.789915966386555e-05, + 
"loss": 1.8537, + "step": 425 + }, + { + "epoch": 0.12930641979056, + "grad_norm": 0.37653467059135437, + "learning_rate": 9.789409739799534e-05, + "loss": 2.0701, + "step": 426 + }, + { + "epoch": 0.12960995598725147, + "grad_norm": 0.38768434524536133, + "learning_rate": 9.788903513212514e-05, + "loss": 1.731, + "step": 427 + }, + { + "epoch": 0.12991349218394294, + "grad_norm": 0.5139635801315308, + "learning_rate": 9.788397286625493e-05, + "loss": 2.4437, + "step": 428 + }, + { + "epoch": 0.1302170283806344, + "grad_norm": 0.3759630024433136, + "learning_rate": 9.787891060038473e-05, + "loss": 2.0918, + "step": 429 + }, + { + "epoch": 0.13052056457732586, + "grad_norm": 0.3718818426132202, + "learning_rate": 9.787384833451452e-05, + "loss": 1.5854, + "step": 430 + }, + { + "epoch": 0.1308241007740173, + "grad_norm": 0.6460405588150024, + "learning_rate": 9.786878606864432e-05, + "loss": 2.2442, + "step": 431 + }, + { + "epoch": 0.13112763697070876, + "grad_norm": 0.40393388271331787, + "learning_rate": 9.786372380277413e-05, + "loss": 1.728, + "step": 432 + }, + { + "epoch": 0.13143117316740022, + "grad_norm": 0.3772658407688141, + "learning_rate": 9.785866153690393e-05, + "loss": 1.668, + "step": 433 + }, + { + "epoch": 0.13173470936409168, + "grad_norm": 2.5252649784088135, + "learning_rate": 9.785359927103373e-05, + "loss": 1.8864, + "step": 434 + }, + { + "epoch": 0.1320382455607831, + "grad_norm": 0.42327219247817993, + "learning_rate": 9.784853700516352e-05, + "loss": 2.3174, + "step": 435 + }, + { + "epoch": 0.13234178175747457, + "grad_norm": 0.3689473867416382, + "learning_rate": 9.784347473929332e-05, + "loss": 1.9671, + "step": 436 + }, + { + "epoch": 0.13264531795416604, + "grad_norm": 0.37554243206977844, + "learning_rate": 9.783841247342311e-05, + "loss": 1.783, + "step": 437 + }, + { + "epoch": 0.1329488541508575, + "grad_norm": 0.409587025642395, + "learning_rate": 9.783335020755291e-05, + "loss": 2.0385, + "step": 438 + }, + { + "epoch": 0.13325239034754893, + "grad_norm": 0.349252849817276, + "learning_rate": 9.78282879416827e-05, + "loss": 1.8785, + "step": 439 + }, + { + "epoch": 0.1335559265442404, + "grad_norm": 0.36687588691711426, + "learning_rate": 9.78232256758125e-05, + "loss": 2.1174, + "step": 440 + }, + { + "epoch": 0.13385946274093186, + "grad_norm": 0.40221846103668213, + "learning_rate": 9.781816340994229e-05, + "loss": 1.8385, + "step": 441 + }, + { + "epoch": 0.13416299893762332, + "grad_norm": 0.5634617805480957, + "learning_rate": 9.781310114407209e-05, + "loss": 1.9316, + "step": 442 + }, + { + "epoch": 0.13446653513431478, + "grad_norm": 0.37704020738601685, + "learning_rate": 9.78080388782019e-05, + "loss": 1.8865, + "step": 443 + }, + { + "epoch": 0.1347700713310062, + "grad_norm": 0.36043843626976013, + "learning_rate": 9.780297661233169e-05, + "loss": 1.585, + "step": 444 + }, + { + "epoch": 0.13507360752769768, + "grad_norm": 0.33643844723701477, + "learning_rate": 9.779791434646149e-05, + "loss": 1.8098, + "step": 445 + }, + { + "epoch": 0.13537714372438914, + "grad_norm": 0.6782101988792419, + "learning_rate": 9.779285208059128e-05, + "loss": 2.0468, + "step": 446 + }, + { + "epoch": 0.1356806799210806, + "grad_norm": 0.38101980090141296, + "learning_rate": 9.778778981472108e-05, + "loss": 2.0624, + "step": 447 + }, + { + "epoch": 0.13598421611777203, + "grad_norm": 0.399311900138855, + "learning_rate": 9.778272754885087e-05, + "loss": 2.1652, + "step": 448 + }, + { + "epoch": 0.1362877523144635, + "grad_norm": 0.3491426706314087, + 
"learning_rate": 9.777766528298066e-05, + "loss": 1.9092, + "step": 449 + }, + { + "epoch": 0.13659128851115496, + "grad_norm": 0.3654717803001404, + "learning_rate": 9.777260301711046e-05, + "loss": 1.9773, + "step": 450 + }, + { + "epoch": 0.13689482470784642, + "grad_norm": 0.394699364900589, + "learning_rate": 9.776754075124025e-05, + "loss": 2.1568, + "step": 451 + }, + { + "epoch": 0.13719836090453785, + "grad_norm": 0.3601212203502655, + "learning_rate": 9.776247848537006e-05, + "loss": 1.8744, + "step": 452 + }, + { + "epoch": 0.13750189710122931, + "grad_norm": 0.40716952085494995, + "learning_rate": 9.775741621949986e-05, + "loss": 2.1052, + "step": 453 + }, + { + "epoch": 0.13780543329792078, + "grad_norm": 0.37777504324913025, + "learning_rate": 9.775235395362965e-05, + "loss": 1.8896, + "step": 454 + }, + { + "epoch": 0.13810896949461224, + "grad_norm": 0.368600994348526, + "learning_rate": 9.774729168775945e-05, + "loss": 1.8285, + "step": 455 + }, + { + "epoch": 0.1384125056913037, + "grad_norm": 0.41742029786109924, + "learning_rate": 9.774222942188924e-05, + "loss": 1.8286, + "step": 456 + }, + { + "epoch": 0.13871604188799513, + "grad_norm": 0.40132156014442444, + "learning_rate": 9.773716715601904e-05, + "loss": 1.9515, + "step": 457 + }, + { + "epoch": 0.1390195780846866, + "grad_norm": 0.44473376870155334, + "learning_rate": 9.773210489014883e-05, + "loss": 1.8715, + "step": 458 + }, + { + "epoch": 0.13932311428137806, + "grad_norm": 0.40146371722221375, + "learning_rate": 9.772704262427863e-05, + "loss": 2.1469, + "step": 459 + }, + { + "epoch": 0.13962665047806952, + "grad_norm": 0.3863317370414734, + "learning_rate": 9.772198035840842e-05, + "loss": 1.9215, + "step": 460 + }, + { + "epoch": 0.13993018667476095, + "grad_norm": 0.40235334634780884, + "learning_rate": 9.771691809253823e-05, + "loss": 2.1276, + "step": 461 + }, + { + "epoch": 0.14023372287145242, + "grad_norm": 0.46011632680892944, + "learning_rate": 9.771185582666802e-05, + "loss": 1.244, + "step": 462 + }, + { + "epoch": 0.14053725906814388, + "grad_norm": 0.3428272008895874, + "learning_rate": 9.770679356079782e-05, + "loss": 1.7991, + "step": 463 + }, + { + "epoch": 0.14084079526483534, + "grad_norm": 0.39976757764816284, + "learning_rate": 9.770173129492761e-05, + "loss": 1.7166, + "step": 464 + }, + { + "epoch": 0.1411443314615268, + "grad_norm": 0.3258446753025055, + "learning_rate": 9.769666902905741e-05, + "loss": 1.677, + "step": 465 + }, + { + "epoch": 0.14144786765821823, + "grad_norm": 0.3950905501842499, + "learning_rate": 9.76916067631872e-05, + "loss": 2.0122, + "step": 466 + }, + { + "epoch": 0.1417514038549097, + "grad_norm": 0.39712047576904297, + "learning_rate": 9.7686544497317e-05, + "loss": 1.7262, + "step": 467 + }, + { + "epoch": 0.14205494005160116, + "grad_norm": 0.8331599235534668, + "learning_rate": 9.768148223144679e-05, + "loss": 1.9852, + "step": 468 + }, + { + "epoch": 0.14235847624829262, + "grad_norm": 0.3578427731990814, + "learning_rate": 9.767641996557659e-05, + "loss": 1.8249, + "step": 469 + }, + { + "epoch": 0.14266201244498405, + "grad_norm": 0.3736058473587036, + "learning_rate": 9.767135769970638e-05, + "loss": 1.43, + "step": 470 + }, + { + "epoch": 0.14296554864167552, + "grad_norm": 0.48153185844421387, + "learning_rate": 9.766629543383619e-05, + "loss": 1.8667, + "step": 471 + }, + { + "epoch": 0.14326908483836698, + "grad_norm": 0.3924524188041687, + "learning_rate": 9.766123316796599e-05, + "loss": 2.0385, + "step": 472 + }, + { + "epoch": 
0.14357262103505844, + "grad_norm": 0.38956940174102783, + "learning_rate": 9.765617090209578e-05, + "loss": 1.3157, + "step": 473 + }, + { + "epoch": 0.14387615723174987, + "grad_norm": 0.4032903015613556, + "learning_rate": 9.765110863622558e-05, + "loss": 1.8793, + "step": 474 + }, + { + "epoch": 0.14417969342844134, + "grad_norm": 0.5116568207740784, + "learning_rate": 9.764604637035537e-05, + "loss": 1.7658, + "step": 475 + }, + { + "epoch": 0.1444832296251328, + "grad_norm": 0.3981756269931793, + "learning_rate": 9.764098410448517e-05, + "loss": 1.8087, + "step": 476 + }, + { + "epoch": 0.14478676582182426, + "grad_norm": 0.43181854486465454, + "learning_rate": 9.763592183861496e-05, + "loss": 1.5241, + "step": 477 + }, + { + "epoch": 0.14509030201851572, + "grad_norm": 0.4172961413860321, + "learning_rate": 9.763085957274477e-05, + "loss": 1.8318, + "step": 478 + }, + { + "epoch": 0.14539383821520716, + "grad_norm": 0.4135033190250397, + "learning_rate": 9.762579730687456e-05, + "loss": 2.0783, + "step": 479 + }, + { + "epoch": 0.14569737441189862, + "grad_norm": 0.36482739448547363, + "learning_rate": 9.762073504100436e-05, + "loss": 2.2524, + "step": 480 + }, + { + "epoch": 0.14600091060859008, + "grad_norm": 0.3704656958580017, + "learning_rate": 9.761567277513415e-05, + "loss": 2.0369, + "step": 481 + }, + { + "epoch": 0.14630444680528154, + "grad_norm": 1.588393211364746, + "learning_rate": 9.761061050926396e-05, + "loss": 1.8041, + "step": 482 + }, + { + "epoch": 0.14660798300197297, + "grad_norm": 0.3309743404388428, + "learning_rate": 9.760554824339376e-05, + "loss": 1.8373, + "step": 483 + }, + { + "epoch": 0.14691151919866444, + "grad_norm": 0.34598830342292786, + "learning_rate": 9.760048597752355e-05, + "loss": 1.6249, + "step": 484 + }, + { + "epoch": 0.1472150553953559, + "grad_norm": 0.3433639109134674, + "learning_rate": 9.759542371165335e-05, + "loss": 1.9454, + "step": 485 + }, + { + "epoch": 0.14751859159204736, + "grad_norm": 0.3801734149456024, + "learning_rate": 9.759036144578314e-05, + "loss": 2.1067, + "step": 486 + }, + { + "epoch": 0.1478221277887388, + "grad_norm": 0.36811041831970215, + "learning_rate": 9.758529917991293e-05, + "loss": 1.8642, + "step": 487 + }, + { + "epoch": 0.14812566398543026, + "grad_norm": 0.3999156355857849, + "learning_rate": 9.758023691404273e-05, + "loss": 2.1482, + "step": 488 + }, + { + "epoch": 0.14842920018212172, + "grad_norm": 0.7651489973068237, + "learning_rate": 9.757517464817252e-05, + "loss": 1.8213, + "step": 489 + }, + { + "epoch": 0.14873273637881318, + "grad_norm": 0.3491712808609009, + "learning_rate": 9.757011238230232e-05, + "loss": 2.1047, + "step": 490 + }, + { + "epoch": 0.14903627257550464, + "grad_norm": 1.028256893157959, + "learning_rate": 9.756505011643213e-05, + "loss": 2.0519, + "step": 491 + }, + { + "epoch": 0.14933980877219608, + "grad_norm": 0.5957101583480835, + "learning_rate": 9.755998785056192e-05, + "loss": 2.1236, + "step": 492 + }, + { + "epoch": 0.14964334496888754, + "grad_norm": 0.40934717655181885, + "learning_rate": 9.755492558469172e-05, + "loss": 1.5391, + "step": 493 + }, + { + "epoch": 0.149946881165579, + "grad_norm": 0.4403507709503174, + "learning_rate": 9.754986331882151e-05, + "loss": 1.8388, + "step": 494 + }, + { + "epoch": 0.15025041736227046, + "grad_norm": 0.4258563220500946, + "learning_rate": 9.754480105295131e-05, + "loss": 1.8092, + "step": 495 + }, + { + "epoch": 0.1505539535589619, + "grad_norm": 0.3594823181629181, + "learning_rate": 9.75397387870811e-05, + "loss": 
1.7195, + "step": 496 + }, + { + "epoch": 0.15085748975565336, + "grad_norm": 0.30373120307922363, + "learning_rate": 9.75346765212109e-05, + "loss": 1.9267, + "step": 497 + }, + { + "epoch": 0.15116102595234482, + "grad_norm": 0.423096626996994, + "learning_rate": 9.752961425534069e-05, + "loss": 2.1559, + "step": 498 + }, + { + "epoch": 0.15146456214903628, + "grad_norm": 0.36935552954673767, + "learning_rate": 9.752455198947049e-05, + "loss": 2.0357, + "step": 499 + }, + { + "epoch": 0.15176809834572771, + "grad_norm": 0.7172725200653076, + "learning_rate": 9.75194897236003e-05, + "loss": 2.0973, + "step": 500 + }, + { + "epoch": 0.15207163454241918, + "grad_norm": 0.36897605657577515, + "learning_rate": 9.751442745773009e-05, + "loss": 2.1672, + "step": 501 + }, + { + "epoch": 0.15237517073911064, + "grad_norm": 0.35079488158226013, + "learning_rate": 9.750936519185988e-05, + "loss": 2.0808, + "step": 502 + }, + { + "epoch": 0.1526787069358021, + "grad_norm": 0.37833186984062195, + "learning_rate": 9.750430292598968e-05, + "loss": 1.8393, + "step": 503 + }, + { + "epoch": 0.15298224313249356, + "grad_norm": 0.3969264328479767, + "learning_rate": 9.749924066011947e-05, + "loss": 2.1213, + "step": 504 + }, + { + "epoch": 0.153285779329185, + "grad_norm": 0.30432841181755066, + "learning_rate": 9.749417839424927e-05, + "loss": 1.6397, + "step": 505 + }, + { + "epoch": 0.15358931552587646, + "grad_norm": 0.30847886204719543, + "learning_rate": 9.748911612837906e-05, + "loss": 1.6455, + "step": 506 + }, + { + "epoch": 0.15389285172256792, + "grad_norm": 0.38480496406555176, + "learning_rate": 9.748405386250886e-05, + "loss": 1.803, + "step": 507 + }, + { + "epoch": 0.15419638791925938, + "grad_norm": 0.48439183831214905, + "learning_rate": 9.747899159663865e-05, + "loss": 1.6892, + "step": 508 + }, + { + "epoch": 0.15449992411595082, + "grad_norm": 0.5124354362487793, + "learning_rate": 9.747392933076845e-05, + "loss": 2.24, + "step": 509 + }, + { + "epoch": 0.15480346031264228, + "grad_norm": 0.4051717221736908, + "learning_rate": 9.746886706489826e-05, + "loss": 1.8621, + "step": 510 + }, + { + "epoch": 0.15510699650933374, + "grad_norm": 0.6452261209487915, + "learning_rate": 9.746380479902805e-05, + "loss": 1.7043, + "step": 511 + }, + { + "epoch": 0.1554105327060252, + "grad_norm": 0.5453522801399231, + "learning_rate": 9.745874253315785e-05, + "loss": 1.7325, + "step": 512 + }, + { + "epoch": 0.15571406890271666, + "grad_norm": 1.0983595848083496, + "learning_rate": 9.745368026728764e-05, + "loss": 2.169, + "step": 513 + }, + { + "epoch": 0.1560176050994081, + "grad_norm": 0.3821035623550415, + "learning_rate": 9.744861800141744e-05, + "loss": 2.3305, + "step": 514 + }, + { + "epoch": 0.15632114129609956, + "grad_norm": 0.3694508969783783, + "learning_rate": 9.744355573554723e-05, + "loss": 1.8453, + "step": 515 + }, + { + "epoch": 0.15662467749279102, + "grad_norm": 0.3837510943412781, + "learning_rate": 9.743849346967702e-05, + "loss": 1.9679, + "step": 516 + }, + { + "epoch": 0.15692821368948248, + "grad_norm": 0.41427966952323914, + "learning_rate": 9.743343120380682e-05, + "loss": 1.9331, + "step": 517 + }, + { + "epoch": 0.15723174988617392, + "grad_norm": 0.34252259135246277, + "learning_rate": 9.742836893793661e-05, + "loss": 1.7938, + "step": 518 + }, + { + "epoch": 0.15753528608286538, + "grad_norm": 0.4043283462524414, + "learning_rate": 9.742330667206642e-05, + "loss": 1.4037, + "step": 519 + }, + { + "epoch": 0.15783882227955684, + "grad_norm": 0.4225389361381531, + 
"learning_rate": 9.741824440619622e-05, + "loss": 1.6224, + "step": 520 + }, + { + "epoch": 0.1581423584762483, + "grad_norm": 0.377590537071228, + "learning_rate": 9.741318214032601e-05, + "loss": 2.0567, + "step": 521 + }, + { + "epoch": 0.15844589467293974, + "grad_norm": 0.46170124411582947, + "learning_rate": 9.740811987445582e-05, + "loss": 2.0449, + "step": 522 + }, + { + "epoch": 0.1587494308696312, + "grad_norm": 0.3752427399158478, + "learning_rate": 9.740305760858562e-05, + "loss": 1.8207, + "step": 523 + }, + { + "epoch": 0.15905296706632266, + "grad_norm": 0.390803724527359, + "learning_rate": 9.739799534271541e-05, + "loss": 2.0781, + "step": 524 + }, + { + "epoch": 0.15935650326301412, + "grad_norm": 0.38587453961372375, + "learning_rate": 9.73929330768452e-05, + "loss": 1.9932, + "step": 525 + }, + { + "epoch": 0.15966003945970558, + "grad_norm": 0.4154350459575653, + "learning_rate": 9.7387870810975e-05, + "loss": 1.7649, + "step": 526 + }, + { + "epoch": 0.15996357565639702, + "grad_norm": 0.3698589503765106, + "learning_rate": 9.73828085451048e-05, + "loss": 1.6921, + "step": 527 + }, + { + "epoch": 0.16026711185308848, + "grad_norm": 0.4110312759876251, + "learning_rate": 9.737774627923459e-05, + "loss": 1.1834, + "step": 528 + }, + { + "epoch": 0.16057064804977994, + "grad_norm": 0.4140758812427521, + "learning_rate": 9.737268401336438e-05, + "loss": 1.8354, + "step": 529 + }, + { + "epoch": 0.1608741842464714, + "grad_norm": 0.38738423585891724, + "learning_rate": 9.736762174749419e-05, + "loss": 1.9223, + "step": 530 + }, + { + "epoch": 0.16117772044316284, + "grad_norm": 0.4055260717868805, + "learning_rate": 9.736255948162399e-05, + "loss": 1.7802, + "step": 531 + }, + { + "epoch": 0.1614812566398543, + "grad_norm": 0.44946524500846863, + "learning_rate": 9.735749721575378e-05, + "loss": 1.8654, + "step": 532 + }, + { + "epoch": 0.16178479283654576, + "grad_norm": 0.43206432461738586, + "learning_rate": 9.735243494988358e-05, + "loss": 1.7607, + "step": 533 + }, + { + "epoch": 0.16208832903323722, + "grad_norm": 0.5007991194725037, + "learning_rate": 9.734737268401337e-05, + "loss": 1.9378, + "step": 534 + }, + { + "epoch": 0.16239186522992866, + "grad_norm": 0.48757919669151306, + "learning_rate": 9.734231041814317e-05, + "loss": 2.1829, + "step": 535 + }, + { + "epoch": 0.16269540142662012, + "grad_norm": 0.4159701466560364, + "learning_rate": 9.733724815227296e-05, + "loss": 1.8847, + "step": 536 + }, + { + "epoch": 0.16299893762331158, + "grad_norm": 0.40922749042510986, + "learning_rate": 9.733218588640276e-05, + "loss": 1.4376, + "step": 537 + }, + { + "epoch": 0.16330247382000304, + "grad_norm": 0.33677083253860474, + "learning_rate": 9.732712362053255e-05, + "loss": 1.9568, + "step": 538 + }, + { + "epoch": 0.1636060100166945, + "grad_norm": 0.3255022168159485, + "learning_rate": 9.732206135466236e-05, + "loss": 1.9949, + "step": 539 + }, + { + "epoch": 0.16390954621338594, + "grad_norm": 0.3848338723182678, + "learning_rate": 9.731699908879215e-05, + "loss": 2.042, + "step": 540 + }, + { + "epoch": 0.1642130824100774, + "grad_norm": 0.3888263404369354, + "learning_rate": 9.731193682292195e-05, + "loss": 1.885, + "step": 541 + }, + { + "epoch": 0.16451661860676886, + "grad_norm": 0.40090805292129517, + "learning_rate": 9.730687455705174e-05, + "loss": 1.9093, + "step": 542 + }, + { + "epoch": 0.16482015480346032, + "grad_norm": 0.4106220602989197, + "learning_rate": 9.730181229118154e-05, + "loss": 1.8392, + "step": 543 + }, + { + "epoch": 
0.16512369100015176, + "grad_norm": 0.3483395278453827, + "learning_rate": 9.729675002531133e-05, + "loss": 2.0235, + "step": 544 + }, + { + "epoch": 0.16542722719684322, + "grad_norm": 0.3686208128929138, + "learning_rate": 9.729168775944113e-05, + "loss": 1.9218, + "step": 545 + }, + { + "epoch": 0.16573076339353468, + "grad_norm": 0.36063849925994873, + "learning_rate": 9.728662549357092e-05, + "loss": 1.9334, + "step": 546 + }, + { + "epoch": 0.16603429959022614, + "grad_norm": 0.39365142583847046, + "learning_rate": 9.728156322770072e-05, + "loss": 1.9825, + "step": 547 + }, + { + "epoch": 0.16633783578691758, + "grad_norm": 0.4062787592411041, + "learning_rate": 9.727650096183051e-05, + "loss": 1.521, + "step": 548 + }, + { + "epoch": 0.16664137198360904, + "grad_norm": 0.37347134947776794, + "learning_rate": 9.727143869596032e-05, + "loss": 1.9356, + "step": 549 + }, + { + "epoch": 0.1669449081803005, + "grad_norm": 0.3538997173309326, + "learning_rate": 9.726637643009012e-05, + "loss": 1.845, + "step": 550 + }, + { + "epoch": 0.16724844437699196, + "grad_norm": 0.3868335783481598, + "learning_rate": 9.726131416421991e-05, + "loss": 1.9803, + "step": 551 + }, + { + "epoch": 0.16755198057368342, + "grad_norm": 0.34705451130867004, + "learning_rate": 9.72562518983497e-05, + "loss": 2.0866, + "step": 552 + }, + { + "epoch": 0.16785551677037486, + "grad_norm": 0.3794872462749481, + "learning_rate": 9.72511896324795e-05, + "loss": 2.094, + "step": 553 + }, + { + "epoch": 0.16815905296706632, + "grad_norm": 0.5801231861114502, + "learning_rate": 9.72461273666093e-05, + "loss": 1.7851, + "step": 554 + }, + { + "epoch": 0.16846258916375778, + "grad_norm": 0.3076344132423401, + "learning_rate": 9.724106510073909e-05, + "loss": 1.5188, + "step": 555 + }, + { + "epoch": 0.16876612536044924, + "grad_norm": 0.3552989363670349, + "learning_rate": 9.723600283486888e-05, + "loss": 2.1063, + "step": 556 + }, + { + "epoch": 0.16906966155714068, + "grad_norm": 0.36939847469329834, + "learning_rate": 9.723094056899868e-05, + "loss": 1.7648, + "step": 557 + }, + { + "epoch": 0.16937319775383214, + "grad_norm": 0.358634889125824, + "learning_rate": 9.722587830312849e-05, + "loss": 1.8007, + "step": 558 + }, + { + "epoch": 0.1696767339505236, + "grad_norm": 0.39962029457092285, + "learning_rate": 9.722081603725828e-05, + "loss": 1.8845, + "step": 559 + }, + { + "epoch": 0.16998027014721506, + "grad_norm": 0.4099076986312866, + "learning_rate": 9.721575377138808e-05, + "loss": 1.8894, + "step": 560 + }, + { + "epoch": 0.17028380634390652, + "grad_norm": 0.3610551655292511, + "learning_rate": 9.721069150551787e-05, + "loss": 1.8089, + "step": 561 + }, + { + "epoch": 0.17058734254059796, + "grad_norm": 0.5951200723648071, + "learning_rate": 9.720562923964767e-05, + "loss": 1.6966, + "step": 562 + }, + { + "epoch": 0.17089087873728942, + "grad_norm": 0.562522292137146, + "learning_rate": 9.720056697377746e-05, + "loss": 1.7704, + "step": 563 + }, + { + "epoch": 0.17119441493398088, + "grad_norm": 0.6662526726722717, + "learning_rate": 9.719550470790726e-05, + "loss": 1.7714, + "step": 564 + }, + { + "epoch": 0.17149795113067234, + "grad_norm": 0.44034865498542786, + "learning_rate": 9.719044244203705e-05, + "loss": 2.1042, + "step": 565 + }, + { + "epoch": 0.17180148732736378, + "grad_norm": 0.39868202805519104, + "learning_rate": 9.718538017616685e-05, + "loss": 1.952, + "step": 566 + }, + { + "epoch": 0.17210502352405524, + "grad_norm": 0.3427380621433258, + "learning_rate": 9.718031791029665e-05, + "loss": 
2.037, + "step": 567 + }, + { + "epoch": 0.1724085597207467, + "grad_norm": 0.37980929017066956, + "learning_rate": 9.717525564442645e-05, + "loss": 1.5378, + "step": 568 + }, + { + "epoch": 0.17271209591743816, + "grad_norm": 0.32314518094062805, + "learning_rate": 9.717019337855626e-05, + "loss": 1.6191, + "step": 569 + }, + { + "epoch": 0.1730156321141296, + "grad_norm": 0.40600740909576416, + "learning_rate": 9.716513111268605e-05, + "loss": 1.6055, + "step": 570 + }, + { + "epoch": 0.17331916831082106, + "grad_norm": 0.37318041920661926, + "learning_rate": 9.716006884681585e-05, + "loss": 1.8666, + "step": 571 + }, + { + "epoch": 0.17362270450751252, + "grad_norm": 0.3656068444252014, + "learning_rate": 9.715500658094564e-05, + "loss": 1.5983, + "step": 572 + }, + { + "epoch": 0.17392624070420398, + "grad_norm": 0.3546827733516693, + "learning_rate": 9.714994431507544e-05, + "loss": 2.2088, + "step": 573 + }, + { + "epoch": 0.17422977690089544, + "grad_norm": 0.4293152689933777, + "learning_rate": 9.714488204920523e-05, + "loss": 1.803, + "step": 574 + }, + { + "epoch": 0.17453331309758688, + "grad_norm": 0.3790314495563507, + "learning_rate": 9.713981978333503e-05, + "loss": 1.9874, + "step": 575 + }, + { + "epoch": 0.17483684929427834, + "grad_norm": 0.37619829177856445, + "learning_rate": 9.713475751746482e-05, + "loss": 1.9061, + "step": 576 + }, + { + "epoch": 0.1751403854909698, + "grad_norm": 0.36988991498947144, + "learning_rate": 9.712969525159462e-05, + "loss": 1.5463, + "step": 577 + }, + { + "epoch": 0.17544392168766126, + "grad_norm": 0.367721825838089, + "learning_rate": 9.712463298572442e-05, + "loss": 1.6526, + "step": 578 + }, + { + "epoch": 0.1757474578843527, + "grad_norm": 0.39620110392570496, + "learning_rate": 9.711957071985422e-05, + "loss": 2.056, + "step": 579 + }, + { + "epoch": 0.17605099408104416, + "grad_norm": 0.41518276929855347, + "learning_rate": 9.711450845398401e-05, + "loss": 1.6847, + "step": 580 + }, + { + "epoch": 0.17635453027773562, + "grad_norm": 0.3925170302391052, + "learning_rate": 9.710944618811381e-05, + "loss": 1.8476, + "step": 581 + }, + { + "epoch": 0.17665806647442708, + "grad_norm": 0.36658090353012085, + "learning_rate": 9.71043839222436e-05, + "loss": 2.0699, + "step": 582 + }, + { + "epoch": 0.17696160267111852, + "grad_norm": 0.3741433620452881, + "learning_rate": 9.70993216563734e-05, + "loss": 1.9645, + "step": 583 + }, + { + "epoch": 0.17726513886780998, + "grad_norm": 0.3742316663265228, + "learning_rate": 9.709425939050319e-05, + "loss": 2.3717, + "step": 584 + }, + { + "epoch": 0.17756867506450144, + "grad_norm": 0.3796440660953522, + "learning_rate": 9.708919712463299e-05, + "loss": 1.9356, + "step": 585 + }, + { + "epoch": 0.1778722112611929, + "grad_norm": 0.3976511061191559, + "learning_rate": 9.708413485876278e-05, + "loss": 2.1889, + "step": 586 + }, + { + "epoch": 0.17817574745788436, + "grad_norm": 0.34445542097091675, + "learning_rate": 9.707907259289258e-05, + "loss": 1.6535, + "step": 587 + }, + { + "epoch": 0.1784792836545758, + "grad_norm": 0.3982098698616028, + "learning_rate": 9.707401032702239e-05, + "loss": 2.0542, + "step": 588 + }, + { + "epoch": 0.17878281985126726, + "grad_norm": 0.42155295610427856, + "learning_rate": 9.706894806115218e-05, + "loss": 1.4605, + "step": 589 + }, + { + "epoch": 0.17908635604795872, + "grad_norm": 0.36341744661331177, + "learning_rate": 9.706388579528197e-05, + "loss": 1.8069, + "step": 590 + }, + { + "epoch": 0.17938989224465018, + "grad_norm": 0.3715178668498993, + 
"learning_rate": 9.705882352941177e-05, + "loss": 1.5512, + "step": 591 + }, + { + "epoch": 0.17969342844134162, + "grad_norm": 0.376767635345459, + "learning_rate": 9.705376126354156e-05, + "loss": 1.6027, + "step": 592 + }, + { + "epoch": 0.17999696463803308, + "grad_norm": 0.4033347964286804, + "learning_rate": 9.704869899767136e-05, + "loss": 1.5071, + "step": 593 + }, + { + "epoch": 0.18030050083472454, + "grad_norm": 0.8200478553771973, + "learning_rate": 9.704363673180115e-05, + "loss": 1.924, + "step": 594 + }, + { + "epoch": 0.180604037031416, + "grad_norm": 0.6224507093429565, + "learning_rate": 9.703857446593095e-05, + "loss": 1.9684, + "step": 595 + }, + { + "epoch": 0.18090757322810747, + "grad_norm": 0.32032859325408936, + "learning_rate": 9.703351220006074e-05, + "loss": 1.9478, + "step": 596 + }, + { + "epoch": 0.1812111094247989, + "grad_norm": 0.33331337571144104, + "learning_rate": 9.702844993419055e-05, + "loss": 1.8177, + "step": 597 + }, + { + "epoch": 0.18151464562149036, + "grad_norm": 0.47399207949638367, + "learning_rate": 9.702338766832035e-05, + "loss": 2.07, + "step": 598 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 0.30480411648750305, + "learning_rate": 9.701832540245014e-05, + "loss": 2.0407, + "step": 599 + }, + { + "epoch": 0.18212171801487329, + "grad_norm": 0.40148988366127014, + "learning_rate": 9.701326313657994e-05, + "loss": 1.8774, + "step": 600 + }, + { + "epoch": 0.18242525421156472, + "grad_norm": 0.3958423137664795, + "learning_rate": 9.700820087070973e-05, + "loss": 1.8462, + "step": 601 + }, + { + "epoch": 0.18272879040825618, + "grad_norm": 0.34824639558792114, + "learning_rate": 9.700313860483953e-05, + "loss": 1.7839, + "step": 602 + }, + { + "epoch": 0.18303232660494764, + "grad_norm": 0.38002872467041016, + "learning_rate": 9.699807633896932e-05, + "loss": 2.3237, + "step": 603 + }, + { + "epoch": 0.1833358628016391, + "grad_norm": 0.37800419330596924, + "learning_rate": 9.699301407309912e-05, + "loss": 1.9375, + "step": 604 + }, + { + "epoch": 0.18363939899833054, + "grad_norm": 0.4041115939617157, + "learning_rate": 9.698795180722891e-05, + "loss": 2.029, + "step": 605 + }, + { + "epoch": 0.183942935195022, + "grad_norm": 0.3697315454483032, + "learning_rate": 9.698288954135872e-05, + "loss": 1.894, + "step": 606 + }, + { + "epoch": 0.18424647139171346, + "grad_norm": 0.3809906542301178, + "learning_rate": 9.697782727548851e-05, + "loss": 1.8242, + "step": 607 + }, + { + "epoch": 0.18455000758840492, + "grad_norm": 0.3997717499732971, + "learning_rate": 9.697276500961831e-05, + "loss": 2.0522, + "step": 608 + }, + { + "epoch": 0.18485354378509639, + "grad_norm": 0.391699880361557, + "learning_rate": 9.69677027437481e-05, + "loss": 1.8521, + "step": 609 + }, + { + "epoch": 0.18515707998178782, + "grad_norm": 0.3667858839035034, + "learning_rate": 9.69626404778779e-05, + "loss": 1.7613, + "step": 610 + }, + { + "epoch": 0.18546061617847928, + "grad_norm": 0.3905411958694458, + "learning_rate": 9.69575782120077e-05, + "loss": 1.8285, + "step": 611 + }, + { + "epoch": 0.18576415237517074, + "grad_norm": 0.4121951758861542, + "learning_rate": 9.69525159461375e-05, + "loss": 1.8104, + "step": 612 + }, + { + "epoch": 0.1860676885718622, + "grad_norm": 0.34977591037750244, + "learning_rate": 9.69474536802673e-05, + "loss": 1.7737, + "step": 613 + }, + { + "epoch": 0.18637122476855364, + "grad_norm": 0.34084367752075195, + "learning_rate": 9.694239141439709e-05, + "loss": 2.0407, + "step": 614 + }, + { + "epoch": 0.1866747609652451, + 
"grad_norm": 0.35442525148391724, + "learning_rate": 9.693732914852689e-05, + "loss": 1.9152, + "step": 615 + }, + { + "epoch": 0.18697829716193656, + "grad_norm": 0.34404149651527405, + "learning_rate": 9.693226688265668e-05, + "loss": 1.7621, + "step": 616 + }, + { + "epoch": 0.18728183335862802, + "grad_norm": 0.4516477882862091, + "learning_rate": 9.692720461678649e-05, + "loss": 1.7624, + "step": 617 + }, + { + "epoch": 0.18758536955531946, + "grad_norm": 0.3506614565849304, + "learning_rate": 9.692214235091628e-05, + "loss": 1.6627, + "step": 618 + }, + { + "epoch": 0.18788890575201092, + "grad_norm": 0.9165719151496887, + "learning_rate": 9.691708008504608e-05, + "loss": 2.1926, + "step": 619 + }, + { + "epoch": 0.18819244194870238, + "grad_norm": 0.3361871838569641, + "learning_rate": 9.691201781917587e-05, + "loss": 1.5229, + "step": 620 + }, + { + "epoch": 0.18849597814539384, + "grad_norm": 0.32639381289482117, + "learning_rate": 9.690695555330567e-05, + "loss": 1.8778, + "step": 621 + }, + { + "epoch": 0.1887995143420853, + "grad_norm": 0.44261273741722107, + "learning_rate": 9.690189328743546e-05, + "loss": 2.0903, + "step": 622 + }, + { + "epoch": 0.18910305053877674, + "grad_norm": 0.4438890516757965, + "learning_rate": 9.689683102156526e-05, + "loss": 1.772, + "step": 623 + }, + { + "epoch": 0.1894065867354682, + "grad_norm": 0.40160682797431946, + "learning_rate": 9.689176875569505e-05, + "loss": 2.0964, + "step": 624 + }, + { + "epoch": 0.18971012293215966, + "grad_norm": 0.4022195637226105, + "learning_rate": 9.688670648982485e-05, + "loss": 1.7818, + "step": 625 + }, + { + "epoch": 0.19001365912885113, + "grad_norm": 0.4233214855194092, + "learning_rate": 9.688164422395464e-05, + "loss": 1.922, + "step": 626 + }, + { + "epoch": 0.19031719532554256, + "grad_norm": 0.3864254057407379, + "learning_rate": 9.687658195808445e-05, + "loss": 2.0279, + "step": 627 + }, + { + "epoch": 0.19062073152223402, + "grad_norm": 0.36527585983276367, + "learning_rate": 9.687151969221424e-05, + "loss": 2.0732, + "step": 628 + }, + { + "epoch": 0.19092426771892548, + "grad_norm": 0.399237722158432, + "learning_rate": 9.686645742634404e-05, + "loss": 1.8889, + "step": 629 + }, + { + "epoch": 0.19122780391561695, + "grad_norm": 0.3860459625720978, + "learning_rate": 9.686139516047383e-05, + "loss": 1.968, + "step": 630 + }, + { + "epoch": 0.19153134011230838, + "grad_norm": 0.32555973529815674, + "learning_rate": 9.685633289460363e-05, + "loss": 2.0722, + "step": 631 + }, + { + "epoch": 0.19183487630899984, + "grad_norm": 0.6093998551368713, + "learning_rate": 9.685127062873342e-05, + "loss": 1.8553, + "step": 632 + }, + { + "epoch": 0.1921384125056913, + "grad_norm": 0.4218057692050934, + "learning_rate": 9.684620836286322e-05, + "loss": 1.9647, + "step": 633 + }, + { + "epoch": 0.19244194870238276, + "grad_norm": 0.3779148757457733, + "learning_rate": 9.684114609699301e-05, + "loss": 2.0681, + "step": 634 + }, + { + "epoch": 0.19274548489907423, + "grad_norm": 0.3820381760597229, + "learning_rate": 9.683608383112281e-05, + "loss": 2.0603, + "step": 635 + }, + { + "epoch": 0.19304902109576566, + "grad_norm": 0.29337063431739807, + "learning_rate": 9.683102156525262e-05, + "loss": 1.7516, + "step": 636 + }, + { + "epoch": 0.19335255729245712, + "grad_norm": 0.4369249939918518, + "learning_rate": 9.682595929938241e-05, + "loss": 1.9822, + "step": 637 + }, + { + "epoch": 0.19365609348914858, + "grad_norm": 0.3766214847564697, + "learning_rate": 9.68208970335122e-05, + "loss": 1.7229, + "step": 
638 + }, + { + "epoch": 0.19395962968584005, + "grad_norm": 0.4765011668205261, + "learning_rate": 9.6815834767642e-05, + "loss": 1.2865, + "step": 639 + }, + { + "epoch": 0.19426316588253148, + "grad_norm": 0.34236472845077515, + "learning_rate": 9.68107725017718e-05, + "loss": 2.1024, + "step": 640 + }, + { + "epoch": 0.19456670207922294, + "grad_norm": 0.398076593875885, + "learning_rate": 9.680571023590159e-05, + "loss": 1.8628, + "step": 641 + }, + { + "epoch": 0.1948702382759144, + "grad_norm": 0.357099711894989, + "learning_rate": 9.680064797003139e-05, + "loss": 2.2163, + "step": 642 + }, + { + "epoch": 0.19517377447260587, + "grad_norm": 0.3296545445919037, + "learning_rate": 9.679558570416118e-05, + "loss": 1.8227, + "step": 643 + }, + { + "epoch": 0.19547731066929733, + "grad_norm": 0.36754927039146423, + "learning_rate": 9.679052343829098e-05, + "loss": 1.7179, + "step": 644 + }, + { + "epoch": 0.19578084686598876, + "grad_norm": 0.37275364995002747, + "learning_rate": 9.678546117242078e-05, + "loss": 1.6782, + "step": 645 + }, + { + "epoch": 0.19608438306268022, + "grad_norm": 0.3951006531715393, + "learning_rate": 9.678039890655058e-05, + "loss": 2.0756, + "step": 646 + }, + { + "epoch": 0.19638791925937168, + "grad_norm": 0.3560970425605774, + "learning_rate": 9.677533664068037e-05, + "loss": 1.8093, + "step": 647 + }, + { + "epoch": 0.19669145545606315, + "grad_norm": 0.31553730368614197, + "learning_rate": 9.677027437481017e-05, + "loss": 1.9174, + "step": 648 + }, + { + "epoch": 0.19699499165275458, + "grad_norm": 0.39949625730514526, + "learning_rate": 9.676521210893996e-05, + "loss": 1.6687, + "step": 649 + }, + { + "epoch": 0.19729852784944604, + "grad_norm": 0.37323635816574097, + "learning_rate": 9.676014984306976e-05, + "loss": 1.8149, + "step": 650 + }, + { + "epoch": 0.1976020640461375, + "grad_norm": 0.43527746200561523, + "learning_rate": 9.675508757719955e-05, + "loss": 1.8744, + "step": 651 + }, + { + "epoch": 0.19790560024282897, + "grad_norm": 0.39380425214767456, + "learning_rate": 9.675002531132935e-05, + "loss": 1.9721, + "step": 652 + }, + { + "epoch": 0.1982091364395204, + "grad_norm": 0.3384545147418976, + "learning_rate": 9.674496304545914e-05, + "loss": 2.0122, + "step": 653 + }, + { + "epoch": 0.19851267263621186, + "grad_norm": 0.39647915959358215, + "learning_rate": 9.673990077958894e-05, + "loss": 2.2419, + "step": 654 + }, + { + "epoch": 0.19881620883290332, + "grad_norm": 0.3358941674232483, + "learning_rate": 9.673483851371875e-05, + "loss": 1.8758, + "step": 655 + }, + { + "epoch": 0.19911974502959479, + "grad_norm": 0.3486049771308899, + "learning_rate": 9.672977624784855e-05, + "loss": 1.5762, + "step": 656 + }, + { + "epoch": 0.19942328122628625, + "grad_norm": 2.3050696849823, + "learning_rate": 9.672471398197835e-05, + "loss": 2.0056, + "step": 657 + }, + { + "epoch": 0.19972681742297768, + "grad_norm": 0.35023945569992065, + "learning_rate": 9.671965171610814e-05, + "loss": 1.619, + "step": 658 + }, + { + "epoch": 0.20003035361966914, + "grad_norm": 0.513656735420227, + "learning_rate": 9.671458945023794e-05, + "loss": 1.5269, + "step": 659 + }, + { + "epoch": 0.2003338898163606, + "grad_norm": 0.37498149275779724, + "learning_rate": 9.670952718436773e-05, + "loss": 1.8553, + "step": 660 + }, + { + "epoch": 0.20063742601305207, + "grad_norm": 0.4101942479610443, + "learning_rate": 9.670446491849753e-05, + "loss": 2.1121, + "step": 661 + }, + { + "epoch": 0.2009409622097435, + "grad_norm": 0.4265679717063904, + "learning_rate": 
9.669940265262732e-05, + "loss": 2.1863, + "step": 662 + }, + { + "epoch": 0.20124449840643496, + "grad_norm": 4.817168712615967, + "learning_rate": 9.669434038675712e-05, + "loss": 2.0906, + "step": 663 + }, + { + "epoch": 0.20154803460312642, + "grad_norm": 7.518252849578857, + "learning_rate": 9.668927812088691e-05, + "loss": 1.8889, + "step": 664 + }, + { + "epoch": 0.2018515707998179, + "grad_norm": 0.5480749011039734, + "learning_rate": 9.66842158550167e-05, + "loss": 1.8439, + "step": 665 + }, + { + "epoch": 0.20215510699650932, + "grad_norm": 0.3578292429447174, + "learning_rate": 9.667915358914651e-05, + "loss": 1.8742, + "step": 666 + }, + { + "epoch": 0.20245864319320078, + "grad_norm": 0.3799275755882263, + "learning_rate": 9.667409132327631e-05, + "loss": 1.994, + "step": 667 + }, + { + "epoch": 0.20276217938989224, + "grad_norm": 0.3736335039138794, + "learning_rate": 9.66690290574061e-05, + "loss": 1.7933, + "step": 668 + }, + { + "epoch": 0.2030657155865837, + "grad_norm": 0.3145211637020111, + "learning_rate": 9.66639667915359e-05, + "loss": 1.8193, + "step": 669 + }, + { + "epoch": 0.20336925178327517, + "grad_norm": 0.4940774142742157, + "learning_rate": 9.66589045256657e-05, + "loss": 1.9238, + "step": 670 + }, + { + "epoch": 0.2036727879799666, + "grad_norm": 0.431134968996048, + "learning_rate": 9.665384225979549e-05, + "loss": 1.5493, + "step": 671 + }, + { + "epoch": 0.20397632417665806, + "grad_norm": 0.41438859701156616, + "learning_rate": 9.664877999392528e-05, + "loss": 1.2076, + "step": 672 + }, + { + "epoch": 0.20427986037334953, + "grad_norm": 0.38191312551498413, + "learning_rate": 9.664371772805508e-05, + "loss": 1.8201, + "step": 673 + }, + { + "epoch": 0.204583396570041, + "grad_norm": 0.3938577175140381, + "learning_rate": 9.663865546218487e-05, + "loss": 1.5166, + "step": 674 + }, + { + "epoch": 0.20488693276673242, + "grad_norm": 0.46312233805656433, + "learning_rate": 9.663359319631468e-05, + "loss": 1.4652, + "step": 675 + }, + { + "epoch": 0.20519046896342388, + "grad_norm": 0.4087234139442444, + "learning_rate": 9.662853093044448e-05, + "loss": 1.8288, + "step": 676 + }, + { + "epoch": 0.20549400516011535, + "grad_norm": 0.37329304218292236, + "learning_rate": 9.662346866457427e-05, + "loss": 1.9084, + "step": 677 + }, + { + "epoch": 0.2057975413568068, + "grad_norm": 0.37109607458114624, + "learning_rate": 9.661840639870407e-05, + "loss": 1.9674, + "step": 678 + }, + { + "epoch": 0.20610107755349824, + "grad_norm": 0.3936561942100525, + "learning_rate": 9.661334413283386e-05, + "loss": 2.0342, + "step": 679 + }, + { + "epoch": 0.2064046137501897, + "grad_norm": 0.4621008634567261, + "learning_rate": 9.660828186696366e-05, + "loss": 1.5157, + "step": 680 + }, + { + "epoch": 0.20670814994688116, + "grad_norm": 0.3849358558654785, + "learning_rate": 9.660321960109345e-05, + "loss": 2.1513, + "step": 681 + }, + { + "epoch": 0.20701168614357263, + "grad_norm": 0.4873330295085907, + "learning_rate": 9.659815733522325e-05, + "loss": 1.9116, + "step": 682 + }, + { + "epoch": 0.2073152223402641, + "grad_norm": 0.4687885642051697, + "learning_rate": 9.659309506935304e-05, + "loss": 2.278, + "step": 683 + }, + { + "epoch": 0.20761875853695552, + "grad_norm": 0.3966952860355377, + "learning_rate": 9.658803280348285e-05, + "loss": 1.4625, + "step": 684 + }, + { + "epoch": 0.20792229473364698, + "grad_norm": 0.5782402157783508, + "learning_rate": 9.658297053761264e-05, + "loss": 2.2779, + "step": 685 + }, + { + "epoch": 0.20822583093033845, + "grad_norm": 
0.37465688586235046, + "learning_rate": 9.657790827174244e-05, + "loss": 1.8462, + "step": 686 + }, + { + "epoch": 0.2085293671270299, + "grad_norm": 0.34408631920814514, + "learning_rate": 9.657284600587223e-05, + "loss": 1.9881, + "step": 687 + }, + { + "epoch": 0.20883290332372134, + "grad_norm": 0.6892307996749878, + "learning_rate": 9.656778374000203e-05, + "loss": 1.9835, + "step": 688 + }, + { + "epoch": 0.2091364395204128, + "grad_norm": 0.3698042631149292, + "learning_rate": 9.656272147413182e-05, + "loss": 2.0665, + "step": 689 + }, + { + "epoch": 0.20943997571710427, + "grad_norm": 0.41265738010406494, + "learning_rate": 9.655765920826162e-05, + "loss": 2.0231, + "step": 690 + }, + { + "epoch": 0.20974351191379573, + "grad_norm": 0.38251030445098877, + "learning_rate": 9.655259694239141e-05, + "loss": 1.7058, + "step": 691 + }, + { + "epoch": 0.2100470481104872, + "grad_norm": 0.468905508518219, + "learning_rate": 9.65475346765212e-05, + "loss": 1.6182, + "step": 692 + }, + { + "epoch": 0.21035058430717862, + "grad_norm": 1.0570484399795532, + "learning_rate": 9.6542472410651e-05, + "loss": 2.0165, + "step": 693 + }, + { + "epoch": 0.21065412050387008, + "grad_norm": 0.3978007435798645, + "learning_rate": 9.653741014478081e-05, + "loss": 1.7859, + "step": 694 + }, + { + "epoch": 0.21095765670056155, + "grad_norm": 0.42616939544677734, + "learning_rate": 9.65323478789106e-05, + "loss": 1.5197, + "step": 695 + }, + { + "epoch": 0.211261192897253, + "grad_norm": 0.39380377531051636, + "learning_rate": 9.65272856130404e-05, + "loss": 1.3796, + "step": 696 + }, + { + "epoch": 0.21156472909394444, + "grad_norm": 0.38581010699272156, + "learning_rate": 9.65222233471702e-05, + "loss": 1.8214, + "step": 697 + }, + { + "epoch": 0.2118682652906359, + "grad_norm": 0.3610150218009949, + "learning_rate": 9.651716108129999e-05, + "loss": 1.897, + "step": 698 + }, + { + "epoch": 0.21217180148732737, + "grad_norm": 0.44913700222969055, + "learning_rate": 9.651209881542978e-05, + "loss": 1.8873, + "step": 699 + }, + { + "epoch": 0.21247533768401883, + "grad_norm": 1.9599745273590088, + "learning_rate": 9.650703654955959e-05, + "loss": 1.946, + "step": 700 + }, + { + "epoch": 0.21277887388071026, + "grad_norm": 1.195716381072998, + "learning_rate": 9.650197428368939e-05, + "loss": 1.8749, + "step": 701 + }, + { + "epoch": 0.21308241007740172, + "grad_norm": 0.3154665231704712, + "learning_rate": 9.649691201781918e-05, + "loss": 1.5924, + "step": 702 + }, + { + "epoch": 0.21338594627409319, + "grad_norm": 0.3550672233104706, + "learning_rate": 9.649184975194898e-05, + "loss": 1.6094, + "step": 703 + }, + { + "epoch": 0.21368948247078465, + "grad_norm": 0.33744126558303833, + "learning_rate": 9.648678748607877e-05, + "loss": 1.3399, + "step": 704 + }, + { + "epoch": 0.2139930186674761, + "grad_norm": 0.33931079506874084, + "learning_rate": 9.648172522020858e-05, + "loss": 2.0096, + "step": 705 + }, + { + "epoch": 0.21429655486416754, + "grad_norm": 0.38951364159584045, + "learning_rate": 9.647666295433837e-05, + "loss": 1.7676, + "step": 706 + }, + { + "epoch": 0.214600091060859, + "grad_norm": 0.408087819814682, + "learning_rate": 9.647160068846817e-05, + "loss": 1.7948, + "step": 707 + }, + { + "epoch": 0.21490362725755047, + "grad_norm": 0.37058812379837036, + "learning_rate": 9.646653842259796e-05, + "loss": 1.9891, + "step": 708 + }, + { + "epoch": 0.21520716345424193, + "grad_norm": 0.4003254473209381, + "learning_rate": 9.646147615672776e-05, + "loss": 1.8895, + "step": 709 + }, + { + 
"epoch": 0.21551069965093336, + "grad_norm": 0.38838204741477966, + "learning_rate": 9.645641389085755e-05, + "loss": 2.0121, + "step": 710 + }, + { + "epoch": 0.21581423584762482, + "grad_norm": 0.41912707686424255, + "learning_rate": 9.645135162498735e-05, + "loss": 1.9804, + "step": 711 + }, + { + "epoch": 0.2161177720443163, + "grad_norm": 0.353454053401947, + "learning_rate": 9.644628935911714e-05, + "loss": 2.0478, + "step": 712 + }, + { + "epoch": 0.21642130824100775, + "grad_norm": 0.3825720548629761, + "learning_rate": 9.644122709324694e-05, + "loss": 1.6676, + "step": 713 + }, + { + "epoch": 0.21672484443769918, + "grad_norm": 0.4197389781475067, + "learning_rate": 9.643616482737675e-05, + "loss": 1.9732, + "step": 714 + }, + { + "epoch": 0.21702838063439064, + "grad_norm": 0.4452435076236725, + "learning_rate": 9.643110256150654e-05, + "loss": 2.0918, + "step": 715 + }, + { + "epoch": 0.2173319168310821, + "grad_norm": 0.3366299271583557, + "learning_rate": 9.642604029563634e-05, + "loss": 1.7469, + "step": 716 + }, + { + "epoch": 0.21763545302777357, + "grad_norm": 0.31280553340911865, + "learning_rate": 9.642097802976613e-05, + "loss": 2.0348, + "step": 717 + }, + { + "epoch": 0.21793898922446503, + "grad_norm": 0.425503671169281, + "learning_rate": 9.641591576389593e-05, + "loss": 1.3629, + "step": 718 + }, + { + "epoch": 0.21824252542115646, + "grad_norm": 0.3986441493034363, + "learning_rate": 9.641085349802572e-05, + "loss": 1.4703, + "step": 719 + }, + { + "epoch": 0.21854606161784793, + "grad_norm": 0.34377026557922363, + "learning_rate": 9.640579123215552e-05, + "loss": 1.9788, + "step": 720 + }, + { + "epoch": 0.2188495978145394, + "grad_norm": 0.3445621430873871, + "learning_rate": 9.640072896628531e-05, + "loss": 1.9137, + "step": 721 + }, + { + "epoch": 0.21915313401123085, + "grad_norm": 0.40363574028015137, + "learning_rate": 9.63956667004151e-05, + "loss": 1.8911, + "step": 722 + }, + { + "epoch": 0.21945667020792228, + "grad_norm": 0.36166059970855713, + "learning_rate": 9.639060443454491e-05, + "loss": 1.9176, + "step": 723 + }, + { + "epoch": 0.21976020640461374, + "grad_norm": 0.7732321619987488, + "learning_rate": 9.638554216867471e-05, + "loss": 2.1942, + "step": 724 + }, + { + "epoch": 0.2200637426013052, + "grad_norm": 0.4042604863643646, + "learning_rate": 9.63804799028045e-05, + "loss": 1.8964, + "step": 725 + }, + { + "epoch": 0.22036727879799667, + "grad_norm": 0.3888862133026123, + "learning_rate": 9.63754176369343e-05, + "loss": 1.716, + "step": 726 + }, + { + "epoch": 0.22067081499468813, + "grad_norm": 0.32185250520706177, + "learning_rate": 9.637035537106409e-05, + "loss": 2.1227, + "step": 727 + }, + { + "epoch": 0.22097435119137956, + "grad_norm": 0.36421746015548706, + "learning_rate": 9.636529310519389e-05, + "loss": 1.3262, + "step": 728 + }, + { + "epoch": 0.22127788738807103, + "grad_norm": 0.42780765891075134, + "learning_rate": 9.636023083932368e-05, + "loss": 1.806, + "step": 729 + }, + { + "epoch": 0.2215814235847625, + "grad_norm": 0.3754510283470154, + "learning_rate": 9.635516857345348e-05, + "loss": 1.9286, + "step": 730 + }, + { + "epoch": 0.22188495978145395, + "grad_norm": 0.35199174284935, + "learning_rate": 9.635010630758327e-05, + "loss": 1.9703, + "step": 731 + }, + { + "epoch": 0.22218849597814538, + "grad_norm": 0.36272746324539185, + "learning_rate": 9.634504404171307e-05, + "loss": 1.7773, + "step": 732 + }, + { + "epoch": 0.22249203217483685, + "grad_norm": 0.4233802556991577, + "learning_rate": 9.633998177584287e-05, + 
"loss": 2.0016, + "step": 733 + }, + { + "epoch": 0.2227955683715283, + "grad_norm": 0.46138089895248413, + "learning_rate": 9.633491950997267e-05, + "loss": 1.764, + "step": 734 + }, + { + "epoch": 0.22309910456821977, + "grad_norm": 0.37863031029701233, + "learning_rate": 9.632985724410246e-05, + "loss": 1.6493, + "step": 735 + }, + { + "epoch": 0.2234026407649112, + "grad_norm": 0.4493837356567383, + "learning_rate": 9.632479497823226e-05, + "loss": 2.04, + "step": 736 + }, + { + "epoch": 0.22370617696160267, + "grad_norm": 0.581119179725647, + "learning_rate": 9.631973271236205e-05, + "loss": 1.777, + "step": 737 + }, + { + "epoch": 0.22400971315829413, + "grad_norm": 0.3730584979057312, + "learning_rate": 9.631467044649185e-05, + "loss": 1.8932, + "step": 738 + }, + { + "epoch": 0.2243132493549856, + "grad_norm": 0.351421594619751, + "learning_rate": 9.630960818062164e-05, + "loss": 2.3182, + "step": 739 + }, + { + "epoch": 0.22461678555167705, + "grad_norm": 0.4237976670265198, + "learning_rate": 9.630454591475144e-05, + "loss": 2.1315, + "step": 740 + }, + { + "epoch": 0.22492032174836848, + "grad_norm": 0.38544562458992004, + "learning_rate": 9.629948364888123e-05, + "loss": 1.9596, + "step": 741 + }, + { + "epoch": 0.22522385794505995, + "grad_norm": 0.407672256231308, + "learning_rate": 9.629442138301104e-05, + "loss": 1.8694, + "step": 742 + }, + { + "epoch": 0.2255273941417514, + "grad_norm": 0.4415782690048218, + "learning_rate": 9.628935911714084e-05, + "loss": 1.8658, + "step": 743 + }, + { + "epoch": 0.22583093033844287, + "grad_norm": 0.41300657391548157, + "learning_rate": 9.628429685127063e-05, + "loss": 2.0477, + "step": 744 + }, + { + "epoch": 0.2261344665351343, + "grad_norm": 0.36000654101371765, + "learning_rate": 9.627923458540044e-05, + "loss": 1.9045, + "step": 745 + }, + { + "epoch": 0.22643800273182577, + "grad_norm": 0.42653003334999084, + "learning_rate": 9.627417231953023e-05, + "loss": 1.2151, + "step": 746 + }, + { + "epoch": 0.22674153892851723, + "grad_norm": 0.4157649874687195, + "learning_rate": 9.626911005366003e-05, + "loss": 1.9335, + "step": 747 + }, + { + "epoch": 0.2270450751252087, + "grad_norm": 0.3805077373981476, + "learning_rate": 9.626404778778982e-05, + "loss": 2.0803, + "step": 748 + }, + { + "epoch": 0.22734861132190012, + "grad_norm": 0.39710867404937744, + "learning_rate": 9.625898552191962e-05, + "loss": 2.2628, + "step": 749 + }, + { + "epoch": 0.22765214751859159, + "grad_norm": 0.4012609124183655, + "learning_rate": 9.625392325604941e-05, + "loss": 1.9586, + "step": 750 + }, + { + "epoch": 0.22795568371528305, + "grad_norm": 0.9281008243560791, + "learning_rate": 9.624886099017921e-05, + "loss": 1.168, + "step": 751 + }, + { + "epoch": 0.2282592199119745, + "grad_norm": 0.36847764253616333, + "learning_rate": 9.6243798724309e-05, + "loss": 1.8907, + "step": 752 + }, + { + "epoch": 0.22856275610866597, + "grad_norm": 0.4531751573085785, + "learning_rate": 9.623873645843881e-05, + "loss": 1.4511, + "step": 753 + }, + { + "epoch": 0.2288662923053574, + "grad_norm": 0.36623820662498474, + "learning_rate": 9.62336741925686e-05, + "loss": 1.6707, + "step": 754 + }, + { + "epoch": 0.22916982850204887, + "grad_norm": 0.3104342222213745, + "learning_rate": 9.62286119266984e-05, + "loss": 1.988, + "step": 755 + }, + { + "epoch": 0.22947336469874033, + "grad_norm": 0.3790084421634674, + "learning_rate": 9.62235496608282e-05, + "loss": 1.979, + "step": 756 + }, + { + "epoch": 0.2297769008954318, + "grad_norm": 0.3642970323562622, + 
"learning_rate": 9.621848739495799e-05, + "loss": 1.9998, + "step": 757 + }, + { + "epoch": 0.23008043709212322, + "grad_norm": 0.34588292241096497, + "learning_rate": 9.621342512908779e-05, + "loss": 2.0511, + "step": 758 + }, + { + "epoch": 0.2303839732888147, + "grad_norm": 0.3556496798992157, + "learning_rate": 9.620836286321758e-05, + "loss": 1.8785, + "step": 759 + }, + { + "epoch": 0.23068750948550615, + "grad_norm": 0.4669034779071808, + "learning_rate": 9.620330059734737e-05, + "loss": 1.5027, + "step": 760 + }, + { + "epoch": 0.2309910456821976, + "grad_norm": 0.39685994386672974, + "learning_rate": 9.619823833147717e-05, + "loss": 2.1644, + "step": 761 + }, + { + "epoch": 0.23129458187888904, + "grad_norm": 0.39183005690574646, + "learning_rate": 9.619317606560698e-05, + "loss": 1.9615, + "step": 762 + }, + { + "epoch": 0.2315981180755805, + "grad_norm": 0.36401331424713135, + "learning_rate": 9.618811379973677e-05, + "loss": 1.7535, + "step": 763 + }, + { + "epoch": 0.23190165427227197, + "grad_norm": 0.43118295073509216, + "learning_rate": 9.618305153386657e-05, + "loss": 1.884, + "step": 764 + }, + { + "epoch": 0.23220519046896343, + "grad_norm": 0.5061665177345276, + "learning_rate": 9.617798926799636e-05, + "loss": 2.0051, + "step": 765 + }, + { + "epoch": 0.2325087266656549, + "grad_norm": 0.4487472474575043, + "learning_rate": 9.617292700212616e-05, + "loss": 1.6831, + "step": 766 + }, + { + "epoch": 0.23281226286234633, + "grad_norm": 0.3660997450351715, + "learning_rate": 9.616786473625595e-05, + "loss": 1.9276, + "step": 767 + }, + { + "epoch": 0.2331157990590378, + "grad_norm": 0.3823026716709137, + "learning_rate": 9.616280247038575e-05, + "loss": 1.9817, + "step": 768 + }, + { + "epoch": 0.23341933525572925, + "grad_norm": 0.32568395137786865, + "learning_rate": 9.615774020451554e-05, + "loss": 1.508, + "step": 769 + }, + { + "epoch": 0.2337228714524207, + "grad_norm": 0.34985265135765076, + "learning_rate": 9.615267793864534e-05, + "loss": 1.6793, + "step": 770 + }, + { + "epoch": 0.23402640764911214, + "grad_norm": 0.38563957810401917, + "learning_rate": 9.614761567277513e-05, + "loss": 1.588, + "step": 771 + }, + { + "epoch": 0.2343299438458036, + "grad_norm": 0.33572301268577576, + "learning_rate": 9.614255340690494e-05, + "loss": 1.9541, + "step": 772 + }, + { + "epoch": 0.23463348004249507, + "grad_norm": 0.33936449885368347, + "learning_rate": 9.613749114103473e-05, + "loss": 1.9311, + "step": 773 + }, + { + "epoch": 0.23493701623918653, + "grad_norm": 0.34984657168388367, + "learning_rate": 9.613242887516453e-05, + "loss": 1.9532, + "step": 774 + }, + { + "epoch": 0.235240552435878, + "grad_norm": 0.3651373088359833, + "learning_rate": 9.612736660929432e-05, + "loss": 1.8815, + "step": 775 + }, + { + "epoch": 0.23554408863256943, + "grad_norm": 0.4317852854728699, + "learning_rate": 9.612230434342412e-05, + "loss": 2.0262, + "step": 776 + }, + { + "epoch": 0.2358476248292609, + "grad_norm": 0.375522255897522, + "learning_rate": 9.611724207755391e-05, + "loss": 1.9964, + "step": 777 + }, + { + "epoch": 0.23615116102595235, + "grad_norm": 0.37290844321250916, + "learning_rate": 9.611217981168371e-05, + "loss": 1.7456, + "step": 778 + }, + { + "epoch": 0.2364546972226438, + "grad_norm": 0.3768545985221863, + "learning_rate": 9.61071175458135e-05, + "loss": 1.9591, + "step": 779 + }, + { + "epoch": 0.23675823341933525, + "grad_norm": 0.3147246837615967, + "learning_rate": 9.61020552799433e-05, + "loss": 1.4033, + "step": 780 + }, + { + "epoch": 
0.2370617696160267, + "grad_norm": 0.4480874240398407, + "learning_rate": 9.60969930140731e-05, + "loss": 1.9598, + "step": 781 + }, + { + "epoch": 0.23736530581271817, + "grad_norm": 0.7287562489509583, + "learning_rate": 9.60919307482029e-05, + "loss": 2.0097, + "step": 782 + }, + { + "epoch": 0.23766884200940963, + "grad_norm": 0.36199334263801575, + "learning_rate": 9.60868684823327e-05, + "loss": 1.8089, + "step": 783 + }, + { + "epoch": 0.23797237820610107, + "grad_norm": 0.32855263352394104, + "learning_rate": 9.608180621646249e-05, + "loss": 2.0199, + "step": 784 + }, + { + "epoch": 0.23827591440279253, + "grad_norm": 0.37182894349098206, + "learning_rate": 9.607674395059229e-05, + "loss": 1.7253, + "step": 785 + }, + { + "epoch": 0.238579450599484, + "grad_norm": 0.3365595042705536, + "learning_rate": 9.607168168472208e-05, + "loss": 1.9308, + "step": 786 + }, + { + "epoch": 0.23888298679617545, + "grad_norm": 0.400685578584671, + "learning_rate": 9.606661941885187e-05, + "loss": 1.8939, + "step": 787 + }, + { + "epoch": 0.2391865229928669, + "grad_norm": 0.6354159116744995, + "learning_rate": 9.606155715298167e-05, + "loss": 2.1476, + "step": 788 + }, + { + "epoch": 0.23949005918955835, + "grad_norm": 0.4196738600730896, + "learning_rate": 9.605649488711148e-05, + "loss": 1.8457, + "step": 789 + }, + { + "epoch": 0.2397935953862498, + "grad_norm": 0.35839545726776123, + "learning_rate": 9.605143262124127e-05, + "loss": 1.824, + "step": 790 + }, + { + "epoch": 0.24009713158294127, + "grad_norm": 0.3597940504550934, + "learning_rate": 9.604637035537107e-05, + "loss": 1.9583, + "step": 791 + }, + { + "epoch": 0.24040066777963273, + "grad_norm": 0.5783160924911499, + "learning_rate": 9.604130808950088e-05, + "loss": 2.2, + "step": 792 + }, + { + "epoch": 0.24070420397632417, + "grad_norm": 0.3544808030128479, + "learning_rate": 9.603624582363067e-05, + "loss": 2.1092, + "step": 793 + }, + { + "epoch": 0.24100774017301563, + "grad_norm": 0.41170623898506165, + "learning_rate": 9.603118355776047e-05, + "loss": 1.6004, + "step": 794 + }, + { + "epoch": 0.2413112763697071, + "grad_norm": 0.3832992613315582, + "learning_rate": 9.602612129189026e-05, + "loss": 1.4981, + "step": 795 + }, + { + "epoch": 0.24161481256639855, + "grad_norm": 0.5239993333816528, + "learning_rate": 9.602105902602006e-05, + "loss": 1.6026, + "step": 796 + }, + { + "epoch": 0.24191834876308999, + "grad_norm": 0.38445138931274414, + "learning_rate": 9.601599676014985e-05, + "loss": 1.5765, + "step": 797 + }, + { + "epoch": 0.24222188495978145, + "grad_norm": 0.38520511984825134, + "learning_rate": 9.601093449427964e-05, + "loss": 2.1069, + "step": 798 + }, + { + "epoch": 0.2425254211564729, + "grad_norm": 0.3519560694694519, + "learning_rate": 9.600587222840944e-05, + "loss": 1.8896, + "step": 799 + }, + { + "epoch": 0.24282895735316437, + "grad_norm": 0.5392457246780396, + "learning_rate": 9.600080996253923e-05, + "loss": 1.6273, + "step": 800 + }, + { + "epoch": 0.24313249354985583, + "grad_norm": 0.4213111996650696, + "learning_rate": 9.599574769666904e-05, + "loss": 1.489, + "step": 801 + }, + { + "epoch": 0.24343602974654727, + "grad_norm": 0.4006531834602356, + "learning_rate": 9.599068543079884e-05, + "loss": 1.9842, + "step": 802 + }, + { + "epoch": 0.24373956594323873, + "grad_norm": 0.3792324364185333, + "learning_rate": 9.598562316492863e-05, + "loss": 1.727, + "step": 803 + }, + { + "epoch": 0.2440431021399302, + "grad_norm": 0.3555270731449127, + "learning_rate": 9.598056089905843e-05, + "loss": 1.68, + 
"step": 804 + }, + { + "epoch": 0.24434663833662165, + "grad_norm": 0.33837342262268066, + "learning_rate": 9.597549863318822e-05, + "loss": 2.0709, + "step": 805 + }, + { + "epoch": 0.2446501745333131, + "grad_norm": 0.3812510371208191, + "learning_rate": 9.597043636731802e-05, + "loss": 2.1211, + "step": 806 + }, + { + "epoch": 0.24495371073000455, + "grad_norm": 0.33870792388916016, + "learning_rate": 9.596537410144781e-05, + "loss": 2.1047, + "step": 807 + }, + { + "epoch": 0.245257246926696, + "grad_norm": 0.3948252201080322, + "learning_rate": 9.59603118355776e-05, + "loss": 1.7553, + "step": 808 + }, + { + "epoch": 0.24556078312338747, + "grad_norm": 0.39410725235939026, + "learning_rate": 9.59552495697074e-05, + "loss": 1.9383, + "step": 809 + }, + { + "epoch": 0.2458643193200789, + "grad_norm": 0.37794989347457886, + "learning_rate": 9.59501873038372e-05, + "loss": 1.9115, + "step": 810 + }, + { + "epoch": 0.24616785551677037, + "grad_norm": 1.6270610094070435, + "learning_rate": 9.5945125037967e-05, + "loss": 1.8472, + "step": 811 + }, + { + "epoch": 0.24647139171346183, + "grad_norm": 0.3724587559700012, + "learning_rate": 9.59400627720968e-05, + "loss": 1.9087, + "step": 812 + }, + { + "epoch": 0.2467749279101533, + "grad_norm": 0.4097403585910797, + "learning_rate": 9.59350005062266e-05, + "loss": 1.8325, + "step": 813 + }, + { + "epoch": 0.24707846410684475, + "grad_norm": 0.4052940905094147, + "learning_rate": 9.592993824035639e-05, + "loss": 2.0241, + "step": 814 + }, + { + "epoch": 0.2473820003035362, + "grad_norm": 0.3887682557106018, + "learning_rate": 9.592487597448618e-05, + "loss": 1.6114, + "step": 815 + }, + { + "epoch": 0.24768553650022765, + "grad_norm": 0.404450386762619, + "learning_rate": 9.591981370861598e-05, + "loss": 1.8384, + "step": 816 + }, + { + "epoch": 0.2479890726969191, + "grad_norm": 0.7955893874168396, + "learning_rate": 9.591475144274577e-05, + "loss": 2.2149, + "step": 817 + }, + { + "epoch": 0.24829260889361057, + "grad_norm": 4.355859279632568, + "learning_rate": 9.590968917687557e-05, + "loss": 2.3753, + "step": 818 + }, + { + "epoch": 0.248596145090302, + "grad_norm": 0.3698444962501526, + "learning_rate": 9.590462691100536e-05, + "loss": 1.7354, + "step": 819 + }, + { + "epoch": 0.24889968128699347, + "grad_norm": 0.3658899962902069, + "learning_rate": 9.589956464513517e-05, + "loss": 1.7803, + "step": 820 + }, + { + "epoch": 0.24920321748368493, + "grad_norm": 0.405072957277298, + "learning_rate": 9.589450237926497e-05, + "loss": 1.7684, + "step": 821 + }, + { + "epoch": 0.2495067536803764, + "grad_norm": 0.7590973973274231, + "learning_rate": 9.588944011339476e-05, + "loss": 1.9466, + "step": 822 + }, + { + "epoch": 0.24981028987706785, + "grad_norm": 0.5217581987380981, + "learning_rate": 9.588437784752456e-05, + "loss": 2.1281, + "step": 823 + }, + { + "epoch": 0.2501138260737593, + "grad_norm": 0.3716435134410858, + "learning_rate": 9.587931558165435e-05, + "loss": 2.114, + "step": 824 + }, + { + "epoch": 0.25041736227045075, + "grad_norm": 0.44017624855041504, + "learning_rate": 9.587425331578414e-05, + "loss": 2.0445, + "step": 825 + }, + { + "epoch": 0.2507208984671422, + "grad_norm": 0.370370090007782, + "learning_rate": 9.586919104991394e-05, + "loss": 1.8674, + "step": 826 + }, + { + "epoch": 0.2510244346638337, + "grad_norm": 0.32125499844551086, + "learning_rate": 9.586412878404373e-05, + "loss": 1.4129, + "step": 827 + }, + { + "epoch": 0.2513279708605251, + "grad_norm": 0.4143073856830597, + "learning_rate": 
9.585906651817353e-05, + "loss": 1.9895, + "step": 828 + }, + { + "epoch": 0.2516315070572166, + "grad_norm": 0.3492576777935028, + "learning_rate": 9.585400425230334e-05, + "loss": 2.0669, + "step": 829 + }, + { + "epoch": 0.25193504325390803, + "grad_norm": 0.4044751524925232, + "learning_rate": 9.584894198643313e-05, + "loss": 1.5909, + "step": 830 + }, + { + "epoch": 0.25223857945059946, + "grad_norm": 0.3410158157348633, + "learning_rate": 9.584387972056293e-05, + "loss": 1.7485, + "step": 831 + }, + { + "epoch": 0.25254211564729095, + "grad_norm": 0.340320348739624, + "learning_rate": 9.583881745469272e-05, + "loss": 1.8897, + "step": 832 + }, + { + "epoch": 0.2528456518439824, + "grad_norm": 0.35516276955604553, + "learning_rate": 9.583375518882252e-05, + "loss": 1.6332, + "step": 833 + }, + { + "epoch": 0.2531491880406738, + "grad_norm": 0.4099842309951782, + "learning_rate": 9.582869292295232e-05, + "loss": 1.5617, + "step": 834 + }, + { + "epoch": 0.2534527242373653, + "grad_norm": 0.38086098432540894, + "learning_rate": 9.582363065708212e-05, + "loss": 2.0837, + "step": 835 + }, + { + "epoch": 0.25375626043405675, + "grad_norm": 0.8040663003921509, + "learning_rate": 9.581856839121191e-05, + "loss": 1.8587, + "step": 836 + }, + { + "epoch": 0.25405979663074824, + "grad_norm": 0.41297683119773865, + "learning_rate": 9.581350612534171e-05, + "loss": 1.9602, + "step": 837 + }, + { + "epoch": 0.25436333282743967, + "grad_norm": 0.38155442476272583, + "learning_rate": 9.58084438594715e-05, + "loss": 1.375, + "step": 838 + }, + { + "epoch": 0.2546668690241311, + "grad_norm": 0.3956829905509949, + "learning_rate": 9.58033815936013e-05, + "loss": 1.9617, + "step": 839 + }, + { + "epoch": 0.2549704052208226, + "grad_norm": 0.38675928115844727, + "learning_rate": 9.579831932773111e-05, + "loss": 1.8186, + "step": 840 + }, + { + "epoch": 0.255273941417514, + "grad_norm": 0.33989018201828003, + "learning_rate": 9.57932570618609e-05, + "loss": 2.1734, + "step": 841 + }, + { + "epoch": 0.2555774776142055, + "grad_norm": 0.3240448534488678, + "learning_rate": 9.57881947959907e-05, + "loss": 1.6238, + "step": 842 + }, + { + "epoch": 0.25588101381089695, + "grad_norm": 0.6117075681686401, + "learning_rate": 9.578313253012049e-05, + "loss": 1.986, + "step": 843 + }, + { + "epoch": 0.2561845500075884, + "grad_norm": 0.3781290650367737, + "learning_rate": 9.577807026425029e-05, + "loss": 2.0021, + "step": 844 + }, + { + "epoch": 0.2564880862042799, + "grad_norm": 0.4373374879360199, + "learning_rate": 9.577300799838008e-05, + "loss": 2.0195, + "step": 845 + }, + { + "epoch": 0.2567916224009713, + "grad_norm": 0.4125923216342926, + "learning_rate": 9.576794573250988e-05, + "loss": 1.9412, + "step": 846 + }, + { + "epoch": 0.2570951585976628, + "grad_norm": 0.3557007908821106, + "learning_rate": 9.576288346663967e-05, + "loss": 1.8098, + "step": 847 + }, + { + "epoch": 0.25739869479435423, + "grad_norm": 0.49475541710853577, + "learning_rate": 9.575782120076947e-05, + "loss": 1.5756, + "step": 848 + }, + { + "epoch": 0.25770223099104567, + "grad_norm": 0.3507518768310547, + "learning_rate": 9.575275893489926e-05, + "loss": 1.6413, + "step": 849 + }, + { + "epoch": 0.25800576718773716, + "grad_norm": 0.39508333802223206, + "learning_rate": 9.574769666902907e-05, + "loss": 1.9777, + "step": 850 + }, + { + "epoch": 0.2583093033844286, + "grad_norm": 0.328807532787323, + "learning_rate": 9.574263440315886e-05, + "loss": 1.4948, + "step": 851 + }, + { + "epoch": 0.25861283958112, + "grad_norm": 
0.3154551386833191, + "learning_rate": 9.573757213728866e-05, + "loss": 1.7809, + "step": 852 + }, + { + "epoch": 0.2589163757778115, + "grad_norm": 0.502554178237915, + "learning_rate": 9.573250987141845e-05, + "loss": 1.4369, + "step": 853 + }, + { + "epoch": 0.25921991197450295, + "grad_norm": 0.4416670799255371, + "learning_rate": 9.572744760554825e-05, + "loss": 1.7364, + "step": 854 + }, + { + "epoch": 0.25952344817119444, + "grad_norm": 0.43228060007095337, + "learning_rate": 9.572238533967804e-05, + "loss": 1.3281, + "step": 855 + }, + { + "epoch": 0.25982698436788587, + "grad_norm": 0.3714723289012909, + "learning_rate": 9.571732307380784e-05, + "loss": 2.0893, + "step": 856 + }, + { + "epoch": 0.2601305205645773, + "grad_norm": 0.3309679925441742, + "learning_rate": 9.571226080793763e-05, + "loss": 1.7982, + "step": 857 + }, + { + "epoch": 0.2604340567612688, + "grad_norm": 0.3709767460823059, + "learning_rate": 9.570719854206743e-05, + "loss": 1.8628, + "step": 858 + }, + { + "epoch": 0.26073759295796023, + "grad_norm": 0.6020816564559937, + "learning_rate": 9.570213627619724e-05, + "loss": 2.0077, + "step": 859 + }, + { + "epoch": 0.2610411291546517, + "grad_norm": 0.30620431900024414, + "learning_rate": 9.569707401032703e-05, + "loss": 1.8834, + "step": 860 + }, + { + "epoch": 0.26134466535134315, + "grad_norm": 0.41518962383270264, + "learning_rate": 9.569201174445683e-05, + "loss": 1.8025, + "step": 861 + }, + { + "epoch": 0.2616482015480346, + "grad_norm": 0.3919786512851715, + "learning_rate": 9.568694947858662e-05, + "loss": 1.995, + "step": 862 + }, + { + "epoch": 0.2619517377447261, + "grad_norm": 0.47429168224334717, + "learning_rate": 9.568188721271641e-05, + "loss": 1.9423, + "step": 863 + }, + { + "epoch": 0.2622552739414175, + "grad_norm": 0.8941421508789062, + "learning_rate": 9.567682494684621e-05, + "loss": 1.5046, + "step": 864 + }, + { + "epoch": 0.26255881013810894, + "grad_norm": 0.4357859194278717, + "learning_rate": 9.5671762680976e-05, + "loss": 2.0023, + "step": 865 + }, + { + "epoch": 0.26286234633480043, + "grad_norm": 0.3873944878578186, + "learning_rate": 9.56667004151058e-05, + "loss": 2.0607, + "step": 866 + }, + { + "epoch": 0.26316588253149187, + "grad_norm": 0.4355853497982025, + "learning_rate": 9.56616381492356e-05, + "loss": 1.8254, + "step": 867 + }, + { + "epoch": 0.26346941872818336, + "grad_norm": 0.3882213234901428, + "learning_rate": 9.56565758833654e-05, + "loss": 1.7809, + "step": 868 + }, + { + "epoch": 0.2637729549248748, + "grad_norm": 0.4021656811237335, + "learning_rate": 9.56515136174952e-05, + "loss": 2.1321, + "step": 869 + }, + { + "epoch": 0.2640764911215662, + "grad_norm": 0.43587526679039, + "learning_rate": 9.564645135162499e-05, + "loss": 1.7865, + "step": 870 + }, + { + "epoch": 0.2643800273182577, + "grad_norm": 0.364045649766922, + "learning_rate": 9.564138908575479e-05, + "loss": 1.8173, + "step": 871 + }, + { + "epoch": 0.26468356351494915, + "grad_norm": 0.3956625461578369, + "learning_rate": 9.563632681988458e-05, + "loss": 1.4822, + "step": 872 + }, + { + "epoch": 0.26498709971164064, + "grad_norm": 0.40755051374435425, + "learning_rate": 9.563126455401438e-05, + "loss": 1.9418, + "step": 873 + }, + { + "epoch": 0.2652906359083321, + "grad_norm": 0.39405086636543274, + "learning_rate": 9.562620228814417e-05, + "loss": 1.4529, + "step": 874 + }, + { + "epoch": 0.2655941721050235, + "grad_norm": 0.4400351047515869, + "learning_rate": 9.562114002227397e-05, + "loss": 2.1095, + "step": 875 + }, + { + "epoch": 
0.265897708301715, + "grad_norm": 0.40135496854782104, + "learning_rate": 9.561607775640376e-05, + "loss": 1.9462, + "step": 876 + }, + { + "epoch": 0.26620124449840643, + "grad_norm": 0.5949604511260986, + "learning_rate": 9.561101549053356e-05, + "loss": 1.8797, + "step": 877 + }, + { + "epoch": 0.26650478069509786, + "grad_norm": 0.38301005959510803, + "learning_rate": 9.560595322466336e-05, + "loss": 2.0887, + "step": 878 + }, + { + "epoch": 0.26680831689178935, + "grad_norm": 0.6215627789497375, + "learning_rate": 9.560089095879317e-05, + "loss": 1.7846, + "step": 879 + }, + { + "epoch": 0.2671118530884808, + "grad_norm": 0.4041058123111725, + "learning_rate": 9.559582869292297e-05, + "loss": 1.5127, + "step": 880 + }, + { + "epoch": 0.2674153892851723, + "grad_norm": 0.30281975865364075, + "learning_rate": 9.559076642705276e-05, + "loss": 1.8487, + "step": 881 + }, + { + "epoch": 0.2677189254818637, + "grad_norm": 0.34536200761795044, + "learning_rate": 9.558570416118256e-05, + "loss": 1.8976, + "step": 882 + }, + { + "epoch": 0.26802246167855515, + "grad_norm": 0.367245614528656, + "learning_rate": 9.558064189531235e-05, + "loss": 1.9804, + "step": 883 + }, + { + "epoch": 0.26832599787524664, + "grad_norm": 0.41750359535217285, + "learning_rate": 9.557557962944215e-05, + "loss": 1.5932, + "step": 884 + }, + { + "epoch": 0.26862953407193807, + "grad_norm": 0.7777047157287598, + "learning_rate": 9.557051736357194e-05, + "loss": 1.8513, + "step": 885 + }, + { + "epoch": 0.26893307026862956, + "grad_norm": 0.3720252215862274, + "learning_rate": 9.556545509770174e-05, + "loss": 2.1819, + "step": 886 + }, + { + "epoch": 0.269236606465321, + "grad_norm": 0.7321712970733643, + "learning_rate": 9.556039283183153e-05, + "loss": 1.4653, + "step": 887 + }, + { + "epoch": 0.2695401426620124, + "grad_norm": 0.4140429198741913, + "learning_rate": 9.555533056596133e-05, + "loss": 1.9816, + "step": 888 + }, + { + "epoch": 0.2698436788587039, + "grad_norm": 0.40684935450553894, + "learning_rate": 9.555026830009113e-05, + "loss": 1.5866, + "step": 889 + }, + { + "epoch": 0.27014721505539535, + "grad_norm": 0.4067225754261017, + "learning_rate": 9.554520603422093e-05, + "loss": 1.5951, + "step": 890 + }, + { + "epoch": 0.2704507512520868, + "grad_norm": 0.34240391850471497, + "learning_rate": 9.554014376835072e-05, + "loss": 1.9076, + "step": 891 + }, + { + "epoch": 0.2707542874487783, + "grad_norm": 0.4634522795677185, + "learning_rate": 9.553508150248052e-05, + "loss": 1.9856, + "step": 892 + }, + { + "epoch": 0.2710578236454697, + "grad_norm": 0.408015638589859, + "learning_rate": 9.553001923661031e-05, + "loss": 1.7997, + "step": 893 + }, + { + "epoch": 0.2713613598421612, + "grad_norm": 0.3894648253917694, + "learning_rate": 9.552495697074011e-05, + "loss": 1.8381, + "step": 894 + }, + { + "epoch": 0.27166489603885263, + "grad_norm": 0.37494730949401855, + "learning_rate": 9.55198947048699e-05, + "loss": 2.0548, + "step": 895 + }, + { + "epoch": 0.27196843223554407, + "grad_norm": 0.39796411991119385, + "learning_rate": 9.55148324389997e-05, + "loss": 1.9272, + "step": 896 + }, + { + "epoch": 0.27227196843223556, + "grad_norm": 0.40153494477272034, + "learning_rate": 9.550977017312949e-05, + "loss": 1.7136, + "step": 897 + }, + { + "epoch": 0.272575504628927, + "grad_norm": 0.39771386981010437, + "learning_rate": 9.55047079072593e-05, + "loss": 2.1017, + "step": 898 + }, + { + "epoch": 0.2728790408256185, + "grad_norm": 0.4085974097251892, + "learning_rate": 9.54996456413891e-05, + "loss": 1.3951, 
+ "step": 899 + }, + { + "epoch": 0.2731825770223099, + "grad_norm": 0.39849239587783813, + "learning_rate": 9.549458337551889e-05, + "loss": 1.9988, + "step": 900 + }, + { + "epoch": 0.27348611321900135, + "grad_norm": 0.38662001490592957, + "learning_rate": 9.548952110964868e-05, + "loss": 1.8491, + "step": 901 + }, + { + "epoch": 0.27378964941569284, + "grad_norm": 0.38078710436820984, + "learning_rate": 9.548445884377848e-05, + "loss": 1.9, + "step": 902 + }, + { + "epoch": 0.27409318561238427, + "grad_norm": 0.3548724949359894, + "learning_rate": 9.547939657790827e-05, + "loss": 1.8754, + "step": 903 + }, + { + "epoch": 0.2743967218090757, + "grad_norm": 0.37712323665618896, + "learning_rate": 9.547433431203807e-05, + "loss": 1.5497, + "step": 904 + }, + { + "epoch": 0.2747002580057672, + "grad_norm": 0.4060449004173279, + "learning_rate": 9.546927204616786e-05, + "loss": 1.7231, + "step": 905 + }, + { + "epoch": 0.27500379420245863, + "grad_norm": 0.42080479860305786, + "learning_rate": 9.546420978029766e-05, + "loss": 2.1538, + "step": 906 + }, + { + "epoch": 0.2753073303991501, + "grad_norm": 0.4034046232700348, + "learning_rate": 9.545914751442747e-05, + "loss": 1.7335, + "step": 907 + }, + { + "epoch": 0.27561086659584155, + "grad_norm": 0.3676345646381378, + "learning_rate": 9.545408524855726e-05, + "loss": 1.6193, + "step": 908 + }, + { + "epoch": 0.275914402792533, + "grad_norm": 0.3349851965904236, + "learning_rate": 9.544902298268706e-05, + "loss": 1.8997, + "step": 909 + }, + { + "epoch": 0.2762179389892245, + "grad_norm": 0.3676302134990692, + "learning_rate": 9.544396071681685e-05, + "loss": 1.4031, + "step": 910 + }, + { + "epoch": 0.2765214751859159, + "grad_norm": 0.36593666672706604, + "learning_rate": 9.543889845094665e-05, + "loss": 1.8838, + "step": 911 + }, + { + "epoch": 0.2768250113826074, + "grad_norm": 0.3793712258338928, + "learning_rate": 9.543383618507644e-05, + "loss": 1.5949, + "step": 912 + }, + { + "epoch": 0.27712854757929883, + "grad_norm": 0.47586631774902344, + "learning_rate": 9.542877391920624e-05, + "loss": 1.5687, + "step": 913 + }, + { + "epoch": 0.27743208377599027, + "grad_norm": 0.38850024342536926, + "learning_rate": 9.542371165333603e-05, + "loss": 1.7336, + "step": 914 + }, + { + "epoch": 0.27773561997268176, + "grad_norm": 0.4039680063724518, + "learning_rate": 9.541864938746583e-05, + "loss": 2.0476, + "step": 915 + }, + { + "epoch": 0.2780391561693732, + "grad_norm": 0.40498992800712585, + "learning_rate": 9.541358712159562e-05, + "loss": 1.6699, + "step": 916 + }, + { + "epoch": 0.2783426923660646, + "grad_norm": 0.39011168479919434, + "learning_rate": 9.540852485572543e-05, + "loss": 1.9935, + "step": 917 + }, + { + "epoch": 0.2786462285627561, + "grad_norm": 0.3864549696445465, + "learning_rate": 9.540346258985522e-05, + "loss": 1.8271, + "step": 918 + }, + { + "epoch": 0.27894976475944755, + "grad_norm": 0.33493247628211975, + "learning_rate": 9.539840032398502e-05, + "loss": 1.856, + "step": 919 + }, + { + "epoch": 0.27925330095613904, + "grad_norm": 0.34132060408592224, + "learning_rate": 9.539333805811481e-05, + "loss": 1.8836, + "step": 920 + }, + { + "epoch": 0.2795568371528305, + "grad_norm": 1.5312176942825317, + "learning_rate": 9.538827579224461e-05, + "loss": 2.0207, + "step": 921 + }, + { + "epoch": 0.2798603733495219, + "grad_norm": 0.333932489156723, + "learning_rate": 9.53832135263744e-05, + "loss": 2.0908, + "step": 922 + }, + { + "epoch": 0.2801639095462134, + "grad_norm": 0.3688269555568695, + "learning_rate": 
9.537815126050421e-05, + "loss": 1.8464, + "step": 923 + }, + { + "epoch": 0.28046744574290483, + "grad_norm": 0.4097294211387634, + "learning_rate": 9.5373088994634e-05, + "loss": 1.6891, + "step": 924 + }, + { + "epoch": 0.2807709819395963, + "grad_norm": 0.3737453818321228, + "learning_rate": 9.53680267287638e-05, + "loss": 2.0549, + "step": 925 + }, + { + "epoch": 0.28107451813628775, + "grad_norm": 0.6109428405761719, + "learning_rate": 9.53629644628936e-05, + "loss": 1.9437, + "step": 926 + }, + { + "epoch": 0.2813780543329792, + "grad_norm": 0.46215322613716125, + "learning_rate": 9.535790219702339e-05, + "loss": 1.5133, + "step": 927 + }, + { + "epoch": 0.2816815905296707, + "grad_norm": 0.8070108294487, + "learning_rate": 9.53528399311532e-05, + "loss": 1.8843, + "step": 928 + }, + { + "epoch": 0.2819851267263621, + "grad_norm": 0.40304142236709595, + "learning_rate": 9.534777766528299e-05, + "loss": 1.9742, + "step": 929 + }, + { + "epoch": 0.2822886629230536, + "grad_norm": 0.35046708583831787, + "learning_rate": 9.534271539941279e-05, + "loss": 1.8969, + "step": 930 + }, + { + "epoch": 0.28259219911974504, + "grad_norm": 0.37241777777671814, + "learning_rate": 9.533765313354258e-05, + "loss": 1.8138, + "step": 931 + }, + { + "epoch": 0.28289573531643647, + "grad_norm": 0.38689473271369934, + "learning_rate": 9.533259086767238e-05, + "loss": 1.669, + "step": 932 + }, + { + "epoch": 0.28319927151312796, + "grad_norm": 0.3672066926956177, + "learning_rate": 9.532752860180217e-05, + "loss": 1.9093, + "step": 933 + }, + { + "epoch": 0.2835028077098194, + "grad_norm": 0.4022217392921448, + "learning_rate": 9.532246633593197e-05, + "loss": 1.6959, + "step": 934 + }, + { + "epoch": 0.2838063439065108, + "grad_norm": 0.3894721269607544, + "learning_rate": 9.531740407006176e-05, + "loss": 1.9898, + "step": 935 + }, + { + "epoch": 0.2841098801032023, + "grad_norm": 0.4395015835762024, + "learning_rate": 9.531234180419156e-05, + "loss": 1.5538, + "step": 936 + }, + { + "epoch": 0.28441341629989375, + "grad_norm": 0.8121886849403381, + "learning_rate": 9.530727953832136e-05, + "loss": 1.7403, + "step": 937 + }, + { + "epoch": 0.28471695249658524, + "grad_norm": 0.40073227882385254, + "learning_rate": 9.530221727245116e-05, + "loss": 2.0544, + "step": 938 + }, + { + "epoch": 0.2850204886932767, + "grad_norm": 0.3571331202983856, + "learning_rate": 9.529715500658095e-05, + "loss": 1.7157, + "step": 939 + }, + { + "epoch": 0.2853240248899681, + "grad_norm": 0.485147625207901, + "learning_rate": 9.529209274071075e-05, + "loss": 2.1489, + "step": 940 + }, + { + "epoch": 0.2856275610866596, + "grad_norm": 0.6882160305976868, + "learning_rate": 9.528703047484054e-05, + "loss": 1.8458, + "step": 941 + }, + { + "epoch": 0.28593109728335103, + "grad_norm": 0.7156968116760254, + "learning_rate": 9.528196820897034e-05, + "loss": 1.9529, + "step": 942 + }, + { + "epoch": 0.2862346334800425, + "grad_norm": 0.4198112487792969, + "learning_rate": 9.527690594310013e-05, + "loss": 2.0355, + "step": 943 + }, + { + "epoch": 0.28653816967673396, + "grad_norm": 0.4178343117237091, + "learning_rate": 9.527184367722993e-05, + "loss": 1.5801, + "step": 944 + }, + { + "epoch": 0.2868417058734254, + "grad_norm": 0.3721866011619568, + "learning_rate": 9.526678141135972e-05, + "loss": 2.1657, + "step": 945 + }, + { + "epoch": 0.2871452420701169, + "grad_norm": 0.38586944341659546, + "learning_rate": 9.526171914548953e-05, + "loss": 1.4879, + "step": 946 + }, + { + "epoch": 0.2874487782668083, + "grad_norm": 
0.42727598547935486, + "learning_rate": 9.525665687961933e-05, + "loss": 1.8434, + "step": 947 + }, + { + "epoch": 0.28775231446349975, + "grad_norm": 0.3686284124851227, + "learning_rate": 9.525159461374912e-05, + "loss": 1.9346, + "step": 948 + }, + { + "epoch": 0.28805585066019124, + "grad_norm": 0.41984260082244873, + "learning_rate": 9.524653234787892e-05, + "loss": 1.4474, + "step": 949 + }, + { + "epoch": 0.28835938685688267, + "grad_norm": 0.4530123174190521, + "learning_rate": 9.524147008200871e-05, + "loss": 1.6863, + "step": 950 + }, + { + "epoch": 0.28866292305357416, + "grad_norm": 0.40047594904899597, + "learning_rate": 9.52364078161385e-05, + "loss": 1.908, + "step": 951 + }, + { + "epoch": 0.2889664592502656, + "grad_norm": 0.3757762610912323, + "learning_rate": 9.52313455502683e-05, + "loss": 1.6235, + "step": 952 + }, + { + "epoch": 0.28926999544695703, + "grad_norm": 0.4337126612663269, + "learning_rate": 9.52262832843981e-05, + "loss": 1.6229, + "step": 953 + }, + { + "epoch": 0.2895735316436485, + "grad_norm": 0.4407886564731598, + "learning_rate": 9.522122101852789e-05, + "loss": 1.875, + "step": 954 + }, + { + "epoch": 0.28987706784033995, + "grad_norm": 0.5278657674789429, + "learning_rate": 9.521615875265768e-05, + "loss": 1.7199, + "step": 955 + }, + { + "epoch": 0.29018060403703144, + "grad_norm": 0.4441334307193756, + "learning_rate": 9.521109648678749e-05, + "loss": 1.1319, + "step": 956 + }, + { + "epoch": 0.2904841402337229, + "grad_norm": 0.3992663025856018, + "learning_rate": 9.520603422091729e-05, + "loss": 1.6948, + "step": 957 + }, + { + "epoch": 0.2907876764304143, + "grad_norm": 0.3979544937610626, + "learning_rate": 9.520097195504708e-05, + "loss": 1.8689, + "step": 958 + }, + { + "epoch": 0.2910912126271058, + "grad_norm": 0.4011298418045044, + "learning_rate": 9.519590968917688e-05, + "loss": 1.9491, + "step": 959 + }, + { + "epoch": 0.29139474882379723, + "grad_norm": 0.4377354383468628, + "learning_rate": 9.519084742330667e-05, + "loss": 1.7274, + "step": 960 + }, + { + "epoch": 0.29169828502048867, + "grad_norm": 0.5056617856025696, + "learning_rate": 9.518578515743647e-05, + "loss": 2.006, + "step": 961 + }, + { + "epoch": 0.29200182121718016, + "grad_norm": 0.36736002564430237, + "learning_rate": 9.518072289156626e-05, + "loss": 1.6558, + "step": 962 + }, + { + "epoch": 0.2923053574138716, + "grad_norm": 0.37966540455818176, + "learning_rate": 9.517566062569606e-05, + "loss": 2.0098, + "step": 963 + }, + { + "epoch": 0.2926088936105631, + "grad_norm": 0.4026505947113037, + "learning_rate": 9.517059835982585e-05, + "loss": 1.868, + "step": 964 + }, + { + "epoch": 0.2929124298072545, + "grad_norm": 0.461910218000412, + "learning_rate": 9.516553609395566e-05, + "loss": 2.1131, + "step": 965 + }, + { + "epoch": 0.29321596600394595, + "grad_norm": 0.4329175651073456, + "learning_rate": 9.516047382808545e-05, + "loss": 2.0068, + "step": 966 + }, + { + "epoch": 0.29351950220063744, + "grad_norm": 0.7611956000328064, + "learning_rate": 9.515541156221526e-05, + "loss": 1.9177, + "step": 967 + }, + { + "epoch": 0.2938230383973289, + "grad_norm": 0.6180218458175659, + "learning_rate": 9.515034929634506e-05, + "loss": 1.5603, + "step": 968 + }, + { + "epoch": 0.29412657459402036, + "grad_norm": 0.6556726694107056, + "learning_rate": 9.514528703047485e-05, + "loss": 2.1081, + "step": 969 + }, + { + "epoch": 0.2944301107907118, + "grad_norm": 0.3379404842853546, + "learning_rate": 9.514022476460465e-05, + "loss": 1.9701, + "step": 970 + }, + { + "epoch": 
0.29473364698740323, + "grad_norm": 0.42676112055778503, + "learning_rate": 9.513516249873444e-05, + "loss": 1.6116, + "step": 971 + }, + { + "epoch": 0.2950371831840947, + "grad_norm": 0.35374894738197327, + "learning_rate": 9.513010023286424e-05, + "loss": 2.0621, + "step": 972 + }, + { + "epoch": 0.29534071938078615, + "grad_norm": 0.33012476563453674, + "learning_rate": 9.512503796699403e-05, + "loss": 1.4534, + "step": 973 + }, + { + "epoch": 0.2956442555774776, + "grad_norm": 0.37993383407592773, + "learning_rate": 9.511997570112383e-05, + "loss": 1.6306, + "step": 974 + }, + { + "epoch": 0.2959477917741691, + "grad_norm": 0.47140204906463623, + "learning_rate": 9.511491343525362e-05, + "loss": 2.0465, + "step": 975 + }, + { + "epoch": 0.2962513279708605, + "grad_norm": 0.40235936641693115, + "learning_rate": 9.510985116938343e-05, + "loss": 1.8247, + "step": 976 + }, + { + "epoch": 0.296554864167552, + "grad_norm": 0.3992665112018585, + "learning_rate": 9.510478890351322e-05, + "loss": 1.5702, + "step": 977 + }, + { + "epoch": 0.29685840036424344, + "grad_norm": 0.4469521641731262, + "learning_rate": 9.509972663764302e-05, + "loss": 1.8811, + "step": 978 + }, + { + "epoch": 0.29716193656093487, + "grad_norm": 0.41400644183158875, + "learning_rate": 9.509466437177281e-05, + "loss": 1.5374, + "step": 979 + }, + { + "epoch": 0.29746547275762636, + "grad_norm": 0.36348387598991394, + "learning_rate": 9.508960210590261e-05, + "loss": 1.9022, + "step": 980 + }, + { + "epoch": 0.2977690089543178, + "grad_norm": 0.4069242477416992, + "learning_rate": 9.50845398400324e-05, + "loss": 2.0066, + "step": 981 + }, + { + "epoch": 0.2980725451510093, + "grad_norm": 0.3684113323688507, + "learning_rate": 9.50794775741622e-05, + "loss": 1.8972, + "step": 982 + }, + { + "epoch": 0.2983760813477007, + "grad_norm": 0.40827688574790955, + "learning_rate": 9.5074415308292e-05, + "loss": 2.0659, + "step": 983 + }, + { + "epoch": 0.29867961754439215, + "grad_norm": 0.32065409421920776, + "learning_rate": 9.506935304242179e-05, + "loss": 2.0008, + "step": 984 + }, + { + "epoch": 0.29898315374108364, + "grad_norm": 0.38805294036865234, + "learning_rate": 9.50642907765516e-05, + "loss": 1.5027, + "step": 985 + }, + { + "epoch": 0.2992866899377751, + "grad_norm": 0.3656708896160126, + "learning_rate": 9.505922851068139e-05, + "loss": 1.7931, + "step": 986 + }, + { + "epoch": 0.2995902261344665, + "grad_norm": 0.4354289770126343, + "learning_rate": 9.505416624481119e-05, + "loss": 2.1183, + "step": 987 + }, + { + "epoch": 0.299893762331158, + "grad_norm": 0.3970641493797302, + "learning_rate": 9.504910397894098e-05, + "loss": 1.8188, + "step": 988 + }, + { + "epoch": 0.30019729852784943, + "grad_norm": 0.35527995228767395, + "learning_rate": 9.504404171307078e-05, + "loss": 1.6329, + "step": 989 + }, + { + "epoch": 0.3005008347245409, + "grad_norm": 0.4018630385398865, + "learning_rate": 9.503897944720057e-05, + "loss": 1.993, + "step": 990 + }, + { + "epoch": 0.30080437092123236, + "grad_norm": 0.36514052748680115, + "learning_rate": 9.503391718133037e-05, + "loss": 2.0482, + "step": 991 + }, + { + "epoch": 0.3011079071179238, + "grad_norm": 0.3790993094444275, + "learning_rate": 9.502885491546016e-05, + "loss": 2.0286, + "step": 992 + }, + { + "epoch": 0.3014114433146153, + "grad_norm": 0.314779669046402, + "learning_rate": 9.502379264958995e-05, + "loss": 1.8135, + "step": 993 + }, + { + "epoch": 0.3017149795113067, + "grad_norm": 0.42383378744125366, + "learning_rate": 9.501873038371975e-05, + "loss": 
1.8783, + "step": 994 + }, + { + "epoch": 0.3020185157079982, + "grad_norm": 0.4036683738231659, + "learning_rate": 9.501366811784956e-05, + "loss": 1.6091, + "step": 995 + }, + { + "epoch": 0.30232205190468964, + "grad_norm": 0.3611324429512024, + "learning_rate": 9.500860585197935e-05, + "loss": 1.3388, + "step": 996 + }, + { + "epoch": 0.30262558810138107, + "grad_norm": 0.44210389256477356, + "learning_rate": 9.500354358610915e-05, + "loss": 1.6133, + "step": 997 + }, + { + "epoch": 0.30292912429807256, + "grad_norm": 0.37780526280403137, + "learning_rate": 9.499848132023894e-05, + "loss": 1.9993, + "step": 998 + }, + { + "epoch": 0.303232660494764, + "grad_norm": 0.469959557056427, + "learning_rate": 9.499341905436874e-05, + "loss": 1.8094, + "step": 999 + }, + { + "epoch": 0.30353619669145543, + "grad_norm": 0.38992664217948914, + "learning_rate": 9.498835678849853e-05, + "loss": 1.8975, + "step": 1000 + }, + { + "epoch": 0.3038397328881469, + "grad_norm": 0.44024091958999634, + "learning_rate": 9.498329452262833e-05, + "loss": 1.7081, + "step": 1001 + }, + { + "epoch": 0.30414326908483835, + "grad_norm": 0.32488685846328735, + "learning_rate": 9.497823225675812e-05, + "loss": 1.4921, + "step": 1002 + }, + { + "epoch": 0.30444680528152984, + "grad_norm": 0.7046712636947632, + "learning_rate": 9.497316999088792e-05, + "loss": 1.9693, + "step": 1003 + }, + { + "epoch": 0.3047503414782213, + "grad_norm": 0.39591220021247864, + "learning_rate": 9.496810772501772e-05, + "loss": 2.0266, + "step": 1004 + }, + { + "epoch": 0.3050538776749127, + "grad_norm": 0.371804416179657, + "learning_rate": 9.496304545914752e-05, + "loss": 1.9906, + "step": 1005 + }, + { + "epoch": 0.3053574138716042, + "grad_norm": 0.32893630862236023, + "learning_rate": 9.495798319327731e-05, + "loss": 1.9469, + "step": 1006 + }, + { + "epoch": 0.30566095006829563, + "grad_norm": 0.406531423330307, + "learning_rate": 9.495292092740711e-05, + "loss": 1.7575, + "step": 1007 + }, + { + "epoch": 0.3059644862649871, + "grad_norm": 0.3299405872821808, + "learning_rate": 9.49478586615369e-05, + "loss": 1.6457, + "step": 1008 + }, + { + "epoch": 0.30626802246167856, + "grad_norm": 0.40007394552230835, + "learning_rate": 9.49427963956667e-05, + "loss": 1.9291, + "step": 1009 + }, + { + "epoch": 0.30657155865837, + "grad_norm": 0.41286107897758484, + "learning_rate": 9.49377341297965e-05, + "loss": 1.9869, + "step": 1010 + }, + { + "epoch": 0.3068750948550615, + "grad_norm": 0.6297092437744141, + "learning_rate": 9.493267186392629e-05, + "loss": 2.1354, + "step": 1011 + }, + { + "epoch": 0.3071786310517529, + "grad_norm": 0.4763343334197998, + "learning_rate": 9.49276095980561e-05, + "loss": 1.6641, + "step": 1012 + }, + { + "epoch": 0.3074821672484444, + "grad_norm": 0.343218058347702, + "learning_rate": 9.492254733218589e-05, + "loss": 1.9556, + "step": 1013 + }, + { + "epoch": 0.30778570344513584, + "grad_norm": 0.4180206060409546, + "learning_rate": 9.491748506631569e-05, + "loss": 2.064, + "step": 1014 + }, + { + "epoch": 0.3080892396418273, + "grad_norm": 0.3307478725910187, + "learning_rate": 9.49124228004455e-05, + "loss": 1.9579, + "step": 1015 + }, + { + "epoch": 0.30839277583851876, + "grad_norm": 0.31935417652130127, + "learning_rate": 9.490736053457529e-05, + "loss": 2.0038, + "step": 1016 + }, + { + "epoch": 0.3086963120352102, + "grad_norm": 0.4078797399997711, + "learning_rate": 9.490229826870508e-05, + "loss": 1.6727, + "step": 1017 + }, + { + "epoch": 0.30899984823190163, + "grad_norm": 0.4393940269947052, + 
"learning_rate": 9.489723600283488e-05, + "loss": 1.9709, + "step": 1018 + }, + { + "epoch": 0.3093033844285931, + "grad_norm": 0.41586485505104065, + "learning_rate": 9.489217373696467e-05, + "loss": 1.9976, + "step": 1019 + }, + { + "epoch": 0.30960692062528455, + "grad_norm": 0.32988855242729187, + "learning_rate": 9.488711147109447e-05, + "loss": 2.1278, + "step": 1020 + }, + { + "epoch": 0.30991045682197604, + "grad_norm": 0.47184863686561584, + "learning_rate": 9.488204920522426e-05, + "loss": 1.8132, + "step": 1021 + }, + { + "epoch": 0.3102139930186675, + "grad_norm": 0.32716313004493713, + "learning_rate": 9.487698693935406e-05, + "loss": 1.6124, + "step": 1022 + }, + { + "epoch": 0.3105175292153589, + "grad_norm": 0.46906420588493347, + "learning_rate": 9.487192467348385e-05, + "loss": 1.9718, + "step": 1023 + }, + { + "epoch": 0.3108210654120504, + "grad_norm": 0.3436840772628784, + "learning_rate": 9.486686240761366e-05, + "loss": 1.809, + "step": 1024 + }, + { + "epoch": 0.31112460160874184, + "grad_norm": 0.39674249291419983, + "learning_rate": 9.486180014174346e-05, + "loss": 1.5307, + "step": 1025 + }, + { + "epoch": 0.3114281378054333, + "grad_norm": 0.40978574752807617, + "learning_rate": 9.485673787587325e-05, + "loss": 2.0251, + "step": 1026 + }, + { + "epoch": 0.31173167400212476, + "grad_norm": 0.39651399850845337, + "learning_rate": 9.485167561000305e-05, + "loss": 1.8872, + "step": 1027 + }, + { + "epoch": 0.3120352101988162, + "grad_norm": 0.7730064988136292, + "learning_rate": 9.484661334413284e-05, + "loss": 1.7339, + "step": 1028 + }, + { + "epoch": 0.3123387463955077, + "grad_norm": 0.36178770661354065, + "learning_rate": 9.484155107826264e-05, + "loss": 1.8594, + "step": 1029 + }, + { + "epoch": 0.3126422825921991, + "grad_norm": 0.4153605103492737, + "learning_rate": 9.483648881239243e-05, + "loss": 1.8687, + "step": 1030 + }, + { + "epoch": 0.31294581878889055, + "grad_norm": 0.41472381353378296, + "learning_rate": 9.483142654652222e-05, + "loss": 1.9665, + "step": 1031 + }, + { + "epoch": 0.31324935498558204, + "grad_norm": 0.3871115744113922, + "learning_rate": 9.482636428065202e-05, + "loss": 2.1858, + "step": 1032 + }, + { + "epoch": 0.3135528911822735, + "grad_norm": 0.33978626132011414, + "learning_rate": 9.482130201478181e-05, + "loss": 1.5615, + "step": 1033 + }, + { + "epoch": 0.31385642737896496, + "grad_norm": 0.33726009726524353, + "learning_rate": 9.481623974891162e-05, + "loss": 2.1119, + "step": 1034 + }, + { + "epoch": 0.3141599635756564, + "grad_norm": 0.35080355405807495, + "learning_rate": 9.481117748304142e-05, + "loss": 1.9497, + "step": 1035 + }, + { + "epoch": 0.31446349977234783, + "grad_norm": 0.37655749917030334, + "learning_rate": 9.480611521717121e-05, + "loss": 1.6486, + "step": 1036 + }, + { + "epoch": 0.3147670359690393, + "grad_norm": 0.3838097155094147, + "learning_rate": 9.480105295130101e-05, + "loss": 2.0504, + "step": 1037 + }, + { + "epoch": 0.31507057216573076, + "grad_norm": 0.3412497341632843, + "learning_rate": 9.47959906854308e-05, + "loss": 1.8417, + "step": 1038 + }, + { + "epoch": 0.31537410836242225, + "grad_norm": 0.3633384108543396, + "learning_rate": 9.47909284195606e-05, + "loss": 1.9713, + "step": 1039 + }, + { + "epoch": 0.3156776445591137, + "grad_norm": 0.332861989736557, + "learning_rate": 9.478586615369039e-05, + "loss": 1.8967, + "step": 1040 + }, + { + "epoch": 0.3159811807558051, + "grad_norm": 0.5054538249969482, + "learning_rate": 9.478080388782019e-05, + "loss": 1.8217, + "step": 1041 + }, + { + 
"epoch": 0.3162847169524966, + "grad_norm": 0.30825376510620117, + "learning_rate": 9.477574162194998e-05, + "loss": 1.8026, + "step": 1042 + }, + { + "epoch": 0.31658825314918804, + "grad_norm": 0.3759863078594208, + "learning_rate": 9.477067935607979e-05, + "loss": 1.6662, + "step": 1043 + }, + { + "epoch": 0.31689178934587947, + "grad_norm": 0.36408594250679016, + "learning_rate": 9.476561709020958e-05, + "loss": 2.3524, + "step": 1044 + }, + { + "epoch": 0.31719532554257096, + "grad_norm": 0.38226181268692017, + "learning_rate": 9.476055482433938e-05, + "loss": 1.8966, + "step": 1045 + }, + { + "epoch": 0.3174988617392624, + "grad_norm": 0.35480546951293945, + "learning_rate": 9.475549255846917e-05, + "loss": 1.9114, + "step": 1046 + }, + { + "epoch": 0.3178023979359539, + "grad_norm": 0.378701776266098, + "learning_rate": 9.475043029259897e-05, + "loss": 1.9151, + "step": 1047 + }, + { + "epoch": 0.3181059341326453, + "grad_norm": 1.0800230503082275, + "learning_rate": 9.474536802672876e-05, + "loss": 1.3396, + "step": 1048 + }, + { + "epoch": 0.31840947032933675, + "grad_norm": 0.4015067219734192, + "learning_rate": 9.474030576085856e-05, + "loss": 1.6889, + "step": 1049 + }, + { + "epoch": 0.31871300652602824, + "grad_norm": 0.35431405901908875, + "learning_rate": 9.473524349498835e-05, + "loss": 1.4716, + "step": 1050 + }, + { + "epoch": 0.3190165427227197, + "grad_norm": 0.4030434787273407, + "learning_rate": 9.473018122911815e-05, + "loss": 1.6192, + "step": 1051 + }, + { + "epoch": 0.31932007891941117, + "grad_norm": 0.4005342423915863, + "learning_rate": 9.472511896324796e-05, + "loss": 1.5092, + "step": 1052 + }, + { + "epoch": 0.3196236151161026, + "grad_norm": 1.130418062210083, + "learning_rate": 9.472005669737775e-05, + "loss": 1.5802, + "step": 1053 + }, + { + "epoch": 0.31992715131279403, + "grad_norm": 0.41232943534851074, + "learning_rate": 9.471499443150755e-05, + "loss": 2.0205, + "step": 1054 + }, + { + "epoch": 0.3202306875094855, + "grad_norm": 0.4155721366405487, + "learning_rate": 9.470993216563734e-05, + "loss": 1.88, + "step": 1055 + }, + { + "epoch": 0.32053422370617696, + "grad_norm": 0.36597010493278503, + "learning_rate": 9.470486989976715e-05, + "loss": 1.9922, + "step": 1056 + }, + { + "epoch": 0.3208377599028684, + "grad_norm": 0.8094148635864258, + "learning_rate": 9.469980763389694e-05, + "loss": 1.8267, + "step": 1057 + }, + { + "epoch": 0.3211412960995599, + "grad_norm": 0.36358359456062317, + "learning_rate": 9.469474536802674e-05, + "loss": 1.5307, + "step": 1058 + }, + { + "epoch": 0.3214448322962513, + "grad_norm": 0.400796502828598, + "learning_rate": 9.468968310215653e-05, + "loss": 1.9742, + "step": 1059 + }, + { + "epoch": 0.3217483684929428, + "grad_norm": 0.3251611888408661, + "learning_rate": 9.468462083628633e-05, + "loss": 1.7736, + "step": 1060 + }, + { + "epoch": 0.32205190468963424, + "grad_norm": 0.4060586988925934, + "learning_rate": 9.467955857041612e-05, + "loss": 1.7211, + "step": 1061 + }, + { + "epoch": 0.3223554408863257, + "grad_norm": 0.4181293547153473, + "learning_rate": 9.467449630454592e-05, + "loss": 1.5085, + "step": 1062 + }, + { + "epoch": 0.32265897708301716, + "grad_norm": 0.3514660894870758, + "learning_rate": 9.466943403867573e-05, + "loss": 1.8939, + "step": 1063 + }, + { + "epoch": 0.3229625132797086, + "grad_norm": 0.3337076008319855, + "learning_rate": 9.466437177280552e-05, + "loss": 1.6281, + "step": 1064 + }, + { + "epoch": 0.3232660494764001, + "grad_norm": 0.39011150598526, + "learning_rate": 
9.465930950693532e-05, + "loss": 2.3316, + "step": 1065 + }, + { + "epoch": 0.3235695856730915, + "grad_norm": 0.42054951190948486, + "learning_rate": 9.465424724106511e-05, + "loss": 1.9249, + "step": 1066 + }, + { + "epoch": 0.32387312186978295, + "grad_norm": 0.37516888976097107, + "learning_rate": 9.46491849751949e-05, + "loss": 1.9643, + "step": 1067 + }, + { + "epoch": 0.32417665806647444, + "grad_norm": 0.3549358546733856, + "learning_rate": 9.46441227093247e-05, + "loss": 1.9069, + "step": 1068 + }, + { + "epoch": 0.3244801942631659, + "grad_norm": 1.4541680812835693, + "learning_rate": 9.46390604434545e-05, + "loss": 1.4868, + "step": 1069 + }, + { + "epoch": 0.3247837304598573, + "grad_norm": 0.31561896204948425, + "learning_rate": 9.463399817758429e-05, + "loss": 1.8972, + "step": 1070 + }, + { + "epoch": 0.3250872666565488, + "grad_norm": 0.35816720128059387, + "learning_rate": 9.462893591171408e-05, + "loss": 2.0286, + "step": 1071 + }, + { + "epoch": 0.32539080285324024, + "grad_norm": 0.38618069887161255, + "learning_rate": 9.462387364584388e-05, + "loss": 1.7604, + "step": 1072 + }, + { + "epoch": 0.3256943390499317, + "grad_norm": 0.42617419362068176, + "learning_rate": 9.461881137997369e-05, + "loss": 1.2767, + "step": 1073 + }, + { + "epoch": 0.32599787524662316, + "grad_norm": 0.3996577262878418, + "learning_rate": 9.461374911410348e-05, + "loss": 2.0023, + "step": 1074 + }, + { + "epoch": 0.3263014114433146, + "grad_norm": 0.6627565026283264, + "learning_rate": 9.460868684823328e-05, + "loss": 2.2386, + "step": 1075 + }, + { + "epoch": 0.3266049476400061, + "grad_norm": 0.3753213882446289, + "learning_rate": 9.460362458236307e-05, + "loss": 1.6935, + "step": 1076 + }, + { + "epoch": 0.3269084838366975, + "grad_norm": 0.4097970724105835, + "learning_rate": 9.459856231649287e-05, + "loss": 1.59, + "step": 1077 + }, + { + "epoch": 0.327212020033389, + "grad_norm": 0.39637240767478943, + "learning_rate": 9.459350005062266e-05, + "loss": 1.5338, + "step": 1078 + }, + { + "epoch": 0.32751555623008044, + "grad_norm": 0.38365036249160767, + "learning_rate": 9.458843778475246e-05, + "loss": 2.0128, + "step": 1079 + }, + { + "epoch": 0.3278190924267719, + "grad_norm": 0.42568036913871765, + "learning_rate": 9.458337551888225e-05, + "loss": 1.3282, + "step": 1080 + }, + { + "epoch": 0.32812262862346336, + "grad_norm": 0.4248203933238983, + "learning_rate": 9.457831325301205e-05, + "loss": 1.9059, + "step": 1081 + }, + { + "epoch": 0.3284261648201548, + "grad_norm": 0.37200963497161865, + "learning_rate": 9.457325098714185e-05, + "loss": 2.085, + "step": 1082 + }, + { + "epoch": 0.32872970101684623, + "grad_norm": 0.44390764832496643, + "learning_rate": 9.456818872127165e-05, + "loss": 1.9431, + "step": 1083 + }, + { + "epoch": 0.3290332372135377, + "grad_norm": 0.44483283162117004, + "learning_rate": 9.456312645540144e-05, + "loss": 2.088, + "step": 1084 + }, + { + "epoch": 0.32933677341022916, + "grad_norm": 0.3765670955181122, + "learning_rate": 9.455806418953124e-05, + "loss": 2.0446, + "step": 1085 + }, + { + "epoch": 0.32964030960692065, + "grad_norm": 0.428964763879776, + "learning_rate": 9.455300192366103e-05, + "loss": 1.9831, + "step": 1086 + }, + { + "epoch": 0.3299438458036121, + "grad_norm": 0.3957151770591736, + "learning_rate": 9.454793965779083e-05, + "loss": 1.7817, + "step": 1087 + }, + { + "epoch": 0.3302473820003035, + "grad_norm": 0.3726184368133545, + "learning_rate": 9.454287739192062e-05, + "loss": 1.9929, + "step": 1088 + }, + { + "epoch": 
0.330550918196995, + "grad_norm": 0.41574302315711975, + "learning_rate": 9.453781512605042e-05, + "loss": 2.0094, + "step": 1089 + }, + { + "epoch": 0.33085445439368644, + "grad_norm": 0.36284613609313965, + "learning_rate": 9.453275286018021e-05, + "loss": 2.0273, + "step": 1090 + }, + { + "epoch": 0.3311579905903779, + "grad_norm": 0.48810014128685, + "learning_rate": 9.452769059431002e-05, + "loss": 1.4371, + "step": 1091 + }, + { + "epoch": 0.33146152678706936, + "grad_norm": 0.3929893672466278, + "learning_rate": 9.452262832843982e-05, + "loss": 2.0663, + "step": 1092 + }, + { + "epoch": 0.3317650629837608, + "grad_norm": 0.401722252368927, + "learning_rate": 9.451756606256961e-05, + "loss": 1.6119, + "step": 1093 + }, + { + "epoch": 0.3320685991804523, + "grad_norm": 0.42032745480537415, + "learning_rate": 9.45125037966994e-05, + "loss": 1.7541, + "step": 1094 + }, + { + "epoch": 0.3323721353771437, + "grad_norm": 0.3663571774959564, + "learning_rate": 9.45074415308292e-05, + "loss": 1.4438, + "step": 1095 + }, + { + "epoch": 0.33267567157383515, + "grad_norm": 0.397624671459198, + "learning_rate": 9.4502379264959e-05, + "loss": 2.0996, + "step": 1096 + }, + { + "epoch": 0.33297920777052664, + "grad_norm": 0.3914051651954651, + "learning_rate": 9.449731699908879e-05, + "loss": 1.5906, + "step": 1097 + }, + { + "epoch": 0.3332827439672181, + "grad_norm": 0.3951834440231323, + "learning_rate": 9.449225473321858e-05, + "loss": 1.9128, + "step": 1098 + }, + { + "epoch": 0.33358628016390957, + "grad_norm": 0.363696426153183, + "learning_rate": 9.448719246734838e-05, + "loss": 1.3447, + "step": 1099 + }, + { + "epoch": 0.333889816360601, + "grad_norm": 0.3522724211215973, + "learning_rate": 9.448213020147817e-05, + "loss": 1.6755, + "step": 1100 + }, + { + "epoch": 0.33419335255729243, + "grad_norm": 1.1290934085845947, + "learning_rate": 9.447706793560798e-05, + "loss": 1.6796, + "step": 1101 + }, + { + "epoch": 0.3344968887539839, + "grad_norm": 0.3527061939239502, + "learning_rate": 9.447200566973779e-05, + "loss": 1.4917, + "step": 1102 + }, + { + "epoch": 0.33480042495067536, + "grad_norm": 0.3770875930786133, + "learning_rate": 9.446694340386759e-05, + "loss": 1.2733, + "step": 1103 + }, + { + "epoch": 0.33510396114736685, + "grad_norm": 0.3742992579936981, + "learning_rate": 9.446188113799738e-05, + "loss": 1.8584, + "step": 1104 + }, + { + "epoch": 0.3354074973440583, + "grad_norm": 0.7284528017044067, + "learning_rate": 9.445681887212718e-05, + "loss": 2.0183, + "step": 1105 + }, + { + "epoch": 0.3357110335407497, + "grad_norm": 0.37331897020339966, + "learning_rate": 9.445175660625697e-05, + "loss": 1.9604, + "step": 1106 + }, + { + "epoch": 0.3360145697374412, + "grad_norm": 0.3642507791519165, + "learning_rate": 9.444669434038676e-05, + "loss": 1.8661, + "step": 1107 + }, + { + "epoch": 0.33631810593413264, + "grad_norm": 0.4249272346496582, + "learning_rate": 9.444163207451656e-05, + "loss": 2.039, + "step": 1108 + }, + { + "epoch": 0.33662164213082413, + "grad_norm": 0.5299102067947388, + "learning_rate": 9.443656980864635e-05, + "loss": 1.618, + "step": 1109 + }, + { + "epoch": 0.33692517832751556, + "grad_norm": 0.37671583890914917, + "learning_rate": 9.443150754277615e-05, + "loss": 1.5634, + "step": 1110 + }, + { + "epoch": 0.337228714524207, + "grad_norm": 0.9504343867301941, + "learning_rate": 9.442644527690594e-05, + "loss": 1.7275, + "step": 1111 + }, + { + "epoch": 0.3375322507208985, + "grad_norm": 0.37230974435806274, + "learning_rate": 9.442138301103575e-05, + 
"loss": 1.9971, + "step": 1112 + }, + { + "epoch": 0.3378357869175899, + "grad_norm": 0.4015982449054718, + "learning_rate": 9.441632074516555e-05, + "loss": 2.0012, + "step": 1113 + }, + { + "epoch": 0.33813932311428135, + "grad_norm": 0.42521438002586365, + "learning_rate": 9.441125847929534e-05, + "loss": 2.1657, + "step": 1114 + }, + { + "epoch": 0.33844285931097284, + "grad_norm": 0.3954319953918457, + "learning_rate": 9.440619621342514e-05, + "loss": 1.7999, + "step": 1115 + }, + { + "epoch": 0.3387463955076643, + "grad_norm": 0.5241403579711914, + "learning_rate": 9.440113394755493e-05, + "loss": 2.0102, + "step": 1116 + }, + { + "epoch": 0.33904993170435577, + "grad_norm": 0.4186641275882721, + "learning_rate": 9.439607168168473e-05, + "loss": 1.854, + "step": 1117 + }, + { + "epoch": 0.3393534679010472, + "grad_norm": 0.4375157654285431, + "learning_rate": 9.439100941581452e-05, + "loss": 1.7774, + "step": 1118 + }, + { + "epoch": 0.33965700409773864, + "grad_norm": 0.43266987800598145, + "learning_rate": 9.438594714994432e-05, + "loss": 1.9328, + "step": 1119 + }, + { + "epoch": 0.3399605402944301, + "grad_norm": 0.5544857382774353, + "learning_rate": 9.438088488407411e-05, + "loss": 1.7284, + "step": 1120 + }, + { + "epoch": 0.34026407649112156, + "grad_norm": 0.39998582005500793, + "learning_rate": 9.437582261820392e-05, + "loss": 1.8724, + "step": 1121 + }, + { + "epoch": 0.34056761268781305, + "grad_norm": 0.41390395164489746, + "learning_rate": 9.437076035233371e-05, + "loss": 2.1188, + "step": 1122 + }, + { + "epoch": 0.3408711488845045, + "grad_norm": 0.4374658167362213, + "learning_rate": 9.436569808646351e-05, + "loss": 1.9946, + "step": 1123 + }, + { + "epoch": 0.3411746850811959, + "grad_norm": 0.3902375102043152, + "learning_rate": 9.43606358205933e-05, + "loss": 1.9352, + "step": 1124 + }, + { + "epoch": 0.3414782212778874, + "grad_norm": 0.4049385190010071, + "learning_rate": 9.43555735547231e-05, + "loss": 1.7674, + "step": 1125 + }, + { + "epoch": 0.34178175747457884, + "grad_norm": 0.42752334475517273, + "learning_rate": 9.435051128885289e-05, + "loss": 2.1489, + "step": 1126 + }, + { + "epoch": 0.3420852936712703, + "grad_norm": 0.3927367925643921, + "learning_rate": 9.434544902298269e-05, + "loss": 1.9454, + "step": 1127 + }, + { + "epoch": 0.34238882986796176, + "grad_norm": 1.4001588821411133, + "learning_rate": 9.434038675711248e-05, + "loss": 1.8791, + "step": 1128 + }, + { + "epoch": 0.3426923660646532, + "grad_norm": 0.3640120327472687, + "learning_rate": 9.433532449124228e-05, + "loss": 1.9714, + "step": 1129 + }, + { + "epoch": 0.3429959022613447, + "grad_norm": 0.3569428026676178, + "learning_rate": 9.433026222537209e-05, + "loss": 1.715, + "step": 1130 + }, + { + "epoch": 0.3432994384580361, + "grad_norm": 0.3593400716781616, + "learning_rate": 9.432519995950188e-05, + "loss": 1.9454, + "step": 1131 + }, + { + "epoch": 0.34360297465472756, + "grad_norm": 0.38255101442337036, + "learning_rate": 9.432013769363168e-05, + "loss": 1.9629, + "step": 1132 + }, + { + "epoch": 0.34390651085141904, + "grad_norm": 0.4099471867084503, + "learning_rate": 9.431507542776147e-05, + "loss": 1.9556, + "step": 1133 + }, + { + "epoch": 0.3442100470481105, + "grad_norm": 0.36562618613243103, + "learning_rate": 9.431001316189126e-05, + "loss": 1.3671, + "step": 1134 + }, + { + "epoch": 0.34451358324480197, + "grad_norm": 0.49943339824676514, + "learning_rate": 9.430495089602106e-05, + "loss": 1.8581, + "step": 1135 + }, + { + "epoch": 0.3448171194414934, + "grad_norm": 
0.3707871437072754, + "learning_rate": 9.429988863015085e-05, + "loss": 2.0911, + "step": 1136 + }, + { + "epoch": 0.34512065563818484, + "grad_norm": 0.3699527382850647, + "learning_rate": 9.429482636428065e-05, + "loss": 2.0198, + "step": 1137 + }, + { + "epoch": 0.3454241918348763, + "grad_norm": 0.4300304055213928, + "learning_rate": 9.428976409841044e-05, + "loss": 2.2398, + "step": 1138 + }, + { + "epoch": 0.34572772803156776, + "grad_norm": 0.38733771443367004, + "learning_rate": 9.428470183254024e-05, + "loss": 1.9505, + "step": 1139 + }, + { + "epoch": 0.3460312642282592, + "grad_norm": 0.38434740900993347, + "learning_rate": 9.427963956667005e-05, + "loss": 2.0853, + "step": 1140 + }, + { + "epoch": 0.3463348004249507, + "grad_norm": 0.3448013961315155, + "learning_rate": 9.427457730079984e-05, + "loss": 1.816, + "step": 1141 + }, + { + "epoch": 0.3466383366216421, + "grad_norm": 2.17158842086792, + "learning_rate": 9.426951503492964e-05, + "loss": 1.9041, + "step": 1142 + }, + { + "epoch": 0.3469418728183336, + "grad_norm": 0.39879223704338074, + "learning_rate": 9.426445276905943e-05, + "loss": 1.8991, + "step": 1143 + }, + { + "epoch": 0.34724540901502504, + "grad_norm": 0.517691433429718, + "learning_rate": 9.425939050318923e-05, + "loss": 1.4864, + "step": 1144 + }, + { + "epoch": 0.3475489452117165, + "grad_norm": 0.4679596722126007, + "learning_rate": 9.425432823731903e-05, + "loss": 2.127, + "step": 1145 + }, + { + "epoch": 0.34785248140840797, + "grad_norm": 0.47220855951309204, + "learning_rate": 9.424926597144883e-05, + "loss": 1.1827, + "step": 1146 + }, + { + "epoch": 0.3481560176050994, + "grad_norm": 0.4707253575325012, + "learning_rate": 9.424420370557862e-05, + "loss": 1.6538, + "step": 1147 + }, + { + "epoch": 0.3484595538017909, + "grad_norm": 0.5610188245773315, + "learning_rate": 9.423914143970842e-05, + "loss": 2.0097, + "step": 1148 + }, + { + "epoch": 0.3487630899984823, + "grad_norm": 0.6568597555160522, + "learning_rate": 9.423407917383821e-05, + "loss": 1.8777, + "step": 1149 + }, + { + "epoch": 0.34906662619517376, + "grad_norm": 0.38883280754089355, + "learning_rate": 9.422901690796801e-05, + "loss": 1.8524, + "step": 1150 + }, + { + "epoch": 0.34937016239186525, + "grad_norm": 0.34381693601608276, + "learning_rate": 9.422395464209782e-05, + "loss": 1.7728, + "step": 1151 + }, + { + "epoch": 0.3496736985885567, + "grad_norm": 0.4320678412914276, + "learning_rate": 9.421889237622761e-05, + "loss": 1.6346, + "step": 1152 + }, + { + "epoch": 0.3499772347852481, + "grad_norm": 0.4651411771774292, + "learning_rate": 9.42138301103574e-05, + "loss": 2.008, + "step": 1153 + }, + { + "epoch": 0.3502807709819396, + "grad_norm": 0.5340977907180786, + "learning_rate": 9.42087678444872e-05, + "loss": 1.6537, + "step": 1154 + }, + { + "epoch": 0.35058430717863104, + "grad_norm": 0.3686065971851349, + "learning_rate": 9.4203705578617e-05, + "loss": 1.6585, + "step": 1155 + }, + { + "epoch": 0.35088784337532253, + "grad_norm": 0.4016922116279602, + "learning_rate": 9.419864331274679e-05, + "loss": 1.4874, + "step": 1156 + }, + { + "epoch": 0.35119137957201396, + "grad_norm": 0.4304169714450836, + "learning_rate": 9.419358104687659e-05, + "loss": 2.0125, + "step": 1157 + }, + { + "epoch": 0.3514949157687054, + "grad_norm": 0.3944842517375946, + "learning_rate": 9.418851878100638e-05, + "loss": 2.0397, + "step": 1158 + }, + { + "epoch": 0.3517984519653969, + "grad_norm": 0.3778032064437866, + "learning_rate": 9.418345651513618e-05, + "loss": 2.0629, + "step": 1159 + }, 
+ { + "epoch": 0.3521019881620883, + "grad_norm": 0.4209291636943817, + "learning_rate": 9.417839424926598e-05, + "loss": 1.845, + "step": 1160 + }, + { + "epoch": 0.3524055243587798, + "grad_norm": 0.3948676586151123, + "learning_rate": 9.417333198339578e-05, + "loss": 1.9479, + "step": 1161 + }, + { + "epoch": 0.35270906055547124, + "grad_norm": 0.4018319547176361, + "learning_rate": 9.416826971752557e-05, + "loss": 1.9, + "step": 1162 + }, + { + "epoch": 0.3530125967521627, + "grad_norm": 0.42170947790145874, + "learning_rate": 9.416320745165537e-05, + "loss": 1.8034, + "step": 1163 + }, + { + "epoch": 0.35331613294885417, + "grad_norm": 0.3817223310470581, + "learning_rate": 9.415814518578516e-05, + "loss": 1.4884, + "step": 1164 + }, + { + "epoch": 0.3536196691455456, + "grad_norm": 0.35511648654937744, + "learning_rate": 9.415308291991496e-05, + "loss": 1.7336, + "step": 1165 + }, + { + "epoch": 0.35392320534223703, + "grad_norm": 0.45333489775657654, + "learning_rate": 9.414802065404475e-05, + "loss": 1.6451, + "step": 1166 + }, + { + "epoch": 0.3542267415389285, + "grad_norm": 0.42814895510673523, + "learning_rate": 9.414295838817455e-05, + "loss": 1.8122, + "step": 1167 + }, + { + "epoch": 0.35453027773561996, + "grad_norm": 0.39475324749946594, + "learning_rate": 9.413789612230434e-05, + "loss": 2.2534, + "step": 1168 + }, + { + "epoch": 0.35483381393231145, + "grad_norm": 0.41115859150886536, + "learning_rate": 9.413283385643415e-05, + "loss": 1.8317, + "step": 1169 + }, + { + "epoch": 0.3551373501290029, + "grad_norm": 0.44518032670021057, + "learning_rate": 9.412777159056395e-05, + "loss": 1.8648, + "step": 1170 + }, + { + "epoch": 0.3554408863256943, + "grad_norm": 0.3964219391345978, + "learning_rate": 9.412270932469374e-05, + "loss": 1.4205, + "step": 1171 + }, + { + "epoch": 0.3557444225223858, + "grad_norm": 0.3874772787094116, + "learning_rate": 9.411764705882353e-05, + "loss": 1.7562, + "step": 1172 + }, + { + "epoch": 0.35604795871907724, + "grad_norm": 0.35493049025535583, + "learning_rate": 9.411258479295333e-05, + "loss": 1.1856, + "step": 1173 + }, + { + "epoch": 0.35635149491576873, + "grad_norm": 0.3838149905204773, + "learning_rate": 9.410752252708312e-05, + "loss": 1.955, + "step": 1174 + }, + { + "epoch": 0.35665503111246016, + "grad_norm": 0.46874240040779114, + "learning_rate": 9.410246026121292e-05, + "loss": 1.7283, + "step": 1175 + }, + { + "epoch": 0.3569585673091516, + "grad_norm": 0.3673109710216522, + "learning_rate": 9.409739799534271e-05, + "loss": 1.8228, + "step": 1176 + }, + { + "epoch": 0.3572621035058431, + "grad_norm": 0.4494078457355499, + "learning_rate": 9.409233572947251e-05, + "loss": 1.6355, + "step": 1177 + }, + { + "epoch": 0.3575656397025345, + "grad_norm": 0.4009113609790802, + "learning_rate": 9.40872734636023e-05, + "loss": 1.7594, + "step": 1178 + }, + { + "epoch": 0.35786917589922596, + "grad_norm": 0.4051864445209503, + "learning_rate": 9.408221119773211e-05, + "loss": 1.7057, + "step": 1179 + }, + { + "epoch": 0.35817271209591744, + "grad_norm": 0.33628928661346436, + "learning_rate": 9.40771489318619e-05, + "loss": 1.8971, + "step": 1180 + }, + { + "epoch": 0.3584762482926089, + "grad_norm": 0.3441104590892792, + "learning_rate": 9.40720866659917e-05, + "loss": 1.8399, + "step": 1181 + }, + { + "epoch": 0.35877978448930037, + "grad_norm": 0.38719773292541504, + "learning_rate": 9.40670244001215e-05, + "loss": 2.0484, + "step": 1182 + }, + { + "epoch": 0.3590833206859918, + "grad_norm": 0.4182259142398834, + "learning_rate": 
9.406196213425129e-05, + "loss": 1.525, + "step": 1183 + }, + { + "epoch": 0.35938685688268324, + "grad_norm": 0.42075198888778687, + "learning_rate": 9.405689986838109e-05, + "loss": 2.1262, + "step": 1184 + }, + { + "epoch": 0.3596903930793747, + "grad_norm": 0.3604430556297302, + "learning_rate": 9.405183760251088e-05, + "loss": 1.9715, + "step": 1185 + }, + { + "epoch": 0.35999392927606616, + "grad_norm": 0.46226024627685547, + "learning_rate": 9.404677533664068e-05, + "loss": 1.7088, + "step": 1186 + }, + { + "epoch": 0.36029746547275765, + "grad_norm": 0.3673461377620697, + "learning_rate": 9.404171307077047e-05, + "loss": 1.7057, + "step": 1187 + }, + { + "epoch": 0.3606010016694491, + "grad_norm": 0.40370312333106995, + "learning_rate": 9.403665080490028e-05, + "loss": 1.9058, + "step": 1188 + }, + { + "epoch": 0.3609045378661405, + "grad_norm": 0.39149123430252075, + "learning_rate": 9.403158853903007e-05, + "loss": 2.0148, + "step": 1189 + }, + { + "epoch": 0.361208074062832, + "grad_norm": 0.6711376309394836, + "learning_rate": 9.402652627315988e-05, + "loss": 1.69, + "step": 1190 + }, + { + "epoch": 0.36151161025952344, + "grad_norm": 0.3052380084991455, + "learning_rate": 9.402146400728968e-05, + "loss": 1.4772, + "step": 1191 + }, + { + "epoch": 0.36181514645621493, + "grad_norm": 0.37661212682724, + "learning_rate": 9.401640174141947e-05, + "loss": 1.7378, + "step": 1192 + }, + { + "epoch": 0.36211868265290637, + "grad_norm": 0.39574167132377625, + "learning_rate": 9.401133947554927e-05, + "loss": 1.801, + "step": 1193 + }, + { + "epoch": 0.3624222188495978, + "grad_norm": 0.44611817598342896, + "learning_rate": 9.400627720967906e-05, + "loss": 1.5995, + "step": 1194 + }, + { + "epoch": 0.3627257550462893, + "grad_norm": 0.40026605129241943, + "learning_rate": 9.400121494380886e-05, + "loss": 1.6517, + "step": 1195 + }, + { + "epoch": 0.3630292912429807, + "grad_norm": 0.36110207438468933, + "learning_rate": 9.399615267793865e-05, + "loss": 1.9764, + "step": 1196 + }, + { + "epoch": 0.36333282743967216, + "grad_norm": 0.38339897990226746, + "learning_rate": 9.399109041206845e-05, + "loss": 2.085, + "step": 1197 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 0.36159849166870117, + "learning_rate": 9.398602814619824e-05, + "loss": 1.7839, + "step": 1198 + }, + { + "epoch": 0.3639398998330551, + "grad_norm": 0.3263375461101532, + "learning_rate": 9.398096588032805e-05, + "loss": 1.8143, + "step": 1199 + }, + { + "epoch": 0.36424343602974657, + "grad_norm": 0.3886968195438385, + "learning_rate": 9.397590361445784e-05, + "loss": 2.0904, + "step": 1200 + }, + { + "epoch": 0.364546972226438, + "grad_norm": 0.41123297810554504, + "learning_rate": 9.397084134858764e-05, + "loss": 1.8622, + "step": 1201 + }, + { + "epoch": 0.36485050842312944, + "grad_norm": 0.580788791179657, + "learning_rate": 9.396577908271743e-05, + "loss": 1.9017, + "step": 1202 + }, + { + "epoch": 0.36515404461982093, + "grad_norm": 0.3737773895263672, + "learning_rate": 9.396071681684723e-05, + "loss": 1.5775, + "step": 1203 + }, + { + "epoch": 0.36545758081651236, + "grad_norm": 0.38713717460632324, + "learning_rate": 9.395565455097702e-05, + "loss": 1.967, + "step": 1204 + }, + { + "epoch": 0.36576111701320385, + "grad_norm": 0.7311956882476807, + "learning_rate": 9.395059228510682e-05, + "loss": 1.7088, + "step": 1205 + }, + { + "epoch": 0.3660646532098953, + "grad_norm": 1.4061527252197266, + "learning_rate": 9.394553001923661e-05, + "loss": 1.9163, + "step": 1206 + }, + { + "epoch": 
0.3663681894065867, + "grad_norm": 0.3753696382045746, + "learning_rate": 9.39404677533664e-05, + "loss": 2.1954, + "step": 1207 + }, + { + "epoch": 0.3666717256032782, + "grad_norm": 0.38732466101646423, + "learning_rate": 9.393540548749622e-05, + "loss": 1.9101, + "step": 1208 + }, + { + "epoch": 0.36697526179996964, + "grad_norm": 0.41291502118110657, + "learning_rate": 9.393034322162601e-05, + "loss": 1.9483, + "step": 1209 + }, + { + "epoch": 0.3672787979966611, + "grad_norm": 0.44216427206993103, + "learning_rate": 9.39252809557558e-05, + "loss": 2.043, + "step": 1210 + }, + { + "epoch": 0.36758233419335257, + "grad_norm": 0.798313319683075, + "learning_rate": 9.39202186898856e-05, + "loss": 1.9988, + "step": 1211 + }, + { + "epoch": 0.367885870390044, + "grad_norm": 0.5483587384223938, + "learning_rate": 9.39151564240154e-05, + "loss": 2.0979, + "step": 1212 + }, + { + "epoch": 0.3681894065867355, + "grad_norm": 0.44406580924987793, + "learning_rate": 9.391009415814519e-05, + "loss": 1.5858, + "step": 1213 + }, + { + "epoch": 0.3684929427834269, + "grad_norm": 0.3883718252182007, + "learning_rate": 9.390503189227498e-05, + "loss": 1.9014, + "step": 1214 + }, + { + "epoch": 0.36879647898011836, + "grad_norm": 0.7284543514251709, + "learning_rate": 9.389996962640478e-05, + "loss": 1.9709, + "step": 1215 + }, + { + "epoch": 0.36910001517680985, + "grad_norm": 0.38549402356147766, + "learning_rate": 9.389490736053457e-05, + "loss": 1.9513, + "step": 1216 + }, + { + "epoch": 0.3694035513735013, + "grad_norm": 0.39417389035224915, + "learning_rate": 9.388984509466437e-05, + "loss": 2.0409, + "step": 1217 + }, + { + "epoch": 0.36970708757019277, + "grad_norm": 0.40816301107406616, + "learning_rate": 9.388478282879418e-05, + "loss": 2.0136, + "step": 1218 + }, + { + "epoch": 0.3700106237668842, + "grad_norm": 0.5700183510780334, + "learning_rate": 9.387972056292397e-05, + "loss": 1.8478, + "step": 1219 + }, + { + "epoch": 0.37031415996357564, + "grad_norm": 0.35159793496131897, + "learning_rate": 9.387465829705377e-05, + "loss": 1.6004, + "step": 1220 + }, + { + "epoch": 0.37061769616026713, + "grad_norm": 0.41622206568717957, + "learning_rate": 9.386959603118356e-05, + "loss": 1.9104, + "step": 1221 + }, + { + "epoch": 0.37092123235695856, + "grad_norm": 0.4205602705478668, + "learning_rate": 9.386453376531336e-05, + "loss": 2.1058, + "step": 1222 + }, + { + "epoch": 0.37122476855365, + "grad_norm": 0.38390764594078064, + "learning_rate": 9.385947149944315e-05, + "loss": 1.8972, + "step": 1223 + }, + { + "epoch": 0.3715283047503415, + "grad_norm": 0.3790401816368103, + "learning_rate": 9.385440923357295e-05, + "loss": 1.8975, + "step": 1224 + }, + { + "epoch": 0.3718318409470329, + "grad_norm": 0.5210400223731995, + "learning_rate": 9.384934696770274e-05, + "loss": 1.7181, + "step": 1225 + }, + { + "epoch": 0.3721353771437244, + "grad_norm": 0.4098454415798187, + "learning_rate": 9.384428470183253e-05, + "loss": 2.1405, + "step": 1226 + }, + { + "epoch": 0.37243891334041584, + "grad_norm": 0.40917104482650757, + "learning_rate": 9.383922243596234e-05, + "loss": 1.8696, + "step": 1227 + }, + { + "epoch": 0.3727424495371073, + "grad_norm": 0.3712831139564514, + "learning_rate": 9.383416017009214e-05, + "loss": 1.8792, + "step": 1228 + }, + { + "epoch": 0.37304598573379877, + "grad_norm": 0.3110792934894562, + "learning_rate": 9.382909790422193e-05, + "loss": 1.5782, + "step": 1229 + }, + { + "epoch": 0.3733495219304902, + "grad_norm": 0.3657875061035156, + "learning_rate": 
9.382403563835173e-05, + "loss": 2.0311, + "step": 1230 + }, + { + "epoch": 0.3736530581271817, + "grad_norm": 0.37432965636253357, + "learning_rate": 9.381897337248152e-05, + "loss": 1.8505, + "step": 1231 + }, + { + "epoch": 0.3739565943238731, + "grad_norm": 0.3771384656429291, + "learning_rate": 9.381391110661132e-05, + "loss": 1.5715, + "step": 1232 + }, + { + "epoch": 0.37426013052056456, + "grad_norm": 1.2401721477508545, + "learning_rate": 9.380884884074111e-05, + "loss": 1.4436, + "step": 1233 + }, + { + "epoch": 0.37456366671725605, + "grad_norm": 0.36102503538131714, + "learning_rate": 9.380378657487092e-05, + "loss": 1.8907, + "step": 1234 + }, + { + "epoch": 0.3748672029139475, + "grad_norm": 0.46541303396224976, + "learning_rate": 9.379872430900072e-05, + "loss": 2.0067, + "step": 1235 + }, + { + "epoch": 0.3751707391106389, + "grad_norm": 0.46490392088890076, + "learning_rate": 9.379366204313051e-05, + "loss": 1.379, + "step": 1236 + }, + { + "epoch": 0.3754742753073304, + "grad_norm": 0.40038684010505676, + "learning_rate": 9.37885997772603e-05, + "loss": 1.9102, + "step": 1237 + }, + { + "epoch": 0.37577781150402184, + "grad_norm": 0.401563435792923, + "learning_rate": 9.378353751139011e-05, + "loss": 2.0087, + "step": 1238 + }, + { + "epoch": 0.37608134770071333, + "grad_norm": 0.38930457830429077, + "learning_rate": 9.377847524551991e-05, + "loss": 1.7222, + "step": 1239 + }, + { + "epoch": 0.37638488389740477, + "grad_norm": 0.4146344065666199, + "learning_rate": 9.37734129796497e-05, + "loss": 1.9965, + "step": 1240 + }, + { + "epoch": 0.3766884200940962, + "grad_norm": 0.7829983830451965, + "learning_rate": 9.37683507137795e-05, + "loss": 1.7133, + "step": 1241 + }, + { + "epoch": 0.3769919562907877, + "grad_norm": 0.3819306492805481, + "learning_rate": 9.376328844790929e-05, + "loss": 1.7532, + "step": 1242 + }, + { + "epoch": 0.3772954924874791, + "grad_norm": 0.35361188650131226, + "learning_rate": 9.375822618203909e-05, + "loss": 1.8794, + "step": 1243 + }, + { + "epoch": 0.3775990286841706, + "grad_norm": 0.37844938039779663, + "learning_rate": 9.375316391616888e-05, + "loss": 2.0991, + "step": 1244 + }, + { + "epoch": 0.37790256488086205, + "grad_norm": 0.49530112743377686, + "learning_rate": 9.374810165029868e-05, + "loss": 1.5838, + "step": 1245 + }, + { + "epoch": 0.3782061010775535, + "grad_norm": 0.36716628074645996, + "learning_rate": 9.374303938442847e-05, + "loss": 1.812, + "step": 1246 + }, + { + "epoch": 0.37850963727424497, + "grad_norm": 0.3772716522216797, + "learning_rate": 9.373797711855828e-05, + "loss": 1.8649, + "step": 1247 + }, + { + "epoch": 0.3788131734709364, + "grad_norm": 0.42215248942375183, + "learning_rate": 9.373291485268807e-05, + "loss": 1.8589, + "step": 1248 + }, + { + "epoch": 0.37911670966762784, + "grad_norm": 0.4086074233055115, + "learning_rate": 9.372785258681787e-05, + "loss": 2.0305, + "step": 1249 + }, + { + "epoch": 0.3794202458643193, + "grad_norm": 0.5096133947372437, + "learning_rate": 9.372279032094766e-05, + "loss": 2.0852, + "step": 1250 + }, + { + "epoch": 0.37972378206101076, + "grad_norm": 0.41633352637290955, + "learning_rate": 9.371772805507746e-05, + "loss": 1.8879, + "step": 1251 + }, + { + "epoch": 0.38002731825770225, + "grad_norm": 0.4787557125091553, + "learning_rate": 9.371266578920725e-05, + "loss": 1.9307, + "step": 1252 + }, + { + "epoch": 0.3803308544543937, + "grad_norm": 0.4313805103302002, + "learning_rate": 9.370760352333705e-05, + "loss": 1.097, + "step": 1253 + }, + { + "epoch": 
0.3806343906510851, + "grad_norm": 0.3604517877101898, + "learning_rate": 9.370254125746684e-05, + "loss": 1.9466, + "step": 1254 + }, + { + "epoch": 0.3809379268477766, + "grad_norm": 0.35350343585014343, + "learning_rate": 9.369747899159664e-05, + "loss": 2.1093, + "step": 1255 + }, + { + "epoch": 0.38124146304446804, + "grad_norm": 0.43002399802207947, + "learning_rate": 9.369241672572643e-05, + "loss": 1.9016, + "step": 1256 + }, + { + "epoch": 0.38154499924115953, + "grad_norm": 0.46702131628990173, + "learning_rate": 9.368735445985624e-05, + "loss": 1.9909, + "step": 1257 + }, + { + "epoch": 0.38184853543785097, + "grad_norm": 0.42195767164230347, + "learning_rate": 9.368229219398604e-05, + "loss": 1.9486, + "step": 1258 + }, + { + "epoch": 0.3821520716345424, + "grad_norm": 0.4160800874233246, + "learning_rate": 9.367722992811583e-05, + "loss": 1.2547, + "step": 1259 + }, + { + "epoch": 0.3824556078312339, + "grad_norm": 0.398027628660202, + "learning_rate": 9.367216766224563e-05, + "loss": 1.7109, + "step": 1260 + }, + { + "epoch": 0.3827591440279253, + "grad_norm": 0.35801073908805847, + "learning_rate": 9.366710539637542e-05, + "loss": 1.7718, + "step": 1261 + }, + { + "epoch": 0.38306268022461676, + "grad_norm": 0.3769727647304535, + "learning_rate": 9.366204313050522e-05, + "loss": 1.7201, + "step": 1262 + }, + { + "epoch": 0.38336621642130825, + "grad_norm": 0.4340580105781555, + "learning_rate": 9.365698086463501e-05, + "loss": 1.5747, + "step": 1263 + }, + { + "epoch": 0.3836697526179997, + "grad_norm": 0.48839592933654785, + "learning_rate": 9.36519185987648e-05, + "loss": 2.0381, + "step": 1264 + }, + { + "epoch": 0.38397328881469117, + "grad_norm": 0.3686861991882324, + "learning_rate": 9.36468563328946e-05, + "loss": 1.8126, + "step": 1265 + }, + { + "epoch": 0.3842768250113826, + "grad_norm": 0.45264241099357605, + "learning_rate": 9.364179406702441e-05, + "loss": 1.7206, + "step": 1266 + }, + { + "epoch": 0.38458036120807404, + "grad_norm": 0.45419684052467346, + "learning_rate": 9.36367318011542e-05, + "loss": 2.028, + "step": 1267 + }, + { + "epoch": 0.38488389740476553, + "grad_norm": 0.38863110542297363, + "learning_rate": 9.3631669535284e-05, + "loss": 1.8139, + "step": 1268 + }, + { + "epoch": 0.38518743360145696, + "grad_norm": 0.41982683539390564, + "learning_rate": 9.362660726941379e-05, + "loss": 1.9528, + "step": 1269 + }, + { + "epoch": 0.38549096979814845, + "grad_norm": 0.3579862415790558, + "learning_rate": 9.362154500354359e-05, + "loss": 2.1949, + "step": 1270 + }, + { + "epoch": 0.3857945059948399, + "grad_norm": 0.39172133803367615, + "learning_rate": 9.361648273767338e-05, + "loss": 1.8242, + "step": 1271 + }, + { + "epoch": 0.3860980421915313, + "grad_norm": 0.36367735266685486, + "learning_rate": 9.361142047180318e-05, + "loss": 1.9508, + "step": 1272 + }, + { + "epoch": 0.3864015783882228, + "grad_norm": 0.3536215126514435, + "learning_rate": 9.360635820593297e-05, + "loss": 1.7761, + "step": 1273 + }, + { + "epoch": 0.38670511458491424, + "grad_norm": 0.44467857480049133, + "learning_rate": 9.360129594006277e-05, + "loss": 2.2006, + "step": 1274 + }, + { + "epoch": 0.38700865078160573, + "grad_norm": 0.41492581367492676, + "learning_rate": 9.359623367419257e-05, + "loss": 1.7899, + "step": 1275 + }, + { + "epoch": 0.38731218697829717, + "grad_norm": 0.4128611087799072, + "learning_rate": 9.359117140832237e-05, + "loss": 1.9787, + "step": 1276 + }, + { + "epoch": 0.3876157231749886, + "grad_norm": 0.36134451627731323, + "learning_rate": 
9.358610914245216e-05, + "loss": 1.941, + "step": 1277 + }, + { + "epoch": 0.3879192593716801, + "grad_norm": 0.36279958486557007, + "learning_rate": 9.358104687658197e-05, + "loss": 2.1067, + "step": 1278 + }, + { + "epoch": 0.3882227955683715, + "grad_norm": 0.432478666305542, + "learning_rate": 9.357598461071177e-05, + "loss": 2.1264, + "step": 1279 + }, + { + "epoch": 0.38852633176506296, + "grad_norm": 0.3920331299304962, + "learning_rate": 9.357092234484156e-05, + "loss": 1.5522, + "step": 1280 + }, + { + "epoch": 0.38882986796175445, + "grad_norm": 0.3537754416465759, + "learning_rate": 9.356586007897136e-05, + "loss": 1.8354, + "step": 1281 + }, + { + "epoch": 0.3891334041584459, + "grad_norm": 0.40271031856536865, + "learning_rate": 9.356079781310115e-05, + "loss": 1.7567, + "step": 1282 + }, + { + "epoch": 0.3894369403551374, + "grad_norm": 0.47157374024391174, + "learning_rate": 9.355573554723095e-05, + "loss": 1.8542, + "step": 1283 + }, + { + "epoch": 0.3897404765518288, + "grad_norm": 0.3331926167011261, + "learning_rate": 9.355067328136074e-05, + "loss": 1.8651, + "step": 1284 + }, + { + "epoch": 0.39004401274852024, + "grad_norm": 0.884087860584259, + "learning_rate": 9.354561101549054e-05, + "loss": 1.4765, + "step": 1285 + }, + { + "epoch": 0.39034754894521173, + "grad_norm": 0.3618917167186737, + "learning_rate": 9.354054874962034e-05, + "loss": 2.0908, + "step": 1286 + }, + { + "epoch": 0.39065108514190316, + "grad_norm": 0.3494134843349457, + "learning_rate": 9.353548648375014e-05, + "loss": 1.1975, + "step": 1287 + }, + { + "epoch": 0.39095462133859465, + "grad_norm": 0.40450137853622437, + "learning_rate": 9.353042421787993e-05, + "loss": 1.5567, + "step": 1288 + }, + { + "epoch": 0.3912581575352861, + "grad_norm": 0.3893278241157532, + "learning_rate": 9.352536195200973e-05, + "loss": 1.6993, + "step": 1289 + }, + { + "epoch": 0.3915616937319775, + "grad_norm": 0.6020291447639465, + "learning_rate": 9.352029968613952e-05, + "loss": 1.7073, + "step": 1290 + }, + { + "epoch": 0.391865229928669, + "grad_norm": 0.43949219584465027, + "learning_rate": 9.351523742026932e-05, + "loss": 1.8407, + "step": 1291 + }, + { + "epoch": 0.39216876612536045, + "grad_norm": 0.41567811369895935, + "learning_rate": 9.351017515439911e-05, + "loss": 2.0354, + "step": 1292 + }, + { + "epoch": 0.3924723023220519, + "grad_norm": 0.41198036074638367, + "learning_rate": 9.350511288852891e-05, + "loss": 1.964, + "step": 1293 + }, + { + "epoch": 0.39277583851874337, + "grad_norm": 0.3735191524028778, + "learning_rate": 9.35000506226587e-05, + "loss": 1.8359, + "step": 1294 + }, + { + "epoch": 0.3930793747154348, + "grad_norm": 0.4426116347312927, + "learning_rate": 9.34949883567885e-05, + "loss": 1.8876, + "step": 1295 + }, + { + "epoch": 0.3933829109121263, + "grad_norm": 0.3956250548362732, + "learning_rate": 9.34899260909183e-05, + "loss": 1.5177, + "step": 1296 + }, + { + "epoch": 0.3936864471088177, + "grad_norm": 0.3534790575504303, + "learning_rate": 9.34848638250481e-05, + "loss": 2.1419, + "step": 1297 + }, + { + "epoch": 0.39398998330550916, + "grad_norm": 0.4134576916694641, + "learning_rate": 9.34798015591779e-05, + "loss": 1.5873, + "step": 1298 + }, + { + "epoch": 0.39429351950220065, + "grad_norm": 0.4386560916900635, + "learning_rate": 9.347473929330769e-05, + "loss": 1.4547, + "step": 1299 + }, + { + "epoch": 0.3945970556988921, + "grad_norm": 0.41839587688446045, + "learning_rate": 9.346967702743749e-05, + "loss": 1.5251, + "step": 1300 + }, + { + "epoch": 0.3949005918955836, 
+ "grad_norm": 0.333609938621521, + "learning_rate": 9.346461476156728e-05, + "loss": 1.5575, + "step": 1301 + }, + { + "epoch": 0.395204128092275, + "grad_norm": 0.4706360101699829, + "learning_rate": 9.345955249569707e-05, + "loss": 1.8686, + "step": 1302 + }, + { + "epoch": 0.39550766428896644, + "grad_norm": 0.3555939495563507, + "learning_rate": 9.345449022982687e-05, + "loss": 1.5205, + "step": 1303 + }, + { + "epoch": 0.39581120048565793, + "grad_norm": 0.47611120343208313, + "learning_rate": 9.344942796395666e-05, + "loss": 1.5114, + "step": 1304 + }, + { + "epoch": 0.39611473668234937, + "grad_norm": 0.570785641670227, + "learning_rate": 9.344436569808647e-05, + "loss": 1.9987, + "step": 1305 + }, + { + "epoch": 0.3964182728790408, + "grad_norm": 0.3685778081417084, + "learning_rate": 9.343930343221627e-05, + "loss": 1.9471, + "step": 1306 + }, + { + "epoch": 0.3967218090757323, + "grad_norm": 0.4187014698982239, + "learning_rate": 9.343424116634606e-05, + "loss": 2.1728, + "step": 1307 + }, + { + "epoch": 0.3970253452724237, + "grad_norm": 0.35904020071029663, + "learning_rate": 9.342917890047586e-05, + "loss": 1.9576, + "step": 1308 + }, + { + "epoch": 0.3973288814691152, + "grad_norm": 0.48214206099510193, + "learning_rate": 9.342411663460565e-05, + "loss": 1.7529, + "step": 1309 + }, + { + "epoch": 0.39763241766580665, + "grad_norm": 0.3852714002132416, + "learning_rate": 9.341905436873545e-05, + "loss": 2.256, + "step": 1310 + }, + { + "epoch": 0.3979359538624981, + "grad_norm": 0.44712984561920166, + "learning_rate": 9.341399210286524e-05, + "loss": 1.8981, + "step": 1311 + }, + { + "epoch": 0.39823949005918957, + "grad_norm": 0.42379963397979736, + "learning_rate": 9.340892983699504e-05, + "loss": 2.0528, + "step": 1312 + }, + { + "epoch": 0.398543026255881, + "grad_norm": 0.3936759829521179, + "learning_rate": 9.340386757112483e-05, + "loss": 1.592, + "step": 1313 + }, + { + "epoch": 0.3988465624525725, + "grad_norm": 0.4035021662712097, + "learning_rate": 9.339880530525464e-05, + "loss": 2.0751, + "step": 1314 + }, + { + "epoch": 0.39915009864926393, + "grad_norm": 0.3658972382545471, + "learning_rate": 9.339374303938443e-05, + "loss": 1.7568, + "step": 1315 + }, + { + "epoch": 0.39945363484595536, + "grad_norm": 0.4271409511566162, + "learning_rate": 9.338868077351423e-05, + "loss": 2.1243, + "step": 1316 + }, + { + "epoch": 0.39975717104264685, + "grad_norm": 0.3799911439418793, + "learning_rate": 9.338361850764402e-05, + "loss": 1.9763, + "step": 1317 + }, + { + "epoch": 0.4000607072393383, + "grad_norm": 0.3878629803657532, + "learning_rate": 9.337855624177382e-05, + "loss": 1.6328, + "step": 1318 + }, + { + "epoch": 0.4003642434360297, + "grad_norm": 0.3611898124217987, + "learning_rate": 9.337349397590361e-05, + "loss": 1.8017, + "step": 1319 + }, + { + "epoch": 0.4006677796327212, + "grad_norm": 0.4010205864906311, + "learning_rate": 9.336843171003341e-05, + "loss": 2.1213, + "step": 1320 + }, + { + "epoch": 0.40097131582941264, + "grad_norm": 0.4076247811317444, + "learning_rate": 9.33633694441632e-05, + "loss": 1.8366, + "step": 1321 + }, + { + "epoch": 0.40127485202610413, + "grad_norm": 0.4172746241092682, + "learning_rate": 9.3358307178293e-05, + "loss": 1.7759, + "step": 1322 + }, + { + "epoch": 0.40157838822279557, + "grad_norm": 0.6179870367050171, + "learning_rate": 9.33532449124228e-05, + "loss": 1.7803, + "step": 1323 + }, + { + "epoch": 0.401881924419487, + "grad_norm": 0.38707882165908813, + "learning_rate": 9.33481826465526e-05, + "loss": 1.2795, + 
"step": 1324 + }, + { + "epoch": 0.4021854606161785, + "grad_norm": 0.35764575004577637, + "learning_rate": 9.334312038068241e-05, + "loss": 1.3463, + "step": 1325 + }, + { + "epoch": 0.4024889968128699, + "grad_norm": 0.40050292015075684, + "learning_rate": 9.33380581148122e-05, + "loss": 1.8487, + "step": 1326 + }, + { + "epoch": 0.4027925330095614, + "grad_norm": 0.5421705842018127, + "learning_rate": 9.3332995848942e-05, + "loss": 2.0407, + "step": 1327 + }, + { + "epoch": 0.40309606920625285, + "grad_norm": 0.5423186421394348, + "learning_rate": 9.33279335830718e-05, + "loss": 1.6743, + "step": 1328 + }, + { + "epoch": 0.4033996054029443, + "grad_norm": 0.41429242491722107, + "learning_rate": 9.332287131720159e-05, + "loss": 2.1614, + "step": 1329 + }, + { + "epoch": 0.4037031415996358, + "grad_norm": 0.41197100281715393, + "learning_rate": 9.331780905133138e-05, + "loss": 1.9414, + "step": 1330 + }, + { + "epoch": 0.4040066777963272, + "grad_norm": 0.3833538293838501, + "learning_rate": 9.331274678546118e-05, + "loss": 1.4602, + "step": 1331 + }, + { + "epoch": 0.40431021399301864, + "grad_norm": 0.4118226170539856, + "learning_rate": 9.330768451959097e-05, + "loss": 2.0595, + "step": 1332 + }, + { + "epoch": 0.40461375018971013, + "grad_norm": 0.3417702317237854, + "learning_rate": 9.330262225372077e-05, + "loss": 1.5938, + "step": 1333 + }, + { + "epoch": 0.40491728638640156, + "grad_norm": 0.3822105824947357, + "learning_rate": 9.329755998785056e-05, + "loss": 1.9013, + "step": 1334 + }, + { + "epoch": 0.40522082258309305, + "grad_norm": 0.7788810133934021, + "learning_rate": 9.329249772198037e-05, + "loss": 1.6752, + "step": 1335 + }, + { + "epoch": 0.4055243587797845, + "grad_norm": 0.4163956940174103, + "learning_rate": 9.328743545611017e-05, + "loss": 1.7016, + "step": 1336 + }, + { + "epoch": 0.4058278949764759, + "grad_norm": 0.42450758814811707, + "learning_rate": 9.328237319023996e-05, + "loss": 1.831, + "step": 1337 + }, + { + "epoch": 0.4061314311731674, + "grad_norm": 0.4169425666332245, + "learning_rate": 9.327731092436976e-05, + "loss": 1.7361, + "step": 1338 + }, + { + "epoch": 0.40643496736985885, + "grad_norm": 0.3413407802581787, + "learning_rate": 9.327224865849955e-05, + "loss": 2.0173, + "step": 1339 + }, + { + "epoch": 0.40673850356655034, + "grad_norm": 0.3989046812057495, + "learning_rate": 9.326718639262934e-05, + "loss": 1.9196, + "step": 1340 + }, + { + "epoch": 0.40704203976324177, + "grad_norm": 0.47707435488700867, + "learning_rate": 9.326212412675914e-05, + "loss": 1.9119, + "step": 1341 + }, + { + "epoch": 0.4073455759599332, + "grad_norm": 0.3998529314994812, + "learning_rate": 9.325706186088893e-05, + "loss": 1.8923, + "step": 1342 + }, + { + "epoch": 0.4076491121566247, + "grad_norm": 0.3560973107814789, + "learning_rate": 9.325199959501873e-05, + "loss": 1.8033, + "step": 1343 + }, + { + "epoch": 0.4079526483533161, + "grad_norm": 0.42655158042907715, + "learning_rate": 9.324693732914854e-05, + "loss": 1.5517, + "step": 1344 + }, + { + "epoch": 0.40825618455000756, + "grad_norm": 0.4044337272644043, + "learning_rate": 9.324187506327833e-05, + "loss": 1.8332, + "step": 1345 + }, + { + "epoch": 0.40855972074669905, + "grad_norm": 0.382467120885849, + "learning_rate": 9.323681279740813e-05, + "loss": 2.0407, + "step": 1346 + }, + { + "epoch": 0.4088632569433905, + "grad_norm": 0.46734219789505005, + "learning_rate": 9.323175053153792e-05, + "loss": 1.5662, + "step": 1347 + }, + { + "epoch": 0.409166793140082, + "grad_norm": 0.45105868577957153, + 
"learning_rate": 9.322668826566772e-05, + "loss": 2.0506, + "step": 1348 + }, + { + "epoch": 0.4094703293367734, + "grad_norm": 0.3531922399997711, + "learning_rate": 9.322162599979751e-05, + "loss": 1.7494, + "step": 1349 + }, + { + "epoch": 0.40977386553346484, + "grad_norm": 0.3707609474658966, + "learning_rate": 9.32165637339273e-05, + "loss": 2.2804, + "step": 1350 + }, + { + "epoch": 0.41007740173015633, + "grad_norm": 0.38254693150520325, + "learning_rate": 9.32115014680571e-05, + "loss": 1.9183, + "step": 1351 + }, + { + "epoch": 0.41038093792684777, + "grad_norm": 0.41418614983558655, + "learning_rate": 9.32064392021869e-05, + "loss": 1.8523, + "step": 1352 + }, + { + "epoch": 0.41068447412353926, + "grad_norm": 0.42098134756088257, + "learning_rate": 9.32013769363167e-05, + "loss": 1.712, + "step": 1353 + }, + { + "epoch": 0.4109880103202307, + "grad_norm": 0.3387204706668854, + "learning_rate": 9.31963146704465e-05, + "loss": 1.7652, + "step": 1354 + }, + { + "epoch": 0.4112915465169221, + "grad_norm": 0.4330706000328064, + "learning_rate": 9.31912524045763e-05, + "loss": 1.478, + "step": 1355 + }, + { + "epoch": 0.4115950827136136, + "grad_norm": 0.36673831939697266, + "learning_rate": 9.318619013870609e-05, + "loss": 1.7798, + "step": 1356 + }, + { + "epoch": 0.41189861891030505, + "grad_norm": 0.40374481678009033, + "learning_rate": 9.318112787283588e-05, + "loss": 1.6161, + "step": 1357 + }, + { + "epoch": 0.4122021551069965, + "grad_norm": 0.38840124011039734, + "learning_rate": 9.317606560696568e-05, + "loss": 1.3431, + "step": 1358 + }, + { + "epoch": 0.41250569130368797, + "grad_norm": 0.4768214225769043, + "learning_rate": 9.317100334109547e-05, + "loss": 1.6115, + "step": 1359 + }, + { + "epoch": 0.4128092275003794, + "grad_norm": 0.43069908022880554, + "learning_rate": 9.316594107522527e-05, + "loss": 2.0131, + "step": 1360 + }, + { + "epoch": 0.4131127636970709, + "grad_norm": 0.36959967017173767, + "learning_rate": 9.316087880935506e-05, + "loss": 1.982, + "step": 1361 + }, + { + "epoch": 0.41341629989376233, + "grad_norm": 0.3068915009498596, + "learning_rate": 9.315581654348486e-05, + "loss": 1.8105, + "step": 1362 + }, + { + "epoch": 0.41371983609045376, + "grad_norm": 0.33738118410110474, + "learning_rate": 9.315075427761467e-05, + "loss": 1.6039, + "step": 1363 + }, + { + "epoch": 0.41402337228714525, + "grad_norm": 0.38889479637145996, + "learning_rate": 9.314569201174446e-05, + "loss": 1.7897, + "step": 1364 + }, + { + "epoch": 0.4143269084838367, + "grad_norm": 0.35099512338638306, + "learning_rate": 9.314062974587426e-05, + "loss": 1.5897, + "step": 1365 + }, + { + "epoch": 0.4146304446805282, + "grad_norm": 0.3819596767425537, + "learning_rate": 9.313556748000405e-05, + "loss": 1.4781, + "step": 1366 + }, + { + "epoch": 0.4149339808772196, + "grad_norm": 0.392493337392807, + "learning_rate": 9.313050521413386e-05, + "loss": 1.8577, + "step": 1367 + }, + { + "epoch": 0.41523751707391104, + "grad_norm": 0.34424975514411926, + "learning_rate": 9.312544294826365e-05, + "loss": 1.4452, + "step": 1368 + }, + { + "epoch": 0.41554105327060253, + "grad_norm": 0.44334256649017334, + "learning_rate": 9.312038068239345e-05, + "loss": 1.5221, + "step": 1369 + }, + { + "epoch": 0.41584458946729397, + "grad_norm": 0.4194605350494385, + "learning_rate": 9.311531841652324e-05, + "loss": 2.3915, + "step": 1370 + }, + { + "epoch": 0.41614812566398546, + "grad_norm": 0.33700132369995117, + "learning_rate": 9.311025615065304e-05, + "loss": 1.9193, + "step": 1371 + }, + { + 
"epoch": 0.4164516618606769, + "grad_norm": 0.4527650773525238, + "learning_rate": 9.310519388478283e-05, + "loss": 1.766, + "step": 1372 + }, + { + "epoch": 0.4167551980573683, + "grad_norm": 0.3435012996196747, + "learning_rate": 9.310013161891263e-05, + "loss": 1.8662, + "step": 1373 + }, + { + "epoch": 0.4170587342540598, + "grad_norm": 0.3468983471393585, + "learning_rate": 9.309506935304244e-05, + "loss": 1.5261, + "step": 1374 + }, + { + "epoch": 0.41736227045075125, + "grad_norm": 0.38368481397628784, + "learning_rate": 9.309000708717223e-05, + "loss": 1.6389, + "step": 1375 + }, + { + "epoch": 0.4176658066474427, + "grad_norm": 0.38153746724128723, + "learning_rate": 9.308494482130203e-05, + "loss": 1.7431, + "step": 1376 + }, + { + "epoch": 0.4179693428441342, + "grad_norm": 0.4192492961883545, + "learning_rate": 9.307988255543182e-05, + "loss": 1.8582, + "step": 1377 + }, + { + "epoch": 0.4182728790408256, + "grad_norm": 0.42689767479896545, + "learning_rate": 9.307482028956161e-05, + "loss": 1.9049, + "step": 1378 + }, + { + "epoch": 0.4185764152375171, + "grad_norm": 0.38545602560043335, + "learning_rate": 9.306975802369141e-05, + "loss": 1.2598, + "step": 1379 + }, + { + "epoch": 0.41887995143420853, + "grad_norm": 0.4117288887500763, + "learning_rate": 9.30646957578212e-05, + "loss": 1.9808, + "step": 1380 + }, + { + "epoch": 0.41918348763089996, + "grad_norm": 0.38102084398269653, + "learning_rate": 9.3059633491951e-05, + "loss": 1.9734, + "step": 1381 + }, + { + "epoch": 0.41948702382759145, + "grad_norm": 0.3788716495037079, + "learning_rate": 9.30545712260808e-05, + "loss": 1.9655, + "step": 1382 + }, + { + "epoch": 0.4197905600242829, + "grad_norm": 1.5338399410247803, + "learning_rate": 9.30495089602106e-05, + "loss": 2.1111, + "step": 1383 + }, + { + "epoch": 0.4200940962209744, + "grad_norm": 0.40994685888290405, + "learning_rate": 9.30444466943404e-05, + "loss": 1.9063, + "step": 1384 + }, + { + "epoch": 0.4203976324176658, + "grad_norm": 0.3389085829257965, + "learning_rate": 9.303938442847019e-05, + "loss": 1.8592, + "step": 1385 + }, + { + "epoch": 0.42070116861435725, + "grad_norm": 1.0117053985595703, + "learning_rate": 9.303432216259999e-05, + "loss": 1.4198, + "step": 1386 + }, + { + "epoch": 0.42100470481104874, + "grad_norm": 0.37429583072662354, + "learning_rate": 9.302925989672978e-05, + "loss": 1.6439, + "step": 1387 + }, + { + "epoch": 0.42130824100774017, + "grad_norm": 0.397991806268692, + "learning_rate": 9.302419763085958e-05, + "loss": 1.9744, + "step": 1388 + }, + { + "epoch": 0.4216117772044316, + "grad_norm": 0.39546629786491394, + "learning_rate": 9.301913536498937e-05, + "loss": 1.993, + "step": 1389 + }, + { + "epoch": 0.4219153134011231, + "grad_norm": 0.3465210497379303, + "learning_rate": 9.301407309911917e-05, + "loss": 1.8254, + "step": 1390 + }, + { + "epoch": 0.4222188495978145, + "grad_norm": 0.36281952261924744, + "learning_rate": 9.300901083324896e-05, + "loss": 1.9205, + "step": 1391 + }, + { + "epoch": 0.422522385794506, + "grad_norm": 0.37978988885879517, + "learning_rate": 9.300394856737877e-05, + "loss": 1.8021, + "step": 1392 + }, + { + "epoch": 0.42282592199119745, + "grad_norm": 0.3463260531425476, + "learning_rate": 9.299888630150856e-05, + "loss": 2.1022, + "step": 1393 + }, + { + "epoch": 0.4231294581878889, + "grad_norm": 0.3449305593967438, + "learning_rate": 9.299382403563836e-05, + "loss": 1.808, + "step": 1394 + }, + { + "epoch": 0.4234329943845804, + "grad_norm": 0.3900066018104553, + "learning_rate": 
9.298876176976815e-05, + "loss": 1.8926, + "step": 1395 + }, + { + "epoch": 0.4237365305812718, + "grad_norm": 0.3958972692489624, + "learning_rate": 9.298369950389795e-05, + "loss": 1.7716, + "step": 1396 + }, + { + "epoch": 0.4240400667779633, + "grad_norm": 0.41263818740844727, + "learning_rate": 9.297863723802774e-05, + "loss": 1.9745, + "step": 1397 + }, + { + "epoch": 0.42434360297465473, + "grad_norm": 0.44245028495788574, + "learning_rate": 9.297357497215754e-05, + "loss": 1.6498, + "step": 1398 + }, + { + "epoch": 0.42464713917134617, + "grad_norm": 0.36662882566452026, + "learning_rate": 9.296851270628733e-05, + "loss": 1.9321, + "step": 1399 + }, + { + "epoch": 0.42495067536803766, + "grad_norm": 0.38561105728149414, + "learning_rate": 9.296345044041713e-05, + "loss": 1.8661, + "step": 1400 + }, + { + "epoch": 0.4252542115647291, + "grad_norm": 0.3688740134239197, + "learning_rate": 9.295838817454692e-05, + "loss": 2.1375, + "step": 1401 + }, + { + "epoch": 0.4255577477614205, + "grad_norm": 0.3883054256439209, + "learning_rate": 9.295332590867673e-05, + "loss": 1.486, + "step": 1402 + }, + { + "epoch": 0.425861283958112, + "grad_norm": 0.4107448160648346, + "learning_rate": 9.294826364280653e-05, + "loss": 1.9075, + "step": 1403 + }, + { + "epoch": 0.42616482015480345, + "grad_norm": 0.4174923896789551, + "learning_rate": 9.294320137693632e-05, + "loss": 2.0668, + "step": 1404 + }, + { + "epoch": 0.42646835635149494, + "grad_norm": 0.4573984444141388, + "learning_rate": 9.293813911106611e-05, + "loss": 1.8517, + "step": 1405 + }, + { + "epoch": 0.42677189254818637, + "grad_norm": 0.3820217251777649, + "learning_rate": 9.293307684519591e-05, + "loss": 1.7841, + "step": 1406 + }, + { + "epoch": 0.4270754287448778, + "grad_norm": 0.34213465452194214, + "learning_rate": 9.29280145793257e-05, + "loss": 1.9139, + "step": 1407 + }, + { + "epoch": 0.4273789649415693, + "grad_norm": 0.3995790481567383, + "learning_rate": 9.29229523134555e-05, + "loss": 1.6883, + "step": 1408 + }, + { + "epoch": 0.42768250113826073, + "grad_norm": 0.4142625331878662, + "learning_rate": 9.29178900475853e-05, + "loss": 2.0771, + "step": 1409 + }, + { + "epoch": 0.4279860373349522, + "grad_norm": 0.3818739354610443, + "learning_rate": 9.291282778171509e-05, + "loss": 1.6682, + "step": 1410 + }, + { + "epoch": 0.42828957353164365, + "grad_norm": 0.36996081471443176, + "learning_rate": 9.29077655158449e-05, + "loss": 2.1084, + "step": 1411 + }, + { + "epoch": 0.4285931097283351, + "grad_norm": 0.4592280983924866, + "learning_rate": 9.290270324997469e-05, + "loss": 1.4502, + "step": 1412 + }, + { + "epoch": 0.4288966459250266, + "grad_norm": 0.4243657886981964, + "learning_rate": 9.28976409841045e-05, + "loss": 1.8459, + "step": 1413 + }, + { + "epoch": 0.429200182121718, + "grad_norm": 0.4068589508533478, + "learning_rate": 9.28925787182343e-05, + "loss": 1.8392, + "step": 1414 + }, + { + "epoch": 0.42950371831840944, + "grad_norm": 0.3421384394168854, + "learning_rate": 9.288751645236409e-05, + "loss": 1.9204, + "step": 1415 + }, + { + "epoch": 0.42980725451510093, + "grad_norm": 0.36633387207984924, + "learning_rate": 9.288245418649388e-05, + "loss": 2.1934, + "step": 1416 + }, + { + "epoch": 0.43011079071179237, + "grad_norm": 0.6671120524406433, + "learning_rate": 9.287739192062368e-05, + "loss": 1.7614, + "step": 1417 + }, + { + "epoch": 0.43041432690848386, + "grad_norm": 0.3610883057117462, + "learning_rate": 9.287232965475347e-05, + "loss": 1.9075, + "step": 1418 + }, + { + "epoch": 
0.4307178631051753, + "grad_norm": 0.42165407538414, + "learning_rate": 9.286726738888327e-05, + "loss": 1.4474, + "step": 1419 + }, + { + "epoch": 0.4310213993018667, + "grad_norm": 0.38051116466522217, + "learning_rate": 9.286220512301306e-05, + "loss": 1.7629, + "step": 1420 + }, + { + "epoch": 0.4313249354985582, + "grad_norm": 0.38990986347198486, + "learning_rate": 9.285714285714286e-05, + "loss": 1.7111, + "step": 1421 + }, + { + "epoch": 0.43162847169524965, + "grad_norm": 0.3510812222957611, + "learning_rate": 9.285208059127267e-05, + "loss": 1.7695, + "step": 1422 + }, + { + "epoch": 0.43193200789194114, + "grad_norm": 0.34757426381111145, + "learning_rate": 9.284701832540246e-05, + "loss": 2.173, + "step": 1423 + }, + { + "epoch": 0.4322355440886326, + "grad_norm": 0.3806573152542114, + "learning_rate": 9.284195605953226e-05, + "loss": 1.8029, + "step": 1424 + }, + { + "epoch": 0.432539080285324, + "grad_norm": 0.3845151662826538, + "learning_rate": 9.283689379366205e-05, + "loss": 1.902, + "step": 1425 + }, + { + "epoch": 0.4328426164820155, + "grad_norm": 0.40006932616233826, + "learning_rate": 9.283183152779185e-05, + "loss": 1.6436, + "step": 1426 + }, + { + "epoch": 0.43314615267870693, + "grad_norm": 0.5392235517501831, + "learning_rate": 9.282676926192164e-05, + "loss": 1.921, + "step": 1427 + }, + { + "epoch": 0.43344968887539836, + "grad_norm": 0.4523599147796631, + "learning_rate": 9.282170699605144e-05, + "loss": 1.7473, + "step": 1428 + }, + { + "epoch": 0.43375322507208985, + "grad_norm": 0.3809603154659271, + "learning_rate": 9.281664473018123e-05, + "loss": 1.5461, + "step": 1429 + }, + { + "epoch": 0.4340567612687813, + "grad_norm": 0.4202471375465393, + "learning_rate": 9.281158246431103e-05, + "loss": 1.995, + "step": 1430 + }, + { + "epoch": 0.4343602974654728, + "grad_norm": 0.42767444252967834, + "learning_rate": 9.280652019844083e-05, + "loss": 1.9536, + "step": 1431 + }, + { + "epoch": 0.4346638336621642, + "grad_norm": 0.4050025939941406, + "learning_rate": 9.280145793257063e-05, + "loss": 1.6169, + "step": 1432 + }, + { + "epoch": 0.43496736985885565, + "grad_norm": 0.4572995901107788, + "learning_rate": 9.279639566670042e-05, + "loss": 1.5711, + "step": 1433 + }, + { + "epoch": 0.43527090605554714, + "grad_norm": 0.4102776050567627, + "learning_rate": 9.279133340083022e-05, + "loss": 1.9844, + "step": 1434 + }, + { + "epoch": 0.43557444225223857, + "grad_norm": 0.4309599995613098, + "learning_rate": 9.278627113496001e-05, + "loss": 1.8742, + "step": 1435 + }, + { + "epoch": 0.43587797844893006, + "grad_norm": 0.34951043128967285, + "learning_rate": 9.278120886908981e-05, + "loss": 1.9262, + "step": 1436 + }, + { + "epoch": 0.4361815146456215, + "grad_norm": 0.47722557187080383, + "learning_rate": 9.27761466032196e-05, + "loss": 1.5605, + "step": 1437 + }, + { + "epoch": 0.4364850508423129, + "grad_norm": 0.37698620557785034, + "learning_rate": 9.27710843373494e-05, + "loss": 2.3081, + "step": 1438 + }, + { + "epoch": 0.4367885870390044, + "grad_norm": 0.40668490529060364, + "learning_rate": 9.276602207147919e-05, + "loss": 1.4524, + "step": 1439 + }, + { + "epoch": 0.43709212323569585, + "grad_norm": 0.4384947121143341, + "learning_rate": 9.276095980560899e-05, + "loss": 1.7878, + "step": 1440 + }, + { + "epoch": 0.4373956594323873, + "grad_norm": 3.140451192855835, + "learning_rate": 9.27558975397388e-05, + "loss": 1.7084, + "step": 1441 + }, + { + "epoch": 0.4376991956290788, + "grad_norm": 0.43369218707084656, + "learning_rate": 
9.275083527386859e-05, + "loss": 2.0439, + "step": 1442 + }, + { + "epoch": 0.4380027318257702, + "grad_norm": 0.4725881516933441, + "learning_rate": 9.274577300799838e-05, + "loss": 2.0507, + "step": 1443 + }, + { + "epoch": 0.4383062680224617, + "grad_norm": 0.4496382474899292, + "learning_rate": 9.274071074212818e-05, + "loss": 2.1349, + "step": 1444 + }, + { + "epoch": 0.43860980421915313, + "grad_norm": 1.6437734365463257, + "learning_rate": 9.273564847625797e-05, + "loss": 1.9441, + "step": 1445 + }, + { + "epoch": 0.43891334041584457, + "grad_norm": 0.4106156527996063, + "learning_rate": 9.273058621038777e-05, + "loss": 1.4615, + "step": 1446 + }, + { + "epoch": 0.43921687661253606, + "grad_norm": 0.4387066960334778, + "learning_rate": 9.272552394451756e-05, + "loss": 1.8679, + "step": 1447 + }, + { + "epoch": 0.4395204128092275, + "grad_norm": 0.44515758752822876, + "learning_rate": 9.272046167864736e-05, + "loss": 1.9675, + "step": 1448 + }, + { + "epoch": 0.439823949005919, + "grad_norm": 0.43665841221809387, + "learning_rate": 9.271539941277715e-05, + "loss": 2.2782, + "step": 1449 + }, + { + "epoch": 0.4401274852026104, + "grad_norm": 0.3593182861804962, + "learning_rate": 9.271033714690696e-05, + "loss": 1.6537, + "step": 1450 + }, + { + "epoch": 0.44043102139930185, + "grad_norm": 0.38529497385025024, + "learning_rate": 9.270527488103676e-05, + "loss": 1.9399, + "step": 1451 + }, + { + "epoch": 0.44073455759599334, + "grad_norm": 0.42474156618118286, + "learning_rate": 9.270021261516655e-05, + "loss": 1.849, + "step": 1452 + }, + { + "epoch": 0.44103809379268477, + "grad_norm": 0.4505622684955597, + "learning_rate": 9.269515034929635e-05, + "loss": 1.9889, + "step": 1453 + }, + { + "epoch": 0.44134162998937626, + "grad_norm": 1.8219722509384155, + "learning_rate": 9.269008808342614e-05, + "loss": 2.1467, + "step": 1454 + }, + { + "epoch": 0.4416451661860677, + "grad_norm": 0.6941187381744385, + "learning_rate": 9.268502581755594e-05, + "loss": 2.1441, + "step": 1455 + }, + { + "epoch": 0.44194870238275913, + "grad_norm": 0.6262606978416443, + "learning_rate": 9.267996355168574e-05, + "loss": 1.9937, + "step": 1456 + }, + { + "epoch": 0.4422522385794506, + "grad_norm": 0.3790215253829956, + "learning_rate": 9.267490128581554e-05, + "loss": 1.7468, + "step": 1457 + }, + { + "epoch": 0.44255577477614205, + "grad_norm": 0.42074668407440186, + "learning_rate": 9.266983901994533e-05, + "loss": 2.1245, + "step": 1458 + }, + { + "epoch": 0.4428593109728335, + "grad_norm": 0.464870810508728, + "learning_rate": 9.266477675407513e-05, + "loss": 1.8672, + "step": 1459 + }, + { + "epoch": 0.443162847169525, + "grad_norm": 0.4551111161708832, + "learning_rate": 9.265971448820492e-05, + "loss": 2.054, + "step": 1460 + }, + { + "epoch": 0.4434663833662164, + "grad_norm": 0.3874572813510895, + "learning_rate": 9.265465222233473e-05, + "loss": 1.8281, + "step": 1461 + }, + { + "epoch": 0.4437699195629079, + "grad_norm": 0.44287312030792236, + "learning_rate": 9.264958995646453e-05, + "loss": 1.6435, + "step": 1462 + }, + { + "epoch": 0.44407345575959933, + "grad_norm": 0.41155338287353516, + "learning_rate": 9.264452769059432e-05, + "loss": 1.9611, + "step": 1463 + }, + { + "epoch": 0.44437699195629077, + "grad_norm": 0.480648398399353, + "learning_rate": 9.263946542472412e-05, + "loss": 1.7771, + "step": 1464 + }, + { + "epoch": 0.44468052815298226, + "grad_norm": 0.4704960286617279, + "learning_rate": 9.263440315885391e-05, + "loss": 0.6294, + "step": 1465 + }, + { + "epoch": 
0.4449840643496737, + "grad_norm": 0.4150315821170807, + "learning_rate": 9.26293408929837e-05, + "loss": 1.7698, + "step": 1466 + }, + { + "epoch": 0.4452876005463652, + "grad_norm": 0.5981085300445557, + "learning_rate": 9.26242786271135e-05, + "loss": 1.7192, + "step": 1467 + }, + { + "epoch": 0.4455911367430566, + "grad_norm": 0.43365392088890076, + "learning_rate": 9.26192163612433e-05, + "loss": 1.8843, + "step": 1468 + }, + { + "epoch": 0.44589467293974805, + "grad_norm": 0.7336254715919495, + "learning_rate": 9.261415409537309e-05, + "loss": 2.0101, + "step": 1469 + }, + { + "epoch": 0.44619820913643954, + "grad_norm": 0.4002796411514282, + "learning_rate": 9.26090918295029e-05, + "loss": 1.9817, + "step": 1470 + }, + { + "epoch": 0.446501745333131, + "grad_norm": 0.4379813075065613, + "learning_rate": 9.26040295636327e-05, + "loss": 2.1091, + "step": 1471 + }, + { + "epoch": 0.4468052815298224, + "grad_norm": 0.4577115774154663, + "learning_rate": 9.259896729776249e-05, + "loss": 1.6132, + "step": 1472 + }, + { + "epoch": 0.4471088177265139, + "grad_norm": 0.40199458599090576, + "learning_rate": 9.259390503189228e-05, + "loss": 1.9815, + "step": 1473 + }, + { + "epoch": 0.44741235392320533, + "grad_norm": 0.4442947506904602, + "learning_rate": 9.258884276602208e-05, + "loss": 1.8425, + "step": 1474 + }, + { + "epoch": 0.4477158901198968, + "grad_norm": 0.3720739781856537, + "learning_rate": 9.258378050015187e-05, + "loss": 2.1161, + "step": 1475 + }, + { + "epoch": 0.44801942631658825, + "grad_norm": 0.39746803045272827, + "learning_rate": 9.257871823428167e-05, + "loss": 2.0404, + "step": 1476 + }, + { + "epoch": 0.4483229625132797, + "grad_norm": 0.4376835525035858, + "learning_rate": 9.257365596841146e-05, + "loss": 1.7201, + "step": 1477 + }, + { + "epoch": 0.4486264987099712, + "grad_norm": 0.35988250374794006, + "learning_rate": 9.256859370254126e-05, + "loss": 1.1999, + "step": 1478 + }, + { + "epoch": 0.4489300349066626, + "grad_norm": 0.41253864765167236, + "learning_rate": 9.256353143667105e-05, + "loss": 1.9916, + "step": 1479 + }, + { + "epoch": 0.4492335711033541, + "grad_norm": 0.34956973791122437, + "learning_rate": 9.255846917080086e-05, + "loss": 1.7406, + "step": 1480 + }, + { + "epoch": 0.44953710730004554, + "grad_norm": 0.452239453792572, + "learning_rate": 9.255340690493065e-05, + "loss": 2.0101, + "step": 1481 + }, + { + "epoch": 0.44984064349673697, + "grad_norm": 0.36039796471595764, + "learning_rate": 9.254834463906045e-05, + "loss": 1.9181, + "step": 1482 + }, + { + "epoch": 0.45014417969342846, + "grad_norm": 0.34030023217201233, + "learning_rate": 9.254328237319024e-05, + "loss": 1.6803, + "step": 1483 + }, + { + "epoch": 0.4504477158901199, + "grad_norm": 0.3585798144340515, + "learning_rate": 9.253822010732004e-05, + "loss": 1.8983, + "step": 1484 + }, + { + "epoch": 0.4507512520868113, + "grad_norm": 0.4554307758808136, + "learning_rate": 9.253315784144983e-05, + "loss": 1.741, + "step": 1485 + }, + { + "epoch": 0.4510547882835028, + "grad_norm": 0.36281803250312805, + "learning_rate": 9.252809557557963e-05, + "loss": 2.0279, + "step": 1486 + }, + { + "epoch": 0.45135832448019425, + "grad_norm": 0.4025228023529053, + "learning_rate": 9.252303330970942e-05, + "loss": 1.8517, + "step": 1487 + }, + { + "epoch": 0.45166186067688574, + "grad_norm": 0.3962991535663605, + "learning_rate": 9.251797104383922e-05, + "loss": 1.9199, + "step": 1488 + }, + { + "epoch": 0.4519653968735772, + "grad_norm": 0.4201490879058838, + "learning_rate": 
9.251290877796903e-05, + "loss": 2.0137, + "step": 1489 + }, + { + "epoch": 0.4522689330702686, + "grad_norm": 0.4605710804462433, + "learning_rate": 9.250784651209882e-05, + "loss": 1.975, + "step": 1490 + }, + { + "epoch": 0.4525724692669601, + "grad_norm": 0.3571039140224457, + "learning_rate": 9.250278424622862e-05, + "loss": 1.8478, + "step": 1491 + }, + { + "epoch": 0.45287600546365153, + "grad_norm": 0.406676709651947, + "learning_rate": 9.249772198035841e-05, + "loss": 1.965, + "step": 1492 + }, + { + "epoch": 0.453179541660343, + "grad_norm": 0.6116447448730469, + "learning_rate": 9.24926597144882e-05, + "loss": 1.6192, + "step": 1493 + }, + { + "epoch": 0.45348307785703446, + "grad_norm": 0.4193543493747711, + "learning_rate": 9.2487597448618e-05, + "loss": 1.8085, + "step": 1494 + }, + { + "epoch": 0.4537866140537259, + "grad_norm": 0.4082903563976288, + "learning_rate": 9.24825351827478e-05, + "loss": 1.8924, + "step": 1495 + }, + { + "epoch": 0.4540901502504174, + "grad_norm": 0.4163326919078827, + "learning_rate": 9.247747291687759e-05, + "loss": 1.9238, + "step": 1496 + }, + { + "epoch": 0.4543936864471088, + "grad_norm": 0.4481281340122223, + "learning_rate": 9.247241065100739e-05, + "loss": 1.7663, + "step": 1497 + }, + { + "epoch": 0.45469722264380025, + "grad_norm": 0.3282391428947449, + "learning_rate": 9.24673483851372e-05, + "loss": 2.0332, + "step": 1498 + }, + { + "epoch": 0.45500075884049174, + "grad_norm": 0.43553873896598816, + "learning_rate": 9.246228611926699e-05, + "loss": 2.0712, + "step": 1499 + }, + { + "epoch": 0.45530429503718317, + "grad_norm": 0.40410909056663513, + "learning_rate": 9.245722385339678e-05, + "loss": 1.9479, + "step": 1500 + }, + { + "epoch": 0.45560783123387466, + "grad_norm": 0.36232396960258484, + "learning_rate": 9.245216158752659e-05, + "loss": 1.9359, + "step": 1501 + }, + { + "epoch": 0.4559113674305661, + "grad_norm": 0.44860419631004333, + "learning_rate": 9.244709932165639e-05, + "loss": 1.263, + "step": 1502 + }, + { + "epoch": 0.45621490362725753, + "grad_norm": 0.5308701395988464, + "learning_rate": 9.244203705578618e-05, + "loss": 2.2914, + "step": 1503 + }, + { + "epoch": 0.456518439823949, + "grad_norm": 0.4460773468017578, + "learning_rate": 9.243697478991598e-05, + "loss": 1.8063, + "step": 1504 + }, + { + "epoch": 0.45682197602064045, + "grad_norm": 0.4147963523864746, + "learning_rate": 9.243191252404577e-05, + "loss": 2.045, + "step": 1505 + }, + { + "epoch": 0.45712551221733194, + "grad_norm": 0.34958329796791077, + "learning_rate": 9.242685025817557e-05, + "loss": 1.8712, + "step": 1506 + }, + { + "epoch": 0.4574290484140234, + "grad_norm": 0.36072060465812683, + "learning_rate": 9.242178799230536e-05, + "loss": 1.7198, + "step": 1507 + }, + { + "epoch": 0.4577325846107148, + "grad_norm": 0.4608067274093628, + "learning_rate": 9.241672572643515e-05, + "loss": 1.7165, + "step": 1508 + }, + { + "epoch": 0.4580361208074063, + "grad_norm": 0.39580467343330383, + "learning_rate": 9.241166346056496e-05, + "loss": 1.665, + "step": 1509 + }, + { + "epoch": 0.45833965700409773, + "grad_norm": 0.4920599162578583, + "learning_rate": 9.240660119469476e-05, + "loss": 1.872, + "step": 1510 + }, + { + "epoch": 0.45864319320078917, + "grad_norm": 0.4332992136478424, + "learning_rate": 9.240153892882455e-05, + "loss": 1.8972, + "step": 1511 + }, + { + "epoch": 0.45894672939748066, + "grad_norm": 0.39618152379989624, + "learning_rate": 9.239647666295435e-05, + "loss": 2.0167, + "step": 1512 + }, + { + "epoch": 0.4592502655941721, + 
"grad_norm": 0.6713082790374756, + "learning_rate": 9.239141439708414e-05, + "loss": 2.0, + "step": 1513 + }, + { + "epoch": 0.4595538017908636, + "grad_norm": 0.34422579407691956, + "learning_rate": 9.238635213121394e-05, + "loss": 1.6438, + "step": 1514 + }, + { + "epoch": 0.459857337987555, + "grad_norm": 0.43874865770339966, + "learning_rate": 9.238128986534373e-05, + "loss": 1.6388, + "step": 1515 + }, + { + "epoch": 0.46016087418424645, + "grad_norm": 0.5863097906112671, + "learning_rate": 9.237622759947353e-05, + "loss": 1.6764, + "step": 1516 + }, + { + "epoch": 0.46046441038093794, + "grad_norm": 0.3312426805496216, + "learning_rate": 9.237116533360332e-05, + "loss": 1.8491, + "step": 1517 + }, + { + "epoch": 0.4607679465776294, + "grad_norm": 0.3111588656902313, + "learning_rate": 9.236610306773312e-05, + "loss": 2.0298, + "step": 1518 + }, + { + "epoch": 0.46107148277432086, + "grad_norm": 0.38705703616142273, + "learning_rate": 9.236104080186292e-05, + "loss": 2.0584, + "step": 1519 + }, + { + "epoch": 0.4613750189710123, + "grad_norm": 0.32613542675971985, + "learning_rate": 9.235597853599272e-05, + "loss": 1.8722, + "step": 1520 + }, + { + "epoch": 0.46167855516770373, + "grad_norm": 0.9304127097129822, + "learning_rate": 9.235091627012251e-05, + "loss": 1.978, + "step": 1521 + }, + { + "epoch": 0.4619820913643952, + "grad_norm": 0.3754931688308716, + "learning_rate": 9.234585400425231e-05, + "loss": 1.8724, + "step": 1522 + }, + { + "epoch": 0.46228562756108665, + "grad_norm": 0.4033370912075043, + "learning_rate": 9.23407917383821e-05, + "loss": 1.3349, + "step": 1523 + }, + { + "epoch": 0.4625891637577781, + "grad_norm": 0.35285013914108276, + "learning_rate": 9.23357294725119e-05, + "loss": 1.442, + "step": 1524 + }, + { + "epoch": 0.4628926999544696, + "grad_norm": 0.4044554531574249, + "learning_rate": 9.23306672066417e-05, + "loss": 2.0633, + "step": 1525 + }, + { + "epoch": 0.463196236151161, + "grad_norm": 0.46915552020072937, + "learning_rate": 9.232560494077149e-05, + "loss": 1.3861, + "step": 1526 + }, + { + "epoch": 0.4634997723478525, + "grad_norm": 0.4107852280139923, + "learning_rate": 9.232054267490128e-05, + "loss": 1.9011, + "step": 1527 + }, + { + "epoch": 0.46380330854454394, + "grad_norm": 0.4018856883049011, + "learning_rate": 9.231548040903109e-05, + "loss": 1.843, + "step": 1528 + }, + { + "epoch": 0.46410684474123537, + "grad_norm": 0.36814266443252563, + "learning_rate": 9.231041814316089e-05, + "loss": 1.897, + "step": 1529 + }, + { + "epoch": 0.46441038093792686, + "grad_norm": 0.42271214723587036, + "learning_rate": 9.230535587729068e-05, + "loss": 1.9761, + "step": 1530 + }, + { + "epoch": 0.4647139171346183, + "grad_norm": 0.4548446238040924, + "learning_rate": 9.230029361142048e-05, + "loss": 1.9313, + "step": 1531 + }, + { + "epoch": 0.4650174533313098, + "grad_norm": 0.4320158064365387, + "learning_rate": 9.229523134555027e-05, + "loss": 1.5687, + "step": 1532 + }, + { + "epoch": 0.4653209895280012, + "grad_norm": 0.3909349739551544, + "learning_rate": 9.229016907968007e-05, + "loss": 1.337, + "step": 1533 + }, + { + "epoch": 0.46562452572469265, + "grad_norm": 0.40204015374183655, + "learning_rate": 9.228510681380986e-05, + "loss": 1.9838, + "step": 1534 + }, + { + "epoch": 0.46592806192138414, + "grad_norm": 0.3997584879398346, + "learning_rate": 9.228004454793966e-05, + "loss": 1.8321, + "step": 1535 + }, + { + "epoch": 0.4662315981180756, + "grad_norm": 0.43689507246017456, + "learning_rate": 9.227498228206945e-05, + "loss": 2.0649, + 
"step": 1536 + }, + { + "epoch": 0.466535134314767, + "grad_norm": 0.3970150649547577, + "learning_rate": 9.226992001619926e-05, + "loss": 2.0077, + "step": 1537 + }, + { + "epoch": 0.4668386705114585, + "grad_norm": 0.3847435414791107, + "learning_rate": 9.226485775032905e-05, + "loss": 2.0168, + "step": 1538 + }, + { + "epoch": 0.46714220670814993, + "grad_norm": 0.40491220355033875, + "learning_rate": 9.225979548445885e-05, + "loss": 1.7831, + "step": 1539 + }, + { + "epoch": 0.4674457429048414, + "grad_norm": 0.401903361082077, + "learning_rate": 9.225473321858864e-05, + "loss": 2.007, + "step": 1540 + }, + { + "epoch": 0.46774927910153286, + "grad_norm": 0.6656989455223083, + "learning_rate": 9.224967095271844e-05, + "loss": 2.008, + "step": 1541 + }, + { + "epoch": 0.4680528152982243, + "grad_norm": 0.36222347617149353, + "learning_rate": 9.224460868684823e-05, + "loss": 1.8659, + "step": 1542 + }, + { + "epoch": 0.4683563514949158, + "grad_norm": 0.4396745562553406, + "learning_rate": 9.223954642097803e-05, + "loss": 1.9881, + "step": 1543 + }, + { + "epoch": 0.4686598876916072, + "grad_norm": 0.5994194149971008, + "learning_rate": 9.223448415510782e-05, + "loss": 2.1998, + "step": 1544 + }, + { + "epoch": 0.4689634238882987, + "grad_norm": 0.45558032393455505, + "learning_rate": 9.222942188923763e-05, + "loss": 1.6082, + "step": 1545 + }, + { + "epoch": 0.46926696008499014, + "grad_norm": 0.38417017459869385, + "learning_rate": 9.222435962336742e-05, + "loss": 1.7655, + "step": 1546 + }, + { + "epoch": 0.46957049628168157, + "grad_norm": 0.41946941614151, + "learning_rate": 9.221929735749722e-05, + "loss": 1.7472, + "step": 1547 + }, + { + "epoch": 0.46987403247837306, + "grad_norm": 0.39455467462539673, + "learning_rate": 9.221423509162703e-05, + "loss": 1.8377, + "step": 1548 + }, + { + "epoch": 0.4701775686750645, + "grad_norm": 0.3967253565788269, + "learning_rate": 9.220917282575682e-05, + "loss": 1.7838, + "step": 1549 + }, + { + "epoch": 0.470481104871756, + "grad_norm": 0.42535534501075745, + "learning_rate": 9.220411055988662e-05, + "loss": 2.1495, + "step": 1550 + }, + { + "epoch": 0.4707846410684474, + "grad_norm": 0.36706385016441345, + "learning_rate": 9.219904829401641e-05, + "loss": 1.9875, + "step": 1551 + }, + { + "epoch": 0.47108817726513885, + "grad_norm": 0.3747560977935791, + "learning_rate": 9.219398602814621e-05, + "loss": 1.5342, + "step": 1552 + }, + { + "epoch": 0.47139171346183034, + "grad_norm": 0.34010231494903564, + "learning_rate": 9.2188923762276e-05, + "loss": 2.0999, + "step": 1553 + }, + { + "epoch": 0.4716952496585218, + "grad_norm": 0.40051451325416565, + "learning_rate": 9.21838614964058e-05, + "loss": 1.8481, + "step": 1554 + }, + { + "epoch": 0.4719987858552132, + "grad_norm": 0.5217362642288208, + "learning_rate": 9.217879923053559e-05, + "loss": 1.4746, + "step": 1555 + }, + { + "epoch": 0.4723023220519047, + "grad_norm": 0.42339226603507996, + "learning_rate": 9.217373696466539e-05, + "loss": 1.947, + "step": 1556 + }, + { + "epoch": 0.47260585824859613, + "grad_norm": 0.3780953586101532, + "learning_rate": 9.216867469879518e-05, + "loss": 2.2093, + "step": 1557 + }, + { + "epoch": 0.4729093944452876, + "grad_norm": 0.38509401679039, + "learning_rate": 9.216361243292499e-05, + "loss": 1.6966, + "step": 1558 + }, + { + "epoch": 0.47321293064197906, + "grad_norm": 0.501438319683075, + "learning_rate": 9.215855016705478e-05, + "loss": 2.0505, + "step": 1559 + }, + { + "epoch": 0.4735164668386705, + "grad_norm": 0.42260050773620605, + 
"learning_rate": 9.215348790118458e-05, + "loss": 1.9439, + "step": 1560 + }, + { + "epoch": 0.473820003035362, + "grad_norm": 0.6031399965286255, + "learning_rate": 9.214842563531437e-05, + "loss": 1.9674, + "step": 1561 + }, + { + "epoch": 0.4741235392320534, + "grad_norm": 0.3809618055820465, + "learning_rate": 9.214336336944417e-05, + "loss": 1.9882, + "step": 1562 + }, + { + "epoch": 0.4744270754287449, + "grad_norm": 0.4074794352054596, + "learning_rate": 9.213830110357396e-05, + "loss": 1.6648, + "step": 1563 + }, + { + "epoch": 0.47473061162543634, + "grad_norm": 0.4380822479724884, + "learning_rate": 9.213323883770376e-05, + "loss": 2.1327, + "step": 1564 + }, + { + "epoch": 0.4750341478221278, + "grad_norm": 0.6130182147026062, + "learning_rate": 9.212817657183355e-05, + "loss": 1.957, + "step": 1565 + }, + { + "epoch": 0.47533768401881926, + "grad_norm": 0.359451025724411, + "learning_rate": 9.212311430596335e-05, + "loss": 1.5301, + "step": 1566 + }, + { + "epoch": 0.4756412202155107, + "grad_norm": 0.508237898349762, + "learning_rate": 9.211805204009316e-05, + "loss": 2.1409, + "step": 1567 + }, + { + "epoch": 0.47594475641220213, + "grad_norm": 0.5652433037757874, + "learning_rate": 9.211298977422295e-05, + "loss": 2.2032, + "step": 1568 + }, + { + "epoch": 0.4762482926088936, + "grad_norm": 0.36153456568717957, + "learning_rate": 9.210792750835275e-05, + "loss": 2.0994, + "step": 1569 + }, + { + "epoch": 0.47655182880558505, + "grad_norm": 0.4140501320362091, + "learning_rate": 9.210286524248254e-05, + "loss": 1.6165, + "step": 1570 + }, + { + "epoch": 0.47685536500227654, + "grad_norm": 0.36080101132392883, + "learning_rate": 9.209780297661234e-05, + "loss": 2.0203, + "step": 1571 + }, + { + "epoch": 0.477158901198968, + "grad_norm": 0.3501390218734741, + "learning_rate": 9.209274071074213e-05, + "loss": 1.9692, + "step": 1572 + }, + { + "epoch": 0.4774624373956594, + "grad_norm": 0.3753308653831482, + "learning_rate": 9.208767844487192e-05, + "loss": 1.799, + "step": 1573 + }, + { + "epoch": 0.4777659735923509, + "grad_norm": 0.3621695935726166, + "learning_rate": 9.208261617900172e-05, + "loss": 1.8412, + "step": 1574 + }, + { + "epoch": 0.47806950978904234, + "grad_norm": 0.4215545952320099, + "learning_rate": 9.207755391313151e-05, + "loss": 1.8227, + "step": 1575 + }, + { + "epoch": 0.4783730459857338, + "grad_norm": 0.32205232977867126, + "learning_rate": 9.207249164726132e-05, + "loss": 1.3949, + "step": 1576 + }, + { + "epoch": 0.47867658218242526, + "grad_norm": 0.34510162472724915, + "learning_rate": 9.206742938139112e-05, + "loss": 1.8627, + "step": 1577 + }, + { + "epoch": 0.4789801183791167, + "grad_norm": 0.41916847229003906, + "learning_rate": 9.206236711552091e-05, + "loss": 1.6164, + "step": 1578 + }, + { + "epoch": 0.4792836545758082, + "grad_norm": 0.323519229888916, + "learning_rate": 9.205730484965071e-05, + "loss": 1.5688, + "step": 1579 + }, + { + "epoch": 0.4795871907724996, + "grad_norm": 0.4150819778442383, + "learning_rate": 9.20522425837805e-05, + "loss": 1.8097, + "step": 1580 + }, + { + "epoch": 0.47989072696919105, + "grad_norm": 0.4045346975326538, + "learning_rate": 9.20471803179103e-05, + "loss": 1.91, + "step": 1581 + }, + { + "epoch": 0.48019426316588254, + "grad_norm": 0.3251115083694458, + "learning_rate": 9.204211805204009e-05, + "loss": 1.9278, + "step": 1582 + }, + { + "epoch": 0.480497799362574, + "grad_norm": 0.37068256735801697, + "learning_rate": 9.203705578616989e-05, + "loss": 1.8667, + "step": 1583 + }, + { + "epoch": 
0.48080133555926546, + "grad_norm": 0.4208294749259949, + "learning_rate": 9.203199352029968e-05, + "loss": 1.9405, + "step": 1584 + }, + { + "epoch": 0.4811048717559569, + "grad_norm": 0.3996240794658661, + "learning_rate": 9.202693125442948e-05, + "loss": 1.6466, + "step": 1585 + }, + { + "epoch": 0.48140840795264833, + "grad_norm": 0.44182920455932617, + "learning_rate": 9.202186898855928e-05, + "loss": 2.0223, + "step": 1586 + }, + { + "epoch": 0.4817119441493398, + "grad_norm": 0.43203607201576233, + "learning_rate": 9.201680672268908e-05, + "loss": 1.7969, + "step": 1587 + }, + { + "epoch": 0.48201548034603126, + "grad_norm": 0.3604522943496704, + "learning_rate": 9.201174445681887e-05, + "loss": 2.0201, + "step": 1588 + }, + { + "epoch": 0.48231901654272274, + "grad_norm": 0.4073752760887146, + "learning_rate": 9.200668219094867e-05, + "loss": 1.993, + "step": 1589 + }, + { + "epoch": 0.4826225527394142, + "grad_norm": 0.39307650923728943, + "learning_rate": 9.200161992507848e-05, + "loss": 2.3445, + "step": 1590 + }, + { + "epoch": 0.4829260889361056, + "grad_norm": 0.355831503868103, + "learning_rate": 9.199655765920827e-05, + "loss": 2.0101, + "step": 1591 + }, + { + "epoch": 0.4832296251327971, + "grad_norm": 0.5814805030822754, + "learning_rate": 9.199149539333807e-05, + "loss": 2.2421, + "step": 1592 + }, + { + "epoch": 0.48353316132948854, + "grad_norm": 0.4290510416030884, + "learning_rate": 9.198643312746786e-05, + "loss": 2.1818, + "step": 1593 + }, + { + "epoch": 0.48383669752617997, + "grad_norm": 7.360002040863037, + "learning_rate": 9.198137086159766e-05, + "loss": 2.0011, + "step": 1594 + }, + { + "epoch": 0.48414023372287146, + "grad_norm": 0.5217785835266113, + "learning_rate": 9.197630859572745e-05, + "loss": 1.817, + "step": 1595 + }, + { + "epoch": 0.4844437699195629, + "grad_norm": 0.4188072383403778, + "learning_rate": 9.197124632985725e-05, + "loss": 1.5588, + "step": 1596 + }, + { + "epoch": 0.4847473061162544, + "grad_norm": 0.4220346212387085, + "learning_rate": 9.196618406398705e-05, + "loss": 1.9217, + "step": 1597 + }, + { + "epoch": 0.4850508423129458, + "grad_norm": 0.5502439141273499, + "learning_rate": 9.196112179811685e-05, + "loss": 1.841, + "step": 1598 + }, + { + "epoch": 0.48535437850963725, + "grad_norm": 0.4167909622192383, + "learning_rate": 9.195605953224664e-05, + "loss": 1.7868, + "step": 1599 + }, + { + "epoch": 0.48565791470632874, + "grad_norm": 0.45999062061309814, + "learning_rate": 9.195099726637644e-05, + "loss": 1.8886, + "step": 1600 + }, + { + "epoch": 0.4859614509030202, + "grad_norm": 0.3937031626701355, + "learning_rate": 9.194593500050623e-05, + "loss": 1.7736, + "step": 1601 + }, + { + "epoch": 0.48626498709971167, + "grad_norm": 0.44424453377723694, + "learning_rate": 9.194087273463603e-05, + "loss": 1.6786, + "step": 1602 + }, + { + "epoch": 0.4865685232964031, + "grad_norm": 0.35432350635528564, + "learning_rate": 9.193581046876582e-05, + "loss": 1.8425, + "step": 1603 + }, + { + "epoch": 0.48687205949309453, + "grad_norm": 0.41191428899765015, + "learning_rate": 9.193074820289562e-05, + "loss": 1.342, + "step": 1604 + }, + { + "epoch": 0.487175595689786, + "grad_norm": 0.4410790503025055, + "learning_rate": 9.192568593702541e-05, + "loss": 1.3158, + "step": 1605 + }, + { + "epoch": 0.48747913188647746, + "grad_norm": 0.4214244782924652, + "learning_rate": 9.192062367115522e-05, + "loss": 2.1983, + "step": 1606 + }, + { + "epoch": 0.4877826680831689, + "grad_norm": 0.4066750109195709, + "learning_rate": 
9.191556140528502e-05, + "loss": 1.5839, + "step": 1607 + }, + { + "epoch": 0.4880862042798604, + "grad_norm": 0.5248275995254517, + "learning_rate": 9.191049913941481e-05, + "loss": 1.6415, + "step": 1608 + }, + { + "epoch": 0.4883897404765518, + "grad_norm": 0.3945814073085785, + "learning_rate": 9.19054368735446e-05, + "loss": 1.6788, + "step": 1609 + }, + { + "epoch": 0.4886932766732433, + "grad_norm": 0.42285215854644775, + "learning_rate": 9.19003746076744e-05, + "loss": 1.7365, + "step": 1610 + }, + { + "epoch": 0.48899681286993474, + "grad_norm": 0.43051236867904663, + "learning_rate": 9.18953123418042e-05, + "loss": 1.8906, + "step": 1611 + }, + { + "epoch": 0.4893003490666262, + "grad_norm": 0.4336687922477722, + "learning_rate": 9.189025007593399e-05, + "loss": 1.6145, + "step": 1612 + }, + { + "epoch": 0.48960388526331766, + "grad_norm": 0.34237489104270935, + "learning_rate": 9.188518781006378e-05, + "loss": 1.9992, + "step": 1613 + }, + { + "epoch": 0.4899074214600091, + "grad_norm": 0.4344857931137085, + "learning_rate": 9.188012554419358e-05, + "loss": 1.9943, + "step": 1614 + }, + { + "epoch": 0.4902109576567006, + "grad_norm": 0.3851914703845978, + "learning_rate": 9.187506327832339e-05, + "loss": 1.8428, + "step": 1615 + }, + { + "epoch": 0.490514493853392, + "grad_norm": 0.39165550470352173, + "learning_rate": 9.187000101245318e-05, + "loss": 1.7958, + "step": 1616 + }, + { + "epoch": 0.49081803005008345, + "grad_norm": 0.34605157375335693, + "learning_rate": 9.186493874658298e-05, + "loss": 1.9257, + "step": 1617 + }, + { + "epoch": 0.49112156624677494, + "grad_norm": 0.422831654548645, + "learning_rate": 9.185987648071277e-05, + "loss": 2.1828, + "step": 1618 + }, + { + "epoch": 0.4914251024434664, + "grad_norm": 0.7868388891220093, + "learning_rate": 9.185481421484257e-05, + "loss": 1.4172, + "step": 1619 + }, + { + "epoch": 0.4917286386401578, + "grad_norm": 0.3971206247806549, + "learning_rate": 9.184975194897236e-05, + "loss": 1.8442, + "step": 1620 + }, + { + "epoch": 0.4920321748368493, + "grad_norm": 0.39479488134384155, + "learning_rate": 9.184468968310216e-05, + "loss": 1.6141, + "step": 1621 + }, + { + "epoch": 0.49233571103354073, + "grad_norm": 2.7340400218963623, + "learning_rate": 9.183962741723195e-05, + "loss": 1.7567, + "step": 1622 + }, + { + "epoch": 0.4926392472302322, + "grad_norm": 0.7024746537208557, + "learning_rate": 9.183456515136175e-05, + "loss": 2.3221, + "step": 1623 + }, + { + "epoch": 0.49294278342692366, + "grad_norm": 0.3881623148918152, + "learning_rate": 9.182950288549154e-05, + "loss": 2.0143, + "step": 1624 + }, + { + "epoch": 0.4932463196236151, + "grad_norm": 0.35226500034332275, + "learning_rate": 9.182444061962135e-05, + "loss": 1.8097, + "step": 1625 + }, + { + "epoch": 0.4935498558203066, + "grad_norm": 0.9839766621589661, + "learning_rate": 9.181937835375114e-05, + "loss": 1.9594, + "step": 1626 + }, + { + "epoch": 0.493853392016998, + "grad_norm": 0.333279013633728, + "learning_rate": 9.181431608788094e-05, + "loss": 1.8533, + "step": 1627 + }, + { + "epoch": 0.4941569282136895, + "grad_norm": 0.6945008039474487, + "learning_rate": 9.180925382201073e-05, + "loss": 1.3658, + "step": 1628 + }, + { + "epoch": 0.49446046441038094, + "grad_norm": 0.4481600224971771, + "learning_rate": 9.180419155614053e-05, + "loss": 1.9189, + "step": 1629 + }, + { + "epoch": 0.4947640006070724, + "grad_norm": 0.35472220182418823, + "learning_rate": 9.179912929027032e-05, + "loss": 1.3206, + "step": 1630 + }, + { + "epoch": 0.49506753680376386, 
+ "grad_norm": 0.5124238729476929, + "learning_rate": 9.179406702440012e-05, + "loss": 2.0371, + "step": 1631 + }, + { + "epoch": 0.4953710730004553, + "grad_norm": 0.3843775987625122, + "learning_rate": 9.178900475852991e-05, + "loss": 1.5858, + "step": 1632 + }, + { + "epoch": 0.4956746091971468, + "grad_norm": 0.41060924530029297, + "learning_rate": 9.178394249265971e-05, + "loss": 1.4591, + "step": 1633 + }, + { + "epoch": 0.4959781453938382, + "grad_norm": 0.5426920056343079, + "learning_rate": 9.177888022678952e-05, + "loss": 2.2744, + "step": 1634 + }, + { + "epoch": 0.49628168159052966, + "grad_norm": 0.4275033175945282, + "learning_rate": 9.177381796091931e-05, + "loss": 1.9274, + "step": 1635 + }, + { + "epoch": 0.49658521778722114, + "grad_norm": 0.4715273976325989, + "learning_rate": 9.176875569504912e-05, + "loss": 1.5788, + "step": 1636 + }, + { + "epoch": 0.4968887539839126, + "grad_norm": 0.41464027762413025, + "learning_rate": 9.176369342917891e-05, + "loss": 1.8147, + "step": 1637 + }, + { + "epoch": 0.497192290180604, + "grad_norm": 0.4175771176815033, + "learning_rate": 9.175863116330871e-05, + "loss": 2.02, + "step": 1638 + }, + { + "epoch": 0.4974958263772955, + "grad_norm": 0.42781904339790344, + "learning_rate": 9.17535688974385e-05, + "loss": 1.8772, + "step": 1639 + }, + { + "epoch": 0.49779936257398694, + "grad_norm": 0.381352961063385, + "learning_rate": 9.17485066315683e-05, + "loss": 1.9982, + "step": 1640 + }, + { + "epoch": 0.4981028987706784, + "grad_norm": 0.44887885451316833, + "learning_rate": 9.174344436569809e-05, + "loss": 1.6724, + "step": 1641 + }, + { + "epoch": 0.49840643496736986, + "grad_norm": 0.3764267563819885, + "learning_rate": 9.173838209982789e-05, + "loss": 1.7327, + "step": 1642 + }, + { + "epoch": 0.4987099711640613, + "grad_norm": 0.6911460161209106, + "learning_rate": 9.173331983395768e-05, + "loss": 2.1353, + "step": 1643 + }, + { + "epoch": 0.4990135073607528, + "grad_norm": 0.39581048488616943, + "learning_rate": 9.172825756808748e-05, + "loss": 2.1394, + "step": 1644 + }, + { + "epoch": 0.4993170435574442, + "grad_norm": 0.420389860868454, + "learning_rate": 9.172319530221729e-05, + "loss": 2.0948, + "step": 1645 + }, + { + "epoch": 0.4996205797541357, + "grad_norm": 0.3843049108982086, + "learning_rate": 9.171813303634708e-05, + "loss": 2.0618, + "step": 1646 + }, + { + "epoch": 0.49992411595082714, + "grad_norm": 0.3946545422077179, + "learning_rate": 9.171307077047688e-05, + "loss": 1.7997, + "step": 1647 + }, + { + "epoch": 0.5002276521475186, + "grad_norm": 0.3740834593772888, + "learning_rate": 9.170800850460667e-05, + "loss": 1.8436, + "step": 1648 + }, + { + "epoch": 0.5005311883442101, + "grad_norm": 0.42691826820373535, + "learning_rate": 9.170294623873646e-05, + "loss": 1.8915, + "step": 1649 + }, + { + "epoch": 0.5008347245409015, + "grad_norm": 0.41487646102905273, + "learning_rate": 9.169788397286626e-05, + "loss": 1.6712, + "step": 1650 + }, + { + "epoch": 0.5011382607375929, + "grad_norm": 0.44870665669441223, + "learning_rate": 9.169282170699605e-05, + "loss": 1.3704, + "step": 1651 + }, + { + "epoch": 0.5014417969342844, + "grad_norm": 0.5584750771522522, + "learning_rate": 9.168775944112585e-05, + "loss": 2.2258, + "step": 1652 + }, + { + "epoch": 0.5017453331309759, + "grad_norm": 0.4336828291416168, + "learning_rate": 9.168269717525564e-05, + "loss": 2.0302, + "step": 1653 + }, + { + "epoch": 0.5020488693276673, + "grad_norm": 0.3990234434604645, + "learning_rate": 9.167763490938545e-05, + "loss": 1.9813, + 
"step": 1654 + }, + { + "epoch": 0.5023524055243588, + "grad_norm": 0.42252814769744873, + "learning_rate": 9.167257264351525e-05, + "loss": 1.7596, + "step": 1655 + }, + { + "epoch": 0.5026559417210502, + "grad_norm": 0.42766478657722473, + "learning_rate": 9.166751037764504e-05, + "loss": 1.4622, + "step": 1656 + }, + { + "epoch": 0.5029594779177416, + "grad_norm": 0.4347383975982666, + "learning_rate": 9.166244811177484e-05, + "loss": 1.4987, + "step": 1657 + }, + { + "epoch": 0.5032630141144332, + "grad_norm": 0.3660615384578705, + "learning_rate": 9.165738584590463e-05, + "loss": 1.3758, + "step": 1658 + }, + { + "epoch": 0.5035665503111246, + "grad_norm": 0.3933682441711426, + "learning_rate": 9.165232358003443e-05, + "loss": 1.9097, + "step": 1659 + }, + { + "epoch": 0.5038700865078161, + "grad_norm": 0.39718765020370483, + "learning_rate": 9.164726131416422e-05, + "loss": 1.9825, + "step": 1660 + }, + { + "epoch": 0.5041736227045075, + "grad_norm": 0.4161352515220642, + "learning_rate": 9.164219904829402e-05, + "loss": 1.6346, + "step": 1661 + }, + { + "epoch": 0.5044771589011989, + "grad_norm": 0.413492888212204, + "learning_rate": 9.163713678242381e-05, + "loss": 1.9286, + "step": 1662 + }, + { + "epoch": 0.5047806950978905, + "grad_norm": 0.4211573600769043, + "learning_rate": 9.16320745165536e-05, + "loss": 1.5557, + "step": 1663 + }, + { + "epoch": 0.5050842312945819, + "grad_norm": 0.3247505724430084, + "learning_rate": 9.162701225068341e-05, + "loss": 1.8372, + "step": 1664 + }, + { + "epoch": 0.5053877674912733, + "grad_norm": 0.699242889881134, + "learning_rate": 9.162194998481321e-05, + "loss": 1.3331, + "step": 1665 + }, + { + "epoch": 0.5056913036879648, + "grad_norm": 0.45382243394851685, + "learning_rate": 9.1616887718943e-05, + "loss": 1.5317, + "step": 1666 + }, + { + "epoch": 0.5059948398846562, + "grad_norm": 0.37562644481658936, + "learning_rate": 9.16118254530728e-05, + "loss": 1.4138, + "step": 1667 + }, + { + "epoch": 0.5062983760813476, + "grad_norm": 0.41830095648765564, + "learning_rate": 9.160676318720259e-05, + "loss": 2.0788, + "step": 1668 + }, + { + "epoch": 0.5066019122780392, + "grad_norm": 0.4154708981513977, + "learning_rate": 9.160170092133239e-05, + "loss": 2.0095, + "step": 1669 + }, + { + "epoch": 0.5069054484747306, + "grad_norm": 0.3693794906139374, + "learning_rate": 9.159663865546218e-05, + "loss": 1.8871, + "step": 1670 + }, + { + "epoch": 0.5072089846714221, + "grad_norm": 0.42712700366973877, + "learning_rate": 9.159157638959198e-05, + "loss": 1.9114, + "step": 1671 + }, + { + "epoch": 0.5075125208681135, + "grad_norm": 0.406843900680542, + "learning_rate": 9.158651412372177e-05, + "loss": 1.7887, + "step": 1672 + }, + { + "epoch": 0.5078160570648049, + "grad_norm": 0.3689083456993103, + "learning_rate": 9.158145185785158e-05, + "loss": 1.8421, + "step": 1673 + }, + { + "epoch": 0.5081195932614965, + "grad_norm": 0.40796002745628357, + "learning_rate": 9.157638959198138e-05, + "loss": 1.5014, + "step": 1674 + }, + { + "epoch": 0.5084231294581879, + "grad_norm": 0.44102364778518677, + "learning_rate": 9.157132732611117e-05, + "loss": 1.5184, + "step": 1675 + }, + { + "epoch": 0.5087266656548793, + "grad_norm": 0.4265199899673462, + "learning_rate": 9.156626506024096e-05, + "loss": 2.017, + "step": 1676 + }, + { + "epoch": 0.5090302018515708, + "grad_norm": 0.4618091285228729, + "learning_rate": 9.156120279437076e-05, + "loss": 2.056, + "step": 1677 + }, + { + "epoch": 0.5093337380482622, + "grad_norm": 0.4058600068092346, + 
"learning_rate": 9.155614052850055e-05, + "loss": 1.9897, + "step": 1678 + }, + { + "epoch": 0.5096372742449538, + "grad_norm": 0.46722692251205444, + "learning_rate": 9.155107826263036e-05, + "loss": 1.9713, + "step": 1679 + }, + { + "epoch": 0.5099408104416452, + "grad_norm": 0.36259156465530396, + "learning_rate": 9.154601599676016e-05, + "loss": 1.9321, + "step": 1680 + }, + { + "epoch": 0.5102443466383366, + "grad_norm": 0.366148442029953, + "learning_rate": 9.154095373088995e-05, + "loss": 1.9573, + "step": 1681 + }, + { + "epoch": 0.510547882835028, + "grad_norm": 0.3328361213207245, + "learning_rate": 9.153589146501975e-05, + "loss": 1.8222, + "step": 1682 + }, + { + "epoch": 0.5108514190317195, + "grad_norm": 0.45891711115837097, + "learning_rate": 9.153082919914954e-05, + "loss": 1.7177, + "step": 1683 + }, + { + "epoch": 0.511154955228411, + "grad_norm": 0.4405977427959442, + "learning_rate": 9.152576693327935e-05, + "loss": 1.8499, + "step": 1684 + }, + { + "epoch": 0.5114584914251025, + "grad_norm": 0.7388264536857605, + "learning_rate": 9.152070466740915e-05, + "loss": 1.9884, + "step": 1685 + }, + { + "epoch": 0.5117620276217939, + "grad_norm": 0.43892955780029297, + "learning_rate": 9.151564240153894e-05, + "loss": 2.0027, + "step": 1686 + }, + { + "epoch": 0.5120655638184853, + "grad_norm": 0.42659783363342285, + "learning_rate": 9.151058013566873e-05, + "loss": 1.8386, + "step": 1687 + }, + { + "epoch": 0.5123691000151768, + "grad_norm": 0.4364768862724304, + "learning_rate": 9.150551786979853e-05, + "loss": 1.6248, + "step": 1688 + }, + { + "epoch": 0.5126726362118683, + "grad_norm": 0.35849112272262573, + "learning_rate": 9.150045560392832e-05, + "loss": 2.0983, + "step": 1689 + }, + { + "epoch": 0.5129761724085597, + "grad_norm": 0.38595572113990784, + "learning_rate": 9.149539333805812e-05, + "loss": 1.956, + "step": 1690 + }, + { + "epoch": 0.5132797086052512, + "grad_norm": 0.4161504805088043, + "learning_rate": 9.149033107218791e-05, + "loss": 1.8132, + "step": 1691 + }, + { + "epoch": 0.5135832448019426, + "grad_norm": 0.6614299416542053, + "learning_rate": 9.148526880631771e-05, + "loss": 1.4403, + "step": 1692 + }, + { + "epoch": 0.513886780998634, + "grad_norm": 0.4609692692756653, + "learning_rate": 9.148020654044752e-05, + "loss": 1.9215, + "step": 1693 + }, + { + "epoch": 0.5141903171953256, + "grad_norm": 0.4489036202430725, + "learning_rate": 9.147514427457731e-05, + "loss": 1.7922, + "step": 1694 + }, + { + "epoch": 0.514493853392017, + "grad_norm": 0.46497032046318054, + "learning_rate": 9.14700820087071e-05, + "loss": 1.6058, + "step": 1695 + }, + { + "epoch": 0.5147973895887085, + "grad_norm": 0.39706695079803467, + "learning_rate": 9.14650197428369e-05, + "loss": 1.7717, + "step": 1696 + }, + { + "epoch": 0.5151009257853999, + "grad_norm": 0.3839566111564636, + "learning_rate": 9.14599574769667e-05, + "loss": 1.8218, + "step": 1697 + }, + { + "epoch": 0.5154044619820913, + "grad_norm": 0.7339301109313965, + "learning_rate": 9.145489521109649e-05, + "loss": 1.9836, + "step": 1698 + }, + { + "epoch": 0.5157079981787828, + "grad_norm": 0.4512780010700226, + "learning_rate": 9.144983294522629e-05, + "loss": 2.0034, + "step": 1699 + }, + { + "epoch": 0.5160115343754743, + "grad_norm": 1.845346212387085, + "learning_rate": 9.144477067935608e-05, + "loss": 1.6995, + "step": 1700 + }, + { + "epoch": 0.5163150705721657, + "grad_norm": 0.42541632056236267, + "learning_rate": 9.143970841348588e-05, + "loss": 2.0264, + "step": 1701 + }, + { + "epoch": 
0.5166186067688572, + "grad_norm": 0.404821515083313, + "learning_rate": 9.143464614761567e-05, + "loss": 1.9064, + "step": 1702 + }, + { + "epoch": 0.5169221429655486, + "grad_norm": 0.4223015606403351, + "learning_rate": 9.142958388174548e-05, + "loss": 1.8442, + "step": 1703 + }, + { + "epoch": 0.51722567916224, + "grad_norm": 0.38094672560691833, + "learning_rate": 9.142452161587527e-05, + "loss": 1.7625, + "step": 1704 + }, + { + "epoch": 0.5175292153589316, + "grad_norm": 0.3759573698043823, + "learning_rate": 9.141945935000507e-05, + "loss": 2.0585, + "step": 1705 + }, + { + "epoch": 0.517832751555623, + "grad_norm": 0.3938165307044983, + "learning_rate": 9.141439708413486e-05, + "loss": 1.9594, + "step": 1706 + }, + { + "epoch": 0.5181362877523145, + "grad_norm": 0.4222012758255005, + "learning_rate": 9.140933481826466e-05, + "loss": 1.1698, + "step": 1707 + }, + { + "epoch": 0.5184398239490059, + "grad_norm": 0.419763445854187, + "learning_rate": 9.140427255239445e-05, + "loss": 1.9484, + "step": 1708 + }, + { + "epoch": 0.5187433601456973, + "grad_norm": 0.4546319544315338, + "learning_rate": 9.139921028652425e-05, + "loss": 1.9924, + "step": 1709 + }, + { + "epoch": 0.5190468963423889, + "grad_norm": 0.5007880330085754, + "learning_rate": 9.139414802065404e-05, + "loss": 2.0619, + "step": 1710 + }, + { + "epoch": 0.5193504325390803, + "grad_norm": 0.3647090494632721, + "learning_rate": 9.138908575478384e-05, + "loss": 1.9504, + "step": 1711 + }, + { + "epoch": 0.5196539687357717, + "grad_norm": 0.4546000063419342, + "learning_rate": 9.138402348891365e-05, + "loss": 2.0943, + "step": 1712 + }, + { + "epoch": 0.5199575049324632, + "grad_norm": 0.36992448568344116, + "learning_rate": 9.137896122304344e-05, + "loss": 1.8111, + "step": 1713 + }, + { + "epoch": 0.5202610411291546, + "grad_norm": 0.40882760286331177, + "learning_rate": 9.137389895717323e-05, + "loss": 1.8935, + "step": 1714 + }, + { + "epoch": 0.5205645773258462, + "grad_norm": 0.39158037304878235, + "learning_rate": 9.136883669130303e-05, + "loss": 1.4147, + "step": 1715 + }, + { + "epoch": 0.5208681135225376, + "grad_norm": 0.42174550890922546, + "learning_rate": 9.136377442543282e-05, + "loss": 1.5931, + "step": 1716 + }, + { + "epoch": 0.521171649719229, + "grad_norm": 0.4003652036190033, + "learning_rate": 9.135871215956262e-05, + "loss": 1.4119, + "step": 1717 + }, + { + "epoch": 0.5214751859159205, + "grad_norm": 0.42328763008117676, + "learning_rate": 9.135364989369241e-05, + "loss": 1.6943, + "step": 1718 + }, + { + "epoch": 0.5217787221126119, + "grad_norm": 0.3831746578216553, + "learning_rate": 9.134858762782221e-05, + "loss": 1.8067, + "step": 1719 + }, + { + "epoch": 0.5220822583093034, + "grad_norm": 0.4160243272781372, + "learning_rate": 9.1343525361952e-05, + "loss": 2.0725, + "step": 1720 + }, + { + "epoch": 0.5223857945059949, + "grad_norm": 0.47441422939300537, + "learning_rate": 9.133846309608181e-05, + "loss": 2.2569, + "step": 1721 + }, + { + "epoch": 0.5226893307026863, + "grad_norm": 0.34522169828414917, + "learning_rate": 9.133340083021161e-05, + "loss": 1.6977, + "step": 1722 + }, + { + "epoch": 0.5229928668993777, + "grad_norm": 0.6760712265968323, + "learning_rate": 9.132833856434142e-05, + "loss": 1.7252, + "step": 1723 + }, + { + "epoch": 0.5232964030960692, + "grad_norm": 0.42016392946243286, + "learning_rate": 9.132327629847121e-05, + "loss": 1.9835, + "step": 1724 + }, + { + "epoch": 0.5235999392927606, + "grad_norm": 0.4062696099281311, + "learning_rate": 9.1318214032601e-05, + 
"loss": 1.8181, + "step": 1725 + }, + { + "epoch": 0.5239034754894522, + "grad_norm": 0.37092477083206177, + "learning_rate": 9.13131517667308e-05, + "loss": 1.9989, + "step": 1726 + }, + { + "epoch": 0.5242070116861436, + "grad_norm": 0.30382564663887024, + "learning_rate": 9.13080895008606e-05, + "loss": 1.5613, + "step": 1727 + }, + { + "epoch": 0.524510547882835, + "grad_norm": 0.39715448021888733, + "learning_rate": 9.130302723499039e-05, + "loss": 1.8396, + "step": 1728 + }, + { + "epoch": 0.5248140840795265, + "grad_norm": 0.698819637298584, + "learning_rate": 9.129796496912018e-05, + "loss": 1.6617, + "step": 1729 + }, + { + "epoch": 0.5251176202762179, + "grad_norm": 0.37083616852760315, + "learning_rate": 9.129290270324998e-05, + "loss": 1.0619, + "step": 1730 + }, + { + "epoch": 0.5254211564729094, + "grad_norm": 0.37196993827819824, + "learning_rate": 9.128784043737977e-05, + "loss": 1.4654, + "step": 1731 + }, + { + "epoch": 0.5257246926696009, + "grad_norm": 0.38970932364463806, + "learning_rate": 9.128277817150958e-05, + "loss": 1.9632, + "step": 1732 + }, + { + "epoch": 0.5260282288662923, + "grad_norm": 0.4937323033809662, + "learning_rate": 9.127771590563938e-05, + "loss": 1.5989, + "step": 1733 + }, + { + "epoch": 0.5263317650629837, + "grad_norm": 0.37157008051872253, + "learning_rate": 9.127265363976917e-05, + "loss": 1.8486, + "step": 1734 + }, + { + "epoch": 0.5266353012596752, + "grad_norm": 0.3973872661590576, + "learning_rate": 9.126759137389897e-05, + "loss": 1.5195, + "step": 1735 + }, + { + "epoch": 0.5269388374563667, + "grad_norm": 0.3511494994163513, + "learning_rate": 9.126252910802876e-05, + "loss": 1.9055, + "step": 1736 + }, + { + "epoch": 0.5272423736530581, + "grad_norm": 0.36223629117012024, + "learning_rate": 9.125746684215856e-05, + "loss": 1.5545, + "step": 1737 + }, + { + "epoch": 0.5275459098497496, + "grad_norm": 0.4978778660297394, + "learning_rate": 9.125240457628835e-05, + "loss": 1.9145, + "step": 1738 + }, + { + "epoch": 0.527849446046441, + "grad_norm": 0.3191153407096863, + "learning_rate": 9.124734231041815e-05, + "loss": 1.754, + "step": 1739 + }, + { + "epoch": 0.5281529822431325, + "grad_norm": 0.39094769954681396, + "learning_rate": 9.124228004454794e-05, + "loss": 1.9462, + "step": 1740 + }, + { + "epoch": 0.528456518439824, + "grad_norm": 0.6246857047080994, + "learning_rate": 9.123721777867774e-05, + "loss": 2.0239, + "step": 1741 + }, + { + "epoch": 0.5287600546365154, + "grad_norm": 0.41962483525276184, + "learning_rate": 9.123215551280754e-05, + "loss": 1.9372, + "step": 1742 + }, + { + "epoch": 0.5290635908332069, + "grad_norm": 0.3055092394351959, + "learning_rate": 9.122709324693734e-05, + "loss": 0.9516, + "step": 1743 + }, + { + "epoch": 0.5293671270298983, + "grad_norm": 0.4911038875579834, + "learning_rate": 9.122203098106713e-05, + "loss": 1.7127, + "step": 1744 + }, + { + "epoch": 0.5296706632265897, + "grad_norm": 0.7481783032417297, + "learning_rate": 9.121696871519693e-05, + "loss": 2.1368, + "step": 1745 + }, + { + "epoch": 0.5299741994232813, + "grad_norm": 0.4397221803665161, + "learning_rate": 9.121190644932672e-05, + "loss": 1.978, + "step": 1746 + }, + { + "epoch": 0.5302777356199727, + "grad_norm": 0.3751915991306305, + "learning_rate": 9.120684418345652e-05, + "loss": 2.0627, + "step": 1747 + }, + { + "epoch": 0.5305812718166641, + "grad_norm": 0.474575400352478, + "learning_rate": 9.120178191758631e-05, + "loss": 2.018, + "step": 1748 + }, + { + "epoch": 0.5308848080133556, + "grad_norm": 
0.3762502372264862, + "learning_rate": 9.119671965171611e-05, + "loss": 1.7076, + "step": 1749 + }, + { + "epoch": 0.531188344210047, + "grad_norm": 0.4058527946472168, + "learning_rate": 9.11916573858459e-05, + "loss": 1.768, + "step": 1750 + }, + { + "epoch": 0.5314918804067384, + "grad_norm": 0.3765137791633606, + "learning_rate": 9.118659511997571e-05, + "loss": 1.8357, + "step": 1751 + }, + { + "epoch": 0.53179541660343, + "grad_norm": 0.459602415561676, + "learning_rate": 9.11815328541055e-05, + "loss": 0.918, + "step": 1752 + }, + { + "epoch": 0.5320989528001214, + "grad_norm": 0.4160063564777374, + "learning_rate": 9.11764705882353e-05, + "loss": 1.8438, + "step": 1753 + }, + { + "epoch": 0.5324024889968129, + "grad_norm": 0.44720131158828735, + "learning_rate": 9.11714083223651e-05, + "loss": 1.6503, + "step": 1754 + }, + { + "epoch": 0.5327060251935043, + "grad_norm": 0.35455620288848877, + "learning_rate": 9.116634605649489e-05, + "loss": 2.0683, + "step": 1755 + }, + { + "epoch": 0.5330095613901957, + "grad_norm": 0.3938636779785156, + "learning_rate": 9.116128379062468e-05, + "loss": 1.8191, + "step": 1756 + }, + { + "epoch": 0.5333130975868873, + "grad_norm": 0.38144779205322266, + "learning_rate": 9.115622152475448e-05, + "loss": 1.4855, + "step": 1757 + }, + { + "epoch": 0.5336166337835787, + "grad_norm": 0.3418583571910858, + "learning_rate": 9.115115925888427e-05, + "loss": 1.8684, + "step": 1758 + }, + { + "epoch": 0.5339201699802701, + "grad_norm": 0.3342360854148865, + "learning_rate": 9.114609699301407e-05, + "loss": 1.7817, + "step": 1759 + }, + { + "epoch": 0.5342237061769616, + "grad_norm": 0.4178410768508911, + "learning_rate": 9.114103472714388e-05, + "loss": 1.848, + "step": 1760 + }, + { + "epoch": 0.534527242373653, + "grad_norm": 0.37378084659576416, + "learning_rate": 9.113597246127367e-05, + "loss": 2.1967, + "step": 1761 + }, + { + "epoch": 0.5348307785703446, + "grad_norm": 0.33370524644851685, + "learning_rate": 9.113091019540347e-05, + "loss": 1.9943, + "step": 1762 + }, + { + "epoch": 0.535134314767036, + "grad_norm": 0.3402559757232666, + "learning_rate": 9.112584792953326e-05, + "loss": 1.7164, + "step": 1763 + }, + { + "epoch": 0.5354378509637274, + "grad_norm": 0.3962159752845764, + "learning_rate": 9.112078566366306e-05, + "loss": 1.8821, + "step": 1764 + }, + { + "epoch": 0.5357413871604189, + "grad_norm": 0.4659918546676636, + "learning_rate": 9.111572339779285e-05, + "loss": 1.7065, + "step": 1765 + }, + { + "epoch": 0.5360449233571103, + "grad_norm": 0.38673698902130127, + "learning_rate": 9.111066113192265e-05, + "loss": 1.8969, + "step": 1766 + }, + { + "epoch": 0.5363484595538018, + "grad_norm": 0.3595302999019623, + "learning_rate": 9.110559886605244e-05, + "loss": 1.881, + "step": 1767 + }, + { + "epoch": 0.5366519957504933, + "grad_norm": 0.4756614565849304, + "learning_rate": 9.110053660018225e-05, + "loss": 1.9395, + "step": 1768 + }, + { + "epoch": 0.5369555319471847, + "grad_norm": 0.36729127168655396, + "learning_rate": 9.109547433431204e-05, + "loss": 2.0762, + "step": 1769 + }, + { + "epoch": 0.5372590681438761, + "grad_norm": 0.5436307191848755, + "learning_rate": 9.109041206844184e-05, + "loss": 1.9971, + "step": 1770 + }, + { + "epoch": 0.5375626043405676, + "grad_norm": 0.42176029086112976, + "learning_rate": 9.108534980257165e-05, + "loss": 1.5326, + "step": 1771 + }, + { + "epoch": 0.5378661405372591, + "grad_norm": 0.6235511302947998, + "learning_rate": 9.108028753670144e-05, + "loss": 1.8906, + "step": 1772 + }, + { + 
"epoch": 0.5381696767339506, + "grad_norm": 0.42510315775871277, + "learning_rate": 9.107522527083124e-05, + "loss": 1.7236, + "step": 1773 + }, + { + "epoch": 0.538473212930642, + "grad_norm": 0.4418346583843231, + "learning_rate": 9.107016300496103e-05, + "loss": 1.9227, + "step": 1774 + }, + { + "epoch": 0.5387767491273334, + "grad_norm": 0.9422191977500916, + "learning_rate": 9.106510073909083e-05, + "loss": 1.7426, + "step": 1775 + }, + { + "epoch": 0.5390802853240249, + "grad_norm": 0.44353923201560974, + "learning_rate": 9.106003847322062e-05, + "loss": 2.0377, + "step": 1776 + }, + { + "epoch": 0.5393838215207164, + "grad_norm": 0.457926481962204, + "learning_rate": 9.105497620735042e-05, + "loss": 1.9583, + "step": 1777 + }, + { + "epoch": 0.5396873577174078, + "grad_norm": 0.3857896327972412, + "learning_rate": 9.104991394148021e-05, + "loss": 1.4618, + "step": 1778 + }, + { + "epoch": 0.5399908939140993, + "grad_norm": 0.4202859401702881, + "learning_rate": 9.104485167561e-05, + "loss": 1.7507, + "step": 1779 + }, + { + "epoch": 0.5402944301107907, + "grad_norm": 0.3665039837360382, + "learning_rate": 9.10397894097398e-05, + "loss": 1.8576, + "step": 1780 + }, + { + "epoch": 0.5405979663074821, + "grad_norm": 0.39893728494644165, + "learning_rate": 9.103472714386961e-05, + "loss": 1.81, + "step": 1781 + }, + { + "epoch": 0.5409015025041736, + "grad_norm": 2.199347972869873, + "learning_rate": 9.10296648779994e-05, + "loss": 1.4915, + "step": 1782 + }, + { + "epoch": 0.5412050387008651, + "grad_norm": 0.4976440966129303, + "learning_rate": 9.10246026121292e-05, + "loss": 1.6961, + "step": 1783 + }, + { + "epoch": 0.5415085748975565, + "grad_norm": 0.4084802269935608, + "learning_rate": 9.101954034625899e-05, + "loss": 1.4498, + "step": 1784 + }, + { + "epoch": 0.541812111094248, + "grad_norm": 0.37160369753837585, + "learning_rate": 9.101447808038879e-05, + "loss": 1.8383, + "step": 1785 + }, + { + "epoch": 0.5421156472909394, + "grad_norm": 0.4095883071422577, + "learning_rate": 9.100941581451858e-05, + "loss": 1.7035, + "step": 1786 + }, + { + "epoch": 0.5424191834876309, + "grad_norm": 0.3713209927082062, + "learning_rate": 9.100435354864838e-05, + "loss": 1.8924, + "step": 1787 + }, + { + "epoch": 0.5427227196843224, + "grad_norm": 0.465432733297348, + "learning_rate": 9.099929128277817e-05, + "loss": 2.1334, + "step": 1788 + }, + { + "epoch": 0.5430262558810138, + "grad_norm": 0.4591209590435028, + "learning_rate": 9.099422901690797e-05, + "loss": 1.9845, + "step": 1789 + }, + { + "epoch": 0.5433297920777053, + "grad_norm": 0.45076972246170044, + "learning_rate": 9.098916675103777e-05, + "loss": 1.7297, + "step": 1790 + }, + { + "epoch": 0.5436333282743967, + "grad_norm": 0.44921204447746277, + "learning_rate": 9.098410448516757e-05, + "loss": 1.9707, + "step": 1791 + }, + { + "epoch": 0.5439368644710881, + "grad_norm": 0.3970228135585785, + "learning_rate": 9.097904221929736e-05, + "loss": 2.024, + "step": 1792 + }, + { + "epoch": 0.5442404006677797, + "grad_norm": 0.4587130546569824, + "learning_rate": 9.097397995342716e-05, + "loss": 1.5426, + "step": 1793 + }, + { + "epoch": 0.5445439368644711, + "grad_norm": 0.4152527153491974, + "learning_rate": 9.096891768755695e-05, + "loss": 1.9575, + "step": 1794 + }, + { + "epoch": 0.5448474730611625, + "grad_norm": 0.3973013758659363, + "learning_rate": 9.096385542168675e-05, + "loss": 2.0246, + "step": 1795 + }, + { + "epoch": 0.545151009257854, + "grad_norm": 0.3950592875480652, + "learning_rate": 9.095879315581654e-05, + 
"loss": 1.9213, + "step": 1796 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 0.4187184274196625, + "learning_rate": 9.095373088994634e-05, + "loss": 1.8801, + "step": 1797 + }, + { + "epoch": 0.545758081651237, + "grad_norm": 0.43511682748794556, + "learning_rate": 9.094866862407613e-05, + "loss": 1.4022, + "step": 1798 + }, + { + "epoch": 0.5460616178479284, + "grad_norm": 0.40607360005378723, + "learning_rate": 9.094360635820594e-05, + "loss": 1.9416, + "step": 1799 + }, + { + "epoch": 0.5463651540446198, + "grad_norm": 0.3357563018798828, + "learning_rate": 9.093854409233574e-05, + "loss": 1.8906, + "step": 1800 + }, + { + "epoch": 0.5466686902413113, + "grad_norm": 0.3839071989059448, + "learning_rate": 9.093348182646553e-05, + "loss": 1.7918, + "step": 1801 + }, + { + "epoch": 0.5469722264380027, + "grad_norm": 0.3882817029953003, + "learning_rate": 9.092841956059533e-05, + "loss": 1.6346, + "step": 1802 + }, + { + "epoch": 0.5472757626346942, + "grad_norm": 0.4323276877403259, + "learning_rate": 9.092335729472512e-05, + "loss": 1.7346, + "step": 1803 + }, + { + "epoch": 0.5475792988313857, + "grad_norm": 0.39711809158325195, + "learning_rate": 9.091829502885492e-05, + "loss": 1.8855, + "step": 1804 + }, + { + "epoch": 0.5478828350280771, + "grad_norm": 0.4660872519016266, + "learning_rate": 9.091323276298471e-05, + "loss": 1.1871, + "step": 1805 + }, + { + "epoch": 0.5481863712247685, + "grad_norm": 0.45804888010025024, + "learning_rate": 9.09081704971145e-05, + "loss": 2.1685, + "step": 1806 + }, + { + "epoch": 0.54848990742146, + "grad_norm": 0.5922791361808777, + "learning_rate": 9.09031082312443e-05, + "loss": 1.8701, + "step": 1807 + }, + { + "epoch": 0.5487934436181514, + "grad_norm": 0.43038979172706604, + "learning_rate": 9.08980459653741e-05, + "loss": 2.1007, + "step": 1808 + }, + { + "epoch": 0.549096979814843, + "grad_norm": 0.3624688684940338, + "learning_rate": 9.08929836995039e-05, + "loss": 1.7411, + "step": 1809 + }, + { + "epoch": 0.5494005160115344, + "grad_norm": 0.40898412466049194, + "learning_rate": 9.08879214336337e-05, + "loss": 1.9381, + "step": 1810 + }, + { + "epoch": 0.5497040522082258, + "grad_norm": 0.45767003297805786, + "learning_rate": 9.088285916776349e-05, + "loss": 1.8295, + "step": 1811 + }, + { + "epoch": 0.5500075884049173, + "grad_norm": 0.41230660676956177, + "learning_rate": 9.08777969018933e-05, + "loss": 1.7066, + "step": 1812 + }, + { + "epoch": 0.5503111246016087, + "grad_norm": 0.6730133891105652, + "learning_rate": 9.08727346360231e-05, + "loss": 2.2395, + "step": 1813 + }, + { + "epoch": 0.5506146607983002, + "grad_norm": 0.39757731556892395, + "learning_rate": 9.086767237015289e-05, + "loss": 1.9659, + "step": 1814 + }, + { + "epoch": 0.5509181969949917, + "grad_norm": 0.48182088136672974, + "learning_rate": 9.086261010428269e-05, + "loss": 1.7596, + "step": 1815 + }, + { + "epoch": 0.5512217331916831, + "grad_norm": 0.4225050210952759, + "learning_rate": 9.085754783841248e-05, + "loss": 1.6582, + "step": 1816 + }, + { + "epoch": 0.5515252693883745, + "grad_norm": 0.40362295508384705, + "learning_rate": 9.085248557254227e-05, + "loss": 1.8002, + "step": 1817 + }, + { + "epoch": 0.551828805585066, + "grad_norm": 0.4283868968486786, + "learning_rate": 9.084742330667207e-05, + "loss": 2.0566, + "step": 1818 + }, + { + "epoch": 0.5521323417817575, + "grad_norm": 0.3864719569683075, + "learning_rate": 9.084236104080186e-05, + "loss": 1.7037, + "step": 1819 + }, + { + "epoch": 0.552435877978449, + "grad_norm": 0.45380616188049316, 
+ "learning_rate": 9.083729877493167e-05, + "loss": 1.7841, + "step": 1820 + }, + { + "epoch": 0.5527394141751404, + "grad_norm": 0.42916885018348694, + "learning_rate": 9.083223650906147e-05, + "loss": 1.8914, + "step": 1821 + }, + { + "epoch": 0.5530429503718318, + "grad_norm": 0.4037598669528961, + "learning_rate": 9.082717424319126e-05, + "loss": 1.9326, + "step": 1822 + }, + { + "epoch": 0.5533464865685233, + "grad_norm": 0.37888360023498535, + "learning_rate": 9.082211197732106e-05, + "loss": 1.9225, + "step": 1823 + }, + { + "epoch": 0.5536500227652148, + "grad_norm": 0.44082072377204895, + "learning_rate": 9.081704971145085e-05, + "loss": 2.0965, + "step": 1824 + }, + { + "epoch": 0.5539535589619062, + "grad_norm": 0.40458253026008606, + "learning_rate": 9.081198744558065e-05, + "loss": 1.9156, + "step": 1825 + }, + { + "epoch": 0.5542570951585977, + "grad_norm": 0.648024320602417, + "learning_rate": 9.080692517971044e-05, + "loss": 1.6612, + "step": 1826 + }, + { + "epoch": 0.5545606313552891, + "grad_norm": 0.38878655433654785, + "learning_rate": 9.080186291384024e-05, + "loss": 1.9156, + "step": 1827 + }, + { + "epoch": 0.5548641675519805, + "grad_norm": 0.3615175187587738, + "learning_rate": 9.079680064797003e-05, + "loss": 1.9214, + "step": 1828 + }, + { + "epoch": 0.5551677037486721, + "grad_norm": 0.34867003560066223, + "learning_rate": 9.079173838209984e-05, + "loss": 1.906, + "step": 1829 + }, + { + "epoch": 0.5554712399453635, + "grad_norm": 0.6473682522773743, + "learning_rate": 9.078667611622963e-05, + "loss": 1.6777, + "step": 1830 + }, + { + "epoch": 0.555774776142055, + "grad_norm": 0.4099821150302887, + "learning_rate": 9.078161385035943e-05, + "loss": 2.1986, + "step": 1831 + }, + { + "epoch": 0.5560783123387464, + "grad_norm": 0.3992425799369812, + "learning_rate": 9.077655158448922e-05, + "loss": 2.0874, + "step": 1832 + }, + { + "epoch": 0.5563818485354378, + "grad_norm": 0.3562420904636383, + "learning_rate": 9.077148931861902e-05, + "loss": 1.8174, + "step": 1833 + }, + { + "epoch": 0.5566853847321293, + "grad_norm": 0.45232492685317993, + "learning_rate": 9.076642705274881e-05, + "loss": 2.0863, + "step": 1834 + }, + { + "epoch": 0.5569889209288208, + "grad_norm": 0.39387455582618713, + "learning_rate": 9.076136478687861e-05, + "loss": 1.8224, + "step": 1835 + }, + { + "epoch": 0.5572924571255122, + "grad_norm": 0.35372141003608704, + "learning_rate": 9.07563025210084e-05, + "loss": 1.5546, + "step": 1836 + }, + { + "epoch": 0.5575959933222037, + "grad_norm": 0.4068455100059509, + "learning_rate": 9.07512402551382e-05, + "loss": 1.6266, + "step": 1837 + }, + { + "epoch": 0.5578995295188951, + "grad_norm": 0.39574089646339417, + "learning_rate": 9.0746177989268e-05, + "loss": 2.0823, + "step": 1838 + }, + { + "epoch": 0.5582030657155865, + "grad_norm": 1.1845453977584839, + "learning_rate": 9.07411157233978e-05, + "loss": 1.6966, + "step": 1839 + }, + { + "epoch": 0.5585066019122781, + "grad_norm": 0.39268460869789124, + "learning_rate": 9.07360534575276e-05, + "loss": 1.1071, + "step": 1840 + }, + { + "epoch": 0.5588101381089695, + "grad_norm": 0.4749743640422821, + "learning_rate": 9.073099119165739e-05, + "loss": 1.9787, + "step": 1841 + }, + { + "epoch": 0.559113674305661, + "grad_norm": 0.4099438786506653, + "learning_rate": 9.072592892578719e-05, + "loss": 1.952, + "step": 1842 + }, + { + "epoch": 0.5594172105023524, + "grad_norm": 0.4282529354095459, + "learning_rate": 9.072086665991698e-05, + "loss": 1.9985, + "step": 1843 + }, + { + "epoch": 
0.5597207466990438, + "grad_norm": 0.41518470644950867, + "learning_rate": 9.071580439404678e-05, + "loss": 1.6999, + "step": 1844 + }, + { + "epoch": 0.5600242828957354, + "grad_norm": 0.4059050381183624, + "learning_rate": 9.071074212817657e-05, + "loss": 1.9737, + "step": 1845 + }, + { + "epoch": 0.5603278190924268, + "grad_norm": 0.3274436295032501, + "learning_rate": 9.070567986230636e-05, + "loss": 1.1522, + "step": 1846 + }, + { + "epoch": 0.5606313552891182, + "grad_norm": 0.4117715656757355, + "learning_rate": 9.070061759643616e-05, + "loss": 2.2128, + "step": 1847 + }, + { + "epoch": 0.5609348914858097, + "grad_norm": 1.530457854270935, + "learning_rate": 9.069555533056597e-05, + "loss": 2.2275, + "step": 1848 + }, + { + "epoch": 0.5612384276825011, + "grad_norm": 1.6292579174041748, + "learning_rate": 9.069049306469576e-05, + "loss": 1.6642, + "step": 1849 + }, + { + "epoch": 0.5615419638791926, + "grad_norm": 0.4147336184978485, + "learning_rate": 9.068543079882556e-05, + "loss": 1.7811, + "step": 1850 + }, + { + "epoch": 0.5618455000758841, + "grad_norm": 0.4253292679786682, + "learning_rate": 9.068036853295535e-05, + "loss": 2.1084, + "step": 1851 + }, + { + "epoch": 0.5621490362725755, + "grad_norm": 0.3340885043144226, + "learning_rate": 9.067530626708515e-05, + "loss": 1.0015, + "step": 1852 + }, + { + "epoch": 0.5624525724692669, + "grad_norm": 0.34140780568122864, + "learning_rate": 9.067024400121494e-05, + "loss": 1.9773, + "step": 1853 + }, + { + "epoch": 0.5627561086659584, + "grad_norm": 0.48916199803352356, + "learning_rate": 9.066518173534474e-05, + "loss": 1.7743, + "step": 1854 + }, + { + "epoch": 0.5630596448626499, + "grad_norm": 0.43407005071640015, + "learning_rate": 9.066011946947453e-05, + "loss": 1.8134, + "step": 1855 + }, + { + "epoch": 0.5633631810593414, + "grad_norm": 1.257241129875183, + "learning_rate": 9.065505720360433e-05, + "loss": 1.9903, + "step": 1856 + }, + { + "epoch": 0.5636667172560328, + "grad_norm": 0.4004335105419159, + "learning_rate": 9.064999493773413e-05, + "loss": 1.9988, + "step": 1857 + }, + { + "epoch": 0.5639702534527242, + "grad_norm": 0.41307345032691956, + "learning_rate": 9.064493267186393e-05, + "loss": 1.9789, + "step": 1858 + }, + { + "epoch": 0.5642737896494157, + "grad_norm": 0.41875752806663513, + "learning_rate": 9.063987040599374e-05, + "loss": 1.8535, + "step": 1859 + }, + { + "epoch": 0.5645773258461072, + "grad_norm": 0.4912898540496826, + "learning_rate": 9.063480814012353e-05, + "loss": 2.3529, + "step": 1860 + }, + { + "epoch": 0.5648808620427986, + "grad_norm": 0.4265078604221344, + "learning_rate": 9.062974587425333e-05, + "loss": 2.0698, + "step": 1861 + }, + { + "epoch": 0.5651843982394901, + "grad_norm": 0.3786260187625885, + "learning_rate": 9.062468360838312e-05, + "loss": 1.818, + "step": 1862 + }, + { + "epoch": 0.5654879344361815, + "grad_norm": 0.3665534257888794, + "learning_rate": 9.061962134251292e-05, + "loss": 1.9464, + "step": 1863 + }, + { + "epoch": 0.5657914706328729, + "grad_norm": 0.4516305923461914, + "learning_rate": 9.061455907664271e-05, + "loss": 1.7718, + "step": 1864 + }, + { + "epoch": 0.5660950068295644, + "grad_norm": 1.0637644529342651, + "learning_rate": 9.06094968107725e-05, + "loss": 1.8881, + "step": 1865 + }, + { + "epoch": 0.5663985430262559, + "grad_norm": 0.41039812564849854, + "learning_rate": 9.06044345449023e-05, + "loss": 2.0346, + "step": 1866 + }, + { + "epoch": 0.5667020792229474, + "grad_norm": 0.40830013155937195, + "learning_rate": 9.05993722790321e-05, + 
"loss": 1.6864, + "step": 1867 + }, + { + "epoch": 0.5670056154196388, + "grad_norm": 0.37757718563079834, + "learning_rate": 9.05943100131619e-05, + "loss": 1.7925, + "step": 1868 + }, + { + "epoch": 0.5673091516163302, + "grad_norm": 0.45366227626800537, + "learning_rate": 9.05892477472917e-05, + "loss": 1.7869, + "step": 1869 + }, + { + "epoch": 0.5676126878130217, + "grad_norm": 0.4220414459705353, + "learning_rate": 9.05841854814215e-05, + "loss": 1.932, + "step": 1870 + }, + { + "epoch": 0.5679162240097132, + "grad_norm": 0.4413476884365082, + "learning_rate": 9.057912321555129e-05, + "loss": 1.5842, + "step": 1871 + }, + { + "epoch": 0.5682197602064046, + "grad_norm": 0.40240782499313354, + "learning_rate": 9.057406094968108e-05, + "loss": 1.9477, + "step": 1872 + }, + { + "epoch": 0.5685232964030961, + "grad_norm": 0.4332951605319977, + "learning_rate": 9.056899868381088e-05, + "loss": 1.8664, + "step": 1873 + }, + { + "epoch": 0.5688268325997875, + "grad_norm": 0.3841226398944855, + "learning_rate": 9.056393641794067e-05, + "loss": 2.3058, + "step": 1874 + }, + { + "epoch": 0.5691303687964789, + "grad_norm": 0.3937263488769531, + "learning_rate": 9.055887415207047e-05, + "loss": 1.683, + "step": 1875 + }, + { + "epoch": 0.5694339049931705, + "grad_norm": 0.33709925413131714, + "learning_rate": 9.055381188620026e-05, + "loss": 1.8616, + "step": 1876 + }, + { + "epoch": 0.5697374411898619, + "grad_norm": 0.3934507668018341, + "learning_rate": 9.054874962033007e-05, + "loss": 1.6839, + "step": 1877 + }, + { + "epoch": 0.5700409773865533, + "grad_norm": 0.4386683702468872, + "learning_rate": 9.054368735445987e-05, + "loss": 1.8068, + "step": 1878 + }, + { + "epoch": 0.5703445135832448, + "grad_norm": 0.4416390657424927, + "learning_rate": 9.053862508858966e-05, + "loss": 1.6858, + "step": 1879 + }, + { + "epoch": 0.5706480497799362, + "grad_norm": 0.4287014901638031, + "learning_rate": 9.053356282271946e-05, + "loss": 1.8889, + "step": 1880 + }, + { + "epoch": 0.5709515859766278, + "grad_norm": 0.4297000765800476, + "learning_rate": 9.052850055684925e-05, + "loss": 1.8281, + "step": 1881 + }, + { + "epoch": 0.5712551221733192, + "grad_norm": 0.48270586133003235, + "learning_rate": 9.052343829097905e-05, + "loss": 1.6692, + "step": 1882 + }, + { + "epoch": 0.5715586583700106, + "grad_norm": 0.44133251905441284, + "learning_rate": 9.051837602510884e-05, + "loss": 1.8359, + "step": 1883 + }, + { + "epoch": 0.5718621945667021, + "grad_norm": 0.5127750039100647, + "learning_rate": 9.051331375923863e-05, + "loss": 1.9437, + "step": 1884 + }, + { + "epoch": 0.5721657307633935, + "grad_norm": 0.4890953600406647, + "learning_rate": 9.050825149336843e-05, + "loss": 1.6396, + "step": 1885 + }, + { + "epoch": 0.572469266960085, + "grad_norm": 0.36201316118240356, + "learning_rate": 9.050318922749822e-05, + "loss": 1.6985, + "step": 1886 + }, + { + "epoch": 0.5727728031567765, + "grad_norm": 0.3880859911441803, + "learning_rate": 9.049812696162803e-05, + "loss": 1.7916, + "step": 1887 + }, + { + "epoch": 0.5730763393534679, + "grad_norm": 0.500619649887085, + "learning_rate": 9.049306469575783e-05, + "loss": 1.826, + "step": 1888 + }, + { + "epoch": 0.5733798755501593, + "grad_norm": 0.764751672744751, + "learning_rate": 9.048800242988762e-05, + "loss": 1.5406, + "step": 1889 + }, + { + "epoch": 0.5736834117468508, + "grad_norm": 0.4573342502117157, + "learning_rate": 9.048294016401742e-05, + "loss": 1.6461, + "step": 1890 + }, + { + "epoch": 0.5739869479435422, + "grad_norm": 0.5972601175308228, 
+ "learning_rate": 9.047787789814721e-05, + "loss": 2.3081, + "step": 1891 + }, + { + "epoch": 0.5742904841402338, + "grad_norm": 0.4419214129447937, + "learning_rate": 9.0472815632277e-05, + "loss": 1.8907, + "step": 1892 + }, + { + "epoch": 0.5745940203369252, + "grad_norm": 0.3364506959915161, + "learning_rate": 9.04677533664068e-05, + "loss": 1.598, + "step": 1893 + }, + { + "epoch": 0.5748975565336166, + "grad_norm": 0.41443008184432983, + "learning_rate": 9.04626911005366e-05, + "loss": 1.91, + "step": 1894 + }, + { + "epoch": 0.5752010927303081, + "grad_norm": 0.3931877315044403, + "learning_rate": 9.045762883466639e-05, + "loss": 2.0265, + "step": 1895 + }, + { + "epoch": 0.5755046289269995, + "grad_norm": 0.3768281042575836, + "learning_rate": 9.04525665687962e-05, + "loss": 2.0458, + "step": 1896 + }, + { + "epoch": 0.575808165123691, + "grad_norm": 0.726582407951355, + "learning_rate": 9.0447504302926e-05, + "loss": 1.5708, + "step": 1897 + }, + { + "epoch": 0.5761117013203825, + "grad_norm": 0.4031538665294647, + "learning_rate": 9.044244203705579e-05, + "loss": 1.6248, + "step": 1898 + }, + { + "epoch": 0.5764152375170739, + "grad_norm": 0.3605407476425171, + "learning_rate": 9.043737977118558e-05, + "loss": 1.9504, + "step": 1899 + }, + { + "epoch": 0.5767187737137653, + "grad_norm": 0.3802354633808136, + "learning_rate": 9.043231750531538e-05, + "loss": 2.0487, + "step": 1900 + }, + { + "epoch": 0.5770223099104568, + "grad_norm": 0.41240641474723816, + "learning_rate": 9.042725523944519e-05, + "loss": 1.9202, + "step": 1901 + }, + { + "epoch": 0.5773258461071483, + "grad_norm": 0.36771708726882935, + "learning_rate": 9.042219297357498e-05, + "loss": 2.2273, + "step": 1902 + }, + { + "epoch": 0.5776293823038398, + "grad_norm": 0.4182611405849457, + "learning_rate": 9.041713070770478e-05, + "loss": 1.8689, + "step": 1903 + }, + { + "epoch": 0.5779329185005312, + "grad_norm": 0.39633724093437195, + "learning_rate": 9.041206844183457e-05, + "loss": 1.9744, + "step": 1904 + }, + { + "epoch": 0.5782364546972226, + "grad_norm": 0.3978392481803894, + "learning_rate": 9.040700617596437e-05, + "loss": 1.6662, + "step": 1905 + }, + { + "epoch": 0.5785399908939141, + "grad_norm": 0.3734360635280609, + "learning_rate": 9.040194391009416e-05, + "loss": 1.0948, + "step": 1906 + }, + { + "epoch": 0.5788435270906056, + "grad_norm": 0.403392493724823, + "learning_rate": 9.039688164422397e-05, + "loss": 2.0251, + "step": 1907 + }, + { + "epoch": 0.579147063287297, + "grad_norm": 0.350067138671875, + "learning_rate": 9.039181937835376e-05, + "loss": 1.543, + "step": 1908 + }, + { + "epoch": 0.5794505994839885, + "grad_norm": 0.4273326098918915, + "learning_rate": 9.038675711248356e-05, + "loss": 1.8694, + "step": 1909 + }, + { + "epoch": 0.5797541356806799, + "grad_norm": 0.4815780222415924, + "learning_rate": 9.038169484661335e-05, + "loss": 1.9565, + "step": 1910 + }, + { + "epoch": 0.5800576718773713, + "grad_norm": 0.5379179120063782, + "learning_rate": 9.037663258074315e-05, + "loss": 1.9631, + "step": 1911 + }, + { + "epoch": 0.5803612080740629, + "grad_norm": 0.47738704085350037, + "learning_rate": 9.037157031487294e-05, + "loss": 1.878, + "step": 1912 + }, + { + "epoch": 0.5806647442707543, + "grad_norm": 0.426543653011322, + "learning_rate": 9.036650804900274e-05, + "loss": 2.0392, + "step": 1913 + }, + { + "epoch": 0.5809682804674458, + "grad_norm": 0.38239404559135437, + "learning_rate": 9.036144578313253e-05, + "loss": 1.9695, + "step": 1914 + }, + { + "epoch": 0.5812718166641372, + 
"grad_norm": 0.40093934535980225, + "learning_rate": 9.035638351726233e-05, + "loss": 1.9695, + "step": 1915 + }, + { + "epoch": 0.5815753528608286, + "grad_norm": 0.3865903317928314, + "learning_rate": 9.035132125139214e-05, + "loss": 1.7925, + "step": 1916 + }, + { + "epoch": 0.58187888905752, + "grad_norm": 0.6183242201805115, + "learning_rate": 9.034625898552193e-05, + "loss": 1.853, + "step": 1917 + }, + { + "epoch": 0.5821824252542116, + "grad_norm": 0.4869506061077118, + "learning_rate": 9.034119671965173e-05, + "loss": 1.9418, + "step": 1918 + }, + { + "epoch": 0.582485961450903, + "grad_norm": 0.40212881565093994, + "learning_rate": 9.033613445378152e-05, + "loss": 2.0259, + "step": 1919 + }, + { + "epoch": 0.5827894976475945, + "grad_norm": 0.7224326729774475, + "learning_rate": 9.033107218791131e-05, + "loss": 1.933, + "step": 1920 + }, + { + "epoch": 0.5830930338442859, + "grad_norm": 0.4369768500328064, + "learning_rate": 9.032600992204111e-05, + "loss": 1.7931, + "step": 1921 + }, + { + "epoch": 0.5833965700409773, + "grad_norm": 0.3920018672943115, + "learning_rate": 9.03209476561709e-05, + "loss": 1.913, + "step": 1922 + }, + { + "epoch": 0.5837001062376689, + "grad_norm": 0.5076978206634521, + "learning_rate": 9.03158853903007e-05, + "loss": 2.0081, + "step": 1923 + }, + { + "epoch": 0.5840036424343603, + "grad_norm": 0.38379955291748047, + "learning_rate": 9.03108231244305e-05, + "loss": 2.0153, + "step": 1924 + }, + { + "epoch": 0.5843071786310517, + "grad_norm": 0.4367254376411438, + "learning_rate": 9.030576085856029e-05, + "loss": 2.1004, + "step": 1925 + }, + { + "epoch": 0.5846107148277432, + "grad_norm": 0.37425291538238525, + "learning_rate": 9.03006985926901e-05, + "loss": 1.6777, + "step": 1926 + }, + { + "epoch": 0.5849142510244346, + "grad_norm": 0.37925392389297485, + "learning_rate": 9.029563632681989e-05, + "loss": 2.1164, + "step": 1927 + }, + { + "epoch": 0.5852177872211262, + "grad_norm": 0.41369903087615967, + "learning_rate": 9.029057406094969e-05, + "loss": 1.7252, + "step": 1928 + }, + { + "epoch": 0.5855213234178176, + "grad_norm": 0.3528081476688385, + "learning_rate": 9.028551179507948e-05, + "loss": 1.9181, + "step": 1929 + }, + { + "epoch": 0.585824859614509, + "grad_norm": 0.38274556398391724, + "learning_rate": 9.028044952920928e-05, + "loss": 1.8585, + "step": 1930 + }, + { + "epoch": 0.5861283958112005, + "grad_norm": 0.4036407768726349, + "learning_rate": 9.027538726333907e-05, + "loss": 1.9679, + "step": 1931 + }, + { + "epoch": 0.5864319320078919, + "grad_norm": 0.34841248393058777, + "learning_rate": 9.027032499746887e-05, + "loss": 1.8571, + "step": 1932 + }, + { + "epoch": 0.5867354682045834, + "grad_norm": 0.3821954131126404, + "learning_rate": 9.026526273159866e-05, + "loss": 1.8175, + "step": 1933 + }, + { + "epoch": 0.5870390044012749, + "grad_norm": 0.3724253475666046, + "learning_rate": 9.026020046572846e-05, + "loss": 1.9549, + "step": 1934 + }, + { + "epoch": 0.5873425405979663, + "grad_norm": 0.40494081377983093, + "learning_rate": 9.025513819985826e-05, + "loss": 2.0013, + "step": 1935 + }, + { + "epoch": 0.5876460767946577, + "grad_norm": 0.7746275663375854, + "learning_rate": 9.025007593398806e-05, + "loss": 1.6739, + "step": 1936 + }, + { + "epoch": 0.5879496129913492, + "grad_norm": 0.34239932894706726, + "learning_rate": 9.024501366811785e-05, + "loss": 1.1999, + "step": 1937 + }, + { + "epoch": 0.5882531491880407, + "grad_norm": 0.40239185094833374, + "learning_rate": 9.023995140224765e-05, + "loss": 1.956, + "step": 
1938 + }, + { + "epoch": 0.5885566853847322, + "grad_norm": 0.4756642282009125, + "learning_rate": 9.023488913637744e-05, + "loss": 1.5705, + "step": 1939 + }, + { + "epoch": 0.5888602215814236, + "grad_norm": 0.412263959646225, + "learning_rate": 9.022982687050724e-05, + "loss": 1.8187, + "step": 1940 + }, + { + "epoch": 0.589163757778115, + "grad_norm": 0.4178502857685089, + "learning_rate": 9.022476460463703e-05, + "loss": 1.8955, + "step": 1941 + }, + { + "epoch": 0.5894672939748065, + "grad_norm": 0.4619811475276947, + "learning_rate": 9.021970233876683e-05, + "loss": 1.9968, + "step": 1942 + }, + { + "epoch": 0.589770830171498, + "grad_norm": 0.42839181423187256, + "learning_rate": 9.021464007289662e-05, + "loss": 1.5708, + "step": 1943 + }, + { + "epoch": 0.5900743663681894, + "grad_norm": 0.4423038363456726, + "learning_rate": 9.020957780702643e-05, + "loss": 1.9684, + "step": 1944 + }, + { + "epoch": 0.5903779025648809, + "grad_norm": 0.3898191452026367, + "learning_rate": 9.020451554115623e-05, + "loss": 1.9497, + "step": 1945 + }, + { + "epoch": 0.5906814387615723, + "grad_norm": 0.701366662979126, + "learning_rate": 9.019945327528603e-05, + "loss": 1.7624, + "step": 1946 + }, + { + "epoch": 0.5909849749582637, + "grad_norm": 0.32581913471221924, + "learning_rate": 9.019439100941583e-05, + "loss": 2.1484, + "step": 1947 + }, + { + "epoch": 0.5912885111549552, + "grad_norm": 0.4372369050979614, + "learning_rate": 9.018932874354562e-05, + "loss": 1.6075, + "step": 1948 + }, + { + "epoch": 0.5915920473516467, + "grad_norm": 0.39428946375846863, + "learning_rate": 9.018426647767542e-05, + "loss": 1.6004, + "step": 1949 + }, + { + "epoch": 0.5918955835483382, + "grad_norm": 0.3934183120727539, + "learning_rate": 9.017920421180521e-05, + "loss": 1.9358, + "step": 1950 + }, + { + "epoch": 0.5921991197450296, + "grad_norm": 0.42696380615234375, + "learning_rate": 9.017414194593501e-05, + "loss": 1.9646, + "step": 1951 + }, + { + "epoch": 0.592502655941721, + "grad_norm": 0.38243913650512695, + "learning_rate": 9.01690796800648e-05, + "loss": 1.9946, + "step": 1952 + }, + { + "epoch": 0.5928061921384125, + "grad_norm": 0.4068431556224823, + "learning_rate": 9.01640174141946e-05, + "loss": 2.0786, + "step": 1953 + }, + { + "epoch": 0.593109728335104, + "grad_norm": 0.44560736417770386, + "learning_rate": 9.015895514832439e-05, + "loss": 1.7514, + "step": 1954 + }, + { + "epoch": 0.5934132645317954, + "grad_norm": 0.4143114686012268, + "learning_rate": 9.01538928824542e-05, + "loss": 2.083, + "step": 1955 + }, + { + "epoch": 0.5937168007284869, + "grad_norm": 0.45947229862213135, + "learning_rate": 9.0148830616584e-05, + "loss": 1.9313, + "step": 1956 + }, + { + "epoch": 0.5940203369251783, + "grad_norm": 2.7487032413482666, + "learning_rate": 9.014376835071379e-05, + "loss": 1.7648, + "step": 1957 + }, + { + "epoch": 0.5943238731218697, + "grad_norm": 0.3856576979160309, + "learning_rate": 9.013870608484358e-05, + "loss": 2.1141, + "step": 1958 + }, + { + "epoch": 0.5946274093185613, + "grad_norm": 0.3741602897644043, + "learning_rate": 9.013364381897338e-05, + "loss": 1.6458, + "step": 1959 + }, + { + "epoch": 0.5949309455152527, + "grad_norm": 0.3791872262954712, + "learning_rate": 9.012858155310317e-05, + "loss": 1.433, + "step": 1960 + }, + { + "epoch": 0.5952344817119442, + "grad_norm": 0.32848575711250305, + "learning_rate": 9.012351928723297e-05, + "loss": 1.6748, + "step": 1961 + }, + { + "epoch": 0.5955380179086356, + "grad_norm": 0.4328818917274475, + "learning_rate": 
9.011845702136276e-05, + "loss": 1.8309, + "step": 1962 + }, + { + "epoch": 0.595841554105327, + "grad_norm": 0.40931710600852966, + "learning_rate": 9.011339475549256e-05, + "loss": 2.0837, + "step": 1963 + }, + { + "epoch": 0.5961450903020186, + "grad_norm": 0.3625456690788269, + "learning_rate": 9.010833248962235e-05, + "loss": 1.8895, + "step": 1964 + }, + { + "epoch": 0.59644862649871, + "grad_norm": 0.33840253949165344, + "learning_rate": 9.010327022375216e-05, + "loss": 1.8706, + "step": 1965 + }, + { + "epoch": 0.5967521626954014, + "grad_norm": 0.38374340534210205, + "learning_rate": 9.009820795788196e-05, + "loss": 1.8782, + "step": 1966 + }, + { + "epoch": 0.5970556988920929, + "grad_norm": 0.41515031456947327, + "learning_rate": 9.009314569201175e-05, + "loss": 1.7455, + "step": 1967 + }, + { + "epoch": 0.5973592350887843, + "grad_norm": 0.35676872730255127, + "learning_rate": 9.008808342614155e-05, + "loss": 1.7706, + "step": 1968 + }, + { + "epoch": 0.5976627712854758, + "grad_norm": 0.4770854711532593, + "learning_rate": 9.008302116027134e-05, + "loss": 2.0954, + "step": 1969 + }, + { + "epoch": 0.5979663074821673, + "grad_norm": 0.3612794876098633, + "learning_rate": 9.007795889440114e-05, + "loss": 2.1938, + "step": 1970 + }, + { + "epoch": 0.5982698436788587, + "grad_norm": 0.5067920684814453, + "learning_rate": 9.007289662853093e-05, + "loss": 1.2096, + "step": 1971 + }, + { + "epoch": 0.5985733798755501, + "grad_norm": 0.4193328022956848, + "learning_rate": 9.006783436266073e-05, + "loss": 1.6632, + "step": 1972 + }, + { + "epoch": 0.5988769160722416, + "grad_norm": 0.41445595026016235, + "learning_rate": 9.006277209679052e-05, + "loss": 2.0237, + "step": 1973 + }, + { + "epoch": 0.599180452268933, + "grad_norm": 0.4083717167377472, + "learning_rate": 9.005770983092033e-05, + "loss": 2.1022, + "step": 1974 + }, + { + "epoch": 0.5994839884656246, + "grad_norm": 0.4897996485233307, + "learning_rate": 9.005264756505012e-05, + "loss": 1.6074, + "step": 1975 + }, + { + "epoch": 0.599787524662316, + "grad_norm": 0.46923205256462097, + "learning_rate": 9.004758529917992e-05, + "loss": 2.0915, + "step": 1976 + }, + { + "epoch": 0.6000910608590074, + "grad_norm": 0.37507691979408264, + "learning_rate": 9.004252303330971e-05, + "loss": 1.6793, + "step": 1977 + }, + { + "epoch": 0.6003945970556989, + "grad_norm": 0.3973737061023712, + "learning_rate": 9.003746076743951e-05, + "loss": 2.0935, + "step": 1978 + }, + { + "epoch": 0.6006981332523903, + "grad_norm": 0.40313783288002014, + "learning_rate": 9.00323985015693e-05, + "loss": 1.7405, + "step": 1979 + }, + { + "epoch": 0.6010016694490818, + "grad_norm": 0.36169835925102234, + "learning_rate": 9.00273362356991e-05, + "loss": 2.1843, + "step": 1980 + }, + { + "epoch": 0.6013052056457733, + "grad_norm": 0.41355371475219727, + "learning_rate": 9.002227396982889e-05, + "loss": 2.0969, + "step": 1981 + }, + { + "epoch": 0.6016087418424647, + "grad_norm": 0.42378634214401245, + "learning_rate": 9.001721170395869e-05, + "loss": 1.9652, + "step": 1982 + }, + { + "epoch": 0.6019122780391561, + "grad_norm": 0.42945531010627747, + "learning_rate": 9.00121494380885e-05, + "loss": 2.0625, + "step": 1983 + }, + { + "epoch": 0.6022158142358476, + "grad_norm": 0.5348070859909058, + "learning_rate": 9.000708717221829e-05, + "loss": 1.3785, + "step": 1984 + }, + { + "epoch": 0.6025193504325391, + "grad_norm": 0.35933446884155273, + "learning_rate": 9.000202490634809e-05, + "loss": 1.5254, + "step": 1985 + }, + { + "epoch": 0.6028228866292306, + 
"grad_norm": 0.42495015263557434, + "learning_rate": 8.999696264047788e-05, + "loss": 2.1669, + "step": 1986 + }, + { + "epoch": 0.603126422825922, + "grad_norm": 0.43792733550071716, + "learning_rate": 8.999190037460767e-05, + "loss": 1.8057, + "step": 1987 + }, + { + "epoch": 0.6034299590226134, + "grad_norm": 0.39334404468536377, + "learning_rate": 8.998683810873747e-05, + "loss": 1.8157, + "step": 1988 + }, + { + "epoch": 0.6037334952193049, + "grad_norm": 0.38974860310554504, + "learning_rate": 8.998177584286726e-05, + "loss": 1.8293, + "step": 1989 + }, + { + "epoch": 0.6040370314159964, + "grad_norm": 0.44241687655448914, + "learning_rate": 8.997671357699707e-05, + "loss": 2.2078, + "step": 1990 + }, + { + "epoch": 0.6043405676126878, + "grad_norm": 0.40700820088386536, + "learning_rate": 8.997165131112687e-05, + "loss": 1.9656, + "step": 1991 + }, + { + "epoch": 0.6046441038093793, + "grad_norm": 0.3992595076560974, + "learning_rate": 8.996658904525666e-05, + "loss": 1.9423, + "step": 1992 + }, + { + "epoch": 0.6049476400060707, + "grad_norm": 0.3922860622406006, + "learning_rate": 8.996152677938646e-05, + "loss": 2.1253, + "step": 1993 + }, + { + "epoch": 0.6052511762027621, + "grad_norm": 0.3843866288661957, + "learning_rate": 8.995646451351627e-05, + "loss": 2.0756, + "step": 1994 + }, + { + "epoch": 0.6055547123994537, + "grad_norm": 0.3822995722293854, + "learning_rate": 8.995140224764606e-05, + "loss": 1.9475, + "step": 1995 + }, + { + "epoch": 0.6058582485961451, + "grad_norm": 0.4001995325088501, + "learning_rate": 8.994633998177585e-05, + "loss": 1.9781, + "step": 1996 + }, + { + "epoch": 0.6061617847928366, + "grad_norm": 0.3775820732116699, + "learning_rate": 8.994127771590565e-05, + "loss": 1.7857, + "step": 1997 + }, + { + "epoch": 0.606465320989528, + "grad_norm": 0.4260796308517456, + "learning_rate": 8.993621545003544e-05, + "loss": 1.6416, + "step": 1998 + }, + { + "epoch": 0.6067688571862194, + "grad_norm": 0.39824166893959045, + "learning_rate": 8.993115318416524e-05, + "loss": 1.7657, + "step": 1999 + }, + { + "epoch": 0.6070723933829109, + "grad_norm": 0.46430447697639465, + "learning_rate": 8.992609091829503e-05, + "loss": 1.8802, + "step": 2000 + }, + { + "epoch": 0.6073759295796024, + "grad_norm": 0.4773789048194885, + "learning_rate": 8.992102865242483e-05, + "loss": 1.8786, + "step": 2001 + }, + { + "epoch": 0.6076794657762938, + "grad_norm": 0.4296311140060425, + "learning_rate": 8.991596638655462e-05, + "loss": 1.9457, + "step": 2002 + }, + { + "epoch": 0.6079830019729853, + "grad_norm": 0.41193845868110657, + "learning_rate": 8.991090412068442e-05, + "loss": 1.5856, + "step": 2003 + }, + { + "epoch": 0.6082865381696767, + "grad_norm": 0.43040478229522705, + "learning_rate": 8.990584185481423e-05, + "loss": 2.1432, + "step": 2004 + }, + { + "epoch": 0.6085900743663681, + "grad_norm": 0.5215789079666138, + "learning_rate": 8.990077958894402e-05, + "loss": 1.9584, + "step": 2005 + }, + { + "epoch": 0.6088936105630597, + "grad_norm": 0.4370077848434448, + "learning_rate": 8.989571732307382e-05, + "loss": 1.5612, + "step": 2006 + }, + { + "epoch": 0.6091971467597511, + "grad_norm": 0.4200492203235626, + "learning_rate": 8.989065505720361e-05, + "loss": 1.9493, + "step": 2007 + }, + { + "epoch": 0.6095006829564426, + "grad_norm": 0.39453452825546265, + "learning_rate": 8.98855927913334e-05, + "loss": 1.9529, + "step": 2008 + }, + { + "epoch": 0.609804219153134, + "grad_norm": 0.4478731155395508, + "learning_rate": 8.98805305254632e-05, + "loss": 2.0098, + 
"step": 2009 + }, + { + "epoch": 0.6101077553498254, + "grad_norm": 0.39515209197998047, + "learning_rate": 8.9875468259593e-05, + "loss": 1.958, + "step": 2010 + }, + { + "epoch": 0.610411291546517, + "grad_norm": 0.3660414516925812, + "learning_rate": 8.987040599372279e-05, + "loss": 1.9538, + "step": 2011 + }, + { + "epoch": 0.6107148277432084, + "grad_norm": 0.3517032861709595, + "learning_rate": 8.986534372785259e-05, + "loss": 1.8833, + "step": 2012 + }, + { + "epoch": 0.6110183639398998, + "grad_norm": 0.6502123475074768, + "learning_rate": 8.98602814619824e-05, + "loss": 2.1106, + "step": 2013 + }, + { + "epoch": 0.6113219001365913, + "grad_norm": 0.4674864709377289, + "learning_rate": 8.985521919611219e-05, + "loss": 1.8986, + "step": 2014 + }, + { + "epoch": 0.6116254363332827, + "grad_norm": 0.4143102467060089, + "learning_rate": 8.985015693024198e-05, + "loss": 1.7635, + "step": 2015 + }, + { + "epoch": 0.6119289725299742, + "grad_norm": 0.4329308867454529, + "learning_rate": 8.984509466437178e-05, + "loss": 1.648, + "step": 2016 + }, + { + "epoch": 0.6122325087266657, + "grad_norm": 0.34939324855804443, + "learning_rate": 8.984003239850157e-05, + "loss": 1.7641, + "step": 2017 + }, + { + "epoch": 0.6125360449233571, + "grad_norm": 0.4234546720981598, + "learning_rate": 8.983497013263137e-05, + "loss": 1.8691, + "step": 2018 + }, + { + "epoch": 0.6128395811200485, + "grad_norm": 0.7465669512748718, + "learning_rate": 8.982990786676116e-05, + "loss": 2.0573, + "step": 2019 + }, + { + "epoch": 0.61314311731674, + "grad_norm": 0.36259400844573975, + "learning_rate": 8.982484560089096e-05, + "loss": 2.0654, + "step": 2020 + }, + { + "epoch": 0.6134466535134315, + "grad_norm": 0.3918156623840332, + "learning_rate": 8.981978333502075e-05, + "loss": 2.1658, + "step": 2021 + }, + { + "epoch": 0.613750189710123, + "grad_norm": 0.3924868404865265, + "learning_rate": 8.981472106915056e-05, + "loss": 1.9306, + "step": 2022 + }, + { + "epoch": 0.6140537259068144, + "grad_norm": 0.7729107141494751, + "learning_rate": 8.980965880328035e-05, + "loss": 1.5911, + "step": 2023 + }, + { + "epoch": 0.6143572621035058, + "grad_norm": 0.4199913442134857, + "learning_rate": 8.980459653741015e-05, + "loss": 1.9833, + "step": 2024 + }, + { + "epoch": 0.6146607983001973, + "grad_norm": 0.40258511900901794, + "learning_rate": 8.979953427153994e-05, + "loss": 1.9178, + "step": 2025 + }, + { + "epoch": 0.6149643344968888, + "grad_norm": 0.3859613239765167, + "learning_rate": 8.979447200566974e-05, + "loss": 1.7585, + "step": 2026 + }, + { + "epoch": 0.6152678706935802, + "grad_norm": 0.42048898339271545, + "learning_rate": 8.978940973979953e-05, + "loss": 1.953, + "step": 2027 + }, + { + "epoch": 0.6155714068902717, + "grad_norm": 0.39669451117515564, + "learning_rate": 8.978434747392933e-05, + "loss": 1.6132, + "step": 2028 + }, + { + "epoch": 0.6158749430869631, + "grad_norm": 0.6679760217666626, + "learning_rate": 8.977928520805912e-05, + "loss": 1.9793, + "step": 2029 + }, + { + "epoch": 0.6161784792836545, + "grad_norm": 0.4262414276599884, + "learning_rate": 8.977422294218892e-05, + "loss": 1.8002, + "step": 2030 + }, + { + "epoch": 0.616482015480346, + "grad_norm": 0.3899317681789398, + "learning_rate": 8.976916067631871e-05, + "loss": 2.0585, + "step": 2031 + }, + { + "epoch": 0.6167855516770375, + "grad_norm": 0.5402538776397705, + "learning_rate": 8.976409841044852e-05, + "loss": 1.6196, + "step": 2032 + }, + { + "epoch": 0.617089087873729, + "grad_norm": 0.40976065397262573, + "learning_rate": 
8.975903614457832e-05, + "loss": 1.6395, + "step": 2033 + }, + { + "epoch": 0.6173926240704204, + "grad_norm": 0.5633681416511536, + "learning_rate": 8.975397387870811e-05, + "loss": 2.117, + "step": 2034 + }, + { + "epoch": 0.6176961602671118, + "grad_norm": 0.4393365681171417, + "learning_rate": 8.974891161283792e-05, + "loss": 2.0793, + "step": 2035 + }, + { + "epoch": 0.6179996964638033, + "grad_norm": 0.3982914388179779, + "learning_rate": 8.974384934696771e-05, + "loss": 1.9608, + "step": 2036 + }, + { + "epoch": 0.6183032326604948, + "grad_norm": 0.41689884662628174, + "learning_rate": 8.973878708109751e-05, + "loss": 2.0265, + "step": 2037 + }, + { + "epoch": 0.6186067688571862, + "grad_norm": 0.46085304021835327, + "learning_rate": 8.97337248152273e-05, + "loss": 1.8147, + "step": 2038 + }, + { + "epoch": 0.6189103050538777, + "grad_norm": 0.4536703824996948, + "learning_rate": 8.97286625493571e-05, + "loss": 1.861, + "step": 2039 + }, + { + "epoch": 0.6192138412505691, + "grad_norm": 0.4332161843776703, + "learning_rate": 8.97236002834869e-05, + "loss": 1.5579, + "step": 2040 + }, + { + "epoch": 0.6195173774472605, + "grad_norm": 0.3992736041545868, + "learning_rate": 8.971853801761669e-05, + "loss": 1.9451, + "step": 2041 + }, + { + "epoch": 0.6198209136439521, + "grad_norm": 0.39501848816871643, + "learning_rate": 8.971347575174648e-05, + "loss": 1.9228, + "step": 2042 + }, + { + "epoch": 0.6201244498406435, + "grad_norm": 0.44429096579551697, + "learning_rate": 8.970841348587629e-05, + "loss": 1.6277, + "step": 2043 + }, + { + "epoch": 0.620427986037335, + "grad_norm": 0.5381520390510559, + "learning_rate": 8.970335122000609e-05, + "loss": 1.4842, + "step": 2044 + }, + { + "epoch": 0.6207315222340264, + "grad_norm": 0.3807857036590576, + "learning_rate": 8.969828895413588e-05, + "loss": 1.9271, + "step": 2045 + }, + { + "epoch": 0.6210350584307178, + "grad_norm": 0.4522213041782379, + "learning_rate": 8.969322668826568e-05, + "loss": 1.747, + "step": 2046 + }, + { + "epoch": 0.6213385946274094, + "grad_norm": 0.37111926078796387, + "learning_rate": 8.968816442239547e-05, + "loss": 1.9039, + "step": 2047 + }, + { + "epoch": 0.6216421308241008, + "grad_norm": 0.7616074681282043, + "learning_rate": 8.968310215652527e-05, + "loss": 2.0966, + "step": 2048 + }, + { + "epoch": 0.6219456670207922, + "grad_norm": 0.42669475078582764, + "learning_rate": 8.967803989065506e-05, + "loss": 2.0545, + "step": 2049 + }, + { + "epoch": 0.6222492032174837, + "grad_norm": 0.3741990625858307, + "learning_rate": 8.967297762478486e-05, + "loss": 1.9964, + "step": 2050 + }, + { + "epoch": 0.6225527394141751, + "grad_norm": 0.38759157061576843, + "learning_rate": 8.966791535891465e-05, + "loss": 1.8108, + "step": 2051 + }, + { + "epoch": 0.6228562756108666, + "grad_norm": 0.424344003200531, + "learning_rate": 8.966285309304446e-05, + "loss": 1.8411, + "step": 2052 + }, + { + "epoch": 0.6231598118075581, + "grad_norm": 0.3969878852367401, + "learning_rate": 8.965779082717425e-05, + "loss": 2.1543, + "step": 2053 + }, + { + "epoch": 0.6234633480042495, + "grad_norm": 0.4188143312931061, + "learning_rate": 8.965272856130405e-05, + "loss": 1.8696, + "step": 2054 + }, + { + "epoch": 0.623766884200941, + "grad_norm": 0.42061781883239746, + "learning_rate": 8.964766629543384e-05, + "loss": 2.0641, + "step": 2055 + }, + { + "epoch": 0.6240704203976324, + "grad_norm": 0.3898957371711731, + "learning_rate": 8.964260402956364e-05, + "loss": 1.9002, + "step": 2056 + }, + { + "epoch": 0.6243739565943238, + 
"grad_norm": 0.4503360688686371, + "learning_rate": 8.963754176369343e-05, + "loss": 1.8302, + "step": 2057 + }, + { + "epoch": 0.6246774927910154, + "grad_norm": 0.44356441497802734, + "learning_rate": 8.963247949782323e-05, + "loss": 1.589, + "step": 2058 + }, + { + "epoch": 0.6249810289877068, + "grad_norm": 0.3989812433719635, + "learning_rate": 8.962741723195302e-05, + "loss": 2.1592, + "step": 2059 + }, + { + "epoch": 0.6252845651843982, + "grad_norm": 0.3959946632385254, + "learning_rate": 8.962235496608282e-05, + "loss": 2.0769, + "step": 2060 + }, + { + "epoch": 0.6255881013810897, + "grad_norm": 0.37260061502456665, + "learning_rate": 8.961729270021262e-05, + "loss": 1.2118, + "step": 2061 + }, + { + "epoch": 0.6258916375777811, + "grad_norm": 0.5840566754341125, + "learning_rate": 8.961223043434242e-05, + "loss": 2.1389, + "step": 2062 + }, + { + "epoch": 0.6261951737744726, + "grad_norm": 0.44715970754623413, + "learning_rate": 8.960716816847221e-05, + "loss": 1.6985, + "step": 2063 + }, + { + "epoch": 0.6264987099711641, + "grad_norm": 0.40047672390937805, + "learning_rate": 8.960210590260201e-05, + "loss": 1.9951, + "step": 2064 + }, + { + "epoch": 0.6268022461678555, + "grad_norm": 0.4090017080307007, + "learning_rate": 8.95970436367318e-05, + "loss": 1.8258, + "step": 2065 + }, + { + "epoch": 0.627105782364547, + "grad_norm": 0.39617207646369934, + "learning_rate": 8.95919813708616e-05, + "loss": 1.6203, + "step": 2066 + }, + { + "epoch": 0.6274093185612384, + "grad_norm": 0.4236812889575958, + "learning_rate": 8.95869191049914e-05, + "loss": 1.9766, + "step": 2067 + }, + { + "epoch": 0.6277128547579299, + "grad_norm": 0.560946524143219, + "learning_rate": 8.958185683912119e-05, + "loss": 2.3476, + "step": 2068 + }, + { + "epoch": 0.6280163909546214, + "grad_norm": 0.4474948048591614, + "learning_rate": 8.957679457325098e-05, + "loss": 1.9045, + "step": 2069 + }, + { + "epoch": 0.6283199271513128, + "grad_norm": 0.47307664155960083, + "learning_rate": 8.957173230738078e-05, + "loss": 1.9436, + "step": 2070 + }, + { + "epoch": 0.6286234633480042, + "grad_norm": 0.4518156945705414, + "learning_rate": 8.956667004151059e-05, + "loss": 1.6307, + "step": 2071 + }, + { + "epoch": 0.6289269995446957, + "grad_norm": 0.3907441794872284, + "learning_rate": 8.956160777564038e-05, + "loss": 1.9797, + "step": 2072 + }, + { + "epoch": 0.6292305357413872, + "grad_norm": 0.7602722644805908, + "learning_rate": 8.955654550977018e-05, + "loss": 1.467, + "step": 2073 + }, + { + "epoch": 0.6295340719380786, + "grad_norm": 0.4778296947479248, + "learning_rate": 8.955148324389997e-05, + "loss": 1.592, + "step": 2074 + }, + { + "epoch": 0.6298376081347701, + "grad_norm": 0.5303634405136108, + "learning_rate": 8.954642097802977e-05, + "loss": 1.9301, + "step": 2075 + }, + { + "epoch": 0.6301411443314615, + "grad_norm": 0.37609922885894775, + "learning_rate": 8.954135871215956e-05, + "loss": 2.0062, + "step": 2076 + }, + { + "epoch": 0.630444680528153, + "grad_norm": 0.3961854875087738, + "learning_rate": 8.953629644628936e-05, + "loss": 2.0677, + "step": 2077 + }, + { + "epoch": 0.6307482167248445, + "grad_norm": 0.43167874217033386, + "learning_rate": 8.953123418041915e-05, + "loss": 1.7997, + "step": 2078 + }, + { + "epoch": 0.6310517529215359, + "grad_norm": 0.4458840489387512, + "learning_rate": 8.952617191454896e-05, + "loss": 1.6458, + "step": 2079 + }, + { + "epoch": 0.6313552891182274, + "grad_norm": 0.8174628615379333, + "learning_rate": 8.952110964867875e-05, + "loss": 1.3436, + "step": 
2080 + }, + { + "epoch": 0.6316588253149188, + "grad_norm": 0.40314528346061707, + "learning_rate": 8.951604738280855e-05, + "loss": 1.648, + "step": 2081 + }, + { + "epoch": 0.6319623615116102, + "grad_norm": 2.845505952835083, + "learning_rate": 8.951098511693836e-05, + "loss": 2.0645, + "step": 2082 + }, + { + "epoch": 0.6322658977083017, + "grad_norm": 0.41686686873435974, + "learning_rate": 8.950592285106815e-05, + "loss": 1.9434, + "step": 2083 + }, + { + "epoch": 0.6325694339049932, + "grad_norm": 5.579742431640625, + "learning_rate": 8.950086058519795e-05, + "loss": 2.1942, + "step": 2084 + }, + { + "epoch": 0.6328729701016846, + "grad_norm": 0.40614521503448486, + "learning_rate": 8.949579831932774e-05, + "loss": 1.7526, + "step": 2085 + }, + { + "epoch": 0.6331765062983761, + "grad_norm": 0.8609543442726135, + "learning_rate": 8.949073605345754e-05, + "loss": 1.4042, + "step": 2086 + }, + { + "epoch": 0.6334800424950675, + "grad_norm": 0.451594740152359, + "learning_rate": 8.948567378758733e-05, + "loss": 2.085, + "step": 2087 + }, + { + "epoch": 0.6337835786917589, + "grad_norm": 0.48546943068504333, + "learning_rate": 8.948061152171713e-05, + "loss": 1.8299, + "step": 2088 + }, + { + "epoch": 0.6340871148884505, + "grad_norm": 0.4435253143310547, + "learning_rate": 8.947554925584692e-05, + "loss": 1.9175, + "step": 2089 + }, + { + "epoch": 0.6343906510851419, + "grad_norm": 0.4109974801540375, + "learning_rate": 8.947048698997671e-05, + "loss": 1.6901, + "step": 2090 + }, + { + "epoch": 0.6346941872818334, + "grad_norm": 0.4205876290798187, + "learning_rate": 8.946542472410652e-05, + "loss": 1.6726, + "step": 2091 + }, + { + "epoch": 0.6349977234785248, + "grad_norm": 0.4449016749858856, + "learning_rate": 8.946036245823632e-05, + "loss": 2.2222, + "step": 2092 + }, + { + "epoch": 0.6353012596752162, + "grad_norm": 0.44236990809440613, + "learning_rate": 8.945530019236611e-05, + "loss": 2.2247, + "step": 2093 + }, + { + "epoch": 0.6356047958719078, + "grad_norm": 0.4700889587402344, + "learning_rate": 8.945023792649591e-05, + "loss": 1.6692, + "step": 2094 + }, + { + "epoch": 0.6359083320685992, + "grad_norm": 0.42525413632392883, + "learning_rate": 8.94451756606257e-05, + "loss": 1.5726, + "step": 2095 + }, + { + "epoch": 0.6362118682652906, + "grad_norm": 0.3753025233745575, + "learning_rate": 8.94401133947555e-05, + "loss": 1.8565, + "step": 2096 + }, + { + "epoch": 0.6365154044619821, + "grad_norm": 0.3908928632736206, + "learning_rate": 8.943505112888529e-05, + "loss": 1.9776, + "step": 2097 + }, + { + "epoch": 0.6368189406586735, + "grad_norm": 0.4409022927284241, + "learning_rate": 8.942998886301509e-05, + "loss": 1.8076, + "step": 2098 + }, + { + "epoch": 0.637122476855365, + "grad_norm": 0.4187740981578827, + "learning_rate": 8.942492659714488e-05, + "loss": 2.0177, + "step": 2099 + }, + { + "epoch": 0.6374260130520565, + "grad_norm": 0.4491542875766754, + "learning_rate": 8.941986433127469e-05, + "loss": 1.817, + "step": 2100 + }, + { + "epoch": 0.6377295492487479, + "grad_norm": 0.4964027404785156, + "learning_rate": 8.941480206540448e-05, + "loss": 1.618, + "step": 2101 + }, + { + "epoch": 0.6380330854454394, + "grad_norm": 0.4044201672077179, + "learning_rate": 8.940973979953428e-05, + "loss": 1.4983, + "step": 2102 + }, + { + "epoch": 0.6383366216421308, + "grad_norm": 0.4115463197231293, + "learning_rate": 8.940467753366407e-05, + "loss": 1.9043, + "step": 2103 + }, + { + "epoch": 0.6386401578388223, + "grad_norm": 0.39303481578826904, + "learning_rate": 
8.939961526779387e-05, + "loss": 1.434, + "step": 2104 + }, + { + "epoch": 0.6389436940355138, + "grad_norm": 0.3657127618789673, + "learning_rate": 8.939455300192366e-05, + "loss": 1.9805, + "step": 2105 + }, + { + "epoch": 0.6392472302322052, + "grad_norm": 0.414969265460968, + "learning_rate": 8.938949073605346e-05, + "loss": 1.745, + "step": 2106 + }, + { + "epoch": 0.6395507664288966, + "grad_norm": 0.4560664892196655, + "learning_rate": 8.938442847018325e-05, + "loss": 1.6991, + "step": 2107 + }, + { + "epoch": 0.6398543026255881, + "grad_norm": 0.4387153387069702, + "learning_rate": 8.937936620431305e-05, + "loss": 1.7517, + "step": 2108 + }, + { + "epoch": 0.6401578388222796, + "grad_norm": 0.39767786860466003, + "learning_rate": 8.937430393844284e-05, + "loss": 1.4668, + "step": 2109 + }, + { + "epoch": 0.640461375018971, + "grad_norm": 0.4568266272544861, + "learning_rate": 8.936924167257265e-05, + "loss": 1.7829, + "step": 2110 + }, + { + "epoch": 0.6407649112156625, + "grad_norm": 0.3790264427661896, + "learning_rate": 8.936417940670245e-05, + "loss": 1.8335, + "step": 2111 + }, + { + "epoch": 0.6410684474123539, + "grad_norm": 0.39457952976226807, + "learning_rate": 8.935911714083224e-05, + "loss": 1.687, + "step": 2112 + }, + { + "epoch": 0.6413719836090453, + "grad_norm": 0.32461151480674744, + "learning_rate": 8.935405487496204e-05, + "loss": 1.4628, + "step": 2113 + }, + { + "epoch": 0.6416755198057368, + "grad_norm": 0.36477747559547424, + "learning_rate": 8.934899260909183e-05, + "loss": 1.9856, + "step": 2114 + }, + { + "epoch": 0.6419790560024283, + "grad_norm": 0.4230240285396576, + "learning_rate": 8.934393034322163e-05, + "loss": 2.037, + "step": 2115 + }, + { + "epoch": 0.6422825921991198, + "grad_norm": 0.3885568380355835, + "learning_rate": 8.933886807735142e-05, + "loss": 1.7229, + "step": 2116 + }, + { + "epoch": 0.6425861283958112, + "grad_norm": 0.46484097838401794, + "learning_rate": 8.933380581148121e-05, + "loss": 1.9656, + "step": 2117 + }, + { + "epoch": 0.6428896645925026, + "grad_norm": 0.3922126591205597, + "learning_rate": 8.932874354561101e-05, + "loss": 1.652, + "step": 2118 + }, + { + "epoch": 0.6431932007891941, + "grad_norm": 0.4676629602909088, + "learning_rate": 8.932368127974082e-05, + "loss": 2.1305, + "step": 2119 + }, + { + "epoch": 0.6434967369858856, + "grad_norm": 0.3731312155723572, + "learning_rate": 8.931861901387061e-05, + "loss": 2.0093, + "step": 2120 + }, + { + "epoch": 0.643800273182577, + "grad_norm": 0.44040486216545105, + "learning_rate": 8.931355674800041e-05, + "loss": 1.9446, + "step": 2121 + }, + { + "epoch": 0.6441038093792685, + "grad_norm": 0.3713996112346649, + "learning_rate": 8.93084944821302e-05, + "loss": 2.1773, + "step": 2122 + }, + { + "epoch": 0.6444073455759599, + "grad_norm": 0.3798523247241974, + "learning_rate": 8.930343221626e-05, + "loss": 1.7056, + "step": 2123 + }, + { + "epoch": 0.6447108817726513, + "grad_norm": 0.4175238013267517, + "learning_rate": 8.92983699503898e-05, + "loss": 1.966, + "step": 2124 + }, + { + "epoch": 0.6450144179693429, + "grad_norm": 0.40957748889923096, + "learning_rate": 8.92933076845196e-05, + "loss": 1.6715, + "step": 2125 + }, + { + "epoch": 0.6453179541660343, + "grad_norm": 0.46979820728302, + "learning_rate": 8.92882454186494e-05, + "loss": 1.8604, + "step": 2126 + }, + { + "epoch": 0.6456214903627258, + "grad_norm": 0.3671952188014984, + "learning_rate": 8.928318315277919e-05, + "loss": 1.1596, + "step": 2127 + }, + { + "epoch": 0.6459250265594172, + "grad_norm": 
0.363288551568985, + "learning_rate": 8.927812088690898e-05, + "loss": 2.14, + "step": 2128 + }, + { + "epoch": 0.6462285627561086, + "grad_norm": 0.3632570505142212, + "learning_rate": 8.927305862103878e-05, + "loss": 2.0746, + "step": 2129 + }, + { + "epoch": 0.6465320989528002, + "grad_norm": 0.5912741422653198, + "learning_rate": 8.926799635516859e-05, + "loss": 2.1828, + "step": 2130 + }, + { + "epoch": 0.6468356351494916, + "grad_norm": 0.3740077018737793, + "learning_rate": 8.926293408929838e-05, + "loss": 1.9439, + "step": 2131 + }, + { + "epoch": 0.647139171346183, + "grad_norm": 0.5042386651039124, + "learning_rate": 8.925787182342818e-05, + "loss": 1.1905, + "step": 2132 + }, + { + "epoch": 0.6474427075428745, + "grad_norm": 0.39761942625045776, + "learning_rate": 8.925280955755797e-05, + "loss": 1.3763, + "step": 2133 + }, + { + "epoch": 0.6477462437395659, + "grad_norm": 0.6671484112739563, + "learning_rate": 8.924774729168777e-05, + "loss": 2.2412, + "step": 2134 + }, + { + "epoch": 0.6480497799362575, + "grad_norm": 0.40470197796821594, + "learning_rate": 8.924268502581756e-05, + "loss": 2.0007, + "step": 2135 + }, + { + "epoch": 0.6483533161329489, + "grad_norm": 1.5381660461425781, + "learning_rate": 8.923762275994736e-05, + "loss": 2.101, + "step": 2136 + }, + { + "epoch": 0.6486568523296403, + "grad_norm": 0.39186039566993713, + "learning_rate": 8.923256049407715e-05, + "loss": 1.8134, + "step": 2137 + }, + { + "epoch": 0.6489603885263318, + "grad_norm": 0.351701021194458, + "learning_rate": 8.922749822820695e-05, + "loss": 1.6034, + "step": 2138 + }, + { + "epoch": 0.6492639247230232, + "grad_norm": 1.6814361810684204, + "learning_rate": 8.922243596233675e-05, + "loss": 1.6059, + "step": 2139 + }, + { + "epoch": 0.6495674609197146, + "grad_norm": 0.4578597843647003, + "learning_rate": 8.921737369646655e-05, + "loss": 1.5098, + "step": 2140 + }, + { + "epoch": 0.6498709971164062, + "grad_norm": 0.44496893882751465, + "learning_rate": 8.921231143059634e-05, + "loss": 2.0494, + "step": 2141 + }, + { + "epoch": 0.6501745333130976, + "grad_norm": 0.3577191233634949, + "learning_rate": 8.920724916472614e-05, + "loss": 1.8182, + "step": 2142 + }, + { + "epoch": 0.650478069509789, + "grad_norm": 0.42032182216644287, + "learning_rate": 8.920218689885593e-05, + "loss": 2.0543, + "step": 2143 + }, + { + "epoch": 0.6507816057064805, + "grad_norm": 0.3442663550376892, + "learning_rate": 8.919712463298573e-05, + "loss": 1.699, + "step": 2144 + }, + { + "epoch": 0.6510851419031719, + "grad_norm": 0.479557067155838, + "learning_rate": 8.919206236711552e-05, + "loss": 2.3661, + "step": 2145 + }, + { + "epoch": 0.6513886780998634, + "grad_norm": 0.4386119246482849, + "learning_rate": 8.918700010124532e-05, + "loss": 1.9253, + "step": 2146 + }, + { + "epoch": 0.6516922142965549, + "grad_norm": 0.38390249013900757, + "learning_rate": 8.918193783537511e-05, + "loss": 1.7774, + "step": 2147 + }, + { + "epoch": 0.6519957504932463, + "grad_norm": 0.3760508596897125, + "learning_rate": 8.917687556950491e-05, + "loss": 2.0236, + "step": 2148 + }, + { + "epoch": 0.6522992866899378, + "grad_norm": 0.41757336258888245, + "learning_rate": 8.917181330363472e-05, + "loss": 1.8861, + "step": 2149 + }, + { + "epoch": 0.6526028228866292, + "grad_norm": 0.40840038657188416, + "learning_rate": 8.916675103776451e-05, + "loss": 1.405, + "step": 2150 + }, + { + "epoch": 0.6529063590833207, + "grad_norm": 0.3661898672580719, + "learning_rate": 8.91616887718943e-05, + "loss": 1.6111, + "step": 2151 + }, + { + 
"epoch": 0.6532098952800122, + "grad_norm": 0.42466968297958374, + "learning_rate": 8.91566265060241e-05, + "loss": 1.6713, + "step": 2152 + }, + { + "epoch": 0.6535134314767036, + "grad_norm": 0.5033214092254639, + "learning_rate": 8.91515642401539e-05, + "loss": 2.0999, + "step": 2153 + }, + { + "epoch": 0.653816967673395, + "grad_norm": 0.3836124837398529, + "learning_rate": 8.914650197428369e-05, + "loss": 2.2854, + "step": 2154 + }, + { + "epoch": 0.6541205038700865, + "grad_norm": 0.42189982533454895, + "learning_rate": 8.914143970841348e-05, + "loss": 2.154, + "step": 2155 + }, + { + "epoch": 0.654424040066778, + "grad_norm": 0.3981611430644989, + "learning_rate": 8.913637744254328e-05, + "loss": 1.6845, + "step": 2156 + }, + { + "epoch": 0.6547275762634694, + "grad_norm": 0.4584210515022278, + "learning_rate": 8.913131517667307e-05, + "loss": 1.8831, + "step": 2157 + }, + { + "epoch": 0.6550311124601609, + "grad_norm": 0.42922207713127136, + "learning_rate": 8.912625291080288e-05, + "loss": 1.8187, + "step": 2158 + }, + { + "epoch": 0.6553346486568523, + "grad_norm": 0.4891490638256073, + "learning_rate": 8.912119064493268e-05, + "loss": 2.0393, + "step": 2159 + }, + { + "epoch": 0.6556381848535437, + "grad_norm": 0.44946572184562683, + "learning_rate": 8.911612837906247e-05, + "loss": 2.0362, + "step": 2160 + }, + { + "epoch": 0.6559417210502353, + "grad_norm": 0.5170040726661682, + "learning_rate": 8.911106611319227e-05, + "loss": 1.0148, + "step": 2161 + }, + { + "epoch": 0.6562452572469267, + "grad_norm": 0.45176056027412415, + "learning_rate": 8.910600384732206e-05, + "loss": 1.9087, + "step": 2162 + }, + { + "epoch": 0.6565487934436182, + "grad_norm": 0.3974052965641022, + "learning_rate": 8.910094158145186e-05, + "loss": 1.7759, + "step": 2163 + }, + { + "epoch": 0.6568523296403096, + "grad_norm": 0.4142087399959564, + "learning_rate": 8.909587931558165e-05, + "loss": 1.8639, + "step": 2164 + }, + { + "epoch": 0.657155865837001, + "grad_norm": 0.4220983386039734, + "learning_rate": 8.909081704971145e-05, + "loss": 1.4122, + "step": 2165 + }, + { + "epoch": 0.6574594020336925, + "grad_norm": 0.37949880957603455, + "learning_rate": 8.908575478384124e-05, + "loss": 1.9989, + "step": 2166 + }, + { + "epoch": 0.657762938230384, + "grad_norm": 0.35547998547554016, + "learning_rate": 8.908069251797105e-05, + "loss": 1.9514, + "step": 2167 + }, + { + "epoch": 0.6580664744270754, + "grad_norm": 0.4009557366371155, + "learning_rate": 8.907563025210084e-05, + "loss": 1.7043, + "step": 2168 + }, + { + "epoch": 0.6583700106237669, + "grad_norm": 0.38969942927360535, + "learning_rate": 8.907056798623065e-05, + "loss": 1.8512, + "step": 2169 + }, + { + "epoch": 0.6586735468204583, + "grad_norm": 0.4015234708786011, + "learning_rate": 8.906550572036045e-05, + "loss": 1.9016, + "step": 2170 + }, + { + "epoch": 0.6589770830171497, + "grad_norm": 0.45555707812309265, + "learning_rate": 8.906044345449024e-05, + "loss": 2.1088, + "step": 2171 + }, + { + "epoch": 0.6592806192138413, + "grad_norm": 0.3557066321372986, + "learning_rate": 8.905538118862004e-05, + "loss": 1.6273, + "step": 2172 + }, + { + "epoch": 0.6595841554105327, + "grad_norm": 0.44995880126953125, + "learning_rate": 8.905031892274983e-05, + "loss": 1.7946, + "step": 2173 + }, + { + "epoch": 0.6598876916072242, + "grad_norm": 0.40973517298698425, + "learning_rate": 8.904525665687963e-05, + "loss": 1.7571, + "step": 2174 + }, + { + "epoch": 0.6601912278039156, + "grad_norm": 0.3300071656703949, + "learning_rate": 
8.904019439100942e-05, + "loss": 1.2977, + "step": 2175 + }, + { + "epoch": 0.660494764000607, + "grad_norm": 0.4011610746383667, + "learning_rate": 8.903513212513922e-05, + "loss": 2.0934, + "step": 2176 + }, + { + "epoch": 0.6607983001972986, + "grad_norm": 0.35637664794921875, + "learning_rate": 8.903006985926901e-05, + "loss": 1.9632, + "step": 2177 + }, + { + "epoch": 0.66110183639399, + "grad_norm": 0.45524492859840393, + "learning_rate": 8.902500759339882e-05, + "loss": 1.8951, + "step": 2178 + }, + { + "epoch": 0.6614053725906814, + "grad_norm": 0.45453348755836487, + "learning_rate": 8.901994532752861e-05, + "loss": 1.84, + "step": 2179 + }, + { + "epoch": 0.6617089087873729, + "grad_norm": 0.4106372892856598, + "learning_rate": 8.901488306165841e-05, + "loss": 2.171, + "step": 2180 + }, + { + "epoch": 0.6620124449840643, + "grad_norm": 0.6188797950744629, + "learning_rate": 8.90098207957882e-05, + "loss": 1.3866, + "step": 2181 + }, + { + "epoch": 0.6623159811807559, + "grad_norm": 0.3466598093509674, + "learning_rate": 8.9004758529918e-05, + "loss": 1.7782, + "step": 2182 + }, + { + "epoch": 0.6626195173774473, + "grad_norm": 0.4912582337856293, + "learning_rate": 8.899969626404779e-05, + "loss": 1.2761, + "step": 2183 + }, + { + "epoch": 0.6629230535741387, + "grad_norm": 0.46108344197273254, + "learning_rate": 8.899463399817759e-05, + "loss": 1.4373, + "step": 2184 + }, + { + "epoch": 0.6632265897708302, + "grad_norm": 0.5269731879234314, + "learning_rate": 8.898957173230738e-05, + "loss": 1.4146, + "step": 2185 + }, + { + "epoch": 0.6635301259675216, + "grad_norm": 0.4078417122364044, + "learning_rate": 8.898450946643718e-05, + "loss": 2.2392, + "step": 2186 + }, + { + "epoch": 0.6638336621642131, + "grad_norm": 0.36829376220703125, + "learning_rate": 8.897944720056697e-05, + "loss": 2.1447, + "step": 2187 + }, + { + "epoch": 0.6641371983609046, + "grad_norm": 0.38769134879112244, + "learning_rate": 8.897438493469678e-05, + "loss": 1.5877, + "step": 2188 + }, + { + "epoch": 0.664440734557596, + "grad_norm": 0.3847033381462097, + "learning_rate": 8.896932266882658e-05, + "loss": 1.9341, + "step": 2189 + }, + { + "epoch": 0.6647442707542874, + "grad_norm": 0.47725987434387207, + "learning_rate": 8.896426040295637e-05, + "loss": 2.122, + "step": 2190 + }, + { + "epoch": 0.6650478069509789, + "grad_norm": 0.4192405045032501, + "learning_rate": 8.895919813708617e-05, + "loss": 2.1324, + "step": 2191 + }, + { + "epoch": 0.6653513431476703, + "grad_norm": 0.5160967707633972, + "learning_rate": 8.895413587121596e-05, + "loss": 1.4909, + "step": 2192 + }, + { + "epoch": 0.6656548793443618, + "grad_norm": 0.437773734331131, + "learning_rate": 8.894907360534575e-05, + "loss": 1.7785, + "step": 2193 + }, + { + "epoch": 0.6659584155410533, + "grad_norm": 0.36092495918273926, + "learning_rate": 8.894401133947555e-05, + "loss": 1.9376, + "step": 2194 + }, + { + "epoch": 0.6662619517377447, + "grad_norm": 0.4263538122177124, + "learning_rate": 8.893894907360534e-05, + "loss": 2.1226, + "step": 2195 + }, + { + "epoch": 0.6665654879344362, + "grad_norm": 0.41431042551994324, + "learning_rate": 8.893388680773514e-05, + "loss": 2.1941, + "step": 2196 + }, + { + "epoch": 0.6668690241311276, + "grad_norm": 0.4508149027824402, + "learning_rate": 8.892882454186495e-05, + "loss": 1.8846, + "step": 2197 + }, + { + "epoch": 0.6671725603278191, + "grad_norm": 0.3481595516204834, + "learning_rate": 8.892376227599474e-05, + "loss": 1.9913, + "step": 2198 + }, + { + "epoch": 0.6674760965245106, + 
"grad_norm": 0.420114129781723, + "learning_rate": 8.891870001012454e-05, + "loss": 1.5997, + "step": 2199 + }, + { + "epoch": 0.667779632721202, + "grad_norm": 0.44123902916908264, + "learning_rate": 8.891363774425433e-05, + "loss": 2.112, + "step": 2200 + }, + { + "epoch": 0.6680831689178934, + "grad_norm": 0.4254309833049774, + "learning_rate": 8.890857547838413e-05, + "loss": 1.8562, + "step": 2201 + }, + { + "epoch": 0.6683867051145849, + "grad_norm": 0.3660505414009094, + "learning_rate": 8.890351321251392e-05, + "loss": 1.8612, + "step": 2202 + }, + { + "epoch": 0.6686902413112764, + "grad_norm": 0.3969692587852478, + "learning_rate": 8.889845094664372e-05, + "loss": 2.2214, + "step": 2203 + }, + { + "epoch": 0.6689937775079678, + "grad_norm": 0.36976855993270874, + "learning_rate": 8.889338868077351e-05, + "loss": 1.8667, + "step": 2204 + }, + { + "epoch": 0.6692973137046593, + "grad_norm": 0.44680026173591614, + "learning_rate": 8.88883264149033e-05, + "loss": 2.0648, + "step": 2205 + }, + { + "epoch": 0.6696008499013507, + "grad_norm": 0.40876418352127075, + "learning_rate": 8.888326414903311e-05, + "loss": 2.027, + "step": 2206 + }, + { + "epoch": 0.6699043860980421, + "grad_norm": 0.35822147130966187, + "learning_rate": 8.887820188316291e-05, + "loss": 1.8987, + "step": 2207 + }, + { + "epoch": 0.6702079222947337, + "grad_norm": 0.41419175267219543, + "learning_rate": 8.88731396172927e-05, + "loss": 2.114, + "step": 2208 + }, + { + "epoch": 0.6705114584914251, + "grad_norm": 0.3790142834186554, + "learning_rate": 8.88680773514225e-05, + "loss": 1.8878, + "step": 2209 + }, + { + "epoch": 0.6708149946881166, + "grad_norm": 0.42493680119514465, + "learning_rate": 8.88630150855523e-05, + "loss": 1.8914, + "step": 2210 + }, + { + "epoch": 0.671118530884808, + "grad_norm": 0.34427767992019653, + "learning_rate": 8.885795281968209e-05, + "loss": 2.1252, + "step": 2211 + }, + { + "epoch": 0.6714220670814994, + "grad_norm": 0.43361228704452515, + "learning_rate": 8.885289055381188e-05, + "loss": 1.677, + "step": 2212 + }, + { + "epoch": 0.671725603278191, + "grad_norm": 0.3793098032474518, + "learning_rate": 8.884782828794169e-05, + "loss": 1.9914, + "step": 2213 + }, + { + "epoch": 0.6720291394748824, + "grad_norm": 0.4355357587337494, + "learning_rate": 8.884276602207149e-05, + "loss": 2.0324, + "step": 2214 + }, + { + "epoch": 0.6723326756715738, + "grad_norm": 0.41514819860458374, + "learning_rate": 8.883770375620128e-05, + "loss": 1.6874, + "step": 2215 + }, + { + "epoch": 0.6726362118682653, + "grad_norm": 0.4044744074344635, + "learning_rate": 8.883264149033108e-05, + "loss": 1.8879, + "step": 2216 + }, + { + "epoch": 0.6729397480649567, + "grad_norm": 0.38892972469329834, + "learning_rate": 8.882757922446088e-05, + "loss": 1.8009, + "step": 2217 + }, + { + "epoch": 0.6732432842616483, + "grad_norm": 0.41450080275535583, + "learning_rate": 8.882251695859068e-05, + "loss": 1.7411, + "step": 2218 + }, + { + "epoch": 0.6735468204583397, + "grad_norm": 0.41548603773117065, + "learning_rate": 8.881745469272047e-05, + "loss": 2.158, + "step": 2219 + }, + { + "epoch": 0.6738503566550311, + "grad_norm": 0.4178054928779602, + "learning_rate": 8.881239242685027e-05, + "loss": 1.7454, + "step": 2220 + }, + { + "epoch": 0.6741538928517226, + "grad_norm": 1.661353588104248, + "learning_rate": 8.880733016098006e-05, + "loss": 1.6626, + "step": 2221 + }, + { + "epoch": 0.674457429048414, + "grad_norm": 0.40055370330810547, + "learning_rate": 8.880226789510986e-05, + "loss": 1.9827, + "step": 
2222 + }, + { + "epoch": 0.6747609652451054, + "grad_norm": 0.41323450207710266, + "learning_rate": 8.879720562923965e-05, + "loss": 1.6335, + "step": 2223 + }, + { + "epoch": 0.675064501441797, + "grad_norm": 0.4238756597042084, + "learning_rate": 8.879214336336945e-05, + "loss": 1.7076, + "step": 2224 + }, + { + "epoch": 0.6753680376384884, + "grad_norm": 0.39978405833244324, + "learning_rate": 8.878708109749924e-05, + "loss": 1.6837, + "step": 2225 + }, + { + "epoch": 0.6756715738351798, + "grad_norm": 0.4585546851158142, + "learning_rate": 8.878201883162904e-05, + "loss": 2.0821, + "step": 2226 + }, + { + "epoch": 0.6759751100318713, + "grad_norm": 0.40500447154045105, + "learning_rate": 8.877695656575885e-05, + "loss": 1.7139, + "step": 2227 + }, + { + "epoch": 0.6762786462285627, + "grad_norm": 0.4829038381576538, + "learning_rate": 8.877189429988864e-05, + "loss": 1.9029, + "step": 2228 + }, + { + "epoch": 0.6765821824252543, + "grad_norm": 0.4088328182697296, + "learning_rate": 8.876683203401844e-05, + "loss": 2.0233, + "step": 2229 + }, + { + "epoch": 0.6768857186219457, + "grad_norm": 0.4438897967338562, + "learning_rate": 8.876176976814823e-05, + "loss": 1.824, + "step": 2230 + }, + { + "epoch": 0.6771892548186371, + "grad_norm": 0.4573661684989929, + "learning_rate": 8.875670750227802e-05, + "loss": 2.0605, + "step": 2231 + }, + { + "epoch": 0.6774927910153286, + "grad_norm": 0.5133582949638367, + "learning_rate": 8.875164523640782e-05, + "loss": 1.7161, + "step": 2232 + }, + { + "epoch": 0.67779632721202, + "grad_norm": 0.3775865137577057, + "learning_rate": 8.874658297053761e-05, + "loss": 1.513, + "step": 2233 + }, + { + "epoch": 0.6780998634087115, + "grad_norm": 0.4122471809387207, + "learning_rate": 8.874152070466741e-05, + "loss": 1.8155, + "step": 2234 + }, + { + "epoch": 0.678403399605403, + "grad_norm": 0.6338900327682495, + "learning_rate": 8.87364584387972e-05, + "loss": 1.5857, + "step": 2235 + }, + { + "epoch": 0.6787069358020944, + "grad_norm": 0.41020557284355164, + "learning_rate": 8.873139617292701e-05, + "loss": 1.7888, + "step": 2236 + }, + { + "epoch": 0.6790104719987858, + "grad_norm": 0.3700268268585205, + "learning_rate": 8.872633390705681e-05, + "loss": 1.5622, + "step": 2237 + }, + { + "epoch": 0.6793140081954773, + "grad_norm": 0.4358409345149994, + "learning_rate": 8.87212716411866e-05, + "loss": 2.0885, + "step": 2238 + }, + { + "epoch": 0.6796175443921688, + "grad_norm": 0.4212052822113037, + "learning_rate": 8.87162093753164e-05, + "loss": 2.0268, + "step": 2239 + }, + { + "epoch": 0.6799210805888602, + "grad_norm": 0.7132793068885803, + "learning_rate": 8.871114710944619e-05, + "loss": 2.0234, + "step": 2240 + }, + { + "epoch": 0.6802246167855517, + "grad_norm": 0.38493213057518005, + "learning_rate": 8.870608484357599e-05, + "loss": 1.9204, + "step": 2241 + }, + { + "epoch": 0.6805281529822431, + "grad_norm": 0.3852492570877075, + "learning_rate": 8.870102257770578e-05, + "loss": 1.8373, + "step": 2242 + }, + { + "epoch": 0.6808316891789346, + "grad_norm": 0.5180450081825256, + "learning_rate": 8.869596031183558e-05, + "loss": 1.3947, + "step": 2243 + }, + { + "epoch": 0.6811352253756261, + "grad_norm": 0.46512570977211, + "learning_rate": 8.869089804596537e-05, + "loss": 2.2241, + "step": 2244 + }, + { + "epoch": 0.6814387615723175, + "grad_norm": 0.387101411819458, + "learning_rate": 8.868583578009518e-05, + "loss": 1.7226, + "step": 2245 + }, + { + "epoch": 0.681742297769009, + "grad_norm": 0.40807807445526123, + "learning_rate": 
8.868077351422497e-05, + "loss": 1.709, + "step": 2246 + }, + { + "epoch": 0.6820458339657004, + "grad_norm": 0.4039689600467682, + "learning_rate": 8.867571124835477e-05, + "loss": 1.6902, + "step": 2247 + }, + { + "epoch": 0.6823493701623918, + "grad_norm": 0.7101170420646667, + "learning_rate": 8.867064898248456e-05, + "loss": 1.6965, + "step": 2248 + }, + { + "epoch": 0.6826529063590833, + "grad_norm": 0.42346715927124023, + "learning_rate": 8.866558671661436e-05, + "loss": 1.4751, + "step": 2249 + }, + { + "epoch": 0.6829564425557748, + "grad_norm": 0.42237260937690735, + "learning_rate": 8.866052445074415e-05, + "loss": 1.8187, + "step": 2250 + }, + { + "epoch": 0.6832599787524662, + "grad_norm": 0.4752514958381653, + "learning_rate": 8.865546218487395e-05, + "loss": 1.7068, + "step": 2251 + }, + { + "epoch": 0.6835635149491577, + "grad_norm": 0.4356227219104767, + "learning_rate": 8.865039991900374e-05, + "loss": 2.0188, + "step": 2252 + }, + { + "epoch": 0.6838670511458491, + "grad_norm": 0.5964135527610779, + "learning_rate": 8.864533765313354e-05, + "loss": 1.5663, + "step": 2253 + }, + { + "epoch": 0.6841705873425405, + "grad_norm": 0.40307527780532837, + "learning_rate": 8.864027538726333e-05, + "loss": 1.8429, + "step": 2254 + }, + { + "epoch": 0.6844741235392321, + "grad_norm": 0.4318184554576874, + "learning_rate": 8.863521312139314e-05, + "loss": 1.7906, + "step": 2255 + }, + { + "epoch": 0.6847776597359235, + "grad_norm": 0.4366863965988159, + "learning_rate": 8.863015085552294e-05, + "loss": 1.9137, + "step": 2256 + }, + { + "epoch": 0.685081195932615, + "grad_norm": 0.4154497981071472, + "learning_rate": 8.862508858965274e-05, + "loss": 2.0196, + "step": 2257 + }, + { + "epoch": 0.6853847321293064, + "grad_norm": 0.39866191148757935, + "learning_rate": 8.862002632378254e-05, + "loss": 1.9397, + "step": 2258 + }, + { + "epoch": 0.6856882683259978, + "grad_norm": 0.42318910360336304, + "learning_rate": 8.861496405791233e-05, + "loss": 2.1399, + "step": 2259 + }, + { + "epoch": 0.6859918045226894, + "grad_norm": 0.4558073878288269, + "learning_rate": 8.860990179204213e-05, + "loss": 1.9519, + "step": 2260 + }, + { + "epoch": 0.6862953407193808, + "grad_norm": 0.45745977759361267, + "learning_rate": 8.860483952617192e-05, + "loss": 1.3459, + "step": 2261 + }, + { + "epoch": 0.6865988769160722, + "grad_norm": 0.4418570399284363, + "learning_rate": 8.859977726030172e-05, + "loss": 1.9207, + "step": 2262 + }, + { + "epoch": 0.6869024131127637, + "grad_norm": 0.3995778560638428, + "learning_rate": 8.859471499443151e-05, + "loss": 1.8226, + "step": 2263 + }, + { + "epoch": 0.6872059493094551, + "grad_norm": 0.5238348841667175, + "learning_rate": 8.858965272856131e-05, + "loss": 1.7849, + "step": 2264 + }, + { + "epoch": 0.6875094855061467, + "grad_norm": 0.3357613682746887, + "learning_rate": 8.85845904626911e-05, + "loss": 1.4962, + "step": 2265 + }, + { + "epoch": 0.6878130217028381, + "grad_norm": 0.45454543828964233, + "learning_rate": 8.857952819682091e-05, + "loss": 1.8111, + "step": 2266 + }, + { + "epoch": 0.6881165578995295, + "grad_norm": 0.4192381501197815, + "learning_rate": 8.85744659309507e-05, + "loss": 1.7505, + "step": 2267 + }, + { + "epoch": 0.688420094096221, + "grad_norm": 0.4213124215602875, + "learning_rate": 8.85694036650805e-05, + "loss": 1.9307, + "step": 2268 + }, + { + "epoch": 0.6887236302929124, + "grad_norm": 0.42022505402565, + "learning_rate": 8.85643413992103e-05, + "loss": 1.9889, + "step": 2269 + }, + { + "epoch": 0.6890271664896039, + 
"grad_norm": 0.42116105556488037, + "learning_rate": 8.855927913334009e-05, + "loss": 1.7128, + "step": 2270 + }, + { + "epoch": 0.6893307026862954, + "grad_norm": 1.0270413160324097, + "learning_rate": 8.855421686746988e-05, + "loss": 1.3238, + "step": 2271 + }, + { + "epoch": 0.6896342388829868, + "grad_norm": 0.42168179154396057, + "learning_rate": 8.854915460159968e-05, + "loss": 1.6624, + "step": 2272 + }, + { + "epoch": 0.6899377750796782, + "grad_norm": 0.3498344421386719, + "learning_rate": 8.854409233572947e-05, + "loss": 1.7382, + "step": 2273 + }, + { + "epoch": 0.6902413112763697, + "grad_norm": 0.42905229330062866, + "learning_rate": 8.853903006985927e-05, + "loss": 2.0058, + "step": 2274 + }, + { + "epoch": 0.6905448474730611, + "grad_norm": 0.41980302333831787, + "learning_rate": 8.853396780398908e-05, + "loss": 1.4661, + "step": 2275 + }, + { + "epoch": 0.6908483836697527, + "grad_norm": 0.5022958517074585, + "learning_rate": 8.852890553811887e-05, + "loss": 1.6155, + "step": 2276 + }, + { + "epoch": 0.6911519198664441, + "grad_norm": 0.4186939597129822, + "learning_rate": 8.852384327224867e-05, + "loss": 1.7008, + "step": 2277 + }, + { + "epoch": 0.6914554560631355, + "grad_norm": 0.39082199335098267, + "learning_rate": 8.851878100637846e-05, + "loss": 2.1537, + "step": 2278 + }, + { + "epoch": 0.691758992259827, + "grad_norm": 0.42378294467926025, + "learning_rate": 8.851371874050826e-05, + "loss": 2.0288, + "step": 2279 + }, + { + "epoch": 0.6920625284565184, + "grad_norm": 0.36108916997909546, + "learning_rate": 8.850865647463805e-05, + "loss": 2.0387, + "step": 2280 + }, + { + "epoch": 0.6923660646532099, + "grad_norm": 0.4613724648952484, + "learning_rate": 8.850359420876785e-05, + "loss": 1.7304, + "step": 2281 + }, + { + "epoch": 0.6926696008499014, + "grad_norm": 0.4140026867389679, + "learning_rate": 8.849853194289764e-05, + "loss": 1.9746, + "step": 2282 + }, + { + "epoch": 0.6929731370465928, + "grad_norm": 0.43233832716941833, + "learning_rate": 8.849346967702744e-05, + "loss": 1.9922, + "step": 2283 + }, + { + "epoch": 0.6932766732432842, + "grad_norm": 0.8021528124809265, + "learning_rate": 8.848840741115724e-05, + "loss": 2.1604, + "step": 2284 + }, + { + "epoch": 0.6935802094399757, + "grad_norm": 0.4009002447128296, + "learning_rate": 8.848334514528704e-05, + "loss": 1.4224, + "step": 2285 + }, + { + "epoch": 0.6938837456366672, + "grad_norm": 0.3914124369621277, + "learning_rate": 8.847828287941683e-05, + "loss": 1.8222, + "step": 2286 + }, + { + "epoch": 0.6941872818333586, + "grad_norm": 0.41309481859207153, + "learning_rate": 8.847322061354663e-05, + "loss": 1.9296, + "step": 2287 + }, + { + "epoch": 0.6944908180300501, + "grad_norm": 0.5561639666557312, + "learning_rate": 8.846815834767642e-05, + "loss": 1.3577, + "step": 2288 + }, + { + "epoch": 0.6947943542267415, + "grad_norm": 0.41699445247650146, + "learning_rate": 8.846309608180622e-05, + "loss": 1.8751, + "step": 2289 + }, + { + "epoch": 0.695097890423433, + "grad_norm": 0.3643866181373596, + "learning_rate": 8.845803381593601e-05, + "loss": 1.5099, + "step": 2290 + }, + { + "epoch": 0.6954014266201245, + "grad_norm": 0.44212576746940613, + "learning_rate": 8.845297155006581e-05, + "loss": 1.8293, + "step": 2291 + }, + { + "epoch": 0.6957049628168159, + "grad_norm": 0.36881545186042786, + "learning_rate": 8.84479092841956e-05, + "loss": 1.6359, + "step": 2292 + }, + { + "epoch": 0.6960084990135074, + "grad_norm": 0.3785519003868103, + "learning_rate": 8.84428470183254e-05, + "loss": 1.63, + 
"step": 2293 + }, + { + "epoch": 0.6963120352101988, + "grad_norm": 0.6767301559448242, + "learning_rate": 8.84377847524552e-05, + "loss": 1.0786, + "step": 2294 + }, + { + "epoch": 0.6966155714068902, + "grad_norm": 0.38619041442871094, + "learning_rate": 8.8432722486585e-05, + "loss": 1.4952, + "step": 2295 + }, + { + "epoch": 0.6969191076035818, + "grad_norm": 0.6698863506317139, + "learning_rate": 8.84276602207148e-05, + "loss": 2.0425, + "step": 2296 + }, + { + "epoch": 0.6972226438002732, + "grad_norm": 0.4446139931678772, + "learning_rate": 8.842259795484459e-05, + "loss": 1.7511, + "step": 2297 + }, + { + "epoch": 0.6975261799969646, + "grad_norm": 0.6287797093391418, + "learning_rate": 8.841753568897438e-05, + "loss": 1.8198, + "step": 2298 + }, + { + "epoch": 0.6978297161936561, + "grad_norm": 0.3704979717731476, + "learning_rate": 8.841247342310418e-05, + "loss": 1.44, + "step": 2299 + }, + { + "epoch": 0.6981332523903475, + "grad_norm": 0.4163188636302948, + "learning_rate": 8.840741115723397e-05, + "loss": 1.623, + "step": 2300 + }, + { + "epoch": 0.6984367885870391, + "grad_norm": 0.3959861993789673, + "learning_rate": 8.840234889136377e-05, + "loss": 1.9259, + "step": 2301 + }, + { + "epoch": 0.6987403247837305, + "grad_norm": 0.5066584348678589, + "learning_rate": 8.839728662549358e-05, + "loss": 1.8163, + "step": 2302 + }, + { + "epoch": 0.6990438609804219, + "grad_norm": 0.4553223252296448, + "learning_rate": 8.839222435962337e-05, + "loss": 1.4527, + "step": 2303 + }, + { + "epoch": 0.6993473971771134, + "grad_norm": 0.47616320848464966, + "learning_rate": 8.838716209375317e-05, + "loss": 1.8694, + "step": 2304 + }, + { + "epoch": 0.6996509333738048, + "grad_norm": 0.4735199511051178, + "learning_rate": 8.838209982788297e-05, + "loss": 1.7023, + "step": 2305 + }, + { + "epoch": 0.6999544695704962, + "grad_norm": 0.427415668964386, + "learning_rate": 8.837703756201277e-05, + "loss": 2.0255, + "step": 2306 + }, + { + "epoch": 0.7002580057671878, + "grad_norm": 0.35021111369132996, + "learning_rate": 8.837197529614256e-05, + "loss": 2.1431, + "step": 2307 + }, + { + "epoch": 0.7005615419638792, + "grad_norm": 0.35905367136001587, + "learning_rate": 8.836691303027236e-05, + "loss": 1.2157, + "step": 2308 + }, + { + "epoch": 0.7008650781605706, + "grad_norm": 0.39521825313568115, + "learning_rate": 8.836185076440215e-05, + "loss": 1.8051, + "step": 2309 + }, + { + "epoch": 0.7011686143572621, + "grad_norm": 0.42580053210258484, + "learning_rate": 8.835678849853195e-05, + "loss": 1.9845, + "step": 2310 + }, + { + "epoch": 0.7014721505539535, + "grad_norm": 0.6940016150474548, + "learning_rate": 8.835172623266174e-05, + "loss": 2.0824, + "step": 2311 + }, + { + "epoch": 0.7017756867506451, + "grad_norm": 0.44518351554870605, + "learning_rate": 8.834666396679154e-05, + "loss": 2.0919, + "step": 2312 + }, + { + "epoch": 0.7020792229473365, + "grad_norm": 0.4215528070926666, + "learning_rate": 8.834160170092133e-05, + "loss": 1.9665, + "step": 2313 + }, + { + "epoch": 0.7023827591440279, + "grad_norm": 0.4305053651332855, + "learning_rate": 8.833653943505114e-05, + "loss": 2.0338, + "step": 2314 + }, + { + "epoch": 0.7026862953407194, + "grad_norm": 0.7952874302864075, + "learning_rate": 8.833147716918094e-05, + "loss": 1.6609, + "step": 2315 + }, + { + "epoch": 0.7029898315374108, + "grad_norm": 0.42054691910743713, + "learning_rate": 8.832641490331073e-05, + "loss": 1.3244, + "step": 2316 + }, + { + "epoch": 0.7032933677341023, + "grad_norm": 0.3898642659187317, + 
"learning_rate": 8.832135263744053e-05, + "loss": 1.5618, + "step": 2317 + }, + { + "epoch": 0.7035969039307938, + "grad_norm": 0.4959927201271057, + "learning_rate": 8.831629037157032e-05, + "loss": 1.999, + "step": 2318 + }, + { + "epoch": 0.7039004401274852, + "grad_norm": 0.4488220810890198, + "learning_rate": 8.831122810570012e-05, + "loss": 1.946, + "step": 2319 + }, + { + "epoch": 0.7042039763241766, + "grad_norm": 0.4661828577518463, + "learning_rate": 8.830616583982991e-05, + "loss": 1.9832, + "step": 2320 + }, + { + "epoch": 0.7045075125208681, + "grad_norm": 0.3740446865558624, + "learning_rate": 8.83011035739597e-05, + "loss": 1.8934, + "step": 2321 + }, + { + "epoch": 0.7048110487175596, + "grad_norm": 0.4690150022506714, + "learning_rate": 8.82960413080895e-05, + "loss": 1.9083, + "step": 2322 + }, + { + "epoch": 0.705114584914251, + "grad_norm": 0.3680610954761505, + "learning_rate": 8.829097904221931e-05, + "loss": 1.9494, + "step": 2323 + }, + { + "epoch": 0.7054181211109425, + "grad_norm": 0.40403270721435547, + "learning_rate": 8.82859167763491e-05, + "loss": 2.1369, + "step": 2324 + }, + { + "epoch": 0.7057216573076339, + "grad_norm": 0.4465244710445404, + "learning_rate": 8.82808545104789e-05, + "loss": 1.8854, + "step": 2325 + }, + { + "epoch": 0.7060251935043254, + "grad_norm": 0.45881539583206177, + "learning_rate": 8.827579224460869e-05, + "loss": 1.6319, + "step": 2326 + }, + { + "epoch": 0.7063287297010169, + "grad_norm": 0.43863871693611145, + "learning_rate": 8.827072997873849e-05, + "loss": 1.8284, + "step": 2327 + }, + { + "epoch": 0.7066322658977083, + "grad_norm": 0.3942803740501404, + "learning_rate": 8.826566771286828e-05, + "loss": 1.8663, + "step": 2328 + }, + { + "epoch": 0.7069358020943998, + "grad_norm": 0.42162778973579407, + "learning_rate": 8.826060544699808e-05, + "loss": 1.8563, + "step": 2329 + }, + { + "epoch": 0.7072393382910912, + "grad_norm": 0.4088474214076996, + "learning_rate": 8.825554318112787e-05, + "loss": 2.2026, + "step": 2330 + }, + { + "epoch": 0.7075428744877826, + "grad_norm": 0.43421268463134766, + "learning_rate": 8.825048091525767e-05, + "loss": 2.2395, + "step": 2331 + }, + { + "epoch": 0.7078464106844741, + "grad_norm": 0.4430371820926666, + "learning_rate": 8.824541864938746e-05, + "loss": 1.7794, + "step": 2332 + }, + { + "epoch": 0.7081499468811656, + "grad_norm": 0.41605162620544434, + "learning_rate": 8.824035638351727e-05, + "loss": 2.1565, + "step": 2333 + }, + { + "epoch": 0.708453483077857, + "grad_norm": 0.3622266352176666, + "learning_rate": 8.823529411764706e-05, + "loss": 1.4366, + "step": 2334 + }, + { + "epoch": 0.7087570192745485, + "grad_norm": 0.4030252695083618, + "learning_rate": 8.823023185177686e-05, + "loss": 1.7127, + "step": 2335 + }, + { + "epoch": 0.7090605554712399, + "grad_norm": 0.4723038375377655, + "learning_rate": 8.822516958590665e-05, + "loss": 1.6736, + "step": 2336 + }, + { + "epoch": 0.7093640916679314, + "grad_norm": 0.39395782351493835, + "learning_rate": 8.822010732003645e-05, + "loss": 1.8631, + "step": 2337 + }, + { + "epoch": 0.7096676278646229, + "grad_norm": 0.4566243290901184, + "learning_rate": 8.821504505416624e-05, + "loss": 1.8543, + "step": 2338 + }, + { + "epoch": 0.7099711640613143, + "grad_norm": 0.3434160351753235, + "learning_rate": 8.820998278829604e-05, + "loss": 1.9953, + "step": 2339 + }, + { + "epoch": 0.7102747002580058, + "grad_norm": 0.5802703499794006, + "learning_rate": 8.820492052242583e-05, + "loss": 1.7014, + "step": 2340 + }, + { + "epoch": 
0.7105782364546972, + "grad_norm": 1.1318562030792236, + "learning_rate": 8.819985825655563e-05, + "loss": 1.9866, + "step": 2341 + }, + { + "epoch": 0.7108817726513886, + "grad_norm": 0.8500426411628723, + "learning_rate": 8.819479599068544e-05, + "loss": 1.2291, + "step": 2342 + }, + { + "epoch": 0.7111853088480802, + "grad_norm": 0.4189402163028717, + "learning_rate": 8.818973372481523e-05, + "loss": 1.9396, + "step": 2343 + }, + { + "epoch": 0.7114888450447716, + "grad_norm": 0.7509348392486572, + "learning_rate": 8.818467145894503e-05, + "loss": 2.168, + "step": 2344 + }, + { + "epoch": 0.711792381241463, + "grad_norm": 0.42071589827537537, + "learning_rate": 8.817960919307482e-05, + "loss": 1.9493, + "step": 2345 + }, + { + "epoch": 0.7120959174381545, + "grad_norm": 0.7269922494888306, + "learning_rate": 8.817454692720463e-05, + "loss": 1.6344, + "step": 2346 + }, + { + "epoch": 0.7123994536348459, + "grad_norm": 0.5446398854255676, + "learning_rate": 8.816948466133442e-05, + "loss": 1.9525, + "step": 2347 + }, + { + "epoch": 0.7127029898315375, + "grad_norm": 0.43752509355545044, + "learning_rate": 8.816442239546422e-05, + "loss": 1.9692, + "step": 2348 + }, + { + "epoch": 0.7130065260282289, + "grad_norm": 0.4986307919025421, + "learning_rate": 8.815936012959401e-05, + "loss": 1.805, + "step": 2349 + }, + { + "epoch": 0.7133100622249203, + "grad_norm": 0.47302186489105225, + "learning_rate": 8.815429786372381e-05, + "loss": 2.2322, + "step": 2350 + }, + { + "epoch": 0.7136135984216118, + "grad_norm": 0.4359181523323059, + "learning_rate": 8.81492355978536e-05, + "loss": 1.7579, + "step": 2351 + }, + { + "epoch": 0.7139171346183032, + "grad_norm": 0.9149986505508423, + "learning_rate": 8.81441733319834e-05, + "loss": 1.4888, + "step": 2352 + }, + { + "epoch": 0.7142206708149947, + "grad_norm": 0.37777209281921387, + "learning_rate": 8.81391110661132e-05, + "loss": 2.0646, + "step": 2353 + }, + { + "epoch": 0.7145242070116862, + "grad_norm": 0.527703046798706, + "learning_rate": 8.8134048800243e-05, + "loss": 1.9213, + "step": 2354 + }, + { + "epoch": 0.7148277432083776, + "grad_norm": 0.41505396366119385, + "learning_rate": 8.81289865343728e-05, + "loss": 1.7469, + "step": 2355 + }, + { + "epoch": 0.715131279405069, + "grad_norm": 0.44212964177131653, + "learning_rate": 8.812392426850259e-05, + "loss": 1.2739, + "step": 2356 + }, + { + "epoch": 0.7154348156017605, + "grad_norm": 0.3863414525985718, + "learning_rate": 8.811886200263239e-05, + "loss": 2.2481, + "step": 2357 + }, + { + "epoch": 0.7157383517984519, + "grad_norm": 0.45853668451309204, + "learning_rate": 8.811379973676218e-05, + "loss": 2.2813, + "step": 2358 + }, + { + "epoch": 0.7160418879951435, + "grad_norm": 11.417152404785156, + "learning_rate": 8.810873747089198e-05, + "loss": 2.0079, + "step": 2359 + }, + { + "epoch": 0.7163454241918349, + "grad_norm": 0.49986690282821655, + "learning_rate": 8.810367520502177e-05, + "loss": 1.5595, + "step": 2360 + }, + { + "epoch": 0.7166489603885263, + "grad_norm": 0.4734189510345459, + "learning_rate": 8.809861293915156e-05, + "loss": 1.8213, + "step": 2361 + }, + { + "epoch": 0.7169524965852178, + "grad_norm": 0.43908554315567017, + "learning_rate": 8.809355067328137e-05, + "loss": 1.889, + "step": 2362 + }, + { + "epoch": 0.7172560327819092, + "grad_norm": 0.48986315727233887, + "learning_rate": 8.808848840741117e-05, + "loss": 1.9236, + "step": 2363 + }, + { + "epoch": 0.7175595689786007, + "grad_norm": 0.42691266536712646, + "learning_rate": 8.808342614154096e-05, + 
"loss": 2.0663, + "step": 2364 + }, + { + "epoch": 0.7178631051752922, + "grad_norm": 0.38523420691490173, + "learning_rate": 8.807836387567076e-05, + "loss": 1.9165, + "step": 2365 + }, + { + "epoch": 0.7181666413719836, + "grad_norm": 0.29487428069114685, + "learning_rate": 8.807330160980055e-05, + "loss": 1.4763, + "step": 2366 + }, + { + "epoch": 0.718470177568675, + "grad_norm": 0.9072676301002502, + "learning_rate": 8.806823934393035e-05, + "loss": 2.0882, + "step": 2367 + }, + { + "epoch": 0.7187737137653665, + "grad_norm": 0.37307825684547424, + "learning_rate": 8.806317707806014e-05, + "loss": 1.4694, + "step": 2368 + }, + { + "epoch": 0.719077249962058, + "grad_norm": 0.41390106081962585, + "learning_rate": 8.805811481218994e-05, + "loss": 1.6849, + "step": 2369 + }, + { + "epoch": 0.7193807861587495, + "grad_norm": 0.4989478886127472, + "learning_rate": 8.805305254631973e-05, + "loss": 2.146, + "step": 2370 + }, + { + "epoch": 0.7196843223554409, + "grad_norm": 0.4021719694137573, + "learning_rate": 8.804799028044953e-05, + "loss": 1.942, + "step": 2371 + }, + { + "epoch": 0.7199878585521323, + "grad_norm": 0.4169461727142334, + "learning_rate": 8.804292801457933e-05, + "loss": 2.0278, + "step": 2372 + }, + { + "epoch": 0.7202913947488238, + "grad_norm": 0.39091089367866516, + "learning_rate": 8.803786574870913e-05, + "loss": 1.9644, + "step": 2373 + }, + { + "epoch": 0.7205949309455153, + "grad_norm": 0.45431414246559143, + "learning_rate": 8.803280348283892e-05, + "loss": 1.5611, + "step": 2374 + }, + { + "epoch": 0.7208984671422067, + "grad_norm": 0.3896774351596832, + "learning_rate": 8.802774121696872e-05, + "loss": 1.7838, + "step": 2375 + }, + { + "epoch": 0.7212020033388982, + "grad_norm": 0.4076644778251648, + "learning_rate": 8.802267895109851e-05, + "loss": 1.9717, + "step": 2376 + }, + { + "epoch": 0.7215055395355896, + "grad_norm": 0.4065254032611847, + "learning_rate": 8.801761668522831e-05, + "loss": 1.6598, + "step": 2377 + }, + { + "epoch": 0.721809075732281, + "grad_norm": 0.6506657004356384, + "learning_rate": 8.80125544193581e-05, + "loss": 1.9463, + "step": 2378 + }, + { + "epoch": 0.7221126119289726, + "grad_norm": 0.46132421493530273, + "learning_rate": 8.80074921534879e-05, + "loss": 2.0071, + "step": 2379 + }, + { + "epoch": 0.722416148125664, + "grad_norm": 0.3932840824127197, + "learning_rate": 8.800242988761769e-05, + "loss": 1.9956, + "step": 2380 + }, + { + "epoch": 0.7227196843223554, + "grad_norm": 0.4919872581958771, + "learning_rate": 8.79973676217475e-05, + "loss": 1.658, + "step": 2381 + }, + { + "epoch": 0.7230232205190469, + "grad_norm": 0.4147129952907562, + "learning_rate": 8.79923053558773e-05, + "loss": 1.9331, + "step": 2382 + }, + { + "epoch": 0.7233267567157383, + "grad_norm": 0.4280264973640442, + "learning_rate": 8.798724309000709e-05, + "loss": 1.7016, + "step": 2383 + }, + { + "epoch": 0.7236302929124299, + "grad_norm": 0.4554193913936615, + "learning_rate": 8.798218082413689e-05, + "loss": 2.08, + "step": 2384 + }, + { + "epoch": 0.7239338291091213, + "grad_norm": 0.4477219581604004, + "learning_rate": 8.797711855826668e-05, + "loss": 1.6204, + "step": 2385 + }, + { + "epoch": 0.7242373653058127, + "grad_norm": 0.32487139105796814, + "learning_rate": 8.797205629239648e-05, + "loss": 1.7271, + "step": 2386 + }, + { + "epoch": 0.7245409015025042, + "grad_norm": 2.3734400272369385, + "learning_rate": 8.796699402652627e-05, + "loss": 2.3315, + "step": 2387 + }, + { + "epoch": 0.7248444376991956, + "grad_norm": 0.3860095739364624, + 
"learning_rate": 8.796193176065606e-05, + "loss": 1.4387, + "step": 2388 + }, + { + "epoch": 0.725147973895887, + "grad_norm": 0.3950817286968231, + "learning_rate": 8.795686949478586e-05, + "loss": 2.1907, + "step": 2389 + }, + { + "epoch": 0.7254515100925786, + "grad_norm": 0.37350189685821533, + "learning_rate": 8.795180722891567e-05, + "loss": 1.5662, + "step": 2390 + }, + { + "epoch": 0.72575504628927, + "grad_norm": 0.46780696511268616, + "learning_rate": 8.794674496304546e-05, + "loss": 1.4264, + "step": 2391 + }, + { + "epoch": 0.7260585824859614, + "grad_norm": 0.3911786377429962, + "learning_rate": 8.794168269717527e-05, + "loss": 1.8752, + "step": 2392 + }, + { + "epoch": 0.7263621186826529, + "grad_norm": 0.5619503855705261, + "learning_rate": 8.793662043130507e-05, + "loss": 2.0974, + "step": 2393 + }, + { + "epoch": 0.7266656548793443, + "grad_norm": 0.44586917757987976, + "learning_rate": 8.793155816543486e-05, + "loss": 2.0871, + "step": 2394 + }, + { + "epoch": 0.7269691910760359, + "grad_norm": 0.3445717990398407, + "learning_rate": 8.792649589956466e-05, + "loss": 1.3626, + "step": 2395 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 0.410279244184494, + "learning_rate": 8.792143363369445e-05, + "loss": 1.5149, + "step": 2396 + }, + { + "epoch": 0.7275762634694187, + "grad_norm": 0.3766261339187622, + "learning_rate": 8.791637136782425e-05, + "loss": 1.8248, + "step": 2397 + }, + { + "epoch": 0.7278797996661102, + "grad_norm": 0.3978814482688904, + "learning_rate": 8.791130910195404e-05, + "loss": 1.9437, + "step": 2398 + }, + { + "epoch": 0.7281833358628016, + "grad_norm": 0.36028674244880676, + "learning_rate": 8.790624683608383e-05, + "loss": 1.8169, + "step": 2399 + }, + { + "epoch": 0.7284868720594931, + "grad_norm": 0.38613566756248474, + "learning_rate": 8.790118457021363e-05, + "loss": 1.3976, + "step": 2400 + }, + { + "epoch": 0.7287904082561846, + "grad_norm": 0.4028817117214203, + "learning_rate": 8.789612230434344e-05, + "loss": 1.5574, + "step": 2401 + }, + { + "epoch": 0.729093944452876, + "grad_norm": 0.4536430239677429, + "learning_rate": 8.789106003847323e-05, + "loss": 1.7994, + "step": 2402 + }, + { + "epoch": 0.7293974806495674, + "grad_norm": 0.421176016330719, + "learning_rate": 8.788599777260303e-05, + "loss": 2.1671, + "step": 2403 + }, + { + "epoch": 0.7297010168462589, + "grad_norm": 0.42854103446006775, + "learning_rate": 8.788093550673282e-05, + "loss": 1.5606, + "step": 2404 + }, + { + "epoch": 0.7300045530429504, + "grad_norm": 0.38108232617378235, + "learning_rate": 8.787587324086262e-05, + "loss": 1.8415, + "step": 2405 + }, + { + "epoch": 0.7303080892396419, + "grad_norm": 0.454464852809906, + "learning_rate": 8.787081097499241e-05, + "loss": 1.5911, + "step": 2406 + }, + { + "epoch": 0.7306116254363333, + "grad_norm": 0.4082881808280945, + "learning_rate": 8.78657487091222e-05, + "loss": 2.022, + "step": 2407 + }, + { + "epoch": 0.7309151616330247, + "grad_norm": 0.4951760470867157, + "learning_rate": 8.7860686443252e-05, + "loss": 2.12, + "step": 2408 + }, + { + "epoch": 0.7312186978297162, + "grad_norm": 0.40377724170684814, + "learning_rate": 8.78556241773818e-05, + "loss": 2.0016, + "step": 2409 + }, + { + "epoch": 0.7315222340264077, + "grad_norm": 0.403481662273407, + "learning_rate": 8.785056191151159e-05, + "loss": 1.9914, + "step": 2410 + }, + { + "epoch": 0.7318257702230991, + "grad_norm": 0.4195014536380768, + "learning_rate": 8.78454996456414e-05, + "loss": 1.8749, + "step": 2411 + }, + { + "epoch": 0.7321293064197906, 
+ "grad_norm": 0.40575090050697327, + "learning_rate": 8.78404373797712e-05, + "loss": 1.8565, + "step": 2412 + }, + { + "epoch": 0.732432842616482, + "grad_norm": 0.4025145173072815, + "learning_rate": 8.783537511390099e-05, + "loss": 1.7397, + "step": 2413 + }, + { + "epoch": 0.7327363788131734, + "grad_norm": 0.35525646805763245, + "learning_rate": 8.783031284803078e-05, + "loss": 2.086, + "step": 2414 + }, + { + "epoch": 0.7330399150098649, + "grad_norm": 0.4063604772090912, + "learning_rate": 8.782525058216058e-05, + "loss": 1.8643, + "step": 2415 + }, + { + "epoch": 0.7333434512065564, + "grad_norm": 0.3689418137073517, + "learning_rate": 8.782018831629037e-05, + "loss": 1.9982, + "step": 2416 + }, + { + "epoch": 0.7336469874032479, + "grad_norm": 0.4066859185695648, + "learning_rate": 8.781512605042017e-05, + "loss": 1.4861, + "step": 2417 + }, + { + "epoch": 0.7339505235999393, + "grad_norm": 0.4118275046348572, + "learning_rate": 8.781006378454996e-05, + "loss": 1.923, + "step": 2418 + }, + { + "epoch": 0.7342540597966307, + "grad_norm": 0.4238114058971405, + "learning_rate": 8.780500151867976e-05, + "loss": 1.7656, + "step": 2419 + }, + { + "epoch": 0.7345575959933222, + "grad_norm": 0.4456924498081207, + "learning_rate": 8.779993925280957e-05, + "loss": 1.3612, + "step": 2420 + }, + { + "epoch": 0.7348611321900137, + "grad_norm": 0.3890600800514221, + "learning_rate": 8.779487698693936e-05, + "loss": 1.7588, + "step": 2421 + }, + { + "epoch": 0.7351646683867051, + "grad_norm": 0.35149431228637695, + "learning_rate": 8.778981472106916e-05, + "loss": 1.9639, + "step": 2422 + }, + { + "epoch": 0.7354682045833966, + "grad_norm": 0.43158209323883057, + "learning_rate": 8.778475245519895e-05, + "loss": 2.008, + "step": 2423 + }, + { + "epoch": 0.735771740780088, + "grad_norm": 0.4337812066078186, + "learning_rate": 8.777969018932875e-05, + "loss": 1.9703, + "step": 2424 + }, + { + "epoch": 0.7360752769767794, + "grad_norm": 0.36363622546195984, + "learning_rate": 8.777462792345854e-05, + "loss": 1.712, + "step": 2425 + }, + { + "epoch": 0.736378813173471, + "grad_norm": 0.3731040358543396, + "learning_rate": 8.776956565758833e-05, + "loss": 2.11, + "step": 2426 + }, + { + "epoch": 0.7366823493701624, + "grad_norm": 0.37047722935676575, + "learning_rate": 8.776450339171813e-05, + "loss": 2.1481, + "step": 2427 + }, + { + "epoch": 0.7369858855668538, + "grad_norm": 1.4641270637512207, + "learning_rate": 8.775944112584792e-05, + "loss": 1.9506, + "step": 2428 + }, + { + "epoch": 0.7372894217635453, + "grad_norm": 0.3781220018863678, + "learning_rate": 8.775437885997773e-05, + "loss": 1.9651, + "step": 2429 + }, + { + "epoch": 0.7375929579602367, + "grad_norm": 0.353572815656662, + "learning_rate": 8.774931659410753e-05, + "loss": 1.8171, + "step": 2430 + }, + { + "epoch": 0.7378964941569283, + "grad_norm": 0.4101322889328003, + "learning_rate": 8.774425432823732e-05, + "loss": 1.1176, + "step": 2431 + }, + { + "epoch": 0.7382000303536197, + "grad_norm": 0.3967879116535187, + "learning_rate": 8.773919206236712e-05, + "loss": 1.9563, + "step": 2432 + }, + { + "epoch": 0.7385035665503111, + "grad_norm": 0.41269639134407043, + "learning_rate": 8.773412979649691e-05, + "loss": 1.8312, + "step": 2433 + }, + { + "epoch": 0.7388071027470026, + "grad_norm": 0.4125451445579529, + "learning_rate": 8.77290675306267e-05, + "loss": 1.9907, + "step": 2434 + }, + { + "epoch": 0.739110638943694, + "grad_norm": 0.447773814201355, + "learning_rate": 8.772400526475652e-05, + "loss": 2.0819, + "step": 2435 + 
}, + { + "epoch": 0.7394141751403855, + "grad_norm": 0.3990137279033661, + "learning_rate": 8.771894299888631e-05, + "loss": 1.5107, + "step": 2436 + }, + { + "epoch": 0.739717711337077, + "grad_norm": 0.56345534324646, + "learning_rate": 8.77138807330161e-05, + "loss": 1.6428, + "step": 2437 + }, + { + "epoch": 0.7400212475337684, + "grad_norm": 0.42566823959350586, + "learning_rate": 8.77088184671459e-05, + "loss": 1.5812, + "step": 2438 + }, + { + "epoch": 0.7403247837304598, + "grad_norm": 0.4182227551937103, + "learning_rate": 8.77037562012757e-05, + "loss": 1.8838, + "step": 2439 + }, + { + "epoch": 0.7406283199271513, + "grad_norm": 0.3614279627799988, + "learning_rate": 8.76986939354055e-05, + "loss": 1.8822, + "step": 2440 + }, + { + "epoch": 0.7409318561238427, + "grad_norm": 0.4376552104949951, + "learning_rate": 8.76936316695353e-05, + "loss": 2.0327, + "step": 2441 + }, + { + "epoch": 0.7412353923205343, + "grad_norm": 0.3294520676136017, + "learning_rate": 8.768856940366509e-05, + "loss": 1.7045, + "step": 2442 + }, + { + "epoch": 0.7415389285172257, + "grad_norm": 0.39772239327430725, + "learning_rate": 8.768350713779489e-05, + "loss": 1.7068, + "step": 2443 + }, + { + "epoch": 0.7418424647139171, + "grad_norm": 0.4332139194011688, + "learning_rate": 8.767844487192468e-05, + "loss": 1.5432, + "step": 2444 + }, + { + "epoch": 0.7421460009106086, + "grad_norm": 0.40865209698677063, + "learning_rate": 8.767338260605448e-05, + "loss": 2.0378, + "step": 2445 + }, + { + "epoch": 0.7424495371073, + "grad_norm": 0.3608027696609497, + "learning_rate": 8.766832034018427e-05, + "loss": 1.4315, + "step": 2446 + }, + { + "epoch": 0.7427530733039915, + "grad_norm": 0.38700732588768005, + "learning_rate": 8.766325807431407e-05, + "loss": 1.1194, + "step": 2447 + }, + { + "epoch": 0.743056609500683, + "grad_norm": 0.4182412624359131, + "learning_rate": 8.765819580844386e-05, + "loss": 2.0976, + "step": 2448 + }, + { + "epoch": 0.7433601456973744, + "grad_norm": 0.40817487239837646, + "learning_rate": 8.765313354257366e-05, + "loss": 1.9548, + "step": 2449 + }, + { + "epoch": 0.7436636818940658, + "grad_norm": 0.4414690434932709, + "learning_rate": 8.764807127670346e-05, + "loss": 1.8551, + "step": 2450 + }, + { + "epoch": 0.7439672180907573, + "grad_norm": 0.393435001373291, + "learning_rate": 8.764300901083326e-05, + "loss": 1.7419, + "step": 2451 + }, + { + "epoch": 0.7442707542874488, + "grad_norm": 0.36712646484375, + "learning_rate": 8.763794674496305e-05, + "loss": 1.9663, + "step": 2452 + }, + { + "epoch": 0.7445742904841403, + "grad_norm": 0.47254228591918945, + "learning_rate": 8.763288447909285e-05, + "loss": 1.6821, + "step": 2453 + }, + { + "epoch": 0.7448778266808317, + "grad_norm": 0.6918731927871704, + "learning_rate": 8.762782221322264e-05, + "loss": 1.4488, + "step": 2454 + }, + { + "epoch": 0.7451813628775231, + "grad_norm": 0.4374895393848419, + "learning_rate": 8.762275994735244e-05, + "loss": 1.8773, + "step": 2455 + }, + { + "epoch": 0.7454848990742146, + "grad_norm": 0.3807445466518402, + "learning_rate": 8.761769768148223e-05, + "loss": 1.9019, + "step": 2456 + }, + { + "epoch": 0.7457884352709061, + "grad_norm": 0.804283857345581, + "learning_rate": 8.761263541561203e-05, + "loss": 1.6503, + "step": 2457 + }, + { + "epoch": 0.7460919714675975, + "grad_norm": 0.4001246690750122, + "learning_rate": 8.760757314974182e-05, + "loss": 1.8858, + "step": 2458 + }, + { + "epoch": 0.746395507664289, + "grad_norm": 0.4548395574092865, + "learning_rate": 8.760251088387163e-05, 
+ "loss": 1.4205, + "step": 2459 + }, + { + "epoch": 0.7466990438609804, + "grad_norm": 0.4249577522277832, + "learning_rate": 8.759744861800143e-05, + "loss": 2.0217, + "step": 2460 + }, + { + "epoch": 0.7470025800576718, + "grad_norm": 0.356995165348053, + "learning_rate": 8.759238635213122e-05, + "loss": 2.1138, + "step": 2461 + }, + { + "epoch": 0.7473061162543634, + "grad_norm": 0.39245614409446716, + "learning_rate": 8.758732408626102e-05, + "loss": 1.9274, + "step": 2462 + }, + { + "epoch": 0.7476096524510548, + "grad_norm": 0.5045961737632751, + "learning_rate": 8.758226182039081e-05, + "loss": 1.5911, + "step": 2463 + }, + { + "epoch": 0.7479131886477463, + "grad_norm": 0.4416704773902893, + "learning_rate": 8.75771995545206e-05, + "loss": 1.7762, + "step": 2464 + }, + { + "epoch": 0.7482167248444377, + "grad_norm": 0.6794231534004211, + "learning_rate": 8.75721372886504e-05, + "loss": 2.1296, + "step": 2465 + }, + { + "epoch": 0.7485202610411291, + "grad_norm": 0.4514855444431305, + "learning_rate": 8.75670750227802e-05, + "loss": 1.9012, + "step": 2466 + }, + { + "epoch": 0.7488237972378207, + "grad_norm": 0.3483482003211975, + "learning_rate": 8.756201275690999e-05, + "loss": 1.3373, + "step": 2467 + }, + { + "epoch": 0.7491273334345121, + "grad_norm": 0.4310845136642456, + "learning_rate": 8.75569504910398e-05, + "loss": 1.7987, + "step": 2468 + }, + { + "epoch": 0.7494308696312035, + "grad_norm": 0.39404624700546265, + "learning_rate": 8.755188822516959e-05, + "loss": 1.764, + "step": 2469 + }, + { + "epoch": 0.749734405827895, + "grad_norm": 0.39560645818710327, + "learning_rate": 8.754682595929939e-05, + "loss": 1.7229, + "step": 2470 + }, + { + "epoch": 0.7500379420245864, + "grad_norm": 0.4125354588031769, + "learning_rate": 8.754176369342918e-05, + "loss": 1.6562, + "step": 2471 + }, + { + "epoch": 0.7503414782212778, + "grad_norm": 0.4781520366668701, + "learning_rate": 8.753670142755898e-05, + "loss": 2.0631, + "step": 2472 + }, + { + "epoch": 0.7506450144179694, + "grad_norm": 0.4587598443031311, + "learning_rate": 8.753163916168877e-05, + "loss": 1.7579, + "step": 2473 + }, + { + "epoch": 0.7509485506146608, + "grad_norm": 0.44834762811660767, + "learning_rate": 8.752657689581857e-05, + "loss": 1.8958, + "step": 2474 + }, + { + "epoch": 0.7512520868113522, + "grad_norm": 0.3947749137878418, + "learning_rate": 8.752151462994836e-05, + "loss": 2.041, + "step": 2475 + }, + { + "epoch": 0.7515556230080437, + "grad_norm": 0.38898783922195435, + "learning_rate": 8.751645236407816e-05, + "loss": 1.8326, + "step": 2476 + }, + { + "epoch": 0.7518591592047351, + "grad_norm": 0.4535033404827118, + "learning_rate": 8.751139009820795e-05, + "loss": 1.7797, + "step": 2477 + }, + { + "epoch": 0.7521626954014267, + "grad_norm": 0.34348368644714355, + "learning_rate": 8.750632783233776e-05, + "loss": 1.8936, + "step": 2478 + }, + { + "epoch": 0.7524662315981181, + "grad_norm": 0.36187052726745605, + "learning_rate": 8.750126556646755e-05, + "loss": 1.4416, + "step": 2479 + }, + { + "epoch": 0.7527697677948095, + "grad_norm": 0.4151141941547394, + "learning_rate": 8.749620330059736e-05, + "loss": 2.0148, + "step": 2480 + }, + { + "epoch": 0.753073303991501, + "grad_norm": 0.39229243993759155, + "learning_rate": 8.749114103472716e-05, + "loss": 1.622, + "step": 2481 + }, + { + "epoch": 0.7533768401881924, + "grad_norm": 0.44165119528770447, + "learning_rate": 8.748607876885695e-05, + "loss": 2.2657, + "step": 2482 + }, + { + "epoch": 0.7536803763848839, + "grad_norm": 
0.5234296917915344, + "learning_rate": 8.748101650298675e-05, + "loss": 1.6954, + "step": 2483 + }, + { + "epoch": 0.7539839125815754, + "grad_norm": 0.4218185842037201, + "learning_rate": 8.747595423711654e-05, + "loss": 2.028, + "step": 2484 + }, + { + "epoch": 0.7542874487782668, + "grad_norm": 0.6535462737083435, + "learning_rate": 8.747089197124634e-05, + "loss": 1.9042, + "step": 2485 + }, + { + "epoch": 0.7545909849749582, + "grad_norm": 0.34253132343292236, + "learning_rate": 8.746582970537613e-05, + "loss": 1.5497, + "step": 2486 + }, + { + "epoch": 0.7548945211716497, + "grad_norm": 0.4396836757659912, + "learning_rate": 8.746076743950593e-05, + "loss": 1.912, + "step": 2487 + }, + { + "epoch": 0.7551980573683412, + "grad_norm": 0.44126465916633606, + "learning_rate": 8.745570517363572e-05, + "loss": 1.7907, + "step": 2488 + }, + { + "epoch": 0.7555015935650327, + "grad_norm": 0.42292916774749756, + "learning_rate": 8.745064290776553e-05, + "loss": 1.9956, + "step": 2489 + }, + { + "epoch": 0.7558051297617241, + "grad_norm": 0.4493507146835327, + "learning_rate": 8.744558064189532e-05, + "loss": 1.9819, + "step": 2490 + }, + { + "epoch": 0.7561086659584155, + "grad_norm": 0.5793929100036621, + "learning_rate": 8.744051837602512e-05, + "loss": 2.1241, + "step": 2491 + }, + { + "epoch": 0.756412202155107, + "grad_norm": 0.3927520513534546, + "learning_rate": 8.743545611015491e-05, + "loss": 1.6615, + "step": 2492 + }, + { + "epoch": 0.7567157383517985, + "grad_norm": 0.4623410403728485, + "learning_rate": 8.743039384428471e-05, + "loss": 1.9174, + "step": 2493 + }, + { + "epoch": 0.7570192745484899, + "grad_norm": 0.4135148823261261, + "learning_rate": 8.74253315784145e-05, + "loss": 1.9403, + "step": 2494 + }, + { + "epoch": 0.7573228107451814, + "grad_norm": 0.4701920449733734, + "learning_rate": 8.74202693125443e-05, + "loss": 1.9479, + "step": 2495 + }, + { + "epoch": 0.7576263469418728, + "grad_norm": 0.42168691754341125, + "learning_rate": 8.741520704667409e-05, + "loss": 1.6007, + "step": 2496 + }, + { + "epoch": 0.7579298831385642, + "grad_norm": 1.0131754875183105, + "learning_rate": 8.741014478080389e-05, + "loss": 1.8092, + "step": 2497 + }, + { + "epoch": 0.7582334193352557, + "grad_norm": 0.39457446336746216, + "learning_rate": 8.74050825149337e-05, + "loss": 2.0884, + "step": 2498 + }, + { + "epoch": 0.7585369555319472, + "grad_norm": 0.9458907842636108, + "learning_rate": 8.740002024906349e-05, + "loss": 1.5426, + "step": 2499 + }, + { + "epoch": 0.7588404917286387, + "grad_norm": 0.6375271677970886, + "learning_rate": 8.739495798319329e-05, + "loss": 1.9517, + "step": 2500 + }, + { + "epoch": 0.7591440279253301, + "grad_norm": 0.4703015089035034, + "learning_rate": 8.738989571732308e-05, + "loss": 1.8815, + "step": 2501 + }, + { + "epoch": 0.7594475641220215, + "grad_norm": 0.4868961572647095, + "learning_rate": 8.738483345145287e-05, + "loss": 2.1519, + "step": 2502 + }, + { + "epoch": 0.759751100318713, + "grad_norm": 0.3808225691318512, + "learning_rate": 8.737977118558267e-05, + "loss": 1.9137, + "step": 2503 + }, + { + "epoch": 0.7600546365154045, + "grad_norm": 0.4780614674091339, + "learning_rate": 8.737470891971246e-05, + "loss": 2.0755, + "step": 2504 + }, + { + "epoch": 0.7603581727120959, + "grad_norm": 0.8534510135650635, + "learning_rate": 8.736964665384226e-05, + "loss": 2.2839, + "step": 2505 + }, + { + "epoch": 0.7606617089087874, + "grad_norm": 0.3620678782463074, + "learning_rate": 8.736458438797205e-05, + "loss": 1.9497, + "step": 2506 + }, + { + 
"epoch": 0.7609652451054788, + "grad_norm": 0.40993764996528625, + "learning_rate": 8.735952212210186e-05, + "loss": 1.7032, + "step": 2507 + }, + { + "epoch": 0.7612687813021702, + "grad_norm": 0.43798285722732544, + "learning_rate": 8.735445985623166e-05, + "loss": 1.8932, + "step": 2508 + }, + { + "epoch": 0.7615723174988618, + "grad_norm": 0.8272436857223511, + "learning_rate": 8.734939759036145e-05, + "loss": 1.9612, + "step": 2509 + }, + { + "epoch": 0.7618758536955532, + "grad_norm": 0.3841719329357147, + "learning_rate": 8.734433532449125e-05, + "loss": 1.8698, + "step": 2510 + }, + { + "epoch": 0.7621793898922447, + "grad_norm": 0.459075391292572, + "learning_rate": 8.733927305862104e-05, + "loss": 1.8878, + "step": 2511 + }, + { + "epoch": 0.7624829260889361, + "grad_norm": 0.41815492510795593, + "learning_rate": 8.733421079275084e-05, + "loss": 1.8751, + "step": 2512 + }, + { + "epoch": 0.7627864622856275, + "grad_norm": 0.41531050205230713, + "learning_rate": 8.732914852688063e-05, + "loss": 1.8247, + "step": 2513 + }, + { + "epoch": 0.7630899984823191, + "grad_norm": 0.36942997574806213, + "learning_rate": 8.732408626101043e-05, + "loss": 2.0158, + "step": 2514 + }, + { + "epoch": 0.7633935346790105, + "grad_norm": 0.3985773026943207, + "learning_rate": 8.731902399514022e-05, + "loss": 2.0955, + "step": 2515 + }, + { + "epoch": 0.7636970708757019, + "grad_norm": 0.45657238364219666, + "learning_rate": 8.731396172927002e-05, + "loss": 1.2635, + "step": 2516 + }, + { + "epoch": 0.7640006070723934, + "grad_norm": 0.35013964772224426, + "learning_rate": 8.730889946339982e-05, + "loss": 2.1223, + "step": 2517 + }, + { + "epoch": 0.7643041432690848, + "grad_norm": 0.48166340589523315, + "learning_rate": 8.730383719752962e-05, + "loss": 1.9952, + "step": 2518 + }, + { + "epoch": 0.7646076794657763, + "grad_norm": 0.3770373463630676, + "learning_rate": 8.729877493165941e-05, + "loss": 2.0698, + "step": 2519 + }, + { + "epoch": 0.7649112156624678, + "grad_norm": 0.6299264430999756, + "learning_rate": 8.729371266578921e-05, + "loss": 2.1133, + "step": 2520 + }, + { + "epoch": 0.7652147518591592, + "grad_norm": 0.3834339380264282, + "learning_rate": 8.7288650399919e-05, + "loss": 1.6254, + "step": 2521 + }, + { + "epoch": 0.7655182880558506, + "grad_norm": 0.4225000739097595, + "learning_rate": 8.72835881340488e-05, + "loss": 1.5402, + "step": 2522 + }, + { + "epoch": 0.7658218242525421, + "grad_norm": 0.3836756646633148, + "learning_rate": 8.727852586817859e-05, + "loss": 1.8559, + "step": 2523 + }, + { + "epoch": 0.7661253604492335, + "grad_norm": 0.43883371353149414, + "learning_rate": 8.72734636023084e-05, + "loss": 1.8589, + "step": 2524 + }, + { + "epoch": 0.7664288966459251, + "grad_norm": 0.3844871520996094, + "learning_rate": 8.72684013364382e-05, + "loss": 1.9853, + "step": 2525 + }, + { + "epoch": 0.7667324328426165, + "grad_norm": 0.37431496381759644, + "learning_rate": 8.726333907056799e-05, + "loss": 1.9808, + "step": 2526 + }, + { + "epoch": 0.7670359690393079, + "grad_norm": 0.35484790802001953, + "learning_rate": 8.725827680469779e-05, + "loss": 1.2544, + "step": 2527 + }, + { + "epoch": 0.7673395052359994, + "grad_norm": 0.3555900752544403, + "learning_rate": 8.72532145388276e-05, + "loss": 1.3952, + "step": 2528 + }, + { + "epoch": 0.7676430414326908, + "grad_norm": 0.4385487139225006, + "learning_rate": 8.724815227295739e-05, + "loss": 2.1207, + "step": 2529 + }, + { + "epoch": 0.7679465776293823, + "grad_norm": 0.41865015029907227, + "learning_rate": 
8.724309000708718e-05, + "loss": 1.2081, + "step": 2530 + }, + { + "epoch": 0.7682501138260738, + "grad_norm": 0.44620874524116516, + "learning_rate": 8.723802774121698e-05, + "loss": 1.7552, + "step": 2531 + }, + { + "epoch": 0.7685536500227652, + "grad_norm": 0.42884379625320435, + "learning_rate": 8.723296547534677e-05, + "loss": 1.8443, + "step": 2532 + }, + { + "epoch": 0.7688571862194566, + "grad_norm": 0.41244685649871826, + "learning_rate": 8.722790320947657e-05, + "loss": 1.9597, + "step": 2533 + }, + { + "epoch": 0.7691607224161481, + "grad_norm": 0.7400226593017578, + "learning_rate": 8.722284094360636e-05, + "loss": 1.9847, + "step": 2534 + }, + { + "epoch": 0.7694642586128396, + "grad_norm": 0.4088320732116699, + "learning_rate": 8.721777867773616e-05, + "loss": 1.5025, + "step": 2535 + }, + { + "epoch": 0.7697677948095311, + "grad_norm": 0.4008265435695648, + "learning_rate": 8.721271641186595e-05, + "loss": 1.879, + "step": 2536 + }, + { + "epoch": 0.7700713310062225, + "grad_norm": 0.3870142996311188, + "learning_rate": 8.720765414599576e-05, + "loss": 1.9367, + "step": 2537 + }, + { + "epoch": 0.7703748672029139, + "grad_norm": 0.4387873411178589, + "learning_rate": 8.720259188012556e-05, + "loss": 2.2303, + "step": 2538 + }, + { + "epoch": 0.7706784033996054, + "grad_norm": 0.707614541053772, + "learning_rate": 8.719752961425535e-05, + "loss": 1.5612, + "step": 2539 + }, + { + "epoch": 0.7709819395962969, + "grad_norm": 0.43096137046813965, + "learning_rate": 8.719246734838514e-05, + "loss": 1.544, + "step": 2540 + }, + { + "epoch": 0.7712854757929883, + "grad_norm": 0.3771781027317047, + "learning_rate": 8.718740508251494e-05, + "loss": 1.79, + "step": 2541 + }, + { + "epoch": 0.7715890119896798, + "grad_norm": 0.39454761147499084, + "learning_rate": 8.718234281664473e-05, + "loss": 1.4474, + "step": 2542 + }, + { + "epoch": 0.7718925481863712, + "grad_norm": 0.421641081571579, + "learning_rate": 8.717728055077453e-05, + "loss": 1.8482, + "step": 2543 + }, + { + "epoch": 0.7721960843830626, + "grad_norm": 0.38047879934310913, + "learning_rate": 8.717221828490432e-05, + "loss": 1.2413, + "step": 2544 + }, + { + "epoch": 0.7724996205797542, + "grad_norm": 0.38516274094581604, + "learning_rate": 8.716715601903412e-05, + "loss": 1.9199, + "step": 2545 + }, + { + "epoch": 0.7728031567764456, + "grad_norm": 0.38349801301956177, + "learning_rate": 8.716209375316393e-05, + "loss": 1.999, + "step": 2546 + }, + { + "epoch": 0.773106692973137, + "grad_norm": 0.5327167510986328, + "learning_rate": 8.715703148729372e-05, + "loss": 1.5738, + "step": 2547 + }, + { + "epoch": 0.7734102291698285, + "grad_norm": 0.3783544898033142, + "learning_rate": 8.715196922142352e-05, + "loss": 1.8474, + "step": 2548 + }, + { + "epoch": 0.7737137653665199, + "grad_norm": 0.509729266166687, + "learning_rate": 8.714690695555331e-05, + "loss": 2.2824, + "step": 2549 + }, + { + "epoch": 0.7740173015632115, + "grad_norm": 0.4439513087272644, + "learning_rate": 8.71418446896831e-05, + "loss": 1.8373, + "step": 2550 + }, + { + "epoch": 0.7743208377599029, + "grad_norm": 0.4309268891811371, + "learning_rate": 8.71367824238129e-05, + "loss": 1.3441, + "step": 2551 + }, + { + "epoch": 0.7746243739565943, + "grad_norm": 0.4033602178096771, + "learning_rate": 8.71317201579427e-05, + "loss": 2.0037, + "step": 2552 + }, + { + "epoch": 0.7749279101532858, + "grad_norm": 0.42097219824790955, + "learning_rate": 8.712665789207249e-05, + "loss": 2.0319, + "step": 2553 + }, + { + "epoch": 0.7752314463499772, + 
"grad_norm": 0.43752336502075195, + "learning_rate": 8.712159562620229e-05, + "loss": 1.6834, + "step": 2554 + }, + { + "epoch": 0.7755349825466686, + "grad_norm": 0.4009190499782562, + "learning_rate": 8.711653336033208e-05, + "loss": 1.8764, + "step": 2555 + }, + { + "epoch": 0.7758385187433602, + "grad_norm": 0.38049957156181335, + "learning_rate": 8.711147109446189e-05, + "loss": 1.5514, + "step": 2556 + }, + { + "epoch": 0.7761420549400516, + "grad_norm": 0.7045227289199829, + "learning_rate": 8.710640882859168e-05, + "loss": 2.045, + "step": 2557 + }, + { + "epoch": 0.776445591136743, + "grad_norm": 0.4141732454299927, + "learning_rate": 8.710134656272148e-05, + "loss": 1.9546, + "step": 2558 + }, + { + "epoch": 0.7767491273334345, + "grad_norm": 0.36503976583480835, + "learning_rate": 8.709628429685127e-05, + "loss": 1.8611, + "step": 2559 + }, + { + "epoch": 0.7770526635301259, + "grad_norm": 0.4061439335346222, + "learning_rate": 8.709122203098107e-05, + "loss": 1.5297, + "step": 2560 + }, + { + "epoch": 0.7773561997268175, + "grad_norm": 0.39136406779289246, + "learning_rate": 8.708615976511086e-05, + "loss": 1.7483, + "step": 2561 + }, + { + "epoch": 0.7776597359235089, + "grad_norm": 0.38786038756370544, + "learning_rate": 8.708109749924066e-05, + "loss": 1.6304, + "step": 2562 + }, + { + "epoch": 0.7779632721202003, + "grad_norm": 0.44066160917282104, + "learning_rate": 8.707603523337045e-05, + "loss": 1.8819, + "step": 2563 + }, + { + "epoch": 0.7782668083168918, + "grad_norm": 0.4141193628311157, + "learning_rate": 8.707097296750025e-05, + "loss": 1.5542, + "step": 2564 + }, + { + "epoch": 0.7785703445135832, + "grad_norm": 0.3722589910030365, + "learning_rate": 8.706591070163006e-05, + "loss": 1.792, + "step": 2565 + }, + { + "epoch": 0.7788738807102747, + "grad_norm": 0.4519922435283661, + "learning_rate": 8.706084843575985e-05, + "loss": 1.8324, + "step": 2566 + }, + { + "epoch": 0.7791774169069662, + "grad_norm": 0.41349706053733826, + "learning_rate": 8.705578616988964e-05, + "loss": 1.841, + "step": 2567 + }, + { + "epoch": 0.7794809531036576, + "grad_norm": 0.445417195558548, + "learning_rate": 8.705072390401944e-05, + "loss": 1.6788, + "step": 2568 + }, + { + "epoch": 0.779784489300349, + "grad_norm": 0.35337746143341064, + "learning_rate": 8.704566163814925e-05, + "loss": 1.5129, + "step": 2569 + }, + { + "epoch": 0.7800880254970405, + "grad_norm": 0.49805590510368347, + "learning_rate": 8.704059937227904e-05, + "loss": 1.7206, + "step": 2570 + }, + { + "epoch": 0.780391561693732, + "grad_norm": 0.3580697774887085, + "learning_rate": 8.703553710640884e-05, + "loss": 1.8458, + "step": 2571 + }, + { + "epoch": 0.7806950978904235, + "grad_norm": 0.557847797870636, + "learning_rate": 8.703047484053863e-05, + "loss": 1.824, + "step": 2572 + }, + { + "epoch": 0.7809986340871149, + "grad_norm": 1.6153925657272339, + "learning_rate": 8.702541257466843e-05, + "loss": 2.0839, + "step": 2573 + }, + { + "epoch": 0.7813021702838063, + "grad_norm": 0.44338542222976685, + "learning_rate": 8.702035030879822e-05, + "loss": 1.8798, + "step": 2574 + }, + { + "epoch": 0.7816057064804978, + "grad_norm": 0.4379113018512726, + "learning_rate": 8.701528804292802e-05, + "loss": 1.8003, + "step": 2575 + }, + { + "epoch": 0.7819092426771893, + "grad_norm": 0.42209142446517944, + "learning_rate": 8.701022577705783e-05, + "loss": 1.1716, + "step": 2576 + }, + { + "epoch": 0.7822127788738807, + "grad_norm": 0.4423658549785614, + "learning_rate": 8.700516351118762e-05, + "loss": 1.9534, + "step": 
2577 + }, + { + "epoch": 0.7825163150705722, + "grad_norm": 0.4544404149055481, + "learning_rate": 8.700010124531741e-05, + "loss": 1.3336, + "step": 2578 + }, + { + "epoch": 0.7828198512672636, + "grad_norm": 0.34568536281585693, + "learning_rate": 8.699503897944721e-05, + "loss": 1.9537, + "step": 2579 + }, + { + "epoch": 0.783123387463955, + "grad_norm": 0.545414924621582, + "learning_rate": 8.6989976713577e-05, + "loss": 1.9132, + "step": 2580 + }, + { + "epoch": 0.7834269236606465, + "grad_norm": 0.4345841705799103, + "learning_rate": 8.69849144477068e-05, + "loss": 1.7581, + "step": 2581 + }, + { + "epoch": 0.783730459857338, + "grad_norm": 0.4052067995071411, + "learning_rate": 8.69798521818366e-05, + "loss": 1.7303, + "step": 2582 + }, + { + "epoch": 0.7840339960540295, + "grad_norm": 0.34817397594451904, + "learning_rate": 8.697478991596639e-05, + "loss": 1.8363, + "step": 2583 + }, + { + "epoch": 0.7843375322507209, + "grad_norm": 0.3445320725440979, + "learning_rate": 8.696972765009618e-05, + "loss": 1.9267, + "step": 2584 + }, + { + "epoch": 0.7846410684474123, + "grad_norm": 0.43368205428123474, + "learning_rate": 8.696466538422599e-05, + "loss": 2.0271, + "step": 2585 + }, + { + "epoch": 0.7849446046441038, + "grad_norm": 0.4825034439563751, + "learning_rate": 8.695960311835579e-05, + "loss": 2.0727, + "step": 2586 + }, + { + "epoch": 0.7852481408407953, + "grad_norm": 0.6833105087280273, + "learning_rate": 8.695454085248558e-05, + "loss": 1.5662, + "step": 2587 + }, + { + "epoch": 0.7855516770374867, + "grad_norm": 0.3476558029651642, + "learning_rate": 8.694947858661538e-05, + "loss": 1.9023, + "step": 2588 + }, + { + "epoch": 0.7858552132341782, + "grad_norm": 0.49442049860954285, + "learning_rate": 8.694441632074517e-05, + "loss": 1.6733, + "step": 2589 + }, + { + "epoch": 0.7861587494308696, + "grad_norm": 1.6963638067245483, + "learning_rate": 8.693935405487497e-05, + "loss": 2.0808, + "step": 2590 + }, + { + "epoch": 0.786462285627561, + "grad_norm": 0.48170772194862366, + "learning_rate": 8.693429178900476e-05, + "loss": 1.6269, + "step": 2591 + }, + { + "epoch": 0.7867658218242526, + "grad_norm": 0.427327036857605, + "learning_rate": 8.692922952313456e-05, + "loss": 2.0448, + "step": 2592 + }, + { + "epoch": 0.787069358020944, + "grad_norm": 0.3641161322593689, + "learning_rate": 8.692416725726435e-05, + "loss": 2.0699, + "step": 2593 + }, + { + "epoch": 0.7873728942176355, + "grad_norm": 0.4324423372745514, + "learning_rate": 8.691910499139414e-05, + "loss": 1.1989, + "step": 2594 + }, + { + "epoch": 0.7876764304143269, + "grad_norm": 0.4303852617740631, + "learning_rate": 8.691404272552395e-05, + "loss": 1.866, + "step": 2595 + }, + { + "epoch": 0.7879799666110183, + "grad_norm": 0.36840641498565674, + "learning_rate": 8.690898045965375e-05, + "loss": 2.0927, + "step": 2596 + }, + { + "epoch": 0.7882835028077099, + "grad_norm": 0.43906763195991516, + "learning_rate": 8.690391819378354e-05, + "loss": 1.8755, + "step": 2597 + }, + { + "epoch": 0.7885870390044013, + "grad_norm": 0.43337517976760864, + "learning_rate": 8.689885592791334e-05, + "loss": 1.7263, + "step": 2598 + }, + { + "epoch": 0.7888905752010927, + "grad_norm": 0.35808295011520386, + "learning_rate": 8.689379366204313e-05, + "loss": 1.5395, + "step": 2599 + }, + { + "epoch": 0.7891941113977842, + "grad_norm": 0.4063914120197296, + "learning_rate": 8.688873139617293e-05, + "loss": 1.7125, + "step": 2600 + }, + { + "epoch": 0.7894976475944756, + "grad_norm": 0.35243427753448486, + "learning_rate": 
8.688366913030272e-05, + "loss": 1.705, + "step": 2601 + }, + { + "epoch": 0.7898011837911671, + "grad_norm": 0.4404586851596832, + "learning_rate": 8.687860686443252e-05, + "loss": 1.9805, + "step": 2602 + }, + { + "epoch": 0.7901047199878586, + "grad_norm": 0.45531004667282104, + "learning_rate": 8.687354459856231e-05, + "loss": 2.1248, + "step": 2603 + }, + { + "epoch": 0.79040825618455, + "grad_norm": 0.4575786292552948, + "learning_rate": 8.686848233269212e-05, + "loss": 2.0493, + "step": 2604 + }, + { + "epoch": 0.7907117923812415, + "grad_norm": 0.4143056571483612, + "learning_rate": 8.686342006682191e-05, + "loss": 1.7757, + "step": 2605 + }, + { + "epoch": 0.7910153285779329, + "grad_norm": 0.41257745027542114, + "learning_rate": 8.685835780095171e-05, + "loss": 2.1223, + "step": 2606 + }, + { + "epoch": 0.7913188647746243, + "grad_norm": 0.4308036267757416, + "learning_rate": 8.68532955350815e-05, + "loss": 1.8967, + "step": 2607 + }, + { + "epoch": 0.7916224009713159, + "grad_norm": 1.4339756965637207, + "learning_rate": 8.68482332692113e-05, + "loss": 2.0286, + "step": 2608 + }, + { + "epoch": 0.7919259371680073, + "grad_norm": 0.39570608735084534, + "learning_rate": 8.68431710033411e-05, + "loss": 2.0292, + "step": 2609 + }, + { + "epoch": 0.7922294733646987, + "grad_norm": 0.39638906717300415, + "learning_rate": 8.683810873747089e-05, + "loss": 1.9292, + "step": 2610 + }, + { + "epoch": 0.7925330095613902, + "grad_norm": 0.40838631987571716, + "learning_rate": 8.683304647160068e-05, + "loss": 1.9439, + "step": 2611 + }, + { + "epoch": 0.7928365457580816, + "grad_norm": 0.41017046570777893, + "learning_rate": 8.682798420573048e-05, + "loss": 1.7575, + "step": 2612 + }, + { + "epoch": 0.7931400819547731, + "grad_norm": 0.38030532002449036, + "learning_rate": 8.682292193986029e-05, + "loss": 2.0398, + "step": 2613 + }, + { + "epoch": 0.7934436181514646, + "grad_norm": 0.42547357082366943, + "learning_rate": 8.681785967399008e-05, + "loss": 1.9265, + "step": 2614 + }, + { + "epoch": 0.793747154348156, + "grad_norm": 0.42651450634002686, + "learning_rate": 8.681279740811989e-05, + "loss": 1.8808, + "step": 2615 + }, + { + "epoch": 0.7940506905448474, + "grad_norm": 0.4874178469181061, + "learning_rate": 8.680773514224968e-05, + "loss": 1.8267, + "step": 2616 + }, + { + "epoch": 0.7943542267415389, + "grad_norm": 0.4573056101799011, + "learning_rate": 8.680267287637948e-05, + "loss": 2.0073, + "step": 2617 + }, + { + "epoch": 0.7946577629382304, + "grad_norm": 0.4408004879951477, + "learning_rate": 8.679761061050927e-05, + "loss": 2.1719, + "step": 2618 + }, + { + "epoch": 0.7949612991349219, + "grad_norm": 0.41363367438316345, + "learning_rate": 8.679254834463907e-05, + "loss": 2.0006, + "step": 2619 + }, + { + "epoch": 0.7952648353316133, + "grad_norm": 0.3256136178970337, + "learning_rate": 8.678748607876886e-05, + "loss": 1.5214, + "step": 2620 + }, + { + "epoch": 0.7955683715283047, + "grad_norm": 0.3597501516342163, + "learning_rate": 8.678242381289866e-05, + "loss": 1.9775, + "step": 2621 + }, + { + "epoch": 0.7958719077249962, + "grad_norm": 0.43128228187561035, + "learning_rate": 8.677736154702845e-05, + "loss": 1.3915, + "step": 2622 + }, + { + "epoch": 0.7961754439216877, + "grad_norm": 0.6114957332611084, + "learning_rate": 8.677229928115825e-05, + "loss": 2.0548, + "step": 2623 + }, + { + "epoch": 0.7964789801183791, + "grad_norm": 0.6381771564483643, + "learning_rate": 8.676723701528806e-05, + "loss": 2.1353, + "step": 2624 + }, + { + "epoch": 0.7967825163150706, + 
"grad_norm": 0.39409366250038147, + "learning_rate": 8.676217474941785e-05, + "loss": 0.9988, + "step": 2625 + }, + { + "epoch": 0.797086052511762, + "grad_norm": 0.4145677387714386, + "learning_rate": 8.675711248354765e-05, + "loss": 2.1825, + "step": 2626 + }, + { + "epoch": 0.7973895887084534, + "grad_norm": 0.40860435366630554, + "learning_rate": 8.675205021767744e-05, + "loss": 2.1545, + "step": 2627 + }, + { + "epoch": 0.797693124905145, + "grad_norm": 0.42259758710861206, + "learning_rate": 8.674698795180724e-05, + "loss": 2.0872, + "step": 2628 + }, + { + "epoch": 0.7979966611018364, + "grad_norm": 0.9106017351150513, + "learning_rate": 8.674192568593703e-05, + "loss": 1.8756, + "step": 2629 + }, + { + "epoch": 0.7983001972985279, + "grad_norm": 0.4160531163215637, + "learning_rate": 8.673686342006683e-05, + "loss": 1.6454, + "step": 2630 + }, + { + "epoch": 0.7986037334952193, + "grad_norm": 0.4564226269721985, + "learning_rate": 8.673180115419662e-05, + "loss": 1.6036, + "step": 2631 + }, + { + "epoch": 0.7989072696919107, + "grad_norm": 0.5077611207962036, + "learning_rate": 8.672673888832641e-05, + "loss": 1.8217, + "step": 2632 + }, + { + "epoch": 0.7992108058886022, + "grad_norm": 0.3732128143310547, + "learning_rate": 8.672167662245621e-05, + "loss": 1.6299, + "step": 2633 + }, + { + "epoch": 0.7995143420852937, + "grad_norm": 0.4433646500110626, + "learning_rate": 8.671661435658602e-05, + "loss": 2.0876, + "step": 2634 + }, + { + "epoch": 0.7998178782819851, + "grad_norm": 0.3869750201702118, + "learning_rate": 8.671155209071581e-05, + "loss": 1.9312, + "step": 2635 + }, + { + "epoch": 0.8001214144786766, + "grad_norm": 0.3622623682022095, + "learning_rate": 8.670648982484561e-05, + "loss": 1.3211, + "step": 2636 + }, + { + "epoch": 0.800424950675368, + "grad_norm": 0.38390904664993286, + "learning_rate": 8.67014275589754e-05, + "loss": 2.0186, + "step": 2637 + }, + { + "epoch": 0.8007284868720594, + "grad_norm": 0.5641773343086243, + "learning_rate": 8.66963652931052e-05, + "loss": 1.5207, + "step": 2638 + }, + { + "epoch": 0.801032023068751, + "grad_norm": 0.399679571390152, + "learning_rate": 8.669130302723499e-05, + "loss": 1.621, + "step": 2639 + }, + { + "epoch": 0.8013355592654424, + "grad_norm": 0.38951337337493896, + "learning_rate": 8.668624076136479e-05, + "loss": 1.8445, + "step": 2640 + }, + { + "epoch": 0.8016390954621339, + "grad_norm": 0.47294196486473083, + "learning_rate": 8.668117849549458e-05, + "loss": 1.4963, + "step": 2641 + }, + { + "epoch": 0.8019426316588253, + "grad_norm": 0.4139798581600189, + "learning_rate": 8.667611622962438e-05, + "loss": 1.863, + "step": 2642 + }, + { + "epoch": 0.8022461678555167, + "grad_norm": 0.42392146587371826, + "learning_rate": 8.667105396375418e-05, + "loss": 1.4778, + "step": 2643 + }, + { + "epoch": 0.8025497040522083, + "grad_norm": 0.4208984673023224, + "learning_rate": 8.666599169788398e-05, + "loss": 1.8756, + "step": 2644 + }, + { + "epoch": 0.8028532402488997, + "grad_norm": 0.3515019714832306, + "learning_rate": 8.666092943201377e-05, + "loss": 1.8822, + "step": 2645 + }, + { + "epoch": 0.8031567764455911, + "grad_norm": 0.4780139923095703, + "learning_rate": 8.665586716614357e-05, + "loss": 1.7862, + "step": 2646 + }, + { + "epoch": 0.8034603126422826, + "grad_norm": 0.4419991970062256, + "learning_rate": 8.665080490027336e-05, + "loss": 2.0666, + "step": 2647 + }, + { + "epoch": 0.803763848838974, + "grad_norm": 0.43830469250679016, + "learning_rate": 8.664574263440316e-05, + "loss": 2.2725, + "step": 
2648 + }, + { + "epoch": 0.8040673850356655, + "grad_norm": 0.3989112377166748, + "learning_rate": 8.664068036853295e-05, + "loss": 1.9306, + "step": 2649 + }, + { + "epoch": 0.804370921232357, + "grad_norm": 0.3216220438480377, + "learning_rate": 8.663561810266275e-05, + "loss": 1.862, + "step": 2650 + }, + { + "epoch": 0.8046744574290484, + "grad_norm": 0.4546568989753723, + "learning_rate": 8.663055583679254e-05, + "loss": 2.0723, + "step": 2651 + }, + { + "epoch": 0.8049779936257399, + "grad_norm": 0.39314141869544983, + "learning_rate": 8.662549357092235e-05, + "loss": 1.7299, + "step": 2652 + }, + { + "epoch": 0.8052815298224313, + "grad_norm": 0.4112257957458496, + "learning_rate": 8.662043130505215e-05, + "loss": 1.2306, + "step": 2653 + }, + { + "epoch": 0.8055850660191228, + "grad_norm": 0.8900299072265625, + "learning_rate": 8.661536903918194e-05, + "loss": 2.0176, + "step": 2654 + }, + { + "epoch": 0.8058886022158143, + "grad_norm": 0.35671380162239075, + "learning_rate": 8.661030677331174e-05, + "loss": 1.521, + "step": 2655 + }, + { + "epoch": 0.8061921384125057, + "grad_norm": 0.3438098430633545, + "learning_rate": 8.660524450744153e-05, + "loss": 1.5401, + "step": 2656 + }, + { + "epoch": 0.8064956746091971, + "grad_norm": 0.4241732656955719, + "learning_rate": 8.660018224157133e-05, + "loss": 1.5139, + "step": 2657 + }, + { + "epoch": 0.8067992108058886, + "grad_norm": 0.41691192984580994, + "learning_rate": 8.659511997570113e-05, + "loss": 1.9927, + "step": 2658 + }, + { + "epoch": 0.8071027470025801, + "grad_norm": 0.36074796319007874, + "learning_rate": 8.659005770983093e-05, + "loss": 1.3125, + "step": 2659 + }, + { + "epoch": 0.8074062831992715, + "grad_norm": 0.503271222114563, + "learning_rate": 8.658499544396072e-05, + "loss": 2.0934, + "step": 2660 + }, + { + "epoch": 0.807709819395963, + "grad_norm": 0.47022250294685364, + "learning_rate": 8.657993317809052e-05, + "loss": 1.515, + "step": 2661 + }, + { + "epoch": 0.8080133555926544, + "grad_norm": 0.5267159938812256, + "learning_rate": 8.657487091222031e-05, + "loss": 1.7448, + "step": 2662 + }, + { + "epoch": 0.8083168917893458, + "grad_norm": 0.5382044315338135, + "learning_rate": 8.656980864635012e-05, + "loss": 1.6507, + "step": 2663 + }, + { + "epoch": 0.8086204279860373, + "grad_norm": 0.5040610432624817, + "learning_rate": 8.656474638047992e-05, + "loss": 1.3789, + "step": 2664 + }, + { + "epoch": 0.8089239641827288, + "grad_norm": 0.356317400932312, + "learning_rate": 8.655968411460971e-05, + "loss": 1.8226, + "step": 2665 + }, + { + "epoch": 0.8092275003794203, + "grad_norm": 0.38693082332611084, + "learning_rate": 8.65546218487395e-05, + "loss": 1.8358, + "step": 2666 + }, + { + "epoch": 0.8095310365761117, + "grad_norm": 0.42606496810913086, + "learning_rate": 8.65495595828693e-05, + "loss": 1.9002, + "step": 2667 + }, + { + "epoch": 0.8098345727728031, + "grad_norm": 0.3855800926685333, + "learning_rate": 8.65444973169991e-05, + "loss": 1.8488, + "step": 2668 + }, + { + "epoch": 0.8101381089694946, + "grad_norm": 0.46677157282829285, + "learning_rate": 8.653943505112889e-05, + "loss": 2.264, + "step": 2669 + }, + { + "epoch": 0.8104416451661861, + "grad_norm": 0.3479576110839844, + "learning_rate": 8.653437278525868e-05, + "loss": 1.796, + "step": 2670 + }, + { + "epoch": 0.8107451813628775, + "grad_norm": 0.4703936278820038, + "learning_rate": 8.652931051938848e-05, + "loss": 2.0457, + "step": 2671 + }, + { + "epoch": 0.811048717559569, + "grad_norm": 0.3478047847747803, + "learning_rate": 
8.652424825351827e-05, + "loss": 1.8033, + "step": 2672 + }, + { + "epoch": 0.8113522537562604, + "grad_norm": 0.4196014702320099, + "learning_rate": 8.651918598764808e-05, + "loss": 1.9513, + "step": 2673 + }, + { + "epoch": 0.8116557899529518, + "grad_norm": 0.36813899874687195, + "learning_rate": 8.651412372177788e-05, + "loss": 1.9776, + "step": 2674 + }, + { + "epoch": 0.8119593261496434, + "grad_norm": 0.44413039088249207, + "learning_rate": 8.650906145590767e-05, + "loss": 1.9041, + "step": 2675 + }, + { + "epoch": 0.8122628623463348, + "grad_norm": 0.4073639512062073, + "learning_rate": 8.650399919003747e-05, + "loss": 1.9749, + "step": 2676 + }, + { + "epoch": 0.8125663985430263, + "grad_norm": 0.3961658775806427, + "learning_rate": 8.649893692416726e-05, + "loss": 1.8646, + "step": 2677 + }, + { + "epoch": 0.8128699347397177, + "grad_norm": 0.536353349685669, + "learning_rate": 8.649387465829706e-05, + "loss": 1.7772, + "step": 2678 + }, + { + "epoch": 0.8131734709364091, + "grad_norm": 0.4030105471611023, + "learning_rate": 8.648881239242685e-05, + "loss": 2.1668, + "step": 2679 + }, + { + "epoch": 0.8134770071331007, + "grad_norm": 0.4185904264450073, + "learning_rate": 8.648375012655665e-05, + "loss": 1.877, + "step": 2680 + }, + { + "epoch": 0.8137805433297921, + "grad_norm": 0.4530700445175171, + "learning_rate": 8.647868786068644e-05, + "loss": 1.7669, + "step": 2681 + }, + { + "epoch": 0.8140840795264835, + "grad_norm": 0.7239555716514587, + "learning_rate": 8.647362559481625e-05, + "loss": 1.7399, + "step": 2682 + }, + { + "epoch": 0.814387615723175, + "grad_norm": 0.5373411178588867, + "learning_rate": 8.646856332894604e-05, + "loss": 1.3157, + "step": 2683 + }, + { + "epoch": 0.8146911519198664, + "grad_norm": 0.41730010509490967, + "learning_rate": 8.646350106307584e-05, + "loss": 1.9912, + "step": 2684 + }, + { + "epoch": 0.814994688116558, + "grad_norm": 0.4486635625362396, + "learning_rate": 8.645843879720563e-05, + "loss": 1.9651, + "step": 2685 + }, + { + "epoch": 0.8152982243132494, + "grad_norm": 0.5648373961448669, + "learning_rate": 8.645337653133543e-05, + "loss": 1.7917, + "step": 2686 + }, + { + "epoch": 0.8156017605099408, + "grad_norm": 0.4445558488368988, + "learning_rate": 8.644831426546522e-05, + "loss": 1.71, + "step": 2687 + }, + { + "epoch": 0.8159052967066323, + "grad_norm": 0.4051614999771118, + "learning_rate": 8.644325199959502e-05, + "loss": 2.0976, + "step": 2688 + }, + { + "epoch": 0.8162088329033237, + "grad_norm": 0.41357868909835815, + "learning_rate": 8.643818973372481e-05, + "loss": 1.8658, + "step": 2689 + }, + { + "epoch": 0.8165123691000151, + "grad_norm": 0.43194282054901123, + "learning_rate": 8.643312746785461e-05, + "loss": 1.3586, + "step": 2690 + }, + { + "epoch": 0.8168159052967067, + "grad_norm": 0.4093270003795624, + "learning_rate": 8.642806520198442e-05, + "loss": 1.8946, + "step": 2691 + }, + { + "epoch": 0.8171194414933981, + "grad_norm": 0.5622807741165161, + "learning_rate": 8.642300293611421e-05, + "loss": 1.9175, + "step": 2692 + }, + { + "epoch": 0.8174229776900895, + "grad_norm": 0.41735681891441345, + "learning_rate": 8.6417940670244e-05, + "loss": 2.0407, + "step": 2693 + }, + { + "epoch": 0.817726513886781, + "grad_norm": 0.4518575966358185, + "learning_rate": 8.64128784043738e-05, + "loss": 1.8137, + "step": 2694 + }, + { + "epoch": 0.8180300500834724, + "grad_norm": 0.34537529945373535, + "learning_rate": 8.64078161385036e-05, + "loss": 1.7952, + "step": 2695 + }, + { + "epoch": 0.818333586280164, + 
"grad_norm": 0.44123801589012146, + "learning_rate": 8.640275387263339e-05, + "loss": 1.74, + "step": 2696 + }, + { + "epoch": 0.8186371224768554, + "grad_norm": 0.36231714487075806, + "learning_rate": 8.639769160676318e-05, + "loss": 1.9205, + "step": 2697 + }, + { + "epoch": 0.8189406586735468, + "grad_norm": 0.4573259651660919, + "learning_rate": 8.639262934089298e-05, + "loss": 1.8026, + "step": 2698 + }, + { + "epoch": 0.8192441948702383, + "grad_norm": 0.43909579515457153, + "learning_rate": 8.638756707502277e-05, + "loss": 1.9559, + "step": 2699 + }, + { + "epoch": 0.8195477310669297, + "grad_norm": 0.4051404893398285, + "learning_rate": 8.638250480915257e-05, + "loss": 1.9962, + "step": 2700 + }, + { + "epoch": 0.8198512672636212, + "grad_norm": 0.41793152689933777, + "learning_rate": 8.637744254328238e-05, + "loss": 1.7749, + "step": 2701 + }, + { + "epoch": 0.8201548034603127, + "grad_norm": 0.5424450039863586, + "learning_rate": 8.637238027741219e-05, + "loss": 1.8725, + "step": 2702 + }, + { + "epoch": 0.8204583396570041, + "grad_norm": 0.39918193221092224, + "learning_rate": 8.636731801154198e-05, + "loss": 2.0224, + "step": 2703 + }, + { + "epoch": 0.8207618758536955, + "grad_norm": 0.40323561429977417, + "learning_rate": 8.636225574567178e-05, + "loss": 1.9338, + "step": 2704 + }, + { + "epoch": 0.821065412050387, + "grad_norm": 0.40484192967414856, + "learning_rate": 8.635719347980157e-05, + "loss": 1.8665, + "step": 2705 + }, + { + "epoch": 0.8213689482470785, + "grad_norm": 0.45353245735168457, + "learning_rate": 8.635213121393137e-05, + "loss": 2.0364, + "step": 2706 + }, + { + "epoch": 0.82167248444377, + "grad_norm": 0.5369464755058289, + "learning_rate": 8.634706894806116e-05, + "loss": 1.3729, + "step": 2707 + }, + { + "epoch": 0.8219760206404614, + "grad_norm": 0.3175603449344635, + "learning_rate": 8.634200668219095e-05, + "loss": 1.6742, + "step": 2708 + }, + { + "epoch": 0.8222795568371528, + "grad_norm": 0.4314495623111725, + "learning_rate": 8.633694441632075e-05, + "loss": 1.9536, + "step": 2709 + }, + { + "epoch": 0.8225830930338442, + "grad_norm": 0.4610050916671753, + "learning_rate": 8.633188215045054e-05, + "loss": 2.0608, + "step": 2710 + }, + { + "epoch": 0.8228866292305358, + "grad_norm": 0.3542473018169403, + "learning_rate": 8.632681988458034e-05, + "loss": 1.5889, + "step": 2711 + }, + { + "epoch": 0.8231901654272272, + "grad_norm": 0.4445483684539795, + "learning_rate": 8.632175761871015e-05, + "loss": 2.0438, + "step": 2712 + }, + { + "epoch": 0.8234937016239187, + "grad_norm": 0.42590487003326416, + "learning_rate": 8.631669535283994e-05, + "loss": 1.6821, + "step": 2713 + }, + { + "epoch": 0.8237972378206101, + "grad_norm": 0.3951219618320465, + "learning_rate": 8.631163308696974e-05, + "loss": 1.8549, + "step": 2714 + }, + { + "epoch": 0.8241007740173015, + "grad_norm": 0.4422662556171417, + "learning_rate": 8.630657082109953e-05, + "loss": 1.0172, + "step": 2715 + }, + { + "epoch": 0.824404310213993, + "grad_norm": 0.6093502640724182, + "learning_rate": 8.630150855522933e-05, + "loss": 1.469, + "step": 2716 + }, + { + "epoch": 0.8247078464106845, + "grad_norm": 0.6702497005462646, + "learning_rate": 8.629644628935912e-05, + "loss": 1.7706, + "step": 2717 + }, + { + "epoch": 0.8250113826073759, + "grad_norm": 0.4154108166694641, + "learning_rate": 8.629138402348892e-05, + "loss": 2.0392, + "step": 2718 + }, + { + "epoch": 0.8253149188040674, + "grad_norm": 0.4183025062084198, + "learning_rate": 8.628632175761871e-05, + "loss": 1.648, + "step": 
2719 + }, + { + "epoch": 0.8256184550007588, + "grad_norm": 0.40831395983695984, + "learning_rate": 8.62812594917485e-05, + "loss": 1.8159, + "step": 2720 + }, + { + "epoch": 0.8259219911974502, + "grad_norm": 0.3942376673221588, + "learning_rate": 8.627619722587831e-05, + "loss": 2.1595, + "step": 2721 + }, + { + "epoch": 0.8262255273941418, + "grad_norm": 0.4016304314136505, + "learning_rate": 8.627113496000811e-05, + "loss": 1.9915, + "step": 2722 + }, + { + "epoch": 0.8265290635908332, + "grad_norm": 0.43526315689086914, + "learning_rate": 8.62660726941379e-05, + "loss": 2.009, + "step": 2723 + }, + { + "epoch": 0.8268325997875247, + "grad_norm": 0.4215218424797058, + "learning_rate": 8.62610104282677e-05, + "loss": 1.6511, + "step": 2724 + }, + { + "epoch": 0.8271361359842161, + "grad_norm": 0.38574057817459106, + "learning_rate": 8.62559481623975e-05, + "loss": 1.6658, + "step": 2725 + }, + { + "epoch": 0.8274396721809075, + "grad_norm": 0.4340943694114685, + "learning_rate": 8.625088589652729e-05, + "loss": 1.9324, + "step": 2726 + }, + { + "epoch": 0.8277432083775991, + "grad_norm": 0.474386066198349, + "learning_rate": 8.624582363065708e-05, + "loss": 1.7866, + "step": 2727 + }, + { + "epoch": 0.8280467445742905, + "grad_norm": 0.4177556335926056, + "learning_rate": 8.624076136478688e-05, + "loss": 1.4593, + "step": 2728 + }, + { + "epoch": 0.8283502807709819, + "grad_norm": 0.412219375371933, + "learning_rate": 8.623569909891667e-05, + "loss": 1.592, + "step": 2729 + }, + { + "epoch": 0.8286538169676734, + "grad_norm": 0.5068672895431519, + "learning_rate": 8.623063683304648e-05, + "loss": 2.0902, + "step": 2730 + }, + { + "epoch": 0.8289573531643648, + "grad_norm": 0.43211403489112854, + "learning_rate": 8.622557456717628e-05, + "loss": 1.1154, + "step": 2731 + }, + { + "epoch": 0.8292608893610564, + "grad_norm": 0.37955889105796814, + "learning_rate": 8.622051230130607e-05, + "loss": 1.9858, + "step": 2732 + }, + { + "epoch": 0.8295644255577478, + "grad_norm": 0.4571453928947449, + "learning_rate": 8.621545003543587e-05, + "loss": 1.6966, + "step": 2733 + }, + { + "epoch": 0.8298679617544392, + "grad_norm": 0.43323561549186707, + "learning_rate": 8.621038776956566e-05, + "loss": 2.0712, + "step": 2734 + }, + { + "epoch": 0.8301714979511307, + "grad_norm": 0.4051443338394165, + "learning_rate": 8.620532550369545e-05, + "loss": 1.9132, + "step": 2735 + }, + { + "epoch": 0.8304750341478221, + "grad_norm": 0.4727547764778137, + "learning_rate": 8.620026323782525e-05, + "loss": 2.0754, + "step": 2736 + }, + { + "epoch": 0.8307785703445136, + "grad_norm": 0.8304808139801025, + "learning_rate": 8.619520097195504e-05, + "loss": 1.3452, + "step": 2737 + }, + { + "epoch": 0.8310821065412051, + "grad_norm": 0.40300288796424866, + "learning_rate": 8.619013870608484e-05, + "loss": 1.903, + "step": 2738 + }, + { + "epoch": 0.8313856427378965, + "grad_norm": 0.4302805960178375, + "learning_rate": 8.618507644021463e-05, + "loss": 1.9938, + "step": 2739 + }, + { + "epoch": 0.8316891789345879, + "grad_norm": 0.41586950421333313, + "learning_rate": 8.618001417434444e-05, + "loss": 1.8044, + "step": 2740 + }, + { + "epoch": 0.8319927151312794, + "grad_norm": 0.4185795187950134, + "learning_rate": 8.617495190847424e-05, + "loss": 2.0513, + "step": 2741 + }, + { + "epoch": 0.8322962513279709, + "grad_norm": 0.4664061367511749, + "learning_rate": 8.616988964260403e-05, + "loss": 1.6634, + "step": 2742 + }, + { + "epoch": 0.8325997875246623, + "grad_norm": 0.44080016016960144, + "learning_rate": 
8.616482737673383e-05, + "loss": 1.854, + "step": 2743 + }, + { + "epoch": 0.8329033237213538, + "grad_norm": 0.4284375011920929, + "learning_rate": 8.615976511086362e-05, + "loss": 1.9713, + "step": 2744 + }, + { + "epoch": 0.8332068599180452, + "grad_norm": 0.42498892545700073, + "learning_rate": 8.615470284499342e-05, + "loss": 1.5883, + "step": 2745 + }, + { + "epoch": 0.8335103961147367, + "grad_norm": 0.5301217436790466, + "learning_rate": 8.614964057912321e-05, + "loss": 0.8439, + "step": 2746 + }, + { + "epoch": 0.8338139323114281, + "grad_norm": 0.5539612174034119, + "learning_rate": 8.614457831325302e-05, + "loss": 2.1118, + "step": 2747 + }, + { + "epoch": 0.8341174685081196, + "grad_norm": 0.47817254066467285, + "learning_rate": 8.613951604738281e-05, + "loss": 2.1536, + "step": 2748 + }, + { + "epoch": 0.8344210047048111, + "grad_norm": 0.3291810154914856, + "learning_rate": 8.613445378151261e-05, + "loss": 1.951, + "step": 2749 + }, + { + "epoch": 0.8347245409015025, + "grad_norm": 0.365369975566864, + "learning_rate": 8.61293915156424e-05, + "loss": 1.4365, + "step": 2750 + }, + { + "epoch": 0.8350280770981939, + "grad_norm": 0.4028696119785309, + "learning_rate": 8.612432924977221e-05, + "loss": 2.2443, + "step": 2751 + }, + { + "epoch": 0.8353316132948854, + "grad_norm": 0.40695205330848694, + "learning_rate": 8.611926698390201e-05, + "loss": 2.0337, + "step": 2752 + }, + { + "epoch": 0.8356351494915769, + "grad_norm": 1.1030051708221436, + "learning_rate": 8.61142047180318e-05, + "loss": 1.9415, + "step": 2753 + }, + { + "epoch": 0.8359386856882683, + "grad_norm": 0.36230286955833435, + "learning_rate": 8.61091424521616e-05, + "loss": 1.7159, + "step": 2754 + }, + { + "epoch": 0.8362422218849598, + "grad_norm": 0.36546850204467773, + "learning_rate": 8.610408018629139e-05, + "loss": 1.7412, + "step": 2755 + }, + { + "epoch": 0.8365457580816512, + "grad_norm": 0.3433781564235687, + "learning_rate": 8.609901792042119e-05, + "loss": 1.6014, + "step": 2756 + }, + { + "epoch": 0.8368492942783426, + "grad_norm": 0.4202607274055481, + "learning_rate": 8.609395565455098e-05, + "loss": 1.8733, + "step": 2757 + }, + { + "epoch": 0.8371528304750342, + "grad_norm": 0.41112053394317627, + "learning_rate": 8.608889338868078e-05, + "loss": 1.8103, + "step": 2758 + }, + { + "epoch": 0.8374563666717256, + "grad_norm": 0.44569066166877747, + "learning_rate": 8.608383112281057e-05, + "loss": 1.3989, + "step": 2759 + }, + { + "epoch": 0.8377599028684171, + "grad_norm": 0.4139041602611542, + "learning_rate": 8.607876885694038e-05, + "loss": 1.8406, + "step": 2760 + }, + { + "epoch": 0.8380634390651085, + "grad_norm": 0.40701228380203247, + "learning_rate": 8.607370659107017e-05, + "loss": 2.1573, + "step": 2761 + }, + { + "epoch": 0.8383669752617999, + "grad_norm": 0.44878578186035156, + "learning_rate": 8.606864432519997e-05, + "loss": 1.5094, + "step": 2762 + }, + { + "epoch": 0.8386705114584915, + "grad_norm": 0.5021698474884033, + "learning_rate": 8.606358205932976e-05, + "loss": 2.1396, + "step": 2763 + }, + { + "epoch": 0.8389740476551829, + "grad_norm": 0.5416968464851379, + "learning_rate": 8.605851979345956e-05, + "loss": 1.974, + "step": 2764 + }, + { + "epoch": 0.8392775838518743, + "grad_norm": 0.5953572988510132, + "learning_rate": 8.605345752758935e-05, + "loss": 1.7463, + "step": 2765 + }, + { + "epoch": 0.8395811200485658, + "grad_norm": 0.43414852023124695, + "learning_rate": 8.604839526171915e-05, + "loss": 1.7121, + "step": 2766 + }, + { + "epoch": 0.8398846562452572, + 
"grad_norm": 0.4000817537307739, + "learning_rate": 8.604333299584894e-05, + "loss": 2.1051, + "step": 2767 + }, + { + "epoch": 0.8401881924419488, + "grad_norm": 0.6544987559318542, + "learning_rate": 8.603827072997874e-05, + "loss": 2.063, + "step": 2768 + }, + { + "epoch": 0.8404917286386402, + "grad_norm": 0.7102285623550415, + "learning_rate": 8.603320846410855e-05, + "loss": 2.2053, + "step": 2769 + }, + { + "epoch": 0.8407952648353316, + "grad_norm": 0.39218565821647644, + "learning_rate": 8.602814619823834e-05, + "loss": 1.2981, + "step": 2770 + }, + { + "epoch": 0.8410988010320231, + "grad_norm": 0.42591944336891174, + "learning_rate": 8.602308393236814e-05, + "loss": 2.0123, + "step": 2771 + }, + { + "epoch": 0.8414023372287145, + "grad_norm": 0.3958960175514221, + "learning_rate": 8.601802166649793e-05, + "loss": 2.1612, + "step": 2772 + }, + { + "epoch": 0.8417058734254059, + "grad_norm": 0.4331991672515869, + "learning_rate": 8.601295940062772e-05, + "loss": 1.8874, + "step": 2773 + }, + { + "epoch": 0.8420094096220975, + "grad_norm": 0.42542657256126404, + "learning_rate": 8.600789713475752e-05, + "loss": 1.3892, + "step": 2774 + }, + { + "epoch": 0.8423129458187889, + "grad_norm": 0.3944099545478821, + "learning_rate": 8.600283486888731e-05, + "loss": 1.8069, + "step": 2775 + }, + { + "epoch": 0.8426164820154803, + "grad_norm": 0.42040184140205383, + "learning_rate": 8.599777260301711e-05, + "loss": 1.8774, + "step": 2776 + }, + { + "epoch": 0.8429200182121718, + "grad_norm": 0.4341401755809784, + "learning_rate": 8.59927103371469e-05, + "loss": 1.9773, + "step": 2777 + }, + { + "epoch": 0.8432235544088632, + "grad_norm": 0.4372880458831787, + "learning_rate": 8.59876480712767e-05, + "loss": 2.0873, + "step": 2778 + }, + { + "epoch": 0.8435270906055548, + "grad_norm": 0.3675346374511719, + "learning_rate": 8.598258580540651e-05, + "loss": 1.1921, + "step": 2779 + }, + { + "epoch": 0.8438306268022462, + "grad_norm": 0.39204880595207214, + "learning_rate": 8.59775235395363e-05, + "loss": 1.9907, + "step": 2780 + }, + { + "epoch": 0.8441341629989376, + "grad_norm": 0.4221871793270111, + "learning_rate": 8.59724612736661e-05, + "loss": 1.7039, + "step": 2781 + }, + { + "epoch": 0.844437699195629, + "grad_norm": 0.4046156406402588, + "learning_rate": 8.596739900779589e-05, + "loss": 2.1068, + "step": 2782 + }, + { + "epoch": 0.8447412353923205, + "grad_norm": 0.42431220412254333, + "learning_rate": 8.596233674192569e-05, + "loss": 1.7547, + "step": 2783 + }, + { + "epoch": 0.845044771589012, + "grad_norm": 0.33057859539985657, + "learning_rate": 8.595727447605548e-05, + "loss": 1.6987, + "step": 2784 + }, + { + "epoch": 0.8453483077857035, + "grad_norm": 0.40820497274398804, + "learning_rate": 8.595221221018528e-05, + "loss": 1.6101, + "step": 2785 + }, + { + "epoch": 0.8456518439823949, + "grad_norm": 0.6457285284996033, + "learning_rate": 8.594714994431507e-05, + "loss": 1.927, + "step": 2786 + }, + { + "epoch": 0.8459553801790863, + "grad_norm": 0.4055453836917877, + "learning_rate": 8.594208767844487e-05, + "loss": 1.9902, + "step": 2787 + }, + { + "epoch": 0.8462589163757778, + "grad_norm": 0.45660969614982605, + "learning_rate": 8.593702541257467e-05, + "loss": 1.8487, + "step": 2788 + }, + { + "epoch": 0.8465624525724693, + "grad_norm": 0.4082806408405304, + "learning_rate": 8.593196314670447e-05, + "loss": 1.7797, + "step": 2789 + }, + { + "epoch": 0.8468659887691607, + "grad_norm": 0.39490821957588196, + "learning_rate": 8.592690088083426e-05, + "loss": 1.9501, + "step": 
2790 + }, + { + "epoch": 0.8471695249658522, + "grad_norm": 0.47634264826774597, + "learning_rate": 8.592183861496407e-05, + "loss": 1.5822, + "step": 2791 + }, + { + "epoch": 0.8474730611625436, + "grad_norm": 0.4166494607925415, + "learning_rate": 8.591677634909387e-05, + "loss": 2.0393, + "step": 2792 + }, + { + "epoch": 0.847776597359235, + "grad_norm": 0.3837972581386566, + "learning_rate": 8.591171408322366e-05, + "loss": 1.8317, + "step": 2793 + }, + { + "epoch": 0.8480801335559266, + "grad_norm": 0.3955104947090149, + "learning_rate": 8.590665181735346e-05, + "loss": 2.0546, + "step": 2794 + }, + { + "epoch": 0.848383669752618, + "grad_norm": 0.35945233702659607, + "learning_rate": 8.590158955148325e-05, + "loss": 2.0985, + "step": 2795 + }, + { + "epoch": 0.8486872059493095, + "grad_norm": 0.5097954869270325, + "learning_rate": 8.589652728561305e-05, + "loss": 1.6378, + "step": 2796 + }, + { + "epoch": 0.8489907421460009, + "grad_norm": 0.37827685475349426, + "learning_rate": 8.589146501974284e-05, + "loss": 1.9986, + "step": 2797 + }, + { + "epoch": 0.8492942783426923, + "grad_norm": 0.39725548028945923, + "learning_rate": 8.588640275387264e-05, + "loss": 1.9275, + "step": 2798 + }, + { + "epoch": 0.8495978145393838, + "grad_norm": 0.3660275936126709, + "learning_rate": 8.588134048800244e-05, + "loss": 1.7952, + "step": 2799 + }, + { + "epoch": 0.8499013507360753, + "grad_norm": 0.7100840210914612, + "learning_rate": 8.587627822213224e-05, + "loss": 1.8986, + "step": 2800 + }, + { + "epoch": 0.8502048869327667, + "grad_norm": 0.4502932131290436, + "learning_rate": 8.587121595626203e-05, + "loss": 1.8977, + "step": 2801 + }, + { + "epoch": 0.8505084231294582, + "grad_norm": 0.3382154107093811, + "learning_rate": 8.586615369039183e-05, + "loss": 1.8838, + "step": 2802 + }, + { + "epoch": 0.8508119593261496, + "grad_norm": 0.42528000473976135, + "learning_rate": 8.586109142452162e-05, + "loss": 2.4048, + "step": 2803 + }, + { + "epoch": 0.851115495522841, + "grad_norm": 0.41296571493148804, + "learning_rate": 8.585602915865142e-05, + "loss": 1.5728, + "step": 2804 + }, + { + "epoch": 0.8514190317195326, + "grad_norm": 0.4023008644580841, + "learning_rate": 8.585096689278121e-05, + "loss": 1.9834, + "step": 2805 + }, + { + "epoch": 0.851722567916224, + "grad_norm": 0.34381431341171265, + "learning_rate": 8.584590462691101e-05, + "loss": 2.0097, + "step": 2806 + }, + { + "epoch": 0.8520261041129155, + "grad_norm": 0.3973744809627533, + "learning_rate": 8.58408423610408e-05, + "loss": 2.008, + "step": 2807 + }, + { + "epoch": 0.8523296403096069, + "grad_norm": 0.40327900648117065, + "learning_rate": 8.583578009517061e-05, + "loss": 1.9841, + "step": 2808 + }, + { + "epoch": 0.8526331765062983, + "grad_norm": 0.3598316013813019, + "learning_rate": 8.58307178293004e-05, + "loss": 1.9542, + "step": 2809 + }, + { + "epoch": 0.8529367127029899, + "grad_norm": 0.4372098743915558, + "learning_rate": 8.58256555634302e-05, + "loss": 1.7054, + "step": 2810 + }, + { + "epoch": 0.8532402488996813, + "grad_norm": 0.39820167422294617, + "learning_rate": 8.582059329756e-05, + "loss": 1.6737, + "step": 2811 + }, + { + "epoch": 0.8535437850963727, + "grad_norm": 0.45620962977409363, + "learning_rate": 8.581553103168979e-05, + "loss": 1.895, + "step": 2812 + }, + { + "epoch": 0.8538473212930642, + "grad_norm": 0.4112420678138733, + "learning_rate": 8.581046876581958e-05, + "loss": 2.0469, + "step": 2813 + }, + { + "epoch": 0.8541508574897556, + "grad_norm": 0.4265507161617279, + "learning_rate": 
8.580540649994938e-05, + "loss": 1.8141, + "step": 2814 + }, + { + "epoch": 0.8544543936864472, + "grad_norm": 0.9317876696586609, + "learning_rate": 8.580034423407917e-05, + "loss": 1.9912, + "step": 2815 + }, + { + "epoch": 0.8547579298831386, + "grad_norm": 0.41293710470199585, + "learning_rate": 8.579528196820897e-05, + "loss": 2.0593, + "step": 2816 + }, + { + "epoch": 0.85506146607983, + "grad_norm": 0.6074060201644897, + "learning_rate": 8.579021970233876e-05, + "loss": 1.7091, + "step": 2817 + }, + { + "epoch": 0.8553650022765215, + "grad_norm": 0.39665672183036804, + "learning_rate": 8.578515743646857e-05, + "loss": 1.5974, + "step": 2818 + }, + { + "epoch": 0.8556685384732129, + "grad_norm": 0.34235861897468567, + "learning_rate": 8.578009517059837e-05, + "loss": 1.7635, + "step": 2819 + }, + { + "epoch": 0.8559720746699044, + "grad_norm": 0.416742742061615, + "learning_rate": 8.577503290472816e-05, + "loss": 1.6473, + "step": 2820 + }, + { + "epoch": 0.8562756108665959, + "grad_norm": 0.41152289509773254, + "learning_rate": 8.576997063885796e-05, + "loss": 1.8142, + "step": 2821 + }, + { + "epoch": 0.8565791470632873, + "grad_norm": 0.44638922810554504, + "learning_rate": 8.576490837298775e-05, + "loss": 1.5819, + "step": 2822 + }, + { + "epoch": 0.8568826832599787, + "grad_norm": 0.38064852356910706, + "learning_rate": 8.575984610711755e-05, + "loss": 1.4263, + "step": 2823 + }, + { + "epoch": 0.8571862194566702, + "grad_norm": 0.41755181550979614, + "learning_rate": 8.575478384124734e-05, + "loss": 1.5668, + "step": 2824 + }, + { + "epoch": 0.8574897556533617, + "grad_norm": 0.45153340697288513, + "learning_rate": 8.574972157537714e-05, + "loss": 1.6601, + "step": 2825 + }, + { + "epoch": 0.8577932918500532, + "grad_norm": 0.3700641989707947, + "learning_rate": 8.574465930950693e-05, + "loss": 1.3577, + "step": 2826 + }, + { + "epoch": 0.8580968280467446, + "grad_norm": 0.44846633076667786, + "learning_rate": 8.573959704363674e-05, + "loss": 2.0195, + "step": 2827 + }, + { + "epoch": 0.858400364243436, + "grad_norm": 0.4378660023212433, + "learning_rate": 8.573453477776653e-05, + "loss": 1.7914, + "step": 2828 + }, + { + "epoch": 0.8587039004401275, + "grad_norm": 0.4284498691558838, + "learning_rate": 8.572947251189633e-05, + "loss": 1.8443, + "step": 2829 + }, + { + "epoch": 0.8590074366368189, + "grad_norm": 0.4318150579929352, + "learning_rate": 8.572441024602612e-05, + "loss": 1.7671, + "step": 2830 + }, + { + "epoch": 0.8593109728335104, + "grad_norm": 1.0121327638626099, + "learning_rate": 8.571934798015592e-05, + "loss": 1.9886, + "step": 2831 + }, + { + "epoch": 0.8596145090302019, + "grad_norm": 0.4319281578063965, + "learning_rate": 8.571428571428571e-05, + "loss": 1.8299, + "step": 2832 + }, + { + "epoch": 0.8599180452268933, + "grad_norm": 0.42897358536720276, + "learning_rate": 8.570922344841551e-05, + "loss": 2.0147, + "step": 2833 + }, + { + "epoch": 0.8602215814235847, + "grad_norm": 0.39335522055625916, + "learning_rate": 8.57041611825453e-05, + "loss": 1.5326, + "step": 2834 + }, + { + "epoch": 0.8605251176202762, + "grad_norm": 1.2661360502243042, + "learning_rate": 8.56990989166751e-05, + "loss": 2.1234, + "step": 2835 + }, + { + "epoch": 0.8608286538169677, + "grad_norm": 0.7632877230644226, + "learning_rate": 8.56940366508049e-05, + "loss": 1.8358, + "step": 2836 + }, + { + "epoch": 0.8611321900136591, + "grad_norm": 0.3894922733306885, + "learning_rate": 8.56889743849347e-05, + "loss": 1.9101, + "step": 2837 + }, + { + "epoch": 0.8614357262103506, + 
"grad_norm": 0.3832629919052124, + "learning_rate": 8.568391211906451e-05, + "loss": 1.772, + "step": 2838 + }, + { + "epoch": 0.861739262407042, + "grad_norm": 0.4298574924468994, + "learning_rate": 8.56788498531943e-05, + "loss": 1.4439, + "step": 2839 + }, + { + "epoch": 0.8620427986037335, + "grad_norm": 0.44331902265548706, + "learning_rate": 8.56737875873241e-05, + "loss": 1.9716, + "step": 2840 + }, + { + "epoch": 0.862346334800425, + "grad_norm": 0.43073487281799316, + "learning_rate": 8.566872532145389e-05, + "loss": 2.1116, + "step": 2841 + }, + { + "epoch": 0.8626498709971164, + "grad_norm": 0.4528077244758606, + "learning_rate": 8.566366305558369e-05, + "loss": 1.8215, + "step": 2842 + }, + { + "epoch": 0.8629534071938079, + "grad_norm": 0.43540868163108826, + "learning_rate": 8.565860078971348e-05, + "loss": 1.9536, + "step": 2843 + }, + { + "epoch": 0.8632569433904993, + "grad_norm": 0.4424208998680115, + "learning_rate": 8.565353852384328e-05, + "loss": 2.0443, + "step": 2844 + }, + { + "epoch": 0.8635604795871907, + "grad_norm": 0.42500391602516174, + "learning_rate": 8.564847625797307e-05, + "loss": 1.7454, + "step": 2845 + }, + { + "epoch": 0.8638640157838823, + "grad_norm": 0.5110988020896912, + "learning_rate": 8.564341399210287e-05, + "loss": 1.9146, + "step": 2846 + }, + { + "epoch": 0.8641675519805737, + "grad_norm": 0.44191688299179077, + "learning_rate": 8.563835172623268e-05, + "loss": 2.0113, + "step": 2847 + }, + { + "epoch": 0.8644710881772651, + "grad_norm": 0.42467302083969116, + "learning_rate": 8.563328946036247e-05, + "loss": 1.8504, + "step": 2848 + }, + { + "epoch": 0.8647746243739566, + "grad_norm": 0.48334258794784546, + "learning_rate": 8.562822719449226e-05, + "loss": 1.9385, + "step": 2849 + }, + { + "epoch": 0.865078160570648, + "grad_norm": 0.42993229627609253, + "learning_rate": 8.562316492862206e-05, + "loss": 1.7958, + "step": 2850 + }, + { + "epoch": 0.8653816967673396, + "grad_norm": 0.391629695892334, + "learning_rate": 8.561810266275185e-05, + "loss": 1.7413, + "step": 2851 + }, + { + "epoch": 0.865685232964031, + "grad_norm": 0.46686479449272156, + "learning_rate": 8.561304039688165e-05, + "loss": 1.9155, + "step": 2852 + }, + { + "epoch": 0.8659887691607224, + "grad_norm": 0.41826534271240234, + "learning_rate": 8.560797813101144e-05, + "loss": 1.7625, + "step": 2853 + }, + { + "epoch": 0.8662923053574139, + "grad_norm": 0.42303943634033203, + "learning_rate": 8.560291586514124e-05, + "loss": 2.0604, + "step": 2854 + }, + { + "epoch": 0.8665958415541053, + "grad_norm": 0.42215773463249207, + "learning_rate": 8.559785359927103e-05, + "loss": 2.0058, + "step": 2855 + }, + { + "epoch": 0.8668993777507967, + "grad_norm": 0.45129135251045227, + "learning_rate": 8.559279133340083e-05, + "loss": 1.7881, + "step": 2856 + }, + { + "epoch": 0.8672029139474883, + "grad_norm": 0.41676831245422363, + "learning_rate": 8.558772906753064e-05, + "loss": 1.8624, + "step": 2857 + }, + { + "epoch": 0.8675064501441797, + "grad_norm": 0.4166240990161896, + "learning_rate": 8.558266680166043e-05, + "loss": 1.8828, + "step": 2858 + }, + { + "epoch": 0.8678099863408711, + "grad_norm": 0.407652348279953, + "learning_rate": 8.557760453579023e-05, + "loss": 1.7462, + "step": 2859 + }, + { + "epoch": 0.8681135225375626, + "grad_norm": 0.42764970660209656, + "learning_rate": 8.557254226992002e-05, + "loss": 1.607, + "step": 2860 + }, + { + "epoch": 0.868417058734254, + "grad_norm": 0.4612043797969818, + "learning_rate": 8.556748000404982e-05, + "loss": 1.8854, + 
"step": 2861 + }, + { + "epoch": 0.8687205949309456, + "grad_norm": 0.35503968596458435, + "learning_rate": 8.556241773817961e-05, + "loss": 1.6402, + "step": 2862 + }, + { + "epoch": 0.869024131127637, + "grad_norm": 0.3938760459423065, + "learning_rate": 8.55573554723094e-05, + "loss": 1.7154, + "step": 2863 + }, + { + "epoch": 0.8693276673243284, + "grad_norm": 0.6211029291152954, + "learning_rate": 8.55522932064392e-05, + "loss": 1.3675, + "step": 2864 + }, + { + "epoch": 0.8696312035210199, + "grad_norm": 0.4015885293483734, + "learning_rate": 8.5547230940569e-05, + "loss": 1.1773, + "step": 2865 + }, + { + "epoch": 0.8699347397177113, + "grad_norm": 0.39615508913993835, + "learning_rate": 8.55421686746988e-05, + "loss": 1.816, + "step": 2866 + }, + { + "epoch": 0.8702382759144028, + "grad_norm": 0.4044322669506073, + "learning_rate": 8.55371064088286e-05, + "loss": 1.5465, + "step": 2867 + }, + { + "epoch": 0.8705418121110943, + "grad_norm": 0.410137802362442, + "learning_rate": 8.553204414295839e-05, + "loss": 1.7645, + "step": 2868 + }, + { + "epoch": 0.8708453483077857, + "grad_norm": 0.456717312335968, + "learning_rate": 8.552698187708819e-05, + "loss": 1.9905, + "step": 2869 + }, + { + "epoch": 0.8711488845044771, + "grad_norm": 0.3661191761493683, + "learning_rate": 8.552191961121798e-05, + "loss": 2.1578, + "step": 2870 + }, + { + "epoch": 0.8714524207011686, + "grad_norm": 0.3868817389011383, + "learning_rate": 8.551685734534778e-05, + "loss": 1.9996, + "step": 2871 + }, + { + "epoch": 0.8717559568978601, + "grad_norm": 0.35258975625038147, + "learning_rate": 8.551179507947757e-05, + "loss": 1.5475, + "step": 2872 + }, + { + "epoch": 0.8720594930945516, + "grad_norm": 0.4110967516899109, + "learning_rate": 8.550673281360737e-05, + "loss": 1.7818, + "step": 2873 + }, + { + "epoch": 0.872363029291243, + "grad_norm": 0.39448168873786926, + "learning_rate": 8.550167054773716e-05, + "loss": 1.8685, + "step": 2874 + }, + { + "epoch": 0.8726665654879344, + "grad_norm": 0.5225607752799988, + "learning_rate": 8.549660828186697e-05, + "loss": 2.056, + "step": 2875 + }, + { + "epoch": 0.8729701016846259, + "grad_norm": 0.4417632818222046, + "learning_rate": 8.549154601599676e-05, + "loss": 1.8328, + "step": 2876 + }, + { + "epoch": 0.8732736378813174, + "grad_norm": 0.3205631673336029, + "learning_rate": 8.548648375012656e-05, + "loss": 1.2795, + "step": 2877 + }, + { + "epoch": 0.8735771740780088, + "grad_norm": 0.35961270332336426, + "learning_rate": 8.548142148425635e-05, + "loss": 1.9222, + "step": 2878 + }, + { + "epoch": 0.8738807102747003, + "grad_norm": 0.4819619059562683, + "learning_rate": 8.547635921838615e-05, + "loss": 2.1051, + "step": 2879 + }, + { + "epoch": 0.8741842464713917, + "grad_norm": 0.4361310601234436, + "learning_rate": 8.547129695251596e-05, + "loss": 1.3526, + "step": 2880 + }, + { + "epoch": 0.8744877826680831, + "grad_norm": 0.41012874245643616, + "learning_rate": 8.546623468664575e-05, + "loss": 1.4308, + "step": 2881 + }, + { + "epoch": 0.8747913188647746, + "grad_norm": 0.4581417441368103, + "learning_rate": 8.546117242077555e-05, + "loss": 2.0414, + "step": 2882 + }, + { + "epoch": 0.8750948550614661, + "grad_norm": 0.5409611463546753, + "learning_rate": 8.545611015490534e-05, + "loss": 1.3438, + "step": 2883 + }, + { + "epoch": 0.8753983912581575, + "grad_norm": 0.390472412109375, + "learning_rate": 8.545104788903514e-05, + "loss": 1.6817, + "step": 2884 + }, + { + "epoch": 0.875701927454849, + "grad_norm": 0.5236276984214783, + "learning_rate": 
8.544598562316493e-05, + "loss": 1.7992, + "step": 2885 + }, + { + "epoch": 0.8760054636515404, + "grad_norm": 0.43483301997184753, + "learning_rate": 8.544092335729474e-05, + "loss": 1.8767, + "step": 2886 + }, + { + "epoch": 0.8763089998482319, + "grad_norm": 0.5605120658874512, + "learning_rate": 8.543586109142453e-05, + "loss": 1.5323, + "step": 2887 + }, + { + "epoch": 0.8766125360449234, + "grad_norm": 0.4484270215034485, + "learning_rate": 8.543079882555433e-05, + "loss": 1.9958, + "step": 2888 + }, + { + "epoch": 0.8769160722416148, + "grad_norm": 0.40156564116477966, + "learning_rate": 8.542573655968412e-05, + "loss": 1.8555, + "step": 2889 + }, + { + "epoch": 0.8772196084383063, + "grad_norm": 0.42205923795700073, + "learning_rate": 8.542067429381392e-05, + "loss": 1.5127, + "step": 2890 + }, + { + "epoch": 0.8775231446349977, + "grad_norm": 0.40961888432502747, + "learning_rate": 8.541561202794371e-05, + "loss": 1.7508, + "step": 2891 + }, + { + "epoch": 0.8778266808316891, + "grad_norm": 0.4366128742694855, + "learning_rate": 8.541054976207351e-05, + "loss": 1.9704, + "step": 2892 + }, + { + "epoch": 0.8781302170283807, + "grad_norm": 0.4367973804473877, + "learning_rate": 8.54054874962033e-05, + "loss": 1.978, + "step": 2893 + }, + { + "epoch": 0.8784337532250721, + "grad_norm": 0.4191198945045471, + "learning_rate": 8.54004252303331e-05, + "loss": 1.93, + "step": 2894 + }, + { + "epoch": 0.8787372894217635, + "grad_norm": 0.40298399329185486, + "learning_rate": 8.539536296446289e-05, + "loss": 1.8862, + "step": 2895 + }, + { + "epoch": 0.879040825618455, + "grad_norm": 0.4513075351715088, + "learning_rate": 8.53903006985927e-05, + "loss": 1.4634, + "step": 2896 + }, + { + "epoch": 0.8793443618151464, + "grad_norm": 0.452395498752594, + "learning_rate": 8.53852384327225e-05, + "loss": 2.0012, + "step": 2897 + }, + { + "epoch": 0.879647898011838, + "grad_norm": 0.4072858691215515, + "learning_rate": 8.538017616685229e-05, + "loss": 1.7729, + "step": 2898 + }, + { + "epoch": 0.8799514342085294, + "grad_norm": 0.42640551924705505, + "learning_rate": 8.537511390098209e-05, + "loss": 1.4541, + "step": 2899 + }, + { + "epoch": 0.8802549704052208, + "grad_norm": 0.37970346212387085, + "learning_rate": 8.537005163511188e-05, + "loss": 1.7628, + "step": 2900 + }, + { + "epoch": 0.8805585066019123, + "grad_norm": 0.4421388804912567, + "learning_rate": 8.536498936924168e-05, + "loss": 1.7712, + "step": 2901 + }, + { + "epoch": 0.8808620427986037, + "grad_norm": 0.42706549167633057, + "learning_rate": 8.535992710337147e-05, + "loss": 1.1601, + "step": 2902 + }, + { + "epoch": 0.8811655789952952, + "grad_norm": 0.42218390107154846, + "learning_rate": 8.535486483750126e-05, + "loss": 2.081, + "step": 2903 + }, + { + "epoch": 0.8814691151919867, + "grad_norm": 0.4469526410102844, + "learning_rate": 8.534980257163106e-05, + "loss": 1.9124, + "step": 2904 + }, + { + "epoch": 0.8817726513886781, + "grad_norm": 0.42796406149864197, + "learning_rate": 8.534474030576087e-05, + "loss": 1.8129, + "step": 2905 + }, + { + "epoch": 0.8820761875853695, + "grad_norm": 0.549192488193512, + "learning_rate": 8.533967803989066e-05, + "loss": 1.7816, + "step": 2906 + }, + { + "epoch": 0.882379723782061, + "grad_norm": 0.3347112834453583, + "learning_rate": 8.533461577402046e-05, + "loss": 2.0233, + "step": 2907 + }, + { + "epoch": 0.8826832599787525, + "grad_norm": 0.4557845890522003, + "learning_rate": 8.532955350815025e-05, + "loss": 1.8571, + "step": 2908 + }, + { + "epoch": 0.882986796175444, + 
"grad_norm": 0.9646681547164917, + "learning_rate": 8.532449124228005e-05, + "loss": 1.9618, + "step": 2909 + }, + { + "epoch": 0.8832903323721354, + "grad_norm": 0.43224748969078064, + "learning_rate": 8.531942897640984e-05, + "loss": 1.9805, + "step": 2910 + }, + { + "epoch": 0.8835938685688268, + "grad_norm": 0.635966420173645, + "learning_rate": 8.531436671053964e-05, + "loss": 1.8572, + "step": 2911 + }, + { + "epoch": 0.8838974047655183, + "grad_norm": 0.46912774443626404, + "learning_rate": 8.530930444466943e-05, + "loss": 1.9028, + "step": 2912 + }, + { + "epoch": 0.8842009409622097, + "grad_norm": 0.37521597743034363, + "learning_rate": 8.530424217879923e-05, + "loss": 1.7295, + "step": 2913 + }, + { + "epoch": 0.8845044771589012, + "grad_norm": 0.761882483959198, + "learning_rate": 8.529917991292903e-05, + "loss": 1.8966, + "step": 2914 + }, + { + "epoch": 0.8848080133555927, + "grad_norm": 0.7302446365356445, + "learning_rate": 8.529411764705883e-05, + "loss": 1.9641, + "step": 2915 + }, + { + "epoch": 0.8851115495522841, + "grad_norm": 0.47133728861808777, + "learning_rate": 8.528905538118862e-05, + "loss": 1.4893, + "step": 2916 + }, + { + "epoch": 0.8854150857489755, + "grad_norm": 0.4192088842391968, + "learning_rate": 8.528399311531842e-05, + "loss": 1.9727, + "step": 2917 + }, + { + "epoch": 0.885718621945667, + "grad_norm": 0.43679505586624146, + "learning_rate": 8.527893084944821e-05, + "loss": 1.7336, + "step": 2918 + }, + { + "epoch": 0.8860221581423585, + "grad_norm": 0.4226585328578949, + "learning_rate": 8.527386858357801e-05, + "loss": 2.0413, + "step": 2919 + }, + { + "epoch": 0.88632569433905, + "grad_norm": 0.5971319079399109, + "learning_rate": 8.52688063177078e-05, + "loss": 1.5096, + "step": 2920 + }, + { + "epoch": 0.8866292305357414, + "grad_norm": 0.5016753673553467, + "learning_rate": 8.52637440518376e-05, + "loss": 1.9617, + "step": 2921 + }, + { + "epoch": 0.8869327667324328, + "grad_norm": 0.4263944625854492, + "learning_rate": 8.525868178596739e-05, + "loss": 1.866, + "step": 2922 + }, + { + "epoch": 0.8872363029291243, + "grad_norm": 0.4154115915298462, + "learning_rate": 8.525361952009719e-05, + "loss": 2.0773, + "step": 2923 + }, + { + "epoch": 0.8875398391258158, + "grad_norm": 0.45908719301223755, + "learning_rate": 8.5248557254227e-05, + "loss": 1.9902, + "step": 2924 + }, + { + "epoch": 0.8878433753225072, + "grad_norm": 0.43351608514785767, + "learning_rate": 8.52434949883568e-05, + "loss": 1.7658, + "step": 2925 + }, + { + "epoch": 0.8881469115191987, + "grad_norm": 0.4773117005825043, + "learning_rate": 8.52384327224866e-05, + "loss": 1.8285, + "step": 2926 + }, + { + "epoch": 0.8884504477158901, + "grad_norm": 0.604767382144928, + "learning_rate": 8.52333704566164e-05, + "loss": 1.9066, + "step": 2927 + }, + { + "epoch": 0.8887539839125815, + "grad_norm": 1.4794889688491821, + "learning_rate": 8.522830819074619e-05, + "loss": 1.6015, + "step": 2928 + }, + { + "epoch": 0.8890575201092731, + "grad_norm": 0.9518802165985107, + "learning_rate": 8.522324592487598e-05, + "loss": 2.0818, + "step": 2929 + }, + { + "epoch": 0.8893610563059645, + "grad_norm": 0.9635084271430969, + "learning_rate": 8.521818365900578e-05, + "loss": 1.926, + "step": 2930 + }, + { + "epoch": 0.889664592502656, + "grad_norm": 0.4159846007823944, + "learning_rate": 8.521312139313557e-05, + "loss": 1.8518, + "step": 2931 + }, + { + "epoch": 0.8899681286993474, + "grad_norm": 0.42167580127716064, + "learning_rate": 8.520805912726537e-05, + "loss": 1.9682, + "step": 2932 + }, 
+ { + "epoch": 0.8902716648960388, + "grad_norm": 0.4509316384792328, + "learning_rate": 8.520299686139516e-05, + "loss": 1.9891, + "step": 2933 + }, + { + "epoch": 0.8905752010927304, + "grad_norm": 0.3452865183353424, + "learning_rate": 8.519793459552496e-05, + "loss": 1.8003, + "step": 2934 + }, + { + "epoch": 0.8908787372894218, + "grad_norm": 0.38899463415145874, + "learning_rate": 8.519287232965477e-05, + "loss": 1.8849, + "step": 2935 + }, + { + "epoch": 0.8911822734861132, + "grad_norm": 0.4010523557662964, + "learning_rate": 8.518781006378456e-05, + "loss": 1.7905, + "step": 2936 + }, + { + "epoch": 0.8914858096828047, + "grad_norm": 0.3848381042480469, + "learning_rate": 8.518274779791436e-05, + "loss": 1.8661, + "step": 2937 + }, + { + "epoch": 0.8917893458794961, + "grad_norm": 0.41806578636169434, + "learning_rate": 8.517768553204415e-05, + "loss": 1.9617, + "step": 2938 + }, + { + "epoch": 0.8920928820761875, + "grad_norm": 0.4648883640766144, + "learning_rate": 8.517262326617395e-05, + "loss": 1.1984, + "step": 2939 + }, + { + "epoch": 0.8923964182728791, + "grad_norm": 0.43756723403930664, + "learning_rate": 8.516756100030374e-05, + "loss": 1.5523, + "step": 2940 + }, + { + "epoch": 0.8926999544695705, + "grad_norm": 0.393741637468338, + "learning_rate": 8.516249873443353e-05, + "loss": 1.8876, + "step": 2941 + }, + { + "epoch": 0.893003490666262, + "grad_norm": 0.41412442922592163, + "learning_rate": 8.515743646856333e-05, + "loss": 1.4302, + "step": 2942 + }, + { + "epoch": 0.8933070268629534, + "grad_norm": 0.4743058681488037, + "learning_rate": 8.515237420269312e-05, + "loss": 1.9108, + "step": 2943 + }, + { + "epoch": 0.8936105630596448, + "grad_norm": 0.40074145793914795, + "learning_rate": 8.514731193682293e-05, + "loss": 2.0512, + "step": 2944 + }, + { + "epoch": 0.8939140992563364, + "grad_norm": 0.39886727929115295, + "learning_rate": 8.514224967095273e-05, + "loss": 1.8853, + "step": 2945 + }, + { + "epoch": 0.8942176354530278, + "grad_norm": 0.9438028335571289, + "learning_rate": 8.513718740508252e-05, + "loss": 2.1271, + "step": 2946 + }, + { + "epoch": 0.8945211716497192, + "grad_norm": 0.38940876722335815, + "learning_rate": 8.513212513921232e-05, + "loss": 1.9008, + "step": 2947 + }, + { + "epoch": 0.8948247078464107, + "grad_norm": 0.3668425381183624, + "learning_rate": 8.512706287334211e-05, + "loss": 1.8479, + "step": 2948 + }, + { + "epoch": 0.8951282440431021, + "grad_norm": 0.41969189047813416, + "learning_rate": 8.512200060747191e-05, + "loss": 2.0179, + "step": 2949 + }, + { + "epoch": 0.8954317802397936, + "grad_norm": 0.377257376909256, + "learning_rate": 8.51169383416017e-05, + "loss": 1.8602, + "step": 2950 + }, + { + "epoch": 0.8957353164364851, + "grad_norm": 0.47926634550094604, + "learning_rate": 8.51118760757315e-05, + "loss": 1.9323, + "step": 2951 + }, + { + "epoch": 0.8960388526331765, + "grad_norm": 0.4736182689666748, + "learning_rate": 8.510681380986129e-05, + "loss": 1.7645, + "step": 2952 + }, + { + "epoch": 0.8963423888298679, + "grad_norm": 0.45783525705337524, + "learning_rate": 8.51017515439911e-05, + "loss": 2.0585, + "step": 2953 + }, + { + "epoch": 0.8966459250265594, + "grad_norm": 0.4085424840450287, + "learning_rate": 8.50966892781209e-05, + "loss": 1.8516, + "step": 2954 + }, + { + "epoch": 0.8969494612232509, + "grad_norm": 0.4012138545513153, + "learning_rate": 8.509162701225069e-05, + "loss": 1.9907, + "step": 2955 + }, + { + "epoch": 0.8972529974199424, + "grad_norm": 0.4017476737499237, + "learning_rate": 
8.508656474638048e-05, + "loss": 1.9477, + "step": 2956 + }, + { + "epoch": 0.8975565336166338, + "grad_norm": 0.3720763325691223, + "learning_rate": 8.508150248051028e-05, + "loss": 1.9121, + "step": 2957 + }, + { + "epoch": 0.8978600698133252, + "grad_norm": 0.3642348349094391, + "learning_rate": 8.507644021464007e-05, + "loss": 1.8507, + "step": 2958 + }, + { + "epoch": 0.8981636060100167, + "grad_norm": 0.46299463510513306, + "learning_rate": 8.507137794876987e-05, + "loss": 2.0811, + "step": 2959 + }, + { + "epoch": 0.8984671422067082, + "grad_norm": 0.3806562125682831, + "learning_rate": 8.506631568289966e-05, + "loss": 1.6783, + "step": 2960 + }, + { + "epoch": 0.8987706784033996, + "grad_norm": 0.4003051221370697, + "learning_rate": 8.506125341702946e-05, + "loss": 1.7978, + "step": 2961 + }, + { + "epoch": 0.8990742146000911, + "grad_norm": 0.42008984088897705, + "learning_rate": 8.505619115115925e-05, + "loss": 1.9064, + "step": 2962 + }, + { + "epoch": 0.8993777507967825, + "grad_norm": 0.4423260986804962, + "learning_rate": 8.505112888528906e-05, + "loss": 1.9743, + "step": 2963 + }, + { + "epoch": 0.8996812869934739, + "grad_norm": 0.4516521990299225, + "learning_rate": 8.504606661941886e-05, + "loss": 1.6559, + "step": 2964 + }, + { + "epoch": 0.8999848231901654, + "grad_norm": 0.4269407093524933, + "learning_rate": 8.504100435354865e-05, + "loss": 1.677, + "step": 2965 + }, + { + "epoch": 0.9002883593868569, + "grad_norm": 0.4931739568710327, + "learning_rate": 8.503594208767845e-05, + "loss": 1.7786, + "step": 2966 + }, + { + "epoch": 0.9005918955835484, + "grad_norm": 0.4014637768268585, + "learning_rate": 8.503087982180824e-05, + "loss": 2.0737, + "step": 2967 + }, + { + "epoch": 0.9008954317802398, + "grad_norm": 0.4077427387237549, + "learning_rate": 8.502581755593804e-05, + "loss": 1.9301, + "step": 2968 + }, + { + "epoch": 0.9011989679769312, + "grad_norm": 0.40187394618988037, + "learning_rate": 8.502075529006784e-05, + "loss": 2.0045, + "step": 2969 + }, + { + "epoch": 0.9015025041736227, + "grad_norm": 0.6499014496803284, + "learning_rate": 8.501569302419764e-05, + "loss": 2.066, + "step": 2970 + }, + { + "epoch": 0.9018060403703142, + "grad_norm": 0.4545782804489136, + "learning_rate": 8.501063075832743e-05, + "loss": 1.7235, + "step": 2971 + }, + { + "epoch": 0.9021095765670056, + "grad_norm": 0.4248214066028595, + "learning_rate": 8.500556849245723e-05, + "loss": 2.0612, + "step": 2972 + }, + { + "epoch": 0.9024131127636971, + "grad_norm": 0.49832749366760254, + "learning_rate": 8.500050622658702e-05, + "loss": 1.9235, + "step": 2973 + }, + { + "epoch": 0.9027166489603885, + "grad_norm": 0.35065793991088867, + "learning_rate": 8.499544396071683e-05, + "loss": 1.7691, + "step": 2974 + }, + { + "epoch": 0.9030201851570799, + "grad_norm": 0.40507805347442627, + "learning_rate": 8.499038169484663e-05, + "loss": 2.055, + "step": 2975 + }, + { + "epoch": 0.9033237213537715, + "grad_norm": 0.44182345271110535, + "learning_rate": 8.498531942897642e-05, + "loss": 1.5353, + "step": 2976 + }, + { + "epoch": 0.9036272575504629, + "grad_norm": 0.4512852430343628, + "learning_rate": 8.498025716310622e-05, + "loss": 1.9116, + "step": 2977 + }, + { + "epoch": 0.9039307937471543, + "grad_norm": 0.44310954213142395, + "learning_rate": 8.497519489723601e-05, + "loss": 1.6875, + "step": 2978 + }, + { + "epoch": 0.9042343299438458, + "grad_norm": 0.4079609215259552, + "learning_rate": 8.49701326313658e-05, + "loss": 1.0174, + "step": 2979 + }, + { + "epoch": 0.9045378661405372, + 
"grad_norm": 0.3950175940990448, + "learning_rate": 8.49650703654956e-05, + "loss": 1.933, + "step": 2980 + }, + { + "epoch": 0.9048414023372288, + "grad_norm": 0.3858761787414551, + "learning_rate": 8.49600080996254e-05, + "loss": 1.6877, + "step": 2981 + }, + { + "epoch": 0.9051449385339202, + "grad_norm": 0.41248536109924316, + "learning_rate": 8.495494583375519e-05, + "loss": 1.1987, + "step": 2982 + }, + { + "epoch": 0.9054484747306116, + "grad_norm": 0.3943655490875244, + "learning_rate": 8.4949883567885e-05, + "loss": 1.5532, + "step": 2983 + }, + { + "epoch": 0.9057520109273031, + "grad_norm": 0.37889233231544495, + "learning_rate": 8.494482130201479e-05, + "loss": 1.8119, + "step": 2984 + }, + { + "epoch": 0.9060555471239945, + "grad_norm": 0.3723227381706238, + "learning_rate": 8.493975903614459e-05, + "loss": 1.8415, + "step": 2985 + }, + { + "epoch": 0.906359083320686, + "grad_norm": 0.4503065347671509, + "learning_rate": 8.493469677027438e-05, + "loss": 1.6841, + "step": 2986 + }, + { + "epoch": 0.9066626195173775, + "grad_norm": 0.41649529337882996, + "learning_rate": 8.492963450440418e-05, + "loss": 1.9298, + "step": 2987 + }, + { + "epoch": 0.9069661557140689, + "grad_norm": 0.3602710962295532, + "learning_rate": 8.492457223853397e-05, + "loss": 2.167, + "step": 2988 + }, + { + "epoch": 0.9072696919107603, + "grad_norm": 0.39875030517578125, + "learning_rate": 8.491950997266377e-05, + "loss": 2.1081, + "step": 2989 + }, + { + "epoch": 0.9075732281074518, + "grad_norm": 0.42908263206481934, + "learning_rate": 8.491444770679356e-05, + "loss": 1.7717, + "step": 2990 + }, + { + "epoch": 0.9078767643041433, + "grad_norm": 0.4125417470932007, + "learning_rate": 8.490938544092336e-05, + "loss": 2.2085, + "step": 2991 + }, + { + "epoch": 0.9081803005008348, + "grad_norm": 0.4204493463039398, + "learning_rate": 8.490432317505316e-05, + "loss": 1.7223, + "step": 2992 + }, + { + "epoch": 0.9084838366975262, + "grad_norm": 0.48912370204925537, + "learning_rate": 8.489926090918296e-05, + "loss": 1.7614, + "step": 2993 + }, + { + "epoch": 0.9087873728942176, + "grad_norm": 0.44855475425720215, + "learning_rate": 8.489419864331275e-05, + "loss": 1.9345, + "step": 2994 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.35656431317329407, + "learning_rate": 8.488913637744255e-05, + "loss": 1.3761, + "step": 2995 + }, + { + "epoch": 0.9093944452876005, + "grad_norm": 0.431643545627594, + "learning_rate": 8.488407411157234e-05, + "loss": 2.0976, + "step": 2996 + }, + { + "epoch": 0.909697981484292, + "grad_norm": 0.3734411299228668, + "learning_rate": 8.487901184570214e-05, + "loss": 1.5943, + "step": 2997 + }, + { + "epoch": 0.9100015176809835, + "grad_norm": 0.4734323024749756, + "learning_rate": 8.487394957983193e-05, + "loss": 1.9213, + "step": 2998 + }, + { + "epoch": 0.9103050538776749, + "grad_norm": 0.39105552434921265, + "learning_rate": 8.486888731396173e-05, + "loss": 1.7569, + "step": 2999 + }, + { + "epoch": 0.9106085900743663, + "grad_norm": 0.35980144143104553, + "learning_rate": 8.486382504809152e-05, + "loss": 1.4274, + "step": 3000 + } + ], + "logging_steps": 1, + "max_steps": 19764, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.9522198604162335e+18, + "train_batch_size": 1, + "trial_name": null, 
+ "trial_params": null +}