{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7995735607675906, "eval_steps": 500, "global_step": 750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010660980810234541, "grad_norm": 21.70728302001953, "learning_rate": 2.1276595744680852e-07, "loss": 0.9072, "step": 1 }, { "epoch": 0.0021321961620469083, "grad_norm": 28.253578186035156, "learning_rate": 4.2553191489361704e-07, "loss": 0.9694, "step": 2 }, { "epoch": 0.0031982942430703624, "grad_norm": 25.478811264038086, "learning_rate": 6.382978723404255e-07, "loss": 0.7973, "step": 3 }, { "epoch": 0.0042643923240938165, "grad_norm": 28.413204193115234, "learning_rate": 8.510638297872341e-07, "loss": 1.0934, "step": 4 }, { "epoch": 0.005330490405117271, "grad_norm": 34.61251449584961, "learning_rate": 1.0638297872340427e-06, "loss": 1.0402, "step": 5 }, { "epoch": 0.006396588486140725, "grad_norm": 65.53425598144531, "learning_rate": 1.276595744680851e-06, "loss": 1.2195, "step": 6 }, { "epoch": 0.007462686567164179, "grad_norm": 20.583099365234375, "learning_rate": 1.4893617021276596e-06, "loss": 0.8989, "step": 7 }, { "epoch": 0.008528784648187633, "grad_norm": 18.38344383239746, "learning_rate": 1.7021276595744682e-06, "loss": 0.884, "step": 8 }, { "epoch": 0.009594882729211088, "grad_norm": 23.684789657592773, "learning_rate": 1.9148936170212767e-06, "loss": 0.9259, "step": 9 }, { "epoch": 0.010660980810234541, "grad_norm": 24.472211837768555, "learning_rate": 2.1276595744680853e-06, "loss": 0.9317, "step": 10 }, { "epoch": 0.011727078891257996, "grad_norm": 23.060693740844727, "learning_rate": 2.340425531914894e-06, "loss": 0.8298, "step": 11 }, { "epoch": 0.01279317697228145, "grad_norm": 18.553483963012695, "learning_rate": 2.553191489361702e-06, "loss": 0.7868, "step": 12 }, { "epoch": 0.013859275053304905, "grad_norm": 21.778993606567383, "learning_rate": 2.765957446808511e-06, "loss": 0.8889, "step": 13 }, { "epoch": 0.014925373134328358, "grad_norm": 17.681299209594727, "learning_rate": 2.978723404255319e-06, "loss": 0.8804, "step": 14 }, { "epoch": 0.015991471215351813, "grad_norm": 15.009004592895508, "learning_rate": 3.191489361702128e-06, "loss": 0.8442, "step": 15 }, { "epoch": 0.017057569296375266, "grad_norm": 14.337655067443848, "learning_rate": 3.4042553191489363e-06, "loss": 0.8908, "step": 16 }, { "epoch": 0.01812366737739872, "grad_norm": 9.931138038635254, "learning_rate": 3.6170212765957453e-06, "loss": 0.6709, "step": 17 }, { "epoch": 0.019189765458422176, "grad_norm": 15.07895278930664, "learning_rate": 3.8297872340425535e-06, "loss": 0.7694, "step": 18 }, { "epoch": 0.02025586353944563, "grad_norm": 10.424509048461914, "learning_rate": 4.042553191489362e-06, "loss": 0.8027, "step": 19 }, { "epoch": 0.021321961620469083, "grad_norm": 7.655640125274658, "learning_rate": 4.255319148936171e-06, "loss": 0.5685, "step": 20 }, { "epoch": 0.022388059701492536, "grad_norm": 4.088193416595459, "learning_rate": 4.468085106382979e-06, "loss": 0.5534, "step": 21 }, { "epoch": 0.023454157782515993, "grad_norm": 7.443810939788818, "learning_rate": 4.680851063829788e-06, "loss": 0.7799, "step": 22 }, { "epoch": 0.024520255863539446, "grad_norm": 7.610146522521973, "learning_rate": 4.893617021276596e-06, "loss": 0.659, "step": 23 }, { "epoch": 0.0255863539445629, "grad_norm": 6.722533226013184, "learning_rate": 5.106382978723404e-06, "loss": 0.6812, "step": 24 }, { "epoch": 0.026652452025586353, "grad_norm": 4.733678340911865, "learning_rate": 5.319148936170213e-06, "loss": 0.4429, "step": 25 }, { "epoch": 0.02771855010660981, "grad_norm": 6.856286525726318, "learning_rate": 5.531914893617022e-06, "loss": 0.8113, "step": 26 }, { "epoch": 0.028784648187633263, "grad_norm": 4.969576835632324, "learning_rate": 5.744680851063831e-06, "loss": 0.627, "step": 27 }, { "epoch": 0.029850746268656716, "grad_norm": 3.894331932067871, "learning_rate": 5.957446808510638e-06, "loss": 0.57, "step": 28 }, { "epoch": 0.03091684434968017, "grad_norm": 3.7609620094299316, "learning_rate": 6.170212765957447e-06, "loss": 0.5351, "step": 29 }, { "epoch": 0.031982942430703626, "grad_norm": 4.346351146697998, "learning_rate": 6.382978723404256e-06, "loss": 0.5308, "step": 30 }, { "epoch": 0.03304904051172708, "grad_norm": 4.090020179748535, "learning_rate": 6.595744680851064e-06, "loss": 0.4916, "step": 31 }, { "epoch": 0.03411513859275053, "grad_norm": 4.351005554199219, "learning_rate": 6.808510638297873e-06, "loss": 0.7224, "step": 32 }, { "epoch": 0.035181236673773986, "grad_norm": 3.752434492111206, "learning_rate": 7.021276595744682e-06, "loss": 0.6372, "step": 33 }, { "epoch": 0.03624733475479744, "grad_norm": 4.596726894378662, "learning_rate": 7.234042553191491e-06, "loss": 0.7351, "step": 34 }, { "epoch": 0.03731343283582089, "grad_norm": 5.994707107543945, "learning_rate": 7.446808510638298e-06, "loss": 0.6696, "step": 35 }, { "epoch": 0.03837953091684435, "grad_norm": 4.608494281768799, "learning_rate": 7.659574468085107e-06, "loss": 0.5317, "step": 36 }, { "epoch": 0.039445628997867806, "grad_norm": 4.104159832000732, "learning_rate": 7.872340425531916e-06, "loss": 0.5087, "step": 37 }, { "epoch": 0.04051172707889126, "grad_norm": 4.3180012702941895, "learning_rate": 8.085106382978723e-06, "loss": 0.5023, "step": 38 }, { "epoch": 0.04157782515991471, "grad_norm": 5.02656364440918, "learning_rate": 8.297872340425532e-06, "loss": 0.7083, "step": 39 }, { "epoch": 0.042643923240938165, "grad_norm": 5.303328037261963, "learning_rate": 8.510638297872341e-06, "loss": 0.6561, "step": 40 }, { "epoch": 0.04371002132196162, "grad_norm": 5.885382175445557, "learning_rate": 8.72340425531915e-06, "loss": 0.664, "step": 41 }, { "epoch": 0.04477611940298507, "grad_norm": 6.578125, "learning_rate": 8.936170212765958e-06, "loss": 0.7865, "step": 42 }, { "epoch": 0.04584221748400853, "grad_norm": 3.013359308242798, "learning_rate": 9.148936170212767e-06, "loss": 0.5047, "step": 43 }, { "epoch": 0.046908315565031986, "grad_norm": 3.5396573543548584, "learning_rate": 9.361702127659576e-06, "loss": 0.6327, "step": 44 }, { "epoch": 0.04797441364605544, "grad_norm": 3.504297971725464, "learning_rate": 9.574468085106385e-06, "loss": 0.5679, "step": 45 }, { "epoch": 0.04904051172707889, "grad_norm": 4.056636333465576, "learning_rate": 9.787234042553192e-06, "loss": 0.6116, "step": 46 }, { "epoch": 0.050106609808102345, "grad_norm": 3.2248194217681885, "learning_rate": 1e-05, "loss": 0.4363, "step": 47 }, { "epoch": 0.0511727078891258, "grad_norm": 3.281310558319092, "learning_rate": 9.99996891979347e-06, "loss": 0.5049, "step": 48 }, { "epoch": 0.05223880597014925, "grad_norm": 4.442873954772949, "learning_rate": 9.999875679560272e-06, "loss": 0.6558, "step": 49 }, { "epoch": 0.053304904051172705, "grad_norm": 4.242443561553955, "learning_rate": 9.999720280459576e-06, "loss": 0.6884, "step": 50 }, { "epoch": 0.054371002132196165, "grad_norm": 4.739858150482178, "learning_rate": 9.999502724423316e-06, "loss": 0.678, "step": 51 }, { "epoch": 0.05543710021321962, "grad_norm": 3.2829554080963135, "learning_rate": 9.999223014156167e-06, "loss": 0.7131, "step": 52 }, { "epoch": 0.05650319829424307, "grad_norm": 5.118709087371826, "learning_rate": 9.99888115313551e-06, "loss": 0.6518, "step": 53 }, { "epoch": 0.057569296375266525, "grad_norm": 5.85404634475708, "learning_rate": 9.998477145611389e-06, "loss": 0.6947, "step": 54 }, { "epoch": 0.05863539445628998, "grad_norm": 4.439341068267822, "learning_rate": 9.99801099660646e-06, "loss": 0.6939, "step": 55 }, { "epoch": 0.05970149253731343, "grad_norm": 4.336203575134277, "learning_rate": 9.997482711915926e-06, "loss": 0.5033, "step": 56 }, { "epoch": 0.060767590618336885, "grad_norm": 4.772335529327393, "learning_rate": 9.996892298107466e-06, "loss": 0.576, "step": 57 }, { "epoch": 0.06183368869936034, "grad_norm": 4.428223133087158, "learning_rate": 9.996239762521152e-06, "loss": 0.6461, "step": 58 }, { "epoch": 0.0628997867803838, "grad_norm": 3.775033712387085, "learning_rate": 9.99552511326936e-06, "loss": 0.544, "step": 59 }, { "epoch": 0.06396588486140725, "grad_norm": 15.40304183959961, "learning_rate": 9.99474835923667e-06, "loss": 0.7836, "step": 60 }, { "epoch": 0.0650319829424307, "grad_norm": 6.83998441696167, "learning_rate": 9.993909510079752e-06, "loss": 0.5651, "step": 61 }, { "epoch": 0.06609808102345416, "grad_norm": 3.080057382583618, "learning_rate": 9.993008576227248e-06, "loss": 0.418, "step": 62 }, { "epoch": 0.06716417910447761, "grad_norm": 7.665582180023193, "learning_rate": 9.99204556887964e-06, "loss": 0.7883, "step": 63 }, { "epoch": 0.06823027718550106, "grad_norm": 4.856597423553467, "learning_rate": 9.991020500009118e-06, "loss": 0.5644, "step": 64 }, { "epoch": 0.06929637526652452, "grad_norm": 3.5741424560546875, "learning_rate": 9.989933382359423e-06, "loss": 0.5762, "step": 65 }, { "epoch": 0.07036247334754797, "grad_norm": 17.599828720092773, "learning_rate": 9.988784229445689e-06, "loss": 0.563, "step": 66 }, { "epoch": 0.07142857142857142, "grad_norm": 4.717083930969238, "learning_rate": 9.98757305555428e-06, "loss": 0.6294, "step": 67 }, { "epoch": 0.07249466950959488, "grad_norm": 11.448975563049316, "learning_rate": 9.986299875742612e-06, "loss": 0.5505, "step": 68 }, { "epoch": 0.07356076759061833, "grad_norm": 36.52313995361328, "learning_rate": 9.98496470583896e-06, "loss": 0.6096, "step": 69 }, { "epoch": 0.07462686567164178, "grad_norm": 35.91273498535156, "learning_rate": 9.98356756244227e-06, "loss": 0.5964, "step": 70 }, { "epoch": 0.07569296375266525, "grad_norm": 4.168030261993408, "learning_rate": 9.982108462921938e-06, "loss": 0.65, "step": 71 }, { "epoch": 0.0767590618336887, "grad_norm": 7.867929935455322, "learning_rate": 9.980587425417612e-06, "loss": 0.6722, "step": 72 }, { "epoch": 0.07782515991471216, "grad_norm": 4.325233459472656, "learning_rate": 9.97900446883896e-06, "loss": 0.619, "step": 73 }, { "epoch": 0.07889125799573561, "grad_norm": 6.3012309074401855, "learning_rate": 9.977359612865424e-06, "loss": 0.5862, "step": 74 }, { "epoch": 0.07995735607675906, "grad_norm": 4.738587856292725, "learning_rate": 9.975652877945991e-06, "loss": 0.6556, "step": 75 }, { "epoch": 0.08102345415778252, "grad_norm": 10.31258487701416, "learning_rate": 9.973884285298932e-06, "loss": 0.548, "step": 76 }, { "epoch": 0.08208955223880597, "grad_norm": 3.8817405700683594, "learning_rate": 9.972053856911534e-06, "loss": 0.6013, "step": 77 }, { "epoch": 0.08315565031982942, "grad_norm": 4.519651889801025, "learning_rate": 9.970161615539837e-06, "loss": 0.5549, "step": 78 }, { "epoch": 0.08422174840085288, "grad_norm": 3.7909624576568604, "learning_rate": 9.96820758470834e-06, "loss": 0.5291, "step": 79 }, { "epoch": 0.08528784648187633, "grad_norm": 4.346678733825684, "learning_rate": 9.966191788709716e-06, "loss": 0.6665, "step": 80 }, { "epoch": 0.08635394456289978, "grad_norm": 4.5973663330078125, "learning_rate": 9.964114252604508e-06, "loss": 0.6331, "step": 81 }, { "epoch": 0.08742004264392324, "grad_norm": 3.9288721084594727, "learning_rate": 9.961975002220816e-06, "loss": 0.6104, "step": 82 }, { "epoch": 0.08848614072494669, "grad_norm": 3.3124406337738037, "learning_rate": 9.959774064153977e-06, "loss": 0.5207, "step": 83 }, { "epoch": 0.08955223880597014, "grad_norm": 6.775738716125488, "learning_rate": 9.957511465766236e-06, "loss": 0.684, "step": 84 }, { "epoch": 0.0906183368869936, "grad_norm": 4.9728193283081055, "learning_rate": 9.955187235186403e-06, "loss": 0.6627, "step": 85 }, { "epoch": 0.09168443496801706, "grad_norm": 5.727108478546143, "learning_rate": 9.952801401309504e-06, "loss": 0.6904, "step": 86 }, { "epoch": 0.09275053304904052, "grad_norm": 3.6153433322906494, "learning_rate": 9.950353993796424e-06, "loss": 0.6201, "step": 87 }, { "epoch": 0.09381663113006397, "grad_norm": 5.316452503204346, "learning_rate": 9.947845043073533e-06, "loss": 0.5817, "step": 88 }, { "epoch": 0.09488272921108742, "grad_norm": 3.6538376808166504, "learning_rate": 9.945274580332316e-06, "loss": 0.6307, "step": 89 }, { "epoch": 0.09594882729211088, "grad_norm": 4.65852165222168, "learning_rate": 9.942642637528977e-06, "loss": 0.4973, "step": 90 }, { "epoch": 0.09701492537313433, "grad_norm": 4.196765899658203, "learning_rate": 9.939949247384046e-06, "loss": 0.4843, "step": 91 }, { "epoch": 0.09808102345415778, "grad_norm": 4.000326633453369, "learning_rate": 9.937194443381972e-06, "loss": 0.5727, "step": 92 }, { "epoch": 0.09914712153518124, "grad_norm": 4.436466217041016, "learning_rate": 9.934378259770708e-06, "loss": 0.6755, "step": 93 }, { "epoch": 0.10021321961620469, "grad_norm": 3.843862771987915, "learning_rate": 9.931500731561279e-06, "loss": 0.5139, "step": 94 }, { "epoch": 0.10127931769722814, "grad_norm": 3.680150032043457, "learning_rate": 9.928561894527354e-06, "loss": 0.5273, "step": 95 }, { "epoch": 0.1023454157782516, "grad_norm": 4.106802463531494, "learning_rate": 9.925561785204797e-06, "loss": 0.6143, "step": 96 }, { "epoch": 0.10341151385927505, "grad_norm": 3.7003519535064697, "learning_rate": 9.922500440891217e-06, "loss": 0.4448, "step": 97 }, { "epoch": 0.1044776119402985, "grad_norm": 5.373023986816406, "learning_rate": 9.919377899645497e-06, "loss": 0.6817, "step": 98 }, { "epoch": 0.10554371002132196, "grad_norm": 7.320279121398926, "learning_rate": 9.916194200287329e-06, "loss": 0.7365, "step": 99 }, { "epoch": 0.10660980810234541, "grad_norm": 3.6290242671966553, "learning_rate": 9.912949382396728e-06, "loss": 0.53, "step": 100 }, { "epoch": 0.10767590618336886, "grad_norm": 4.134979248046875, "learning_rate": 9.909643486313533e-06, "loss": 0.6045, "step": 101 }, { "epoch": 0.10874200426439233, "grad_norm": 5.797875881195068, "learning_rate": 9.906276553136924e-06, "loss": 0.7174, "step": 102 }, { "epoch": 0.10980810234541578, "grad_norm": 3.856888771057129, "learning_rate": 9.902848624724887e-06, "loss": 0.5169, "step": 103 }, { "epoch": 0.11087420042643924, "grad_norm": 5.883880615234375, "learning_rate": 9.899359743693715e-06, "loss": 0.8011, "step": 104 }, { "epoch": 0.11194029850746269, "grad_norm": 4.1314616203308105, "learning_rate": 9.895809953417464e-06, "loss": 0.552, "step": 105 }, { "epoch": 0.11300639658848614, "grad_norm": 3.7794744968414307, "learning_rate": 9.892199298027416e-06, "loss": 0.6137, "step": 106 }, { "epoch": 0.1140724946695096, "grad_norm": 3.5039713382720947, "learning_rate": 9.888527822411543e-06, "loss": 0.5691, "step": 107 }, { "epoch": 0.11513859275053305, "grad_norm": 3.7832272052764893, "learning_rate": 9.88479557221393e-06, "loss": 0.4884, "step": 108 }, { "epoch": 0.1162046908315565, "grad_norm": 3.9969241619110107, "learning_rate": 9.881002593834221e-06, "loss": 0.5502, "step": 109 }, { "epoch": 0.11727078891257996, "grad_norm": 4.871984481811523, "learning_rate": 9.877148934427037e-06, "loss": 0.6097, "step": 110 }, { "epoch": 0.11833688699360341, "grad_norm": 4.803427219390869, "learning_rate": 9.873234641901387e-06, "loss": 0.6528, "step": 111 }, { "epoch": 0.11940298507462686, "grad_norm": 3.8672356605529785, "learning_rate": 9.869259764920081e-06, "loss": 0.5328, "step": 112 }, { "epoch": 0.12046908315565032, "grad_norm": 4.863016128540039, "learning_rate": 9.86522435289912e-06, "loss": 0.7655, "step": 113 }, { "epoch": 0.12153518123667377, "grad_norm": 4.648331642150879, "learning_rate": 9.861128456007076e-06, "loss": 0.5407, "step": 114 }, { "epoch": 0.12260127931769722, "grad_norm": 3.3792529106140137, "learning_rate": 9.85697212516448e-06, "loss": 0.4453, "step": 115 }, { "epoch": 0.12366737739872068, "grad_norm": 4.340742588043213, "learning_rate": 9.85275541204318e-06, "loss": 0.5988, "step": 116 }, { "epoch": 0.12473347547974413, "grad_norm": 4.506209373474121, "learning_rate": 9.848478369065703e-06, "loss": 0.6093, "step": 117 }, { "epoch": 0.1257995735607676, "grad_norm": 3.4612324237823486, "learning_rate": 9.844141049404598e-06, "loss": 0.5399, "step": 118 }, { "epoch": 0.12686567164179105, "grad_norm": 6.610842704772949, "learning_rate": 9.839743506981783e-06, "loss": 0.7042, "step": 119 }, { "epoch": 0.1279317697228145, "grad_norm": 4.095919609069824, "learning_rate": 9.835285796467867e-06, "loss": 0.6019, "step": 120 }, { "epoch": 0.12899786780383796, "grad_norm": 3.828929901123047, "learning_rate": 9.830767973281477e-06, "loss": 0.5414, "step": 121 }, { "epoch": 0.1300639658848614, "grad_norm": 3.887065887451172, "learning_rate": 9.826190093588564e-06, "loss": 0.4489, "step": 122 }, { "epoch": 0.13113006396588486, "grad_norm": 4.068256378173828, "learning_rate": 9.821552214301705e-06, "loss": 0.7193, "step": 123 }, { "epoch": 0.13219616204690832, "grad_norm": 4.338530540466309, "learning_rate": 9.816854393079402e-06, "loss": 0.6538, "step": 124 }, { "epoch": 0.13326226012793177, "grad_norm": 4.922677516937256, "learning_rate": 9.812096688325354e-06, "loss": 0.6031, "step": 125 }, { "epoch": 0.13432835820895522, "grad_norm": 4.428820610046387, "learning_rate": 9.80727915918774e-06, "loss": 0.7907, "step": 126 }, { "epoch": 0.13539445628997868, "grad_norm": 3.2011046409606934, "learning_rate": 9.802401865558477e-06, "loss": 0.677, "step": 127 }, { "epoch": 0.13646055437100213, "grad_norm": 4.500326156616211, "learning_rate": 9.797464868072489e-06, "loss": 0.7907, "step": 128 }, { "epoch": 0.13752665245202558, "grad_norm": 4.290125370025635, "learning_rate": 9.79246822810693e-06, "loss": 0.5771, "step": 129 }, { "epoch": 0.13859275053304904, "grad_norm": 6.419731616973877, "learning_rate": 9.787412007780445e-06, "loss": 0.7094, "step": 130 }, { "epoch": 0.1396588486140725, "grad_norm": 5.237648963928223, "learning_rate": 9.78229626995238e-06, "loss": 0.6524, "step": 131 }, { "epoch": 0.14072494669509594, "grad_norm": 3.8653342723846436, "learning_rate": 9.777121078222015e-06, "loss": 0.4659, "step": 132 }, { "epoch": 0.1417910447761194, "grad_norm": 5.471773624420166, "learning_rate": 9.771886496927756e-06, "loss": 0.6951, "step": 133 }, { "epoch": 0.14285714285714285, "grad_norm": 6.148238658905029, "learning_rate": 9.766592591146353e-06, "loss": 0.5987, "step": 134 }, { "epoch": 0.1439232409381663, "grad_norm": 3.1601390838623047, "learning_rate": 9.761239426692077e-06, "loss": 0.5482, "step": 135 }, { "epoch": 0.14498933901918976, "grad_norm": 4.635965347290039, "learning_rate": 9.755827070115915e-06, "loss": 0.6074, "step": 136 }, { "epoch": 0.1460554371002132, "grad_norm": 5.910302639007568, "learning_rate": 9.750355588704728e-06, "loss": 0.7222, "step": 137 }, { "epoch": 0.14712153518123666, "grad_norm": 4.27485990524292, "learning_rate": 9.744825050480425e-06, "loss": 0.5813, "step": 138 }, { "epoch": 0.14818763326226012, "grad_norm": 4.451240062713623, "learning_rate": 9.739235524199117e-06, "loss": 0.5906, "step": 139 }, { "epoch": 0.14925373134328357, "grad_norm": 4.774477005004883, "learning_rate": 9.733587079350254e-06, "loss": 0.6689, "step": 140 }, { "epoch": 0.15031982942430705, "grad_norm": 5.139758110046387, "learning_rate": 9.727879786155767e-06, "loss": 0.4125, "step": 141 }, { "epoch": 0.1513859275053305, "grad_norm": 4.601865291595459, "learning_rate": 9.7221137155692e-06, "loss": 0.71, "step": 142 }, { "epoch": 0.15245202558635396, "grad_norm": 3.651667594909668, "learning_rate": 9.716288939274818e-06, "loss": 0.5557, "step": 143 }, { "epoch": 0.1535181236673774, "grad_norm": 4.867353916168213, "learning_rate": 9.710405529686722e-06, "loss": 0.6621, "step": 144 }, { "epoch": 0.15458422174840086, "grad_norm": 5.342178821563721, "learning_rate": 9.704463559947944e-06, "loss": 0.7686, "step": 145 }, { "epoch": 0.15565031982942432, "grad_norm": 4.675542831420898, "learning_rate": 9.698463103929542e-06, "loss": 0.6024, "step": 146 }, { "epoch": 0.15671641791044777, "grad_norm": 3.657904624938965, "learning_rate": 9.692404236229684e-06, "loss": 0.687, "step": 147 }, { "epoch": 0.15778251599147122, "grad_norm": 4.105894565582275, "learning_rate": 9.686287032172712e-06, "loss": 0.7184, "step": 148 }, { "epoch": 0.15884861407249468, "grad_norm": 4.484536647796631, "learning_rate": 9.680111567808212e-06, "loss": 0.5951, "step": 149 }, { "epoch": 0.15991471215351813, "grad_norm": 4.449892997741699, "learning_rate": 9.673877919910069e-06, "loss": 0.6165, "step": 150 }, { "epoch": 0.16098081023454158, "grad_norm": 3.7848403453826904, "learning_rate": 9.667586165975507e-06, "loss": 0.4339, "step": 151 }, { "epoch": 0.16204690831556504, "grad_norm": 3.7008159160614014, "learning_rate": 9.66123638422413e-06, "loss": 0.6985, "step": 152 }, { "epoch": 0.1631130063965885, "grad_norm": 4.082990646362305, "learning_rate": 9.65482865359695e-06, "loss": 0.555, "step": 153 }, { "epoch": 0.16417910447761194, "grad_norm": 3.710919141769409, "learning_rate": 9.648363053755406e-06, "loss": 0.5125, "step": 154 }, { "epoch": 0.1652452025586354, "grad_norm": 4.507083415985107, "learning_rate": 9.641839665080363e-06, "loss": 0.7256, "step": 155 }, { "epoch": 0.16631130063965885, "grad_norm": 4.448705196380615, "learning_rate": 9.635258568671135e-06, "loss": 0.5447, "step": 156 }, { "epoch": 0.1673773987206823, "grad_norm": 3.6309075355529785, "learning_rate": 9.628619846344453e-06, "loss": 0.5479, "step": 157 }, { "epoch": 0.16844349680170576, "grad_norm": 3.7882144451141357, "learning_rate": 9.621923580633462e-06, "loss": 0.6248, "step": 158 }, { "epoch": 0.1695095948827292, "grad_norm": 3.5979318618774414, "learning_rate": 9.615169854786688e-06, "loss": 0.6613, "step": 159 }, { "epoch": 0.17057569296375266, "grad_norm": 3.5035629272460938, "learning_rate": 9.608358752767013e-06, "loss": 0.4057, "step": 160 }, { "epoch": 0.17164179104477612, "grad_norm": 4.959547996520996, "learning_rate": 9.601490359250616e-06, "loss": 0.7039, "step": 161 }, { "epoch": 0.17270788912579957, "grad_norm": 3.4736928939819336, "learning_rate": 9.594564759625936e-06, "loss": 0.5509, "step": 162 }, { "epoch": 0.17377398720682302, "grad_norm": 4.24041748046875, "learning_rate": 9.587582039992598e-06, "loss": 0.5625, "step": 163 }, { "epoch": 0.17484008528784648, "grad_norm": 6.845096111297607, "learning_rate": 9.580542287160348e-06, "loss": 0.7931, "step": 164 }, { "epoch": 0.17590618336886993, "grad_norm": 4.748083591461182, "learning_rate": 9.573445588647978e-06, "loss": 0.8929, "step": 165 }, { "epoch": 0.17697228144989338, "grad_norm": 4.676606178283691, "learning_rate": 9.566292032682228e-06, "loss": 0.5997, "step": 166 }, { "epoch": 0.17803837953091683, "grad_norm": 3.188955307006836, "learning_rate": 9.559081708196696e-06, "loss": 0.5145, "step": 167 }, { "epoch": 0.1791044776119403, "grad_norm": 3.0883800983428955, "learning_rate": 9.551814704830734e-06, "loss": 0.5487, "step": 168 }, { "epoch": 0.18017057569296374, "grad_norm": 3.665557861328125, "learning_rate": 9.544491112928327e-06, "loss": 0.5495, "step": 169 }, { "epoch": 0.1812366737739872, "grad_norm": 2.871556282043457, "learning_rate": 9.537111023536973e-06, "loss": 0.525, "step": 170 }, { "epoch": 0.18230277185501065, "grad_norm": 3.542114019393921, "learning_rate": 9.529674528406556e-06, "loss": 0.4435, "step": 171 }, { "epoch": 0.18336886993603413, "grad_norm": 3.734936237335205, "learning_rate": 9.522181719988196e-06, "loss": 0.5063, "step": 172 }, { "epoch": 0.18443496801705758, "grad_norm": 3.7635278701782227, "learning_rate": 9.514632691433108e-06, "loss": 0.5343, "step": 173 }, { "epoch": 0.18550106609808104, "grad_norm": 3.4705896377563477, "learning_rate": 9.507027536591436e-06, "loss": 0.5683, "step": 174 }, { "epoch": 0.1865671641791045, "grad_norm": 3.6808106899261475, "learning_rate": 9.499366350011093e-06, "loss": 0.5909, "step": 175 }, { "epoch": 0.18763326226012794, "grad_norm": 2.8479557037353516, "learning_rate": 9.491649226936586e-06, "loss": 0.6401, "step": 176 }, { "epoch": 0.1886993603411514, "grad_norm": 4.607538223266602, "learning_rate": 9.483876263307825e-06, "loss": 0.5319, "step": 177 }, { "epoch": 0.18976545842217485, "grad_norm": 4.576630115509033, "learning_rate": 9.476047555758938e-06, "loss": 0.5794, "step": 178 }, { "epoch": 0.1908315565031983, "grad_norm": 4.175206661224365, "learning_rate": 9.468163201617063e-06, "loss": 0.5952, "step": 179 }, { "epoch": 0.19189765458422176, "grad_norm": 4.810217380523682, "learning_rate": 9.460223298901138e-06, "loss": 0.6182, "step": 180 }, { "epoch": 0.1929637526652452, "grad_norm": 4.137125015258789, "learning_rate": 9.452227946320697e-06, "loss": 0.7669, "step": 181 }, { "epoch": 0.19402985074626866, "grad_norm": 2.4917850494384766, "learning_rate": 9.444177243274619e-06, "loss": 0.4388, "step": 182 }, { "epoch": 0.19509594882729211, "grad_norm": 4.999782085418701, "learning_rate": 9.436071289849909e-06, "loss": 0.7209, "step": 183 }, { "epoch": 0.19616204690831557, "grad_norm": 4.8152360916137695, "learning_rate": 9.42791018682045e-06, "loss": 0.5722, "step": 184 }, { "epoch": 0.19722814498933902, "grad_norm": 4.715020179748535, "learning_rate": 9.419694035645753e-06, "loss": 0.8164, "step": 185 }, { "epoch": 0.19829424307036247, "grad_norm": 4.70846700668335, "learning_rate": 9.411422938469683e-06, "loss": 0.5515, "step": 186 }, { "epoch": 0.19936034115138593, "grad_norm": 5.402597427368164, "learning_rate": 9.403096998119206e-06, "loss": 0.9022, "step": 187 }, { "epoch": 0.20042643923240938, "grad_norm": 4.062778472900391, "learning_rate": 9.394716318103098e-06, "loss": 0.3908, "step": 188 }, { "epoch": 0.20149253731343283, "grad_norm": 4.452870845794678, "learning_rate": 9.386281002610669e-06, "loss": 0.5355, "step": 189 }, { "epoch": 0.2025586353944563, "grad_norm": 3.802704095840454, "learning_rate": 9.377791156510456e-06, "loss": 0.5454, "step": 190 }, { "epoch": 0.20362473347547974, "grad_norm": 5.103549957275391, "learning_rate": 9.369246885348926e-06, "loss": 0.5592, "step": 191 }, { "epoch": 0.2046908315565032, "grad_norm": 3.22542142868042, "learning_rate": 9.360648295349165e-06, "loss": 0.3836, "step": 192 }, { "epoch": 0.20575692963752665, "grad_norm": 3.6439967155456543, "learning_rate": 9.351995493409556e-06, "loss": 0.5673, "step": 193 }, { "epoch": 0.2068230277185501, "grad_norm": 3.438176393508911, "learning_rate": 9.343288587102444e-06, "loss": 0.5468, "step": 194 }, { "epoch": 0.20788912579957355, "grad_norm": 3.922708034515381, "learning_rate": 9.334527684672809e-06, "loss": 0.5859, "step": 195 }, { "epoch": 0.208955223880597, "grad_norm": 3.739628791809082, "learning_rate": 9.325712895036916e-06, "loss": 0.6449, "step": 196 }, { "epoch": 0.21002132196162046, "grad_norm": 5.679324150085449, "learning_rate": 9.316844327780955e-06, "loss": 0.6591, "step": 197 }, { "epoch": 0.21108742004264391, "grad_norm": 3.4804131984710693, "learning_rate": 9.307922093159688e-06, "loss": 0.6192, "step": 198 }, { "epoch": 0.21215351812366737, "grad_norm": 4.995907783508301, "learning_rate": 9.298946302095074e-06, "loss": 0.6955, "step": 199 }, { "epoch": 0.21321961620469082, "grad_norm": 5.0011186599731445, "learning_rate": 9.289917066174887e-06, "loss": 0.677, "step": 200 }, { "epoch": 0.21428571428571427, "grad_norm": 4.903175354003906, "learning_rate": 9.280834497651334e-06, "loss": 0.8227, "step": 201 }, { "epoch": 0.21535181236673773, "grad_norm": 4.204827308654785, "learning_rate": 9.271698709439658e-06, "loss": 0.533, "step": 202 }, { "epoch": 0.21641791044776118, "grad_norm": 4.831717491149902, "learning_rate": 9.262509815116732e-06, "loss": 0.6569, "step": 203 }, { "epoch": 0.21748400852878466, "grad_norm": 3.499429225921631, "learning_rate": 9.253267928919652e-06, "loss": 0.4983, "step": 204 }, { "epoch": 0.21855010660980811, "grad_norm": 3.433262348175049, "learning_rate": 9.243973165744306e-06, "loss": 0.6177, "step": 205 }, { "epoch": 0.21961620469083157, "grad_norm": 4.511760711669922, "learning_rate": 9.234625641143962e-06, "loss": 0.5917, "step": 206 }, { "epoch": 0.22068230277185502, "grad_norm": 4.996474742889404, "learning_rate": 9.225225471327815e-06, "loss": 0.8201, "step": 207 }, { "epoch": 0.22174840085287847, "grad_norm": 4.288403034210205, "learning_rate": 9.215772773159556e-06, "loss": 0.6561, "step": 208 }, { "epoch": 0.22281449893390193, "grad_norm": 3.7271835803985596, "learning_rate": 9.206267664155906e-06, "loss": 0.5767, "step": 209 }, { "epoch": 0.22388059701492538, "grad_norm": 3.76225209236145, "learning_rate": 9.196710262485168e-06, "loss": 0.5986, "step": 210 }, { "epoch": 0.22494669509594883, "grad_norm": 4.633055210113525, "learning_rate": 9.187100686965749e-06, "loss": 0.5592, "step": 211 }, { "epoch": 0.2260127931769723, "grad_norm": 4.4236650466918945, "learning_rate": 9.177439057064684e-06, "loss": 0.8021, "step": 212 }, { "epoch": 0.22707889125799574, "grad_norm": 3.8128890991210938, "learning_rate": 9.167725492896153e-06, "loss": 0.5424, "step": 213 }, { "epoch": 0.2281449893390192, "grad_norm": 4.3256940841674805, "learning_rate": 9.157960115219993e-06, "loss": 0.5517, "step": 214 }, { "epoch": 0.22921108742004265, "grad_norm": 3.8854434490203857, "learning_rate": 9.148143045440181e-06, "loss": 0.6936, "step": 215 }, { "epoch": 0.2302771855010661, "grad_norm": 5.513343811035156, "learning_rate": 9.138274405603342e-06, "loss": 0.6737, "step": 216 }, { "epoch": 0.23134328358208955, "grad_norm": 3.945483684539795, "learning_rate": 9.128354318397223e-06, "loss": 0.5544, "step": 217 }, { "epoch": 0.232409381663113, "grad_norm": 3.4478654861450195, "learning_rate": 9.118382907149164e-06, "loss": 0.6122, "step": 218 }, { "epoch": 0.23347547974413646, "grad_norm": 4.193946838378906, "learning_rate": 9.108360295824576e-06, "loss": 0.5894, "step": 219 }, { "epoch": 0.2345415778251599, "grad_norm": 3.8955042362213135, "learning_rate": 9.098286609025392e-06, "loss": 0.5319, "step": 220 }, { "epoch": 0.23560767590618337, "grad_norm": 4.086699962615967, "learning_rate": 9.088161971988517e-06, "loss": 0.572, "step": 221 }, { "epoch": 0.23667377398720682, "grad_norm": 4.041664123535156, "learning_rate": 9.077986510584273e-06, "loss": 0.5894, "step": 222 }, { "epoch": 0.23773987206823027, "grad_norm": 4.037685871124268, "learning_rate": 9.067760351314838e-06, "loss": 0.7726, "step": 223 }, { "epoch": 0.23880597014925373, "grad_norm": 3.610032081604004, "learning_rate": 9.057483621312671e-06, "loss": 0.5974, "step": 224 }, { "epoch": 0.23987206823027718, "grad_norm": 4.686352729797363, "learning_rate": 9.047156448338927e-06, "loss": 0.7261, "step": 225 }, { "epoch": 0.24093816631130063, "grad_norm": 3.578303337097168, "learning_rate": 9.036778960781874e-06, "loss": 0.7701, "step": 226 }, { "epoch": 0.2420042643923241, "grad_norm": 3.9661757946014404, "learning_rate": 9.026351287655294e-06, "loss": 0.5793, "step": 227 }, { "epoch": 0.24307036247334754, "grad_norm": 5.025670051574707, "learning_rate": 9.01587355859688e-06, "loss": 0.651, "step": 228 }, { "epoch": 0.244136460554371, "grad_norm": 5.90174674987793, "learning_rate": 9.005345903866627e-06, "loss": 0.6967, "step": 229 }, { "epoch": 0.24520255863539445, "grad_norm": 3.2720894813537598, "learning_rate": 8.994768454345207e-06, "loss": 0.4902, "step": 230 }, { "epoch": 0.2462686567164179, "grad_norm": 3.946726083755493, "learning_rate": 8.984141341532346e-06, "loss": 0.5975, "step": 231 }, { "epoch": 0.24733475479744135, "grad_norm": 3.8276216983795166, "learning_rate": 8.973464697545191e-06, "loss": 0.5866, "step": 232 }, { "epoch": 0.2484008528784648, "grad_norm": 5.156033039093018, "learning_rate": 8.96273865511666e-06, "loss": 0.8146, "step": 233 }, { "epoch": 0.24946695095948826, "grad_norm": 4.176912307739258, "learning_rate": 8.951963347593797e-06, "loss": 0.6922, "step": 234 }, { "epoch": 0.2505330490405117, "grad_norm": 4.903069972991943, "learning_rate": 8.941138908936118e-06, "loss": 0.6601, "step": 235 }, { "epoch": 0.2515991471215352, "grad_norm": 3.735518455505371, "learning_rate": 8.930265473713939e-06, "loss": 0.4573, "step": 236 }, { "epoch": 0.2526652452025586, "grad_norm": 3.3747098445892334, "learning_rate": 8.9193431771067e-06, "loss": 0.6536, "step": 237 }, { "epoch": 0.2537313432835821, "grad_norm": 4.593052387237549, "learning_rate": 8.908372154901302e-06, "loss": 0.5591, "step": 238 }, { "epoch": 0.2547974413646055, "grad_norm": 3.179175615310669, "learning_rate": 8.897352543490396e-06, "loss": 0.5779, "step": 239 }, { "epoch": 0.255863539445629, "grad_norm": 4.020960807800293, "learning_rate": 8.8862844798707e-06, "loss": 0.5545, "step": 240 }, { "epoch": 0.25692963752665243, "grad_norm": 4.3520097732543945, "learning_rate": 8.875168101641294e-06, "loss": 0.4946, "step": 241 }, { "epoch": 0.2579957356076759, "grad_norm": 4.192246913909912, "learning_rate": 8.864003547001916e-06, "loss": 0.5169, "step": 242 }, { "epoch": 0.25906183368869934, "grad_norm": 4.779207706451416, "learning_rate": 8.852790954751229e-06, "loss": 0.6603, "step": 243 }, { "epoch": 0.2601279317697228, "grad_norm": 5.023779392242432, "learning_rate": 8.841530464285105e-06, "loss": 0.6669, "step": 244 }, { "epoch": 0.26119402985074625, "grad_norm": 4.117948055267334, "learning_rate": 8.83022221559489e-06, "loss": 0.5461, "step": 245 }, { "epoch": 0.2622601279317697, "grad_norm": 5.511953830718994, "learning_rate": 8.81886634926567e-06, "loss": 0.8502, "step": 246 }, { "epoch": 0.26332622601279315, "grad_norm": 4.709165573120117, "learning_rate": 8.807463006474514e-06, "loss": 0.7608, "step": 247 }, { "epoch": 0.26439232409381663, "grad_norm": 3.6320269107818604, "learning_rate": 8.796012328988716e-06, "loss": 0.6373, "step": 248 }, { "epoch": 0.26545842217484006, "grad_norm": 4.277188777923584, "learning_rate": 8.78451445916405e-06, "loss": 0.6774, "step": 249 }, { "epoch": 0.26652452025586354, "grad_norm": 4.38502836227417, "learning_rate": 8.772969539942981e-06, "loss": 0.7912, "step": 250 }, { "epoch": 0.267590618336887, "grad_norm": 3.549016237258911, "learning_rate": 8.7613777148529e-06, "loss": 0.5561, "step": 251 }, { "epoch": 0.26865671641791045, "grad_norm": 3.661815881729126, "learning_rate": 8.749739128004329e-06, "loss": 0.6785, "step": 252 }, { "epoch": 0.2697228144989339, "grad_norm": 4.392899513244629, "learning_rate": 8.738053924089149e-06, "loss": 0.6327, "step": 253 }, { "epoch": 0.27078891257995735, "grad_norm": 4.508695602416992, "learning_rate": 8.726322248378775e-06, "loss": 0.5687, "step": 254 }, { "epoch": 0.27185501066098083, "grad_norm": 3.7261910438537598, "learning_rate": 8.714544246722369e-06, "loss": 0.4706, "step": 255 }, { "epoch": 0.27292110874200426, "grad_norm": 4.259292125701904, "learning_rate": 8.702720065545024e-06, "loss": 0.719, "step": 256 }, { "epoch": 0.27398720682302774, "grad_norm": 3.5374395847320557, "learning_rate": 8.690849851845933e-06, "loss": 0.554, "step": 257 }, { "epoch": 0.27505330490405117, "grad_norm": 3.5622363090515137, "learning_rate": 8.678933753196577e-06, "loss": 0.6285, "step": 258 }, { "epoch": 0.27611940298507465, "grad_norm": 3.590463638305664, "learning_rate": 8.666971917738876e-06, "loss": 0.5025, "step": 259 }, { "epoch": 0.2771855010660981, "grad_norm": 3.7988028526306152, "learning_rate": 8.65496449418336e-06, "loss": 0.6732, "step": 260 }, { "epoch": 0.27825159914712155, "grad_norm": 5.532444477081299, "learning_rate": 8.642911631807306e-06, "loss": 0.5134, "step": 261 }, { "epoch": 0.279317697228145, "grad_norm": 3.07771372795105, "learning_rate": 8.630813480452898e-06, "loss": 0.5896, "step": 262 }, { "epoch": 0.28038379530916846, "grad_norm": 3.9535741806030273, "learning_rate": 8.61867019052535e-06, "loss": 0.66, "step": 263 }, { "epoch": 0.2814498933901919, "grad_norm": 3.4191770553588867, "learning_rate": 8.606481912991052e-06, "loss": 0.694, "step": 264 }, { "epoch": 0.28251599147121537, "grad_norm": 2.6824557781219482, "learning_rate": 8.594248799375671e-06, "loss": 0.5519, "step": 265 }, { "epoch": 0.2835820895522388, "grad_norm": 4.041289806365967, "learning_rate": 8.581971001762287e-06, "loss": 0.6278, "step": 266 }, { "epoch": 0.2846481876332623, "grad_norm": 2.8673665523529053, "learning_rate": 8.569648672789496e-06, "loss": 0.5583, "step": 267 }, { "epoch": 0.2857142857142857, "grad_norm": 4.713343620300293, "learning_rate": 8.557281965649508e-06, "loss": 0.5938, "step": 268 }, { "epoch": 0.2867803837953092, "grad_norm": 3.814359664916992, "learning_rate": 8.54487103408625e-06, "loss": 0.5791, "step": 269 }, { "epoch": 0.2878464818763326, "grad_norm": 3.7030696868896484, "learning_rate": 8.532416032393447e-06, "loss": 0.6769, "step": 270 }, { "epoch": 0.2889125799573561, "grad_norm": 5.340911865234375, "learning_rate": 8.51991711541271e-06, "loss": 0.7043, "step": 271 }, { "epoch": 0.2899786780383795, "grad_norm": 4.9029765129089355, "learning_rate": 8.507374438531606e-06, "loss": 0.642, "step": 272 }, { "epoch": 0.291044776119403, "grad_norm": 3.156393527984619, "learning_rate": 8.494788157681733e-06, "loss": 0.4767, "step": 273 }, { "epoch": 0.2921108742004264, "grad_norm": 3.3508381843566895, "learning_rate": 8.482158429336769e-06, "loss": 0.4239, "step": 274 }, { "epoch": 0.2931769722814499, "grad_norm": 4.38541316986084, "learning_rate": 8.469485410510545e-06, "loss": 0.5657, "step": 275 }, { "epoch": 0.2942430703624733, "grad_norm": 4.696578502655029, "learning_rate": 8.456769258755078e-06, "loss": 0.5802, "step": 276 }, { "epoch": 0.2953091684434968, "grad_norm": 3.3699562549591064, "learning_rate": 8.444010132158614e-06, "loss": 0.5772, "step": 277 }, { "epoch": 0.29637526652452023, "grad_norm": 11.163981437683105, "learning_rate": 8.43120818934367e-06, "loss": 0.5702, "step": 278 }, { "epoch": 0.2974413646055437, "grad_norm": 4.010303974151611, "learning_rate": 8.418363589465055e-06, "loss": 0.5831, "step": 279 }, { "epoch": 0.29850746268656714, "grad_norm": 3.675544261932373, "learning_rate": 8.405476492207902e-06, "loss": 0.7393, "step": 280 }, { "epoch": 0.2995735607675906, "grad_norm": 3.4825985431671143, "learning_rate": 8.392547057785662e-06, "loss": 0.6718, "step": 281 }, { "epoch": 0.3006396588486141, "grad_norm": 4.751382827758789, "learning_rate": 8.379575446938136e-06, "loss": 0.5711, "step": 282 }, { "epoch": 0.3017057569296375, "grad_norm": 4.104181289672852, "learning_rate": 8.366561820929457e-06, "loss": 0.6158, "step": 283 }, { "epoch": 0.302771855010661, "grad_norm": 4.452391624450684, "learning_rate": 8.353506341546106e-06, "loss": 0.5655, "step": 284 }, { "epoch": 0.30383795309168443, "grad_norm": 3.4318346977233887, "learning_rate": 8.340409171094874e-06, "loss": 0.5709, "step": 285 }, { "epoch": 0.3049040511727079, "grad_norm": 3.9533636569976807, "learning_rate": 8.32727047240087e-06, "loss": 0.5992, "step": 286 }, { "epoch": 0.30597014925373134, "grad_norm": 3.464911937713623, "learning_rate": 8.314090408805481e-06, "loss": 0.6862, "step": 287 }, { "epoch": 0.3070362473347548, "grad_norm": 3.5581862926483154, "learning_rate": 8.300869144164346e-06, "loss": 0.5648, "step": 288 }, { "epoch": 0.30810234541577824, "grad_norm": 4.116286754608154, "learning_rate": 8.28760684284532e-06, "loss": 0.46, "step": 289 }, { "epoch": 0.3091684434968017, "grad_norm": 3.6500816345214844, "learning_rate": 8.274303669726427e-06, "loss": 0.4572, "step": 290 }, { "epoch": 0.31023454157782515, "grad_norm": 3.4843218326568604, "learning_rate": 8.260959790193815e-06, "loss": 0.6161, "step": 291 }, { "epoch": 0.31130063965884863, "grad_norm": 3.8216540813446045, "learning_rate": 8.247575370139695e-06, "loss": 0.5235, "step": 292 }, { "epoch": 0.31236673773987206, "grad_norm": 3.1122493743896484, "learning_rate": 8.234150575960288e-06, "loss": 0.4064, "step": 293 }, { "epoch": 0.31343283582089554, "grad_norm": 2.9414212703704834, "learning_rate": 8.220685574553739e-06, "loss": 0.4356, "step": 294 }, { "epoch": 0.31449893390191896, "grad_norm": 3.4299416542053223, "learning_rate": 8.207180533318061e-06, "loss": 0.4433, "step": 295 }, { "epoch": 0.31556503198294245, "grad_norm": 5.096414566040039, "learning_rate": 8.193635620149041e-06, "loss": 0.5758, "step": 296 }, { "epoch": 0.31663113006396587, "grad_norm": 4.246921539306641, "learning_rate": 8.180051003438158e-06, "loss": 0.772, "step": 297 }, { "epoch": 0.31769722814498935, "grad_norm": 4.30014181137085, "learning_rate": 8.16642685207049e-06, "loss": 0.4775, "step": 298 }, { "epoch": 0.3187633262260128, "grad_norm": 3.2169227600097656, "learning_rate": 8.152763335422612e-06, "loss": 0.4522, "step": 299 }, { "epoch": 0.31982942430703626, "grad_norm": 4.288012504577637, "learning_rate": 8.139060623360494e-06, "loss": 0.8488, "step": 300 }, { "epoch": 0.3208955223880597, "grad_norm": 3.9659626483917236, "learning_rate": 8.125318886237382e-06, "loss": 0.5675, "step": 301 }, { "epoch": 0.32196162046908317, "grad_norm": 4.620718955993652, "learning_rate": 8.111538294891684e-06, "loss": 0.5964, "step": 302 }, { "epoch": 0.3230277185501066, "grad_norm": 4.675658226013184, "learning_rate": 8.097719020644855e-06, "loss": 0.6418, "step": 303 }, { "epoch": 0.32409381663113007, "grad_norm": 4.752927780151367, "learning_rate": 8.083861235299253e-06, "loss": 0.7189, "step": 304 }, { "epoch": 0.3251599147121535, "grad_norm": 4.81943416595459, "learning_rate": 8.06996511113601e-06, "loss": 0.5279, "step": 305 }, { "epoch": 0.326226012793177, "grad_norm": 3.218155860900879, "learning_rate": 8.05603082091289e-06, "loss": 0.5851, "step": 306 }, { "epoch": 0.3272921108742004, "grad_norm": 5.198525428771973, "learning_rate": 8.04205853786214e-06, "loss": 0.7337, "step": 307 }, { "epoch": 0.3283582089552239, "grad_norm": 3.304112672805786, "learning_rate": 8.028048435688333e-06, "loss": 0.5363, "step": 308 }, { "epoch": 0.3294243070362473, "grad_norm": 4.646366596221924, "learning_rate": 8.014000688566224e-06, "loss": 0.4362, "step": 309 }, { "epoch": 0.3304904051172708, "grad_norm": 5.290032386779785, "learning_rate": 7.999915471138562e-06, "loss": 0.792, "step": 310 }, { "epoch": 0.3315565031982942, "grad_norm": 3.598745107650757, "learning_rate": 7.985792958513932e-06, "loss": 0.6106, "step": 311 }, { "epoch": 0.3326226012793177, "grad_norm": 3.7250967025756836, "learning_rate": 7.971633326264581e-06, "loss": 0.6502, "step": 312 }, { "epoch": 0.3336886993603412, "grad_norm": 4.78380823135376, "learning_rate": 7.957436750424223e-06, "loss": 0.4703, "step": 313 }, { "epoch": 0.3347547974413646, "grad_norm": 4.254910945892334, "learning_rate": 7.943203407485864e-06, "loss": 0.6798, "step": 314 }, { "epoch": 0.3358208955223881, "grad_norm": 3.7106809616088867, "learning_rate": 7.928933474399601e-06, "loss": 0.5532, "step": 315 }, { "epoch": 0.3368869936034115, "grad_norm": 3.9405412673950195, "learning_rate": 7.91462712857042e-06, "loss": 0.563, "step": 316 }, { "epoch": 0.337953091684435, "grad_norm": 3.9015417098999023, "learning_rate": 7.900284547855992e-06, "loss": 0.5218, "step": 317 }, { "epoch": 0.3390191897654584, "grad_norm": 4.474105358123779, "learning_rate": 7.885905910564466e-06, "loss": 0.5495, "step": 318 }, { "epoch": 0.3400852878464819, "grad_norm": 3.2573843002319336, "learning_rate": 7.87149139545225e-06, "loss": 0.8234, "step": 319 }, { "epoch": 0.3411513859275053, "grad_norm": 4.134116172790527, "learning_rate": 7.857041181721788e-06, "loss": 0.6878, "step": 320 }, { "epoch": 0.3422174840085288, "grad_norm": 3.7018449306488037, "learning_rate": 7.842555449019326e-06, "loss": 0.6351, "step": 321 }, { "epoch": 0.34328358208955223, "grad_norm": 3.8063786029815674, "learning_rate": 7.828034377432694e-06, "loss": 0.6841, "step": 322 }, { "epoch": 0.3443496801705757, "grad_norm": 3.2621402740478516, "learning_rate": 7.813478147489052e-06, "loss": 0.6191, "step": 323 }, { "epoch": 0.34541577825159914, "grad_norm": 4.136355400085449, "learning_rate": 7.798886940152654e-06, "loss": 0.8123, "step": 324 }, { "epoch": 0.3464818763326226, "grad_norm": 2.7552497386932373, "learning_rate": 7.784260936822592e-06, "loss": 0.5188, "step": 325 }, { "epoch": 0.34754797441364604, "grad_norm": 3.848180055618286, "learning_rate": 7.769600319330553e-06, "loss": 0.5112, "step": 326 }, { "epoch": 0.3486140724946695, "grad_norm": 3.5491998195648193, "learning_rate": 7.75490526993854e-06, "loss": 0.5926, "step": 327 }, { "epoch": 0.34968017057569295, "grad_norm": 3.8252878189086914, "learning_rate": 7.740175971336624e-06, "loss": 0.6565, "step": 328 }, { "epoch": 0.35074626865671643, "grad_norm": 3.8406832218170166, "learning_rate": 7.725412606640658e-06, "loss": 0.6515, "step": 329 }, { "epoch": 0.35181236673773986, "grad_norm": 3.478328227996826, "learning_rate": 7.710615359390018e-06, "loss": 0.5813, "step": 330 }, { "epoch": 0.35287846481876334, "grad_norm": 3.934687852859497, "learning_rate": 7.6957844135453e-06, "loss": 0.6966, "step": 331 }, { "epoch": 0.35394456289978676, "grad_norm": 3.123108386993408, "learning_rate": 7.680919953486047e-06, "loss": 0.4644, "step": 332 }, { "epoch": 0.35501066098081024, "grad_norm": 3.3772783279418945, "learning_rate": 7.666022164008458e-06, "loss": 0.5603, "step": 333 }, { "epoch": 0.35607675906183367, "grad_norm": 4.320068836212158, "learning_rate": 7.651091230323079e-06, "loss": 0.6079, "step": 334 }, { "epoch": 0.35714285714285715, "grad_norm": 3.2757842540740967, "learning_rate": 7.636127338052513e-06, "loss": 0.5704, "step": 335 }, { "epoch": 0.3582089552238806, "grad_norm": 5.183093547821045, "learning_rate": 7.621130673229105e-06, "loss": 0.6036, "step": 336 }, { "epoch": 0.35927505330490406, "grad_norm": 4.51352071762085, "learning_rate": 7.606101422292629e-06, "loss": 0.6214, "step": 337 }, { "epoch": 0.3603411513859275, "grad_norm": 3.7789204120635986, "learning_rate": 7.5910397720879785e-06, "loss": 0.4881, "step": 338 }, { "epoch": 0.36140724946695096, "grad_norm": 5.321053504943848, "learning_rate": 7.575945909862829e-06, "loss": 0.6238, "step": 339 }, { "epoch": 0.3624733475479744, "grad_norm": 3.838653802871704, "learning_rate": 7.5608200232653254e-06, "loss": 0.4916, "step": 340 }, { "epoch": 0.36353944562899787, "grad_norm": 5.858667373657227, "learning_rate": 7.545662300341736e-06, "loss": 0.5477, "step": 341 }, { "epoch": 0.3646055437100213, "grad_norm": 3.9880146980285645, "learning_rate": 7.530472929534126e-06, "loss": 0.5602, "step": 342 }, { "epoch": 0.3656716417910448, "grad_norm": 4.388545513153076, "learning_rate": 7.515252099678011e-06, "loss": 0.5575, "step": 343 }, { "epoch": 0.36673773987206826, "grad_norm": 3.4173972606658936, "learning_rate": 7.500000000000001e-06, "loss": 0.6036, "step": 344 }, { "epoch": 0.3678038379530917, "grad_norm": 3.01003360748291, "learning_rate": 7.484716820115461e-06, "loss": 0.6286, "step": 345 }, { "epoch": 0.36886993603411516, "grad_norm": 4.146943092346191, "learning_rate": 7.469402750026147e-06, "loss": 0.4832, "step": 346 }, { "epoch": 0.3699360341151386, "grad_norm": 3.9188106060028076, "learning_rate": 7.454057980117842e-06, "loss": 0.6687, "step": 347 }, { "epoch": 0.37100213219616207, "grad_norm": 4.771470069885254, "learning_rate": 7.438682701157993e-06, "loss": 0.4679, "step": 348 }, { "epoch": 0.3720682302771855, "grad_norm": 4.040458679199219, "learning_rate": 7.423277104293338e-06, "loss": 0.4826, "step": 349 }, { "epoch": 0.373134328358209, "grad_norm": 3.8164219856262207, "learning_rate": 7.407841381047533e-06, "loss": 0.6319, "step": 350 }, { "epoch": 0.3742004264392324, "grad_norm": 3.6950464248657227, "learning_rate": 7.392375723318761e-06, "loss": 0.5198, "step": 351 }, { "epoch": 0.3752665245202559, "grad_norm": 2.5784971714019775, "learning_rate": 7.376880323377357e-06, "loss": 0.431, "step": 352 }, { "epoch": 0.3763326226012793, "grad_norm": 3.1525065898895264, "learning_rate": 7.361355373863415e-06, "loss": 0.5914, "step": 353 }, { "epoch": 0.3773987206823028, "grad_norm": 3.6381683349609375, "learning_rate": 7.345801067784388e-06, "loss": 0.5482, "step": 354 }, { "epoch": 0.3784648187633262, "grad_norm": 3.502392292022705, "learning_rate": 7.330217598512696e-06, "loss": 0.5811, "step": 355 }, { "epoch": 0.3795309168443497, "grad_norm": 4.3851165771484375, "learning_rate": 7.314605159783313e-06, "loss": 0.4824, "step": 356 }, { "epoch": 0.3805970149253731, "grad_norm": 4.022966384887695, "learning_rate": 7.298963945691371e-06, "loss": 0.7176, "step": 357 }, { "epoch": 0.3816631130063966, "grad_norm": 2.9058971405029297, "learning_rate": 7.283294150689735e-06, "loss": 0.599, "step": 358 }, { "epoch": 0.38272921108742003, "grad_norm": 3.7773144245147705, "learning_rate": 7.2675959695865896e-06, "loss": 0.5278, "step": 359 }, { "epoch": 0.3837953091684435, "grad_norm": 4.681633472442627, "learning_rate": 7.251869597543019e-06, "loss": 0.6136, "step": 360 }, { "epoch": 0.38486140724946694, "grad_norm": 3.3713104724884033, "learning_rate": 7.2361152300705795e-06, "loss": 0.4926, "step": 361 }, { "epoch": 0.3859275053304904, "grad_norm": 3.0311970710754395, "learning_rate": 7.2203330630288714e-06, "loss": 0.4955, "step": 362 }, { "epoch": 0.38699360341151384, "grad_norm": 4.987486839294434, "learning_rate": 7.2045232926230965e-06, "loss": 0.646, "step": 363 }, { "epoch": 0.3880597014925373, "grad_norm": 3.312826633453369, "learning_rate": 7.188686115401628e-06, "loss": 0.415, "step": 364 }, { "epoch": 0.38912579957356075, "grad_norm": 4.103660583496094, "learning_rate": 7.172821728253563e-06, "loss": 0.6072, "step": 365 }, { "epoch": 0.39019189765458423, "grad_norm": 4.510636329650879, "learning_rate": 7.156930328406268e-06, "loss": 0.7277, "step": 366 }, { "epoch": 0.39125799573560766, "grad_norm": 3.8437888622283936, "learning_rate": 7.141012113422942e-06, "loss": 0.7967, "step": 367 }, { "epoch": 0.39232409381663114, "grad_norm": 4.921056270599365, "learning_rate": 7.1250672812001505e-06, "loss": 0.5895, "step": 368 }, { "epoch": 0.39339019189765456, "grad_norm": 4.110161304473877, "learning_rate": 7.109096029965362e-06, "loss": 0.6858, "step": 369 }, { "epoch": 0.39445628997867804, "grad_norm": 3.090623140335083, "learning_rate": 7.093098558274494e-06, "loss": 0.4737, "step": 370 }, { "epoch": 0.39552238805970147, "grad_norm": 4.709496974945068, "learning_rate": 7.0770750650094335e-06, "loss": 0.6804, "step": 371 }, { "epoch": 0.39658848614072495, "grad_norm": 3.614657402038574, "learning_rate": 7.061025749375572e-06, "loss": 0.6267, "step": 372 }, { "epoch": 0.3976545842217484, "grad_norm": 3.6256072521209717, "learning_rate": 7.044950810899332e-06, "loss": 0.5447, "step": 373 }, { "epoch": 0.39872068230277186, "grad_norm": 3.082536220550537, "learning_rate": 7.02885044942567e-06, "loss": 0.5205, "step": 374 }, { "epoch": 0.3997867803837953, "grad_norm": 4.234053611755371, "learning_rate": 7.012724865115615e-06, "loss": 0.7579, "step": 375 }, { "epoch": 0.40085287846481876, "grad_norm": 3.8977458477020264, "learning_rate": 6.996574258443761e-06, "loss": 0.7173, "step": 376 }, { "epoch": 0.40191897654584224, "grad_norm": 4.204172611236572, "learning_rate": 6.980398830195785e-06, "loss": 0.5162, "step": 377 }, { "epoch": 0.40298507462686567, "grad_norm": 3.8334693908691406, "learning_rate": 6.964198781465948e-06, "loss": 0.5747, "step": 378 }, { "epoch": 0.40405117270788915, "grad_norm": 3.4994022846221924, "learning_rate": 6.947974313654592e-06, "loss": 0.4717, "step": 379 }, { "epoch": 0.4051172707889126, "grad_norm": 2.826071262359619, "learning_rate": 6.931725628465643e-06, "loss": 0.6413, "step": 380 }, { "epoch": 0.40618336886993606, "grad_norm": 4.1695051193237305, "learning_rate": 6.9154529279040985e-06, "loss": 0.7095, "step": 381 }, { "epoch": 0.4072494669509595, "grad_norm": 4.134946346282959, "learning_rate": 6.899156414273514e-06, "loss": 0.6418, "step": 382 }, { "epoch": 0.40831556503198296, "grad_norm": 3.5761585235595703, "learning_rate": 6.882836290173493e-06, "loss": 0.6792, "step": 383 }, { "epoch": 0.4093816631130064, "grad_norm": 3.9440622329711914, "learning_rate": 6.866492758497171e-06, "loss": 0.6186, "step": 384 }, { "epoch": 0.41044776119402987, "grad_norm": 3.589193820953369, "learning_rate": 6.850126022428678e-06, "loss": 0.4929, "step": 385 }, { "epoch": 0.4115138592750533, "grad_norm": 7.7840046882629395, "learning_rate": 6.833736285440632e-06, "loss": 0.6553, "step": 386 }, { "epoch": 0.4125799573560768, "grad_norm": 4.186641216278076, "learning_rate": 6.817323751291598e-06, "loss": 0.5585, "step": 387 }, { "epoch": 0.4136460554371002, "grad_norm": 4.252668857574463, "learning_rate": 6.800888624023552e-06, "loss": 0.6665, "step": 388 }, { "epoch": 0.4147121535181237, "grad_norm": 3.530285120010376, "learning_rate": 6.78443110795936e-06, "loss": 0.5735, "step": 389 }, { "epoch": 0.4157782515991471, "grad_norm": 3.9304752349853516, "learning_rate": 6.767951407700217e-06, "loss": 0.5321, "step": 390 }, { "epoch": 0.4168443496801706, "grad_norm": 3.7759995460510254, "learning_rate": 6.75144972812312e-06, "loss": 0.6324, "step": 391 }, { "epoch": 0.417910447761194, "grad_norm": 3.334618091583252, "learning_rate": 6.734926274378313e-06, "loss": 0.5712, "step": 392 }, { "epoch": 0.4189765458422175, "grad_norm": 3.486398220062256, "learning_rate": 6.7183812518867365e-06, "loss": 0.6824, "step": 393 }, { "epoch": 0.4200426439232409, "grad_norm": 3.6092135906219482, "learning_rate": 6.701814866337477e-06, "loss": 0.5945, "step": 394 }, { "epoch": 0.4211087420042644, "grad_norm": 3.755216121673584, "learning_rate": 6.685227323685209e-06, "loss": 0.4773, "step": 395 }, { "epoch": 0.42217484008528783, "grad_norm": 4.845449447631836, "learning_rate": 6.668618830147634e-06, "loss": 0.5127, "step": 396 }, { "epoch": 0.4232409381663113, "grad_norm": 3.2996935844421387, "learning_rate": 6.651989592202913e-06, "loss": 0.6736, "step": 397 }, { "epoch": 0.42430703624733473, "grad_norm": 5.081860065460205, "learning_rate": 6.635339816587109e-06, "loss": 0.8248, "step": 398 }, { "epoch": 0.4253731343283582, "grad_norm": 4.356993198394775, "learning_rate": 6.618669710291607e-06, "loss": 0.7287, "step": 399 }, { "epoch": 0.42643923240938164, "grad_norm": 2.8682384490966797, "learning_rate": 6.601979480560543e-06, "loss": 0.5816, "step": 400 }, { "epoch": 0.4275053304904051, "grad_norm": 5.256602764129639, "learning_rate": 6.5852693348882345e-06, "loss": 0.6942, "step": 401 }, { "epoch": 0.42857142857142855, "grad_norm": 3.685624599456787, "learning_rate": 6.568539481016593e-06, "loss": 0.619, "step": 402 }, { "epoch": 0.42963752665245203, "grad_norm": 3.7546422481536865, "learning_rate": 6.551790126932543e-06, "loss": 0.5492, "step": 403 }, { "epoch": 0.43070362473347545, "grad_norm": 3.4464101791381836, "learning_rate": 6.535021480865439e-06, "loss": 0.4916, "step": 404 }, { "epoch": 0.43176972281449894, "grad_norm": 4.058125972747803, "learning_rate": 6.5182337512844725e-06, "loss": 0.607, "step": 405 }, { "epoch": 0.43283582089552236, "grad_norm": 3.8935625553131104, "learning_rate": 6.501427146896087e-06, "loss": 0.5569, "step": 406 }, { "epoch": 0.43390191897654584, "grad_norm": 3.8241467475891113, "learning_rate": 6.484601876641375e-06, "loss": 0.6678, "step": 407 }, { "epoch": 0.4349680170575693, "grad_norm": 3.7094409465789795, "learning_rate": 6.467758149693486e-06, "loss": 0.6035, "step": 408 }, { "epoch": 0.43603411513859275, "grad_norm": 4.244424819946289, "learning_rate": 6.450896175455027e-06, "loss": 0.7245, "step": 409 }, { "epoch": 0.43710021321961623, "grad_norm": 3.3813302516937256, "learning_rate": 6.434016163555452e-06, "loss": 0.7384, "step": 410 }, { "epoch": 0.43816631130063965, "grad_norm": 3.6579248905181885, "learning_rate": 6.417118323848465e-06, "loss": 0.4834, "step": 411 }, { "epoch": 0.43923240938166314, "grad_norm": 8.583823204040527, "learning_rate": 6.400202866409405e-06, "loss": 0.8517, "step": 412 }, { "epoch": 0.44029850746268656, "grad_norm": 3.1570165157318115, "learning_rate": 6.383270001532636e-06, "loss": 0.4952, "step": 413 }, { "epoch": 0.44136460554371004, "grad_norm": 4.001248359680176, "learning_rate": 6.366319939728934e-06, "loss": 0.8003, "step": 414 }, { "epoch": 0.44243070362473347, "grad_norm": 3.315763473510742, "learning_rate": 6.3493528917228664e-06, "loss": 0.5263, "step": 415 }, { "epoch": 0.44349680170575695, "grad_norm": 3.5100173950195312, "learning_rate": 6.332369068450175e-06, "loss": 0.6011, "step": 416 }, { "epoch": 0.4445628997867804, "grad_norm": 3.7093770503997803, "learning_rate": 6.315368681055157e-06, "loss": 0.5395, "step": 417 }, { "epoch": 0.44562899786780386, "grad_norm": 4.817330837249756, "learning_rate": 6.29835194088803e-06, "loss": 0.7976, "step": 418 }, { "epoch": 0.4466950959488273, "grad_norm": 3.750582218170166, "learning_rate": 6.2813190595023135e-06, "loss": 0.6298, "step": 419 }, { "epoch": 0.44776119402985076, "grad_norm": 3.5215864181518555, "learning_rate": 6.264270248652199e-06, "loss": 0.6795, "step": 420 }, { "epoch": 0.4488272921108742, "grad_norm": 3.3415589332580566, "learning_rate": 6.247205720289907e-06, "loss": 0.5405, "step": 421 }, { "epoch": 0.44989339019189767, "grad_norm": 3.9054391384124756, "learning_rate": 6.230125686563068e-06, "loss": 0.6196, "step": 422 }, { "epoch": 0.4509594882729211, "grad_norm": 3.709041118621826, "learning_rate": 6.213030359812069e-06, "loss": 0.5513, "step": 423 }, { "epoch": 0.4520255863539446, "grad_norm": 4.883928298950195, "learning_rate": 6.195919952567426e-06, "loss": 0.7326, "step": 424 }, { "epoch": 0.453091684434968, "grad_norm": 3.3161134719848633, "learning_rate": 6.178794677547138e-06, "loss": 0.7268, "step": 425 }, { "epoch": 0.4541577825159915, "grad_norm": 3.9062917232513428, "learning_rate": 6.161654747654033e-06, "loss": 0.6047, "step": 426 }, { "epoch": 0.4552238805970149, "grad_norm": 3.8535194396972656, "learning_rate": 6.14450037597314e-06, "loss": 0.6635, "step": 427 }, { "epoch": 0.4562899786780384, "grad_norm": 4.017055034637451, "learning_rate": 6.127331775769023e-06, "loss": 0.6548, "step": 428 }, { "epoch": 0.4573560767590618, "grad_norm": 3.888073205947876, "learning_rate": 6.110149160483139e-06, "loss": 0.7055, "step": 429 }, { "epoch": 0.4584221748400853, "grad_norm": 4.234426021575928, "learning_rate": 6.092952743731179e-06, "loss": 0.799, "step": 430 }, { "epoch": 0.4594882729211087, "grad_norm": 3.7901766300201416, "learning_rate": 6.07574273930042e-06, "loss": 0.6032, "step": 431 }, { "epoch": 0.4605543710021322, "grad_norm": 4.043405532836914, "learning_rate": 6.058519361147055e-06, "loss": 0.6698, "step": 432 }, { "epoch": 0.4616204690831556, "grad_norm": 3.9013798236846924, "learning_rate": 6.041282823393546e-06, "loss": 0.6494, "step": 433 }, { "epoch": 0.4626865671641791, "grad_norm": 2.807823896408081, "learning_rate": 6.024033340325954e-06, "loss": 0.5004, "step": 434 }, { "epoch": 0.46375266524520253, "grad_norm": 4.466773509979248, "learning_rate": 6.006771126391278e-06, "loss": 0.6496, "step": 435 }, { "epoch": 0.464818763326226, "grad_norm": 3.818918228149414, "learning_rate": 5.989496396194787e-06, "loss": 0.556, "step": 436 }, { "epoch": 0.46588486140724944, "grad_norm": 3.359609365463257, "learning_rate": 5.972209364497355e-06, "loss": 0.6322, "step": 437 }, { "epoch": 0.4669509594882729, "grad_norm": 2.93123459815979, "learning_rate": 5.954910246212787e-06, "loss": 0.5556, "step": 438 }, { "epoch": 0.4680170575692964, "grad_norm": 5.493821620941162, "learning_rate": 5.937599256405151e-06, "loss": 0.7258, "step": 439 }, { "epoch": 0.4690831556503198, "grad_norm": 3.857835054397583, "learning_rate": 5.920276610286102e-06, "loss": 0.5284, "step": 440 }, { "epoch": 0.4701492537313433, "grad_norm": 3.4338183403015137, "learning_rate": 5.90294252321221e-06, "loss": 0.4414, "step": 441 }, { "epoch": 0.47121535181236673, "grad_norm": 4.471683025360107, "learning_rate": 5.885597210682273e-06, "loss": 0.5146, "step": 442 }, { "epoch": 0.4722814498933902, "grad_norm": 3.333407402038574, "learning_rate": 5.8682408883346535e-06, "loss": 0.4936, "step": 443 }, { "epoch": 0.47334754797441364, "grad_norm": 3.492453098297119, "learning_rate": 5.850873771944581e-06, "loss": 0.5287, "step": 444 }, { "epoch": 0.4744136460554371, "grad_norm": 3.034766912460327, "learning_rate": 5.833496077421485e-06, "loss": 0.4371, "step": 445 }, { "epoch": 0.47547974413646055, "grad_norm": 3.7001404762268066, "learning_rate": 5.816108020806297e-06, "loss": 0.5731, "step": 446 }, { "epoch": 0.47654584221748403, "grad_norm": 3.7596747875213623, "learning_rate": 5.798709818268775e-06, "loss": 0.691, "step": 447 }, { "epoch": 0.47761194029850745, "grad_norm": 3.5472121238708496, "learning_rate": 5.781301686104808e-06, "loss": 0.661, "step": 448 }, { "epoch": 0.47867803837953093, "grad_norm": 4.468401908874512, "learning_rate": 5.763883840733736e-06, "loss": 0.7486, "step": 449 }, { "epoch": 0.47974413646055436, "grad_norm": 4.200370788574219, "learning_rate": 5.746456498695648e-06, "loss": 0.4656, "step": 450 }, { "epoch": 0.48081023454157784, "grad_norm": 3.460596799850464, "learning_rate": 5.729019876648704e-06, "loss": 0.6845, "step": 451 }, { "epoch": 0.48187633262260127, "grad_norm": 4.082694053649902, "learning_rate": 5.711574191366427e-06, "loss": 0.3716, "step": 452 }, { "epoch": 0.48294243070362475, "grad_norm": 4.091470241546631, "learning_rate": 5.694119659735018e-06, "loss": 0.4248, "step": 453 }, { "epoch": 0.4840085287846482, "grad_norm": 4.31389856338501, "learning_rate": 5.6766564987506564e-06, "loss": 0.4651, "step": 454 }, { "epoch": 0.48507462686567165, "grad_norm": 3.317697763442993, "learning_rate": 5.659184925516802e-06, "loss": 0.3848, "step": 455 }, { "epoch": 0.4861407249466951, "grad_norm": 3.163891315460205, "learning_rate": 5.641705157241497e-06, "loss": 0.5658, "step": 456 }, { "epoch": 0.48720682302771856, "grad_norm": 3.1871583461761475, "learning_rate": 5.624217411234667e-06, "loss": 0.5956, "step": 457 }, { "epoch": 0.488272921108742, "grad_norm": 3.883317708969116, "learning_rate": 5.60672190490541e-06, "loss": 0.5181, "step": 458 }, { "epoch": 0.48933901918976547, "grad_norm": 4.082622528076172, "learning_rate": 5.58921885575931e-06, "loss": 0.7151, "step": 459 }, { "epoch": 0.4904051172707889, "grad_norm": 3.3456997871398926, "learning_rate": 5.571708481395719e-06, "loss": 0.619, "step": 460 }, { "epoch": 0.4914712153518124, "grad_norm": 3.6043877601623535, "learning_rate": 5.5541909995050554e-06, "loss": 0.6869, "step": 461 }, { "epoch": 0.4925373134328358, "grad_norm": 4.19076681137085, "learning_rate": 5.536666627866104e-06, "loss": 0.548, "step": 462 }, { "epoch": 0.4936034115138593, "grad_norm": 4.946224212646484, "learning_rate": 5.519135584343301e-06, "loss": 0.61, "step": 463 }, { "epoch": 0.4946695095948827, "grad_norm": 3.0236222743988037, "learning_rate": 5.5015980868840254e-06, "loss": 0.4456, "step": 464 }, { "epoch": 0.4957356076759062, "grad_norm": 4.4427170753479, "learning_rate": 5.484054353515896e-06, "loss": 0.556, "step": 465 }, { "epoch": 0.4968017057569296, "grad_norm": 3.3823118209838867, "learning_rate": 5.466504602344055e-06, "loss": 0.7274, "step": 466 }, { "epoch": 0.4978678038379531, "grad_norm": 3.582003116607666, "learning_rate": 5.448949051548459e-06, "loss": 0.71, "step": 467 }, { "epoch": 0.4989339019189765, "grad_norm": 3.869276762008667, "learning_rate": 5.431387919381166e-06, "loss": 0.5856, "step": 468 }, { "epoch": 0.5, "grad_norm": 3.6900205612182617, "learning_rate": 5.41382142416362e-06, "loss": 0.4308, "step": 469 }, { "epoch": 0.5010660980810234, "grad_norm": 4.586563587188721, "learning_rate": 5.396249784283943e-06, "loss": 0.7768, "step": 470 }, { "epoch": 0.502132196162047, "grad_norm": 3.9025163650512695, "learning_rate": 5.3786732181942135e-06, "loss": 0.5551, "step": 471 }, { "epoch": 0.5031982942430704, "grad_norm": 3.4454126358032227, "learning_rate": 5.361091944407751e-06, "loss": 0.7165, "step": 472 }, { "epoch": 0.5042643923240938, "grad_norm": 3.720132350921631, "learning_rate": 5.343506181496405e-06, "loss": 0.5594, "step": 473 }, { "epoch": 0.5053304904051172, "grad_norm": 3.436967611312866, "learning_rate": 5.3259161480878354e-06, "loss": 0.5669, "step": 474 }, { "epoch": 0.5063965884861408, "grad_norm": 3.355968475341797, "learning_rate": 5.308322062862786e-06, "loss": 0.5253, "step": 475 }, { "epoch": 0.5074626865671642, "grad_norm": 4.170572280883789, "learning_rate": 5.290724144552379e-06, "loss": 0.7352, "step": 476 }, { "epoch": 0.5085287846481876, "grad_norm": 3.228147029876709, "learning_rate": 5.2731226119353915e-06, "loss": 0.654, "step": 477 }, { "epoch": 0.509594882729211, "grad_norm": 3.3596479892730713, "learning_rate": 5.255517683835528e-06, "loss": 0.4964, "step": 478 }, { "epoch": 0.5106609808102346, "grad_norm": 3.6093690395355225, "learning_rate": 5.237909579118713e-06, "loss": 0.5802, "step": 479 }, { "epoch": 0.511727078891258, "grad_norm": 3.618178129196167, "learning_rate": 5.220298516690353e-06, "loss": 0.7444, "step": 480 }, { "epoch": 0.5127931769722814, "grad_norm": 4.173375129699707, "learning_rate": 5.202684715492635e-06, "loss": 0.5926, "step": 481 }, { "epoch": 0.5138592750533049, "grad_norm": 2.8536767959594727, "learning_rate": 5.185068394501791e-06, "loss": 0.4298, "step": 482 }, { "epoch": 0.5149253731343284, "grad_norm": 3.693899393081665, "learning_rate": 5.1674497727253766e-06, "loss": 0.6429, "step": 483 }, { "epoch": 0.5159914712153518, "grad_norm": 3.8439035415649414, "learning_rate": 5.149829069199555e-06, "loss": 0.7529, "step": 484 }, { "epoch": 0.5170575692963753, "grad_norm": 4.210805416107178, "learning_rate": 5.132206502986368e-06, "loss": 0.5794, "step": 485 }, { "epoch": 0.5181236673773987, "grad_norm": 2.9010701179504395, "learning_rate": 5.114582293171012e-06, "loss": 0.5061, "step": 486 }, { "epoch": 0.5191897654584222, "grad_norm": 5.9206695556640625, "learning_rate": 5.096956658859122e-06, "loss": 0.6725, "step": 487 }, { "epoch": 0.5202558635394456, "grad_norm": 2.9351608753204346, "learning_rate": 5.07932981917404e-06, "loss": 0.6279, "step": 488 }, { "epoch": 0.5213219616204691, "grad_norm": 4.449506759643555, "learning_rate": 5.061701993254092e-06, "loss": 0.5733, "step": 489 }, { "epoch": 0.5223880597014925, "grad_norm": 3.6895699501037598, "learning_rate": 5.044073400249867e-06, "loss": 0.5113, "step": 490 }, { "epoch": 0.523454157782516, "grad_norm": 4.424152851104736, "learning_rate": 5.026444259321489e-06, "loss": 0.5759, "step": 491 }, { "epoch": 0.5245202558635395, "grad_norm": 3.878295421600342, "learning_rate": 5.008814789635894e-06, "loss": 0.5668, "step": 492 }, { "epoch": 0.5255863539445629, "grad_norm": 4.013467311859131, "learning_rate": 4.9911852103641065e-06, "loss": 0.658, "step": 493 }, { "epoch": 0.5266524520255863, "grad_norm": 5.206568717956543, "learning_rate": 4.973555740678512e-06, "loss": 0.8008, "step": 494 }, { "epoch": 0.5277185501066098, "grad_norm": 4.0673370361328125, "learning_rate": 4.955926599750134e-06, "loss": 0.5527, "step": 495 }, { "epoch": 0.5287846481876333, "grad_norm": 3.218914270401001, "learning_rate": 4.938298006745909e-06, "loss": 0.4456, "step": 496 }, { "epoch": 0.5298507462686567, "grad_norm": 4.850662708282471, "learning_rate": 4.9206701808259605e-06, "loss": 0.8031, "step": 497 }, { "epoch": 0.5309168443496801, "grad_norm": 4.463698863983154, "learning_rate": 4.903043341140879e-06, "loss": 0.59, "step": 498 }, { "epoch": 0.5319829424307037, "grad_norm": 3.5384063720703125, "learning_rate": 4.885417706828989e-06, "loss": 0.5534, "step": 499 }, { "epoch": 0.5330490405117271, "grad_norm": 3.725389242172241, "learning_rate": 4.867793497013634e-06, "loss": 0.5397, "step": 500 }, { "epoch": 0.5341151385927505, "grad_norm": 3.5254456996917725, "learning_rate": 4.850170930800447e-06, "loss": 0.62, "step": 501 }, { "epoch": 0.535181236673774, "grad_norm": 4.125686168670654, "learning_rate": 4.832550227274624e-06, "loss": 0.6528, "step": 502 }, { "epoch": 0.5362473347547975, "grad_norm": 3.163158893585205, "learning_rate": 4.81493160549821e-06, "loss": 0.5764, "step": 503 }, { "epoch": 0.5373134328358209, "grad_norm": 3.6023948192596436, "learning_rate": 4.7973152845073666e-06, "loss": 0.4275, "step": 504 }, { "epoch": 0.5383795309168443, "grad_norm": 6.683622360229492, "learning_rate": 4.779701483309648e-06, "loss": 0.676, "step": 505 }, { "epoch": 0.5394456289978679, "grad_norm": 2.622581720352173, "learning_rate": 4.762090420881289e-06, "loss": 0.4214, "step": 506 }, { "epoch": 0.5405117270788913, "grad_norm": 3.246778964996338, "learning_rate": 4.7444823161644725e-06, "loss": 0.507, "step": 507 }, { "epoch": 0.5415778251599147, "grad_norm": 3.778594493865967, "learning_rate": 4.726877388064609e-06, "loss": 0.7838, "step": 508 }, { "epoch": 0.5426439232409381, "grad_norm": 3.714376449584961, "learning_rate": 4.7092758554476215e-06, "loss": 0.5786, "step": 509 }, { "epoch": 0.5437100213219617, "grad_norm": 3.2910237312316895, "learning_rate": 4.691677937137217e-06, "loss": 0.4689, "step": 510 }, { "epoch": 0.5447761194029851, "grad_norm": 3.709904193878174, "learning_rate": 4.674083851912167e-06, "loss": 0.4826, "step": 511 }, { "epoch": 0.5458422174840085, "grad_norm": 4.968465328216553, "learning_rate": 4.6564938185035954e-06, "loss": 0.7548, "step": 512 }, { "epoch": 0.5469083155650319, "grad_norm": 3.9498393535614014, "learning_rate": 4.638908055592252e-06, "loss": 0.6188, "step": 513 }, { "epoch": 0.5479744136460555, "grad_norm": 2.755976915359497, "learning_rate": 4.62132678180579e-06, "loss": 0.5959, "step": 514 }, { "epoch": 0.5490405117270789, "grad_norm": 3.850876569747925, "learning_rate": 4.603750215716057e-06, "loss": 0.6446, "step": 515 }, { "epoch": 0.5501066098081023, "grad_norm": 4.096807479858398, "learning_rate": 4.58617857583638e-06, "loss": 0.7002, "step": 516 }, { "epoch": 0.5511727078891258, "grad_norm": 4.319389820098877, "learning_rate": 4.568612080618836e-06, "loss": 0.5365, "step": 517 }, { "epoch": 0.5522388059701493, "grad_norm": 4.218343734741211, "learning_rate": 4.551050948451542e-06, "loss": 0.6871, "step": 518 }, { "epoch": 0.5533049040511727, "grad_norm": 3.1689553260803223, "learning_rate": 4.533495397655946e-06, "loss": 0.5465, "step": 519 }, { "epoch": 0.5543710021321961, "grad_norm": 3.6630654335021973, "learning_rate": 4.515945646484105e-06, "loss": 0.6929, "step": 520 }, { "epoch": 0.5554371002132196, "grad_norm": 4.538209915161133, "learning_rate": 4.498401913115975e-06, "loss": 0.818, "step": 521 }, { "epoch": 0.5565031982942431, "grad_norm": 3.9067232608795166, "learning_rate": 4.4808644156567e-06, "loss": 0.6543, "step": 522 }, { "epoch": 0.5575692963752665, "grad_norm": 3.3959505558013916, "learning_rate": 4.463333372133897e-06, "loss": 0.4869, "step": 523 }, { "epoch": 0.55863539445629, "grad_norm": 4.093690872192383, "learning_rate": 4.445809000494945e-06, "loss": 0.5658, "step": 524 }, { "epoch": 0.5597014925373134, "grad_norm": 4.169078826904297, "learning_rate": 4.428291518604283e-06, "loss": 0.6523, "step": 525 }, { "epoch": 0.5607675906183369, "grad_norm": 4.127382755279541, "learning_rate": 4.410781144240692e-06, "loss": 0.5763, "step": 526 }, { "epoch": 0.5618336886993603, "grad_norm": 4.4920172691345215, "learning_rate": 4.393278095094591e-06, "loss": 0.6354, "step": 527 }, { "epoch": 0.5628997867803838, "grad_norm": 4.825352668762207, "learning_rate": 4.3757825887653345e-06, "loss": 0.753, "step": 528 }, { "epoch": 0.5639658848614072, "grad_norm": 4.326002597808838, "learning_rate": 4.358294842758504e-06, "loss": 0.7274, "step": 529 }, { "epoch": 0.5650319829424307, "grad_norm": 3.5843746662139893, "learning_rate": 4.340815074483199e-06, "loss": 0.5815, "step": 530 }, { "epoch": 0.5660980810234542, "grad_norm": 3.441131591796875, "learning_rate": 4.323343501249346e-06, "loss": 0.5635, "step": 531 }, { "epoch": 0.5671641791044776, "grad_norm": 4.307285308837891, "learning_rate": 4.305880340264985e-06, "loss": 0.6288, "step": 532 }, { "epoch": 0.5682302771855011, "grad_norm": 4.174529075622559, "learning_rate": 4.2884258086335755e-06, "loss": 0.4411, "step": 533 }, { "epoch": 0.5692963752665245, "grad_norm": 3.7027828693389893, "learning_rate": 4.270980123351299e-06, "loss": 0.4706, "step": 534 }, { "epoch": 0.570362473347548, "grad_norm": 4.442177772521973, "learning_rate": 4.2535435013043535e-06, "loss": 0.4758, "step": 535 }, { "epoch": 0.5714285714285714, "grad_norm": 3.1039648056030273, "learning_rate": 4.2361161592662655e-06, "loss": 0.564, "step": 536 }, { "epoch": 0.5724946695095949, "grad_norm": 3.2122061252593994, "learning_rate": 4.218698313895192e-06, "loss": 0.5879, "step": 537 }, { "epoch": 0.5735607675906184, "grad_norm": 4.600186347961426, "learning_rate": 4.2012901817312255e-06, "loss": 0.6367, "step": 538 }, { "epoch": 0.5746268656716418, "grad_norm": 2.987234354019165, "learning_rate": 4.183891979193703e-06, "loss": 0.4316, "step": 539 }, { "epoch": 0.5756929637526652, "grad_norm": 4.250531196594238, "learning_rate": 4.166503922578516e-06, "loss": 0.6576, "step": 540 }, { "epoch": 0.5767590618336887, "grad_norm": 2.5906317234039307, "learning_rate": 4.149126228055419e-06, "loss": 0.5795, "step": 541 }, { "epoch": 0.5778251599147122, "grad_norm": 2.783703327178955, "learning_rate": 4.131759111665349e-06, "loss": 0.598, "step": 542 }, { "epoch": 0.5788912579957356, "grad_norm": 3.789217710494995, "learning_rate": 4.114402789317729e-06, "loss": 0.5345, "step": 543 }, { "epoch": 0.579957356076759, "grad_norm": 3.190741539001465, "learning_rate": 4.097057476787792e-06, "loss": 0.4705, "step": 544 }, { "epoch": 0.5810234541577826, "grad_norm": 2.9710161685943604, "learning_rate": 4.079723389713899e-06, "loss": 0.3804, "step": 545 }, { "epoch": 0.582089552238806, "grad_norm": 3.794388771057129, "learning_rate": 4.06240074359485e-06, "loss": 0.5019, "step": 546 }, { "epoch": 0.5831556503198294, "grad_norm": 3.181882858276367, "learning_rate": 4.045089753787214e-06, "loss": 0.4988, "step": 547 }, { "epoch": 0.5842217484008528, "grad_norm": 4.025869846343994, "learning_rate": 4.027790635502646e-06, "loss": 0.755, "step": 548 }, { "epoch": 0.5852878464818764, "grad_norm": 3.1576344966888428, "learning_rate": 4.010503603805214e-06, "loss": 0.4467, "step": 549 }, { "epoch": 0.5863539445628998, "grad_norm": 4.127837181091309, "learning_rate": 3.993228873608724e-06, "loss": 0.5099, "step": 550 }, { "epoch": 0.5874200426439232, "grad_norm": 2.9941911697387695, "learning_rate": 3.975966659674048e-06, "loss": 0.5594, "step": 551 }, { "epoch": 0.5884861407249466, "grad_norm": 2.9025707244873047, "learning_rate": 3.958717176606456e-06, "loss": 0.5682, "step": 552 }, { "epoch": 0.5895522388059702, "grad_norm": 3.670173168182373, "learning_rate": 3.941480638852948e-06, "loss": 0.6297, "step": 553 }, { "epoch": 0.5906183368869936, "grad_norm": 4.246592998504639, "learning_rate": 3.924257260699583e-06, "loss": 0.5991, "step": 554 }, { "epoch": 0.591684434968017, "grad_norm": 3.1765317916870117, "learning_rate": 3.907047256268822e-06, "loss": 0.5027, "step": 555 }, { "epoch": 0.5927505330490405, "grad_norm": 3.362060070037842, "learning_rate": 3.8898508395168645e-06, "loss": 0.5502, "step": 556 }, { "epoch": 0.593816631130064, "grad_norm": 3.0551469326019287, "learning_rate": 3.872668224230979e-06, "loss": 0.5425, "step": 557 }, { "epoch": 0.5948827292110874, "grad_norm": 2.877169609069824, "learning_rate": 3.855499624026861e-06, "loss": 0.538, "step": 558 }, { "epoch": 0.5959488272921108, "grad_norm": 4.517727375030518, "learning_rate": 3.838345252345968e-06, "loss": 0.7813, "step": 559 }, { "epoch": 0.5970149253731343, "grad_norm": 3.632627010345459, "learning_rate": 3.821205322452863e-06, "loss": 0.7069, "step": 560 }, { "epoch": 0.5980810234541578, "grad_norm": 4.394737243652344, "learning_rate": 3.804080047432574e-06, "loss": 0.8848, "step": 561 }, { "epoch": 0.5991471215351812, "grad_norm": 3.7429492473602295, "learning_rate": 3.786969640187932e-06, "loss": 0.5491, "step": 562 }, { "epoch": 0.6002132196162047, "grad_norm": 3.724515199661255, "learning_rate": 3.769874313436933e-06, "loss": 0.6281, "step": 563 }, { "epoch": 0.6012793176972282, "grad_norm": 3.8958685398101807, "learning_rate": 3.752794279710094e-06, "loss": 0.6543, "step": 564 }, { "epoch": 0.6023454157782516, "grad_norm": 3.707008123397827, "learning_rate": 3.735729751347803e-06, "loss": 0.5302, "step": 565 }, { "epoch": 0.603411513859275, "grad_norm": 3.8350601196289062, "learning_rate": 3.7186809404976877e-06, "loss": 0.6216, "step": 566 }, { "epoch": 0.6044776119402985, "grad_norm": 3.758518934249878, "learning_rate": 3.701648059111972e-06, "loss": 0.4982, "step": 567 }, { "epoch": 0.605543710021322, "grad_norm": 4.729321002960205, "learning_rate": 3.6846313189448447e-06, "loss": 0.4905, "step": 568 }, { "epoch": 0.6066098081023454, "grad_norm": 3.554718494415283, "learning_rate": 3.667630931549826e-06, "loss": 0.4375, "step": 569 }, { "epoch": 0.6076759061833689, "grad_norm": 5.01503324508667, "learning_rate": 3.6506471082771357e-06, "loss": 0.6721, "step": 570 }, { "epoch": 0.6087420042643923, "grad_norm": 4.222141742706299, "learning_rate": 3.6336800602710676e-06, "loss": 0.787, "step": 571 }, { "epoch": 0.6098081023454158, "grad_norm": 4.128366470336914, "learning_rate": 3.6167299984673655e-06, "loss": 0.5689, "step": 572 }, { "epoch": 0.6108742004264393, "grad_norm": 2.7572109699249268, "learning_rate": 3.5997971335905966e-06, "loss": 0.7662, "step": 573 }, { "epoch": 0.6119402985074627, "grad_norm": 4.168936252593994, "learning_rate": 3.582881676151536e-06, "loss": 0.709, "step": 574 }, { "epoch": 0.6130063965884861, "grad_norm": 4.17940092086792, "learning_rate": 3.5659838364445505e-06, "loss": 0.6845, "step": 575 }, { "epoch": 0.6140724946695096, "grad_norm": 3.636746644973755, "learning_rate": 3.549103824544975e-06, "loss": 0.4872, "step": 576 }, { "epoch": 0.6151385927505331, "grad_norm": 5.7504777908325195, "learning_rate": 3.5322418503065148e-06, "loss": 0.6376, "step": 577 }, { "epoch": 0.6162046908315565, "grad_norm": 6.510772228240967, "learning_rate": 3.5153981233586277e-06, "loss": 0.6849, "step": 578 }, { "epoch": 0.6172707889125799, "grad_norm": 3.533358573913574, "learning_rate": 3.498572853103915e-06, "loss": 0.698, "step": 579 }, { "epoch": 0.6183368869936035, "grad_norm": 4.93348503112793, "learning_rate": 3.481766248715528e-06, "loss": 0.5477, "step": 580 }, { "epoch": 0.6194029850746269, "grad_norm": 4.318631172180176, "learning_rate": 3.4649785191345613e-06, "loss": 0.5777, "step": 581 }, { "epoch": 0.6204690831556503, "grad_norm": 4.286785125732422, "learning_rate": 3.4482098730674577e-06, "loss": 0.5659, "step": 582 }, { "epoch": 0.6215351812366737, "grad_norm": 3.9310171604156494, "learning_rate": 3.4314605189834076e-06, "loss": 0.5589, "step": 583 }, { "epoch": 0.6226012793176973, "grad_norm": 3.892915964126587, "learning_rate": 3.4147306651117663e-06, "loss": 0.5975, "step": 584 }, { "epoch": 0.6236673773987207, "grad_norm": 3.209181308746338, "learning_rate": 3.398020519439459e-06, "loss": 0.5605, "step": 585 }, { "epoch": 0.6247334754797441, "grad_norm": 3.3576905727386475, "learning_rate": 3.3813302897083955e-06, "loss": 0.5895, "step": 586 }, { "epoch": 0.6257995735607675, "grad_norm": 2.719895362854004, "learning_rate": 3.3646601834128924e-06, "loss": 0.4634, "step": 587 }, { "epoch": 0.6268656716417911, "grad_norm": 3.9210264682769775, "learning_rate": 3.348010407797088e-06, "loss": 0.6512, "step": 588 }, { "epoch": 0.6279317697228145, "grad_norm": 2.4289424419403076, "learning_rate": 3.3313811698523677e-06, "loss": 0.5486, "step": 589 }, { "epoch": 0.6289978678038379, "grad_norm": 3.724541425704956, "learning_rate": 3.3147726763147913e-06, "loss": 0.4343, "step": 590 }, { "epoch": 0.6300639658848614, "grad_norm": 4.287071704864502, "learning_rate": 3.298185133662525e-06, "loss": 0.5126, "step": 591 }, { "epoch": 0.6311300639658849, "grad_norm": 3.1851956844329834, "learning_rate": 3.2816187481132655e-06, "loss": 0.5809, "step": 592 }, { "epoch": 0.6321961620469083, "grad_norm": 3.2316763401031494, "learning_rate": 3.2650737256216885e-06, "loss": 0.3208, "step": 593 }, { "epoch": 0.6332622601279317, "grad_norm": 4.77504301071167, "learning_rate": 3.2485502718768814e-06, "loss": 0.605, "step": 594 }, { "epoch": 0.6343283582089553, "grad_norm": 3.186673879623413, "learning_rate": 3.2320485922997842e-06, "loss": 0.6013, "step": 595 }, { "epoch": 0.6353944562899787, "grad_norm": 4.105803966522217, "learning_rate": 3.2155688920406415e-06, "loss": 0.4346, "step": 596 }, { "epoch": 0.6364605543710021, "grad_norm": 3.5919454097747803, "learning_rate": 3.1991113759764493e-06, "loss": 0.4577, "step": 597 }, { "epoch": 0.6375266524520256, "grad_norm": 3.144244432449341, "learning_rate": 3.1826762487084053e-06, "loss": 0.5382, "step": 598 }, { "epoch": 0.6385927505330491, "grad_norm": 3.883305549621582, "learning_rate": 3.16626371455937e-06, "loss": 0.5929, "step": 599 }, { "epoch": 0.6396588486140725, "grad_norm": 3.2877514362335205, "learning_rate": 3.149873977571324e-06, "loss": 0.5709, "step": 600 }, { "epoch": 0.6407249466950959, "grad_norm": 4.199596405029297, "learning_rate": 3.133507241502832e-06, "loss": 0.7197, "step": 601 }, { "epoch": 0.6417910447761194, "grad_norm": 3.0299744606018066, "learning_rate": 3.1171637098265063e-06, "loss": 0.3672, "step": 602 }, { "epoch": 0.6428571428571429, "grad_norm": 5.116127014160156, "learning_rate": 3.1008435857264862e-06, "loss": 0.6355, "step": 603 }, { "epoch": 0.6439232409381663, "grad_norm": 3.565772533416748, "learning_rate": 3.0845470720959027e-06, "loss": 0.5372, "step": 604 }, { "epoch": 0.6449893390191898, "grad_norm": 4.50262975692749, "learning_rate": 3.0682743715343565e-06, "loss": 0.5552, "step": 605 }, { "epoch": 0.6460554371002132, "grad_norm": 4.35274600982666, "learning_rate": 3.0520256863454077e-06, "loss": 0.5337, "step": 606 }, { "epoch": 0.6471215351812367, "grad_norm": 3.797417163848877, "learning_rate": 3.035801218534054e-06, "loss": 0.6198, "step": 607 }, { "epoch": 0.6481876332622601, "grad_norm": 4.147847652435303, "learning_rate": 3.019601169804216e-06, "loss": 0.6524, "step": 608 }, { "epoch": 0.6492537313432836, "grad_norm": 3.4904096126556396, "learning_rate": 3.00342574155624e-06, "loss": 0.5581, "step": 609 }, { "epoch": 0.650319829424307, "grad_norm": 2.892301559448242, "learning_rate": 2.9872751348843875e-06, "loss": 0.5062, "step": 610 }, { "epoch": 0.6513859275053305, "grad_norm": 3.5769119262695312, "learning_rate": 2.9711495505743317e-06, "loss": 0.6562, "step": 611 }, { "epoch": 0.652452025586354, "grad_norm": 3.2735555171966553, "learning_rate": 2.9550491891006704e-06, "loss": 0.3918, "step": 612 }, { "epoch": 0.6535181236673774, "grad_norm": 3.902296543121338, "learning_rate": 2.938974250624429e-06, "loss": 0.6364, "step": 613 }, { "epoch": 0.6545842217484008, "grad_norm": 2.647947072982788, "learning_rate": 2.9229249349905686e-06, "loss": 0.4903, "step": 614 }, { "epoch": 0.6556503198294243, "grad_norm": 5.518612384796143, "learning_rate": 2.906901441725507e-06, "loss": 0.9065, "step": 615 }, { "epoch": 0.6567164179104478, "grad_norm": 3.9488322734832764, "learning_rate": 2.8909039700346385e-06, "loss": 0.5084, "step": 616 }, { "epoch": 0.6577825159914712, "grad_norm": 3.036370038986206, "learning_rate": 2.8749327187998516e-06, "loss": 0.4458, "step": 617 }, { "epoch": 0.6588486140724946, "grad_norm": 4.986847400665283, "learning_rate": 2.858987886577058e-06, "loss": 0.5961, "step": 618 }, { "epoch": 0.6599147121535182, "grad_norm": 5.1367387771606445, "learning_rate": 2.843069671593734e-06, "loss": 0.5639, "step": 619 }, { "epoch": 0.6609808102345416, "grad_norm": 3.1643879413604736, "learning_rate": 2.8271782717464413e-06, "loss": 0.3847, "step": 620 }, { "epoch": 0.662046908315565, "grad_norm": 2.6436514854431152, "learning_rate": 2.811313884598373e-06, "loss": 0.5286, "step": 621 }, { "epoch": 0.6631130063965884, "grad_norm": 3.622612953186035, "learning_rate": 2.795476707376905e-06, "loss": 0.5693, "step": 622 }, { "epoch": 0.664179104477612, "grad_norm": 2.69989013671875, "learning_rate": 2.7796669369711294e-06, "loss": 0.5431, "step": 623 }, { "epoch": 0.6652452025586354, "grad_norm": 5.510005950927734, "learning_rate": 2.7638847699294196e-06, "loss": 0.5919, "step": 624 }, { "epoch": 0.6663113006396588, "grad_norm": 2.831249952316284, "learning_rate": 2.7481304024569823e-06, "loss": 0.4747, "step": 625 }, { "epoch": 0.6673773987206824, "grad_norm": 4.0560784339904785, "learning_rate": 2.7324040304134125e-06, "loss": 0.5718, "step": 626 }, { "epoch": 0.6684434968017058, "grad_norm": 3.151724338531494, "learning_rate": 2.716705849310265e-06, "loss": 0.5793, "step": 627 }, { "epoch": 0.6695095948827292, "grad_norm": 3.509110927581787, "learning_rate": 2.701036054308629e-06, "loss": 0.5529, "step": 628 }, { "epoch": 0.6705756929637526, "grad_norm": 3.3300867080688477, "learning_rate": 2.685394840216688e-06, "loss": 0.5163, "step": 629 }, { "epoch": 0.6716417910447762, "grad_norm": 4.18197774887085, "learning_rate": 2.6697824014873076e-06, "loss": 0.5729, "step": 630 }, { "epoch": 0.6727078891257996, "grad_norm": 4.711467742919922, "learning_rate": 2.654198932215613e-06, "loss": 0.6336, "step": 631 }, { "epoch": 0.673773987206823, "grad_norm": 3.4990603923797607, "learning_rate": 2.6386446261365874e-06, "loss": 0.5336, "step": 632 }, { "epoch": 0.6748400852878464, "grad_norm": 4.139181613922119, "learning_rate": 2.623119676622645e-06, "loss": 0.6313, "step": 633 }, { "epoch": 0.67590618336887, "grad_norm": 3.6757354736328125, "learning_rate": 2.607624276681241e-06, "loss": 0.5122, "step": 634 }, { "epoch": 0.6769722814498934, "grad_norm": 2.6955840587615967, "learning_rate": 2.5921586189524694e-06, "loss": 0.6094, "step": 635 }, { "epoch": 0.6780383795309168, "grad_norm": 3.385660171508789, "learning_rate": 2.5767228957066635e-06, "loss": 0.4877, "step": 636 }, { "epoch": 0.6791044776119403, "grad_norm": 4.949819087982178, "learning_rate": 2.561317298842008e-06, "loss": 0.813, "step": 637 }, { "epoch": 0.6801705756929638, "grad_norm": 3.2295103073120117, "learning_rate": 2.5459420198821604e-06, "loss": 0.4244, "step": 638 }, { "epoch": 0.6812366737739872, "grad_norm": 4.290478706359863, "learning_rate": 2.530597249973856e-06, "loss": 0.5954, "step": 639 }, { "epoch": 0.6823027718550106, "grad_norm": 3.565035104751587, "learning_rate": 2.51528317988454e-06, "loss": 0.5769, "step": 640 }, { "epoch": 0.6833688699360341, "grad_norm": 5.35756778717041, "learning_rate": 2.5000000000000015e-06, "loss": 0.6081, "step": 641 }, { "epoch": 0.6844349680170576, "grad_norm": 3.589604377746582, "learning_rate": 2.4847479003219926e-06, "loss": 0.7254, "step": 642 }, { "epoch": 0.685501066098081, "grad_norm": 3.122896671295166, "learning_rate": 2.4695270704658753e-06, "loss": 0.4031, "step": 643 }, { "epoch": 0.6865671641791045, "grad_norm": 2.7060294151306152, "learning_rate": 2.454337699658267e-06, "loss": 0.5658, "step": 644 }, { "epoch": 0.6876332622601279, "grad_norm": 2.8084559440612793, "learning_rate": 2.439179976734677e-06, "loss": 0.5072, "step": 645 }, { "epoch": 0.6886993603411514, "grad_norm": 3.854334831237793, "learning_rate": 2.4240540901371727e-06, "loss": 0.4648, "step": 646 }, { "epoch": 0.6897654584221748, "grad_norm": 3.5784668922424316, "learning_rate": 2.4089602279120224e-06, "loss": 0.6151, "step": 647 }, { "epoch": 0.6908315565031983, "grad_norm": 2.7670629024505615, "learning_rate": 2.393898577707371e-06, "loss": 0.5434, "step": 648 }, { "epoch": 0.6918976545842217, "grad_norm": 3.370211601257324, "learning_rate": 2.3788693267708975e-06, "loss": 0.5567, "step": 649 }, { "epoch": 0.6929637526652452, "grad_norm": 3.951569080352783, "learning_rate": 2.363872661947488e-06, "loss": 0.5314, "step": 650 }, { "epoch": 0.6940298507462687, "grad_norm": 3.017756938934326, "learning_rate": 2.3489087696769225e-06, "loss": 0.4686, "step": 651 }, { "epoch": 0.6950959488272921, "grad_norm": 2.882693290710449, "learning_rate": 2.333977835991545e-06, "loss": 0.6483, "step": 652 }, { "epoch": 0.6961620469083155, "grad_norm": 3.1812756061553955, "learning_rate": 2.319080046513954e-06, "loss": 0.5153, "step": 653 }, { "epoch": 0.697228144989339, "grad_norm": 4.908356189727783, "learning_rate": 2.3042155864547024e-06, "loss": 0.8289, "step": 654 }, { "epoch": 0.6982942430703625, "grad_norm": 3.2701950073242188, "learning_rate": 2.2893846406099847e-06, "loss": 0.5599, "step": 655 }, { "epoch": 0.6993603411513859, "grad_norm": 2.903156280517578, "learning_rate": 2.274587393359342e-06, "loss": 0.5178, "step": 656 }, { "epoch": 0.7004264392324094, "grad_norm": 3.1494290828704834, "learning_rate": 2.2598240286633787e-06, "loss": 0.6042, "step": 657 }, { "epoch": 0.7014925373134329, "grad_norm": 4.410161972045898, "learning_rate": 2.245094730061463e-06, "loss": 0.5476, "step": 658 }, { "epoch": 0.7025586353944563, "grad_norm": 3.0155627727508545, "learning_rate": 2.230399680669449e-06, "loss": 0.4471, "step": 659 }, { "epoch": 0.7036247334754797, "grad_norm": 3.195528984069824, "learning_rate": 2.215739063177409e-06, "loss": 0.7218, "step": 660 }, { "epoch": 0.7046908315565032, "grad_norm": 4.383801460266113, "learning_rate": 2.2011130598473498e-06, "loss": 0.6677, "step": 661 }, { "epoch": 0.7057569296375267, "grad_norm": 3.6294610500335693, "learning_rate": 2.1865218525109496e-06, "loss": 0.6615, "step": 662 }, { "epoch": 0.7068230277185501, "grad_norm": 3.4658079147338867, "learning_rate": 2.171965622567308e-06, "loss": 0.6119, "step": 663 }, { "epoch": 0.7078891257995735, "grad_norm": 4.386440753936768, "learning_rate": 2.1574445509806764e-06, "loss": 0.6154, "step": 664 }, { "epoch": 0.7089552238805971, "grad_norm": 4.699451446533203, "learning_rate": 2.1429588182782147e-06, "loss": 0.414, "step": 665 }, { "epoch": 0.7100213219616205, "grad_norm": 3.8956384658813477, "learning_rate": 2.1285086045477515e-06, "loss": 0.6803, "step": 666 }, { "epoch": 0.7110874200426439, "grad_norm": 3.0188138484954834, "learning_rate": 2.1140940894355345e-06, "loss": 0.4012, "step": 667 }, { "epoch": 0.7121535181236673, "grad_norm": 3.502260684967041, "learning_rate": 2.09971545214401e-06, "loss": 0.6043, "step": 668 }, { "epoch": 0.7132196162046909, "grad_norm": 4.21201229095459, "learning_rate": 2.0853728714295807e-06, "loss": 0.6358, "step": 669 }, { "epoch": 0.7142857142857143, "grad_norm": 3.789600133895874, "learning_rate": 2.0710665256003994e-06, "loss": 0.7658, "step": 670 }, { "epoch": 0.7153518123667377, "grad_norm": 3.3172829151153564, "learning_rate": 2.0567965925141366e-06, "loss": 0.6022, "step": 671 }, { "epoch": 0.7164179104477612, "grad_norm": 3.209542989730835, "learning_rate": 2.0425632495757776e-06, "loss": 0.6245, "step": 672 }, { "epoch": 0.7174840085287847, "grad_norm": 4.357487201690674, "learning_rate": 2.028366673735421e-06, "loss": 0.7811, "step": 673 }, { "epoch": 0.7185501066098081, "grad_norm": 4.161263942718506, "learning_rate": 2.0142070414860704e-06, "loss": 0.6271, "step": 674 }, { "epoch": 0.7196162046908315, "grad_norm": 3.978900909423828, "learning_rate": 2.0000845288614396e-06, "loss": 0.4535, "step": 675 }, { "epoch": 0.720682302771855, "grad_norm": 3.564516305923462, "learning_rate": 1.9859993114337773e-06, "loss": 0.497, "step": 676 }, { "epoch": 0.7217484008528785, "grad_norm": 4.843251705169678, "learning_rate": 1.971951564311668e-06, "loss": 0.7486, "step": 677 }, { "epoch": 0.7228144989339019, "grad_norm": 4.282314777374268, "learning_rate": 1.9579414621378624e-06, "loss": 0.648, "step": 678 }, { "epoch": 0.7238805970149254, "grad_norm": 3.086698293685913, "learning_rate": 1.943969179087112e-06, "loss": 0.499, "step": 679 }, { "epoch": 0.7249466950959488, "grad_norm": 3.319424629211426, "learning_rate": 1.9300348888639915e-06, "loss": 0.5488, "step": 680 }, { "epoch": 0.7260127931769723, "grad_norm": 3.266557455062866, "learning_rate": 1.916138764700747e-06, "loss": 0.5134, "step": 681 }, { "epoch": 0.7270788912579957, "grad_norm": 3.637197256088257, "learning_rate": 1.902280979355146e-06, "loss": 0.553, "step": 682 }, { "epoch": 0.7281449893390192, "grad_norm": 5.194890022277832, "learning_rate": 1.8884617051083183e-06, "loss": 0.8228, "step": 683 }, { "epoch": 0.7292110874200426, "grad_norm": 3.3241937160491943, "learning_rate": 1.8746811137626208e-06, "loss": 0.6742, "step": 684 }, { "epoch": 0.7302771855010661, "grad_norm": 4.013587474822998, "learning_rate": 1.8609393766395083e-06, "loss": 0.7281, "step": 685 }, { "epoch": 0.7313432835820896, "grad_norm": 3.9904069900512695, "learning_rate": 1.8472366645773892e-06, "loss": 0.613, "step": 686 }, { "epoch": 0.732409381663113, "grad_norm": 3.8579587936401367, "learning_rate": 1.8335731479295105e-06, "loss": 0.6226, "step": 687 }, { "epoch": 0.7334754797441365, "grad_norm": 3.151585102081299, "learning_rate": 1.8199489965618433e-06, "loss": 0.6495, "step": 688 }, { "epoch": 0.7345415778251599, "grad_norm": 3.3934123516082764, "learning_rate": 1.8063643798509594e-06, "loss": 0.544, "step": 689 }, { "epoch": 0.7356076759061834, "grad_norm": 3.246422529220581, "learning_rate": 1.7928194666819398e-06, "loss": 0.5487, "step": 690 }, { "epoch": 0.7366737739872068, "grad_norm": 3.5877134799957275, "learning_rate": 1.7793144254462601e-06, "loss": 0.5626, "step": 691 }, { "epoch": 0.7377398720682303, "grad_norm": 5.0915446281433105, "learning_rate": 1.7658494240397127e-06, "loss": 0.6076, "step": 692 }, { "epoch": 0.7388059701492538, "grad_norm": 2.9249038696289062, "learning_rate": 1.7524246298603053e-06, "loss": 0.4759, "step": 693 }, { "epoch": 0.7398720682302772, "grad_norm": 3.7129411697387695, "learning_rate": 1.739040209806186e-06, "loss": 0.4692, "step": 694 }, { "epoch": 0.7409381663113006, "grad_norm": 2.4716696739196777, "learning_rate": 1.7256963302735752e-06, "loss": 0.3944, "step": 695 }, { "epoch": 0.7420042643923241, "grad_norm": 3.3585007190704346, "learning_rate": 1.7123931571546826e-06, "loss": 0.3584, "step": 696 }, { "epoch": 0.7430703624733476, "grad_norm": 6.5269060134887695, "learning_rate": 1.6991308558356545e-06, "loss": 0.6397, "step": 697 }, { "epoch": 0.744136460554371, "grad_norm": 3.1279733180999756, "learning_rate": 1.68590959119452e-06, "loss": 0.5336, "step": 698 }, { "epoch": 0.7452025586353944, "grad_norm": 3.8842685222625732, "learning_rate": 1.6727295275991311e-06, "loss": 0.5704, "step": 699 }, { "epoch": 0.746268656716418, "grad_norm": 5.417971611022949, "learning_rate": 1.6595908289051266e-06, "loss": 0.7952, "step": 700 }, { "epoch": 0.7473347547974414, "grad_norm": 4.1615142822265625, "learning_rate": 1.646493658453896e-06, "loss": 0.7308, "step": 701 }, { "epoch": 0.7484008528784648, "grad_norm": 4.57946252822876, "learning_rate": 1.6334381790705439e-06, "loss": 0.6462, "step": 702 }, { "epoch": 0.7494669509594882, "grad_norm": 4.238102436065674, "learning_rate": 1.6204245530618662e-06, "loss": 0.7793, "step": 703 }, { "epoch": 0.7505330490405118, "grad_norm": 3.6494762897491455, "learning_rate": 1.6074529422143398e-06, "loss": 0.5816, "step": 704 }, { "epoch": 0.7515991471215352, "grad_norm": 2.7180981636047363, "learning_rate": 1.5945235077921011e-06, "loss": 0.431, "step": 705 }, { "epoch": 0.7526652452025586, "grad_norm": 3.352781295776367, "learning_rate": 1.5816364105349451e-06, "loss": 0.6068, "step": 706 }, { "epoch": 0.753731343283582, "grad_norm": 3.3113484382629395, "learning_rate": 1.5687918106563326e-06, "loss": 0.5287, "step": 707 }, { "epoch": 0.7547974413646056, "grad_norm": 4.490861892700195, "learning_rate": 1.5559898678413898e-06, "loss": 0.5945, "step": 708 }, { "epoch": 0.755863539445629, "grad_norm": 4.140603542327881, "learning_rate": 1.5432307412449244e-06, "loss": 0.5791, "step": 709 }, { "epoch": 0.7569296375266524, "grad_norm": 3.215869903564453, "learning_rate": 1.5305145894894547e-06, "loss": 0.545, "step": 710 }, { "epoch": 0.7579957356076759, "grad_norm": 3.7208282947540283, "learning_rate": 1.517841570663231e-06, "loss": 0.549, "step": 711 }, { "epoch": 0.7590618336886994, "grad_norm": 5.573135852813721, "learning_rate": 1.5052118423182688e-06, "loss": 0.6065, "step": 712 }, { "epoch": 0.7601279317697228, "grad_norm": 2.917748212814331, "learning_rate": 1.4926255614683931e-06, "loss": 0.5764, "step": 713 }, { "epoch": 0.7611940298507462, "grad_norm": 3.7244482040405273, "learning_rate": 1.48008288458729e-06, "loss": 0.7329, "step": 714 }, { "epoch": 0.7622601279317697, "grad_norm": 3.4012646675109863, "learning_rate": 1.4675839676065534e-06, "loss": 0.795, "step": 715 }, { "epoch": 0.7633262260127932, "grad_norm": 5.035698890686035, "learning_rate": 1.4551289659137497e-06, "loss": 0.4946, "step": 716 }, { "epoch": 0.7643923240938166, "grad_norm": 3.5343170166015625, "learning_rate": 1.442718034350492e-06, "loss": 0.6438, "step": 717 }, { "epoch": 0.7654584221748401, "grad_norm": 5.427855014801025, "learning_rate": 1.4303513272105057e-06, "loss": 0.6608, "step": 718 }, { "epoch": 0.7665245202558635, "grad_norm": 4.188488960266113, "learning_rate": 1.4180289982377138e-06, "loss": 0.6724, "step": 719 }, { "epoch": 0.767590618336887, "grad_norm": 3.0849502086639404, "learning_rate": 1.4057512006243312e-06, "loss": 0.4813, "step": 720 }, { "epoch": 0.7686567164179104, "grad_norm": 3.770303726196289, "learning_rate": 1.3935180870089503e-06, "loss": 0.54, "step": 721 }, { "epoch": 0.7697228144989339, "grad_norm": 3.1546125411987305, "learning_rate": 1.3813298094746491e-06, "loss": 0.5408, "step": 722 }, { "epoch": 0.7707889125799574, "grad_norm": 3.4626078605651855, "learning_rate": 1.3691865195471037e-06, "loss": 0.7037, "step": 723 }, { "epoch": 0.7718550106609808, "grad_norm": 3.7300167083740234, "learning_rate": 1.357088368192696e-06, "loss": 0.4977, "step": 724 }, { "epoch": 0.7729211087420043, "grad_norm": 3.6047255992889404, "learning_rate": 1.345035505816642e-06, "loss": 0.4605, "step": 725 }, { "epoch": 0.7739872068230277, "grad_norm": 4.383596420288086, "learning_rate": 1.3330280822611246e-06, "loss": 0.6057, "step": 726 }, { "epoch": 0.7750533049040512, "grad_norm": 3.033484935760498, "learning_rate": 1.3210662468034246e-06, "loss": 0.6276, "step": 727 }, { "epoch": 0.7761194029850746, "grad_norm": 3.0880119800567627, "learning_rate": 1.3091501481540676e-06, "loss": 0.4373, "step": 728 }, { "epoch": 0.7771855010660981, "grad_norm": 4.4786481857299805, "learning_rate": 1.297279934454978e-06, "loss": 0.4871, "step": 729 }, { "epoch": 0.7782515991471215, "grad_norm": 4.671253681182861, "learning_rate": 1.2854557532776323e-06, "loss": 0.6479, "step": 730 }, { "epoch": 0.779317697228145, "grad_norm": 3.1999318599700928, "learning_rate": 1.2736777516212267e-06, "loss": 0.4941, "step": 731 }, { "epoch": 0.7803837953091685, "grad_norm": 3.997366189956665, "learning_rate": 1.2619460759108521e-06, "loss": 0.6279, "step": 732 }, { "epoch": 0.7814498933901919, "grad_norm": 4.318461894989014, "learning_rate": 1.250260871995671e-06, "loss": 0.7536, "step": 733 }, { "epoch": 0.7825159914712153, "grad_norm": 4.276126384735107, "learning_rate": 1.238622285147103e-06, "loss": 0.6525, "step": 734 }, { "epoch": 0.7835820895522388, "grad_norm": 4.190774917602539, "learning_rate": 1.2270304600570193e-06, "loss": 0.6301, "step": 735 }, { "epoch": 0.7846481876332623, "grad_norm": 3.33217453956604, "learning_rate": 1.2154855408359507e-06, "loss": 0.5125, "step": 736 }, { "epoch": 0.7857142857142857, "grad_norm": 3.9315052032470703, "learning_rate": 1.2039876710112847e-06, "loss": 0.7287, "step": 737 }, { "epoch": 0.7867803837953091, "grad_norm": 2.9664430618286133, "learning_rate": 1.1925369935254872e-06, "loss": 0.3523, "step": 738 }, { "epoch": 0.7878464818763327, "grad_norm": 4.174221992492676, "learning_rate": 1.1811336507343296e-06, "loss": 0.6702, "step": 739 }, { "epoch": 0.7889125799573561, "grad_norm": 3.695345163345337, "learning_rate": 1.1697777844051105e-06, "loss": 0.6295, "step": 740 }, { "epoch": 0.7899786780383795, "grad_norm": 3.316556453704834, "learning_rate": 1.1584695357148968e-06, "loss": 0.44, "step": 741 }, { "epoch": 0.7910447761194029, "grad_norm": 3.599439859390259, "learning_rate": 1.1472090452487728e-06, "loss": 0.4436, "step": 742 }, { "epoch": 0.7921108742004265, "grad_norm": 3.524487018585205, "learning_rate": 1.135996452998085e-06, "loss": 0.8034, "step": 743 }, { "epoch": 0.7931769722814499, "grad_norm": 2.845956325531006, "learning_rate": 1.1248318983587052e-06, "loss": 0.5989, "step": 744 }, { "epoch": 0.7942430703624733, "grad_norm": 3.0889575481414795, "learning_rate": 1.1137155201293021e-06, "loss": 0.4094, "step": 745 }, { "epoch": 0.7953091684434968, "grad_norm": 4.654996871948242, "learning_rate": 1.1026474565096068e-06, "loss": 0.5334, "step": 746 }, { "epoch": 0.7963752665245203, "grad_norm": 3.250169038772583, "learning_rate": 1.0916278450986983e-06, "loss": 0.4804, "step": 747 }, { "epoch": 0.7974413646055437, "grad_norm": 3.3061928749084473, "learning_rate": 1.0806568228932995e-06, "loss": 0.5225, "step": 748 }, { "epoch": 0.7985074626865671, "grad_norm": 3.5970945358276367, "learning_rate": 1.0697345262860638e-06, "loss": 0.5285, "step": 749 }, { "epoch": 0.7995735607675906, "grad_norm": 4.273208141326904, "learning_rate": 1.0588610910638825e-06, "loss": 0.472, "step": 750 } ], "logging_steps": 1.0, "max_steps": 938, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 125, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3872072496578560.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }